文件目录:
cudaTest
|--utils.cu
|--utils.h
|--squaresum.cu
|--squaresum.h
|--test.cpp
|--CMakeLists.txt
编译命令:
$cd /root/cudaTest
$mkdir build
$cd build
$cmake ..
$make
调用关系:
utils:提供常用工具,这里提供查询设备信息功能;
squaresum:计算平方和功能,为cuda运行的核心函数实现
test:调用平方和函数
CMakeLists.txt:组织所有文件编译生成可执行文件
注意:调用cu文件中的函数时要在头文件声明成extern "C"
文件内容:
CMakeLists.txt
# CMakeLists.txt to build the squaresum executable from test.cpp, squaresum.cu and utils.cu
cmake_minimum_required(VERSION 2.8)
# Locate the CUDA toolkit; REQUIRED makes configuration fail when it is missing.
find_package(CUDA QUIET REQUIRED)
# Specify binary name and source files to build it from.
# utils.cu is listed directly among the executable's sources below, so the
# separate utils library stays disabled.
#add_library(utils utils.cpp)
cuda_add_executable(
squaresum
test.cpp squaresum.cu utils.cu)
#target_link_libraries(squaresum utils)
test.cpp
#include <iostream>
#include "squaresum.h"
//extern "C" int squaresum();
// Program entry point: runs the square-sum demo and forwards the int status
// returned by squaresum() as the process exit code instead of discarding it.
int main(){
    return squaresum();
}
squaresum.h
// Public interface of squaresum.cu.
// The include guard makes it safe to include this header more than once in
// the same translation unit.
#ifndef SQUARESUM_H
#define SQUARESUM_H

#include "utils.h"
#include <cuda_runtime.h>

extern "C" {
// Defined in squaresum.cu: computes the sum of squares of a random data set
// on the GPU and on the CPU and prints both results.
int squaresum();
}

#endif // SQUARESUM_H
squaresum.cu
#include <stdio.h>
#include <stdlib.h>
//#include "utils.h"
#include <iostream>
#include "squaresum.h"
// ======== define area ========
// Number of int elements in the data set (1048576 = 2^20).
#define DATA_SIZE 1048576 // 1M
// ======== global area ========
// Host-side input buffer: filled by generateData() and copied to the GPU
// inside squaresum().
int data[DATA_SIZE];
// Kernel: sums the squares of the first DATA_SIZE elements of `data`.
//   data - input values (read only)
//   sum  - receives the accumulated sum of squares
//   time - receives the difference between the clock() readings taken before
//          and after the loop
// The loop is sequential: every thread that executes this kernel walks the
// whole array and writes the same two outputs. This file launches it with a
// single block of a single thread.
__global__ static void squaresSum(int *data, int *sum, clock_t *time)
{
    int acc = 0;
    const clock_t begin = clock();
    int idx = 0;
    while (idx < DATA_SIZE) {
        const int value = data[idx];
        acc += value * value;
        ++idx;
    }
    *sum = acc;
    *time = clock() - begin;
}
// ======== used to generate rand datas ========
// Fills the first `size` entries of `data` with pseudo-random digits in [0, 9],
// drawing one rand() value per element in ascending index order.
// Nothing is written when `size` is zero or negative.
void generateData(int *data, int size)
{
    int idx = 0;
    while (idx < size) {
        const int digit = rand() % 10;
        data[idx] = digit;
        ++idx;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Computes the sum of squares of DATA_SIZE random digits twice, once with the
// squaresSum kernel and once with a plain host loop, and prints both sums
// together with their timings.
//
// Returns 0 on success and 1 when CUDA initialisation or any CUDA runtime call
// fails; in the failure case no GPU result is printed.
//
// NOTE: the GPU figure comes from clock() inside the kernel, the CPU figure
// from the host clock(); presumably these use different units (device clock
// cycles vs. host clock ticks), so they are not directly comparable -- confirm
// against the CUDA programming guide.
int squaresum()
{
    // init CUDA device
    if (!InitCUDA()) {
        return 1;
    }
    printf("CUDA initialized.\n");

    // generate rand datas
    generateData(data, DATA_SIZE);

    // Device buffers start as NULL so the unconditional cudaFree calls below
    // are harmless no-ops for anything that was never allocated.
    int *gpuData = NULL;
    int *gpuSum = NULL;
    clock_t *gpuTime = NULL;
    int result = 0;
    clock_t time_used = 0;

    // malloc space for datas in GPU and upload the input; the && chain stops
    // at the first failing step.
    bool ok = cudaCallOk(cudaMalloc((void**) &gpuData, sizeof(int) * DATA_SIZE), "cudaMalloc(gpuData)")
        && cudaCallOk(cudaMalloc((void**) &gpuSum, sizeof(int)), "cudaMalloc(sum)")
        && cudaCallOk(cudaMalloc((void**) &gpuTime, sizeof(clock_t)), "cudaMalloc(time)")
        && cudaCallOk(cudaMemcpy(gpuData, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");

    if (ok) {
        // calculate the squares's sum
        squaresSum<<<1, 1, 0>>>(gpuData, gpuSum, gpuTime);
        // A kernel launch returns nothing; launch errors are fetched with
        // cudaGetLastError, and errors raised while the kernel runs are
        // reported by the blocking copies that follow.
        ok = cudaCallOk(cudaGetLastError(), "squaresSum launch")
            // copy the result from GPU to HOST
            && cudaCallOk(cudaMemcpy(&result, gpuSum, sizeof(int), cudaMemcpyDeviceToHost),
                          "cudaMemcpy(sum)")
            && cudaCallOk(cudaMemcpy(&time_used, gpuTime, sizeof(clock_t), cudaMemcpyDeviceToHost),
                          "cudaMemcpy(time)");
    }

    // free GPU spaces (also on the failure path, so nothing leaks)
    cudaFree(gpuData);
    cudaFree(gpuSum);
    cudaFree(gpuTime);

    if (!ok) {
        return 1;
    }

    // print result; clock_t is cast explicitly to match the %ld conversion
    printf("(GPU) sum:%d time:%ld\n", result, (long) time_used);

    // CPU calculate
    result = 0;
    clock_t start = clock();
    for (int i = 0; i < DATA_SIZE; ++i) {
        result += data[i] * data[i];
    }
    time_used = clock() - start;
    printf("(CPU) sum:%d time:%ld\n", result, (long) time_used);
    return 0;
}
utils.h
// Public interface of utils.cu.
// The include guard makes it safe to include this header more than once in
// the same translation unit.
#ifndef UTILS_H
#define UTILS_H

#include <stdio.h>
#include <cuda_runtime.h>

extern "C" {
// Defined in utils.cu: selects a CUDA device and prints its properties.
// Returns true when a device was selected, false otherwise.
bool InitCUDA();
}

#endif // UTILS_H
utils.cu
#include "utils.h"
#include <cuda_runtime.h>
#include <iostream>
// Prints a selection of fields from `prop` to stdout, one per line.
// The size_t members (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu: passing a size_t to
// %d is undefined behaviour and truncates the value where size_t is wider
// than int.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize);
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate);
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}
// Selects the first CUDA device whose properties can be queried and whose
// major compute capability is at least 1, prints its properties, and makes it
// the current device.
//
// Prints the detected device count to stdout. Returns true when a device was
// selected; returns false (after a message on stderr) when no device is
// available, no device qualifies, or cudaSetDevice fails.
bool InitCUDA()
{
    // used to count the device numbers; zero-initialised so a failed query
    // can never leave an indeterminate value behind
    int count = 0;
    // get the cuda device count
    const cudaError_t countStatus = cudaGetDeviceCount(&count);
    if (countStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(countStatus));
        // Treat a failed query as "no devices" and fall through to the
        // existing no-device handling below.
        count = 0;
    }
    std::cout << count << std::endl;
    if (count <= 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    // find the device >= 1.X
    int i;
    for (i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                printDeviceProp(prop);
                break;
            }
        }
    }
    // if can't find the device
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }
    // set cuda device; report failure instead of claiming success
    const cudaError_t setStatus = cudaSetDevice(i);
    if (setStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice(%d) failed: %s\n", i, cudaGetErrorString(setStatus));
        return false;
    }
    return true;
}
//int main(){
// InitCUDA();
//}