1. 包含必要的头文件
#include <cuda_runtime.h> // CUDA 运行时库
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <vector>
2. 定义核函数
/**
 * Element-wise vector addition: C[k] = A[k] + B[k] for 0 <= k < numElements.
 *
 * Each thread handles at most one element, selected by a flat 1D index built
 * from blockIdx.x, blockDim.x and threadIdx.x. Threads whose index is not
 * below numElements return without touching memory, so the grid may contain
 * more threads than there are elements.
 *
 * @param A            first input array, at least numElements floats
 * @param B            second input array, at least numElements floats
 * @param C            output array, at least numElements floats
 * @param numElements  number of elements to process
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) {
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard clause: surplus threads in the last block have nothing to do.
    if (idx >= numElements) {
        return;
    }

    const float sum = A[idx] + B[idx];
    C[idx] = sum;
}
3. 编写主函数
- 设置设备（可选：本例未显式调用 cudaSetDevice，使用默认设备）
- 初始化数据
- 分配设备内存
- 将数据从主机复制到设备
- 配置和启动核函数
- 从设备复制数据回主机
- 清理资源
// Logs a failed CUDA runtime call to stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host driver for vectorAdd: fills two host arrays with random values in
// [0, 1], copies them to the device, launches the kernel, copies the sum back
// and checks it against a CPU reference.
// Returns 0 and prints "Done" on success; returns EXIT_FAILURE if any CUDA
// call fails or the result does not match.
int main(void) {
    const int numElements = 50000;
    const size_t size = numElements * sizeof(float);

    // Host buffers. std::vector releases them on every exit path.
    std::vector<float> h_A(numElements);
    std::vector<float> h_B(numElements);
    std::vector<float> h_C(numElements);

    // Initialize the inputs.
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = std::rand() / (float)RAND_MAX;
        h_B[i] = std::rand() / (float)RAND_MAX;
    }

    // Device buffers start as nullptr so the cleanup below is safe even when
    // an allocation fails part-way (cudaFree on a null pointer does nothing).
    float *d_A = nullptr;
    float *d_B = nullptr;
    float *d_C = nullptr;

    // Allocate device memory and copy the inputs host -> device. The &&
    // chain stops at the first failing call.
    bool ok = cudaOk(cudaMalloc((void **)&d_A, size), "cudaMalloc(d_A)") &&
              cudaOk(cudaMalloc((void **)&d_B, size), "cudaMalloc(d_B)") &&
              cudaOk(cudaMalloc((void **)&d_C, size), "cudaMalloc(d_C)") &&
              cudaOk(cudaMemcpy(d_A, h_A.data(), size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_A <- h_A)") &&
              cudaOk(cudaMemcpy(d_B, h_B.data(), size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(d_B <- h_B)");

    if (ok) {
        // Launch configuration: ceil-divide so every element gets a thread.
        const int threadsPerBlock = 256;
        const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
        vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

        // A kernel launch returns no status: launch errors are read with
        // cudaGetLastError(), and the blocking copy back to the host reports
        // errors raised while the kernel was running.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch") &&
             cudaOk(cudaMemcpy(h_C.data(), d_C, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(h_C <- d_C)");
    }

    // Release device memory unconditionally; each free is attempted even if
    // an earlier step or an earlier free failed.
    const bool freedA = cudaOk(cudaFree(d_A), "cudaFree(d_A)");
    const bool freedB = cudaOk(cudaFree(d_B), "cudaFree(d_B)");
    const bool freedC = cudaOk(cudaFree(d_C), "cudaFree(d_C)");
    ok = ok && freedA && freedB && freedC;

    // Compare against a CPU reference so that "Done" means the result is right.
    for (int i = 0; ok && i < numElements; ++i) {
        if (std::fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5f) {
            std::cerr << "Result verification failed at element " << i << std::endl;
            ok = false;
        }
    }

    if (!ok) {
        return EXIT_FAILURE;
    }

    std::cout << "Done" << std::endl;
    return 0;
}
4. 编译程序（CMake）
当然也可以直接用 nvcc 编译，例如：nvcc -o cuda_app 1.cu
# The CUDA_ARCHITECTURES target property used below requires CMake 3.18+.
cmake_minimum_required(VERSION 3.18)
project(MyCudaApp LANGUAGES CXX CUDA)

# Build the executable from the CUDA source file.
add_executable(cuda_app 1.cu)

# Target GPU architectures: native code for compute capability 7.5 plus PTX
# for compute capability 8.0.
# The legacy CUDA_NVCC_FLAGS variable is only read by the FindCUDA module
# (cuda_add_executable) and has no effect on targets built through
# project(... LANGUAGES CUDA), so the architectures are set on the target.
set_target_properties(cuda_app PROPERTIES CUDA_ARCHITECTURES "75-real;80-virtual")