0_Simple__matrixMulDrv

时间:2023-03-08 17:19:18

使用CUDA的 Driver API 来计算矩阵乘法。

▶ 源代码:

 #include <stdio.h>

 #include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h>
#include "matrixMul.h"

#define PTX_FILE "matrixMul_kernel64.ptx"
#define CUBIN_FILE "matrixMul_kernel64.cubin"

const bool use_64bit_memory_address = true;

using namespace std;

CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;

// Fill the first `size` elements of `data` with the constant `val`.
void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i)
        data[i] = val;
}

// Locate `module_file` on disk, searching relative to argv[0] (the executable path).
// On success the resolved path is stored in `module_path`; when the file is a .ptx,
// its entire text is additionally read into `ptx_source` so it can later be handed
// to cuModuleLoadDataEx(). Returns false when the file cannot be found or opened.
bool inline findModulePath(const char *module_file, string &module_path, char **argv, string &ptx_source)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]); // search based on the executable's location

    if (actual_path)
        module_path = actual_path;
    else
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty())
    {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != string::npos)
    {
        // Slurp the whole PTX text; +1 for the manual NUL terminator.
        FILE *fp = fopen(module_path.c_str(), "rb");

        if (fp == NULL) // fix: the original dereferenced fp without checking fopen()
        {
            printf("> findModulePath could not open: <%s> \n", module_path.c_str());
            return false;
        }

        fseek(fp, 0, SEEK_END);
        int file_size = ftell(fp);
        char *buf = new char[file_size + 1];
        fseek(fp, 0, SEEK_SET);
        fread(buf, sizeof(char), file_size, fp);
        fclose(fp);
        buf[file_size] = '\0';
        ptx_source = buf;
        delete[] buf;
    }
    return true;
}

static CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul)
{
    // Select the device, create a context, load (or JIT-compile) the kernel module,
    // and hand back the matrixMul kernel through `pMatrixMul`.
    // Any driver-API failure jumps to Error, which destroys the context and
    // returns the first failing status code.
    CUfunction cuFunction = 0; // kernel handle fetched from the module
    CUresult status;           // status of each driver call
    int major = 0, minor = 0;
    char deviceName[256];
    string module_path, ptx_source;

    // Pick the device named on the command line, otherwise the most capable one.
    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    cuDeviceGetName(deviceName, 256, cuDevice);
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    cuDeviceTotalMem(&totalGlobalMem, cuDevice); // total amount of device memory
    printf("  Total amount of global memory:     %llu bytes\n", (unsigned long long)totalGlobalMem);
    printf("  64-bit Memory Address:             %s\n",
           (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO");

    status = cuCtxCreate(&cuContext, 0, cuDevice); // create the context

    if (CUDA_SUCCESS != status)
        goto Error;

    // Look for the PTX module first ("matrixMul_kernel64.ptx"), then fall back to the cubin.
    if (!findModulePath(PTX_FILE, module_path, argv, ptx_source))
    {
        if (!findModulePath(CUBIN_FILE, module_path, argv, ptx_source))
        {
            printf("> findModulePath could not find <matrixMul_kernel> ptx or cubin\n");
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
    }
    else
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());

    if (module_path.rfind("ptx") != string::npos)
    {
        // PTX path: JIT-compile the module with explicit compilation options.
        const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void *[jitNumOptions];

        // set up size of compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // set up pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;

        // set up the maximum number of registers the kernel may use
        jitOptions[2] = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2] = (void *)(size_t)jitRegCount;

        // compile the module from the PTX source
        status = cuModuleLoadDataEx(&cuModule, ptx_source.c_str(), jitNumOptions, jitOptions, (void **)jitOptVals);

        printf("> PTX JIT log:\n%s\n", jitLogBuffer);

        // fix: these three allocations were leaked in the original
        delete[] jitOptions;
        delete[] jitOptVals;
        delete[] jitLogBuffer;
    }
    else
        status = cuModuleLoad(&cuModule, module_path.c_str());

    if (CUDA_SUCCESS != status)
        goto Error;

    // Fetch the kernel: the 64-bit variant when more than 4 GB of memory is present.
    if (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L)
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_64bit");
    else
        status = cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs32_32bit");

    if (CUDA_SUCCESS != status)
        goto Error;

    *pMatrixMul = cuFunction;
    return CUDA_SUCCESS;

Error:
    cuCtxDestroy(cuContext);
    return status;
}

void runTest(int argc, char **argv)
{
    // End-to-end driver-API demo: init device/module, allocate and fill matrices,
    // launch the kernel, verify the result, and release everything.
    int block_size = 32; // tile edge; WA/HA/... in matrixMul.h are multiples of it

    // Fetch the compiled kernel.
    CUfunction matrixMul = NULL; // CUDA kernel handle
    CUresult error_id = initCUDA(argc, argv, &matrixMul);

    if (error_id != CUDA_SUCCESS) // fix: the original ignored the init status entirely
    {
        printf("> initCUDA() failed, error = %d\n", (int)error_id);
        return;
    }

    // Host-side data preparation.
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = (float *)malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = (float *)malloc(mem_size_B);
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;
    float *h_C = (float *)malloc(mem_size_C);

    constantInit(h_A, size_A, 1.0f);  // all-ones matrix
    constantInit(h_B, size_B, 0.01f); // all-0.01 matrix

    // On 64-bit builds, first grab four 1 GB chunks that are never used for the
    // computation -- they only push the real buffers past the 4 GB boundary so the
    // 64-bit-address kernel path gets exercised.
    CUdeviceptr d_Mem[4];

    if (use_64bit_memory_address)
    {
        unsigned int mem_size = 1024 * 1024 * 1024;
        cuMemAlloc(&d_Mem[0], mem_size);
        cuMemAlloc(&d_Mem[1], mem_size);
        cuMemAlloc(&d_Mem[2], mem_size);
        cuMemAlloc(&d_Mem[3], mem_size);
    }

    CUdeviceptr d_A;
    cuMemAlloc(&d_A, mem_size_A);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, mem_size_B);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, mem_size_C);

    cuMemcpyHtoD(d_A, h_A, mem_size_A);
    cuMemcpyHtoD(d_B, h_B, mem_size_B);

    // Timing.
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    // Two equivalent ways of passing kernel arguments through the Driver API.
    if (1)
    {
        // With 64-bit addressing and enough memory, the width arguments are size_t;
        // otherwise int. The cuLaunchKernel call itself is identical either way.
        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            void *args[] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
            // CUDA 4.0 Driver API launch: arguments go through the next-to-last parameter
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size * block_size * sizeof(float), NULL, args, NULL);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            void *args[] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
            cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                           2 * block_size * block_size * sizeof(float), NULL, args, NULL);
        }
    }
    else
    {
        // Pack the same five arguments into one raw byte buffer, tracking offsets by hand,
        // in the same order as the args[] array above.
        int offset = 0;
        char argBuffer[256];

        *((CUdeviceptr *)&argBuffer[offset]) = d_C;
        offset += sizeof(d_C);
        *((CUdeviceptr *)&argBuffer[offset]) = d_A;
        offset += sizeof(d_A);
        *((CUdeviceptr *)&argBuffer[offset]) = d_B;
        offset += sizeof(d_B);

        if (use_64bit_memory_address && (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L))
        {
            size_t Matrix_Width_A = (size_t)WA;
            size_t Matrix_Width_B = (size_t)WB;
            *((size_t *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((size_t *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }
        else
        {
            int Matrix_Width_A = WA;
            int Matrix_Width_B = WB;
            *((int *)&argBuffer[offset]) = Matrix_Width_A;
            offset += sizeof(Matrix_Width_A);
            *((int *)&argBuffer[offset]) = Matrix_Width_B;
            offset += sizeof(Matrix_Width_B);
        }

        // Wrap the packed buffer and its size, terminated by CU_LAUNCH_PARAM_END.
        void *kernel_launch_config[] =
        {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
            CU_LAUNCH_PARAM_END
        };

        // CUDA 4.0 Driver API launch: arguments go through the last parameter
        cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                       2 * block_size * block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
    }

    cuMemcpyDtoH((void *)h_C, d_C, mem_size_C); // blocking copy; also synchronizes with the kernel

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Verify: A is all 1, B is all 0.01, so every element of C should be WA * 0.01.
    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < (int)(WC * HC); i++)
    {
        if (fabs(h_C[i] - (WA * 0.01f)) > 1e-5)
        {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * 0.01f);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n");

    // Cleanup.
    if (use_64bit_memory_address)
    {
        cuMemFree(d_Mem[0]);
        cuMemFree(d_Mem[1]);
        cuMemFree(d_Mem[2]);
        cuMemFree(d_Mem[3]);
    }

    free(h_A);
    free(h_B);
    free(h_C);
    cuMemFree(d_A);
    cuMemFree(d_B);
    cuMemFree(d_C);
    cuCtxDestroy(cuContext);
}

int main(int argc, char **argv)
{
    // Entry point: banner, run the demo, wait for a key so the console stays open.
    printf("[ matrixMulDrv(Driver API) ]\n");
    runTest(argc, argv);

    getchar();
    return 0;
}

▶ 输出结果:

[ matrixMulDrv (Driver API) ]
> Using CUDA Device []: GeForce GTX
> GPU Device has SM 6.1 compute capability
Total amount of global memory: bytes
64-bit Memory Address: YES
sdkFindFilePath <matrixMul_kernel64.ptx> in ./
sdkFindFilePath <matrixMul_kernel64.ptx> in ./../../bin/win64/Debug/matrixMulDrv_data_files/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./common/data/
sdkFindFilePath <matrixMul_kernel64.ptx> in ./data/
> findModulePath <./data/matrixMul_kernel64.ptx>
> initCUDA loading module: <./data/matrixMul_kernel64.ptx>
> PTX JIT log:

Processing time: 0.568077 (ms)
Checking computed result for correctness: Result = PASS

NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.

▶ 涨姿势:

● 头文件 matrixMul.h 的内容:

 #ifndef _MATRIXMUL_H_
#define _MATRIXMUL_H_ // 规定了参与计算的矩阵的维数
#define WA (4 * block_size)
#define HA (6 * block_size)
#define WB (4 * block_size)
#define HB WA
#define WC WB
#define HC HA #endif // _MATRIXMUL_H_

● C++ 中 string 类的基本使用方法

 using namespace std;

 string buf, buf2;
int n;
char *buf = new char[n];// 动态创建字符数组大小,类似malloc
buf[n - ] = '\0'; // 手动结尾补零
buf2 = buf; // 直接赋值
delete[] buf; // 删除该数组,类似 free

●  class StopWatchInterface ,定义于 helper_timer.h 中用于计时的一个类,这里只说明其使用方法,其内容在头文件随笔中详细讨论。

 StopWatchInterface *timer = NULL;   // 创建计时类指针
sdkCreateTimer(&timer); // 创建计时类
sdkStartTimer(&timer); // 开始计时 ... // 核函数运行过程 sdkStopTimer(&timer); // 停止计时
sdkGetTimerValue(&timer); // 获取时间(返回浮点类型的毫秒数)
sdkDeleteTimer(&timer); // 删除计时类

● cuda.h 中各种定义

typedef int CUdevice;                      // CUDA int 类型,用于标志设备号
typedef struct CUfunc_st *CUfunction; // CUDA 函数指针
typedef struct CUmod_st *CUmodule; // CUDA 模块指针
typedef struct CUctx_st *CUcontext; // CUDA 上下文指针
typedef enum cudaError_enum {...}CUresult; // CUDA 各种错误信息标号
typedef unsigned long long CUdeviceptr; // 无符号长长整型 CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);// 获取设备名称 CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); // 获取设备计算能力 inline CUdevice findCudaDeviceDRV(int argc, const char **argv); // 依命令行指定设备,否则选择计算能力最高的设备。内含函数调用 cuInit(0) #define cuDeviceTotalMem cuDeviceTotalMem_v2 // 获取显存大小
CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); #define cuMemAlloc cuMemAlloc_v2 // 申请显存
CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); #define cuMemFree cuMemFree_v2 // 释放显存
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); CUresult CUDAAPI cuInit(unsigned int Flags); // 重要的初始化设备参数,在创建上下文之前要先调用它,参数可以设为 0 #define cuCtxCreate cuCtxCreate_v2 // 创建上下文
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); #define cuCtxDestroy cuCtxDestroy_v2 // 销毁上下文
CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) // cudaMemcpy(cudaMemcpyHostToDevice)的别名
#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) // cudaMemcpy(cudaMemcpyDeviceToHost)的别名
#define __CUDA_API_PTDS(api) api // 从 ptx 流 image 中编译模块 module,并且包括 numOptions 个参数,参数名列表为 options,参数值列表为 optionValues
CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); // 指定路径 fname 中获取模块 module
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); // 从指定模块 hmod 中获取名为 name 的函赋给函数指针 hfunc
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);

● 代码中使用了 goto 语句,基本使用过程如下。好处是整个函数 initCUDA 的错误处理集中在一个出口,坏处是到处是跳转。

 int function()
{
CUresult status; status = cudaFunction();
if (!status == CUDA_SUCCESS)// 函数cudaFunction运行不正常
goto Error; ... // 函数运行正常 return ; // 正常结束,返回 0
Error:
return status; // 非正常结束,返回首个错误编号
}

● Driver API 的简略使用过程。本篇源代码很长,但是压缩后可以变成以下内容,方便看出该接口函数的使用过程。

 #include <stdio.h>
#include <cuda.h>
#include <builtin_types.h>
#include <helper_cuda_drvapi.h>
#include <helper_timer.h> int main()
{
// 常量
CUdevice cuDevice = ;
CUcontext cuContext;
CUmodule cuModule;
CUfunction matrixMul = NULL;
CUresult status;
char module_path[] = "./data/matrixMul_kernel64.ptx";
char ptx_source[]; // 创建上下文
cuInit();
status = cuCtxCreate(&cuContext, , cuDevice); // 获取函数
FILE *fp = fopen(module_path, "rb");
fseek(fp, , SEEK_END);
int file_size = ftell(fp);
fseek(fp, , SEEK_SET);
fread(ptx_source, sizeof(char), file_size, fp);
ptx_source[ - ] = '\0'; // 设置编译选项
const unsigned int jitNumOptions = ;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions]; // 编译日志大小
jitOptions[] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = ;
jitOptVals[] = (void *)(size_t)jitLogBufferSize; // 编译日志的指针
jitOptions[] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[] = jitLogBuffer; // 单核函数寄存器数量
jitOptions[] = CU_JIT_MAX_REGISTERS;
int jitRegCount = ;
jitOptVals[] = (void *)(size_t)jitRegCount; // 编译模块
status = cuModuleLoadDataEx(&cuModule, ptx_source, jitNumOptions, jitOptions, (void **)jitOptVals);
printf("\nPTX JIT log:\n%s\n", jitLogBuffer);
status = cuModuleGetFunction(&matrixMul, cuModule, "matrixMul_bs32_64bit"); // 数据准备工作
int block_size = ;
int wa = * block_size;
int ha = * block_size;
int wb = * block_size;
int hb = wa;
int wc = wb;
int hc = ha; unsigned int size_A = wa * ha;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = wb * hb;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
size_t size_C = wc * hc;
size_t mem_size_C = sizeof(float) * size_C;
float *h_C = (float *)malloc(mem_size_C); for (int i = ; i < size_A; ++i)
h_A[i] = 1.0f;
for (int i = ; i < size_B; ++i)
h_B[i] = 0.01f; CUdeviceptr d_A;
cuMemAlloc(&d_A, mem_size_A);
CUdeviceptr d_B;
cuMemAlloc(&d_B, mem_size_B);
CUdeviceptr d_C;
cuMemAlloc(&d_C, mem_size_C);
cuMemcpyHtoD(d_A, h_A, mem_size_A);
cuMemcpyHtoD(d_B, h_B, mem_size_B); dim3 block(block_size, block_size, );
dim3 grid(wc / block_size, hc / block_size, ); // 两种方式调用 Driver API
if ()
{
size_t Matrix_Width_A = (size_t)wa;
size_t Matrix_Width_B = (size_t)wb;
void *args[] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
// CUDA 4.0 Driver API 核函数调用,使用倒数第二个指针参数
cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
* block_size*block_size * sizeof(float), NULL, args, NULL);
}
else
{
int offset = ;
char argBuffer[];// 与上面 args 相同顺序依次填入所需的指针参数,用 offset 作偏移 *((CUdeviceptr *)&argBuffer[offset]) = d_C;
offset += sizeof(d_C);
*((CUdeviceptr *)&argBuffer[offset]) = d_A;
offset += sizeof(d_A);
*((CUdeviceptr *)&argBuffer[offset]) = d_B;
offset += sizeof(d_B);
size_t Matrix_Width_A = (size_t)wa;
size_t Matrix_Width_B = (size_t)wb;
*((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*((CUdeviceptr *)&argBuffer[offset]) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B); // 用一个 void * 来封装上面5个参数,并加上参数尺寸和一个指明参数结束的结束宏
void *kernel_launch_config[] =
{
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
CU_LAUNCH_PARAM_END
}; // CUDA 4.0 Driver API 核函数调用,使用最后一个指针参数
cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
* block_size*block_size * sizeof(float), NULL, NULL, (void **)&kernel_launch_config);
} cuMemcpyDtoH((void *)h_C, d_C, mem_size_C); //检查结果
printf("Checking computed result for correctness: ");
bool correct = true;
for (int i = ; i < (int)(wc * hc); i++)
{
if (fabs(h_C[i] - (wa * 0.01f)) > 1e-)
{
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], wa*0.01f);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); free(h_A);
free(h_B);
free(h_C);
cuMemFree(d_A);
cuMemFree(d_B);
cuMemFree(d_C);
cuCtxDestroy(cuContext); getchar();
return ;
}