0_Simple__cppOverload

▶ 使用 cuda 内置结构 cudaFuncAttributes 来观察核函数的共享内存、寄存器数量

▶ 源代码

 // cppOverload_kernel.cu

 // Overload 1 of simple_kernel: a single int input array.
 // Each thread copies pIn[gid] into block-shared memory, waits at the block
 // barrier, then writes pOut[gid] = staged value * a + gid, where gid is the
 // thread's flat global index.
 // Preconditions visible from the code: there is no bounds check, so pIn and
 // pOut must hold at least gridDim.x * blockDim.x elements, and blockDim.x
 // must not exceed THREAD_N (the length of the shared array).
 __global__ void simple_kernel(const int *pIn, int *pOut, int a)
 {
     __shared__ int staged[THREAD_N];

     const int local = threadIdx.x;
     const int gid = blockDim.x * blockIdx.x + local;

     staged[local] = pIn[gid];
     __syncthreads();

     const int value = staged[local];
     pOut[gid] = a * value + gid;
 }

 // Overload 2 of simple_kernel: a single int2 input array.
 // Each thread copies pIn[gid] into block-shared memory, waits at the block
 // barrier, then writes pOut[gid] = (x + y) * a + gid using the staged pair,
 // where gid is the thread's flat global index.
 // Preconditions visible from the code: there is no bounds check, so pIn and
 // pOut must hold at least gridDim.x * blockDim.x elements, and blockDim.x
 // must not exceed THREAD_N (the length of the shared array).
 __global__ void simple_kernel(const int2 *pIn, int *pOut, int a)
 {
     __shared__ int2 staged[THREAD_N];

     const int local = threadIdx.x;
     const int gid = blockDim.x * blockIdx.x + local;

     staged[local] = pIn[gid];
     __syncthreads();

     const int2 pair = staged[local];
     pOut[gid] = a * (pair.x + pair.y) + gid;
 }

 // Overload 3 of simple_kernel: two int input arrays.
 // Each thread copies pIn1[gid] and pIn2[gid] into two block-shared arrays,
 // waits at the block barrier, then writes
 // pOut[gid] = (first + second) * a + gid, where gid is the thread's flat
 // global index.
 // Preconditions visible from the code: there is no bounds check, so pIn1,
 // pIn2 and pOut must each hold at least gridDim.x * blockDim.x elements, and
 // blockDim.x must not exceed THREAD_N (the length of each shared array).
 __global__ void simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a)
 {
     __shared__ int stagedFirst[THREAD_N];
     __shared__ int stagedSecond[THREAD_N];

     const int local = threadIdx.x;
     const int gid = blockDim.x * blockIdx.x + local;

     stagedFirst[local] = pIn1[gid];
     stagedSecond[local] = pIn2[gid];
     __syncthreads();

     const int sum = stagedFirst[local] + stagedSecond[local];
     pOut[gid] = a * sum + gid;
 }

 // cppOverload.cu

 #include <stdio.h>

 #include <helper_cuda.h>

 #include <helper_math.h>

 #include <helper_string.h>

 // Threads per block used for every launch; also the length of the kernels'
 // static __shared__ arrays.
 #define THREAD_N            256

 #include "cppOverload_kernel.cu"                                            // the kernel source uses THREAD_N, so it must be defined before this include

 // Number of output elements produced and verified per kernel launch.
 #define N                   1024

 // Integer ceiling division: rounds a / b up when a and b are positive integers.
 #define DIV_UP(a, b)        (((a) + (b) - 1) / (b))

 // Prints the fields of a cudaFuncAttributes value (shared/constant/local
 // memory sizes, max threads per block, register count, PTX and binary
 // versions) to stdout, one per line.
 // The body is wrapped in do { ... } while (0) so the macro expands to a
 // single statement and stays correct under an unbraced if/else; the argument
 // is parenthesised so any lvalue expression can be passed.
 // The size_t fields are narrowed to int to match the %d conversions.
 #define OUTPUT_ATTR(attr)                                                     \
     do {                                                                      \
         printf("Shared Size:           %d\n", (int)(attr).sharedSizeBytes);   \
         printf("Constant Size:         %d\n", (int)(attr).constSizeBytes);    \
         printf("Local Size:            %d\n", (int)(attr).localSizeBytes);    \
         printf("Max Threads Per Block: %d\n", (attr).maxThreadsPerBlock);     \
         printf("Number of Registers:   %d\n", (attr).numRegs);                \
         printf("PTX Version:           %d\n", (attr).ptxVersion);             \
         printf("Binary Version:        %d\n", (attr).binaryVersion);          \
     } while (0)

 // Host-side reference check for the (const int *, int *, int) kernel overload.
 // Returns true when hOutput[i] == hInput[i] * a + i for every i in [0, N),
 // and false at the first mismatch.
 // Both arrays must hold at least N elements.
 bool check_func1(int *hInput, int *hOutput, int a)
 {
     for (int i = 0; i < N; ++i)
     {
         int cpuRes = hInput[i] * a + i;
         if (hOutput[i] != cpuRes)
             return false;
     }
     return true;
 }

 // Host-side reference check for the (const int2 *, int *, int) kernel overload.
 // Returns true when hOutput[i] == (hInput[i].x + hInput[i].y) * a + i for
 // every i in [0, N), and false at the first mismatch.
 // Both arrays must hold at least N elements.
 bool check_func2(int2 *hInput, int *hOutput, int a)
 {
     for (int i = 0; i < N; i++)
     {
         int cpuRes = (hInput[i].x + hInput[i].y) * a + i;
         if (hOutput[i] != cpuRes)
             return false;
     }
     return true;
 }

 // Host-side reference check for the (const int *, const int *, int *, int)
 // kernel overload.
 // Returns true when hOutput[i] == (hInput1[i] + hInput2[i]) * a + i for every
 // i in [0, N), and false at the first mismatch.
 // All three arrays must hold at least N elements.
 bool check_func3(int *hInput1, int *hInput2, int *hOutput, int a)
 {
     for (int i = 0; i < N; i++)
     {
         if (hOutput[i] != (hInput1[i] + hInput2[i]) * a + i)
             return false;
     }
     return true;
 }

 // Entry point of the sample.
 // For each of the three simple_kernel overloads it:
 //   1. selects the overload through an explicitly typed function pointer,
 //   2. sets its cache preference and prints its cudaFuncAttributes,
 //   3. launches it over N elements and compares the result with the matching
 //      host-side check_func*.
 // Every CUDA runtime call is wrapped in checkCudaErrors and every launch is
 // followed by getLastCudaError (both from helper_cuda.h), so a failure is
 // reported where it happens instead of surfacing later as a wrong result.
 // Returns 0 when all three checks pass, 1 otherwise.
 int main(int argc, const char *argv[])
 {
     checkCudaErrors(cudaSetDevice(0));

     int *hInput = NULL, *hOutput = NULL, *dInput = NULL, *dOutput = NULL;

     // The input buffers hold 2 * N ints: the int2 overload reads N int2
     // elements, and the two-input overload reads N ints from dInput and N
     // more from dInput + N.
     checkCudaErrors(cudaMalloc(&dInput, sizeof(int) * N * 2));
     checkCudaErrors(cudaMalloc(&dOutput, sizeof(int) * N));
     checkCudaErrors(cudaMallocHost(&hInput, sizeof(int) * N * 2));
     checkCudaErrors(cudaMallocHost(&hOutput, sizeof(int) * N));

     for (int i = 0; i < N * 2; i++)
         hInput[i] = i;
     checkCudaErrors(cudaMemcpy(dInput, hInput, sizeof(int) * N * 2, cudaMemcpyHostToDevice));

     // Multiplier passed to every kernel and to the host reference checks.
     const int a = 2;

     // The pointer types pick one overload each; the bare name simple_kernel
     // would be ambiguous on its own.
     void(*func1)(const int *, int *, int) = simple_kernel;
     void(*func2)(const int2 *, int *, int) = simple_kernel;
     void(*func3)(const int *, const int *, int *, int) = simple_kernel;

     struct cudaFuncAttributes attr;
     bool allPassed = true;
     bool passed = false;

     // function 1
     memset(&attr, 0, sizeof(attr));
     checkCudaErrors(cudaFuncSetCacheConfig(*func1, cudaFuncCachePreferShared));     // inspect resource usage before launching
     checkCudaErrors(cudaFuncGetAttributes(&attr, *func1));
     OUTPUT_ATTR(attr);
     (*func1)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dOutput, a);
     getLastCudaError("simple_kernel(const int *pIn, int *pOut, int a) launch failed");
     checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
     passed = check_func1(hInput, hOutput, a);
     allPassed = allPassed && passed;
     printf("simple_kernel(const int *pIn, int *pOut, int a) %s\n\n", passed ? "PASSED" : "FAILED");

     // function 2
     memset(&attr, 0, sizeof(attr));
     checkCudaErrors(cudaFuncSetCacheConfig(*func2, cudaFuncCachePreferShared));
     checkCudaErrors(cudaFuncGetAttributes(&attr, *func2));
     OUTPUT_ATTR(attr);
     // The int buffer is reinterpreted as int2; the cudaMalloc allocation is
     // suitably aligned for that.
     (*func2)<<<DIV_UP(N, THREAD_N), THREAD_N>>>((int2 *)dInput, dOutput, a);
     getLastCudaError("simple_kernel(const int2 *pIn, int *pOut, int a) launch failed");
     checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
     passed = check_func2(reinterpret_cast<int2 *>(hInput), hOutput, a);
     allPassed = allPassed && passed;
     printf("simple_kernel(const int2 *pIn, int *pOut, int a) %s\n\n", passed ? "PASSED" : "FAILED");

     // function 3
     memset(&attr, 0, sizeof(attr));
     checkCudaErrors(cudaFuncSetCacheConfig(*func3, cudaFuncCachePreferShared));
     checkCudaErrors(cudaFuncGetAttributes(&attr, *func3));
     OUTPUT_ATTR(attr);
     (*func3)<<<DIV_UP(N, THREAD_N), THREAD_N>>>(dInput, dInput + N, dOutput, a);
     getLastCudaError("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) launch failed");
     checkCudaErrors(cudaDeviceSynchronize());
     checkCudaErrors(cudaMemcpy(hOutput, dOutput, sizeof(int) * N, cudaMemcpyDeviceToHost));
     passed = check_func3(hInput, hInput + N, hOutput, a);
     allPassed = allPassed && passed;
     printf("simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) %s\n\n", passed ? "PASSED" : "FAILED");

     checkCudaErrors(cudaFree(dInput));
     checkCudaErrors(cudaFree(dOutput));
     checkCudaErrors(cudaFreeHost(hOutput));
     checkCudaErrors(cudaFreeHost(hInput));

     // Wait for a key press so the console output stays visible.
     getchar();
     return allPassed ? 0 : 1;
 }

● 输出结果：

Shared Size:

Constant Size:

Local Size:

Max Threads Per Block:

Number of Registers:

PTX Version:

Binary Version:

simple_kernel(const int *pIn, int *pOut, int a) PASSED

Shared Size:

Constant Size:

Local Size:

Max Threads Per Block:

Number of Registers:

PTX Version:

Binary Version:

simple_kernel(const int2 *pIn, int *pOut, int a) PASSED

Shared Size:

Constant Size:

Local Size:

Max Threads Per Block:

Number of Registers:

PTX Version:

Binary Version:

simple_kernel(const int *pIn1, const int *pIn2, int *pOut, int a) PASSED

▶ 涨姿势：

● cuda 使用扩展名为 .cuh 的头文件

● cuda内置结构 cudaFuncAttributes 的定义：

 // Attribute record filled in by cudaFuncGetAttributes for one kernel.
 struct __device_builtin__ cudaFuncAttributes
 {
     size_t sharedSizeBytes; // shared memory size
     size_t constSizeBytes;  // constant memory size
     size_t localSizeBytes;  // local memory size
     int maxThreadsPerBlock; // maximum number of threads per block
     int numRegs;            // number of registers
     int ptxVersion;         // PTX version
     int binaryVersion;      // binary (machine code) version
     int cacheModeCA;        // whether compiled with -Xptxas --dlcm=ca
 };

● 通过使用cuda的内置结构和函数来查看核函数使用的共享内存与寄存器数量

 // Zero the attribute record, set the kernel's cache preference, then query
 // the kernel's attributes into it. `function` stands for a pointer to the
 // kernel being inspected.
 struct cudaFuncAttributes attr;
 memset(&attr, 0, sizeof(attr));
 cudaFuncSetCacheConfig(*function, cudaFuncCachePreferShared);
 cudaFuncGetAttributes(&attr, *function);

■ 涉及的函数

 // Host-side declaration: takes a pointer to a function and a cudaFuncCache
 // value, and returns a cudaError_t status.
 extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);

 // Device-side weak definition of cudaFuncGetAttributes; this body ignores
 // both arguments and always returns cudaErrorUnknown.
 // NOTE(review): the host code in this sample presumably resolves to a
 // separate __host__ overload declared in the runtime API headers, not to this
 // stub - confirm against cuda_runtime_api.h.
 __device__ __attribute__((nv_weak)) cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c)

 {

     return cudaErrorUnknown;

 }

 // Prints the fields of a cudaFuncAttributes value to stdout, one per line.
 // The body is wrapped in do { ... } while (0) so the macro expands to a
 // single statement and stays correct under an unbraced if/else; the argument
 // is parenthesised so any lvalue expression can be passed.
 // The size_t fields are narrowed to int to match the %d conversions.
 #define OUTPUT_ATTR(attr)                                                 \
     do {                                                                  \
         printf("Shared Size:   %d\n", (int)(attr).sharedSizeBytes);       \
         printf("Constant Size: %d\n", (int)(attr).constSizeBytes);        \
         printf("Local Size:    %d\n", (int)(attr).localSizeBytes);        \
         printf("Max Threads Per Block: %d\n", (attr).maxThreadsPerBlock); \
         printf("Number of Registers: %d\n", (attr).numRegs);              \
         printf("PTX Version: %d\n", (attr).ptxVersion);                   \
         printf("Binary Version: %d\n", (attr).binaryVersion);             \
     } while (0)

秒客网

0_Simple__cppOverload

相关文章