不同种类存储的读取速度
将变量设置为局部变量, 编译器会将其放入寄存器中, 可以省去大量的内存读写操作.
GPU 寄存器实现位包装
// Pack KERNEL_LOOP boolean flags into one 32-bit word per thread.
// The accumulator is a local variable, so it can live in a register and
// the packing loop performs no global-memory read-modify-write; only the
// final result is stored to global memory.
// Launch layout: 2-D grid of 2-D blocks, flattened to one thread per
// output element; out-of-range threads exit early.
__global__ void test_reg_kernel(Cuda32u* data, bool *packed_array, Cuda32u num_elements)
{
    const Cuda32u ix  = blockIdx.x * blockDim.x + threadIdx.x;
    const Cuda32u iy  = blockIdx.y * blockDim.y + threadIdx.y;
    const Cuda32u tid = iy * blockDim.x * gridDim.x + ix;

    if (tid >= num_elements)
        return;

    Cuda32u packed_bits = 0;  // register-resident accumulator
    for (int bit = 0; bit < KERNEL_LOOP; bit++)
    {
        packed_bits |= (packed_array[bit] << bit);
    }
    data[tid] = packed_bits;  // single global store per thread
}
// Host wrapper: launches the register-accumulator bit-packing kernel.
// Grid is fixed at 256x256 blocks of 8x8 threads (4,194,304 threads),
// sized for the 2048*2048-element test sequence; the kernel's bounds
// check rejects any surplus threads.
void test_reg(Cuda32u* d_puData, bool *d_pbPackArray, Cuda32u uArrayLen)
{
    dim3 thread_rect(8, 8);
    dim3 block_rect(256, 256);
    test_reg_kernel<<<block_rect, thread_rect>>>(d_puData, d_pbPackArray, uArrayLen);
    // A kernel launch returns no status directly; surface configuration
    // errors (bad grid/block dimensions, etc.) immediately instead of
    // letting them appear at an unrelated later call.
    checkCudaErrors(cudaGetLastError());
}
GPU 全局内存实现位包装
// Accumulator placed in global device memory -- the point of this test:
// every |= in the loop below becomes a global read-modify-write instead
// of a register operation.
// NOTE(review): d_tmp is shared by ALL threads and updated without
// atomics, which is a data race. The final bit pattern still converges
// because every thread ORs in the same bits, but an individual
// data[tid] store may observe a partially-accumulated value -- confirm
// this is acceptable for the benchmark's checksum.
__device__ static Cuda32u d_tmp = 0;
// Same bit packing as test_reg_kernel, but accumulating into the global
// d_tmp above rather than a local (register) variable, to measure the
// cost of global-memory traffic.
// Launch layout: 2-D grid of 2-D blocks, one output element per thread.
__global__ void test_gmem_kernel(Cuda32u* data, bool *packed_array, Cuda32u num_elements)
{
Cuda32u idx = blockIdx.x*blockDim.x + threadIdx.x;
Cuda32u idy = blockIdx.y*blockDim.y + threadIdx.y;
Cuda32u tid = idy*blockDim.x*gridDim.x + idx;
// 1-D flattened alternative kept from the original for reference:
//Cuda32u tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid < num_elements)
{
for (int i = 0; i < KERNEL_LOOP; i++)
{
// Global read-modify-write on every iteration (the measured cost).
d_tmp |= (packed_array[i] << i);
}
data[tid] = d_tmp;
}
}
// Host wrapper: launches the global-memory-accumulator bit-packing
// kernel with the same fixed 256x256 x 8x8 launch configuration as
// test_reg, so the two variants are directly comparable.
void test_gmem(Cuda32u* d_puData, bool *d_pbPackArray, Cuda32u uArrayLen)
{
    dim3 thread_rect(8, 8);
    dim3 block_rect(256, 256);
    test_gmem_kernel<<<block_rect, thread_rect>>>(d_puData, d_pbPackArray, uArrayLen);
    // Kernel launches return no status directly; catch configuration
    // errors here rather than at a later, unrelated CUDA call.
    checkCudaErrors(cudaGetLastError());
}
测试
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_20_atomic_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "helper_cuda.h"
#include "timer.h"
#include <ctime>
#include "Global.h"
#include "RegisterTest.h"
#include "CalHist.h"
// Benchmark driver: times bit packing with a register-resident
// accumulator (test_reg) against one held in global device memory
// (test_gmem), averaging each variant over iIterNum launches with CUDA
// events, then prints a checksum and the mean time per launch.
// Depends on KERNEL_LOOP (Global.h) and the wrappers from RegisterTest.h.
int main()
{
// Device output buffer: one 32-bit packed word per element, zeroed.
Cuda32u* d_puData = NULL;
Cuda32u uArrayLen = 2048*2048;
checkCudaErrors(cudaMalloc((void**)&d_puData, uArrayLen*sizeof(Cuda32u)));
checkCudaErrors(cudaMemset((void*)d_puData, 0, uArrayLen*sizeof(Cuda32u)));
// Host flag array: every byte set to 1, i.e. all KERNEL_LOOP flags true.
bool *pbPackArray = (bool*)malloc(KERNEL_LOOP*sizeof(bool));
memset((void*)pbPackArray, 1, KERNEL_LOOP*sizeof(bool));
// Copy the flag array host -> device.
bool *d_pbPackArray = NULL;
checkCudaErrors(cudaMalloc((void**)&d_pbPackArray, KERNEL_LOOP*sizeof(bool)));
checkCudaErrors(cudaMemcpy((void*)d_pbPackArray, (void*)pbPackArray,
KERNEL_LOOP*sizeof(bool), cudaMemcpyHostToDevice));
// Timing setup: average over iIterNum launches using CUDA events.
Cuda32u iIterNum = 10;
cudaEvent_t start, stop;
Cuda32f elapsedTime = 0.0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// Warm-up so driver/context initialization is not included in timings.
cudaWarmUp();
// Start timing (event recorded on the default stream).
cudaEventRecord(start, 0);
// GPU processing: register-accumulator variant.
for (Cuda32u i = 0; i < iIterNum; i++)
{
// Reset between iterations (disabled: every launch overwrites each slot).
//checkCudaErrors(cudaMemset((void*)d_puData, 0, uArrayLen*sizeof(Cuda32u)));
// Bit packing in GPU registers.
test_reg(d_puData, d_pbPackArray, uArrayLen);
}
// Stop timing; cudaEventSynchronize blocks until all launches complete.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
// Copy results device -> host.
Cuda32u* h_puData = (Cuda32u*)malloc(uArrayLen*sizeof(Cuda32u));
checkCudaErrors(cudaMemcpy((void*)h_puData, (void*)d_puData,
uArrayLen*sizeof(Cuda32u), cudaMemcpyDeviceToHost));
// Sum the results as a sanity checksum and report.
// NOTE(review): summing 2048*2048 packed words into a 32-bit Cuda32u is
// expected to wrap around; it is only compared between the two
// variants, not used as a true count -- confirm that is the intent.
Cuda32u iSumData = 0;
for (Cuda32u i = 0; i < uArrayLen; i++)
{
iSumData += h_puData[i];
}
// NOTE(review): %d prints unsigned Cuda32u values as signed; %u would
// be the matching conversion.
printf("\n%%%%%%%%%%%%%% GPU 寄存器 位包装:%%%%%%%%%%%%%%\n");
printf("序列长度 = %d\n", uArrayLen);
printf("重复次数 = %d\n", iIterNum);
printf("序列求和 = %d\n", iSumData);
printf("平均用时 = %f ms\n", elapsedTime / (Cuda32u)iIterNum);
printf("%%%%%%%%%%%%%% GPU 寄存器 位包装:%%%%%%%%%%%%%%\n\n");
// Start timing the second variant (events are reused).
cudaEventRecord(start, 0);
// GPU processing: global-memory-accumulator variant.
for (Cuda32u i = 0; i < iIterNum; i++)
{
// Reset between iterations (disabled: every launch overwrites each slot).
//checkCudaErrors(cudaMemset((void*)d_puData, 0, uArrayLen*sizeof(Cuda32u)));
// Bit packing through global memory.
test_gmem(d_puData, d_pbPackArray, uArrayLen);
}
// Stop timing and compute the elapsed time for this variant.
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
// Copy results device -> host (buffer reused from the first variant).
//Cuda32u* h_puData = (Cuda32u*)malloc(uArrayLen*sizeof(Cuda32u));
checkCudaErrors(cudaMemcpy((void*)h_puData, (void*)d_puData,
uArrayLen*sizeof(Cuda32u), cudaMemcpyDeviceToHost));
// Checksum and report for the global-memory variant (same caveats as above).
Cuda32u iSumData2 = 0;
for (Cuda32u i = 0; i < uArrayLen; i++)
{
iSumData2 += h_puData[i];
}
printf("\n%%%%%%%%%%%%%% GPU 全局内存 位包装:%%%%%%%%%%%%%%\n");
printf("序列长度 = %d\n", uArrayLen);
printf("重复次数 = %d\n", iIterNum);
printf("序列求和 = %d\n", iSumData2);
printf("平均用时 = %f ms\n", elapsedTime / (Cuda32u)iIterNum);
printf("%%%%%%%%%%%%%% GPU 全局内存 位包装:%%%%%%%%%%%%%%\n\n");
// Release device buffers, host buffers, and timing events.
checkCudaErrors(cudaFree((void*)d_puData));
checkCudaErrors(cudaFree((void*)d_pbPackArray));
free(h_puData);
free(pbPackArray);
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
测试结果
使用寄存器比使用全局内存快很多. 节省了约75%的时间.
参考文献
- Shane Cook. CUDA Programming: A developer’s guide to parallel computing with GPUs