A previous question asked how to find to find the maximum value of an array in CUDA efficiently: Finding max value in CUDA, the top response provided a link to a NVIDIA presentation on optimizing reduction kernels.
之前的一个问题是如何在CUDA中找到一个数组的最大值:在CUDA中找到最大值,顶部响应提供了一个关于优化还原内核的NVIDIA演示的链接。
If you are using Visual Studio, simply remove the header reference, and everything between CPU EXECUTION.
如果您正在使用Visual Studio,只需删除头引用,以及CPU执行之间的所有内容。
I setup a variant which found the max, but it doesn't match what the CPU is finding:
我设置了一个找到最大值的变量,但它与CPU的发现不匹配:
// Returns the maximum value of
// an array of size n
float GetMax(float *maxes, int n)
{
int i = 0;
float max = -100000;
for(i = 0; i < n; i++)
{
if(maxes[i] > max)
max = maxes[i];
}
return max;
}
// Too obvious...
__device__ float MaxOf2(float a, float b)
{
if(a > b) return a;
else return b;
}
__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(BLOCKSIZE*2) + tid;
unsigned int gridSize = BLOCKSIZE*2*gridDim.x;
sdata[tid] = 0;
//MMX(index,i)
//MMX(index,i+blockSize)
// Final Optimized Kernel
while (i < n) {
sdata[tid] = MaxOf2(g_idata[i], g_idata[i+BLOCKSIZE]);
i += gridSize;
}
__syncthreads();
if (BLOCKSIZE >= 512) { if (tid < 256) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 256]); } __syncthreads(); }
if (BLOCKSIZE >= 256) { if (tid < 128) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 128]); } __syncthreads(); }
if (BLOCKSIZE >= 128) { if (tid < 64) { sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 64]); } __syncthreads(); }
if (tid < 32) {
if (BLOCKSIZE >= 64) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 32]);
if (BLOCKSIZE >= 32) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
if (BLOCKSIZE >= 16 ) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
if (BLOCKSIZE >= 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
if (BLOCKSIZE >= 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
if (BLOCKSIZE >= 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 1]);
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
I have a giant setup to test this algorithm:
我有一个巨大的设置来测试这个算法:
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "book.h"
#define ARRAYSIZE 16384
#define GRIDSIZE 60
#define BLOCKSIZE 32
#define SIZEFLOAT 4
using namespace std;
// Function definitions
float GetMax(float *maxes, int n);
__device__ float MaxOf2(float a, float b);
__global__ void MaxReduction(int n, float *g_idata, float *g_odata);
// Returns random floating point number
float RandomReal(float low, float high)
{
float d;
d = (float) rand() / ((float) RAND_MAX + 1);
return (low + d * (high - low));
}
int main()
{
/*****************VARIABLE SETUP*****************/
// Pointer to CPU numbers
float *numbers;
// Pointer to GPU numbers
float *dev_numbers;
// Counter
int i = 0;
// Randomize
srand(time(0));
// Timers
// Kernel timers
cudaEvent_t start_kernel, stop_kernel;
float elapsedTime_kernel;
HANDLE_ERROR(cudaEventCreate(&start_kernel));
HANDLE_ERROR(cudaEventCreate(&stop_kernel));
// cudaMalloc timers
cudaEvent_t start_malloc, stop_malloc;
float elapsedTime_malloc;
HANDLE_ERROR(cudaEventCreate(&start_malloc));
HANDLE_ERROR(cudaEventCreate(&stop_malloc));
// CPU timers
struct timeval start, stop;
float elapsedTime = 0;
/*****************VARIABLE SETUP*****************/
/*****************CPU ARRAY SETUP*****************/
// Setup CPU array
HANDLE_ERROR(cudaHostAlloc((void**)&numbers, ARRAYSIZE * sizeof(float), cudaHostAllocDefault));
for(i = 0; i < ARRAYSIZE; i++)
numbers[i] = RandomReal(0, 50000.0);
/*****************CPU ARRAY SETUP*****************/
/*****************GPU ARRAY SETUP*****************/
// Start recording cuda malloc time
HANDLE_ERROR(cudaEventRecord(start_malloc,0));
// Allocate memory to GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_numbers, ARRAYSIZE * sizeof(float)));
// Transfer CPU array to GPU
HANDLE_ERROR(cudaMemcpy(dev_numbers, numbers, ARRAYSIZE*sizeof(float), cudaMemcpyHostToDevice));
// An array to temporarily store maximum values on the GPU
float *dev_max;
HANDLE_ERROR(cudaMalloc((void**)&dev_max, GRIDSIZE * sizeof(float)));
// An array to hold grab the GPU max
float maxes[GRIDSIZE];
/*****************GPU ARRAY SETUP*****************/
/*****************KERNEL EXECUTION*****************/
// Start recording kernel execution time
HANDLE_ERROR(cudaEventRecord(start_kernel,0));
// Run kernel
MaxReduction<<<GRIDSIZE, BLOCKSIZE, SIZEFLOAT*BLOCKSIZE>>> (ARRAYSIZE, dev_numbers, dev_max);
// Transfer maxes over
HANDLE_ERROR(cudaMemcpy(maxes, dev_max, GRIDSIZE * sizeof(float), cudaMemcpyDeviceToHost));
// Print out the max
cout << GetMax(maxes, GRIDSIZE) << endl;
// Stop recording kernel execution time
HANDLE_ERROR(cudaEventRecord(stop_kernel,0));
HANDLE_ERROR(cudaEventSynchronize(stop_kernel));
// Retrieve recording data
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_kernel, start_kernel, stop_kernel));
// Stop recording cuda malloc time
HANDLE_ERROR(cudaEventRecord(stop_malloc,0));
HANDLE_ERROR(cudaEventSynchronize(stop_malloc));
// Retrieve recording data
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime_malloc, start_malloc, stop_malloc));
// Print results
printf("%5.3f\t%5.3f\n", elapsedTime_kernel, elapsedTime_malloc);
/*****************KERNEL EXECUTION*****************/
/*****************CPU EXECUTION*****************/
// Capture the start time
gettimeofday(&start, NULL);
// Call generic P7Viterbi function
cout << GetMax(numbers, ARRAYSIZE) << endl;
// Capture the stop time
gettimeofday(&stop, NULL);
// Retrieve time elapsed in milliseconds
long int elapsed_sec = stop.tv_sec - start.tv_sec;
long int elapsed_usec = stop.tv_usec - start.tv_usec;
elapsedTime = (float)(1000.0f * elapsed_sec) + (float)(0.001f * elapsed_usec);
// Print results
printf("%5.3f\n", elapsedTime);
/*****************CPU EXECUTION*****************/
// Free memory
cudaFreeHost(numbers);
cudaFree(dev_numbers);
cudaFree(dev_max);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_malloc);
cudaEventDestroy(stop_malloc);
// Exit program
return 0;
}
I ran cuda-memcheck on this test program, with -g & -G switches on, and it reports 0 problems. Can anyone spot the issue?
我在这个测试程序上运行cuda-memcheck,使用-g和-g开关,它报告0个问题。有人能发现这个问题吗?
NOTE: Be sure to have book.h from the CUDA by Example book in your current directory when you compile the program. Source link here: http://developer.nvidia.com/cuda-example-introduction-general-purpose-gpu-programming Download the source code, and book.h will be under the common directory/folder.
注意:一定要有书。当你编译程序时,从CUDA的CUDA by Example book到当前目录。这里的源代码链接:http://developer.nvidia.com/cuda-example-介绍性-通用-gpu-编程下载源代码和书籍。h将在公用目录/文件夹下。
1 个解决方案
#1
5
Your kernel looks broken to me. The thread local search (before the shared memory reduction), should look something like this:
你的内核好像坏了。线程本地搜索(在共享内存减少之前)应该是这样的:
sdata[tid] = g_idata[i];
i += gridSize;
while (i < n) {
sdata[tid] = MaxOf2(sdata[tid], g_idata[i]);
i += gridSize;
}
shouldn't it?
不应该吗?
Also note that if you run this on Fermi, the shared memory buffer should be declared volatile, and you will get a noticeable improvement in performance if the thread local search is done with a register variable, rather than in shared memory. There is about an 8 times difference in effective bandwidth between the two.
还要注意的是,如果在Fermi上运行这个,共享内存缓冲区应该被声明为volatile,如果线程本地搜索使用寄存器变量而不是共享内存,那么性能将得到显著提高。两者之间有效带宽的差异大约是8倍。
EDIT: Here is a simplified, working version of your reduction kernel. You should note a number of differences compared to your original:
编辑:这里有一个简化的,工作版本的还原内核。你应该注意到一些与你原来的不同之处:
__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
extern __shared__ volatile float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(BLOCKSIZE) + tid;
unsigned int gridSize = BLOCKSIZE*gridDim.x;
float val = g_idata[i];
i += gridSize;
while (i < n) {
val = MaxOf2(g_idata[i],val);
i += gridSize;
}
sdata[tid] = val;
__syncthreads();
// This versions uses a single warp for the shared memory
// reduction
# pragma unroll
for(int i=(tid+32); ((tid<32)&&(i<BLOCKSIZE)); i+=32)
sdata[tid] = MaxOf2(sdata[tid], sdata[i]);
if (tid < 16) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
if (tid < 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
if (tid < 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
if (tid < 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
if (tid == 0) g_odata[blockIdx.x] = MaxOf2(sdata[tid], sdata[tid + 1]);
}
This code should also be safe on Fermi. You should also familiarise yourself with the CUDA math library, because there is a fmax(x,y)
intrinsic which you should use in place of your MaxOf2
function.
这段代码在费米上也应该是安全的。您还应该熟悉CUDA数学库,因为有一个fmax(x,y)的固有特性,您应该使用它来代替MaxOf2函数。
#1
5
Your kernel looks broken to me. The thread local search (before the shared memory reduction), should look something like this:
你的内核好像坏了。线程本地搜索(在共享内存减少之前)应该是这样的:
sdata[tid] = g_idata[i];
i += gridSize;
while (i < n) {
sdata[tid] = MaxOf2(sdata[tid], g_idata[i]);
i += gridSize;
}
shouldn't it?
不应该吗?
Also note that if you run this on Fermi, the shared memory buffer should be declared volatile, and you will get a noticeable improvement in performance if the thread local search is done with a register variable, rather than in shared memory. There is about an 8 times difference in effective bandwidth between the two.
还要注意的是,如果在Fermi上运行这个,共享内存缓冲区应该被声明为volatile,如果线程本地搜索使用寄存器变量而不是共享内存,那么性能将得到显著提高。两者之间有效带宽的差异大约是8倍。
EDIT: Here is a simplified, working version of your reduction kernel. You should note a number of differences compared to your original:
编辑:这里有一个简化的,工作版本的还原内核。你应该注意到一些与你原来的不同之处:
__global__ void MaxReduction(int n, float *g_idata, float *g_odata)
{
extern __shared__ volatile float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*(BLOCKSIZE) + tid;
unsigned int gridSize = BLOCKSIZE*gridDim.x;
float val = g_idata[i];
i += gridSize;
while (i < n) {
val = MaxOf2(g_idata[i],val);
i += gridSize;
}
sdata[tid] = val;
__syncthreads();
// This versions uses a single warp for the shared memory
// reduction
# pragma unroll
for(int i=(tid+32); ((tid<32)&&(i<BLOCKSIZE)); i+=32)
sdata[tid] = MaxOf2(sdata[tid], sdata[i]);
if (tid < 16) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 16]);
if (tid < 8) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 8]);
if (tid < 4) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 4]);
if (tid < 2) sdata[tid] = MaxOf2(sdata[tid], sdata[tid + 2]);
if (tid == 0) g_odata[blockIdx.x] = MaxOf2(sdata[tid], sdata[tid + 1]);
}
This code should also be safe on Fermi. You should also familiarise yourself with the CUDA math library, because there is a fmax(x,y)
intrinsic which you should use in place of your MaxOf2
function.
这段代码在费米上也应该是安全的。您还应该熟悉CUDA数学库,因为有一个fmax(x,y)的固有特性,您应该使用它来代替MaxOf2函数。