OpenCL 并行求和

时间:2023-03-08 16:46:44
OpenCL 并行求和

上周尝试用 OpenCL 求极大值,在网上查到的例子大多是求和,也就是所谓的 reduction(归约)算法。不过两者思路是一样的:把归约时的加法换成取较大值,就得到求极大值的版本。

CPP:

   int err = ;
unsigned long int nNumCount = ;
int nLocalSize = ;
int nGroupSize = ;
int nGroup = nGroupSize / nLocalSize; int* pArray = new int[nNumCount];
unsigned long int nReal = ;
int nStart = GetTickCount();
for (int i=;i<nNumCount;++i)
{
pArray[i] = i*;
nReal += pArray[i];
}
cout<<GetTickCount() - nStart<<endl; cl_mem clmemArray = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nNumCount,NULL,NULL);
err = clEnqueueWriteBuffer(queue,clmemArray,CL_TRUE,,sizeof(int)*nNumCount,pArray,,,);
cl_mem clmemRes = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nGroup,NULL,NULL); nStart = GetTickCount(); err = clSetKernelArg(m_KerCalcRay,,sizeof(cl_mem),&clmemArray);
err = clSetKernelArg(m_KerCalcRay,,sizeof(cl_mem),&clmemRes);
err = clSetKernelArg(m_KerCalcRay,,sizeof(int)*nLocalSize,);
err = clSetKernelArg(m_KerCalcRay,,sizeof(int),&nNumCount); size_t localws[] = {nLocalSize};
size_t globalws[] = {nGroupSize}; err = clEnqueueNDRangeKernel(queue,m_KerCalcRay,,NULL,globalws,localws,,NULL,NULL);
clFinish(queue); int* pRes = new int[nGroup];
err = clEnqueueReadBuffer(queue,clmemRes,CL_TRUE,,sizeof(int)*nGroup,pRes,,,);
clFinish(queue); unsigned long int nRes = ;
for(int i=;i<nGroup;++i)
{
nRes += pRes[i];
}
  assert(nRes == nReal);

kernel:

// ReduceSum: adds up the first nCount elements of `num` and stores one partial
// sum per work-group in `res`; the caller adds the partial sums together.
//
//   num    - input values, at least nCount elements
//   res    - output, one int per work-group, indexed by get_group_id(0)
//   pData  - __local scratch buffer holding one int per work-item of the group
//   nCount - number of valid elements in `num`; values <= 0 yield all-zero sums
//
// The halving loop pairs slot i with slot i + s, so the work-group size must
// be a power of two; with any other size some slots are never folded in.
// All sums are accumulated in `int` without overflow checks, so the caller
// must keep the total within int range.
//
// NOTE(review): the published listing had lost its numeric literals; the
// dimension index 0 and the shift/compare constants below were restored from
// the algorithm the article describes.
__kernel void ReduceSum(__global int* num,__global int* res,__local int* pData,int nCount)
{
    unsigned int tid = get_local_id(0);
    unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);
    unsigned int globalSize = get_global_size(0);

    // Compare in unsigned space explicitly; a negative nCount would otherwise
    // be converted to a huge unsigned value and read far past the buffer.
    unsigned int count = (nCount > 0) ? (unsigned int)nCount : 0u;

    // Each work-item sums the elements gid, gid + globalSize, gid + 2*globalSize, ...
    int nRes = 0;
    while (gid < count)
    {
        nRes += num[gid];
        gid += globalSize;
    }
    pData[tid] = nRes;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Reduce within the group: halve the active range each pass.
    for (unsigned int s = localSize >> 1; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            pData[tid] += pData[tid + s];
        }
        // Every work-item must reach the barrier, so it stays outside the `if`.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Work-item 0 publishes the group's partial sum.
    if (tid == 0)
    {
        res[bid] = pData[0];
    }
}

Reduction 求和是这样一种方法:比如 8 个数依次存放在下标 0 到 7,第一轮把下标 0 和 4、1 和 5、2 和 6、3 和 7 分别相加,结果放到下标 0、1、2、3 中(然后用 barrier(CLK_LOCAL_MEM_FENCE) 同步一次)。第二轮把 0 和 2、1 和 3 相加,结果放到下标 0、1 中。最后一轮把 0 和 1 相加,如此往复,最终结果就放到下标 0 中啦。

另:我试过用循环展开来减少同步次数,不过效率增长微乎其微。