OpenCL 并行求和

上周尝试用 OpenCL 求极大值，在网上查到的资料大多是求和，也就是所谓的 reduction（归约）算法，不过两者的思路是一样的。

CPP：

　　 int err = ;

    unsigned long int nNumCount = ;

    int nLocalSize = ;

    int nGroupSize = ;

    int nGroup = nGroupSize / nLocalSize;

    int* pArray = new int[nNumCount];

    unsigned long int nReal = ;

    int nStart = GetTickCount();

    for (int i=;i<nNumCount;++i)

    {

        pArray[i] = i*;

        nReal += pArray[i];

    }

    cout<<GetTickCount() - nStart<<endl;

    cl_mem clmemArray = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nNumCount,NULL,NULL);

    err = clEnqueueWriteBuffer(queue,clmemArray,CL_TRUE,,sizeof(int)*nNumCount,pArray,,,);

    cl_mem clmemRes  = clCreateBuffer(context,CL_MEM_READ_WRITE,sizeof(int) * nGroup,NULL,NULL);

    nStart = GetTickCount();

    err = clSetKernelArg(m_KerCalcRay,,sizeof(cl_mem),&clmemArray);

    err = clSetKernelArg(m_KerCalcRay,,sizeof(cl_mem),&clmemRes);

    err = clSetKernelArg(m_KerCalcRay,,sizeof(int)*nLocalSize,);

    err = clSetKernelArg(m_KerCalcRay,,sizeof(int),&nNumCount);

    size_t localws[] = {nLocalSize};

    size_t globalws[] = {nGroupSize};

    err = clEnqueueNDRangeKernel(queue,m_KerCalcRay,,NULL,globalws,localws,,NULL,NULL);

    clFinish(queue);

    int* pRes = new int[nGroup];

    err = clEnqueueReadBuffer(queue,clmemRes,CL_TRUE,,sizeof(int)*nGroup,pRes,,,);

    clFinish(queue);

    unsigned long int nRes = ;

    for(int i=;i<nGroup;++i)

    {

        nRes += pRes[i];

    }
　　assert（nRes == nReal）;

kernel:

/*
 * ReduceSum - per-work-group partial sums of an int array.
 *
 * num    : input array of nCount ints.
 * res    : output array; work-group b writes its partial sum to res[b].
 * pData  : local scratch buffer with one int per work-item of the group.
 * nCount : number of valid elements in num; values <= 0 give all-zero results.
 *
 * Each work-item first adds up the elements at gid, gid + globalSize, ...
 * into a private total, stores it in pData, and the group then combines the
 * pData entries pairwise until the group's sum sits in pData[0]. The host
 * adds the res[] entries to obtain the grand total. Sums are accumulated in
 * int, so the caller must keep the total within int range.
 */
__kernel void ReduceSum(__global int* num,__global int* res,__local int* pData,int nCount)
{
    unsigned int tid = get_local_id(0);
    unsigned int bid = get_group_id(0);
    unsigned int gid = get_global_id(0);
    unsigned int localSize = get_local_size(0);
    unsigned int globalSize = get_global_size(0);

    /* A negative count would otherwise convert to a huge unsigned bound. */
    unsigned int count = (nCount > 0) ? (unsigned int)nCount : 0u;

    /* Private accumulation over this work-item's share of the input. */
    int nRes = 0;
    while (gid < count)
    {
        nRes += num[gid];
        gid += globalSize;
    }

    pData[tid] = nRes;
    barrier(CLK_LOCAL_MEM_FENCE);

    /*
     * Pairwise reduction in local memory. `active` is the number of live
     * entries; each round folds the upper part onto the lower part. Rounding
     * the half up keeps the middle element when `active` is odd, so the loop
     * is correct for any local size and matches the plain halving scheme
     * when the local size is a power of two.
     */
    for (unsigned int active = localSize; active > 1; )
    {
        unsigned int half = (active + 1) >> 1;
        if (tid < active - half)
        {
            pData[tid] += pData[tid + half];
        }
        /* Reached by every work-item: the loop bounds depend only on localSize. */
        barrier(CLK_LOCAL_MEM_FENCE);
        active = half;
    }

    if (tid == 0)
        res[bid] = pData[0];
}

Reduction 求和是这样一种方法：比如 8 个数依次存放在下标 0 到 7 中，第一轮把下标 0 和 4、1 和 5、2 和 6、3 和 7 两两相加，结果放到下标 0、1、2、3 中（然后用 barrier(CLK_LOCAL_MEM_FENCE) 同步一次）；第二轮把下标 0 和 2、1 和 3 相加，结果放到下标 0、1 中；如此往复，最终结果就落在下标 0 中。

另：我试过用循环展开来减少同步次数，不过效率提升微乎其微。

秒客网

Opencl 并行求和

相关文章