CUDA reduction to find the maximum value of an array

Time: 2021-03-06 13:50:42

I am doing the Udacity course on parallel programming (homework 3) and cannot figure out why I can't get the maximum of the array using parallel reduction (the Udacity forums have yet to provide a solution). I am fairly certain that I have set up the arrays properly and that the algorithm is correct. I suspect that I have a problem with memory management (out-of-bounds accesses, incorrect array sizes, copying to and from the device). Please help! I am running this in the Udacity environment, not locally. Below is the code that I am currently using. For some reason, when I change the fmaxf calls to fminf, it does find the minimum.

#include "reference_calc.cpp"
#include "utils.h"
#include "math.h"
#include <stdio.h>
#include <cmath>

__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {

    // Reduce log Lum with Max Operator
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    extern __shared__ float temp[];

    if (myId < size) {
        temp[tid] = d_logLum[myId];
    }
    else {
        temp[tid] = d_logLum[tid];
    }

    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            if (myId < size) {
                temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]);
            } else {
                temp[tid] = d_logLum[tid];
            }
        }
        __syncthreads(); 
    }

    if (tid == 0) {
        d_out[blockIdx.x] = temp[0];
    }
}

__global__ void reduce_max_kernel2(float *d_out, float *d_in) {

    // Reduce log Lum with Max Operator
    int myId = threadIdx.x + blockDim.x * blockIdx.x;
    int tid  = threadIdx.x;

    for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
        if (tid < s) {
            d_in[myId] = fmaxf(d_in[myId + s], d_in[myId]);
        }
        __syncthreads();   
    }

    if (tid == 0) {
        d_out[0] = d_in[0];
    }

}


void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  unsigned int* const d_cdf,
                                  float &min_logLum,
                                  float &max_logLum,
                                  const size_t numRows,
                                  const size_t numCols,
                                  const size_t numBins)
{
  //TODO
  /*Here are the steps you need to implement
    1) find the minimum and maximum value in the input logLuminance channel
       store in min_logLum and max_logLum
    2) subtract them to find the range
    3) generate a histogram of all the values in the logLuminance channel using
       the formula: bin = (lum[i] - lumMin) / lumRange * numBins
    4) Perform an exclusive scan (prefix sum) on the histogram to get
       the cumulative distribution of luminance values (this should go in the
       incoming d_cdf pointer which already has been allocated for you)       */
    //int size = 1 << 18;
    int points = numRows * numCols;
    int logPoints = ceil(log(points)/log(2));
    int sizePow = logPoints;
    int size = pow(2, sizePow);
    int numThreads = 1024;
    int numBlocks = size / numThreads;

    float *d_out;
    float *d_max_out;

    checkCudaErrors(cudaMalloc((void **) &d_out, numBlocks * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **) &d_max_out, sizeof(float)));

    cudaDeviceSynchronize();
    reduce_max_kernel<<<numBlocks, numThreads, sizeof(float)*numThreads>>>(d_out, d_logLuminance, points);

    cudaDeviceSynchronize();
    reduce_max_kernel2<<<1, numBlocks>>>(d_max_out, d_out);

    float h_out_max;
    checkCudaErrors(cudaMemcpy(&h_out_max, d_max_out, sizeof(float), cudaMemcpyDeviceToHost));

    printf("%f\n", h_out_max);

    checkCudaErrors(cudaFree(d_max_out));
    checkCudaErrors(cudaFree(d_out));

}

1 Answer

#1 (3 votes)

You are trying to reproduce the reduce2 reduction kernel of the CUDA SDK reduction sample. Robert Crovella has already spotted two mistakes in your code. Besides those, I think you are also mistakenly initializing the shared memory.
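
To see why the initialization matters: every thread must write its own shared-memory slot, threads that fall past the end of the input must write a value that no real input can lose to (your else branch instead re-reads d_logLum[tid], injecting an unrelated element into the block), and a __syncthreads() is needed before the reduction loop reads neighbouring slots. A minimal sketch of just that load, which the complete example below also contains:

extern __shared__ float temp[];

// Out-of-range threads write -FLT_MAX so the padding can never beat a real input
temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;
__syncthreads();   // all loads must finish before any thread reads temp[tid + s]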

Below, please find a complete working example constructed around your attempt. I have left the incorrect instructions from your approach in the code as comments.

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <cstdio>
#include <cfloat>
#include <cmath>

#define BLOCKSIZE 256

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

__global__ void reduce_max_kernel(float *d_out, const float *d_logLum, int size) {

    int tid         = threadIdx.x;                              // Local thread index
    int myId        = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    extern __shared__ float temp[];

    // --- Loading data to shared memory. All the threads contribute to loading the data to shared memory.
    temp[tid] = (myId < size) ? d_logLum[myId] : -FLT_MAX;

    // --- Your solution
    // if (myId < size) { temp[tid] = d_logLum[myId]; } else { temp[tid] = d_logLum[tid]; }

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory. Only half of the threads contribute to reduction.
    for (unsigned int s=blockDim.x/2; s>0; s>>=1)
    {
        if (tid < s) { temp[tid] = fmaxf(temp[tid], temp[tid + s]); }
        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();
    }

    // --- Your solution
    //for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
    //    if (tid < s) { if (myId < size) { temp[tid] = fmaxf(d_logLum[myId + s], d_logLum[myId]); } else { temp[tid] = d_logLum[tid]; } }
    //    __syncthreads(); 
    //}

    if (tid == 0) {
        d_out[blockIdx.x] = temp[0];
    }
}

/********/
/* MAIN */
/********/
int main()
{
    const int N = 10;

    thrust::device_vector<float> d_vec(N,3.f); d_vec[4] = 4.f;

    int NumThreads  = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    int NumBlocks   = (N + NumThreads - 1) / NumThreads;

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(float) : NumThreads * sizeof(float);

    // --- reduce2
    thrust::device_vector<float> d_vec_block(NumBlocks);
    reduce_max_kernel<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec_block.data()), thrust::raw_pointer_cast(d_vec.data()), N);

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    thrust::host_vector<float> h_vec_block(d_vec_block);
    float result_reduce0 = -FLT_MAX;
    for (int i=0; i<NumBlocks; i++) result_reduce0 = fmax(h_vec_block[i], result_reduce0);
    printf("Result = %f\n",result_reduce0);

}
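
As a follow-up note beyond the answer above, here is a minimal sketch of how the corrected kernel could be wired back into the homework's your_histogram_and_prefixsum to fill max_logLum, finishing the small per-block reduction on the host exactly as in main() above. The helper name find_max_logLum is made up for illustration; checkCudaErrors comes from the course's utils.h, and a min variant would use fminf with +FLT_MAX padding (not shown).

#include <vector>
#include <cfloat>
#include <cmath>

// Sketch only: assumes the corrected reduce_max_kernel defined above.
void find_max_logLum(const float* const d_logLuminance, size_t numPixels, float &max_logLum)
{
    const int numThreads = 1024;                                        // power of two, required by the in-block reduction
    const int numBlocks  = (int)((numPixels + numThreads - 1) / numThreads);

    float *d_partialMax;
    checkCudaErrors(cudaMalloc((void **) &d_partialMax, numBlocks * sizeof(float)));

    // One partial maximum per block; out-of-range threads are padded with -FLT_MAX inside the kernel
    reduce_max_kernel<<<numBlocks, numThreads, numThreads * sizeof(float)>>>(d_partialMax, d_logLuminance, (int)numPixels);
    checkCudaErrors(cudaGetLastError());

    // Finish the remaining small reduction on the host, as in the answer's main()
    std::vector<float> h_partialMax(numBlocks);
    checkCudaErrors(cudaMemcpy(h_partialMax.data(), d_partialMax, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

    max_logLum = -FLT_MAX;
    for (int i = 0; i < numBlocks; ++i) max_logLum = fmaxf(max_logLum, h_partialMax[i]);

    checkCudaErrors(cudaFree(d_partialMax));
}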
