CUDA __syncthreads();不工作;在断点命中顺序中反转

时间:2022-09-13 20:43:13

I have a problem with I think __syncthreads();. I have a code like this:


__device__ void prefixSumJoin(const bool *g_idata, int *g_odata, int n)
    __shared__ int temp[Config::bfr*Config::bfr];  // allocated on invocation  
    int thid = threadIdx.y*blockDim.x + threadIdx.x;  
        int offset = 1;
        temp[2*thid] = (g_idata[2*thid]?1:0); // load input into shared memory  
        temp[2*thid+1] = (g_idata[2*thid+1]?1:0); 
        for (int d = n>>1; d > 0; d >>= 1)                    // build sum in place up the tree  
            if (thid < d)  
                int ai = offset*(2*thid+1)-1; // <-- breakpoint B 
                int bi = offset*(2*thid+2)-1;
                temp[bi] += temp[ai];  
            offset *= 2; 
        if (thid == 0) { temp[n - 1] = 0; } // clear the last element

        for (int d = 1; d < n; d *= 2) // traverse down tree & build scan  
            offset >>= 1;  
            if (thid < d)                       
                int ai = offset*(2*thid+1)-1;  
                int bi = offset*(2*thid+2)-1;
                int t = temp[ai];  
                temp[ai] = temp[bi];  
                temp[bi] += t;   
        g_odata[2*thid] = temp[2*thid]; // write results to device memory  
        g_odata[2*thid+1] = temp[2*thid+1]; 

__global__ void selectKernel3(...)
    int tidx = threadIdx.x;
    int tidy = threadIdx.y;
    int bidx = blockIdx.x;
    int bidy = blockIdx.y;
    int tid = tidy*blockDim.x + tidx;
    int bid = bidy*gridDim.x+bidx;
    int noOfRows1 = ...;
    int noOfRows2 = ...;

    __shared__ bool isRecordSelected[Config::bfr*Config::bfr];
    __shared__ int selectedRecordsOffset[Config::bfr*Config::bfr];

    isRecordSelected[tid] = false;
    selectedRecordsOffset[tid] = 0;

    if(tidx<noOfRows1 && tidy<noOfRows2)
        if(... == ...)
            isRecordSelected[tid] = true;

    prefixSumJoin(isRecordSelected,selectedRecordsOffset,Config::bfr*Config::bfr); // <-- breakpoint A

            some_instruction;// <-- breakpoint C
   dim3 dimGrid(13, 5);
   dim3 dimBlock(Config::bfr, Config::bfr);

   selectKernel3<<<dimGrid, dimBlock>>>(...)

//other file

class Config
    static const int bfr = 16; // blocking factor = number of rows per block

the prefixSum is from, with little change.


Ok, now I set 3 breakpoints: A,B,C. It should be hit in order A,B,C. Problem is that it is hit in order: A,B*x,C,B. So in point C, selectedRecordsOffset is not ready and it cause errors. After A the B is hit few times, but not all and then C is hit and it goes further in code and then again B for rest of the loop. x is different depending on input (for some inputs there isn't any inverse in breakpoints so C is last that was hit).

Moreover if I look on thread numbers that cause hit it is for A and C threadIdx.y = 0 and for B threadIdx.y = 10. How is this possible while it is the same block so why some threads ommit sync? There is no conditional sync. Does someone have any idea where to look for bug?

If you need some more clarification, just ask.
Thanks in advance for any advice how to work this out.


1 个解决方案



Thou shalt not use __syncthreads() in conditional code if the condition does not evaluate uniformly across all threads of each block.




