CUDA debugging with Nsight in Visual Studio 2010

Time: 2021-12-27 02:51:27

I am not able to hit the breakpoints I set on lines inside my "__global__" functions. I debug with the "Start CUDA Debugging" option from the Nsight menu.

My Nsight plugin is installed successfully for VS 2010, and I am able to debug my other projects (the sample projects that came with the Nsight debugger).

My code is below (it is a bit long, but it mostly repeats the same operations):

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include "book.h"

#define N   (131072)

__global__ void conjugate( float2 *a ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        a[idx] = cuConjf(a[idx]);
    }
}

__global__ void multWithReference( float2 *signal, float2 *reference ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        signal[idx].x = signal[idx].x * reference[idx].x;
        signal[idx].y = signal[idx].y * reference[idx].y;
    }
}

__global__ void shift( float2 *signal, size_t shiftamount, float2* shifted ) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    *(shifted+((idx+shiftamount)%131072)) = *(signal+idx);
}

__global__ void fftshift(float2 *u_d)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if(i < 131072)
    {
        double a = 1-2*(i&1);
        u_d[i].x *= a;
        u_d[i].y *= a;
    }
}


static inline cufftHandle createFFTPlan(cudaStream_t* stream)
{
    cufftHandle plan;

    if (cudaGetLastError() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to allocate\n");
    }
    if (cufftPlan1d(&plan, 131072, CUFFT_C2C,1) != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: Plan creation failed");
    }
    if (cufftSetStream(plan, *stream) != CUFFT_SUCCESS){
        fprintf(stderr, "CUFFT error: Plan stream association failed");
    }

    return plan;
}

int main( void ) {

    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap) {
        printf( "Device will not handle overlaps, so no speed up from streams\n" );
        return 0;
    }

    cudaEvent_t     start, stop;
    float           elapsedTime;

    cudaStream_t    stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7;
    float2* host_ref, *host_0, *host_1, *host_2, *host_3, *host_4, *host_5, *host_6, *host_7;
    float2* dev_ref, *dev_0, *dev_1, *dev_2, *dev_3, *dev_4, *dev_5, *dev_6, *dev_7;

    // start the timers
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // initialize the streams
    HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream1 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream2 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream3 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream4 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream5 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream6 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream7 ) );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_ref,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_0,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_1,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_2,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_3,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_4,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_5,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_6,
                              N * sizeof(float2) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_7,
                              N * sizeof(float2) ) );

    // allocate host locked memory, used to stream
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_ref,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_0,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_1,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_2,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_3,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_4,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_5,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_6,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_7,
                              N * sizeof(float2),
                              cudaHostAllocDefault ) );

    // Open signal file
    FILE *fp;
    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_ref, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_0, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_1, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_2, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_3, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_4, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_5, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_6, sizeof(float2), 131072, fp);
    fclose(fp);

    if(NULL == (fp = fopen("testSignal4.bin","r"))){
        printf("can not open file...");
        exit(1);
    }
    fread(host_7, sizeof(float2), 131072, fp);
    fclose(fp);

    // create FFT plans
    cufftHandle plan0 = createFFTPlan(&stream0);
    cufftHandle plan1 = createFFTPlan(&stream1);
    cufftHandle plan2 = createFFTPlan(&stream2);
    cufftHandle plan3 = createFFTPlan(&stream3);
    cufftHandle plan4 = createFFTPlan(&stream4);
    cufftHandle plan5 = createFFTPlan(&stream5);
    cufftHandle plan6 = createFFTPlan(&stream6);
    cufftHandle plan7 = createFFTPlan(&stream7);

    float2* shifted0;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted0,
                            N * sizeof(float2) ) );
    float2* shifted1;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted1,
                            N * sizeof(float2) ) );
    float2* shifted2;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted2,
                            N * sizeof(float2) ) );
    float2* shifted3;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted3,
                            N * sizeof(float2) ) );
    float2* shifted4;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted4,
                            N * sizeof(float2) ) );
    float2* shifted5;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted5,
                            N * sizeof(float2) ) );
    float2* shifted6;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted6,
                            N * sizeof(float2) ) );
    float2* shifted7;
    HANDLE_ERROR( cudaMalloc( (void**)&shifted7,
                            N * sizeof(float2) ) );

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );

    // enqueue copies of a in stream0 and stream1
    HANDLE_ERROR( cudaMemcpyAsync( dev_ref, host_ref,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream2 ) );

    HANDLE_ERROR( cudaMemcpyAsync( dev_0, host_0,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream0 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_1, host_1,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream1 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_2, host_2,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream2 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_3, host_3,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream3 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_4, host_4,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream4 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_5, host_5,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream5 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_6, host_6,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream6 ) );
    HANDLE_ERROR( cudaMemcpyAsync( dev_7, host_7,
                                    sizeof(float2),
                                    cudaMemcpyHostToDevice,
                                    stream7 ) );

    for(int i = 0; i < 100; i++){

        shift<<<131072,131072,0>>>(dev_0, i, shifted0);
        shift<<<131072,131072,0,stream1>>>(dev_1, i, shifted1);
        shift<<<131072,131072,0,stream2>>>(dev_2, i, shifted2);
        shift<<<131072,131072,0,stream3>>>(dev_3, i, shifted3);
        shift<<<131072,131072,0,stream4>>>(dev_4, i, shifted4);
        shift<<<131072,131072,0,stream5>>>(dev_5, i, shifted5);
        shift<<<131072,131072,0,stream6>>>(dev_6, i, shifted6);
        shift<<<131072,131072,0,stream7>>>(dev_7, i, shifted7);

        conjugate<<<131072/256,131072,0,stream0>>>(shifted0);
        conjugate<<<131072/256,131072,0,stream1>>>(shifted1);
        conjugate<<<131072/256,131072,0,stream2>>>(shifted2);
        conjugate<<<131072/256,131072,0,stream3>>>(shifted3);
        conjugate<<<131072/256,131072,0,stream4>>>(shifted4);
        conjugate<<<131072/256,131072,0,stream5>>>(shifted5);
        conjugate<<<131072/256,131072,0,stream6>>>(shifted6);
        conjugate<<<131072/256,131072,0,stream7>>>(shifted7);

        multWithReference<<<131072/256,131072,0,stream0>>>(shifted0,dev_ref);
        multWithReference<<<131072/256,131072,0,stream1>>>(shifted1,dev_ref);
        multWithReference<<<131072/256,131072,0,stream2>>>(shifted2,dev_ref);
        multWithReference<<<131072/256,131072,0,stream3>>>(shifted3,dev_ref);
        multWithReference<<<131072/256,131072,0,stream4>>>(shifted4,dev_ref);
        multWithReference<<<131072/256,131072,0,stream5>>>(shifted5,dev_ref);
        multWithReference<<<131072/256,131072,0,stream6>>>(shifted6,dev_ref);
        multWithReference<<<131072/256,131072,0,stream7>>>(shifted7,dev_ref);

        if (cufftExecC2C(plan0, shifted0, shifted0, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan1, shifted1, shifted1, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan2, shifted2, shifted2, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan3, shifted3, shifted3, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan4, shifted4, shifted4, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan5, shifted5, shifted5, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan6, shifted6, shifted6, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }
        if (cufftExecC2C(plan7, shifted7, shifted7, CUFFT_FORWARD) != CUFFT_SUCCESS){
            fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
        }

        fftshift<<<131072,131072,0,stream0>>>(shifted0);
        fftshift<<<131072,131072,0,stream1>>>(shifted1);
        fftshift<<<131072,131072,0,stream2>>>(shifted2);
        fftshift<<<131072,131072,0,stream3>>>(shifted3);
        fftshift<<<131072,131072,0,stream4>>>(shifted4);
        fftshift<<<131072,131072,0,stream5>>>(shifted5);
        fftshift<<<131072,131072,0,stream6>>>(shifted6);
        fftshift<<<131072,131072,0,stream7>>>(shifted7);

    }

    if (cudaThreadSynchronize() != cudaSuccess){
        fprintf(stderr, "Cuda error: Failed to synchronize\n");
    }

    float2 *host_last = (float2 *)malloc(8*131072);

    // enqueue copies of c from device to locked memory
    HANDLE_ERROR( cudaMemcpyAsync( host_last, shifted0,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream0 ) );

    // enqueue copies of c from device to locked memory
    HANDLE_ERROR( cudaMemcpyAsync( host_0, shifted0,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream0 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_1, shifted1,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream1 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_2, shifted2,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream2 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_3, shifted3,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream3 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_4, shifted4,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream4 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_5, shifted5,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream5 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_6, shifted6,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream6 ) );
    HANDLE_ERROR( cudaMemcpyAsync( host_7, shifted7,
                                    sizeof(float2),
                                    cudaMemcpyDeviceToHost,
                                    stream7 ) );
    // Synchronize the streams
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream2 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream3 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream4 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream5 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream6 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream7 ) );


    // Stop timer
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );
    printf( "Time taken:  %3.1f ms\n", elapsedTime );

    FILE *fp2;
    if(NULL == (fp2 = fopen("result.bin","wb+"))){
        printf("can not open file...");
        exit(1);
    }

    fwrite(host_last, sizeof(float2), 131072, fp2);

    printf("signal written \n");
    fflush(stdout);
    fclose(fp2);

    // cleanup the streams and memory
    HANDLE_ERROR( cudaFreeHost( host_0 ) );
    HANDLE_ERROR( cudaFreeHost( host_1 ) );
    HANDLE_ERROR( cudaFreeHost( host_2 ) );
    HANDLE_ERROR( cudaFreeHost( host_3 ) );
    HANDLE_ERROR( cudaFreeHost( host_4 ) );
    HANDLE_ERROR( cudaFreeHost( host_5 ) );
    HANDLE_ERROR( cudaFreeHost( host_6 ) );
    HANDLE_ERROR( cudaFreeHost( host_7 ) );

    HANDLE_ERROR( cudaFree( dev_0 ) );
    HANDLE_ERROR( cudaFree( dev_1 ) );
    HANDLE_ERROR( cudaFree( dev_2 ) );
    HANDLE_ERROR( cudaFree( dev_3 ) );
    HANDLE_ERROR( cudaFree( dev_4 ) );
    HANDLE_ERROR( cudaFree( dev_5 ) );
    HANDLE_ERROR( cudaFree( dev_6 ) );
    HANDLE_ERROR( cudaFree( dev_7 ) );

    cufftDestroy(plan0);
    cufftDestroy(plan1);
    cufftDestroy(plan2);
    cufftDestroy(plan3);
    cufftDestroy(plan4);
    cufftDestroy(plan5);
    cufftDestroy(plan6);
    cufftDestroy(plan7);

    HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream2 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream3 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream4 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream5 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream6 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream7 ) );

    printf("hit [enter] to exit...");
    fflush(stdout);
    getchar();

    return 0;
}

The binary file needed to reproduce the problem is at this link:

Binary file

When I run cuda-memcheck on the release exe file, I get the following result:

1 solution

#1

When debugging GPU code in Nsight VSE you need to start debugging through the Nsight menu ("Start CUDA Debugging"). See this walkthrough for more information.

EDIT

Based on the additional information you provided, in particular the cuda-memcheck output, it seems like your kernel is not actually being launched. Error 9 is cudaErrorInvalidConfiguration, indicating that the launch configuration (blocks, threads/block, shared memory/block) is incompatible with the device.

cudaErrorInvalidConfiguration = 9

This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See cudaDeviceProp for more device limitations.

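As a rough sketch of how to inspect those limits on your own device (device 0 is an assumption here; HANDLE_ERROR is the macro from your book.h):

    cudaDeviceProp prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, 0 ) );               // device 0 assumed

    printf( "max threads per block : %d\n", prop.maxThreadsPerBlock ); // typically 512 or 1024
    printf( "max grid dimension x  : %d\n", prop.maxGridSize[0] );
    printf( "shared memory / block : %lu bytes\n",
            (unsigned long)prop.sharedMemPerBlock );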

In fact, you're trying to launch 131072 threads per block, which is far above the limit (see the Programming Guide for details and for the specific limits). You should launch smaller blocks and increase the number of blocks accordingly.

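For example, a minimal sketch of a valid configuration for these kernels (256 threads per block is just an illustrative choice; anything up to the device's maxThreadsPerBlock works, and N is the 131072 you already #define):

    const int threadsPerBlock = 256;                                    // illustrative; must be <= maxThreadsPerBlock
    const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;     // 512 blocks for N = 131072

    shift<<<blocks, threadsPerBlock, 0, stream0>>>(dev_0, i, shifted0);
    conjugate<<<blocks, threadsPerBlock, 0, stream0>>>(shifted0);
    multWithReference<<<blocks, threadsPerBlock, 0, stream0>>>(shifted0, dev_ref);
    fftshift<<<blocks, threadsPerBlock, 0, stream0>>>(shifted0);

Note that, unlike your other kernels, shift has no if (idx < N) guard, so either keep blocks * threadsPerBlock equal to N or add the same guard there.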

As Robert Crovella said, you should always ensure you have proper error checking.

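A minimal sketch of what that looks like for kernel launches (the names mirror the configuration sketch above; the launch itself returns no status, so query the error state right after it):

    shift<<<blocks, threadsPerBlock, 0, stream0>>>(dev_0, i, shifted0);

    cudaError_t err = cudaGetLastError();              // catches launch-configuration errors such as error 9
    if (err != cudaSuccess) {
        fprintf(stderr, "shift launch failed: %s\n", cudaGetErrorString(err));
    }
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );  // surfaces errors raised while the kernel executes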
