CUDA: CPU function and GPU kernel overlap

Date: 2022-02-20 17:07:25

I am having problems with concurrency in a CUDA application that I am developing in order to practice CUDA. I want to share the work between the GPU and the CPU by using the asynchronous behavior of cudaMemcpyAsync and CUDA kernels, but I cannot successfully overlap CPU execution and GPU execution.

The CPU work overlaps with the host-to-device data transfer, but kernel execution does not overlap with it. The kernel basically waits for the CPU to finish and call the synchronization function, and only then starts to execute on the device. I can't understand this behavior; aren't kernel launches always asynchronous with respect to the CPU thread?

My GPU is an NVIDIA GeForce GT 550M (Fermi architecture with 1 copy engine and 1 compute engine).

I use CUDA 6.0 and Nsight 4.0.

Here is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdlib.h>
#include <stdio.h>

#include <iostream>
#include <thread>
#include <chrono>
using namespace std;

struct point4D 
{
    float x;
    float y;
    float z;
    float w;
};

void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC);

bool correct_output(point4D * data, unsigned int size);
void flush_buffer(point4D * data, unsigned int size);
void initialize_input(point4D *& data, unsigned int size);
void cudaCheckError(cudaError_t cudaStatus, const char* err);

// Implements cross product for 4D point on the GPU-side.
__global__ void gpu_kernel(point4D * d_ptrData, point4D * d_out, point4D pB, point4D pC)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    point4D pA = d_ptrData[index];
    point4D out; out.x = 0; out.y = 0; out.z = 0; out.w = 0;

    out.x +=  pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
    out.y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
    out.z +=  pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
    out.w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);

   d_out[index] = out;
}

// Implements cross product for 4D point on the CPU-side.
void cpu_function(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    for(unsigned int index = 0; index < h_dataSize; index++)
    {
        h_out[index].x = 0; h_out[index].y = 0; h_out[index].z = 0; h_out[index].w = 0;

        point4D pA = h_ptrData[index];

        h_out[index].x +=  pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
        h_out[index].y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
        h_out[index].z +=  pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
        h_out[index].w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);
    }   
}


int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    int device_count;
    cudaCheckError(cudaGetDeviceCount(&device_count), "Couldn't get device count!");

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    devID = 0;
    cudaCheckError(cudaSetDevice(devID), "Couldn't set device!");
    cudaCheckError(cudaGetDeviceProperties(&deviceProps, devID), "Couldn't get Device Properties");
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProps.name, deviceProps.major, deviceProps.minor);

    cudaDeviceReset();

    const unsigned int DATA_SIZE = 30000000;
    bool bFinalResults = true;

    // Input Data Initialization
    point4D pointB;
    pointB.x = 1; pointB.y = 1; pointB.z = 0; pointB.w = 0;

    point4D pointC;
    pointC.x = 1; pointC.y = 1; pointC.z = 1; pointC.w = 0;

    point4D * data = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    point4D * out_points = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    initialize_input(data, DATA_SIZE);
    //

    flush_buffer(out_points, DATA_SIZE);
    cout << endl << endl;

    // 1+way
    heterogenous_1way_plus(data, DATA_SIZE, out_points, pointB, pointC);
    bFinalResults &= correct_output(out_points, DATA_SIZE); // checking correctness

    free(out_points);
    free(data);

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
    return 0;
}

void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    cout << "1-way_plus: STARTS!!!" << endl;

    // Run 25% of the data on the CPU; the rest will be executed on the GPU
    unsigned int ratioPercentCPUtoGPU = 25;
    unsigned int d_dataSize = (h_dataSize * (100 - ratioPercentCPUtoGPU))/100;
    h_dataSize = (h_dataSize * ratioPercentCPUtoGPU)/100;
    size_t memorySize = d_dataSize * sizeof(point4D);

    cout << "Data Ratio Between CPU and GPU:" << (float)ratioPercentCPUtoGPU/100 << endl;
    cout << "CPU will process " << h_dataSize << " data." << endl;
    cout << "GPU will process " << d_dataSize << " data." << endl;

    // register host memory as page-locked (required for cudaMemcpyAsync to be truly asynchronous)
    cudaCheckError(cudaHostRegister(h_ptrData, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");
    cudaCheckError(cudaHostRegister(h_out, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");

    // allocate device memory
    point4D * d_in = 0; point4D * d_out = 0;
    cudaCheckError(cudaMalloc( (void **)&d_in, memorySize), "cudaMalloc failed!");
    cudaCheckError(cudaMalloc( (void **)&d_out, memorySize), "cudaMalloc failed!");

    // set kernel launch configuration
    dim3 nThreads = dim3(1000,1);
    dim3 nBlocks = dim3(d_dataSize / nThreads.x,1);
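    // Note: this configuration assumes d_dataSize is an exact multiple of nThreads.x
    // (true here: 22,500,000 / 1000 = 22,500 blocks), so gpu_kernel needs no bounds check.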

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t stream;
    cudaCheckError(cudaStreamCreate(&stream), "cudaStreamCreate failed!");

    // create cuda event handles
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start), "cudaEventCreate failed!");
    cudaCheckError(cudaEventCreate(&stop), "cudaEventCreate failed!");

    // main thread waits for device
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");
    float gpu_time = 0.0f;
    cudaEventRecord(start, stream);

    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);       
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);

    cudaEventRecord(stop, stream);

    // The CPU processes the tail of the buffers, starting where the GPU's portion ends.
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);       

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");

    cudaCheckError(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime failed!");

    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    // release resources
    cudaCheckError(cudaEventDestroy(start), "cudaEventDestroy failed!");
    cudaCheckError(cudaEventDestroy(stop), "cudaEventDestroy failed!");
    cudaCheckError(cudaHostUnregister(h_ptrData), "cudaHostUnregister failed!");
    cudaCheckError(cudaHostUnregister(h_out), "cudaHostUnregister failed!");
    cudaCheckError(cudaFree(d_in), "cudaFree failed!");
    cudaCheckError(cudaFree(d_out), "cudaFree failed!");
    cudaCheckError(cudaStreamDestroy(stream), "cudaStreamDestroy failed!");

    cudaDeviceReset();    

    cout << "Execution of GPU: " << gpu_time << "ms" << endl;
    cout << "1-way_plus: ENDS!!!" << endl;        
}

// Checks correctness of outputs
bool correct_output(point4D * data, unsigned int size)
{ 
    const static float x = 0, y = 0, z = 0, w = -1;

    for (unsigned int i = 0; i < size; i++)
    {
        if (data[i].x != x || data[i].y != y ||
            data[i].z != z || data[i].w != w)
        {
            printf("Error! data[%d] = [%f, %f, %f, %f], ref = [%f, %f, %f, %f]\n",
            i, data[i].x, data[i].y, data[i].z, data[i].w, x, y, z, w);

            return 0;
        }
    }
    return 1;
}

// Zero the output buffer
void flush_buffer(point4D * data, unsigned int size)
{
    for(unsigned int i = 0; i < size; i++)
    {
        data[i].x = 0; data[i].y = 0; data[i].z = 0; data[i].w = 0;
    }
}

// Initialize the input data to feed the system for simulation
void initialize_input(point4D *& data, unsigned int size)
{
    for(unsigned int idx = 0; idx < size; idx++)
    {
        point4D* d = &data[idx];
        d->x = 1;
        d->y = 0;
        d->z = 0;
        d->w = 0;
    }
}

void cudaCheckError(cudaError_t cudaStatus, const char* err)
{
    if(cudaStatus != cudaSuccess)
    {
        // print the caller's message together with the CUDA error string
        fprintf(stderr, "%s (%s)\n", err, cudaGetErrorString(cudaStatus));
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

And here is the Nsight screenshot:

[Nsight timeline screenshot]

1 Answer

#1

You're getting proper overlap, from what I can see on your profiler image. I ran your code and see something similar.

In general, the critical sequence in your code is like this:

  1. cudaMemcpyAsync H2D
  2. kernel call
  3. cudaMemcpyAsync D2H
  4. cpu function
  5. cudaStreamSynchronize

The CPU thread processes those steps in that order. Steps 1-3 are asynchronous, meaning control is returned to the CPU thread immediately, without waiting for the underlying CUDA operation to complete. And you desire that step 4 overlaps as much as possible with steps 1,2, and 3.

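As a side check (this sketch is not part of the original code), you can confirm on the host side that these calls return immediately by bracketing them with wall-clock timestamps; the elapsed time should amount to microseconds of launch overhead, not the duration of the GPU work:

    // Sketch: measures only the launch overhead of the three asynchronous calls.
    // Assumes d_in, d_out, stream, etc. are set up as in the question's code.
    auto t0 = std::chrono::high_resolution_clock::now();
    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);
    auto t1 = std::chrono::high_resolution_clock::now();
    cout << "Launch overhead: "
         << std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count()
         << " us" << endl;
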
What we see is that the cudaStreamSynchronize() call shows up in the timeline approximately coincident with the start of the kernel execution. This means that all CPU thread activity preceding the cudaStreamSynchronize() call had completed by that point (i.e. approximately by the beginning of the actual kernel execution). Therefore the cpu function (step 4) that we want to overlap with steps 1-3 has actually completed by the start of step 2 (in terms of actual CUDA execution), so you are getting full overlap of your cpu function with the host->device memcpy operation.

So it's working as expected. Because the cudaStreamSynchronize() call blocks the CPU thread until all stream activity is complete, it occupies the timeline from when it is encountered until the point at which the stream activity is complete.

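If you would rather keep the CPU thread busy than have it block, one option (a sketch, not something the original code does) is to poll the stream with cudaStreamQuery(), which returns cudaSuccess once all work queued in the stream has completed and cudaErrorNotReady while work is still pending:

    // Sketch: poll instead of block, so the CPU can interleave more work.
    // cudaStreamQuery() is non-blocking and reports the stream's completion state.
    while (cudaStreamQuery(stream) == cudaErrorNotReady)
    {
        // ... do another slice of CPU work here, or yield the thread ...
        this_thread::yield();   // <thread> is already included in the code above
    }
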
The fact that the cudaStreamSynchronize() call is curiously coincident with the start of kernel execution, and that there is a gap between the end of the H2D memcpy and the start of the kernel, is likely due to WDDM batching of commands. When I profile your code under Linux, I don't see the gap or the exact coincidence, but otherwise the general flow is the same. Here is what I see using the Visual Profiler under Linux:

[Visual Profiler timeline under Linux]

Note that in the above image, the cudaStreamSynchronize() is actually encountered during the H2D memcpy operation before the kernel begins.

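As an aside (a workaround sketch, not part of the original answer): on Windows, one commonly used way to get a WDDM command batch submitted to the GPU promptly is to issue a non-blocking cudaStreamQuery() on the stream right after queuing the work, which tends to flush the driver's command queue:

    // Sketch: nudge the WDDM driver to submit the batched commands immediately.
    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);
    cudaStreamQuery(stream); // non-blocking; the return value is intentionally ignored
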
Responding to a question in the comments, I modified the app so the split percentage was 50 instead of 25:

unsigned int ratioPercentCPUtoGPU = 50;

Here is what the new profiler output looks like:

[Profiler timeline with the 50/50 split]

We see that the CPU is taking more time relative to the GPU kernel call, so the cudaStreamSynchronize() call is not encountered by the CPU thread until during the D2H memcpy operation. Under Linux we continue to see that there is no fixed relationship between this point and the start of the kernel execution. Now the CPU execution fully overlaps the H2D memcpy, the kernel execution, and a small portion of the D2H memcpy.

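If the goal is for both sides to finish at about the same time, a simple way to choose the split (a sketch, not from the original answer; cpuRate and gpuRate are hypothetical throughputs you would measure in a calibration run) is to give the CPU a share proportional to its fraction of the combined processing rate:

    // Sketch: derive the CPU's percentage from measured throughputs
    // (elements per second) so that both sides finish at roughly the same time.
    unsigned int balancedCpuPercent(double cpuRate, double gpuRate)
    {
        return (unsigned int)(100.0 * cpuRate / (cpuRate + gpuRate));
    }

    // e.g.: ratioPercentCPUtoGPU = balancedCpuPercent(cpuRate, gpuRate);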