利用两个流进行操作演示

项目下载链接
  1 /*
  2 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 *
  4 * NVIDIA Corporation and its licensors retain all intellectual property and
  5 * proprietary rights in and to this software and related documentation.
  6 * Any use, reproduction, disclosure, or distribution of this software
  7 * and related documentation without an express license agreement from
  8 * NVIDIA Corporation is strictly prohibited.
  9 *
 10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 * associated with this source code for terms and conditions that govern
 12 * your use of this NVIDIA software.
 13 *
 14 */
 15 
 16 
 17 #include "../common/book.h"
 18 #include "cuda.h"
 19 #include "cuda_runtime.h"
 20 #include "device_launch_parameters.h"
 21 #define N   (1024*1024)
 22 #define FULL_DATA_SIZE   (N*20)
 23 
 24 
 25 __global__ void kernel(int *a, int *b, int *c) {
 26     int idx = threadIdx.x + blockIdx.x * blockDim.x;
 27     if (idx < N) {
 28         int idx1 = (idx + 1) % 256;
 29         int idx2 = (idx + 2) % 256;
 30         float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
 31         float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
 32         c[idx] = (as + bs) / 2;
 33     }
 34 }
 35 
 36 
 37 int main(void) {
 38     cudaDeviceProp  prop;
 39     int whichDevice;
 40     HANDLE_ERROR(cudaGetDevice(&whichDevice));
 41     HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
 42     if (!prop.deviceOverlap) {
 43         printf("Device will not handle overlaps, so no speed up from streams\n");
 44         return 0;
 45     }
 46 
 47     cudaEvent_t     start, stop;
 48     float           elapsedTime;
 49 
 50     cudaStream_t    stream0, stream1;
 51     int *host_a, *host_b, *host_c;
 52     int *dev_a0, *dev_b0, *dev_c0;
 53     int *dev_a1, *dev_b1, *dev_c1;
 54 
 55     // start the timers
 56     HANDLE_ERROR(cudaEventCreate(&start));
 57     HANDLE_ERROR(cudaEventCreate(&stop));
 58 
 59     //初始化两个流
 60     HANDLE_ERROR(cudaStreamCreate(&stream0));
 61     HANDLE_ERROR(cudaStreamCreate(&stream1));
 62 
 63     // allocate the memory on the GPU
 64     HANDLE_ERROR(cudaMalloc((void**)&dev_a0,
 65         N * sizeof(int)));
 66     HANDLE_ERROR(cudaMalloc((void**)&dev_b0,
 67         N * sizeof(int)));
 68     HANDLE_ERROR(cudaMalloc((void**)&dev_c0,
 69         N * sizeof(int)));
 70     HANDLE_ERROR(cudaMalloc((void**)&dev_a1,
 71         N * sizeof(int)));
 72     HANDLE_ERROR(cudaMalloc((void**)&dev_b1,
 73         N * sizeof(int)));
 74     HANDLE_ERROR(cudaMalloc((void**)&dev_c1,
 75         N * sizeof(int)));
 76 
 77     //在主机上分配锁定页内存
 78     HANDLE_ERROR(cudaHostAlloc((void**)&host_a,
 79         FULL_DATA_SIZE * sizeof(int),
 80         cudaHostAllocDefault));
 81     HANDLE_ERROR(cudaHostAlloc((void**)&host_b,
 82         FULL_DATA_SIZE * sizeof(int),
 83         cudaHostAllocDefault));
 84     HANDLE_ERROR(cudaHostAlloc((void**)&host_c,
 85         FULL_DATA_SIZE * sizeof(int),
 86         cudaHostAllocDefault));
 87 
 88     for (int i = 0; i<FULL_DATA_SIZE; i++) {
 89         host_a[i] = rand();
 90         host_b[i] = rand();
 91     }
 92 
 93     HANDLE_ERROR(cudaEventRecord(start, 0));
 94     // now loop over full data, in bite-sized chunks
 95     for (int i = 0; i<FULL_DATA_SIZE; i += N * 2) {
 96         // enqueue copies of a in stream0 and stream1
 97         HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i,
 98             N * sizeof(int),
 99             cudaMemcpyHostToDevice,
100             stream0));
101         HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N,
102             N * sizeof(int),
103             cudaMemcpyHostToDevice,
104             stream1));
105         // enqueue copies of b in stream0 and stream1
106         HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i,
107             N * sizeof(int),
108             cudaMemcpyHostToDevice,
109             stream0));
110         HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N,
111             N * sizeof(int),
112             cudaMemcpyHostToDevice,
113             stream1));
114 
115         // enqueue kernels in stream0 and stream1   
116         kernel << <N / 256, 256, 0, stream0 >> >(dev_a0, dev_b0, dev_c0);
117         kernel << <N / 256, 256, 0, stream1 >> >(dev_a1, dev_b1, dev_c1);
118 
119         //从设备上将结果拷贝回主机上的锁定页内存
120         HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0,
121             N * sizeof(int),
122             cudaMemcpyDeviceToHost,
123             stream0));
124         HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1,
125             N * sizeof(int),
126             cudaMemcpyDeviceToHost,
127             stream1));
128     }
129     //将计算结果从锁定页内存复制会主机内存
130     HANDLE_ERROR(cudaStreamSynchronize(stream0));
131     HANDLE_ERROR(cudaStreamSynchronize(stream1));
132 
133     HANDLE_ERROR(cudaEventRecord(stop, 0));
134 
135     HANDLE_ERROR(cudaEventSynchronize(stop));
136     HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
137         start, stop));
138     printf("Time taken:  %3.1f ms\n", elapsedTime);
139 
140     // cleanup the streams and memory
141     HANDLE_ERROR(cudaFreeHost(host_a));
142     HANDLE_ERROR(cudaFreeHost(host_b));
143     HANDLE_ERROR(cudaFreeHost(host_c));
144     HANDLE_ERROR(cudaFree(dev_a0));
145     HANDLE_ERROR(cudaFree(dev_b0));
146     HANDLE_ERROR(cudaFree(dev_c0));
147     HANDLE_ERROR(cudaFree(dev_a1));
148     HANDLE_ERROR(cudaFree(dev_b1));
149     HANDLE_ERROR(cudaFree(dev_c1));
150     HANDLE_ERROR(cudaStreamDestroy(stream0));
151     HANDLE_ERROR(cudaStreamDestroy(stream1));
152 
153     return 0;
154 }
秒客网

利用两个流进行操作演示

相关文章