// Project download link (no URL is present in this copy of the file)
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#include <stdio.h>   // printf
#include <stdlib.h>  // rand

#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)


/*
 * Element-wise averaging kernel over one chunk of N ints.
 *
 * Each thread with a global index below N averages three values of `a`
 * and three values of `b`, then stores the mean of those two averages
 * (truncated to int) in c[idx].
 *
 * The two extra operands are read at (idx + 1) % 256 and (idx + 2) % 256,
 * so they always come from the first 256 elements of the chunk.
 *
 * Launch layout used by main(): 1-D grid of N / 256 blocks, 256 threads
 * per block, no dynamic shared memory.
 *
 * NOTE(review): the three-term sums are computed in int before the float
 * division; with inputs taken from rand() this may overflow on platforms
 * where RAND_MAX is large -- confirm whether that matters for this demo.
 */
__global__ void kernel(int *a, int *b, int *c) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        c[idx] = (as + bs) / 2;
    }
}


/*
 * Processes FULL_DATA_SIZE ints in chunks of N, alternating chunks between
 * two CUDA streams, and prints the elapsed time measured with CUDA events.
 *
 * Returns 0 both on success and when the device reports no copy/compute
 * overlap support (in which case nothing is run).
 */
int main(void) {
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (!prop.deviceOverlap) {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;

    cudaStream_t stream0, stream1;
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;
    int *dev_a1, *dev_b1, *dev_c1;

    const size_t chunkBytes = N * sizeof(int);
    const size_t fullBytes = FULL_DATA_SIZE * sizeof(int);

    // create the timing events
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // create the two streams
    HANDLE_ERROR(cudaStreamCreate(&stream0));
    HANDLE_ERROR(cudaStreamCreate(&stream1));

    // allocate one chunk-sized set of device buffers per stream
    HANDLE_ERROR(cudaMalloc((void**)&dev_a0, chunkBytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b0, chunkBytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c0, chunkBytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_a1, chunkBytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b1, chunkBytes));
    HANDLE_ERROR(cudaMalloc((void**)&dev_c1, chunkBytes));

    // allocate page-locked host memory for the full data set
    HANDLE_ERROR(cudaHostAlloc((void**)&host_a, fullBytes,
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_b, fullBytes,
                               cudaHostAllocDefault));
    HANDLE_ERROR(cudaHostAlloc((void**)&host_c, fullBytes,
                               cudaHostAllocDefault));

    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // now loop over full data, in bite-sized chunks; each iteration
    // handles two chunks, one per stream
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2) {
        // enqueue copies of a in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_a0, host_a + i, chunkBytes,
                                     cudaMemcpyHostToDevice, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_a1, host_a + i + N, chunkBytes,
                                     cudaMemcpyHostToDevice, stream1));
        // enqueue copies of b in stream0 and stream1
        HANDLE_ERROR(cudaMemcpyAsync(dev_b0, host_b + i, chunkBytes,
                                     cudaMemcpyHostToDevice, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(dev_b1, host_b + i + N, chunkBytes,
                                     cudaMemcpyHostToDevice, stream1));

        // enqueue kernels in stream0 and stream1; a launch does not
        // return a status itself, so query the last error after each one
        kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        HANDLE_ERROR(cudaGetLastError());
        kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        HANDLE_ERROR(cudaGetLastError());

        // enqueue copies of the results back to page-locked host memory
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i, dev_c0, chunkBytes,
                                     cudaMemcpyDeviceToHost, stream0));
        HANDLE_ERROR(cudaMemcpyAsync(host_c + i + N, dev_c1, chunkBytes,
                                     cudaMemcpyDeviceToHost, stream1));
    }
    // wait for all work queued in both streams to finish
    HANDLE_ERROR(cudaStreamSynchronize(stream0));
    HANDLE_ERROR(cudaStreamSynchronize(stream1));

    HANDLE_ERROR(cudaEventRecord(stop, 0));

    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
                                      start, stop));
    printf("Time taken:  %3.1f ms\n", elapsedTime);

    // cleanup the events, streams and memory
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));
    HANDLE_ERROR(cudaFreeHost(host_a));
    HANDLE_ERROR(cudaFreeHost(host_b));
    HANDLE_ERROR(cudaFreeHost(host_c));
    HANDLE_ERROR(cudaFree(dev_a0));
    HANDLE_ERROR(cudaFree(dev_b0));
    HANDLE_ERROR(cudaFree(dev_c0));
    HANDLE_ERROR(cudaFree(dev_a1));
    HANDLE_ERROR(cudaFree(dev_b1));
    HANDLE_ERROR(cudaFree(dev_c1));
    HANDLE_ERROR(cudaStreamDestroy(stream0));
    HANDLE_ERROR(cudaStreamDestroy(stream1));

    return 0;
}