#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdlib.h>
#include <time.h>

#define THREAD_NUM 256
#define MATRIX_SIZE 1000

// Abort with a readable message when a CUDA runtime call fails. Without this,
// an early failure (e.g. out of device memory) silently poisons every later
// call and the program prints a meaningless timing.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

// Number of blocks so that there is one thread per output element:
// ceil(MATRIX_SIZE * MATRIX_SIZE / THREAD_NUM).
// (The previous expression multiplied before rounding up the wrong term and
// launched ~25% more blocks than needed.)
const int blocks_num = (MATRIX_SIZE * MATRIX_SIZE + THREAD_NUM - 1) / THREAD_NUM;

void matgen(float *a, int n);

// Fill the n x n row-major matrix `a` with pseudo-random values.
// Each element is the raw rand() result converted to float, so the sequence
// is determined by the seed passed to srand() by the caller.
void matgen(float *a, int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            a[i * n + j] = (float)rand();
            // printf("%f ", a[i * n + j]);
        }
    }
}

// Naive matrix multiplication kernel: c = a * b for n x n row-major matrices.
//
// Launch layout: 1D grid of 1D blocks with at least n * n threads in total;
// each thread computes exactly one element of c. No shared memory is used.
// Threads whose flat index falls past the end of the matrix do nothing.
__global__ static void matMult(const float *a, const float *b, float *c, int n) {
    // Guard the division/modulo below against a degenerate size.
    if (n <= 0) {
        return;
    }

    // Flat index of this thread across the whole grid.
    const int idx = (int)(blockIdx.x * blockDim.x + threadIdx.x);

    // Map the flat index to the output element this thread is responsible for.
    const int row = idx / n;
    const int column = idx % n;

    // column is always < n by construction, so only the row needs checking
    // to reject the surplus threads in the last block.
    if (row >= n) {
        return;
    }

    // Dot product of row `row` of a with column `column` of b.
    float t = 0.0f;
    for (int i = 0; i < n; i++) {
        t += a[row * n + i] * b[i * n + column];
    }
    c[row * n + column] = t;
}

// Generates two random MATRIX_SIZE x MATRIX_SIZE matrices, multiplies them on
// the GPU, and prints the kernel execution time in milliseconds.
int main(void) {
    const int n = MATRIX_SIZE;
    const size_t bytes = sizeof(float) * n * n;
    float elapsedTime = 0;

    cudaEvent_t stop, start;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Allocate host matrices.
    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *c = (float *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Fixed seed so every run multiplies the same matrices.
    srand(0);

    // Generate the input matrices.
    matgen(a, n);
    matgen(b, n);

    // Allocate device matrices and upload the inputs.
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void **)&d_a, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_b, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_c, bytes));
    CUDA_CHECK(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice));

    // Time only the kernel. Both events are recorded in the same stream as the
    // kernel, so no host-side synchronisation is needed between them; waiting
    // on `stop` afterwards guarantees the kernel has finished. (The deprecated
    // cudaThreadSynchronize() that used to sit here added host latency to the
    // measured interval.)
    CUDA_CHECK(cudaEventRecord(start, 0));
    matMult<<<blocks_num, THREAD_NUM, 0>>>(d_a, d_b, d_c, n);
    CUDA_CHECK(cudaGetLastError());  // catches invalid launch configurations
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop));  // also surfaces in-kernel faults
    CUDA_CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("%f\n", elapsedTime);

    // Download the result.
    CUDA_CHECK(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost));
    /*for (int i = 0; i < 100; i++){
        printf("%f ", c[i]);
    }*/

    // Release device and host resources.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free(a);
    free(b);
    free(c);

    return 0;
}