opencl fft

时间:2024-02-18 12:37:15

       最近在做并行计算, 应用的是典型的计算快速傅立叶变换 FFT, 程序设计的环境是 Window7, GTX 660ti  使用的软件操作是  CUDA 6.0, OpenCL1.1 , VC2005

       现做规模为 2048 的 FFT 做512 次, 这样得到的结果用 matlab 来衡量程序计算结果的正确性。 在保证正确性的条件下, 测量程序中 512次规模为2048 FFT 计算时间,

先用CUDA cufft计算, 在用OpenCL采用两种方法计算, 一种是普通的并行蝶形计算方法, 另外一种择采用矩阵 FFT算法, 通过上述比较CUDA 与 OpenCL 的运行时间及计算精度

  1 #include "cuda_runtime.h"
  2 #include "device_launch_parameters.h"
  3 #include <stdio.h>
  4 #include "cufft.h"
  5 
  6 #define Nx 2048        // FFT
  7 #define Ny 512         // FFT총수
  8 #define pi 3.1415926   
  9 
 10 int main(void)
 11 {
 12    Mytime g_timer;
 13 
 14     float *h_fk;
 15     h_fk =(float*)malloc(sizeof(float)*Nx*Ny); 
 16     for (int j=0; j< Nx*Ny; j++)
 17     {
 18           h_fk[j] = j%Nx;
 19      }
 20 
 21 #if 0
 22     FILE *fid_h_fk;
 23     fid_h_fk = fopen("h_fk.txt","wt");
 24     if(fid_h_fk==NULL)
 25     {
 26         //cout<<"error"<<endl;
 27         printf( "Error : file open failed \n " );
 28         system("pause");
 29     }
 30 
 31    for (int j = 0; j < Nx; j++)
 32     {
 33        fprintf(fid_h_fk, "%f \n" , h_fk[j]);
 34     }
 35     fclose(fid_h_fk);
 36 #endif
 37 /////////////////////////////////////////////////////////////////////////
 38 /////////cufft
 39     int i,j;
 40     cufftComplex *h_hatCH;
 41     h_hatCH=(cufftComplex*)malloc(sizeof(cufftComplex)*(Nx/2+1)*Ny);
 42 
 43     cufftReal *d_fk;
 44     cufftComplex *d_cufft_CH;
 45  
 46     cudaMalloc((void**)&d_fk, Nx * Ny * sizeof(cufftReal));
 47     cudaMalloc((void**)&d_cufft_CH, (Nx/2+1) * Ny * sizeof(cufftComplex));
 48  
 49     cufftHandle plan;
 50 
 51     cufftPlan1d(&plan, Nx, CUFFT_R2C,Ny);             // Real float To Complex
 52 
 53     g_timer.Start( &g_timer );
 54 
 55     cufftExecR2C(plan, (cufftReal*)d_fk, (cufftComplex*)d_cufft_CH);
 56     
 57      cudaMemcpy(d_fk, h_fk, Nx * Ny * sizeof(cufftReal), cudaMemcpyHostToDevice);
 58 
 59 ////////////////////////////////////////////////////////////////////////////////////
 60 ///////////  d_cufft_CH output
 61 
 62    cudaMemcpy(h_hatCH, d_cufft_CH, sizeof(cufftComplex)*(Nx/2+1)*Ny, cudaMemcpyDeviceToHost);   
 63 
 64      gfFrametime = g_timer.End( &g_timer );
 65     printf("Time = %f Sec \n", gfFrametime);
 66 #if 0
 67     FILE *fid_h_hatCH;
 68     fid_h_hatCH = fopen("h_hatCH1.txt","wt");
 69     if(fid_h_hatCH==NULL)
 70     {
 71         printf( "Error : file open failed \n " );
 72         system("pause");
 73     }
 74 
 75     for (int j = 0; j < (Nx/2+1)*Ny; j++)
 76     {
 77        if (h_hatCH[j].y == 0)
 78        fprintf(fid_h_hatCH, "%f\n" , h_hatCH[j].x);
 79        else if (h_hatCH[j].y > 0)
 80         fprintf(fid_h_hatCH, "%f+%fi\n" , h_hatCH[j].x ,h_hatCH[j].y );
 81             else fprintf(fid_h_hatCH, "%f%fi\n" , h_hatCH[j].x ,h_hatCH[j].y );
 82     }
 83     fclose(fid_h_hatCH);
 84 #endif
 85 
 86     FILE *fid_h_hatCH;
 87     fid_h_hatCH = fopen("h_hatCH1.txt","wt");
 88     if(fid_h_hatCH==NULL)
 89     {
 90         printf( "Error : file open failed \n " );
 91         system("pause");
 92     }
 93 
 94     for (int j = 0; j < Nx/2; j++)
 95     {
 96        fprintf(fid_h_hatCH, "%f\n" , sqrt(pow(h_hatCH[j].x,2)+pow(h_hatCH[j].y,2)));
 97     }
 98     fclose(fid_h_hatCH);
 99 
100     cufftDestroy(plan);
101     cudaFree(d_fk);
102     cudaFree(d_cufft_CH);
103     free(h_hatCH);
104     free(h_fk);
105 
106     return 0;
107 
108 }

运行时间为:  0.003841 sec

如果用 OpenCL 蝶形运算方法,  Device kernel 源码为

 1 #define PI 3.14159265358979323846
 2 #define PI_2 1.57079632679489661923
 3 
 4 /* 회전인자 생성 */
 5 __kernel void spinFact(__global float2* w, int n)
 6 {
 7     unsigned int i = get_global_id(0);
 8 
 9     float2 angle = (float2)(2*i*PI/(float)n, (2*i*PI/(float)n)+PI_2);
10     w[i] = cos(angle);
11 }
12 
13 /* 비트 역순 */
14 __kernel void bitReverse(__global float2 *dst, __global float2 *src, int m, int n)
15 {
16     unsigned int gid = get_global_id(0);
17     unsigned int nid = get_global_id(1);
18 
19     unsigned int j = gid;
20     j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1;
21     j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2;
22     j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4;
23     j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8;
24     j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16;
25 
26     j >>= (32-m);
27 
28     dst[nid*n+j] = src[nid*n+gid];
29 }
30 
31 /* 버터플라이 연산 */
32 __kernel void butterfly(__global float2 *x, __global float2* w, int m, int n, int iter, uint flag)
33 {
34     unsigned int gid = get_global_id(0);
35     unsigned int nid = get_global_id(1);
36 
37     int butterflySize = 1 << (iter-1);
38     int butterflyGrpDist = 1 << iter;
39     int butterflyGrpNum = n >> iter;
40     int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist);
41     int butterflyGrpOffset = gid & (butterflySize-1);
42 
43     int a = nid * n + butterflyGrpBase + butterflyGrpOffset;
44     int b = a + butterflySize;
45 
46     int l = butterflyGrpNum * butterflyGrpOffset;
47 
48     float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb;
49 
50     xa = x[a];
51     xb = x[b];
52     xbxx = xb.xx;
53     xbyy = xb.yy;
54 
55     wab = as_float2(as_uint2(w[l]) ^ (uint2)(0x0, flag));
56     wayx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x80000000, 0x0));
57     wbyx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x0, 0x80000000));
58 
59     resa = xa + xbxx*wab + xbyy*wayx;
60     resb = xa - xbxx*wab + xbyy*wbyx;
61 
62     x[a] = resa;
63     x[b] = resb;
64 }

 计算时间为 0.060381 sec
 采用矩阵算法的源码为

 1 ///////////////////////////////////////////////////////////////////////////////
 2 //! Matrix multiplication on the device: C = A * B
 3 //! uiWA is A\'s width and uiWB is B\'s width
 4 ////////////////////////////////////////////////////////////////////////////////
 5 __kernel void
 6 matrixMul( __global float* C, __global float* A, __global float2* B, 
 7        __local float* As, __local float* Bsr,__local float* Bsi,  int uiWA, int uiWB)
 8 {
 9     // Block index
10     int bx = get_group_id(0);
11     int by = get_group_id(1);
12 
13     // Thread index
14     int tx = get_local_id(0);
15     int ty = get_local_id(1);
16 
17     // Index of the first sub-matrix of A processed by the block
18     int aBegin = uiWA * BLOCK_SIZE * by;
19 
20     // Index of the last sub-matrix of A processed by the block
21     int aEnd   = aBegin + uiWA - 1;
22 
23     // Step size used to iterate through the sub-matrices of A
24     int aStep  = BLOCK_SIZE;
25 
26     // Index of the first sub-matrix of B processed by the block
27     int bBegin = BLOCK_SIZE * bx;
28 
29     // Step size used to iterate through the sub-matrices of B
30     int bStep  = BLOCK_SIZE * uiWB;
31 
32     // Csub is used to store the element of the block sub-matrix
33     // that is computed by the thread
34     float2 Csub =(float2)(0.0f, 0.0f);
35 
36     // Loop over all the sub-matrices of A and B
37     // required to compute the block sub-matrix
38     for (int a = aBegin, b = bBegin;
39              a <= aEnd;
40              a += aStep, b += bStep) {
41 
42         // Load the matrices from device memory
43         // to shared memory; each thread loads
44         // one element of each matrix
45         AS(ty, tx) = A[a + uiWA * ty + tx];
46         BSr(ty, tx) = B[b + uiWB * ty + tx].x;
47         BSi(ty, tx) = B[b + uiWB * ty + tx].y;
48 
49         // Synchronize to make sure the matrices are loaded
50         barrier(CLK_LOCAL_MEM_FENCE);
51 
52         // Multiply the two matrices together;
53         // each thread computes one element
54         // of the block sub-matrix        
55         #pragma unroll
56         for (int k = 0; k < BLOCK_SIZE; ++k)
57             Csub +=(float2)(AS(ty, k)*BSr(k, tx),AS(ty, k)* BSi(k, tx));
58 
59         // Synchronize to make sure that the preceding
60         // computation is done before loading two new
61         // sub-matrices of A and B in the next iteration
62         barrier(CLK_LOCAL_MEM_FENCE);
63     }
64 
65     // Write the block sub-matrix to device memory;
66     // each thread writes one element
67     C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = 10*log10(Csub.x*Csub.x + Csub.y * Csub.y) ;
68 
69 }

 计算时间为 0.024020 sec

 精确度上 OpenCL明显优于 CUDA