Calling CUDA from C++ (VS2015), calling CUDA from MATLAB, and calling C++ from MATLAB: learning notes

Date: 2022-03-12 12:11:40

Environment: VS2015, MATLAB R2016a, nvcc 9.0

Implemented functionality: multiplication of double matrices (a templated version may be added later if I get the chance)

Attention: the matrices are stored column-major on the CUDA side (matching MATLAB); I am not sure yet whether that can be changed and am still figuring it out.
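For reference on the layout: column-major means that element (row, col) of an m-by-n double matrix sits at offset col*m + row in the flat array, which is what every indexing expression below relies on. A tiny illustration (the helper name element_at is only for this sketch):

#include <cstddef>

// Column-major indexing, as assumed throughout this post:
// element (row, col) of an m-by-n matrix lives at data[col * m + row].
static double element_at(const double *data, std::size_t m, std::size_t row, std::size_t col)
{
    return data[col * m + row];
}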

1. MATLAB calling C++ via mexFunction; the C++ extension uses the Eigen library (version 3.3)

//Matrixs_Multiply.cpp
/* Matrix multiplication MEX function, based on Eigen 3.3.
   MATLAB calls this C++ function, which uses the Eigen library.
   A .m script named function_compiler sets up the environment and builds the function;
   change the paths and libraries there to match your own project. */

#include <Eigen/Dense>
#include "mex.h"
using namespace Eigen;

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // Map the two MATLAB inputs (column-major double matrices) without copying
    int L_rows = (int)mxGetM(prhs[0]);
    int L_cols = (int)mxGetN(prhs[0]);
    Map<MatrixXd> L_mat(mxGetPr(prhs[0]), L_rows, L_cols);

    int R_rows = (int)mxGetM(prhs[1]);
    int R_cols = (int)mxGetN(prhs[1]);
    Map<MatrixXd> R_mat(mxGetPr(prhs[1]), R_rows, R_cols);

    if (L_cols != R_rows)
        mexErrMsgIdAndTxt("Matrixs_Multiply:dims", "Inner matrix dimensions must agree.");

    // Allocate the MATLAB output and compute the product with Eigen
    plhs[0] = mxCreateDoubleMatrix(L_rows, R_cols, mxREAL);
    MatrixXd result = L_mat * R_mat;

    // Copy the result back; both Eigen and MATLAB store doubles column-major
    double *outdata = mxGetPr(plhs[0]);
    for (int i = 0; i < R_cols; i++)
    {
        for (int j = 0; j < L_rows; j++)
        {
            outdata[i * L_rows + j] = result(j, i);
        }
    }
}
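Since MATLAB and Eigen both store doubles column-major, the copy loop at the end can in fact be avoided by mapping the output buffer directly. A minimal sketch of the alternative ending of mexFunction, reusing the variable names above:

// Alternative to the copy loop: map the MATLAB output buffer and let Eigen
// write the product into it directly (both sides are column-major doubles).
Map<MatrixXd> out(mxGetPr(plhs[0]), L_rows, R_cols);
out.noalias() = L_mat * R_mat;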
The .m script MATLAB uses to compile the C++ code is shown below; just change the paths and library names at the marked places.
clear all;
% Get the architecture of this computer
is_64bit = strcmp(computer,'MACI64') || strcmp(computer,'GLNXA64') || strcmp(computer,'PCWIN64');


%----------------------------------------------------------------------------------------------
%% Compiler configuration
% Modify these paths to match your own OpenCV / Eigen installation.
% Notice: if your system is 64-bit, the OpenCV build must be 64-bit as well!
out_dir='./';
CPPFLAGS = ' -O -DNDEBUG -I.\ -IF:\opencv\build\include -IE:\opencv3.3\opencv\build\include -IE:\eigen_higher_version'; % your OpenCV / Eigen "include" paths
LDFLAGS = ' -LE:\opencv3.3\opencv\build\x64\vc14\lib'; % your OpenCV "lib" path
LIBS = ' -lopencv_world330d';
if is_64bit
    CPPFLAGS = [CPPFLAGS ' -largeArrayDims'];
end

% add your files here!!
compile_files = {
%the list of your code files which need to be compiled
'Matrixs_Multiply.cpp'
};
%----------------------------------------------------------------------------------------------

%----------------------------------------------------------------------------------------------
%% compiling
for k = 1 : length(compile_files)
    str = compile_files{k};
    fprintf('compilation of: %s\n', str);
    str = [str ' -outdir ' out_dir CPPFLAGS LDFLAGS LIBS];
    args = regexp(str, '\s+', 'split');
    mex(args{:});
end
fprintf('Congratulations, compilation successful!!!\n');
%----------------------------------------------------------------------------------------------

2. C++ calling CUDA (VS2015)

/* This .cu file computes the product of two matrices; the C++ host code uses CUDA for speed-up.
   Remember to declare the entry point with extern "C" (see CUDA_Matrix_Mul below)
   so that it links cleanly from the C++ caller. */

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <time.h>
#include <stdlib.h>



// Each thread computes one element of the result. All matrices are stored
// column-major (MATLAB layout); the flat thread index is decomposed into the
// (row, col) position of the output element. Blocks/threads are numbered row-major.
__global__ void MatrixMuiOnDevice(double *M, double *N, double *res,
                                  int L_rows, int L_cols, int R_cols)
{
    // Flat global thread index (valid for 1-D and 2-D launch configurations)
    int num = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x * blockDim.y
              + blockDim.x * threadIdx.y + threadIdx.x;
    if (num >= L_rows * R_cols) return;   // guard the tail when the grid is rounded up

    int col = num / L_rows;
    int row = num % L_rows;
    double sum = 0;
    for (int i = 0; i < L_cols; i++)
    {
        sum += M[i * L_rows + row] * N[col * L_cols + i];
    }
    res[num] = sum;
}

extern "C"
int CUDA_Matrix_Mul(const double* a,const double* b, double* c,const int* size)
{
double *M, *N, *P;
//int width = 30;
//int NUM = 900;
//dim3 dimBlock(30, 30);
int width = size[0];
int NUM_a = size[0] * size[1];
int NUM_b = size[2] * size[3];
int NUM_c = size[0] * size[3];
dim3 dimBlock(size[0], size[3]);
cudaEvent_t start, stop;
float elapsedTime;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaMalloc((void**)&M, size[0]*size[1] * sizeof(double));
cudaMalloc((void**)&N, size[2]*size[3] * sizeof(double));
cudaMalloc((void**)&P, size[0]*size[3] * sizeof(double));
cudaMemcpy(M, a, NUM_a * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(N, b, NUM_b * sizeof(double), cudaMemcpyHostToDevice);
cudaEventRecord(start, 0);
int threadsPerBlock = 256;
int blocksPerGrid = (size[0] * size[3] + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
MatrixMuiOnDevice <<<blocksPerGrid, threadsPerBlock >>> (M, N, P, size[0],size[1]);
cudaMemcpy(c, P, NUM_c * sizeof(double), cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("%f\n", elapsedTime);
cudaFree(M);
cudaFree(N);
cudaFree(P);
return 0;
}
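The .cu file above only provides the kernel and the launcher; on the C++ side (a plain .cpp compiled by VS2015) the call might look like the sketch below. The file name main.cpp and the 2x3 / 3x2 test matrices are purely illustrative; the size array follows the { rows(a), cols(a), rows(b), cols(b) } layout that CUDA_Matrix_Mul indexes, and all arrays are column-major.

// main.cpp -- a minimal, hypothetical caller for CUDA_Matrix_Mul
#include <cstdio>

// The declaration must match the definition in the .cu file (hence extern "C")
extern "C" int CUDA_Matrix_Mul(const double *a, const double *b, double *c, const int *size);

int main()
{
    // 2x3 times 3x2, both stored column-major
    const double a[6] = { 1, 4,  2, 5,  3, 6 };       // A = [1 2 3; 4 5 6]
    const double b[6] = { 7, 8, 9,  10, 11, 12 };     // B = [7 10; 8 11; 9 12]
    double c[4] = { 0 };                              // 2x2 result
    const int size[4] = { 2, 3, 3, 2 };               // rows(a), cols(a), rows(b), cols(b)

    CUDA_Matrix_Mul(a, b, c, size);

    // Print the result row by row: expected [50 68; 122 167]
    for (int row = 0; row < 2; ++row)
        printf("%f %f\n", c[0 * 2 + row], c[1 * 2 + row]);
    return 0;
}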


3. MATLAB calling CUDA to compute the matrix product. The kernel is the same as in section 2; here it is wrapped directly inside a mexFunction so that MATLAB can call it as a CUDA MEX file.

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <time.h>
#include <stdlib.h>
#include "mex.h"
#include "gpu/mxGPUArray.h"

// Same kernel as in section 2: one thread per output element, matrices stored
// column-major, flat thread index decomposed into (row, col) of the output.
__global__ void matrix_mul(double *M, double *N, double *res,
                           int L_rows, int L_cols, int R_cols)
{
    int num = (gridDim.x * blockIdx.y + blockIdx.x) * blockDim.x * blockDim.y
              + blockDim.x * threadIdx.y + threadIdx.x;
    if (num >= L_rows * R_cols) return;   // guard the rounded-up tail

    int col = num / L_rows;
    int row = num % L_rows;
    double sum = 0;
    for (int i = 0; i < L_cols; i++)
    {
        sum += M[i * L_rows + row] * N[col * L_cols + i];
    }
    res[num] = sum;
}


void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    double *M, *N, *res_d;   // device pointers
    double *L_mat, *R_mat;   // host pointers handed over by MATLAB

    int L_rows = (int)mxGetM(prhs[0]);
    int L_cols = (int)mxGetN(prhs[0]);
    L_mat = mxGetPr(prhs[0]);

    int R_rows = (int)mxGetM(prhs[1]);
    int R_cols = (int)mxGetN(prhs[1]);
    R_mat = mxGetPr(prhs[1]);

    plhs[0] = mxCreateDoubleMatrix(L_rows, R_cols, mxREAL);

    // Allocate device memory and copy both inputs (column-major) to the GPU
    cudaMalloc((void**)&M, L_rows * L_cols * sizeof(double));
    cudaMalloc((void**)&N, R_rows * R_cols * sizeof(double));
    cudaMalloc((void**)&res_d, L_rows * R_cols * sizeof(double));
    cudaMemcpy(M, L_mat, L_rows * L_cols * sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(N, R_mat, R_rows * R_cols * sizeof(double), cudaMemcpyHostToDevice);

    // One thread per output element, 1-D launch rounded up to a whole block
    int threadsPerBlock = 256;
    int blocksPerGrid = (L_rows * R_cols + threadsPerBlock - 1) / threadsPerBlock;
    matrix_mul<<<blocksPerGrid, threadsPerBlock>>>(M, N, res_d, L_rows, L_cols, R_cols);
    cudaDeviceSynchronize();

    // Copy the result straight into the MATLAB output array
    cudaMemcpy(mxGetPr(plhs[0]), res_d, L_rows * R_cols * sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(M);
    cudaFree(N);
    cudaFree(res_d);
}
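None of the CUDA calls above check their return codes, so inside a MEX file a failed cudaMalloc or cudaMemcpy would just produce a wrong result silently. A small checking helper along these lines (the macro name CUDA_CHECK is my own, not part of CUDA) could wrap each call:

// Hypothetical helper: abort the MEX call with a readable message if a CUDA API call fails
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err__ = (call);                                           \
        if (err__ != cudaSuccess)                                             \
            mexErrMsgIdAndTxt("matrix_mul:cuda", "%s failed: %s",             \
                              #call, cudaGetErrorString(err__));              \
    } while (0)

// Usage inside mexFunction, e.g.:
//     CUDA_CHECK(cudaMalloc((void**)&M, L_rows * L_cols * sizeof(double)));
//     CUDA_CHECK(cudaMemcpy(M, L_mat, L_rows * L_cols * sizeof(double), cudaMemcpyHostToDevice));

Note that mexErrMsgIdAndTxt does not return, so any device memory already allocated before the failing call would leak; for a quick experiment that is usually acceptable.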