MKL Tutorial
Installation
Register on the Intel website and download MKL: https://software.intel.com/en-us/mkl
Installation on Linux:
wget http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/14895/l_mkl_2019.1.144.tgz
tar -zxvf l_mkl_2019.1.144.tgz
cd l_mkl_2019.1.144/
./install.sh
sudo vim /etc/ld.so.conf.d/intel-mkl.conf # add the two library paths below (here /path stands for your install prefix)
/path/intel/mkl/lib/intel64
/path/intel/lib/intel64
sudo ldconfig
cd /path/intel/mkl/bin
source mklvars.sh intel64 # https://software.intel.com/en-us/mkl-linux-developer-guide-scripts-to-set-environment-variables
vim dgemm_example.c # paste in the dgemm_example.c source from the Examples section below
gcc -o run_dgemm_example dgemm_example.c -lmkl_rt
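Before moving on to the examples, it is worth checking that both the MKL headers and libmkl_rt are actually found. The following is a minimal sketch (the file name mkl_version_check.c is just an illustration, not part of Intel's samples); it prints the library version string using the support function mkl_get_version_string:

/* mkl_version_check.c - sanity check that mkl.h and libmkl_rt are found */
#include <stdio.h>
#include "mkl.h"

int main()
{
    char buf[256];
    /* Fill buf with a version string such as "Intel(R) Math Kernel Library Version 2019 ..." */
    mkl_get_version_string(buf, (int)sizeof(buf));
    printf("%s\n", buf);
    return 0;
}

gcc -o mkl_version_check mkl_version_check.c -lmkl_rt
./mkl_version_check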
Examples
Getting-started tutorial (C): https://software.intel.com/en-us/mkl-tutorial-c-overview
All code samples: https://software.intel.com/en-us/product-code-samples
wget https://software.intel.com/sites/default/files/ipsxe2019_samples_lin_20180731.tgz
mkdir ipsxe2019_samples_lin_20180731
tar -zxvf ipsxe2019_samples_lin_20180731.tgz -C ipsxe2019_samples_lin_20180731
(1) Source: dgemm_example.c
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

#define min(x,y) (((x) < (y)) ? (x) : (y))

int main()
{
    double *A, *B, *C;
    int m, n, k, i, j;
    double alpha, beta;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel(R) MKL function dgemm, where A, B, and C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 2000, k = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*k*sizeof( double ), 64 );
    B = (double *)mkl_malloc( k*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*k); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (k*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k, alpha, A, k, B, n, beta, C, n);
    printf ("\n Computations completed.\n\n");

    printf (" Top left corner of matrix A: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(k,6); j++) {
            printf ("%12.0f", A[j+i*k]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix B: \n");
    for (i=0; i<min(k,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.0f", B[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    printf (" Example completed. \n\n");
    return 0;
}
The output is as follows:
$ ./run_dgemm_example
This example computes real matrix C=alpha*A*B+beta*C using
Intel(R) MKL function dgemm, where A, B, and C are matrices and
alpha and beta are double precision scalars
Initializing data for matrix multiplication C=A*B for matrix
A(2000x200) and matrix B(200x1000)
Allocating memory for matrices aligned on 64-byte boundary for better
performance
Intializing matrix data
Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface
Computations completed.
Top left corner of matrix A:
1 2 3 4 5 6
201 202 203 204 205 206
401 402 403 404 405 406
601 602 603 604 605 606
801 802 803 804 805 806
1001 1002 1003 1004 1005 1006
Top left corner of matrix B:
-1 -2 -3 -4 -5 -6
-1001 -1002 -1003 -1004 -1005 -1006
-2001 -2002 -2003 -2004 -2005 -2006
-3001 -3002 -3003 -3004 -3005 -3006
-4001 -4002 -4003 -4004 -4005 -4006
-5001 -5002 -5003 -5004 -5005 -5006
Top left corner of matrix C:
-2.6666E+09 -2.6666E+09 -2.6667E+09 -2.6667E+09 -2.6667E+09 -2.6667E+09
-6.6467E+09 -6.6467E+09 -6.6468E+09 -6.6468E+09 -6.6469E+09 -6.647E+09
-1.0627E+10 -1.0627E+10 -1.0627E+10 -1.0627E+10 -1.0627E+10 -1.0627E+10
-1.4607E+10 -1.4607E+10 -1.4607E+10 -1.4607E+10 -1.4607E+10 -1.4607E+10
-1.8587E+10 -1.8587E+10 -1.8587E+10 -1.8587E+10 -1.8588E+10 -1.8588E+10
-2.2567E+10 -2.2567E+10 -2.2567E+10 -2.2567E+10 -2.2568E+10 -2.2568E+10
Deallocating memory
Example completed.
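A note on the cblas_dgemm call in this example: the matrices are stored row by row, so the code passes CblasRowMajor and uses the row lengths k, n and n as the leading dimensions of A, B and C. The same product can also be computed through the column-major interface, because a row-major m x n matrix occupies the same memory as a column-major n x m matrix and C^T = B^T * A^T. The call below is a sketch shown only to illustrate the layout and leading-dimension arguments (it is not part of Intel's sample, but with the same A, B, C, m, n, k, alpha and beta it should produce the same C):

/* Equivalent column-major call: row-major C (m x n, ldc = n) is the same memory
   as column-major C^T (n x m), and C^T = B^T * A^T, so A and B (and m and n)
   are swapped while the leading dimensions stay n, k and n. */
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
            n, m, k, alpha, B, n, A, k, beta, C, n);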
(2) Source: dgemm_with_timing.c
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10

int main()
{
    double *A, *B, *C;
    int m, n, p, i, r;
    double alpha, beta;
    double s_initial, s_elapsed;

    printf ("\n This example measures performance of Intel(R) MKL function dgemm \n"
            " computing real matrix C=alpha*A*B+beta*C, where A, B, and C \n"
            " are matrices and alpha and beta are double precision scalars\n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Making the first run of matrix product using Intel(R) MKL dgemm function \n"
            " via CBLAS interface to get stable run time measurements \n\n");
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, p, alpha, A, p, B, n, beta, C, n);

    printf (" Measuring performance of matrix product using Intel(R) MKL dgemm function \n"
            " via CBLAS interface \n\n");
    s_initial = dsecnd();
    for (r = 0; r < LOOP_COUNT; r++) {
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, p, alpha, A, p, B, n, beta, C, n);
    }
    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

    printf (" == Matrix multiplication using Intel(R) MKL dgemm completed == \n"
            " == at %.5f milliseconds == \n\n", (s_elapsed * 1000));

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    if (s_elapsed < 0.9/LOOP_COUNT) {
        s_elapsed=1.0/LOOP_COUNT/s_elapsed;
        i=(int)(s_elapsed*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", i);
    }

    printf (" Example completed. \n\n");
    return 0;
}
The output is as follows:
$ ./run_dgemm_with_timing
This example measures performance of Intel(R) MKL function dgemm
computing real matrix C=alpha*A*B+beta*C, where A, B, and C
are matrices and alpha and beta are double precision scalars
Initializing data for matrix multiplication C=A*B for matrix
A(2000x200) and matrix B(200x1000)
Allocating memory for matrices aligned on 64-byte boundary for better
performance
Intializing matrix data
Making the first run of matrix product using Intel(R) MKL dgemm function
via CBLAS interface to get stable run time measurements
Measuring performance of matrix product using Intel(R) MKL dgemm function
via CBLAS interface
== Matrix multiplication using Intel(R) MKL dgemm completed ==
== at 4.53907 milliseconds ==
Deallocating memory
It is highly recommended to define LOOP_COUNT for this example on your
computer as 221 to have total execution time about 1 second for reliability
of measurements
Example completed.
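The measured time can also be expressed as throughput. One C = alpha*A*B + beta*C of size (m x p) times (p x n) costs about 2*m*p*n = 2*2000*200*1000 = 8*10^8 floating-point operations, so the 4.53907 ms average above corresponds to roughly 176 GFLOP/s on the machine used here. As a sketch (not part of Intel's sample), the conversion could be printed right after the timing loop, reusing m, p, n and s_elapsed from the code above:

/* Illustrative addition: report the average dgemm throughput.
   2*m*p*n flops per product, s_elapsed seconds per product on average. */
double gflops = (2.0 * m * p * n) / (s_elapsed * 1.0e9);
printf (" Approximate dgemm throughput: %.1f GFLOP/s \n\n", gflops);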
(3) Source: dgemm_threading_effect_example.c
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10

int main()
{
    double *A, *B, *C;
    int m, n, p, i, j, r, max_threads;
    double alpha, beta;
    double s_initial, s_elapsed;

    printf ("\n This example demonstrates threading impact on computing real matrix product \n"
            " C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are \n"
            " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Finding max number of threads Intel(R) MKL can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf (" Running Intel(R) MKL from 1 to %i threads \n\n", max_threads);
    for (i = 1; i <= max_threads; i++) {
        for (j = 0; j < (m*n); j++)
            C[j] = 0.0;

        printf (" Requesting Intel(R) MKL to use %i thread(s) \n\n", i);
        mkl_set_num_threads(i);

        printf (" Making the first run of matrix product using Intel(R) MKL dgemm function \n"
                " via CBLAS interface to get stable run time measurements \n\n");
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, p, alpha, A, p, B, n, beta, C, n);

        printf (" Measuring performance of matrix product using Intel(R) MKL dgemm function \n"
                " via CBLAS interface on %i thread(s) \n\n", i);
        s_initial = dsecnd();
        for (r = 0; r < LOOP_COUNT; r++) {
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        m, n, p, alpha, A, p, B, n, beta, C, n);
        }
        s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

        printf (" == Matrix multiplication using Intel(R) MKL dgemm completed ==\n"
                " == at %.5f milliseconds using %d thread(s) ==\n\n", (s_elapsed * 1000), i);
    }

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    if (s_elapsed < 0.9/LOOP_COUNT) {
        s_elapsed=1.0/LOOP_COUNT/s_elapsed;
        i=(int)(s_elapsed*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", i);
    }

    printf (" Example completed. \n\n");
    return 0;
}
The output is as follows:
$ ./run_dgemm_threading_effect_example
This example demonstrates threading impact on computing real matrix product
C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are
matrices and alpha and beta are double precision scalars
Initializing data for matrix multiplication C=A*B for matrix
A(2000x200) and matrix B(200x1000)
Allocating memory for matrices aligned on 64-byte boundary for better
performance
Intializing matrix data
Finding max number of threads Intel(R) MKL can use for parallel runs
Running Intel(R) MKL from 1 to 4 threads
Requesting Intel(R) MKL to use 1 thread(s)
Making the first run of matrix product using Intel(R) MKL dgemm function
via CBLAS interface to get stable run time measurements
Measuring performance of matrix product using Intel(R) MKL dgemm function
via CBLAS interface on 1 thread(s)
== Matrix multiplication using Intel(R) MKL dgemm completed ==
== at 15.35492 milliseconds using 1 thread(s) ==
Requesting Intel(R) MKL to use 2 thread(s)
Making the first run of matrix product using Intel(R) MKL dgemm function
via CBLAS interface to get stable run time measurements
Measuring performance of matrix product using Intel(R) MKL dgemm function
via CBLAS interface on 2 thread(s)
== Matrix multiplication using Intel(R) MKL dgemm completed ==
== at 7.89382 milliseconds using 2 thread(s) ==
Requesting Intel(R) MKL to use 3 thread(s)
Making the first run of matrix product using Intel(R) MKL dgemm function
via CBLAS interface to get stable run time measurements
Measuring performance of matrix product using Intel(R) MKL dgemm function
via CBLAS interface on 3 thread(s)
== Matrix multiplication using Intel(R) MKL dgemm completed ==
== at 5.45093 milliseconds using 3 thread(s) ==
Requesting Intel(R) MKL to use 4 thread(s)
Making the first run of matrix product using Intel(R) MKL dgemm function
via CBLAS interface to get stable run time measurements
Measuring performance of matrix product using Intel(R) MKL dgemm function
via CBLAS interface on 4 thread(s)
== Matrix multiplication using Intel(R) MKL dgemm completed ==
== at 4.42070 milliseconds using 4 thread(s) ==
Deallocating memory
It is highly recommended to define LOOP_COUNT for this example on your
computer as 227 to have total execution time about 1 second for reliability
of measurements
Example completed.
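In the run above the time drops from 15.35 ms on one thread to 4.42 ms on four, a speed-up of roughly 3.5x. Besides calling mkl_set_num_threads in the code, the thread count can be set from the shell through the MKL_NUM_THREADS (or OMP_NUM_THREADS) environment variable without recompiling. For per-call control inside a program, MKL also provides mkl_set_num_threads_local; the snippet below is only a sketch using the variables from the sample above, not part of Intel's example:

/* Illustrative only: run one dgemm on a single thread, then restore the
   previous thread setting. mkl_set_num_threads_local returns the old
   thread-local value (0 means "follow the global setting"). */
int saved = mkl_set_num_threads_local(1);
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
            m, n, p, alpha, A, p, B, n, beta, C, n);
mkl_set_num_threads_local(saved);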
(4) Source: matrix_multiplication.c
#define min(x,y) (((x) < (y)) ? (x) : (y))

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10

int main()
{
    double *A, *B, *C;
    int m, n, p, i, j, k, r;
    double alpha, beta;
    double sum;
    double s_initial, s_elapsed;

    printf ("\n This example measures performance of rcomputing the real matrix product \n"
            " C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are \n"
            " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Making the first run of matrix product using triple nested loop\n"
            " to get stable run time measurements \n\n");
    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            sum = 0.0;
            for (k = 0; k < p; k++)
                sum += A[p*i+k] * B[n*k+j];
            C[n*i+j] = sum;
        }
    }

    printf (" Measuring performance of matrix product using triple nested loop \n\n");
    s_initial = dsecnd();
    for (r = 0; r < LOOP_COUNT; r++) {
        for (i = 0; i < m; i++) {
            for (j = 0; j < n; j++) {
                sum = 0.0;
                for (k = 0; k < p; k++)
                    sum += A[p*i+k] * B[n*k+j];
                C[n*i+j] = sum;
            }
        }
    }
    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

    printf (" == Matrix multiplication using triple nested loop completed == \n"
            " == at %.5f milliseconds == \n\n", (s_elapsed * 1000));

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    if (s_elapsed < 0.9/LOOP_COUNT) {
        s_elapsed=1.0/LOOP_COUNT/s_elapsed;
        i=(int)(s_elapsed*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", i);
    }

    printf (" Example completed. \n\n");
    return 0;
}
The output is as follows:
$ ./run_matrix_multiplication
This example measures performance of rcomputing the real matrix product
C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are
matrices and alpha and beta are double precision scalars
Initializing data for matrix multiplication C=A*B for matrix
A(2000x200) and matrix B(200x1000)
Allocating memory for matrices aligned on 64-byte boundary for better
performance
Intializing matrix data
Making the first run of matrix product using triple nested loop
to get stable run time measurements
Measuring performance of matrix product using triple nested loop
== Matrix multiplication using triple nested loop completed ==
== at 1408.21425 milliseconds ==
Deallocating memory
Example completed.
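For comparison with the MKL runs: using the same 2*m*p*n = 8*10^8 flop count as before, the triple nested loop reaches about 8*10^8 / 1.408 s ≈ 0.57 GFLOP/s, while the dgemm runs in examples (2) and (3) came in around 176-181 GFLOP/s, a gap on the order of 300x on the machine used here. Exact figures will of course vary with hardware, compiler flags and thread count.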