As described in Table 2 of the CUDA C Programming Guide (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions), the number of operations per clock cycle per multiprocessor is 128 for 32-bit floating-point add but only 4 for 64-bit floating-point add; that is, 64-bit floating-point add should be 32 times slower.
However, when I used the following code to test the speed difference, the double version was at most 2 times slower than the float version (this does not change much even with the compilation flag --device-debug). Does anyone know the reason?
#include <stdio.h>
#include <cuda_runtime.h>

#define N 100000000
typedef double Real;

// Device code
__global__ void VecAdd(Real* A, Real* B, Real* C)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

// Host code
int main()
{
    size_t size = N * sizeof(Real);

    // Allocate input vectors h_A and h_B in host memory
    Real* h_A = (Real*)malloc(size);
    Real* h_B = (Real*)malloc(size);
    Real* h_C = (Real*)malloc(size);

    // Initialize input vectors
    for (int i = 0; i < N; i++)
    {
        h_A[i] = 1.0f + i * 0.1f;
        h_B[i] = 100.0f + i * 0.1f;
    }

    // Allocate vectors in device memory
    Real* d_A;
    cudaMalloc(&d_A, size);
    Real* d_B;
    cudaMalloc(&d_B, size);
    Real* d_C;
    cudaMalloc(&d_C, size);

    // Copy vectors from host memory to device memory
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Time measurement starts
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaEventSynchronize(start);
    for (int i = 0; i < 10000; i++)
    {
        VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    }

    // Time measurement ends
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Time to generate: %3.8f ms\n", elapsedTime);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
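To check whether these launches are limited by memory traffic rather than arithmetic, one could append a rough effective-bandwidth estimate after the timing code. This is a sketch only; it reuses elapsedTime, N, and Real from the listing above, and the ~320 GB/s peak of a GTX 1080 is just a reference figure:

// Sketch: effective bandwidth over the 10000 timed launches.
// Each launch reads A and B and writes C, i.e. 3 * N * sizeof(Real) bytes.
double seconds    = elapsedTime / 1000.0;  // cudaEventElapsedTime is in ms
double bytesMoved = 10000.0 * 3.0 * (double)N * sizeof(Real);
printf("Effective bandwidth: %.1f GB/s\n", bytesMoved / seconds / 1e9);
// A result near the card's theoretical peak (roughly 320 GB/s on a
// GTX 1080) would indicate a memory-bound kernel.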
I use Visual Studio 2013 with CUDA Toolkit 8.0; my system is 64-bit Windows 10 with a GeForce 1080, driver version 372.90.
Edit: After reading the answer from @talonmies, I changed N to 1000 and changed the kernel function to:
__global__ void VecAdd(Real* A, Real* B, Real* C)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        Real a = A[i];
        Real b = B[i];
        Real c = 0.0f;
        for (int j = 0; j < 100000; j++)
        {
            c += (a + b);
        }
        C[i] = c;
    }
}
Now the float version (3700 ms) is about 10 times faster than the double version (38570 ms); however, that is still far from the theoretical factor of 32. Could anybody explain that? P.S. This is without the flag --device-debug, since with it the float version is much slower and, again, only at most two times faster than the double version.
Edit: @einpoklum, here is the PTX file. I am not sure how to interpret it, but I think the loop in the kernel is not optimized away by nvcc, because if I set the kernel loop count to 10000 for the float version, the elapsed time becomes 390 ms, about one tenth of the time for a loop count of 100000.
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20732876
// Cuda compilation tools, release 8.0, V8.0.26
// Based on LLVM 3.4svn
//
.version 5.0
.target sm_20
.address_size 32
// .globl _Z6VecAddPfS_S_
.visible .entry _Z6VecAddPfS_S_(
.param .u32 _Z6VecAddPfS_S__param_0,
.param .u32 _Z6VecAddPfS_S__param_1,
.param .u32 _Z6VecAddPfS_S__param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<57>;
.reg .b32 %r<20>;
ld.param.u32 %r5, [_Z6VecAddPfS_S__param_0];
ld.param.u32 %r6, [_Z6VecAddPfS_S__param_1];
ld.param.u32 %r7, [_Z6VecAddPfS_S__param_2];
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r8, %r9, %r10;
setp.gt.s32 %p1, %r1, 999;
@%p1 bra BB0_4;
cvta.to.global.u32 %r2, %r7;
cvta.to.global.u32 %r12, %r5;
shl.b32 %r13, %r1, 2;
add.s32 %r14, %r12, %r13;
cvta.to.global.u32 %r15, %r6;
add.s32 %r16, %r15, %r13;
ld.global.f32 %f5, [%r16];
ld.global.f32 %f6, [%r14];
add.f32 %f1, %f6, %f5;
mov.f32 %f56, 0f00000000;
mov.u32 %r19, 100000;
BB0_2:
add.f32 %f7, %f1, %f56;
add.f32 %f8, %f1, %f7;
add.f32 %f9, %f1, %f8;
add.f32 %f10, %f1, %f9;
add.f32 %f11, %f1, %f10;
add.f32 %f12, %f1, %f11;
add.f32 %f13, %f1, %f12;
add.f32 %f14, %f1, %f13;
add.f32 %f15, %f1, %f14;
add.f32 %f16, %f1, %f15;
add.f32 %f17, %f1, %f16;
add.f32 %f18, %f1, %f17;
add.f32 %f19, %f1, %f18;
add.f32 %f20, %f1, %f19;
add.f32 %f21, %f1, %f20;
add.f32 %f22, %f1, %f21;
add.f32 %f23, %f1, %f22;
add.f32 %f24, %f1, %f23;
add.f32 %f25, %f1, %f24;
add.f32 %f26, %f1, %f25;
add.f32 %f27, %f1, %f26;
add.f32 %f28, %f1, %f27;
add.f32 %f29, %f1, %f28;
add.f32 %f30, %f1, %f29;
add.f32 %f31, %f1, %f30;
add.f32 %f32, %f1, %f31;
add.f32 %f33, %f1, %f32;
add.f32 %f34, %f1, %f33;
add.f32 %f35, %f1, %f34;
add.f32 %f36, %f1, %f35;
add.f32 %f37, %f1, %f36;
add.f32 %f38, %f1, %f37;
add.f32 %f39, %f1, %f38;
add.f32 %f40, %f1, %f39;
add.f32 %f41, %f1, %f40;
add.f32 %f42, %f1, %f41;
add.f32 %f43, %f1, %f42;
add.f32 %f44, %f1, %f43;
add.f32 %f45, %f1, %f44;
add.f32 %f46, %f1, %f45;
add.f32 %f47, %f1, %f46;
add.f32 %f48, %f1, %f47;
add.f32 %f49, %f1, %f48;
add.f32 %f50, %f1, %f49;
add.f32 %f51, %f1, %f50;
add.f32 %f52, %f1, %f51;
add.f32 %f53, %f1, %f52;
add.f32 %f54, %f1, %f53;
add.f32 %f55, %f1, %f54;
add.f32 %f56, %f1, %f55;
add.s32 %r19, %r19, -50;
setp.ne.s32 %p2, %r19, 0;
@%p2 bra BB0_2;
add.s32 %r18, %r2, %r13;
st.global.f32 [%r18], %f56;
BB0_4:
ret;
}
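For what it is worth, the PTX shows that nvcc unrolled the loop by a factor of 50 (the counter %r19 is decremented by 50 per trip), and that every add.f32 consumes the result of the previous one, so each thread executes a single dependent chain of adds. If that latency, rather than raw throughput, is what keeps the measured ratio at 10 instead of 32, a variant with several independent accumulators might get closer to the theoretical figure. This is a speculative sketch, not something I have verified:

// Speculative sketch: four independent accumulators so the adds do not
// form one dependent chain per thread (assumes latency, not throughput,
// hides the 32x single/double gap). VecAddILP is an illustrative name.
__global__ void VecAddILP(Real* A, Real* B, Real* C)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        Real a = A[i];
        Real b = B[i];
        Real c0 = 0.0f, c1 = 0.0f, c2 = 0.0f, c3 = 0.0f;
        for (int j = 0; j < 25000; j++)  // 4 * 25000 = 100000 adds, as before
        {
            c0 += (a + b);
            c1 += (a + b);
            c2 += (a + b);
            c3 += (a + b);
        }
        C[i] = c0 + c1 + c2 + c3;
    }
}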
1 Answer
The kernel code you have shown could never be limited by arithmetic instruction throughput, and therefore could never expose the arithmetic instruction throughput differences between single and double precision on your Pascal GPU.
It is much more likely that memory bandwidth is the performance limit of that code (it requires two loads and a store per FLOP). The reason you see a ratio of around two between single and double precision is most likely just the ratio of the sizes of the two types and nothing more.
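As a rough sanity check (treating the GTX 1080's theoretical peak of about 320 GB/s as a ballpark figure only): each element costs two loads and one store, i.e. 12 bytes per add in single precision and 24 bytes in double precision. With N = 10^8, that is roughly 1.2 GB or 2.4 GB of traffic per launch, giving bandwidth-limited launch times on the order of 3.75 ms and 7.5 ms respectively: a factor of exactly two, regardless of arithmetic throughput.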