-
cudaMalloc((void**) &gpudata, sizeof(long)* TEST); -
cudaMemcpy(gpudata, data, sizeof(long) * TEST,cudaMemcpyHostToDevice);
pitch的理解:
C语言申请2维内存时,一般是连续存放的。a[y][x]存放在第y*widthofx*sizeof(元素)+x*sizeof(元素)个字节。但在cuda的global memory访问中,从256字节对齐的地址(addr=0, 256, 512, ...)开始的连续访问是最有效率的。这样,为了提高内存访问的效率,有了cudaMallocPitch函数。cudaMallocPitch函数分配的内存中,数组的每一行的第一个元素的开始地址都保证是对齐的。因为每行有多少个数据是不确定的,widthofx*sizeof(元素)不一定是256的倍数。故此,为保证数组的每一行的第一个元素的开始地址对齐,cudaMallocPitch在分配内存时,每行会多分配一些字节,以保证widthofx*sizeof(元素)+多分配的字节是256的倍数(对齐)。这样,用y*widthofx*sizeof(元素)+x*sizeof(元素)来计算a[y][x]的地址就不正确了,而应该是y*[widthofx*sizeof(元素)+多分配的字节]+x*sizeof(元素)。而函数中返回的pitch的值就是widthofx*sizeof(元素)+多分配的字节。
cudaMallocPitch和cudaMemcpy2D两个函数的用法,先看看cuda library中是如何定义这两个函数的:
cudaError_t cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height)
Allocates at least width (in bytes) * height bytes of linear memory
on the device and returns in *devPtr a pointer to the allocated
memory. The function may pad the allocation to ensure that
corresponding pointers in any given row will continue to meet the
alignment requirements for coalescing as the address is updated from
row to row. The pitch returned in *pitch by cudaMallocPitch() is the
width in bytes of the allocation. The intended usage of pitch is as a
separate parameter of the allocation, used to compute addresses
within the 2D array. Given the row and column of an array element of
type T, the address is computed as:
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
For allocations of 2D arrays, it is recommended that programmers
consider performing pitch allocations using cudaMallocPitch().
Due to pitch alignment restrictions in the hardware, this is
especially true if the application will be performing 2D memory
copies between different regions of device memory (whether linear
memory or CUDA arrays).
-
Parameters: -
devPtr
- Pointer to allocated pitched device memory
pitch
- Pitch for allocation
width
- Requested pitched allocation width
height
- Requested pitched allocation height
cudaError_t cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)
Copies a matrix (height rows of width bytes each) from the memory
area pointed to by src to the memory area pointed to by dst, where
kind is one of cudaMemcpyHostToHost, cudaMemcpyHostToDevice,
cudaMemcpyDeviceToHost, or cudaMemcpyDeviceToDevice, and specifies
the direction of the copy. dpitch and spitch are the widths in memory
in bytes of the 2D arrays pointed to by dst and src, including any
padding added to the end of each row. The memory areas may not
overlap. Calling cudaMemcpy2D() with dst and src pointers that do not
match the direction of the copy results in an undefined behavior.
cudaMemcpy2D() returns an error if dpitch or spitch is greater than
the maximum allowed.
-
Parameters: -
dst
- Destination memory address
dpitch
- Pitch of destination memory
src
- Source memory address
spitch
- Pitch of source memory
width
- Width of matrix transfer (columns in bytes)
height
- Height of matrix transfer (rows)
kind
- Type of transfer
由此,可以对这两个函数有个充分的认识。此外,cudaMallocPitch和cudaMemcpy2D,一般用于二维数组各维度size不是2的幂次方的问题。使用cudaMallocPitch()那么该数组的对齐、大小、起始址等就自动做好了,其返回的pitch就是真正分配给数组每一行的字节数(往往大于其真正申请的每行宽度)。
&fea_pitch, sizeof(unsigned char) * sfeaturesw,
sfeaturesh);
cudaCreateChannelDesc<unsigned
char>();
sizeof(unsigned char) * sfeaturesw, sizeof(unsigned char) *
sfeaturesw, sfeaturesh, cudaMemcpyHostToDevice);
sfeaturesw, sfeaturesh, fea_pitch);
--------------------------------------------------------------------------------
sfeaturesh;
cudaCreateChannelDesc<unsigned
char>();
&chDesc2, sfeaturesw, sfeaturesh);
sfeatures_size, cudaMemcpyHostToDevice );
-------------------------------------------------------------------------------------
cudaMalloc((void**)&dev_grid,grid_data_size);
cudaMemcpy(dev_grid,sgrid,grid_data_size,cudaMemcpyHostToDevice);
----------------------------------------------------------------------------------
cudaError_t cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind)