▶ Quicksort with CUDA Dynamic Parallelism (recursive kernel launches from a single thread)
▶ Source code: recursively launching thread blocks with dynamic parallelism. Key points: compile with the -rdc=true option (generates relocatable device code, i.e. separate compilation) and link against cudadevrt.lib (the device runtime library required for dynamic parallelism, distinct from the host runtime library cudart.lib); a sample build command follows.
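● For reference, a command-line build might look like the sketch below; the source file name cdp_print.cu and the samples' common/inc include path are assumptions, while -rdc=true and -lcudadevrt are exactly the options the note above refers to.
nvcc -arch=sm_35 -rdc=true -I./common/inc cdp_print.cu -o cdp_print -lcudadevrt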
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_string.h>

__device__ int g_blockId = 0; // global block counter, read and written by every block

__device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId) // print the current block's info: depth, block id, and the parent thread/block that launched it
{
if (threadIdx.x == 0)
{
if (depth == 0)
printf("BLOCK %d launched by the host\n", blockId);
else
{
char buffer[32];
for (int i = 0; i < depth; ++i) // each nesting level adds one more "+ " prefix
{
buffer[3 * i + 0] = '+';
buffer[3 * i + 1] = ' ';
buffer[3 * i + 2] = ' ';
}
buffer[3 * depth] = '\0';
printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, blockId, parent_threadId, parent_blockId);
}
}
__syncthreads();
}

__global__ void cdp_kernel(int max_depth, int depth, int parent_threadId, int parent_blockId) // kernel that recursively launches itself
{
__shared__ int s_blockId; // id of the current block
if (threadIdx.x == 0) // thread 0 reads the current g_blockId into s_blockId and increments g_blockId
s_blockId = atomicAdd(&g_blockId, 1);
__syncthreads();
print_info(depth, s_blockId, parent_threadId, parent_blockId); // print the current block's info
if (++depth >= max_depth) // stop once the maximum recursion depth is reached, otherwise launch cdp_kernel() again
return;
cdp_kernel<<<gridDim.x, blockDim.x>>>(max_depth, depth, threadIdx.x, s_blockId);
}

int main(int argc, char **argv)
{
printf("CUDA Dynamic Parallelism\n");
int max_depth = 3; // default recursion depth
int device_count = 0, device = -1;
if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h")) // help mode
{
printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[]);
exit(EXIT_SUCCESS);
}
if (checkCmdLineFlag(argc, (const char **)argv, "depth")) // recursion depth given on the command line
{
max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
if (max_depth < 1 || max_depth > 8)
{
printf("depth parameter has to be between 1 and 8\n");
exit(EXIT_FAILURE);
}
}
if (checkCmdLineFlag(argc, (const char **)argv, "device")) // device given on the command line
{
device = getCmdLineArgumentInt(argc, (const char **)argv, "device");
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, device);
if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
printf("Running on GPU %d (%s)\n", device, properties.name);
else
{
printf("ERROR: required GPU with compute SM 3.5 or higher.\nCurrent GPU compute SM %d.%d\n", properties.major, properties.minor);
exit(EXIT_FAILURE);
}
}
else
{
cudaGetDeviceCount(&device_count);
for (int i = 0; i < device_count; ++i)
{
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);
if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5))
{
device = i;
printf("Running on GPU %d (%s)", i, properties.name);
break;
}
printf("Running on GPU %d (%s) does not support CUDA Dynamic Parallelism", i, properties.name);
}
}
if (device == -1)
{
printf("required GPU with compute SM 3.5 or higher.");
exit(EXIT_WAIVED);
}
cudaSetDevice(device);
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth);
printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n");
cdp_kernel<<<2, 2>>>(max_depth, 0, 0, -1);
cudaGetLastError();
cudaDeviceSynchronize();
getchar();
exit(EXIT_SUCCESS);
}
● Output: the host launches 2 thread blocks of 2 threads each, and every thread recursively launches a child grid of the same shape, giving 2*4 second-level blocks and 2*4*4 third-level blocks: 2 + 8 + 32 = 42 thread blocks in total (block ids are handed out with atomicAdd, so the exact numbering varies from run to run)
CUDA Dynamic Parallelism
Running on GPU <id> (GeForce GTX ...)
Launching cdp_kernel() with CUDA Dynamic Parallelism:

BLOCK <id> launched by the host                        (2 lines, depth 0)
+ BLOCK <id> launched by thread <t> of block <b>       (8 lines, depth 1)
+ + BLOCK <id> launched by thread <t> of block <b>     (32 lines, depth 2)
▶ Takeaways:
● A kernel can launch kernels recursively from device code; note the launch syntax used inside the kernel (a minimal sketch follows).
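● A minimal sketch of the device-side launch pattern, with error checking through cudaGetLastError(), which the device runtime also provides; the kernel names and launch shapes here are illustrative, not from the sample above.
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void child_kernel(int level)
{
    printf("child at level %d, thread %d\n", level, threadIdx.x);
}

__global__ void parent_kernel(int level)
{
    if (threadIdx.x == 0) // only one thread of the parent block launches the child grid
    {
        child_kernel<<<1, 4>>>(level + 1);
        cudaError_t err = cudaGetLastError(); // device-side launch errors are reported to the launching thread
        if (err != cudaSuccess)
            printf("device-side launch failed: %d\n", (int)err);
    }
}

int main(void)
{
    parent_kernel<<<1, 2>>>(0); // requires compute capability 3.5+, -rdc=true and cudadevrt
    cudaDeviceSynchronize();
    return 0;
}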
▶ Source code: quicksort implemented with dynamic parallelism; the only console output is Finish!
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_string.h>

#define MAX_DEPTH 16
#define INSERTION_SORT 32

__device__ void selection_sort(unsigned int *data, int left, int right) // selection sort, done by a single thread
{
for (int i = left; i <= right; ++i)
{
unsigned min_val = data[i];
int min_idx = i;
for (int j = i + 1; j <= right; ++j) // find the minimum element and its index
{
unsigned val_j = data[j];
if (val_j < min_val)
{
min_idx = j;
min_val = val_j;
}
}
if (i != min_idx) // swap the minimum into position i
{
data[min_idx] = data[i];
data[i] = min_val;
}
}
}

__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth) // quicksort body, with recursive kernel launches inside
{
if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT) // fall back to selection sort once the recursion depth reaches MAX_DEPTH or the subarray is no longer than INSERTION_SORT
{
selection_sort(data, left, right);
return;
}
unsigned int *lptr = data + left, *rptr = data + right, pivot = data[(left + right) / 2];
while (lptr <= rptr)
{
unsigned int lval = *lptr, rval = *rptr; // values at the left and right pointers
while (lval < pivot) // advance the left pointer, equivalent to lptr++; lval = *lptr;
lval = *(++lptr);
while (rval > pivot) // move the right pointer to the left
rval = *(--rptr);
if (lptr <= rptr) // swap the values at the two pointers
{
*lptr++ = rval;
*rptr-- = lval;
}
}
if (left < rptr - data) // launch the left and right partitions in two different streams
{
cudaStream_t s0;
cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking); // this stream does not synchronize with the NULL (0) stream
cdp_simple_quicksort<<<1, 1, 0, s0>>>(data, left, rptr - data, depth + 1);
cudaStreamDestroy(s0);
}
if (lptr - data < right)
{
cudaStream_t s1;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cdp_simple_quicksort<<<1, 1, 0, s1>>>(data, lptr - data, right, depth + 1);
cudaStreamDestroy(s1);
}
}

void run_qsort(unsigned int *data, unsigned int n) // host-side entry point for the quicksort
{
cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH); // set the maximum nested synchronization depth
cdp_simple_quicksort<<<1, 1>>>(data, 0, n - 1, 0);
cudaDeviceSynchronize();
}

int main(int argc, char **argv)
{
cudaSetDevice(0);
const int n = 128; // number of elements to sort (the CUDA sample's default)
unsigned int *h_data = (unsigned int *)malloc(sizeof(unsigned int) * n);
srand(2047); // fixed seed, as in the CUDA sample
for (unsigned i = 0; i < n; i++)
h_data[i] = rand() % n;
unsigned int *d_data;
cudaMalloc((void **)&d_data, n * sizeof(unsigned int));
cudaMemcpy(d_data, h_data, n * sizeof(unsigned int), cudaMemcpyHostToDevice);
run_qsort(d_data, n);
cudaMemcpy(h_data, d_data, n * sizeof(unsigned), cudaMemcpyDeviceToHost);
for (int i = 1; i < n; ++i) // check that the result is sorted
{
if (h_data[i - 1] > h_data[i])
{
printf("Error at i == %d, h_data[i-1] == %d, h_data[i] == %d\n", h_data[i - ], h_data[i]);
break;
}
}
printf("Finish!\n"); free(h_data);
cudaFree(d_data);
getchar();
exit(EXIT_SUCCESS);
}
▶ Takeaways:
● checkCmdLineFlag checks whether any argument in argv matches the string string_ref (defined in helper_string.h)
inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
{
bool bFound = false;
if (argc >= 1)
{
for (int i = 1; i < argc; i++)
{
int string_start = stringRemoveDelimiter('-', argv[i]);
const char *string_argv = &argv[i][string_start];
const char *equal_pos = strchr(string_argv, '=');
int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
int length = (int)strlen(string_ref);
if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
{
bFound = true;
continue;
}
}
}
return bFound;
}

inline int stringRemoveDelimiter(char delimiter, const char *string) // strips a given leading character; used above to strip the - or -- in front of an argument
{
int string_start = 0;
while (string[string_start] == delimiter)
string_start++;
if (string_start >= (int)strlen(string) - 1)
return 0;
return string_start;
}

#define STRNCASECMP _strnicmp // case-insensitive comparison of at most _MaxCount characters (declared in string.h on MSVC)
_ACRTIMP int __cdecl _strnicmp(_In_reads_or_z_(_MaxCount) char const* _String1, _In_reads_or_z_(_MaxCount) char const* _String2, _In_ size_t _MaxCount);
● getCmdLineArgumentInt extracts the integer value attached to an argument in argv (defined in helper_string.h); a usage sketch covering both helpers follows the code
inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
{
bool bFound = false;
int value = -1;
if (argc >= 1)
{
for (int i = 1; i < argc; i++)
{
int string_start = stringRemoveDelimiter('-', argv[i]);
const char *string_argv = &argv[i][string_start];
int length = (int)strlen(string_ref);
if (!STRNCASECMP(string_argv, string_ref, length))
{
if (length + 1 <= (int)strlen(string_argv))
{
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
value = atoi(&string_argv[length + auto_inc]);
}
else
value = 0;
bFound = true;
continue;
}
}
}
if (bFound)
return value;
return 0;
}
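● A small host-only usage sketch of the two helpers, under the assumption that helper_string.h is on the include path; the program name and the -depth=4 argument are made up for illustration.
#include <stdio.h>
#include <helper_string.h> // provides checkCmdLineFlag and getCmdLineArgumentInt

int main(int argc, char **argv)
{
    // e.g. invoked as:  cdp_print.exe -depth=4
    if (checkCmdLineFlag(argc, (const char **)argv, "depth"))
    {
        int depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth");
        printf("depth flag present, value = %d\n", depth); // prints 4 for "-depth=4"
    }
    else
        printf("no depth flag, using the default\n");
    return 0;
}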
● Specifying the maximum recursion depth (more precisely, the maximum nesting depth at which device-side synchronization is still possible); a short sketch follows the prototype
extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
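● A minimal host-side sketch; the depth value 8 is just an illustration.
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    // allow device-side synchronization across up to 8 nested launch levels
    cudaError_t err = cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 8);
    if (err != cudaSuccess)
        printf("cudaDeviceSetLimit failed: %s\n", cudaGetErrorString(err));

    size_t depth = 0;
    cudaDeviceGetLimit(&depth, cudaLimitDevRuntimeSyncDepth); // read the limit back
    printf("cudaLimitDevRuntimeSyncDepth = %u\n", (unsigned)depth);
    return 0;
}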
● cudaStreamCreateWithFlags creates a stream with the given behavior flags; cudaStreamNonBlocking makes the new stream independent of the NULL stream (stream priorities are set with cudaStreamCreateWithPriority instead). A short comparison sketch follows.
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
■ Compare with cudaStreamCreate
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
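● A minimal host-side sketch contrasting the two creation calls; the kernel name dummy_kernel is a placeholder.
#include <cuda_runtime.h>

__global__ void dummy_kernel() {}

int main(void)
{
    cudaStream_t s_blocking, s_nonblocking;
    cudaStreamCreate(&s_blocking);                                    // default flags: implicitly ordered with the NULL stream
    cudaStreamCreateWithFlags(&s_nonblocking, cudaStreamNonBlocking); // does not synchronize with the NULL stream

    dummy_kernel<<<1, 1>>>();                   // NULL stream
    dummy_kernel<<<1, 1, 0, s_blocking>>>();    // ordered after the NULL-stream launch
    dummy_kernel<<<1, 1, 0, s_nonblocking>>>(); // may overlap with NULL-stream work

    cudaDeviceSynchronize();
    cudaStreamDestroy(s_blocking);
    cudaStreamDestroy(s_nonblocking);
    return 0;
}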