The question is: is there a way to use the class `std::vector` in CUDA kernels? When I try, I get the following error:
error : calling a host function("std::vector<int, std::allocator<int> > ::push_back") from a __device__/__global__ function not allowed
So is there a way to use a vector in a `__global__` function? I recently tried the following:
- create a new Cuda project
- go to properties of the project
- open Cuda C/C++
- go to Device
- change the value of "Code Generation" to: compute_20,sm_20
打开Cuda C / C ++
........ after that I was able to use the printf standard library function in my Cuda kernel.
Is there a way to use the standard library class `vector` in kernel code, the same way `printf` is supported? This is an example of using `printf` in kernel code:
// Counts the 3s in an array using CUDA.
//
// Each thread scans its own contiguous slice of `length` elements of `a`,
// starting at index threadIdx.x * length, and increments its own slot
// private_count[threadIdx.x] once per element equal to 3. Only threadIdx.x is
// used for indexing (blockIdx is not consulted).
// NOTE(review): private_count is only incremented here, so presumably the
// caller zero-initializes it before the launch -- confirm.
__global__ void countKernel(int *a, int length, int* private_count)
{
    const unsigned int tid = threadIdx.x;

    printf("%d\n",threadIdx.x);  // prints the thread id; this works in device code

    // The question being asked: is something like the following possible?
    //   vector<int> y;
    //   y.push_back(0);

    // Bounds of this thread's slice; arithmetic is done in unsigned int.
    const unsigned int first = tid * length;
    const unsigned int last = first + length;

    int idx = first;
    while (idx < last) {
        if (a[idx] == 3) {
            private_count[tid]++;
            printf("%d ",a[idx]);
        }
        ++idx;
    }
}
4 个解决方案
You can't use the STL in CUDA, but you may be able to use the Thrust library to do what you want. Otherwise just copy the contents of the vector to the device and operate on it normally.
In the CUDA library Thrust, you can use `thrust::device_vector<T>` to define a vector on the device, and the data transfer between a host STL vector and a device vector is very straightforward. You can refer to the Thrust documentation (the original link is missing here) to find some useful examples.
在cuda库推进中,您可以使用thrust :: device_vector
You can't use `std::vector` in device code; you should use an array instead.
你不能在设备代码中使用std :: vector,你应该使用数组。
I think you can implement a device vector yourself, because CUDA supports dynamic memory allocation in device code. The operators new/delete are also supported. Here is an extremely simple prototype of a device vector in CUDA; it does work, but it hasn't been tested sufficiently.
/**
 * Minimal growable array usable from CUDA device code.
 *
 * Storage is obtained with device-side `new[]` and released with `delete[]`
 * in the destructor. Every member function is `__device__`-only. T must be
 * default-constructible and copy-assignable, because slots are created by
 * `new T[capacity]` and filled by assignment.
 *
 * NOTE(review): no copy constructor or copy assignment is defined, so a copy
 * of a LocalVector shares the same buffer and both destructors would
 * delete[] it. Pass instances by reference/pointer only.
 */
template <typename T>
class LocalVector {
private:
    T* m_begin;       // start of the allocated buffer
    T* m_end;         // one past the last stored element
    size_t capacity;  // number of T slots currently allocated
    size_t length;    // number of stored elements (kept equal to m_end - m_begin)

    /** Doubles the buffer and carries the stored elements over. */
    __device__ void expand() {
        const size_t newCapacity = capacity * 2;
        T* newBegin = new T[newCapacity];
        // Element-wise assignment instead of memcpy so non-trivially-copyable
        // element types are copied correctly.
        for (size_t i = 0; i < length; ++i) {
            newBegin[i] = m_begin[i];
        }
        delete[] m_begin;
        m_begin = newBegin;
        m_end = m_begin + length;
        capacity = newCapacity;
    }

public:
    /** Creates an empty vector with room for 16 elements. */
    __device__ explicit LocalVector()
        // Initializers are listed in member declaration order.
        : m_begin(new T[16]), m_end(m_begin), capacity(16), length(0) {}

    /** Unchecked element access; `index` must be less than getSize(). */
    __device__ T& operator[] (unsigned int index) {
        return m_begin[index];
    }

    /** Pointer to the first stored element. */
    __device__ T* begin() {
        return m_begin;
    }

    /** Pointer one past the last stored element. */
    __device__ T* end() {
        return m_end;
    }

    /** Releases the buffer. */
    __device__ ~LocalVector() {
        delete[] m_begin;
        m_begin = nullptr;
        m_end = nullptr;
    }

    /** Appends a copy of `t`, growing the buffer when it is full. */
    __device__ void add(T t) {
        if (length >= capacity) {
            expand();
        }
        // The slot was already default-constructed by new T[], so assign to it
        // rather than constructing a second object on top of it.
        *m_end = t;
        ++m_end;
        ++length;
    }

    /**
     * Removes the last element and returns a copy of it.
     * Returns a default-constructed T when the vector is empty.
     */
    __device__ T pop() {
        if (length == 0) {
            return T();
        }
        // Step back first: m_end points one past the last element. The slot
        // stays owned by the buffer, so nothing is deleted here.
        --m_end;
        --length;
        return *m_end;
    }

    /** Number of stored elements. */
    __device__ size_t getSize() const {
        return length;
    }
};
You can't use the STL in CUDA, but you may be able to use the Thrust library to do what you want. Otherwise just copy the contents of the vector to the device and operate on it normally.
In the CUDA library Thrust, you can use `thrust::device_vector<T>` to define a vector on the device, and the data transfer between a host STL vector and a device vector is very straightforward. You can refer to the Thrust documentation (the original link is missing here) to find some useful examples.
在cuda库推进中,您可以使用thrust :: device_vector
You can't use `std::vector` in device code; you should use an array instead.
你不能在设备代码中使用std :: vector,你应该使用数组。
I think you can implement a device vector yourself, because CUDA supports dynamic memory allocation in device code. The operators new/delete are also supported. Here is an extremely simple prototype of a device vector in CUDA; it does work, but it hasn't been tested sufficiently.
/**
 * Minimal growable array usable from CUDA device code.
 *
 * Storage is obtained with device-side `new[]` and released with `delete[]`
 * in the destructor. Every member function is `__device__`-only. T must be
 * default-constructible and copy-assignable, because slots are created by
 * `new T[capacity]` and filled by assignment.
 *
 * NOTE(review): no copy constructor or copy assignment is defined, so a copy
 * of a LocalVector shares the same buffer and both destructors would
 * delete[] it. Pass instances by reference/pointer only.
 */
template <typename T>
class LocalVector {
private:
    T* m_begin;       // start of the allocated buffer
    T* m_end;         // one past the last stored element
    size_t capacity;  // number of T slots currently allocated
    size_t length;    // number of stored elements (kept equal to m_end - m_begin)

    /** Doubles the buffer and carries the stored elements over. */
    __device__ void expand() {
        const size_t newCapacity = capacity * 2;
        T* newBegin = new T[newCapacity];
        // Element-wise assignment instead of memcpy so non-trivially-copyable
        // element types are copied correctly.
        for (size_t i = 0; i < length; ++i) {
            newBegin[i] = m_begin[i];
        }
        delete[] m_begin;
        m_begin = newBegin;
        m_end = m_begin + length;
        capacity = newCapacity;
    }

public:
    /** Creates an empty vector with room for 16 elements. */
    __device__ explicit LocalVector()
        // Initializers are listed in member declaration order.
        : m_begin(new T[16]), m_end(m_begin), capacity(16), length(0) {}

    /** Unchecked element access; `index` must be less than getSize(). */
    __device__ T& operator[] (unsigned int index) {
        return m_begin[index];
    }

    /** Pointer to the first stored element. */
    __device__ T* begin() {
        return m_begin;
    }

    /** Pointer one past the last stored element. */
    __device__ T* end() {
        return m_end;
    }

    /** Releases the buffer. */
    __device__ ~LocalVector() {
        delete[] m_begin;
        m_begin = nullptr;
        m_end = nullptr;
    }

    /** Appends a copy of `t`, growing the buffer when it is full. */
    __device__ void add(T t) {
        if (length >= capacity) {
            expand();
        }
        // The slot was already default-constructed by new T[], so assign to it
        // rather than constructing a second object on top of it.
        *m_end = t;
        ++m_end;
        ++length;
    }

    /**
     * Removes the last element and returns a copy of it.
     * Returns a default-constructed T when the vector is empty.
     */
    __device__ T pop() {
        if (length == 0) {
            return T();
        }
        // Step back first: m_end points one past the last element. The slot
        // stays owned by the buffer, so nothing is deleted here.
        --m_end;
        --length;
        return *m_end;
    }

    /** Number of stored elements. */
    __device__ size_t getSize() const {
        return length;
    }
};