I'm developing a genetic cellular automata using PyCuda. Each cell will have a lot of genome data, along with cell parameters. I'm wondering what could be a most efficient way to 1) pass cells data to a CUDA kernel, then 2) to process this data.
我正在开发一种利用PyCuda的基因细胞自动机。每个细胞都会有很多基因组数据,以及细胞参数。我在想什么是最有效的方法1)将单元数据传递给CUDA内核,然后2)处理这些数据。
I began with one particularly bad (imo), yet still working solution. It was passing each parameter in a separate array, then process them with a switch-case and a lot of duplicate code.
我一开始就有一个特别糟糕的问题(在我看来),但仍然是可行的解决方案。它在一个单独的数组中传递每个参数,然后用一个开关盒和许多重复的代码处理它们。
Then, realized that I could quickly end up with pretty large number of parameters per kernel function, and decide to rewrite it.
然后,我意识到我可能很快就会得到大量的每个内核函数的参数,并决定重写它。
Second solution was to store all bunch of cell's parameters in a single array with extra dimension. That was much more elegant in code, but surprisingly the code runs 10x slower!
第二种解决方案是将所有单元格参数存储在一个具有额外维度的数组中。这在代码中要优雅得多,但令人惊讶的是代码运行得慢了10x !
To make it more clear, the full list of data I need to be stored per cell:
为了更清楚地说明这一点,我需要在每个单元中存储完整的数据列表:
- (Fc, Mc, Tc): 3x (int) - the cell's current 'flavor', mass and temperature
- (Fc, Mc, Tc): 3x (int)——细胞当前的“味道”,质量和温度
- (Rfc, Rmc, Rtc): 3x (int) - the cell's current registers
- (Rfc, Rmc, Rtc): 3x (int)——单元的当前寄存器
- (Fi, Mi, Ti) for each neighbour: 8*3x (int) - incoming values
- (Fi, Mi, Ti)每个邻居:8*3x (int) -输入值。
- (Rfi, Rmi, Rti) for each neighbour: 8*3x (int) - incoming values
- (Rfi, Rmi, Rti)对于每个邻居:8*3x (int) -传入值
- gate orientation: 1x (uchar)
- 门方向:1 x(uchar)
- execution pointer: 1x (uchar)
- 执行指针:1 x(uchar)
- current micro-operations memory: 32x (uchar)
- 当前微操作内存:32x (uchar)
- last step's micro-operations memory: 32x (uchar)
- 最后一步的微操作内存:32x (uchar)
I'm splitting an automata step in 2 phases. First (emit phase) is calculating (Fi, Mi, Ti) for each cell neighbours. Second (absorb phase) is blending 8x(Fi, Mi, Ti) values with current cells' states. No genome or registers implemented yet, but I need its data to be passed for future.
我将一个自动机步骤分成两个阶段。首先(发射相位)是计算(Fi, Mi, Ti)每个单元的邻居。第二(吸收阶段)是将8x(Fi, Mi, Ti)值与当前单元的状态混合。还没有实现基因组或寄存器,但我需要它的数据为未来传递。
So, the code for my first solution was:
因此,我的第一个解决方案的代码是:
Mk = 64
Tk = 1000
emit_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h", """
int x = i / h;
int y = i % h;
int3 cell = cells[i];
float M = (float) cell.y;
float T = (float) cell.z;
int Mi = (int) (fmin(1, T / Tk) * M);
cells[i].y -= Mi;
cells[i].z -= (int) (T * fmin(1, T / Tk) / 1);
int Fi = cell.x;
int Mbase = Mi / 8;
int Mpart = Mi % 8;
int Madd;
int Ti = cell.z;
int ii, xo, yo;
for (int cc = 0; cc < 9; cc++) {
int c = (cc + Fi) % 9;
if (c == 4) continue;
xo = x + c%3 - 1;
if (xo < 0) xo = w + xo;
if (xo >= w) xo = xo - w;
yo = y + c/3 - 1;
if (yo < 0) yo = h + yo;
if (xo >= w) yo = yo - h;
ii = xo * h + yo;
if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
switch(c) {
case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
default: break;
}
}
""", "ca_prepare", preamble="""
#define Tk %s
""" % Tk)
absorb_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h", """
int3 cell = cells[i];
int3 dcell = dcells0[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
dcell = dcells1[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells2[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells3[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells4[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells5[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells6[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
dcell = dcells7[i];
cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
cell.x = cell.x % 360;
if (cell.x < 0) cell.x += 360;
if (cell.z > Tk) cell.z = Tk;
cells[i] = cell;
img[i] = hsv2rgb(cell);
""", "ca_calc", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
__device__ int3 operator+(const int3 &a, const int3 &b) {
return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}
__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
int dF = Fi - Fc;
if (dF > 180) Fc += 360;
if (dF < -180) Fc -= 360;
float sM = Mi + Mc;
if (sM != 0) sM = Mi / sM; else sM = 0;
dF = (int) (Fi - Fc) * sM;
int dM = Mi;
int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
return make_int3(dF, dM, dT);
}
__device__ uint hsv2rgb(int3 pixel) {
// skipped for brevity
}
""" % (Mk, Tk, RAM))
The second and current solution:
第二种也是目前的解决方案:
Mk = 64
Tk = 1000
CELL_LEN = 120 # number of parameters per cell
emit_gpu = ElementwiseKernel("int *cells, int w, int h", """
int x = i / h;
int y = i % h;
int ii = i * CN;
int Fc = cells[ii];
int Mc = cells[ii+1];
int Tc = cells[ii+2];
float M = (float) Mc;
float T = (float) Tc;
int Mi = (int) (fmin(1, T / Tk) * M);
cells[ii+1] = Mc - Mi;
cells[ii+2] = Tc - (int) (T * fmin(1, T / Tk));
int Mbase = Mi / 8;
int Mpart = Mi % 8;
int Madd;
int iii, xo, yo;
for (int cc = 0; cc < 9; cc++) {
int c = (cc + Fc) % 9;
if (c == 4) continue;
xo = x + c%3 - 1;
if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
yo = y + c/3 - 1;
if (yo < 0) yo = h + yo; else if (xo >= w) yo = yo - h;
if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
if (c > 4) c--;
iii = (xo * h + yo) * CN + 6 + c*3;
cells[iii] = Fc;
cells[iii+1] = Mbase + Madd;
cells[iii+2] = Tc;
}
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))
absorb_gpu = ElementwiseKernel("int *cells, int *img, int w, int h", """
int ii = i * CN;
int Fc = cells[ii];
int Mc = cells[ii+1];
int Tc = cells[ii+2];
for (int c=0; c < 8; c++){
int iii = ii + c * 3 + 6;
int Fi = cells[iii];
int Mi = cells[iii+1];
int Ti = cells[iii+2];
int dF = Fi - Fc;
if (dF > 180) Fc += 360;
if (dF < -180) Fc -= 360;
float sM = Mi + Mc;
if (sM != 0) sM = Mi / sM; else sM = 0;
dF = (int) (Fi - Fc) * sM;
int dM = Mi;
int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
Fc += dF;
Mc += dM;
Tc += dT;
Fc = Fc % 360;
if (Fc < 0) Fc += 360;
if (Tc > Tk) Tc = Tk;
}
cells[ii] = Fc;
cells[ii+1] = Mc;
cells[ii+2] = Tc;
cells[ii+18] = (cells[ii+18] + 1) % 8;
img[i] = hsv2rgb(Fc, Tc, Mc);
""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
#define CN %s
__device__ uint hsv2rgb(int hue, int sat, int val) {
// skipped for brevity
}
""" % (Mk, Tk, CELL_LEN))
Both variants produce exactly the same CA behaviour, but latter is running much slower.
这两种变体产生的CA行为完全相同,但后者的运行速度要慢得多。
GTX Titan:
泰坦GTX公司:
- Field size: 1900x1080 cells
- 字段大小:1900 x1080细胞
- Solution #1: ~200 steps/s
- 解决方案1:~ 200步/ s
- Solution #2: ~20 steps/s
- 解决方案2:~ 20个步骤/ s
GT 630M:
GT 630:
- Field size: 1600x900 cells
- 字段大小:1600 x900分辨率下玩细胞
- Solution #1: ~7.8 steps/s
- 解决方案1:~ 7.8步骤/ s
- Solution #2: ~1.5 steps/s
- 解决方案2:~ 1.5步骤/ s
Please feel free to play with both solutions' if you need:
如果需要,请随意使用这两种解决方案:
解决# 1完整的源代码
解决# 2完整的源代码
Any clues or advises are welcome:
任何线索或建议都是受欢迎的:
- Why the performance is slowed down?
- 为什么性能降低了?
- Is it possible to raise the performance of solution #2 at least to the level of #1?
- 是否有可能将解决方案2的性能至少提高到第1级?
- Or another solution would be better?
- 或者另一个解决方案更好?
1 个解决方案
#1
3
OK, I managed how to run second solution almost 15x faster. Following changes were made:
好的,我设法让第二种解决方案运行得快了15倍。以下变化:
- Convert main parameters array from
int
toint4
. This made it even faster than solution withint3
. Although, extra space left unused (.w dimension). [3x speedup] - 将主要参数数组从int转换为int4。这使得它比使用int3的解决方案更快。尽管,额外的空间未被使用(。w维度)。(3 x加速)
- Repack related parameters in WIDTHxHEIGHT groups. So, shape changed from (WIDTH, HEIGHT, N) to (N, WIDTH, HEIGHT). This made memory access more efficient, since elements inside groups tends to be processed together. [5x speedup]
- 重新打包WIDTHxHEIGHT组中的相关参数。因此,形状从(宽度,高度,N)变为(N,宽度,高度)这使得内存访问更有效,因为组内的元素通常是一起处理的。(5 x加速)
The optimized code looks like:
优化后的代码如下:
Mk = 64
Tk = 1000
emit_gpu = ElementwiseKernel("int4 *cells, int w, int h, int cn", """
int x = i / h;
int y = i % h;
int4 cell = cells[i];
int Fc = cell.x;
int Mc = cell.y;
int Tc = cell.z;
float M = (float) Mc;
float T = (float) Tc;
int Mi = (int) (fmin(1, T / Tk) * M);
cells[i] = make_int4(Fc, Mc - Mi, Tc - (int) (T * fmin(1, T / Tk)), 0);
int Mbase = Mi / 8;
int Mpart = Mi % 8;
int Madd;
int ii;
int xo, yo;
int cnn = 0;
for (int dx = -1; dx < 2; dx++) {
xo = x + dx;
if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
for (int dy = -1; dy < 2; dy++) {
if (dx == 0 && dy == 0) continue;
cnn += cn;
yo = y + dy;
if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
ii = (xo * h + yo) + cnn;
cells[ii] = make_int4(Fc, Mbase + Madd, Tc, 0);
}
}
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))
absorb_gpu = ElementwiseKernel("int4 *cells, int *img, int w, int h, int cn", """
int ii = i;
int4 cell = cells[i];
int Fc = cell.x;
int Mc = cell.y;
int Tc = cell.z;
for (int c=0; c < 8; c++){
ii += cn;
cell = cells[ii];
int Fi = cell.x;
int Mi = cell.y;
int Ti = cell.z;
int dF = Fi - Fc;
if (dF > 180) Fc += 360;
if (dF < -180) Fc -= 360;
float sM = Mi + Mc;
if (sM != 0) sM = Mi / sM; else sM = 0;
dF = (int) (Fi - Fc) * sM;
int dM = Mi;
int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
Fc += dF;
Mc += dM;
Tc += dT;
Fc = Fc % 360;
if (Fc < 0) Fc += 360;
if (Tc > Tk) Tc = Tk;
}
cells[i] = make_int4(Fc, Mc, Tc, 0);
img[i] = hsv2rgb(Fc, Tc, Mc);
""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
__device__ uint hsv2rgb(int hue, int sat, int val) {
// skipped for brevity
}
""" % (Mk, Tk))
Thanks to Park Young-Bae for clues on repacking and also to Alexey Shchepin for some optimization issues.
感谢Park Young-Bae提供重新包装的线索,以及Alexey Shchepin提供一些优化问题。
#1
3
OK, I managed how to run second solution almost 15x faster. Following changes were made:
好的,我设法让第二种解决方案运行得快了15倍。以下变化:
- Convert main parameters array from
int
toint4
. This made it even faster than solution withint3
. Although, extra space left unused (.w dimension). [3x speedup] - 将主要参数数组从int转换为int4。这使得它比使用int3的解决方案更快。尽管,额外的空间未被使用(。w维度)。(3 x加速)
- Repack related parameters in WIDTHxHEIGHT groups. So, shape changed from (WIDTH, HEIGHT, N) to (N, WIDTH, HEIGHT). This made memory access more efficient, since elements inside groups tends to be processed together. [5x speedup]
- 重新打包WIDTHxHEIGHT组中的相关参数。因此,形状从(宽度,高度,N)变为(N,宽度,高度)这使得内存访问更有效,因为组内的元素通常是一起处理的。(5 x加速)
The optimized code looks like:
优化后的代码如下:
Mk = 64
Tk = 1000
emit_gpu = ElementwiseKernel("int4 *cells, int w, int h, int cn", """
int x = i / h;
int y = i % h;
int4 cell = cells[i];
int Fc = cell.x;
int Mc = cell.y;
int Tc = cell.z;
float M = (float) Mc;
float T = (float) Tc;
int Mi = (int) (fmin(1, T / Tk) * M);
cells[i] = make_int4(Fc, Mc - Mi, Tc - (int) (T * fmin(1, T / Tk)), 0);
int Mbase = Mi / 8;
int Mpart = Mi % 8;
int Madd;
int ii;
int xo, yo;
int cnn = 0;
for (int dx = -1; dx < 2; dx++) {
xo = x + dx;
if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
for (int dy = -1; dy < 2; dy++) {
if (dx == 0 && dy == 0) continue;
cnn += cn;
yo = y + dy;
if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
ii = (xo * h + yo) + cnn;
cells[ii] = make_int4(Fc, Mbase + Madd, Tc, 0);
}
}
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))
absorb_gpu = ElementwiseKernel("int4 *cells, int *img, int w, int h, int cn", """
int ii = i;
int4 cell = cells[i];
int Fc = cell.x;
int Mc = cell.y;
int Tc = cell.z;
for (int c=0; c < 8; c++){
ii += cn;
cell = cells[ii];
int Fi = cell.x;
int Mi = cell.y;
int Ti = cell.z;
int dF = Fi - Fc;
if (dF > 180) Fc += 360;
if (dF < -180) Fc -= 360;
float sM = Mi + Mc;
if (sM != 0) sM = Mi / sM; else sM = 0;
dF = (int) (Fi - Fc) * sM;
int dM = Mi;
int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
Fc += dF;
Mc += dM;
Tc += dT;
Fc = Fc % 360;
if (Fc < 0) Fc += 360;
if (Tc > Tk) Tc = Tk;
}
cells[i] = make_int4(Fc, Mc, Tc, 0);
img[i] = hsv2rgb(Fc, Tc, Mc);
""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
__device__ uint hsv2rgb(int hue, int sat, int val) {
// skipped for brevity
}
""" % (Mk, Tk))
Thanks to Park Young-Bae for clues on repacking and also to Alexey Shchepin for some optimization issues.
感谢Park Young-Bae提供重新包装的线索,以及Alexey Shchepin提供一些优化问题。