Is it possible to use the scalar values of an input vector to index the output vector? I try to implement the following function in SIMD but I can not find any solution.
是否可以使用输入向量的标量值来索引输出向量?我尝试在SIMD中实现以下功能,但我找不到任何解决方案。
void shuffle(unsigned char * a, // input a
unsigned char * r){ // output r
for (i=0; i < 16; i++)
r[i] = 0;
for (i=0; i < 16; i++)
r[a[i] % 16] = 1;
}
An example input / output vector would look like this
输入/输出示例的示例如下所示
unsigned char * a = {0, 0, 0, 10, 0, 0, 0, 2, 0, 0, 0, 0, 3, 1, 0, 0 };
... do SIMD magic
// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
unsigned char * r = {1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 };
I was not able to find any suitable instruction that can dynamicly address the left side of an assignment. Maybe this function can be implemented by shifting operations? Did anybody implement something similar?
我无法找到任何可以动态地解决赋值左侧的指令。也许这个功能可以通过转移操作来实现?有没有人实现类似的东西?
1 个解决方案
#1
2
It seems that _mm_shuffle_epi8 is indeed the key to a solution. The idea is to set individual bits according to the values of the input vector a. These bits are distributed over (horizontal OR) the bytes of the 128 bits wide register.
似乎_mm_shuffle_epi8确实是解决方案的关键。想法是根据输入向量a的值设置各个位。这些位分布在128位宽寄存器的字节上(水平或)。
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -Wall -mavx test4.c */
/* gcc -O3 -Wall -msse2 -mssse3 -msse4.1 test4.c */
int print_char128(__m128i * x);
int print_char128(__m128i * x){
unsigned char v_x[16];
_mm_storeu_si128((__m128i *)v_x,*x);
printf("%4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u \n",
v_x[0], v_x[1], v_x[2], v_x[3], v_x[4], v_x[5], v_x[6], v_x[7],
v_x[8], v_x[9], v_x[10], v_x[11], v_x[12], v_x[13], v_x[14], v_x[15] );
return 0;
}
int main()
{
unsigned char a_v[] = {0, 0, 0, 10, 0, 0, 0, 2, 0, 0, 0, 0, 3, 1, 0, 0 };
/*unsigned char a_v[] = {13, 30, 0, 10, 0, 6, 0, 2, 0, 0, 7, 0, 3, 11, 0, 0 };*/
__m128i t0, t1, t2, t3;
__m128i a, r, msk0, msk1, msk0_1, zero, bin_ones, one_epi8;
/* set some constants */
unsigned char msk0_v[] ={1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0};
msk0=_mm_loadu_si128((__m128i *)msk0_v);
msk1=_mm_shuffle_epi32(msk0,0b01001110);
msk0_1=_mm_blend_epi16(msk0,msk1,0b11110000);
zero=_mm_setzero_si128();
bin_ones=_mm_cmpeq_epi32(zero,zero);
one_epi8=_mm_sub_epi8(zero,bin_ones);
/* load indices */
a=_mm_loadu_si128((__m128i *)a_v);
/* start of 'SIMD magic' */
/* index a_i sets the a_i -th bit within a byte of t0 if 0<=a_i<8 */
/* or set (a_i-8)-th bit within a byte of t1 if 8<=a_i<16 */
t0=_mm_shuffle_epi8(msk0,a);
t1=_mm_shuffle_epi8(msk1,a);
/* horizontal OR of the bytes in t0 and t1: */
t2=_mm_blend_epi16(t0,t1,0b11110000);
t3=_mm_alignr_epi8(t1,t0,8);
t0=_mm_or_si128(t2,t3);
t1=_mm_shuffle_epi32(t0,0b10110001);
t0=_mm_or_si128(t0,t1);
t1=_mm_slli_si128(t0,2);
t0=_mm_or_si128(t0,t1);
t1=_mm_slli_si128(t0,1);
t0=_mm_or_si128(t0,t1);
t0=_mm_shuffle_epi32(t0,0b11110101); /* end of horizontal OR */
/* filter out the relevant bits */
t0=_mm_and_si128(t0,msk0_1);
t0=_mm_cmpeq_epi8(t0,zero);
r=_mm_andnot_si128(t0,one_epi8); /* the result is in r */
print_char128(&r);
return 0;
}
This should work quite fast: Aside from the instructions for setting the constants and loading the data it is only 15 SSEx instructions. On today's processors these instructions all have a latency of only 1 cycle. The (reciprocal) througput is even smaller: 1/2 or 1/3 cycle. Intrinsic _mm_blend_epi16 is SSE4.1, some others are SSSE3.
这应该非常快:除了设置常量和加载数据的说明之外,它只有15条SSEx指令。在今天的处理器上,这些指令的延迟仅为1个周期。 (倒数)吞吐量甚至更小:1/2或1/3周期。内部_mm_blend_epi16是SSE4.1,其他一些是SSSE3。
#1
2
It seems that _mm_shuffle_epi8 is indeed the key to a solution. The idea is to set individual bits according to the values of the input vector a. These bits are distributed over (horizontal OR) the bytes of the 128 bits wide register.
似乎_mm_shuffle_epi8确实是解决方案的关键。想法是根据输入向量a的值设置各个位。这些位分布在128位宽寄存器的字节上(水平或)。
#include <stdio.h>
#include <immintrin.h>
/* gcc -O3 -Wall -mavx test4.c */
/* gcc -O3 -Wall -msse2 -mssse3 -msse4.1 test4.c */
int print_char128(__m128i * x);
int print_char128(__m128i * x){
unsigned char v_x[16];
_mm_storeu_si128((__m128i *)v_x,*x);
printf("%4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u | %4u %4u %4u %4u \n",
v_x[0], v_x[1], v_x[2], v_x[3], v_x[4], v_x[5], v_x[6], v_x[7],
v_x[8], v_x[9], v_x[10], v_x[11], v_x[12], v_x[13], v_x[14], v_x[15] );
return 0;
}
int main()
{
unsigned char a_v[] = {0, 0, 0, 10, 0, 0, 0, 2, 0, 0, 0, 0, 3, 1, 0, 0 };
/*unsigned char a_v[] = {13, 30, 0, 10, 0, 6, 0, 2, 0, 0, 7, 0, 3, 11, 0, 0 };*/
__m128i t0, t1, t2, t3;
__m128i a, r, msk0, msk1, msk0_1, zero, bin_ones, one_epi8;
/* set some constants */
unsigned char msk0_v[] ={1, 2, 4, 8, 16, 32, 64, 128, 0, 0, 0, 0, 0, 0, 0, 0};
msk0=_mm_loadu_si128((__m128i *)msk0_v);
msk1=_mm_shuffle_epi32(msk0,0b01001110);
msk0_1=_mm_blend_epi16(msk0,msk1,0b11110000);
zero=_mm_setzero_si128();
bin_ones=_mm_cmpeq_epi32(zero,zero);
one_epi8=_mm_sub_epi8(zero,bin_ones);
/* load indices */
a=_mm_loadu_si128((__m128i *)a_v);
/* start of 'SIMD magic' */
/* index a_i sets the a_i -th bit within a byte of t0 if 0<=a_i<8 */
/* or set (a_i-8)-th bit within a byte of t1 if 8<=a_i<16 */
t0=_mm_shuffle_epi8(msk0,a);
t1=_mm_shuffle_epi8(msk1,a);
/* horizontal OR of the bytes in t0 and t1: */
t2=_mm_blend_epi16(t0,t1,0b11110000);
t3=_mm_alignr_epi8(t1,t0,8);
t0=_mm_or_si128(t2,t3);
t1=_mm_shuffle_epi32(t0,0b10110001);
t0=_mm_or_si128(t0,t1);
t1=_mm_slli_si128(t0,2);
t0=_mm_or_si128(t0,t1);
t1=_mm_slli_si128(t0,1);
t0=_mm_or_si128(t0,t1);
t0=_mm_shuffle_epi32(t0,0b11110101); /* end of horizontal OR */
/* filter out the relevant bits */
t0=_mm_and_si128(t0,msk0_1);
t0=_mm_cmpeq_epi8(t0,zero);
r=_mm_andnot_si128(t0,one_epi8); /* the result is in r */
print_char128(&r);
return 0;
}
This should work quite fast: Aside from the instructions for setting the constants and loading the data it is only 15 SSEx instructions. On today's processors these instructions all have a latency of only 1 cycle. The (reciprocal) througput is even smaller: 1/2 or 1/3 cycle. Intrinsic _mm_blend_epi16 is SSE4.1, some others are SSSE3.
这应该非常快:除了设置常量和加载数据的说明之外,它只有15条SSEx指令。在今天的处理器上,这些指令的延迟仅为1个周期。 (倒数)吞吐量甚至更小:1/2或1/3周期。内部_mm_blend_epi16是SSE4.1,其他一些是SSSE3。