I'm developing an iOS app that needs to convert images from RGB -> BGRA fairly quickly. I would like to use NEON intrinsics if possible. Is there a faster way than simply assigning the components?
我正在开发一个iOS应用程序,需要相当快速地把图像从RGB转换为BGRA。如果可能的话,我想使用NEON内建函数(intrinsics)。有没有比简单地逐个分量赋值更快的方法?
/*
 * Convert packed 8-bit RGB pixels to packed BGRA, forcing alpha to 0xFF.
 *
 * src    : numPix * 3 bytes of R,G,B triples.
 * dst    : numPix * 4 bytes; receives B,G,R,0xFF quads.
 * numPix : number of pixels to convert; values <= 0 convert nothing.
 *
 * Full groups of 8 pixels are handled with NEON. The remaining
 * numPix % 8 pixels are converted by a scalar loop so that no pixel is
 * silently left unwritten when numPix is not a multiple of 8.
 */
void neonPermuteRGBtoBGRA(unsigned char* src, unsigned char* dst, int numPix)
{
    if (numPix <= 0)
        return;

    const uint8x8_t alpha = vdup_n_u8(0xff);
    const int groups = numPix / 8;   /* 8 pixels per NEON iteration */
    const int tail = numPix % 8;     /* leftovers handled below */

    for (int i = 0; i < groups; i++)
    {
        /* vld3 de-interleaves 24 bytes into separate R, G and B vectors. */
        uint8x8x3_t rgb = vld3_u8(src);
        uint8x8x4_t bgra;

        bgra.val[0] = rgb.val[2];    /* B */
        bgra.val[1] = rgb.val[1];    /* G */
        bgra.val[2] = rgb.val[0];    /* R */
        bgra.val[3] = alpha;         /* A */

        /* vst4 re-interleaves the four vectors into 32 output bytes. */
        vst4_u8(dst, bgra);

        src += 8 * 3;
        dst += 8 * 4;
    }

    /* Scalar fallback for the last (numPix % 8) pixels. */
    for (int i = 0; i < tail; i++)
    {
        dst[0] = src[2];
        dst[1] = src[1];
        dst[2] = src[0];
        dst[3] = 0xff;
        src += 3;
        dst += 4;
    }
}
2 个解决方案
#1
9
The ARMCC disassembly isn't that fast either :
ARMCC的反汇编结果也不算快:
-
It isn't using the most appropriate instructions
它没有使用最恰当的指令。
-
It mixes VFP instructions with NEON ones which causes huge hiccups every time
它把VFP指令和NEON指令混在一起使用,每次都会造成严重的停顿。
Try this :
试试这个:
@ RGB -> BGRA conversion loop, 8 pixels per iteration (ARM NEON, GAS syntax).
@ In:    r0 = source pointer (packed R,G,B bytes)
@        r1 = destination pointer (packed B,G,R,A bytes)
@        r2 = pixel count (signed); only full groups of 8 are converted
@ Clobb: r0, r1, r2, d0-d3, flags
        movs    r2, r2, asr #3          @ r2 = number of 8-pixel groups
        ble     done                    @ zero or negative count: touch no memory
        vmov.i8 d3, #0xff               @ d3 = constant alpha for all 8 pixels
loop:
        vld3.8  {d0-d2}, [r0]!          @ d0 = R, d1 = G, d2 = B (de-interleaved)
        subs    r2, r2, #1              @ groups remaining; sets flags for bgt
        vswp    d0, d2                  @ swap R and B -> d0 = B, d2 = R
        vst4.8  {d0-d3}, [r1]!          @ store interleaved B,G,R,A
        bgt     loop
done:
        bx      lr
My suggested code isn't fully optimized either, but further "real" optimizations would harm the readability seriously. So I stop here.
我的建议代码也没有得到充分的优化,但是进一步的“真正的”优化会严重损害可读性。所以我停在这里。
#2
7
This depends on the compiler. For example when I compile the code above with armcc (5.01) and disassemble it, what I get looks like (I'm just putting the loop and I moved alpha assignment outside of the loop)
这取决于编译器。例如,当我用armcc(5.01)编译上面的代码并对其反汇编时,得到的结果如下(我只贴出了循环部分,并且把alpha赋值移到了循环之外)
18: f420440d vld3.8 {d4-d6}, [r0]!
1c: e2822001 add r2, r2, #1 ; 0x1
20: eeb01b45 fcpyd d1, d5
24: eeb00b46 fcpyd d0, d6
28: eeb02b44 fcpyd d2, d4
2c: f401000d vst4.8 {d0-d3}, [r1]!
30: e1520003 cmp r2, r3
34: bafffff7 blt 18 <neonPermuteRGBtoBGRA_armcc+0x18>
If I compile the code with gcc (4.4.3) and disassemble again I get,
如果我用gcc(4.4.3)编译这段代码并再次反汇编,得到的是:
40: f967 040f vld3.8 {d16-d18}, [r7]
44: 46d6 mov lr, sl
46: ecca 0b06 vstmia sl, {d16-d18}
4a: 9d02 ldr r5, [sp, #8]
4c: ed8d 8b1a vstr d8, [sp, #104]
50: 3718 adds r7, #24
52: e8be 000f ldmia.w lr!, {r0, r1, r2, r3}
56: f108 0801 add.w r8, r8, #1 ; 0x1
5a: c50f stmia r5!, {r0, r1, r2, r3}
5c: eddd 0b24 vldr d16, [sp, #144]
60: e89e 0003 ldmia.w lr, {r0, r1}
64: edcd 0b16 vstr d16, [sp, #88]
68: eddd 0b22 vldr d16, [sp, #136]
6c: edcd 0b18 vstr d16, [sp, #96]
70: e885 0003 stmia.w r5, {r0, r1}
74: ed9d 0b26 vldr d0, [sp, #152]
78: 9d03 ldr r5, [sp, #12]
7a: ed8d 0b14 vstr d0, [sp, #80]
7e: cd0f ldmia r5!, {r0, r1, r2, r3}
80: 46ae mov lr, r5
82: 465d mov r5, fp
84: c50f stmia r5!, {r0, r1, r2, r3}
86: e89e 000f ldmia.w lr, {r0, r1, r2, r3}
8a: e885 000f stmia.w r5, {r0, r1, r2, r3}
8e: 9501 str r5, [sp, #4]
90: 465d mov r5, fp
92: 2100 movs r1, #0
94: 2220 movs r2, #32
96: 4620 mov r0, r4
98: f7ff fffe bl 0 <memset>
9c: cd0f ldmia r5!, {r0, r1, r2, r3}
9e: 4625 mov r5, r4
a0: c50f stmia r5!, {r0, r1, r2, r3}
a2: f8dd c004 ldr.w ip, [sp, #4]
a6: e89c 000f ldmia.w ip, {r0, r1, r2, r3}
aa: e885 000f stmia.w r5, {r0, r1, r2, r3}
ae: ecd4 0b08 vldmia r4, {d16-d19}
b2: f946 000f vst4.8 {d16-d19}, [r6]
b6: 3620 adds r6, #32
b8: 45c8 cmp r8, r9
ba: dbc1 blt.n 40 <memset+0x40>
And the execution time was 10 times faster with armcc.
而armcc的执行时间则快了10倍。
If I take the assembly code that armcc produced for the function (it looks like the alpha assignment is back inside the loop now :)) and compile it with gcc as inline assembly,
如果我把armcc为该函数生成的汇编代码(现在看起来alpha赋值又回到了循环里 :))以内联汇编的形式用gcc编译,
void neonPermuteRGBtoBGRA_gas(unsigned char* src, unsigned char* dst,
int numPix) {
asm(
" ASR r3,r2,#31\n"
" VMOV.I8 d1,#0xff\n"
" ADD r2,r2,r3,LSR #29\n"
" ASR r3,r2,#3\n"
" MOV r2,#0\n"
" CMP r3,#0\n"
" BLE end\n"
"loop:\n"
" VLD3.8 {d4,d5,d6},[r0]!\n"
" ADD r2,r2,#1\n"
" CMP r3,r2\n"
" VMOV.F64 d3,d5\n"
" VMOV.F64 d2,d6\n"
" VMOV.F64 d5,d1\n"
" VMOV.F64 d0,d4\n"
" VST4.8 {d2,d3,d4,d5},[r1]!\n"
" BGT loop\n"
"end:\n"
);
}
I get the same execution time with gcc as well.
我也得到了与gcc相同的执行时间。
In the end, my suggestion is to either disassemble your binary and check whether the compiler produces what you want, or write the routine in assembly yourself.
最后,我的建议是:要么反汇编你的二进制文件,检查编译器是否生成了你想要的代码,要么直接用汇编来编写。
Btw if you want to improve the execution time of this function even further, I suggest you to look into
如果你想进一步提高这个功能的执行时间,我建议你去调查一下。
- arm's PLD (preload data) instruction
- arm的PLD(预载数据)指令。
- make full use of the NEON instructions available in the loop, e.g. through loop unrolling (you'll notice that the actual bottleneck is the time needed to load the data from memory)
- 在循环中充分利用所有可用的NEON指令,比如进行循环展开(你会注意到,实际的瓶颈是从内存加载数据所需的时间)
#1
9
The ARMCC disassembly isn't that fast either :
ARMCC的反汇编结果也不算快:
-
It isn't using the most appropriate instructions
它没有使用最恰当的指令。
-
It mixes VFP instructions with NEON ones which causes huge hiccups every time
它把VFP指令和NEON指令混在一起使用,每次都会造成严重的停顿。
Try this :
试试这个:
@ RGB -> BGRA conversion loop, 8 pixels per iteration (ARM NEON, GAS syntax).
@ In:    r0 = source pointer (packed R,G,B bytes)
@        r1 = destination pointer (packed B,G,R,A bytes)
@        r2 = pixel count (signed); only full groups of 8 are converted
@ Clobb: r0, r1, r2, d0-d3, flags
        movs    r2, r2, asr #3          @ r2 = number of 8-pixel groups
        ble     done                    @ zero or negative count: touch no memory
        vmov.i8 d3, #0xff               @ d3 = constant alpha for all 8 pixels
loop:
        vld3.8  {d0-d2}, [r0]!          @ d0 = R, d1 = G, d2 = B (de-interleaved)
        subs    r2, r2, #1              @ groups remaining; sets flags for bgt
        vswp    d0, d2                  @ swap R and B -> d0 = B, d2 = R
        vst4.8  {d0-d3}, [r1]!          @ store interleaved B,G,R,A
        bgt     loop
done:
        bx      lr
My suggested code isn't fully optimized either, but further "real" optimizations would harm the readability seriously. So I stop here.
我的建议代码也没有得到充分的优化,但是进一步的“真正的”优化会严重损害可读性。所以我停在这里。
#2
7
This depends on the compiler. For example when I compile the code above with armcc (5.01) and disassemble it, what I get looks like (I'm just putting the loop and I moved alpha assignment outside of the loop)
这取决于编译器。例如,当我用armcc(5.01)编译上面的代码并对其反汇编时,得到的结果如下(我只贴出了循环部分,并且把alpha赋值移到了循环之外)
18: f420440d vld3.8 {d4-d6}, [r0]!
1c: e2822001 add r2, r2, #1 ; 0x1
20: eeb01b45 fcpyd d1, d5
24: eeb00b46 fcpyd d0, d6
28: eeb02b44 fcpyd d2, d4
2c: f401000d vst4.8 {d0-d3}, [r1]!
30: e1520003 cmp r2, r3
34: bafffff7 blt 18 <neonPermuteRGBtoBGRA_armcc+0x18>
If I compile the code with gcc (4.4.3) and disassemble again I get,
如果我用gcc(4.4.3)编译这段代码并再次反汇编,得到的是:
40: f967 040f vld3.8 {d16-d18}, [r7]
44: 46d6 mov lr, sl
46: ecca 0b06 vstmia sl, {d16-d18}
4a: 9d02 ldr r5, [sp, #8]
4c: ed8d 8b1a vstr d8, [sp, #104]
50: 3718 adds r7, #24
52: e8be 000f ldmia.w lr!, {r0, r1, r2, r3}
56: f108 0801 add.w r8, r8, #1 ; 0x1
5a: c50f stmia r5!, {r0, r1, r2, r3}
5c: eddd 0b24 vldr d16, [sp, #144]
60: e89e 0003 ldmia.w lr, {r0, r1}
64: edcd 0b16 vstr d16, [sp, #88]
68: eddd 0b22 vldr d16, [sp, #136]
6c: edcd 0b18 vstr d16, [sp, #96]
70: e885 0003 stmia.w r5, {r0, r1}
74: ed9d 0b26 vldr d0, [sp, #152]
78: 9d03 ldr r5, [sp, #12]
7a: ed8d 0b14 vstr d0, [sp, #80]
7e: cd0f ldmia r5!, {r0, r1, r2, r3}
80: 46ae mov lr, r5
82: 465d mov r5, fp
84: c50f stmia r5!, {r0, r1, r2, r3}
86: e89e 000f ldmia.w lr, {r0, r1, r2, r3}
8a: e885 000f stmia.w r5, {r0, r1, r2, r3}
8e: 9501 str r5, [sp, #4]
90: 465d mov r5, fp
92: 2100 movs r1, #0
94: 2220 movs r2, #32
96: 4620 mov r0, r4
98: f7ff fffe bl 0 <memset>
9c: cd0f ldmia r5!, {r0, r1, r2, r3}
9e: 4625 mov r5, r4
a0: c50f stmia r5!, {r0, r1, r2, r3}
a2: f8dd c004 ldr.w ip, [sp, #4]
a6: e89c 000f ldmia.w ip, {r0, r1, r2, r3}
aa: e885 000f stmia.w r5, {r0, r1, r2, r3}
ae: ecd4 0b08 vldmia r4, {d16-d19}
b2: f946 000f vst4.8 {d16-d19}, [r6]
b6: 3620 adds r6, #32
b8: 45c8 cmp r8, r9
ba: dbc1 blt.n 40 <memset+0x40>
And the execution time was 10 times faster with armcc.
而armcc的执行时间则快了10倍。
If I take the assembly code that armcc produced for the function (it looks like the alpha assignment is back inside the loop now :)) and compile it with gcc as inline assembly,
如果我把armcc为该函数生成的汇编代码(现在看起来alpha赋值又回到了循环里 :))以内联汇编的形式用gcc编译,
void neonPermuteRGBtoBGRA_gas(unsigned char* src, unsigned char* dst,
int numPix) {
asm(
" ASR r3,r2,#31\n"
" VMOV.I8 d1,#0xff\n"
" ADD r2,r2,r3,LSR #29\n"
" ASR r3,r2,#3\n"
" MOV r2,#0\n"
" CMP r3,#0\n"
" BLE end\n"
"loop:\n"
" VLD3.8 {d4,d5,d6},[r0]!\n"
" ADD r2,r2,#1\n"
" CMP r3,r2\n"
" VMOV.F64 d3,d5\n"
" VMOV.F64 d2,d6\n"
" VMOV.F64 d5,d1\n"
" VMOV.F64 d0,d4\n"
" VST4.8 {d2,d3,d4,d5},[r1]!\n"
" BGT loop\n"
"end:\n"
);
}
I get the same execution time with gcc as well.
我也得到了与gcc相同的执行时间。
In the end, my suggestion is to either disassemble your binary and check whether the compiler produces what you want, or write the routine in assembly yourself.
最后,我的建议是:要么反汇编你的二进制文件,检查编译器是否生成了你想要的代码,要么直接用汇编来编写。
Btw if you want to improve the execution time of this function even further, I suggest you to look into
如果你想进一步提高这个功能的执行时间,我建议你去调查一下。
- arm's PLD (preload data) instruction
- arm的PLD(预载数据)指令。
- make full use of the NEON instructions available in the loop, e.g. through loop unrolling (you'll notice that the actual bottleneck is the time needed to load the data from memory)
- 在循环中充分利用所有可用的NEON指令,比如进行循环展开(你会注意到,实际的瓶颈是从内存加载数据所需的时间)