There are many questions here about whether direct access or indirect access (via a pointer) is faster when accessing structure members in C.
One example: C pointers vs direct member access for structs
The general opinion is that direct access will be faster (at least in theory), since no pointer dereference is involved.
So I gave it a try with a chunk of code on my system: GNU Embedded Tools GCC 4.7.4, generating code for ARM (an ARM Cortex-A15, specifically).
Surprisingly, direct access was much slower. Then I generated the assembly code for the object file.
The direct-access code has 114 lines of assembly, while the indirect-access code has 33 lines. What is going on here?
Below are the C code and the generated assembly code for the functions. The structures are all mapped to external memory, and the structure members are all one byte wide (unsigned char type).
First function, with indirect access:
void sub_func_1(unsigned int num_of, struct file_s *__restrict__ first_file_ptr, struct file_s *__restrict__ second_file_ptr, struct output_s *__restrict__ output_ptr)
{
    if(LIKELY(num_of == 0))
    {
        output_ptr->curr_id = UNUSED;
        output_ptr->curr_cnt = output_ptr->cnt;
        output_ptr->curr_mode = output_ptr->_mode;
        output_ptr->curr_type = output_ptr->type;
        output_ptr->curr_size = output_ptr->size;
        output_ptr->curr_allocation_type = output_ptr->allocation_type;
        output_ptr->curr_allocation_localized = output_ptr->allocation_localized;
        output_ptr->curr_mode_enable = output_ptr->mode_enable;
        if(output_ptr->curr_cnt == 1)
        {
            first_file_ptr->status = BLOCK_IDLE;
            first_file_ptr->type = USER_DATA_TYPE;
            first_file_ptr->index = FIRST__WORD;
            first_file_ptr->layer_cnt = output_ptr->layer_cnt;
            second_file_ptr->status = DISABLED;
            second_file_ptr->index = 0;
            second_file_ptr->redundancy_version = 1;
            output_ptr->total_layer_cnt = first_file_ptr->layer_cnt;
        }
    }
}
00000000 <sub_func_1>:
0: e3500000 cmp r0, #0
4: e92d01f0 push {r4, r5, r6, r7, r8}
8: 1a00001b bne 7c <sub_func_1+0x7c>
c: e5d34007 ldrb r4, [r3, #7]
10: e3a05008 mov r5, #8
14: e5d3c003 ldrb ip, [r3, #3]
18: e5d38014 ldrb r8, [r3, #20]
1c: e5c35001 strb r5, [r3, #1]
20: e5d37015 ldrb r7, [r3, #21]
24: e5d36018 ldrb r6, [r3, #24]
28: e5c34008 strb r4, [r3, #8]
2c: e5d35019 ldrb r5, [r3, #25]
30: e35c0001 cmp ip, #1
34: e5c3c005 strb ip, [r3, #5]
38: e5d34012 ldrb r4, [r3, #18]
3c: e5c38010 strb r8, [r3, #16]
40: e5c37011 strb r7, [r3, #17]
44: e5c3601a strb r6, [r3, #26]
48: e5c3501b strb r5, [r3, #27]
4c: e5c34013 strb r4, [r3, #19]
50: 1a000009 bne 7c <sub_func_1+0x7c>
54: e5d3400b ldrb r4, [r3, #11]
58: e3a05005 mov r5, #5
5c: e5c1c000 strb ip, [r1]
60: e5c10002 strb r0, [r1, #2]
64: e5c15001 strb r5, [r1, #1]
68: e5c20000 strb r0, [r2]
6c: e5c14003 strb r4, [r1, #3]
70: e5c20005 strb r0, [r2, #5]
74: e5c2c014 strb ip, [r2, #20]
78: e5c3400f strb r4, [r3, #15]
7c: e8bd01f0 pop {r4, r5, r6, r7, r8}
80: e12fff1e bx lr
Second function, with direct access:
void sub_func_2(unsigned int output_index, unsigned int cc_index, unsigned int num_of)
{
    if(LIKELY(num_of == 0))
    {
        output_file[output_index].curr_id = UNUSED;
        output_file[output_index].curr_cnt = output_file[output_index].cnt;
        output_file[output_index].curr_mode = output_file[output_index]._mode;
        output_file[output_index].curr_type = output_file[output_index].type;
        output_file[output_index].curr_size = output_file[output_index].size;
        output_file[output_index].curr_allocation_type = output_file[output_index].allocation_type;
        output_file[output_index].curr_allocation_localized = output_file[output_index].allocation_localized;
        output_file[output_index].curr_mode_enable = output_file[output_index].mode_enable;
        if(output_file[output_index].curr_cnt == 1)
        {
            output_file[output_index].cc_file[cc_index].file[0].status = BLOCK_IDLE;
            output_file[output_index].cc_file[cc_index].file[0].type = USER_DATA_TYPE;
            output_file[output_index].cc_file[cc_index].file[0].index = FIRST__WORD;
            output_file[output_index].cc_file[cc_index].file[0].layer_cnt = output_file[output_index].layer_cnt;
            output_file[output_index].cc_file[cc_index].file[1].status = DISABLED;
            output_file[output_index].cc_file[cc_index].file[1].index = 0;
            output_file[output_index].cc_file[cc_index].file[1].redundancy_version = 1;
            output_file[output_index].total_layer_cnt = output_file[output_index].cc_file[cc_index].file[0].layer_cnt;
        }
    }
}
00000084 <sub_func_2>:
84: e92d0ff0 push {r4, r5, r6, r7, r8, r9, sl, fp}
88: e3520000 cmp r2, #0
8c: e24dd018 sub sp, sp, #24
90: e58d2004 str r2, [sp, #4]
94: 1a000069 bne 240 <sub_func_2+0x1bc>
98: e3a03d61 mov r3, #6208 ; 0x1840
9c: e30dc0c0 movw ip, #53440 ; 0xd0c0
a0: e340c001 movt ip, #1
a4: e3002000 movw r2, #0
a8: e0010193 mul r1, r3, r1
ac: e3402000 movt r2, #0
b0: e3067490 movw r7, #25744 ; 0x6490
b4: e3068488 movw r8, #25736 ; 0x6488
b8: e3a0b008 mov fp, #8
bc: e3066498 movw r6, #25752 ; 0x6498
c0: e02c109c mla ip, ip, r0, r1
c4: e082c00c add ip, r2, ip
c8: e28c3b19 add r3, ip, #25600 ; 0x6400
cc: e08c4007 add r4, ip, r7
d0: e5d39083 ldrb r9, [r3, #131] ; 0x83
d4: e08c5006 add r5, ip, r6
d8: e5d3a087 ldrb sl, [r3, #135] ; 0x87
dc: e5c3b081 strb fp, [r3, #129] ; 0x81
e0: e5c39085 strb r9, [r3, #133] ; 0x85
e4: e2833080 add r3, r3, #128 ; 0x80
e8: e7cca008 strb sl, [ip, r8]
ec: e5d4a004 ldrb sl, [r4, #4]
f0: e7cca007 strb sl, [ip, r7]
f4: e5d47005 ldrb r7, [r4, #5]
f8: e5c47001 strb r7, [r4, #1]
fc: e7dc6006 ldrb r6, [ip, r6]
100: e5d5c001 ldrb ip, [r5, #1]
104: e5c56002 strb r6, [r5, #2]
108: e5c5c003 strb ip, [r5, #3]
10c: e5d4c002 ldrb ip, [r4, #2]
110: e5c4c003 strb ip, [r4, #3]
114: e5d33005 ldrb r3, [r3, #5]
118: e3530001 cmp r3, #1
11c: 1a000047 bne 240 <sub_func_2+0x1bc>
120: e30dc0c0 movw ip, #53440 ; 0xd0c0
124: e30db0c0 movw fp, #53440 ; 0xd0c0
128: e1a0700c mov r7, ip
12c: e7dfc813 bfi ip, r3, #16, #16
130: e1a05007 mov r5, r7
134: e1a0900b mov r9, fp
138: e02c109c mla ip, ip, r0, r1
13c: e1a04005 mov r4, r5
140: e1a0a00b mov sl, fp
144: e7df9813 bfi r9, r3, #16, #16
148: e7dfb813 bfi fp, r3, #16, #16
14c: e1a06007 mov r6, r7
150: e7dfa813 bfi sl, r3, #16, #16
154: e58dc008 str ip, [sp, #8]
158: e7df6813 bfi r6, r3, #16, #16
15c: e1a0c004 mov ip, r4
160: e7df4813 bfi r4, r3, #16, #16
164: e02b109b mla fp, fp, r0, r1
168: e7df5813 bfi r5, r3, #16, #16
16c: e0291099 mla r9, r9, r0, r1
170: e7df7813 bfi r7, r3, #16, #16
174: e7dfc813 bfi ip, r3, #16, #16
178: e0261096 mla r6, r6, r0, r1
17c: e0241094 mla r4, r4, r0, r1
180: e082b00b add fp, r2, fp
184: e0829009 add r9, r2, r9
188: e02a109a mla sl, sl, r0, r1
18c: e28bbc65 add fp, fp, #25856 ; 0x6500
190: e58d600c str r6, [sp, #12]
194: e2899c65 add r9, r9, #25856 ; 0x6500
198: e3a06005 mov r6, #5
19c: e58d4010 str r4, [sp, #16]
1a0: e59d4008 ldr r4, [sp, #8]
1a4: e0251095 mla r5, r5, r0, r1
1a8: e5cb3000 strb r3, [fp]
1ac: e082a00a add sl, r2, sl
1b0: e59db00c ldr fp, [sp, #12]
1b4: e5c96001 strb r6, [r9, #1]
1b8: e59d6004 ldr r6, [sp, #4]
1bc: e28aac65 add sl, sl, #25856 ; 0x6500
1c0: e58d5014 str r5, [sp, #20]
1c4: e0271097 mla r7, r7, r0, r1
1c8: e0825004 add r5, r2, r4
1cc: e30d40c0 movw r4, #53440 ; 0xd0c0
1d0: e02c109c mla ip, ip, r0, r1
1d4: e0855008 add r5, r5, r8
1d8: e7df4813 bfi r4, r3, #16, #16
1dc: e5ca6002 strb r6, [sl, #2]
1e0: e5d59003 ldrb r9, [r5, #3]
1e4: e082600b add r6, r2, fp
1e8: e59db014 ldr fp, [sp, #20]
1ec: e0201094 mla r0, r4, r0, r1
1f0: e2866c65 add r6, r6, #25856 ; 0x6500
1f4: e59d1010 ldr r1, [sp, #16]
1f8: e306a53c movw sl, #25916 ; 0x653c
1fc: e0827007 add r7, r2, r7
200: e2877c65 add r7, r7, #25856 ; 0x6500
204: e082c00c add ip, r2, ip
208: e5c69003 strb r9, [r6, #3]
20c: e59d6004 ldr r6, [sp, #4]
210: e28ccc65 add ip, ip, #25856 ; 0x6500
214: e082500b add r5, r2, fp
218: e0820000 add r0, r2, r0
21c: e0824001 add r4, r2, r1
220: e085500a add r5, r5, sl
224: e0808008 add r8, r0, r8
228: e7c4600a strb r6, [r4, sl]
22c: e5c56005 strb r6, [r5, #5]
230: e5c73050 strb r3, [r7, #80] ; 0x50
234: e5dc3003 ldrb r3, [ip, #3]
238: e287704c add r7, r7, #76 ; 0x4c
23c: e5c83007 strb r3, [r8, #7]
240: e28dd018 add sp, sp, #24
244: e8bd0ff0 pop {r4, r5, r6, r7, r8, r9, sl, fp}
248: e12fff1e bx lr
And lastly, my compile options are:
# Compile options.
C_OPTS = -Wall \
-std=gnu99 \
-fgnu89-inline \
-Wcast-align \
-Werror=uninitialized \
-Werror=maybe-uninitialized \
-Werror=overflow \
-mcpu=cortex-a15 \
-mtune=cortex-a15 \
-mabi=aapcs \
-mfpu=neon \
-ftree-vectorize \
-ftree-slp-vectorize \
-ftree-vectorizer-verbose=4 \
-mfloat-abi=hard \
-O3 \
-flto \
-marm \
-ffat-lto-objects \
-fno-gcse \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
-fno-strict-overflow \
-fuse-linker-plugin \
-falign-functions=4 \
-falign-loops=4 \
-falign-labels=4 \
-falign-jumps=4
Update:
Note: I deleted the structure definitions because they differed from the version in my own program. It is actually a huge structure, and it would not be practical to include it here in full.
As suggested, I got rid of -fno-gcse, and the generated asm is not as huge as before.
Without -fno-gcse, sub_func_1 generates the same code as above.
For sub_func_2:
00000084 <sub_func_2>:
84: e3520000 cmp r2, #0
88: e92d0070 push {r4, r5, r6}
8c: 1a000030 bne 154 <sub_func_2+0xd0>
90: e30d30c0 movw r3, #53440 ; 0xd0c0
94: e3a06008 mov r6, #8
98: e3403001 movt r3, #1
9c: e0030093 mul r3, r3, r0
a0: e3a00d61 mov r0, #6208 ; 0x1840
a4: e0213190 mla r1, r0, r1, r3
a8: e59f30ac ldr r3, [pc, #172] ; 15c <sub_func_2+0xd8>
ac: e0831001 add r1, r3, r1
b0: e2813b19 add r3, r1, #25600 ; 0x6400
b4: e5d34083 ldrb r4, [r3, #131] ; 0x83
b8: e1a00003 mov r0, r3
bc: e5d35087 ldrb r5, [r3, #135] ; 0x87
c0: e5c36081 strb r6, [r3, #129] ; 0x81
c4: e5c34085 strb r4, [r3, #133] ; 0x85
c8: e3064488 movw r4, #25736 ; 0x6488
cc: e2833080 add r3, r3, #128 ; 0x80
d0: e7c15004 strb r5, [r1, r4]
d4: e5d05094 ldrb r5, [r0, #148] ; 0x94
d8: e0844006 add r4, r4, r6
dc: e7c15004 strb r5, [r1, r4]
e0: e5d04095 ldrb r4, [r0, #149] ; 0x95
e4: e5d0c092 ldrb ip, [r0, #146] ; 0x92
e8: e5c04091 strb r4, [r0, #145] ; 0x91
ec: e3064498 movw r4, #25752 ; 0x6498
f0: e7d15004 ldrb r5, [r1, r4]
f4: e5c0c093 strb ip, [r0, #147] ; 0x93
f8: e5d04099 ldrb r4, [r0, #153] ; 0x99
fc: e5c0509a strb r5, [r0, #154] ; 0x9a
100: e5c0409b strb r4, [r0, #155] ; 0x9b
104: e5d33005 ldrb r3, [r3, #5]
108: e3530001 cmp r3, #1
10c: 1a000010 bne 154 <sub_func_2+0xd0>
110: e281cc65 add ip, r1, #25856 ; 0x6500
114: e3a06005 mov r6, #5
118: e2810b19 add r0, r1, #25600 ; 0x6400
11c: e1a0500c mov r5, ip
120: e5cc3000 strb r3, [ip]
124: e1a0400c mov r4, ip
128: e5cc6001 strb r6, [ip, #1]
12c: e5cc2002 strb r2, [ip, #2]
130: e5d0608b ldrb r6, [r0, #139] ; 0x8b
134: e5cc6003 strb r6, [ip, #3]
138: e306c53c movw ip, #25916 ; 0x653c
13c: e7c1200c strb r2, [r1, ip]
140: e5c52041 strb r2, [r5, #65] ; 0x41
144: e285503c add r5, r5, #60 ; 0x3c
148: e5c43050 strb r3, [r4, #80] ; 0x50
14c: e284404c add r4, r4, #76 ; 0x4c
150: e5c0608f strb r6, [r0, #143] ; 0x8f
154: e8bd0070 pop {r4, r5, r6}
158: e12fff1e bx lr
15c: 00000000 .word 0x00000000
2 Answers
#1
7
TL:DR: can't reproduce that insane compiler output. Maybe the surrounding code + LTO did it?
I do have suggestions to improve the code: see the stuff below about copying whole structs instead of copying many individual members.
The question you linked is about accessing a value-type global directly vs. through a global pointer. On ARM, where it takes multiple instructions or a load from a nearby constant to get an arbitrary 32bit pointer into a register, passing around pointers is better than having each function reference a global directly.
See this example on the Godbolt Compiler Explorer (ARM gcc 4.8.2 -O3)
struct example {
    int a, b, c;
} global_example;
int load_global(void) { return global_example.c; }
movw r3, #:lower16:global_example @ tmp113,
movt r3, #:upper16:global_example @ tmp113,
ldr r0, [r3, #8] @, global_example.c
bx lr @
int load_pointer(struct example *p) { return p->c; }
ldr r0, [r0, #8] @, p_2(D)->c
bx lr @
(Apparently gcc is horrible at passing structs by val as function args; see the code for byval(struct example by_val) on the godbolt link.)
Even worse is if you have a global pointer: first you have to load the value of the pointer, then another load to dereference it. This is the indirection overhead that was being discussed in the question you linked. If both loads miss in cache, you're paying the round-trip latency twice. The load address for the 2nd load isn't available until the first load completes, so no pipelining of those memory requests is possible even on an out-of-order CPU.
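As a minimal sketch of that double-indirection case (reusing the struct example from above, with a hypothetical global pointer global_example_ptr added for illustration): the compiler must first load the pointer itself from memory, and only then load the member through it, so the two loads are serially dependent.

struct example *global_example_ptr;   /* hypothetical global pointer, for illustration only */

int load_through_global_pointer(void)
{
    /* First load: fetch the pointer value from memory.
       Second load: dereference it to read .c -- this one cannot
       even start until the first load has completed. */
    return global_example_ptr->c;
}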
If you already have a pointer as an arg, it will be in a register. Dereferencing it is the same as loading from a global. (But better, because you don't need to get the global's address into a register yourself.)
Your real code
I can't reproduce your massive asm output with ARM gcc 4.8.2 on Godbolt, or locally with ARM gcc 5.2.1. I'm not using LTO, though, since I don't have a complete test program.
All I can see is just slightly larger code to do some index math.
bfi is Bitfield Insert. I think 144: e7df9813 bfi r9, r3, #16, #16 is setting the top half of r9 = low half of r3. I don't see how that and mla (integer mul-accumulate) make much sense. Other than perverse results from -ftree-vectorize, all I can think of is maybe -fno-gcse has a really bad impact for the version of gcc you tested.
Is it manipulating constants that are going to be stored? The code you actually posted #defines everything to 0, which gcc takes advantage of. (It also takes advantage of the fact that it already has 1 in a register if curr_cnt == 1, and stores that register for the second_file_ptr->redundancy_version = 1;). ARM doesn't have a str [mem], immediate or anything like x86's mov [mem], imm.
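To illustrate that point with a hedged, hypothetical helper (not from the posted code): ARM's strb only stores from a register, so every distinct constant has to be materialized with a mov first, while a constant that already happens to be in a register can simply be reused.

void store_small_constants(unsigned char *p)   /* hypothetical helper, illustration only */
{
    p[0] = 0;   /* a register already holding 0 can be reused as the strb source */
    p[1] = 5;   /* each distinct nonzero constant needs its own mov rX, #imm first */
    p[2] = 8;
}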
If your compiler output is from code with different values for those constants, the compiler would be doing more work to store different things.
Unfortunately gcc is bad at merging narrow stores into a single wider store (long-standing missed-optimization bug). For x86, clang does this in at least one case, storing 0x0100 (256) instead of a 0 and a 1. Check on godbolt by flipping the compiler to clang 3.7.1 or something, and removing the ARM-specific compiler args. There's a mov word ptr [rsi], 256 where gcc uses:
mov BYTE PTR [rsi], 0 # *first_file_ptr_23(D).status,
mov BYTE PTR [rsi+1], 1 # *first_file_ptr_23(D).type,
If you arranged your structs carefully, there would be more opportunities for copying 4B blocks in this function.
It might also help to have two identical sub-structs of curr and not-curr, instead of curr_size and size. You might have to declare it packed to avoid padding after the sub-structs, though. Your two groups of members aren't in exactly the same order, which prevents compilers from doing much block-copying anyway when you do a bunch of assignments.
It helps gcc and clang copy multiple bytes at once if you do:
struct output_s_optimized {
    struct __attribute__((packed)) stuff {
        unsigned char cnt,
                      mode,
                      type,
                      size,
                      allocation_type,
                      allocation_localized,
                      mode_enable;
    } curr;                        // 7B
    unsigned char curr_id;         // no non-curr id?
    struct stuff non_curr;
    unsigned char layer_cnt;
    // Another 8 byte boundary here
    unsigned char total_layer_cnt;
    struct cc_file_s cc_file[128];
};

void foo(struct output_s_optimized *p) {
    p->curr_id = 0;
    p->non_curr = p->curr;
}

void bar(struct output_s_optimized *output_ptr) {
    output_ptr->curr_id = 0;
    output_ptr->curr.cnt = output_ptr->non_curr.cnt;
    output_ptr->curr.mode = output_ptr->non_curr.mode;
    output_ptr->curr.type = output_ptr->non_curr.type;
    output_ptr->curr.size = output_ptr->non_curr.size;
    output_ptr->curr.allocation_type = output_ptr->non_curr.allocation_type;
    output_ptr->curr.allocation_localized = output_ptr->non_curr.allocation_localized;
    output_ptr->curr.mode_enable = output_ptr->non_curr.mode_enable;
}
gcc 4.8.2 compiles foo() to three copies: byte, 2B, and 4B, even on ARM. It compiles bar() to eight 1B copies, and so does clang-3.8 on x86. So copying whole structs can help your compiler a lot (as well as making sure the data to be copied is arranged in the same order in both locations).
the same code on x86: nothing new
You can use -fverbose-asm to put comments on each line. For x86, the compiler output from gcc 6.1 -O3 is very similar between versions, as you can see on the Godbolt Compiler Explorer. x86 addressing modes can index a global variable directly, so you see stuff like:
movzx edi, BYTE PTR [rcx+10] # *output_ptr_7(D)._mode
# where rcx is the output_ptr arg, used directly
vs.
movzx ecx, BYTE PTR output_file[rdi+10] # output_file[output_index_7(D)]._mode
# where rdi = output_index * 1297 (sizeof(output_file[0])), calculated once at the start
(gcc apparently doesn't care that each instruction has a 4B displacement as part of the addressing mode, but this is an ARM question so I won't go into the tradeoffs between code-size and insn count with x86's variable-length insns.)
#2
1
In broad (architecture-agnostic) terms, this is what your instructions do:
global_structure_pointer->field = value;
- loads the value of global_structure_pointer into an addressing register.
- adds the offset represented by field to the addressing register.
- stores value into the memory location addressed by the addressing register.
global_structure[index].field = value;
- loads the address of global_structure into an addressing register.
- loads the value of index into an arithmetic register.
- multiplies the arithmetic register by the size of a global_structure element.
- adds the arithmetic register to the addressing register.
- stores value into the memory location addressed by the addressing register.
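A minimal side-by-side sketch of the two forms (hypothetical struct and names, assuming one-byte members as in the question): the pointer form already has its base address in a register, while the indexed form must first scale the index by the element size before it can form the address.

struct elem {
    unsigned char field;
    /* ... more members ... */
};

struct elem table[16];                    /* hypothetical global array */

void store_via_pointer(struct elem *p, unsigned char value)
{
    p->field = value;                     /* base register + constant offset */
}

void store_indexed(unsigned int index, unsigned char value)
{
    table[index].field = value;           /* &table + index * sizeof(struct elem) + offset */
}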
Your confusion seems to be due to a misunderstanding of what "direct access" actually is.
THIS is direct access:
global_structure.field = value;
What you thought of as direct access is in fact indexed access.