This is a small test I built. Here we have two scenarios:
Scenario 1: Two functions (scenario1a and scenario1b) which inputs and outputs are uint16_t* and load/store to/from Neon datatype (uint16x8x4_t).
Scenario 2: Same functions as Scenario 1 (in this case scenario2a and scenario2b) but the inputs and outputs are uint16x8x4_t*, and the load and store are done in the main function.
(Below the c code I include the disassembly generated after compiling with -O3).
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario1a(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X);
result.val[0] = vextq_u16(temp.val[0], vmulq_n_u16(temp.val[1], -1), 2);
result.val[1] = vextq_u16(temp.val[1], vmulq_n_u16(temp.val[2], -1), 2);
result.val[2] = vextq_u16(temp.val[2], vmulq_n_u16(temp.val[3], -1), 2);
result.val[3] = vextq_u16(temp.val[3], vmulq_n_u16(temp.val[0], -1), 2);
vst1q_u16_x4(resultArray, result);
}
void scenario1b(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X);
result.val[0] = vaddq_u16(temp.val[0], temp.val[1]);
result.val[1] = vmulq_n_u16(temp.val[1], -2);
result.val[2] = vaddq_u16(temp.val[2], temp.val[3]);
result.val[3] = vmulq_n_u16(temp.val[3], -2);
vst1q_u16_x4(resultArray, result);
}
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vaddq_u16(temp->val[0], temp->val[1]);
result->val[1] = vmulq_n_u16(temp->val[1], -2);
result->val[2] = vaddq_u16(temp->val[2], temp->val[3]);
result->val[3] = vmulq_n_u16(temp->val[3], -2);
}
int main(void) {
uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};
// Scenario 01: Input and output are uint16_t*
uint16_t result01a[32];
uint16_t result01_final[32];
scenario1a(result01a, input);
scenario1b(result01_final, result01a);
// Scenario 02: Input and output are uint16x8x4_t
uint16_t result02_final[32];
uint16x8x4_t temp, result02a, result02b;
temp = vld1q_u16_x4(input);
scenario2a(&result02a, &temp);
scenario2b(&result02b, &result02a);
vst1q_u16_x4(result02_final, result02b);
return 0;
}
Disassembly:
test: file format elf64-littleaarch64
Disassembly of section .init:
0000000000000658 <_init>:
658: a9bf7bfd stp x29, x30, [sp, #-16]!
65c: 910003fd mov x29, sp
660: 94000065 bl 7f4 <call_weak_fn>
664: a8c17bfd ldp x29, x30, [sp], #16
668: d65f03c0 ret
Disassembly of section .plt:
0000000000000670 <.plt>:
670: a9bf7bf0 stp x16, x30, [sp, #-16]!
674: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
678: f947c611 ldr x17, [x16, #3976]
67c: 913e2210 add x16, x16, #0xf88
680: d61f0220 br x17
684: d503201f nop
688: d503201f nop
68c: d503201f nop
0000000000000690 <__cxa_finalize#plt>:
690: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
694: f947ca11 ldr x17, [x16, #3984]
698: 913e4210 add x16, x16, #0xf90
69c: d61f0220 br x17
00000000000006a0 <__libc_start_main#plt>:
6a0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6a4: f947ce11 ldr x17, [x16, #3992]
6a8: 913e6210 add x16, x16, #0xf98
6ac: d61f0220 br x17
00000000000006b0 <__stack_chk_fail#plt>:
6b0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6b4: f947d211 ldr x17, [x16, #4000]
6b8: 913e8210 add x16, x16, #0xfa0
6bc: d61f0220 br x17
00000000000006c0 <__gmon_start__#plt>:
6c0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6c4: f947d611 ldr x17, [x16, #4008]
6c8: 913ea210 add x16, x16, #0xfa8
6cc: d61f0220 br x17
00000000000006d0 <abort#plt>:
6d0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6d4: f947da11 ldr x17, [x16, #4016]
6d8: 913ec210 add x16, x16, #0xfb0
6dc: d61f0220 br x17
Disassembly of section .text:
00000000000006e0 <main>:
6e0: 90000085 adrp x5, 10000 <__FRAME_END__+0xf3d8>
6e4: a9a67bfd stp x29, x30, [sp, #-416]!
6e8: 910003fd mov x29, sp
6ec: 90000002 adrp x2, 0 <_init-0x658>
6f0: 91292042 add x2, x2, #0xa48
6f4: 910263e3 add x3, sp, #0x98
6f8: 910363e0 add x0, sp, #0xd8
6fc: 6f008434 mvni v20.8h, #0x1
700: f947f0a5 ldr x5, [x5, #4064]
704: aa0303e1 mov x1, x3
708: 910143e4 add x4, sp, #0x50
70c: a940344c ldp x12, x13, [x2]
710: a9412c4a ldp x10, x11, [x2, #16]
714: f94000a6 ldr x6, [x5]
718: f900cfe6 str x6, [sp, #408]
71c: d2800006 mov x6, #0x0 // #0
720: a9422448 ldp x8, x9, [x2, #32]
724: a9431c46 ldp x6, x7, [x2, #48]
728: 910463e2 add x2, sp, #0x118
72c: a909b7ec stp x12, x13, [sp, #152]
730: a90aafea stp x10, x11, [sp, #168]
734: a90ba7e8 stp x8, x9, [sp, #184]
738: a90c9fe6 stp x6, x7, [sp, #200]
73c: 94000069 bl 8e0 <scenario1a>
740: 4c402400 ld1 {v0.8h-v3.8h}, [x0]
744: 910043e1 add x1, sp, #0x10
748: aa0403e0 mov x0, x4
74c: 4c402470 ld1 {v16.8h-v19.8h}, [x3]
750: 4e619e85 mul v5.8h, v20.8h, v1.8h
754: 4e608424 add v4.8h, v1.8h, v0.8h
758: 4e628466 add v6.8h, v3.8h, v2.8h
75c: 4e639e87 mul v7.8h, v20.8h, v3.8h
760: 4c002030 st1 {v16.16b-v19.16b}, [x1]
764: 4c002444 st1 {v4.8h-v7.8h}, [x2]
768: 94000072 bl 930 <scenario2a>
76c: ad409885 ldp q5, q6, [x4, #16]
770: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
774: 910563e2 add x2, sp, #0x158
778: 3dc00c84 ldr q4, [x4, #48]
77c: 3dc017e7 ldr q7, [sp, #80]
780: f947f021 ldr x1, [x1, #4064]
784: 4e749c83 mul v3.8h, v4.8h, v20.8h
788: 4e668482 add v2.8h, v4.8h, v6.8h
78c: 4e749ca1 mul v1.8h, v5.8h, v20.8h
790: 4e6784a0 add v0.8h, v5.8h, v7.8h
794: 4c002440 st1 {v0.8h-v3.8h}, [x2]
798: f940cfe0 ldr x0, [sp, #408]
79c: f9400022 ldr x2, [x1]
7a0: eb020000 subs x0, x0, x2
7a4: d2800002 mov x2, #0x0 // #0
7a8: 54000081 b.ne 7b8 <main+0xd8> // b.any
7ac: 52800000 mov w0, #0x0 // #0
7b0: a8da7bfd ldp x29, x30, [sp], #416
7b4: d65f03c0 ret
7b8: 97ffffbe bl 6b0 <__stack_chk_fail#plt>
00000000000007bc <_start>:
7bc: d280001d mov x29, #0x0 // #0
7c0: d280001e mov x30, #0x0 // #0
7c4: aa0003e5 mov x5, x0
7c8: f94003e1 ldr x1, [sp]
7cc: 910023e2 add x2, sp, #0x8
7d0: 910003e6 mov x6, sp
7d4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7d8: f947f800 ldr x0, [x0, #4080]
7dc: 90000083 adrp x3, 10000 <__FRAME_END__+0xf3d8>
7e0: f947f463 ldr x3, [x3, #4072]
7e4: 90000084 adrp x4, 10000 <__FRAME_END__+0xf3d8>
7e8: f947e084 ldr x4, [x4, #4032]
7ec: 97ffffad bl 6a0 <__libc_start_main#plt>
7f0: 97ffffb8 bl 6d0 <abort#plt>
00000000000007f4 <call_weak_fn>:
7f4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7f8: f947ec00 ldr x0, [x0, #4056]
7fc: b4000040 cbz x0, 804 <call_weak_fn+0x10>
800: 17ffffb0 b 6c0 <__gmon_start__#plt>
804: d65f03c0 ret
808: d503201f nop
80c: d503201f nop
0000000000000810 <deregister_tm_clones>:
810: b0000080 adrp x0, 11000 <__data_start>
814: 91004000 add x0, x0, #0x10
818: b0000081 adrp x1, 11000 <__data_start>
81c: 91004021 add x1, x1, #0x10
820: eb00003f cmp x1, x0
824: 540000c0 b.eq 83c <deregister_tm_clones+0x2c> // b.none
828: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
82c: f947e421 ldr x1, [x1, #4040]
830: b4000061 cbz x1, 83c <deregister_tm_clones+0x2c>
834: aa0103f0 mov x16, x1
838: d61f0200 br x16
83c: d65f03c0 ret
0000000000000840 <register_tm_clones>:
840: b0000080 adrp x0, 11000 <__data_start>
844: 91004000 add x0, x0, #0x10
848: b0000081 adrp x1, 11000 <__data_start>
84c: 91004021 add x1, x1, #0x10
850: cb000021 sub x1, x1, x0
854: d37ffc22 lsr x2, x1, #63
858: 8b810c41 add x1, x2, x1, asr #3
85c: 9341fc21 asr x1, x1, #1
860: b40000c1 cbz x1, 878 <register_tm_clones+0x38>
864: 90000082 adrp x2, 10000 <__FRAME_END__+0xf3d8>
868: f947fc42 ldr x2, [x2, #4088]
86c: b4000062 cbz x2, 878 <register_tm_clones+0x38>
870: aa0203f0 mov x16, x2
874: d61f0200 br x16
878: d65f03c0 ret
87c: d503201f nop
0000000000000880 <__do_global_dtors_aux>:
880: a9be7bfd stp x29, x30, [sp, #-32]!
884: 910003fd mov x29, sp
888: f9000bf3 str x19, [sp, #16]
88c: b0000093 adrp x19, 11000 <__data_start>
890: 39404260 ldrb w0, [x19, #16]
894: 35000140 cbnz w0, 8bc <__do_global_dtors_aux+0x3c>
898: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
89c: f947e800 ldr x0, [x0, #4048]
8a0: b4000080 cbz x0, 8b0 <__do_global_dtors_aux+0x30>
8a4: b0000080 adrp x0, 11000 <__data_start>
8a8: f9400400 ldr x0, [x0, #8]
8ac: 97ffff79 bl 690 <__cxa_finalize#plt>
8b0: 97ffffd8 bl 810 <deregister_tm_clones>
8b4: 52800020 mov w0, #0x1 // #1
8b8: 39004260 strb w0, [x19, #16]
8bc: f9400bf3 ldr x19, [sp, #16]
8c0: a8c27bfd ldp x29, x30, [sp], #32
8c4: d65f03c0 ret
8c8: d503201f nop
8cc: d503201f nop
00000000000008d0 <frame_dummy>:
8d0: 17ffffdc b 840 <register_tm_clones>
8d4: d503201f nop
8d8: d503201f nop
8dc: d503201f nop
00000000000008e0 <scenario1a>:
8e0: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
8e4: 6e60b833 neg v19.8h, v1.8h
8e8: 6e60b852 neg v18.8h, v2.8h
8ec: 6e60b871 neg v17.8h, v3.8h
8f0: 6e60b810 neg v16.8h, v0.8h
8f4: 6e132004 ext v4.16b, v0.16b, v19.16b, #4
8f8: 6e122025 ext v5.16b, v1.16b, v18.16b, #4
8fc: 6e112046 ext v6.16b, v2.16b, v17.16b, #4
900: 6e102067 ext v7.16b, v3.16b, v16.16b, #4
904: 4c002404 st1 {v4.8h-v7.8h}, [x0]
908: d65f03c0 ret
90c: d503201f nop
0000000000000910 <scenario1b>:
910: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
914: 6f008430 mvni v16.8h, #0x1
918: 4e619e05 mul v5.8h, v16.8h, v1.8h
91c: 4e608424 add v4.8h, v1.8h, v0.8h
920: 4e628466 add v6.8h, v3.8h, v2.8h
924: 4e639e07 mul v7.8h, v16.8h, v3.8h
928: 4c002404 st1 {v4.8h-v7.8h}, [x0]
92c: d65f03c0 ret
0000000000000930 <scenario2a>:
930: ad400025 ldp q5, q0, [x1]
934: ad408423 ldp q3, q1, [x1, #16]
938: 3dc00c24 ldr q4, [x1, #48]
93c: 6e60b800 neg v0.8h, v0.8h
940: 4ea11c22 mov v2.16b, v1.16b
944: 6e60b821 neg v1.8h, v1.8h
948: 6e0020a5 ext v5.16b, v5.16b, v0.16b, #4
94c: 4ea41c80 mov v0.16b, v4.16b
950: 6e60b884 neg v4.8h, v4.8h
954: 6e012063 ext v3.16b, v3.16b, v1.16b, #4
958: 3d800005 str q5, [x0]
95c: 3dc00021 ldr q1, [x1]
960: 6e042042 ext v2.16b, v2.16b, v4.16b, #4
964: ad008803 stp q3, q2, [x0, #16]
968: 6e60b821 neg v1.8h, v1.8h
96c: 6e012000 ext v0.16b, v0.16b, v1.16b, #4
970: 3d800c00 str q0, [x0, #48]
974: d65f03c0 ret
978: d503201f nop
97c: d503201f nop
0000000000000980 <scenario2b>:
980: ad401022 ldp q2, q4, [x1]
984: 6f008420 mvni v0.8h, #0x1
988: ad410c21 ldp q1, q3, [x1, #32]
98c: 4e609c85 mul v5.8h, v4.8h, v0.8h
990: 4e648442 add v2.8h, v2.8h, v4.8h
994: 4e609c60 mul v0.8h, v3.8h, v0.8h
998: 4e638421 add v1.8h, v1.8h, v3.8h
99c: ad001402 stp q2, q5, [x0]
9a0: ad010001 stp q1, q0, [x0, #32]
9a4: d65f03c0 ret
00000000000009a8 <__libc_csu_init>:
9a8: a9bc7bfd stp x29, x30, [sp, #-64]!
9ac: 910003fd mov x29, sp
9b0: a90153f3 stp x19, x20, [sp, #16]
9b4: 90000094 adrp x20, 10000 <__FRAME_END__+0xf3d8>
9b8: 9135c294 add x20, x20, #0xd70
9bc: a9025bf5 stp x21, x22, [sp, #32]
9c0: 90000095 adrp x21, 10000 <__FRAME_END__+0xf3d8>
9c4: 9135a2b5 add x21, x21, #0xd68
9c8: cb150294 sub x20, x20, x21
9cc: 2a0003f6 mov w22, w0
9d0: a90363f7 stp x23, x24, [sp, #48]
9d4: aa0103f7 mov x23, x1
9d8: aa0203f8 mov x24, x2
9dc: 97ffff1f bl 658 <_init>
9e0: eb940fff cmp xzr, x20, asr #3
9e4: 54000160 b.eq a10 <__libc_csu_init+0x68> // b.none
9e8: 9343fe94 asr x20, x20, #3
9ec: d2800013 mov x19, #0x0 // #0
9f0: f8737aa3 ldr x3, [x21, x19, lsl #3]
9f4: aa1803e2 mov x2, x24
9f8: 91000673 add x19, x19, #0x1
9fc: aa1703e1 mov x1, x23
a00: 2a1603e0 mov w0, w22
a04: d63f0060 blr x3
a08: eb13029f cmp x20, x19
a0c: 54ffff21 b.ne 9f0 <__libc_csu_init+0x48> // b.any
a10: a94153f3 ldp x19, x20, [sp, #16]
a14: a9425bf5 ldp x21, x22, [sp, #32]
a18: a94363f7 ldp x23, x24, [sp, #48]
a1c: a8c47bfd ldp x29, x30, [sp], #64
a20: d65f03c0 ret
a24: d503201f nop
0000000000000a28 <__libc_csu_fini>:
a28: d65f03c0 ret
Disassembly of section .fini:
0000000000000a2c <_fini>:
a2c: a9bf7bfd stp x29, x30, [sp, #-16]!
a30: 910003fd mov x29, sp
a34: a8c17bfd ldp x29, x30, [sp], #16
a38: d65f03c0 ret
Questions
Normally people load the data from the pointer (using vld1q_u16_x4), operates using the Neon datatypes, and store back to another pointer (using vst1q_u16_x4), and don't use an approach like the one I used in Scenario 2 (sending the Neon datatypes as inputs/outputs). Is there a general reason why is this?
I checked the disassembly of Scenario 1a (starts at line 8e0) vs. Scenario 2a (starts at line 930). It seems scenario 2a has more data movement. Will this happen in all scenarios? So is it faster to do what I asked in question 1? If so, then why this doesn't happen in Scenario 1b vs 2b (lines 910 and 980, respectively).
In the main function, there are some add/mul after both Scenario1a and 2a (in lines 750,754,758,75c and 784,788,78c,790), but my main function has no multiplications nor additions. Why is this happening? (I'm just curious)
Thank you for all your help!
There is absolutely no reason for using pointer to neon datatypes for parameters. Memory doesn't care about datatypes. Compilers are very conservative and bureaucratic, they simply have to. It's like filing an application to authorities: One wrong check mark, your application will land in the wrong hand, causing tremendeous unnecessary trouble.
Short: Keep it simple. Don't try to impress compilers or reviewers in any way.
I told you last time to be explicit on memory load and store. You are computing directly from/to memory in scenario2. Never do this. Stick to load->compute->store. Local variables are your best friends. (__restrict directive might help)
Again, do not try to impress compilers or reviewers. Your scenario2 is just asking for trouble. A sheer disaster. The reviewer will raise a red flag immediately, and keep his eye on you and all your codes, if you are lucky and didn't get fired the instant.
You shouldn't put callees in the same file as the caller. More than often, the caller will inline short non-static callees which makes profiling harder.
If a variable is not specified with the keyword volatile, the compiler likely does caching. The variable must be accessed from memory always otherwise until its transaction unit ends. The point I wonder lies in assembly part.
int main() {
/* volatile */ int lock = 999;
while (lock);
}
On x86-64-clang-3.0.0 compiler, its assembly code is following.
main: # #main
mov DWORD PTR [RSP - 4], 0
mov DWORD PTR [RSP - 8], 999
.LBB0_1: # =>This Inner Loop Header: Depth=1
cmp DWORD PTR [RSP - 8], 0
je .LBB0_3
jmp .LBB0_1
.LBB0_3:
mov EAX, DWORD PTR [RSP - 4]
ret
When volatile keyword is commented in, it turns out the following.
main: # #main
mov DWORD PTR [RSP - 4], 0
mov DWORD PTR [RSP - 8], 999
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov EAX, DWORD PTR [RSP - 8]
cmp EAX, 0
je .LBB0_3
jmp .LBB0_1
.LBB0_3:
mov EAX, DWORD PTR [RSP - 4]
ret
The points I wonder and don't understand,
cmp DWORD PTR [RSP - 8], 0 . <---
Why is the comparison done with 0 whilst DWORD PTR [RSP - 8] holds 999 within ?
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
It looks like you forgot to enable optimization. -O0 treats all variables (except register variables) pretty similarly to volatile for consistent debugging.
With optimization enabled, compilers can hoist non-volatile loads out of loops. while(locked); will compile similarly to source like
if (locked) {
while(1){}
}
Or since locked has a compile-time-constant initializer, the whole function should compile to jmp main (an infinite loop).
See MCU programming - C++ O2 optimization breaks while loop for more details.
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
Some compilers are worse at folding loads into memory operands for other instructions when you use volatile. I think that's why you're getting a separate mov load here; it's just a missed optimization.
(Although cmp [mem], imm might be less efficient. I forget if it can macro-fuse with a JCC or something. With a RIP-relative addressing mode it couldn't micro-fuse the load, but a register base is ok.)
cmp EAX, 0 is weird, I guess clang with optimization disabled doesn't look for test eax,eax as a peephole optimization for comparing against zero.
As #user3386109 commented, locked in a boolean context is equivalent to locked != 0 in C / C++.
The compiler doesn't know about caching, it is not a caching thing, it tells the compiler that the value may change between accesses. So to functionally implement our code it needs to perform the accesses we ask for in the order we ask them. Can't optimize out.
void fun1 ( void )
{
/* volatile */ int lock = 999;
while (lock) continue;
}
void fun2 ( void )
{
volatile int lock = 999;
while (lock) continue;
}
volatile int vlock;
int ulock;
void fun3 ( void )
{
while(vlock) continue;
}
void fun4 ( void )
{
while(ulock) continue;
}
void fun5 ( void )
{
vlock=3;
vlock=4;
}
void fun6 ( void )
{
ulock=3;
ulock=4;
}
I find it easier to see in arm... doesn't really matter.
Disassembly of section .text:
00001000 <fun1>:
1000: eafffffe b 1000 <fun1>
00001004 <fun2>:
1004: e59f3018 ldr r3, [pc, #24] ; 1024 <fun2+0x20>
1008: e24dd008 sub sp, sp, #8
100c: e58d3004 str r3, [sp, #4]
1010: e59d3004 ldr r3, [sp, #4]
1014: e3530000 cmp r3, #0
1018: 1afffffc bne 1010 <fun2+0xc>
101c: e28dd008 add sp, sp, #8
1020: e12fff1e bx lr
1024: 000003e7 andeq r0, r0, r7, ror #7
00001028 <fun3>:
1028: e59f200c ldr r2, [pc, #12] ; 103c <fun3+0x14>
102c: e5923000 ldr r3, [r2]
1030: e3530000 cmp r3, #0
1034: 012fff1e bxeq lr
1038: eafffffb b 102c <fun3+0x4>
103c: 00002000
00001040 <fun4>:
1040: e59f3014 ldr r3, [pc, #20] ; 105c <fun4+0x1c>
1044: e5933000 ldr r3, [r3]
1048: e3530000 cmp r3, #0
104c: 012fff1e bxeq lr
1050: e3530000 cmp r3, #0
1054: 012fff1e bxeq lr
1058: eafffffa b 1048 <fun4+0x8>
105c: 00002004
00001060 <fun5>:
1060: e3a01003 mov r1, #3
1064: e3a02004 mov r2, #4
1068: e59f3008 ldr r3, [pc, #8] ; 1078 <fun5+0x18>
106c: e5831000 str r1, [r3]
1070: e5832000 str r2, [r3]
1074: e12fff1e bx lr
1078: 00002000
0000107c <fun6>:
107c: e3a02004 mov r2, #4
1080: e59f3004 ldr r3, [pc, #4] ; 108c <fun6+0x10>
1084: e5832000 str r2, [r3]
1088: e12fff1e bx lr
108c: 00002004
Disassembly of section .bss:
00002000 <vlock>:
2000: 00000000
00002004 <ulock>:
2004: 00000000
First one is the most telling:
00001000 <fun1>:
1000: eafffffe b 1000 <fun1>
Being a local variable that is initialized, and non volatile then the compiler can assume it won't change value between accesses so it can never change in the while loop, so this is essentially a while 1 loop. If the initial value had been zero this would be a simple return as it can never be non-zero, being non-volatile.
fun2 being a local variable a stack frame needs to be built then.
It does what one assumes the code was trying to do, wait for this shared variable, one that can change during the loop
1010: e59d3004 ldr r3, [sp, #4]
1014: e3530000 cmp r3, #0
1018: 1afffffc bne 1010 <fun2+0xc>
so it samples it and tests what it samples each time through the loop.
fun3 and fun4 same deal but more realistic, as external to the function code isnt going to change lock, being non-global doesn't make much sense for your while loop.
102c: e5923000 ldr r3, [r2]
1030: e3530000 cmp r3, #0
1034: 012fff1e bxeq lr
1038: eafffffb b 102c <fun3+0x4>
For the volatile fun3 case the variable has to be read and tested each loop
1044: e5933000 ldr r3, [r3]
1048: e3530000 cmp r3, #0
104c: 012fff1e bxeq lr
1050: e3530000 cmp r3, #0
1054: 012fff1e bxeq lr
1058: eafffffa b 1048 <fun4+0x8>
For the non-volatile being global it has to sample it once, very interesting what the compiler did here, have to think about why it would do that, but either way you can see that the "loop" retests the value read stored in a register (not cached) which will never change with a proper program. Functionally we asked it to only read the variable once by using non-volatile then it tests that value indefinitely.
fun5 and fun6 further demonstrate that volatile requires the compiler perform the accesses to the variable in its storage place before moving on to the next operation/access in the code. So when volatile we are asking the compiler to perform two assignments, two stores. When non-volatile the compiler can optimize out the first store and only do the last one as if you look at the code as a whole this function (fun6) leaves the variable set to 4, so the function leaves the variable set to 4.
The x86 solution is equally interesting repz retq is all over it (with the compiler on my computer), not hard to find out what that is all about.
Neither aarch64, x86, mips, riscv, msp430, pdp11 backends do the double check on fun3().
pdp11 is actually the easier code to read (no surprise there)
00000000 <_fun1>:
0: 01ff br 0 <_fun1>
00000002 <_fun2>:
2: 65c6 fffe add $-2, sp
6: 15ce 03e7 mov $1747, (sp)
a: 1380 mov (sp), r0
c: 02fe bne a <_fun2+0x8>
e: 65c6 0002 add $2, sp
12: 0087 rts pc
00000014 <_fun3>:
14: 1dc0 0026 mov $3e <_vlock>, r0
18: 02fd bne 14 <_fun3>
1a: 0087 rts pc
0000001c <_fun4>:
1c: 1dc0 001c mov $3c <_ulock>, r0
20: 0bc0 tst r0
22: 02fe bne 20 <_fun4+0x4>
24: 0087 rts pc
00000026 <_fun5>:
26: 15f7 0003 0012 mov $3, $3e <_vlock>
2c: 15f7 0004 000c mov $4, $3e <_vlock>
32: 0087 rts pc
00000034 <_fun6>:
34: 15f7 0004 0002 mov $4, $3c <_ulock>
3a: 0087 rts pc
(this is the not linked version)
cmp DWORD PTR [RSP - 8], 0 . <--- Why is the comparison done with 0 whilst DWORD PTR [RSP - 8] holds 999 within ?
while does a true false comparison meaning is it equal to zero or not equal to zero
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
mov -0x8(%rsp),%eax
cmp 0,%eax
cmp 0,-0x8(%rsp)
as so.s -o so.o
so.s: Assembler messages:
so.s:3: Error: too many memory references for `cmp'
compare wants a register. So it reads into a register so it can do the compare as it can't do the compare between the immediate and the memory access in one instruction. If they could have done it in one instruction they would have.