Why does gcc, with -O3, unnecessarily clear a local ARM NEON array? - c

Consider the following code (Compiler Explorer link), compiled under gcc and clang with -O3 optimization:
#include <arm_neon.h>
// Minimal reproducer for the code-generation issue discussed below.
// For each of two 16-byte chunks of `in`, builds a 4-vector structure and
// writes it to the corresponding 64-byte chunk of `out` with a 4-way store.
void bug(int8_t *out, const int8_t *in) {
for (int i = 0; i < 2; i++) {
// Declared without an initializer; all four elements are assigned below
// before x is read by the store.
int8x16x4_t x;
x.val[0] = vld1q_s8(&in[16 * i]);
// The same shifted vector (signed shift right by 7) is used for elements 1..3.
x.val[1] = x.val[2] = x.val[3] = vshrq_n_s8(x.val[0], 7);
vst4q_s8(&out[64 * i], x);
}
}
NOTE: this is a minimally reproducible version of an issue that is popping up in many different functions of my actual, much more complex code, filled with arithmetic/logical/permutation instructions performing a totally different operation from above. Please refrain from criticizing and/or suggesting different ways of doing what the code above does, unless it has an effect on the code generation issue discussed below.
clang generates sane code:
bug(signed char*, signed char const*): // #bug(signed char*, signed char const*)
ldr q0, [x1]
sshr v1.16b, v0.16b, #7
mov v2.16b, v1.16b
mov v3.16b, v1.16b
st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
ldr q0, [x1, #16]
sshr v1.16b, v0.16b, #7
mov v2.16b, v1.16b
mov v3.16b, v1.16b
st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
ret
As for gcc, it inserts a lot of unnecessary operations, apparently zeroing out the registers that will be eventually input to the st4 instruction:
bug(signed char*, signed char const*):
sub sp, sp, #128
# mov x9, 0
# mov x8, 0
# mov x7, 0
# mov x6, 0
# mov x5, 0
# mov x4, 0
# mov x3, 0
# stp x9, x8, [sp]
# mov x2, 0
# stp x7, x6, [sp, 16]
# stp x5, x4, [sp, 32]
# str x3, [sp, 48]
ldr q0, [x1]
# stp x2, x9, [sp, 56]
# stp x8, x7, [sp, 72]
sshr v4.16b, v0.16b, 7
# str q0, [sp]
# ld1 {v0.16b - v3.16b}, [sp]
# stp x6, x5, [sp, 88]
mov v1.16b, v4.16b
# stp x4, x3, [sp, 104]
mov v2.16b, v4.16b
# str x2, [sp, 120]
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0], 64
### ldr q4, [x1, 16]
### add x1, sp, 64
### str q4, [sp, 64]
sshr v4.16b, v4.16b, 7
### ld1 {v0.16b - v3.16b}, [x1]
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0]
add sp, sp, 128
ret
I manually prefixed with # all instructions that could be safely taken out, without affecting the result of the function.
In addition, the instructions prefixed with ### perform an unnecessary trip to memory and back (and anyway, the mov instructions following ### ld1 ... overwrite 3 out of 4 registers loaded by that ld1 instruction), and could be replaced by a single load straight to v0.16b -- and the sshr instruction in the middle of the block would then use v0.16b as its source register.
As far as I know, x, being a local variable, can be used uninitialized; and even if it weren't, all registers are properly initialized, so there's no point in zeroing them out just to immediately overwrite them with values.
I'm inclined to think this is a gcc bug, but before reporting it, I'm curious if I missed something. Maybe there's a compilation flag, an __attribute__ or something else that I could use to make gcc generate sane code.
Thus, my question: is there anything I can do to generate sane code, or is this a bug I need to report to gcc?

Code generation on a fairly current development version of gcc appears to have improved immensely, at least for this case.
After installing the gcc-snapshot package (dated 20210918), gcc generates the following code:
bug:
ldr q5, [x1]
sshr v4.16b, v5.16b, 7
mov v0.16b, v5.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0], 64
ldr q4, [x1, 16]
mov v0.16b, v4.16b
sshr v4.16b, v4.16b, 7
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0]
ret
Not ideal yet -- at least two mov instructions could be removed per iteration by changing the destination registers of ldr and sshr -- but considerably better than before.

Short answer: welcome to GCC. Do not bother optimizing anything while you are using it. And Clang isn't better either.
Secret tip: add the ARM and ARM64 components to Visual Studio, and you'd be surprised how well it works. The problem, however, is that it generates COFF binaries, not ELF, and I haven't been able to find a converter.
You can use IDA Pro or dumpbin to generate a disassembly file, and it looks like this:
; void __fastcall bug(char *out, const char *in)
EXPORT bug
bug
MOV W10, #0
MOV W9, #0
$LL4 ; CODE XREF: bug+30↓j
ADD X8, X1, W9,SXTW
ADD W9, W9, #0x10
CMP W9, #0x20 ; ' '
LD1 {V0.16B}, [X8]
ADD X8, X0, W10,SXTW
ADD W10, W10, #0x40 ; '#'
SSHR V1.16B, V0.16B, #7
MOV V2.16B, V1.16B
MOV V3.16B, V1.16B
ST4 {V0.16B-V3.16B}, [X8]
B.LT $LL4
RET
; End of function bug
You can copy paste the disassembly to a GCC assembly file.
And don't bother with reporting the "bug" either. If they were listening, GCC wouldn't be this bad in the first place.

Related

How arguments are passed to the printf() function?

I am trying to understand the assembly code for a simple program, shown below.
// Increments x once per loop iteration (10 iterations), then prints it.
void f()
{
int i, x = 0;
for (i = 0; i < 10; i++)
x++;
// Two arguments are passed to printf: the format string and the value of x.
printf("Value of x: %d\n", x);
}
and its corresponding assembly code on my machine is
00000000000007d4 <f>:
7d4: a9be7bfd stp x29, x30, [sp, #-32]!
7d8: 910003fd mov x29, sp
7dc: b9001fff str wzr, [sp, #28]
7e0: b9001bff str wzr, [sp, #24]
7e4: 14000007 b 800 <f+0x2c>
7e8: b9401fe0 ldr w0, [sp, #28]
7ec: 11000400 add w0, w0, #0x1
7f0: b9001fe0 str w0, [sp, #28]
7f4: b9401be0 ldr w0, [sp, #24]
7f8: 11000400 add w0, w0, #0x1
7fc: b9001be0 str w0, [sp, #24]
800: b9401be0 ldr w0, [sp, #24]
804: 7100241f cmp w0, #0x9
808: 54ffff0d b.le 7e8 <f+0x14>
80c: b9401fe1 ldr w1, [sp, #28]
810: 90000000 adrp x0, 0 <__abi_tag-0x278>
814: 9121c000 add x0, x0, #0x870
818: 97ffff9a bl 680 <printf#plt>
81c: d503201f nop
820: a8c27bfd ldp x29, x30, [sp], #32
824: d65f03c0 ret
I understand the loop, but lines 814 - 818 are really confusing to me. What's the purpose of adding #0x870 to x0? What does line 818 mean? And how are arguments passed to the printf() function?
I expected words like "Value of x: " to appear in the assembly code, but it seems like the compiler simply knows what to print.

Why (or why not) pass Neon intrinsics datatypes as inputs/outputs functions parameters?

This is a small test I built. Here we have two scenarios:
Scenario 1: Two functions (scenario1a and scenario1b) whose inputs and outputs are uint16_t*, and which themselves load into / store from a Neon datatype (uint16x8x4_t).
Scenario 2: The same functions as in Scenario 1 (here scenario2a and scenario2b), but the inputs and outputs are uint16x8x4_t*, and the load and store are done in the main function.
(Below the c code I include the disassembly generated after compiling with -O3).
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);
// Scenario 1a: takes plain uint16_t pointers and performs the Neon load and
// store itself.
//   resultArray: destination, written with four uint16x8_t vectors
//   X:           source, read as four uint16x8_t vectors
// result.val[k] = vextq_u16(temp.val[k], temp.val[(k + 1) % 4] * -1, 2).
void scenario1a(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X); // single multi-vector load into a local
result.val[0] = vextq_u16(temp.val[0], vmulq_n_u16(temp.val[1], -1), 2);
result.val[1] = vextq_u16(temp.val[1], vmulq_n_u16(temp.val[2], -1), 2);
result.val[2] = vextq_u16(temp.val[2], vmulq_n_u16(temp.val[3], -1), 2);
result.val[3] = vextq_u16(temp.val[3], vmulq_n_u16(temp.val[0], -1), 2); // wraps around to val[0]
vst1q_u16_x4(resultArray, result); // single multi-vector store from a local
}
// Scenario 1b: takes plain uint16_t pointers and performs the Neon load and
// store itself.
//   resultArray: destination, written with four uint16x8_t vectors
//   X:           source, read as four uint16x8_t vectors
// Even result vectors are pairwise sums; odd result vectors are the odd
// input vectors multiplied by -2.
void scenario1b(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X); // single multi-vector load into a local
result.val[0] = vaddq_u16(temp.val[0], temp.val[1]);
result.val[1] = vmulq_n_u16(temp.val[1], -2);
result.val[2] = vaddq_u16(temp.val[2], temp.val[3]);
result.val[3] = vmulq_n_u16(temp.val[3], -2);
vst1q_u16_x4(resultArray, result); // single multi-vector store from a local
}
// Scenario 2a: same arithmetic as scenario1a, but reads and writes the
// vectors through uint16x8x4_t pointers instead of loading into locals.
//   result: destination structure, each val[k] written in turn
//   temp:   source structure, read through the pointer in every statement
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
// Reads temp->val[0] again after result->val[0] has already been written.
result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}
// Scenario 2b: same arithmetic as scenario1b, but reads and writes the
// vectors through uint16x8x4_t pointers instead of loading into locals.
//   result: destination structure
//   temp:   source structure
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vaddq_u16(temp->val[0], temp->val[1]);
result->val[1] = vmulq_n_u16(temp->val[1], -2);
result->val[2] = vaddq_u16(temp->val[2], temp->val[3]);
result->val[3] = vmulq_n_u16(temp->val[3], -2);
}
// Driver: pushes the same 32-element input through both scenarios
// (stage "a" followed by stage "b" in each case).
int main(void) {
uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};
// Scenario 01: Input and output are uint16_t*
uint16_t result01a[32];
uint16_t result01_final[32];
scenario1a(result01a, input);
scenario1b(result01_final, result01a); // stage b consumes stage a's output
// Scenario 02: Input and output are uint16x8x4_t
uint16_t result02_final[32];
uint16x8x4_t temp, result02a, result02b;
temp = vld1q_u16_x4(input); // the load is done here, by the caller
scenario2a(&result02a, &temp);
scenario2b(&result02b, &result02a);
vst1q_u16_x4(result02_final, result02b); // the store is done here, by the caller
// Neither result01_final nor result02_final is read after being written.
return 0;
}
Disassembly:
test: file format elf64-littleaarch64
Disassembly of section .init:
0000000000000658 <_init>:
658: a9bf7bfd stp x29, x30, [sp, #-16]!
65c: 910003fd mov x29, sp
660: 94000065 bl 7f4 <call_weak_fn>
664: a8c17bfd ldp x29, x30, [sp], #16
668: d65f03c0 ret
Disassembly of section .plt:
0000000000000670 <.plt>:
670: a9bf7bf0 stp x16, x30, [sp, #-16]!
674: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
678: f947c611 ldr x17, [x16, #3976]
67c: 913e2210 add x16, x16, #0xf88
680: d61f0220 br x17
684: d503201f nop
688: d503201f nop
68c: d503201f nop
0000000000000690 <__cxa_finalize#plt>:
690: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
694: f947ca11 ldr x17, [x16, #3984]
698: 913e4210 add x16, x16, #0xf90
69c: d61f0220 br x17
00000000000006a0 <__libc_start_main#plt>:
6a0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6a4: f947ce11 ldr x17, [x16, #3992]
6a8: 913e6210 add x16, x16, #0xf98
6ac: d61f0220 br x17
00000000000006b0 <__stack_chk_fail#plt>:
6b0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6b4: f947d211 ldr x17, [x16, #4000]
6b8: 913e8210 add x16, x16, #0xfa0
6bc: d61f0220 br x17
00000000000006c0 <__gmon_start__#plt>:
6c0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6c4: f947d611 ldr x17, [x16, #4008]
6c8: 913ea210 add x16, x16, #0xfa8
6cc: d61f0220 br x17
00000000000006d0 <abort#plt>:
6d0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6d4: f947da11 ldr x17, [x16, #4016]
6d8: 913ec210 add x16, x16, #0xfb0
6dc: d61f0220 br x17
Disassembly of section .text:
00000000000006e0 <main>:
6e0: 90000085 adrp x5, 10000 <__FRAME_END__+0xf3d8>
6e4: a9a67bfd stp x29, x30, [sp, #-416]!
6e8: 910003fd mov x29, sp
6ec: 90000002 adrp x2, 0 <_init-0x658>
6f0: 91292042 add x2, x2, #0xa48
6f4: 910263e3 add x3, sp, #0x98
6f8: 910363e0 add x0, sp, #0xd8
6fc: 6f008434 mvni v20.8h, #0x1
700: f947f0a5 ldr x5, [x5, #4064]
704: aa0303e1 mov x1, x3
708: 910143e4 add x4, sp, #0x50
70c: a940344c ldp x12, x13, [x2]
710: a9412c4a ldp x10, x11, [x2, #16]
714: f94000a6 ldr x6, [x5]
718: f900cfe6 str x6, [sp, #408]
71c: d2800006 mov x6, #0x0 // #0
720: a9422448 ldp x8, x9, [x2, #32]
724: a9431c46 ldp x6, x7, [x2, #48]
728: 910463e2 add x2, sp, #0x118
72c: a909b7ec stp x12, x13, [sp, #152]
730: a90aafea stp x10, x11, [sp, #168]
734: a90ba7e8 stp x8, x9, [sp, #184]
738: a90c9fe6 stp x6, x7, [sp, #200]
73c: 94000069 bl 8e0 <scenario1a>
740: 4c402400 ld1 {v0.8h-v3.8h}, [x0]
744: 910043e1 add x1, sp, #0x10
748: aa0403e0 mov x0, x4
74c: 4c402470 ld1 {v16.8h-v19.8h}, [x3]
750: 4e619e85 mul v5.8h, v20.8h, v1.8h
754: 4e608424 add v4.8h, v1.8h, v0.8h
758: 4e628466 add v6.8h, v3.8h, v2.8h
75c: 4e639e87 mul v7.8h, v20.8h, v3.8h
760: 4c002030 st1 {v16.16b-v19.16b}, [x1]
764: 4c002444 st1 {v4.8h-v7.8h}, [x2]
768: 94000072 bl 930 <scenario2a>
76c: ad409885 ldp q5, q6, [x4, #16]
770: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
774: 910563e2 add x2, sp, #0x158
778: 3dc00c84 ldr q4, [x4, #48]
77c: 3dc017e7 ldr q7, [sp, #80]
780: f947f021 ldr x1, [x1, #4064]
784: 4e749c83 mul v3.8h, v4.8h, v20.8h
788: 4e668482 add v2.8h, v4.8h, v6.8h
78c: 4e749ca1 mul v1.8h, v5.8h, v20.8h
790: 4e6784a0 add v0.8h, v5.8h, v7.8h
794: 4c002440 st1 {v0.8h-v3.8h}, [x2]
798: f940cfe0 ldr x0, [sp, #408]
79c: f9400022 ldr x2, [x1]
7a0: eb020000 subs x0, x0, x2
7a4: d2800002 mov x2, #0x0 // #0
7a8: 54000081 b.ne 7b8 <main+0xd8> // b.any
7ac: 52800000 mov w0, #0x0 // #0
7b0: a8da7bfd ldp x29, x30, [sp], #416
7b4: d65f03c0 ret
7b8: 97ffffbe bl 6b0 <__stack_chk_fail#plt>
00000000000007bc <_start>:
7bc: d280001d mov x29, #0x0 // #0
7c0: d280001e mov x30, #0x0 // #0
7c4: aa0003e5 mov x5, x0
7c8: f94003e1 ldr x1, [sp]
7cc: 910023e2 add x2, sp, #0x8
7d0: 910003e6 mov x6, sp
7d4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7d8: f947f800 ldr x0, [x0, #4080]
7dc: 90000083 adrp x3, 10000 <__FRAME_END__+0xf3d8>
7e0: f947f463 ldr x3, [x3, #4072]
7e4: 90000084 adrp x4, 10000 <__FRAME_END__+0xf3d8>
7e8: f947e084 ldr x4, [x4, #4032]
7ec: 97ffffad bl 6a0 <__libc_start_main#plt>
7f0: 97ffffb8 bl 6d0 <abort#plt>
00000000000007f4 <call_weak_fn>:
7f4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7f8: f947ec00 ldr x0, [x0, #4056]
7fc: b4000040 cbz x0, 804 <call_weak_fn+0x10>
800: 17ffffb0 b 6c0 <__gmon_start__#plt>
804: d65f03c0 ret
808: d503201f nop
80c: d503201f nop
0000000000000810 <deregister_tm_clones>:
810: b0000080 adrp x0, 11000 <__data_start>
814: 91004000 add x0, x0, #0x10
818: b0000081 adrp x1, 11000 <__data_start>
81c: 91004021 add x1, x1, #0x10
820: eb00003f cmp x1, x0
824: 540000c0 b.eq 83c <deregister_tm_clones+0x2c> // b.none
828: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
82c: f947e421 ldr x1, [x1, #4040]
830: b4000061 cbz x1, 83c <deregister_tm_clones+0x2c>
834: aa0103f0 mov x16, x1
838: d61f0200 br x16
83c: d65f03c0 ret
0000000000000840 <register_tm_clones>:
840: b0000080 adrp x0, 11000 <__data_start>
844: 91004000 add x0, x0, #0x10
848: b0000081 adrp x1, 11000 <__data_start>
84c: 91004021 add x1, x1, #0x10
850: cb000021 sub x1, x1, x0
854: d37ffc22 lsr x2, x1, #63
858: 8b810c41 add x1, x2, x1, asr #3
85c: 9341fc21 asr x1, x1, #1
860: b40000c1 cbz x1, 878 <register_tm_clones+0x38>
864: 90000082 adrp x2, 10000 <__FRAME_END__+0xf3d8>
868: f947fc42 ldr x2, [x2, #4088]
86c: b4000062 cbz x2, 878 <register_tm_clones+0x38>
870: aa0203f0 mov x16, x2
874: d61f0200 br x16
878: d65f03c0 ret
87c: d503201f nop
0000000000000880 <__do_global_dtors_aux>:
880: a9be7bfd stp x29, x30, [sp, #-32]!
884: 910003fd mov x29, sp
888: f9000bf3 str x19, [sp, #16]
88c: b0000093 adrp x19, 11000 <__data_start>
890: 39404260 ldrb w0, [x19, #16]
894: 35000140 cbnz w0, 8bc <__do_global_dtors_aux+0x3c>
898: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
89c: f947e800 ldr x0, [x0, #4048]
8a0: b4000080 cbz x0, 8b0 <__do_global_dtors_aux+0x30>
8a4: b0000080 adrp x0, 11000 <__data_start>
8a8: f9400400 ldr x0, [x0, #8]
8ac: 97ffff79 bl 690 <__cxa_finalize#plt>
8b0: 97ffffd8 bl 810 <deregister_tm_clones>
8b4: 52800020 mov w0, #0x1 // #1
8b8: 39004260 strb w0, [x19, #16]
8bc: f9400bf3 ldr x19, [sp, #16]
8c0: a8c27bfd ldp x29, x30, [sp], #32
8c4: d65f03c0 ret
8c8: d503201f nop
8cc: d503201f nop
00000000000008d0 <frame_dummy>:
8d0: 17ffffdc b 840 <register_tm_clones>
8d4: d503201f nop
8d8: d503201f nop
8dc: d503201f nop
00000000000008e0 <scenario1a>:
8e0: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
8e4: 6e60b833 neg v19.8h, v1.8h
8e8: 6e60b852 neg v18.8h, v2.8h
8ec: 6e60b871 neg v17.8h, v3.8h
8f0: 6e60b810 neg v16.8h, v0.8h
8f4: 6e132004 ext v4.16b, v0.16b, v19.16b, #4
8f8: 6e122025 ext v5.16b, v1.16b, v18.16b, #4
8fc: 6e112046 ext v6.16b, v2.16b, v17.16b, #4
900: 6e102067 ext v7.16b, v3.16b, v16.16b, #4
904: 4c002404 st1 {v4.8h-v7.8h}, [x0]
908: d65f03c0 ret
90c: d503201f nop
0000000000000910 <scenario1b>:
910: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
914: 6f008430 mvni v16.8h, #0x1
918: 4e619e05 mul v5.8h, v16.8h, v1.8h
91c: 4e608424 add v4.8h, v1.8h, v0.8h
920: 4e628466 add v6.8h, v3.8h, v2.8h
924: 4e639e07 mul v7.8h, v16.8h, v3.8h
928: 4c002404 st1 {v4.8h-v7.8h}, [x0]
92c: d65f03c0 ret
0000000000000930 <scenario2a>:
930: ad400025 ldp q5, q0, [x1]
934: ad408423 ldp q3, q1, [x1, #16]
938: 3dc00c24 ldr q4, [x1, #48]
93c: 6e60b800 neg v0.8h, v0.8h
940: 4ea11c22 mov v2.16b, v1.16b
944: 6e60b821 neg v1.8h, v1.8h
948: 6e0020a5 ext v5.16b, v5.16b, v0.16b, #4
94c: 4ea41c80 mov v0.16b, v4.16b
950: 6e60b884 neg v4.8h, v4.8h
954: 6e012063 ext v3.16b, v3.16b, v1.16b, #4
958: 3d800005 str q5, [x0]
95c: 3dc00021 ldr q1, [x1]
960: 6e042042 ext v2.16b, v2.16b, v4.16b, #4
964: ad008803 stp q3, q2, [x0, #16]
968: 6e60b821 neg v1.8h, v1.8h
96c: 6e012000 ext v0.16b, v0.16b, v1.16b, #4
970: 3d800c00 str q0, [x0, #48]
974: d65f03c0 ret
978: d503201f nop
97c: d503201f nop
0000000000000980 <scenario2b>:
980: ad401022 ldp q2, q4, [x1]
984: 6f008420 mvni v0.8h, #0x1
988: ad410c21 ldp q1, q3, [x1, #32]
98c: 4e609c85 mul v5.8h, v4.8h, v0.8h
990: 4e648442 add v2.8h, v2.8h, v4.8h
994: 4e609c60 mul v0.8h, v3.8h, v0.8h
998: 4e638421 add v1.8h, v1.8h, v3.8h
99c: ad001402 stp q2, q5, [x0]
9a0: ad010001 stp q1, q0, [x0, #32]
9a4: d65f03c0 ret
00000000000009a8 <__libc_csu_init>:
9a8: a9bc7bfd stp x29, x30, [sp, #-64]!
9ac: 910003fd mov x29, sp
9b0: a90153f3 stp x19, x20, [sp, #16]
9b4: 90000094 adrp x20, 10000 <__FRAME_END__+0xf3d8>
9b8: 9135c294 add x20, x20, #0xd70
9bc: a9025bf5 stp x21, x22, [sp, #32]
9c0: 90000095 adrp x21, 10000 <__FRAME_END__+0xf3d8>
9c4: 9135a2b5 add x21, x21, #0xd68
9c8: cb150294 sub x20, x20, x21
9cc: 2a0003f6 mov w22, w0
9d0: a90363f7 stp x23, x24, [sp, #48]
9d4: aa0103f7 mov x23, x1
9d8: aa0203f8 mov x24, x2
9dc: 97ffff1f bl 658 <_init>
9e0: eb940fff cmp xzr, x20, asr #3
9e4: 54000160 b.eq a10 <__libc_csu_init+0x68> // b.none
9e8: 9343fe94 asr x20, x20, #3
9ec: d2800013 mov x19, #0x0 // #0
9f0: f8737aa3 ldr x3, [x21, x19, lsl #3]
9f4: aa1803e2 mov x2, x24
9f8: 91000673 add x19, x19, #0x1
9fc: aa1703e1 mov x1, x23
a00: 2a1603e0 mov w0, w22
a04: d63f0060 blr x3
a08: eb13029f cmp x20, x19
a0c: 54ffff21 b.ne 9f0 <__libc_csu_init+0x48> // b.any
a10: a94153f3 ldp x19, x20, [sp, #16]
a14: a9425bf5 ldp x21, x22, [sp, #32]
a18: a94363f7 ldp x23, x24, [sp, #48]
a1c: a8c47bfd ldp x29, x30, [sp], #64
a20: d65f03c0 ret
a24: d503201f nop
0000000000000a28 <__libc_csu_fini>:
a28: d65f03c0 ret
Disassembly of section .fini:
0000000000000a2c <_fini>:
a2c: a9bf7bfd stp x29, x30, [sp, #-16]!
a30: 910003fd mov x29, sp
a34: a8c17bfd ldp x29, x30, [sp], #16
a38: d65f03c0 ret
Questions
Normally people load the data from a pointer (using vld1q_u16_x4), operate on the Neon datatypes, and store back to another pointer (using vst1q_u16_x4); they don't use an approach like the one I used in Scenario 2 (passing the Neon datatypes as inputs/outputs). Is there a general reason for this?
I checked the disassembly of Scenario 1a (starts at line 8e0) vs. Scenario 2a (starts at line 930). It seems scenario 2a has more data movement. Will this happen in all scenarios? So is it faster to do what I described in question 1? If so, then why doesn't this happen in Scenario 1b vs. 2b (lines 910 and 980, respectively)?
In the main function, there are some add/mul after both Scenario1a and 2a (in lines 750,754,758,75c and 784,788,78c,790), but my main function has no multiplications nor additions. Why is this happening? (I'm just curious)
Thank you for all your help!
There is absolutely no reason to use pointers to Neon datatypes as parameters. Memory doesn't care about datatypes. Compilers are very conservative and bureaucratic; they simply have to be. It's like filing an application with the authorities: one wrong check mark and your application will land in the wrong hands, causing tremendous unnecessary trouble.
Short: Keep it simple. Don't try to impress compilers or reviewers in any way.
I told you last time to be explicit on memory load and store. You are computing directly from/to memory in scenario2. Never do this. Stick to load->compute->store. Local variables are your best friends. (__restrict directive might help)
Again, do not try to impress compilers or reviewers. Your scenario2 is just asking for trouble. A sheer disaster. The reviewer will raise a red flag immediately and keep an eye on you and all your code, if you are lucky and don't get fired on the spot.
You shouldn't put callees in the same file as the caller. More often than not, the caller will inline short non-static callees, which makes profiling harder.

How to record trace of assembly instructions and their corresponding timestamps of a C program on macOS?

I have the following C program:
// Repeatedly subtracts (number1 + number2) from sum until sum is no longer
// greater than -10000000., then prints sum.
int main() {
float number1, number2, sum=0.;
number1 = .5;
number2 = .3;
// The bound is a double literal, so the float sum is compared as a double.
while(sum > -10000000.)
sum -= number1 + number2;
printf("%f",sum);
return 0;
}
Its corresponding assembly is as follows:
; main() of the C program above, as emitted without optimization
; (Darwin arm64 assembly; ';' starts a comment in this syntax).
;
; Stack frame (16 bytes):
;   [sp, #12]  zeroed on entry, not read again in this listing
;   [sp, #8]   number1 = 0.5f  (bit pattern 0x3F000000 = 1056964608)
;   [sp, #4]   number2 = 0.3f  (bit pattern 0x3E99999A, built by mov + movk)
;   [sp]       sum, starts at 0
;
; lCPI0_0 is a constant-pool entry defined outside this listing; presumably it
; holds the double -10000000.0 used as the loop bound.
_main: ; @main
    .cfi_startproc
; %bb.0:
    sub sp, sp, #16 ; =16
    .cfi_def_cfa_offset 16
    str wzr, [sp, #12]
    str wzr, [sp]                   ; sum = 0
    mov w8, #1056964608
    str w8, [sp, #8]                ; number1 = 0.5f
    mov w8, #39322                  ; low half of 0x3E99999A
    movk w8, #16025, lsl #16        ; high half of 0x3E99999A
    str w8, [sp, #4]                ; number2 = 0.3f
LBB0_1: ; =>This Inner Loop Header: Depth=1
    ldr s0, [sp]                    ; reload sum from the stack every iteration
    fcvt d0, s0                     ; widen sum to double for the comparison
    adrp x8, lCPI0_0@PAGE           ; page address of the constant
    ldr d1, [x8, lCPI0_0@PAGEOFF]   ; load the constant from within that page
    fcmp d0, d1
    b.le LBB0_3                     ; leave the loop once sum <= constant
; %bb.2: ; in Loop: Header=BB0_1 Depth=1
    ldr s0, [sp, #8]
    ldr s1, [sp, #4]
    fadd s1, s0, s1                 ; s1 = number1 + number2
    ldr s0, [sp]
    fsub s0, s0, s1                 ; sum -= s1
    str s0, [sp]
    b LBB0_1
LBB0_3:
    mov w0, #0                      ; return 0
    add sp, sp, #16 ; =16
    ret
    .cfi_endproc
; -- End function
.subsections_via_symbols
I want to analyse the latency of each instruction, so I'm looking for ways to obtain a program counter trace.
Desired output is as follows:
0000000000 _main: ; #main
0000000001 .cfi_startproc
0000000002; %bb.0:
0000000003 sub sp, sp, #16 ; =16
0000000004 .cfi_def_cfa_offset 16
0000000005 str wzr, [sp, #12]
0000000006 str wzr, [sp]
0000000007 mov w8, #1056964608
0000000008 str w8, [sp, #8]
0000000009 mov w8, #39322
0000000010 movk w8, #16025, lsl #16
0000000011 str w8, [sp, #4]
...
where the first column is the timestamp, in pico-, nano- or microseconds.
Target system is macOS, compiler is llvm, debugger is lldb.
There is no way to precisely measure instruction timing at the granularity of a few cycles (at least not on this target architecture). Thus, you cannot measure the latency of one specific instruction unless it is a very slow one. The reason is that even the best instructions for measuring time are themselves fairly slow, and the processor can execute multiple instructions per cycle and out of order (not to mention that they are pipelined). This is especially true for the M1 processor you appear to be running on. On ARM, the way to measure time seems to be to read PMCCNTR, based on this post. You certainly need to take the superscalar out-of-order execution into account even with such an instruction, though. The delay of such an instruction depends on the target architecture, and AFAIK there is no official public information about the M1 on this topic (in fact, documentation on how the M1 executes instructions is pretty scarce so far).
An alternative solution is to simulate the execution of the code with LLVM-MCA, which performs a static analysis of the program in order to simulate the scheduling of the instructions on the target architecture. The static analysis has a big downside: the actual runtime behaviour of loops and conditional jumps is not considered.
Note that profiling non-optimized code is generally a huge waste of time, as it does not reflect the actual execution of the release version (which should be optimized). Once optimized, the code is likely bound by the dependency chain on sum. This is especially true on the M1 processor, which can execute a lot of instructions in parallel on the same (big/performance) core.

comprehending how "volatile" keyword and comparison work

If a variable is not declared with the volatile keyword, the compiler is likely to cache its value (for example in a register); if it is volatile, it must always be accessed from memory. The part I am wondering about is in the assembly.
// Spins for as long as `lock` is non-zero; nothing in this function ever
// changes `lock` after its initialization.
int main() {
/* volatile */ int lock = 999;
while (lock); // empty loop body: only the condition is evaluated
}
On x86-64-clang-3.0.0 compiler, its assembly code is following.
main: # #main
mov DWORD PTR [RSP - 4], 0
mov DWORD PTR [RSP - 8], 999
.LBB0_1: # =>This Inner Loop Header: Depth=1
cmp DWORD PTR [RSP - 8], 0
je .LBB0_3
jmp .LBB0_1
.LBB0_3:
mov EAX, DWORD PTR [RSP - 4]
ret
When the volatile keyword is uncommented, the output becomes the following.
main: # #main
mov DWORD PTR [RSP - 4], 0
mov DWORD PTR [RSP - 8], 999
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov EAX, DWORD PTR [RSP - 8]
cmp EAX, 0
je .LBB0_3
jmp .LBB0_1
.LBB0_3:
mov EAX, DWORD PTR [RSP - 4]
ret
The points I wonder about and don't understand:
cmp DWORD PTR [RSP - 8], 0 . <---
Why is the comparison done with 0 whilst DWORD PTR [RSP - 8] holds 999 within ?
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
It looks like you forgot to enable optimization. -O0 treats all variables (except register variables) pretty similarly to volatile for consistent debugging.
With optimization enabled, compilers can hoist non-volatile loads out of loops. while(locked); will compile similarly to source like
if (locked) {
while(1){}
}
Or since locked has a compile-time-constant initializer, the whole function should compile to jmp main (an infinite loop).
See MCU programming - C++ O2 optimization breaks while loop for more details.
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
Some compilers are worse at folding loads into memory operands for other instructions when you use volatile. I think that's why you're getting a separate mov load here; it's just a missed optimization.
(Although cmp [mem], imm might be less efficient. I forget if it can macro-fuse with a JCC or something. With a RIP-relative addressing mode it couldn't micro-fuse the load, but a register base is ok.)
cmp EAX, 0 is weird, I guess clang with optimization disabled doesn't look for test eax,eax as a peephole optimization for comparing against zero.
As @user3386109 commented, locked in a boolean context is equivalent to locked != 0 in C / C++.
The compiler doesn't know about caching, and this is not a caching thing: volatile tells the compiler that the value may change between accesses. So to functionally implement our code, it needs to perform the accesses we ask for, in the order we ask for them. It can't optimize them out.
// Case 1: non-volatile local, initialized to a non-zero constant and never
// modified inside the loop.
void fun1 ( void )
{
/* volatile */ int lock = 999;
while (lock) continue;
}
// Case 2: same as fun1, but the local is volatile, so each evaluation of the
// loop condition is a separate access to `lock`.
void fun2 ( void )
{
volatile int lock = 999;
while (lock) continue;
}
volatile int vlock;
int ulock;
// Case 3: loops on the file-scope volatile int `vlock`.
void fun3 ( void )
{
while(vlock) continue;
}
// Case 4: loops on the file-scope non-volatile int `ulock`; nothing in the
// loop body modifies it.
void fun4 ( void )
{
while(ulock) continue;
}
// Case 5: two back-to-back assignments to the volatile global `vlock`.
void fun5 ( void )
{
vlock=3;
vlock=4;
}
// Case 6: two back-to-back assignments to the non-volatile global `ulock`;
// only the second value is observable once the function returns.
void fun6 ( void )
{
ulock=3;
ulock=4;
}
I find it easier to see in arm... doesn't really matter.
Disassembly of section .text:
00001000 <fun1>:
1000: eafffffe b 1000 <fun1>
00001004 <fun2>:
1004: e59f3018 ldr r3, [pc, #24] ; 1024 <fun2+0x20>
1008: e24dd008 sub sp, sp, #8
100c: e58d3004 str r3, [sp, #4]
1010: e59d3004 ldr r3, [sp, #4]
1014: e3530000 cmp r3, #0
1018: 1afffffc bne 1010 <fun2+0xc>
101c: e28dd008 add sp, sp, #8
1020: e12fff1e bx lr
1024: 000003e7 andeq r0, r0, r7, ror #7
00001028 <fun3>:
1028: e59f200c ldr r2, [pc, #12] ; 103c <fun3+0x14>
102c: e5923000 ldr r3, [r2]
1030: e3530000 cmp r3, #0
1034: 012fff1e bxeq lr
1038: eafffffb b 102c <fun3+0x4>
103c: 00002000
00001040 <fun4>:
1040: e59f3014 ldr r3, [pc, #20] ; 105c <fun4+0x1c>
1044: e5933000 ldr r3, [r3]
1048: e3530000 cmp r3, #0
104c: 012fff1e bxeq lr
1050: e3530000 cmp r3, #0
1054: 012fff1e bxeq lr
1058: eafffffa b 1048 <fun4+0x8>
105c: 00002004
00001060 <fun5>:
1060: e3a01003 mov r1, #3
1064: e3a02004 mov r2, #4
1068: e59f3008 ldr r3, [pc, #8] ; 1078 <fun5+0x18>
106c: e5831000 str r1, [r3]
1070: e5832000 str r2, [r3]
1074: e12fff1e bx lr
1078: 00002000
0000107c <fun6>:
107c: e3a02004 mov r2, #4
1080: e59f3004 ldr r3, [pc, #4] ; 108c <fun6+0x10>
1084: e5832000 str r2, [r3]
1088: e12fff1e bx lr
108c: 00002004
Disassembly of section .bss:
00002000 <vlock>:
2000: 00000000
00002004 <ulock>:
2004: 00000000
First one is the most telling:
00001000 <fun1>:
1000: eafffffe b 1000 <fun1>
Being a local variable that is initialized and non-volatile, the compiler can assume it won't change value between accesses, so it can never change in the while loop; this is essentially a while(1) loop. If the initial value had been zero, this would be a simple return, as the variable could never become non-zero, being non-volatile.
In fun2 the variable is also local, so a stack frame needs to be built.
It does what one assumes the code was trying to do: wait for this shared variable, one that can change during the loop
1010: e59d3004 ldr r3, [sp, #4]
1014: e3530000 cmp r3, #0
1018: 1afffffc bne 1010 <fun2+0xc>
so it samples it and tests what it samples each time through the loop.
fun3 and fun4 are the same deal, but more realistic: code external to the function isn't going to change a local lock, so a non-global variable doesn't make much sense for your while loop.
102c: e5923000 ldr r3, [r2]
1030: e3530000 cmp r3, #0
1034: 012fff1e bxeq lr
1038: eafffffb b 102c <fun3+0x4>
For the volatile fun3 case the variable has to be read and tested each loop
1044: e5933000 ldr r3, [r3]
1048: e3530000 cmp r3, #0
104c: 012fff1e bxeq lr
1050: e3530000 cmp r3, #0
1054: 012fff1e bxeq lr
1058: eafffffa b 1048 <fun4+0x8>
For the non-volatile global, it only has to sample the variable once. What the compiler did here is very interesting (I'd have to think about why it does that), but either way you can see that the "loop" retests the value that was read into a register (not cached), which will never change in a proper program. Functionally, by using non-volatile we asked it to read the variable only once; it then tests that value indefinitely.
fun5 and fun6 further demonstrate that volatile requires the compiler perform the accesses to the variable in its storage place before moving on to the next operation/access in the code. So when volatile we are asking the compiler to perform two assignments, two stores. When non-volatile the compiler can optimize out the first store and only do the last one as if you look at the code as a whole this function (fun6) leaves the variable set to 4, so the function leaves the variable set to 4.
The x86 solution is equally interesting repz retq is all over it (with the compiler on my computer), not hard to find out what that is all about.
None of the aarch64, x86, mips, riscv, msp430 or pdp11 backends does the double check seen above in fun4().
pdp11 is actually the easier code to read (no surprise there)
00000000 <_fun1>:
0: 01ff br 0 <_fun1>
00000002 <_fun2>:
2: 65c6 fffe add $-2, sp
6: 15ce 03e7 mov $1747, (sp)
a: 1380 mov (sp), r0
c: 02fe bne a <_fun2+0x8>
e: 65c6 0002 add $2, sp
12: 0087 rts pc
00000014 <_fun3>:
14: 1dc0 0026 mov $3e <_vlock>, r0
18: 02fd bne 14 <_fun3>
1a: 0087 rts pc
0000001c <_fun4>:
1c: 1dc0 001c mov $3c <_ulock>, r0
20: 0bc0 tst r0
22: 02fe bne 20 <_fun4+0x4>
24: 0087 rts pc
00000026 <_fun5>:
26: 15f7 0003 0012 mov $3, $3e <_vlock>
2c: 15f7 0004 000c mov $4, $3e <_vlock>
32: 0087 rts pc
00000034 <_fun6>:
34: 15f7 0004 0002 mov $4, $3c <_ulock>
3a: 0087 rts pc
(this is the not linked version)
cmp DWORD PTR [RSP - 8], 0 . <--- Why is the comparison done with 0 whilst DWORD PTR [RSP - 8] holds 999 within ?
while performs a true/false test, meaning: is the value equal to zero or not equal to zero?
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
mov -0x8(%rsp),%eax
cmp 0,%eax
cmp 0,-0x8(%rsp)
as so.s -o so.o
so.s: Assembler messages:
so.s:3: Error: too many memory references for `cmp'
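As written, this fails because AT&T syntax needs a $ prefix on immediates: without it, the 0 is treated as a memory address, so cmp 0,-0x8(%rsp) ends up with two memory operands. x86 can compare an immediate against memory in a single instruction (cmpl $0,-0x8(%rsp)), which is exactly what the non-volatile listing in the question does. So the separate load into a register is not forced by the instruction set; it is just how this compiler emitted the volatile access with optimization disabled.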

Inline assembly instruction on ARM64 (iOS) fails

I'm trying out the new arm64 instructions on iOS and I'm having a peculiar issue. I hope someone can help me with this.
In particular this fails with 'Invalid operand for instruction'
// Attempts to emit a single SMADDL (Xd = Xa + Wn * Wm) via inline assembly.
void test()
{
register long long signed int r=0,c=0,d=0; // 64-bit operands; d is not used below
register signed int a=0,b=0,e=0,f=0; // 32-bit operands; e and f are not used below
// this fails (with 'Invalid operand for instruction')
// Operands: %0 = r (64-bit output), %1 = a and %2 = b (32-bit inputs),
// %3 = c (64-bit input). No register-width modifier is given on any of them.
asm volatile("smaddl %0, %1, %2, %3" : "=r"(r) : "r"(a), "r"(b), "r"(c));
};
I'm not sure what I'm doing wrong, to the best that I can tell, I'm following the instruction and syntax correctly. Here's how it is defined in the docs:
"SMADDL Xd, Wn, Wm, Xa
Signed Multiply-Add Long: Xd = Xa + (Wn × Wm), treating source operands as signed."
where X denotes a 64bit register and W denotes a 32 bit one.
Any help will be appreciated.
Thx
I was able to fix it by using the recommendation in this post:
asm volatile("smaddl %x0, %w1, %w2, %x3" : "=r"(r) : "r"(a), "r"(b), "r"(c));
This produces the following assembly:
_test: ; #test
; BB#0:
sub sp, sp, #48
movz w8, #0
movz x9, #0
stp x9, x9, [sp, #32]
str x9, [sp, #24]
stp w8, w8, [sp, #16]
stp w8, w8, [sp, #8]
ldp w10, w8, [sp, #16]
ldr x9, [sp, #32]
; InlineAsm Start
smaddl x9, w8, w10, x9
; InlineAsm End
str x9, [sp, #40]
add sp, sp, #48
ret lr
It seems you need to use 'w' to specifically mark 32-bit registers.
See also aarch64-inline-asm.c for a few more inline asm examples.

Resources