How are arguments passed to the printf() function? - c

I am trying to understand the assembly code for a simple program, shown below.
#include <stdio.h>

void f()
{
    int i, x = 0;
    for (i = 0; i < 10; i++)
        x++;
    printf("Value of x: %d\n", x);
}
and its corresponding assembly code on my machine is
00000000000007d4 <f>:
7d4: a9be7bfd stp x29, x30, [sp, #-32]!
7d8: 910003fd mov x29, sp
7dc: b9001fff str wzr, [sp, #28]
7e0: b9001bff str wzr, [sp, #24]
7e4: 14000007 b 800 <f+0x2c>
7e8: b9401fe0 ldr w0, [sp, #28]
7ec: 11000400 add w0, w0, #0x1
7f0: b9001fe0 str w0, [sp, #28]
7f4: b9401be0 ldr w0, [sp, #24]
7f8: 11000400 add w0, w0, #0x1
7fc: b9001be0 str w0, [sp, #24]
800: b9401be0 ldr w0, [sp, #24]
804: 7100241f cmp w0, #0x9
808: 54ffff0d b.le 7e8 <f+0x14>
80c: b9401fe1 ldr w1, [sp, #28]
810: 90000000 adrp x0, 0 <__abi_tag-0x278>
814: 9121c000 add x0, x0, #0x870
818: 97ffff9a bl 680 <printf@plt>
81c: d503201f nop
820: a8c27bfd ldp x29, x30, [sp], #32
824: d65f03c0 ret
I understand the loop, but lines 814 - 818 are really confusing to me. What is the purpose of adding #0x870 to x0? What does line 818 mean? And how are arguments passed to the printf() function?
I expected words like "Value of x: " to appear in the assembly code, but it seems like the compiler simply knows what to print.
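For what it's worth, here is a minimal C sketch of the mechanics under AAPCS64 (the AArch64 calling convention); everything except the original printf call is illustrative. The string literal lives in the read-only data section rather than in the instructions: ldr w1 at 80c loads x, adrp + add at 810/814 build the literal's address in x0 (0x870 is just the literal's offset from the page base adrp computed), and bl at 818 calls printf through the PLT.

#include <stdio.h>

int main(void)
{
    /* What f's 80c-818 effectively do:
       ldr  w1, [sp, #28]                   -> w1 = x        (arg 2)
       adrp x0, <page>; add x0, x0, #0x870  -> x0 = &literal (arg 1)
       bl   printf@plt                      -> call printf             */
    const char *fmt = "Value of x: %d\n";
    int x = 10;
    printf(fmt, x);
    return 0;
}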

Related

Why (or why not) pass Neon intrinsics datatypes as input/output function parameters?

This is a small test I built. Here we have two scenarios:
Scenario 1: Two functions (scenario1a and scenario1b) whose inputs and outputs are uint16_t*, with the loads/stores to/from the Neon datatype (uint16x8x4_t) done inside the functions.
Scenario 2: The same functions as Scenario 1 (in this case scenario2a and scenario2b), but the inputs and outputs are uint16x8x4_t*, and the load and store are done in the main function.
(Below the C code I include the disassembly generated after compiling with -O3.)
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>

void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);

void scenario1a(uint16_t* resultArray, const uint16_t* X) {
    uint16x8x4_t temp, result;
    temp = vld1q_u16_x4(X);
    result.val[0] = vextq_u16(temp.val[0], vmulq_n_u16(temp.val[1], -1), 2);
    result.val[1] = vextq_u16(temp.val[1], vmulq_n_u16(temp.val[2], -1), 2);
    result.val[2] = vextq_u16(temp.val[2], vmulq_n_u16(temp.val[3], -1), 2);
    result.val[3] = vextq_u16(temp.val[3], vmulq_n_u16(temp.val[0], -1), 2);
    vst1q_u16_x4(resultArray, result);
}

void scenario1b(uint16_t* resultArray, const uint16_t* X) {
    uint16x8x4_t temp, result;
    temp = vld1q_u16_x4(X);
    result.val[0] = vaddq_u16(temp.val[0], temp.val[1]);
    result.val[1] = vmulq_n_u16(temp.val[1], -2);
    result.val[2] = vaddq_u16(temp.val[2], temp.val[3]);
    result.val[3] = vmulq_n_u16(temp.val[3], -2);
    vst1q_u16_x4(resultArray, result);
}

void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {
    result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
    result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
    result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
    result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}

void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {
    result->val[0] = vaddq_u16(temp->val[0], temp->val[1]);
    result->val[1] = vmulq_n_u16(temp->val[1], -2);
    result->val[2] = vaddq_u16(temp->val[2], temp->val[3]);
    result->val[3] = vmulq_n_u16(temp->val[3], -2);
}

int main(void) {
    uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};

    // Scenario 01: Input and output are uint16_t*
    uint16_t result01a[32];
    uint16_t result01_final[32];
    scenario1a(result01a, input);
    scenario1b(result01_final, result01a);

    // Scenario 02: Input and output are uint16x8x4_t
    uint16_t result02_final[32];
    uint16x8x4_t temp, result02a, result02b;
    temp = vld1q_u16_x4(input);
    scenario2a(&result02a, &temp);
    scenario2b(&result02b, &result02a);
    vst1q_u16_x4(result02_final, result02b);
    return 0;
}
Disassembly:
test: file format elf64-littleaarch64
Disassembly of section .init:
0000000000000658 <_init>:
658: a9bf7bfd stp x29, x30, [sp, #-16]!
65c: 910003fd mov x29, sp
660: 94000065 bl 7f4 <call_weak_fn>
664: a8c17bfd ldp x29, x30, [sp], #16
668: d65f03c0 ret
Disassembly of section .plt:
0000000000000670 <.plt>:
670: a9bf7bf0 stp x16, x30, [sp, #-16]!
674: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
678: f947c611 ldr x17, [x16, #3976]
67c: 913e2210 add x16, x16, #0xf88
680: d61f0220 br x17
684: d503201f nop
688: d503201f nop
68c: d503201f nop
0000000000000690 <__cxa_finalize@plt>:
690: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
694: f947ca11 ldr x17, [x16, #3984]
698: 913e4210 add x16, x16, #0xf90
69c: d61f0220 br x17
00000000000006a0 <__libc_start_main@plt>:
6a0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6a4: f947ce11 ldr x17, [x16, #3992]
6a8: 913e6210 add x16, x16, #0xf98
6ac: d61f0220 br x17
00000000000006b0 <__stack_chk_fail@plt>:
6b0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6b4: f947d211 ldr x17, [x16, #4000]
6b8: 913e8210 add x16, x16, #0xfa0
6bc: d61f0220 br x17
00000000000006c0 <__gmon_start__@plt>:
6c0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6c4: f947d611 ldr x17, [x16, #4008]
6c8: 913ea210 add x16, x16, #0xfa8
6cc: d61f0220 br x17
00000000000006d0 <abort@plt>:
6d0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6d4: f947da11 ldr x17, [x16, #4016]
6d8: 913ec210 add x16, x16, #0xfb0
6dc: d61f0220 br x17
Disassembly of section .text:
00000000000006e0 <main>:
6e0: 90000085 adrp x5, 10000 <__FRAME_END__+0xf3d8>
6e4: a9a67bfd stp x29, x30, [sp, #-416]!
6e8: 910003fd mov x29, sp
6ec: 90000002 adrp x2, 0 <_init-0x658>
6f0: 91292042 add x2, x2, #0xa48
6f4: 910263e3 add x3, sp, #0x98
6f8: 910363e0 add x0, sp, #0xd8
6fc: 6f008434 mvni v20.8h, #0x1
700: f947f0a5 ldr x5, [x5, #4064]
704: aa0303e1 mov x1, x3
708: 910143e4 add x4, sp, #0x50
70c: a940344c ldp x12, x13, [x2]
710: a9412c4a ldp x10, x11, [x2, #16]
714: f94000a6 ldr x6, [x5]
718: f900cfe6 str x6, [sp, #408]
71c: d2800006 mov x6, #0x0 // #0
720: a9422448 ldp x8, x9, [x2, #32]
724: a9431c46 ldp x6, x7, [x2, #48]
728: 910463e2 add x2, sp, #0x118
72c: a909b7ec stp x12, x13, [sp, #152]
730: a90aafea stp x10, x11, [sp, #168]
734: a90ba7e8 stp x8, x9, [sp, #184]
738: a90c9fe6 stp x6, x7, [sp, #200]
73c: 94000069 bl 8e0 <scenario1a>
740: 4c402400 ld1 {v0.8h-v3.8h}, [x0]
744: 910043e1 add x1, sp, #0x10
748: aa0403e0 mov x0, x4
74c: 4c402470 ld1 {v16.8h-v19.8h}, [x3]
750: 4e619e85 mul v5.8h, v20.8h, v1.8h
754: 4e608424 add v4.8h, v1.8h, v0.8h
758: 4e628466 add v6.8h, v3.8h, v2.8h
75c: 4e639e87 mul v7.8h, v20.8h, v3.8h
760: 4c002030 st1 {v16.16b-v19.16b}, [x1]
764: 4c002444 st1 {v4.8h-v7.8h}, [x2]
768: 94000072 bl 930 <scenario2a>
76c: ad409885 ldp q5, q6, [x4, #16]
770: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
774: 910563e2 add x2, sp, #0x158
778: 3dc00c84 ldr q4, [x4, #48]
77c: 3dc017e7 ldr q7, [sp, #80]
780: f947f021 ldr x1, [x1, #4064]
784: 4e749c83 mul v3.8h, v4.8h, v20.8h
788: 4e668482 add v2.8h, v4.8h, v6.8h
78c: 4e749ca1 mul v1.8h, v5.8h, v20.8h
790: 4e6784a0 add v0.8h, v5.8h, v7.8h
794: 4c002440 st1 {v0.8h-v3.8h}, [x2]
798: f940cfe0 ldr x0, [sp, #408]
79c: f9400022 ldr x2, [x1]
7a0: eb020000 subs x0, x0, x2
7a4: d2800002 mov x2, #0x0 // #0
7a8: 54000081 b.ne 7b8 <main+0xd8> // b.any
7ac: 52800000 mov w0, #0x0 // #0
7b0: a8da7bfd ldp x29, x30, [sp], #416
7b4: d65f03c0 ret
7b8: 97ffffbe bl 6b0 <__stack_chk_fail@plt>
00000000000007bc <_start>:
7bc: d280001d mov x29, #0x0 // #0
7c0: d280001e mov x30, #0x0 // #0
7c4: aa0003e5 mov x5, x0
7c8: f94003e1 ldr x1, [sp]
7cc: 910023e2 add x2, sp, #0x8
7d0: 910003e6 mov x6, sp
7d4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7d8: f947f800 ldr x0, [x0, #4080]
7dc: 90000083 adrp x3, 10000 <__FRAME_END__+0xf3d8>
7e0: f947f463 ldr x3, [x3, #4072]
7e4: 90000084 adrp x4, 10000 <__FRAME_END__+0xf3d8>
7e8: f947e084 ldr x4, [x4, #4032]
7ec: 97ffffad bl 6a0 <__libc_start_main@plt>
7f0: 97ffffb8 bl 6d0 <abort@plt>
00000000000007f4 <call_weak_fn>:
7f4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7f8: f947ec00 ldr x0, [x0, #4056]
7fc: b4000040 cbz x0, 804 <call_weak_fn+0x10>
800: 17ffffb0 b 6c0 <__gmon_start__@plt>
804: d65f03c0 ret
808: d503201f nop
80c: d503201f nop
0000000000000810 <deregister_tm_clones>:
810: b0000080 adrp x0, 11000 <__data_start>
814: 91004000 add x0, x0, #0x10
818: b0000081 adrp x1, 11000 <__data_start>
81c: 91004021 add x1, x1, #0x10
820: eb00003f cmp x1, x0
824: 540000c0 b.eq 83c <deregister_tm_clones+0x2c> // b.none
828: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
82c: f947e421 ldr x1, [x1, #4040]
830: b4000061 cbz x1, 83c <deregister_tm_clones+0x2c>
834: aa0103f0 mov x16, x1
838: d61f0200 br x16
83c: d65f03c0 ret
0000000000000840 <register_tm_clones>:
840: b0000080 adrp x0, 11000 <__data_start>
844: 91004000 add x0, x0, #0x10
848: b0000081 adrp x1, 11000 <__data_start>
84c: 91004021 add x1, x1, #0x10
850: cb000021 sub x1, x1, x0
854: d37ffc22 lsr x2, x1, #63
858: 8b810c41 add x1, x2, x1, asr #3
85c: 9341fc21 asr x1, x1, #1
860: b40000c1 cbz x1, 878 <register_tm_clones+0x38>
864: 90000082 adrp x2, 10000 <__FRAME_END__+0xf3d8>
868: f947fc42 ldr x2, [x2, #4088]
86c: b4000062 cbz x2, 878 <register_tm_clones+0x38>
870: aa0203f0 mov x16, x2
874: d61f0200 br x16
878: d65f03c0 ret
87c: d503201f nop
0000000000000880 <__do_global_dtors_aux>:
880: a9be7bfd stp x29, x30, [sp, #-32]!
884: 910003fd mov x29, sp
888: f9000bf3 str x19, [sp, #16]
88c: b0000093 adrp x19, 11000 <__data_start>
890: 39404260 ldrb w0, [x19, #16]
894: 35000140 cbnz w0, 8bc <__do_global_dtors_aux+0x3c>
898: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
89c: f947e800 ldr x0, [x0, #4048]
8a0: b4000080 cbz x0, 8b0 <__do_global_dtors_aux+0x30>
8a4: b0000080 adrp x0, 11000 <__data_start>
8a8: f9400400 ldr x0, [x0, #8]
8ac: 97ffff79 bl 690 <__cxa_finalize@plt>
8b0: 97ffffd8 bl 810 <deregister_tm_clones>
8b4: 52800020 mov w0, #0x1 // #1
8b8: 39004260 strb w0, [x19, #16]
8bc: f9400bf3 ldr x19, [sp, #16]
8c0: a8c27bfd ldp x29, x30, [sp], #32
8c4: d65f03c0 ret
8c8: d503201f nop
8cc: d503201f nop
00000000000008d0 <frame_dummy>:
8d0: 17ffffdc b 840 <register_tm_clones>
8d4: d503201f nop
8d8: d503201f nop
8dc: d503201f nop
00000000000008e0 <scenario1a>:
8e0: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
8e4: 6e60b833 neg v19.8h, v1.8h
8e8: 6e60b852 neg v18.8h, v2.8h
8ec: 6e60b871 neg v17.8h, v3.8h
8f0: 6e60b810 neg v16.8h, v0.8h
8f4: 6e132004 ext v4.16b, v0.16b, v19.16b, #4
8f8: 6e122025 ext v5.16b, v1.16b, v18.16b, #4
8fc: 6e112046 ext v6.16b, v2.16b, v17.16b, #4
900: 6e102067 ext v7.16b, v3.16b, v16.16b, #4
904: 4c002404 st1 {v4.8h-v7.8h}, [x0]
908: d65f03c0 ret
90c: d503201f nop
0000000000000910 <scenario1b>:
910: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
914: 6f008430 mvni v16.8h, #0x1
918: 4e619e05 mul v5.8h, v16.8h, v1.8h
91c: 4e608424 add v4.8h, v1.8h, v0.8h
920: 4e628466 add v6.8h, v3.8h, v2.8h
924: 4e639e07 mul v7.8h, v16.8h, v3.8h
928: 4c002404 st1 {v4.8h-v7.8h}, [x0]
92c: d65f03c0 ret
0000000000000930 <scenario2a>:
930: ad400025 ldp q5, q0, [x1]
934: ad408423 ldp q3, q1, [x1, #16]
938: 3dc00c24 ldr q4, [x1, #48]
93c: 6e60b800 neg v0.8h, v0.8h
940: 4ea11c22 mov v2.16b, v1.16b
944: 6e60b821 neg v1.8h, v1.8h
948: 6e0020a5 ext v5.16b, v5.16b, v0.16b, #4
94c: 4ea41c80 mov v0.16b, v4.16b
950: 6e60b884 neg v4.8h, v4.8h
954: 6e012063 ext v3.16b, v3.16b, v1.16b, #4
958: 3d800005 str q5, [x0]
95c: 3dc00021 ldr q1, [x1]
960: 6e042042 ext v2.16b, v2.16b, v4.16b, #4
964: ad008803 stp q3, q2, [x0, #16]
968: 6e60b821 neg v1.8h, v1.8h
96c: 6e012000 ext v0.16b, v0.16b, v1.16b, #4
970: 3d800c00 str q0, [x0, #48]
974: d65f03c0 ret
978: d503201f nop
97c: d503201f nop
0000000000000980 <scenario2b>:
980: ad401022 ldp q2, q4, [x1]
984: 6f008420 mvni v0.8h, #0x1
988: ad410c21 ldp q1, q3, [x1, #32]
98c: 4e609c85 mul v5.8h, v4.8h, v0.8h
990: 4e648442 add v2.8h, v2.8h, v4.8h
994: 4e609c60 mul v0.8h, v3.8h, v0.8h
998: 4e638421 add v1.8h, v1.8h, v3.8h
99c: ad001402 stp q2, q5, [x0]
9a0: ad010001 stp q1, q0, [x0, #32]
9a4: d65f03c0 ret
00000000000009a8 <__libc_csu_init>:
9a8: a9bc7bfd stp x29, x30, [sp, #-64]!
9ac: 910003fd mov x29, sp
9b0: a90153f3 stp x19, x20, [sp, #16]
9b4: 90000094 adrp x20, 10000 <__FRAME_END__+0xf3d8>
9b8: 9135c294 add x20, x20, #0xd70
9bc: a9025bf5 stp x21, x22, [sp, #32]
9c0: 90000095 adrp x21, 10000 <__FRAME_END__+0xf3d8>
9c4: 9135a2b5 add x21, x21, #0xd68
9c8: cb150294 sub x20, x20, x21
9cc: 2a0003f6 mov w22, w0
9d0: a90363f7 stp x23, x24, [sp, #48]
9d4: aa0103f7 mov x23, x1
9d8: aa0203f8 mov x24, x2
9dc: 97ffff1f bl 658 <_init>
9e0: eb940fff cmp xzr, x20, asr #3
9e4: 54000160 b.eq a10 <__libc_csu_init+0x68> // b.none
9e8: 9343fe94 asr x20, x20, #3
9ec: d2800013 mov x19, #0x0 // #0
9f0: f8737aa3 ldr x3, [x21, x19, lsl #3]
9f4: aa1803e2 mov x2, x24
9f8: 91000673 add x19, x19, #0x1
9fc: aa1703e1 mov x1, x23
a00: 2a1603e0 mov w0, w22
a04: d63f0060 blr x3
a08: eb13029f cmp x20, x19
a0c: 54ffff21 b.ne 9f0 <__libc_csu_init+0x48> // b.any
a10: a94153f3 ldp x19, x20, [sp, #16]
a14: a9425bf5 ldp x21, x22, [sp, #32]
a18: a94363f7 ldp x23, x24, [sp, #48]
a1c: a8c47bfd ldp x29, x30, [sp], #64
a20: d65f03c0 ret
a24: d503201f nop
0000000000000a28 <__libc_csu_fini>:
a28: d65f03c0 ret
Disassembly of section .fini:
0000000000000a2c <_fini>:
a2c: a9bf7bfd stp x29, x30, [sp, #-16]!
a30: 910003fd mov x29, sp
a34: a8c17bfd ldp x29, x30, [sp], #16
a38: d65f03c0 ret
Questions
1. Normally people load the data from a pointer (using vld1q_u16_x4), operate on the Neon datatypes, and store back to another pointer (using vst1q_u16_x4), rather than using an approach like the one I used in Scenario 2 (passing the Neon datatypes as inputs/outputs). Is there a general reason for this?
2. I checked the disassembly of Scenario 1a (starts at line 8e0) vs. Scenario 2a (starts at line 930). It seems Scenario 2a has more data movement. Will this happen in all cases? If so, is it faster to do what I described in question 1? And then why doesn't this happen in Scenario 1b vs. 2b (lines 910 and 980, respectively)?
3. In the main function, there are some add/mul instructions after both Scenario 1a and 2a (at lines 750, 754, 758, 75c and 784, 788, 78c, 790), but my main function has no multiplications or additions. Why is this happening? (I'm just curious.)
Thank you for all your help!
There is absolutely no reason to use pointers to Neon datatypes as parameters. Memory doesn't care about datatypes. Compilers are very conservative and bureaucratic; they simply have to be. It's like filing an application with the authorities: one wrong check mark and your application lands in the wrong hands, causing tremendous unnecessary trouble.
In short: keep it simple. Don't try to impress compilers or reviewers in any way.
I told you last time to be explicit about memory loads and stores. You are computing directly from/to memory in scenario2. Never do this. Stick to load -> compute -> store, as in the sketch below. Local variables are your best friends. (The __restrict qualifier might help.)
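A minimal sketch of that pattern, assuming 32-element uint16_t buffers (the function name and the arithmetic are made up for illustration):

#include <arm_neon.h>
#include <stdint.h>

/* load -> compute on locals -> store; __restrict promises the buffers
   don't alias, so the compiler can keep everything in registers
   between the explicit load and store. */
void load_compute_store(uint16_t *__restrict out,
                        const uint16_t *__restrict in)
{
    uint16x8x4_t v = vld1q_u16_x4(in);         /* explicit load  */
    uint16x8x4_t r;                            /* local result   */
    r.val[0] = vaddq_u16(v.val[0], v.val[1]);  /* compute        */
    r.val[1] = vaddq_u16(v.val[1], v.val[2]);
    r.val[2] = vaddq_u16(v.val[2], v.val[3]);
    r.val[3] = vaddq_u16(v.val[3], v.val[0]);
    vst1q_u16_x4(out, r);                      /* explicit store */
}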
Again, do not try to impress compilers or reviewers. Your scenario2 is just asking for trouble, a sheer disaster. The reviewer will raise a red flag immediately and keep an eye on you and all your code, if you are lucky and didn't get fired on the spot.
You shouldn't put callees in the same file as the caller. More often than not, the caller will inline short non-static callees, which makes profiling harder. (If splitting files isn't convenient, see the sketch below.)
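A hedged workaround, using the GCC/Clang noinline attribute (the functions here are hypothetical): it keeps a callee out of its caller even within one translation unit, so it still shows up in profiles.

#include <stdint.h>

/* Forbid inlining so the callee remains a distinct symbol. */
__attribute__((noinline))
static uint16_t add_one(uint16_t x)
{
    return (uint16_t)(x + 1);
}

uint16_t caller(uint16_t x)
{
    return add_one(x);   /* stays a real call, even at -O3 */
}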

Why are w8,w9 used instead of w1,w2?

I am studying ARMv8.
When the following C code is converted to assembly with clang, w0 seems to be used for the return value, and w8 and w9 are used to hold the variable values.
It is said that ARM has W registers w0 to w30, so why are w8 and w9 used instead of w1 and w2?
int main() {
    int a = 3;
    int b = 5;
    int c = a + b;
    return c;
}
main: // @main
sub sp, sp, #16 // =16
str wzr, [sp, #12]
mov w8, #3
str w8, [sp, #8]
mov w8, #5
str w8, [sp, #4]
ldr w8, [sp, #8]
ldr w9, [sp, #4]
add w8, w8, w9
str w8, [sp]
ldr w0, [sp]
add sp, sp, #16 // =16
ret
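For context, a hedged illustration of the calling convention at play (AAPCS64): w0-w7 carry integer arguments and the return value, while w8 and up are caller-saved scratch registers, so an unoptimized compiler can use them for temporaries without disturbing arguments. The function below is hypothetical, not from the question:

/* a arrives in w0 and b in w1; at -O0, clang keeps temporaries in
   scratch registers starting at w8 and only moves the result into w0
   at the end. At -O2 the whole body becomes: add w0, w0, w1; ret */
int add(int a, int b)
{
    return a + b;
}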

Aarch64 Frame pointer

I saw this code on page 120 of the ARMv8 Programmer's Guide:
foo//
SUB SP, SP, #0x30
STR W0, [SP, #0x2C]
STR W1, [SP, #0x28]
STR D0, [SP, #0x20]
STR D1, [SP, #0x18]
LDR W0, [SP, #0x2C]
STR W0, [SP, #0]
LDR W0, [SP, #0x28]
STR W0, [SP, #4]
LDR W0, [SP, #0x20]
STR W0, [SP, #8]
LDR W0, [SP, #0x18]
STR W0, [SP, #10]
LDR X9, [SP, #0x0]
STR X9, [X8, #0]
LDR X9, [SP, #8]
STR X9, [X8, #8]
LDR X9, [SP, #0x10]
STR X9, [X8, #0x10]
ADD SP, SP, #0x30
RET
bar//
STP X29, X30, [SP, #0x10]!
MOV X29, SP
SUB SP, SP, #0x20
ADD X8, SP, #8
MOV W0, WZR
ORR W1, WZR, #1
FMOV D0, #1.00000000
FMOV D1, #2.00000000
BL foo:
ADRP X8, {PC}, 0x78
ADD X8, X8, #0
LDR X9, [SP, #8]
STR X9, [X8, #0]
LDR X9, [SP, #0x10]
STR X9, [X8, #8]
LDR X9, [SP, #0x18]
STR X9, [X8, #0x10]
MOV SP, X29
LDP X20, X30, [SP], #0x10
RET
My question is regarding the following instruction at the end of the bar routine:
LDP X20, X30, [SP], #0x10
LDP loads a pair of registers from the stack, and the restrictions for this instruction will not allow loading registers at that distance.
First, why can this line be valid?
Second, why is the FP loaded into X20? Isn't it supposed to be loaded into X29?
From the ARM Procedure Call Standard:
Each frame shall link to the frame of its caller by means of a frame record of two 64-bit values on the stack
Of course it also mentions cases in which these requirements are not mandatory, but in this case the FP seems not to be valid, because it is pointing to sp - 0x10, which holds the FP and LR.
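As an aside, a minimal runtime sketch of the frame record (GCC/Clang-specific, assuming frame pointers are kept, e.g. with -fno-omit-frame-pointer): X29 points at a pair {saved FP, saved LR}, and following the saved FP walks the caller chain.

#include <stdio.h>

void show_frame_record(void)
{
    /* __builtin_frame_address(0) returns the current frame pointer
       (X29 on AArch64): fp[0] is the caller's saved FP, fp[1] the
       saved LR, i.e. the return address. */
    void **fp = __builtin_frame_address(0);
    printf("frame record %p: caller FP=%p, return address=%p\n",
           (void *)fp, fp[0], fp[1]);
}

int main(void)
{
    show_frame_record();
    return 0;
}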

Inline assembly instruction on ARM64 (iOS) fails

I'm trying out the new arm64 instructions on iOS and I'm having a peculiar issue. I hope someone can help me with this.
In particular, this fails with 'Invalid operand for instruction':
void test()
{
    register long long signed int r=0,c=0,d=0;
    register signed int a=0,b=0,e=0,f=0;
    // this fails
    asm volatile("smaddl %0, %1, %2, %3" : "=r"(r) : "r"(a), "r"(b), "r"(c));
}
I'm not sure what I'm doing wrong; as best I can tell, I'm following the instruction and its syntax correctly. Here's how it is defined in the docs:
"SMADDL Xd, Wn, Wm, Xa
Signed Multiply-Add Long: Xd = Xa + (Wn × Wm), treating source operands as signed."
where X denotes a 64bit register and W denotes a 32 bit one.
Any help will be appreciated.
Thx
I was able to fix it by using the recommendation in this post:
asm volatile("smaddl %x0, %w1, %w2, %x3" : "=r"(r) : "r"(a), "r"(b), "r"(c));
This produces the following assembly:
_test: ; @test
; BB#0:
sub sp, sp, #48
movz w8, #0
movz x9, #0
stp x9, x9, [sp, #32]
str x9, [sp, #24]
stp w8, w8, [sp, #16]
stp w8, w8, [sp, #8]
ldp w10, w8, [sp, #16]
ldr x9, [sp, #32]
; InlineAsm Start
smaddl x9, w8, w10, x9
; InlineAsm End
str x9, [sp, #40]
add sp, sp, #48
ret lr
It seems you need to use 'w' to specifically mark 32-bit registers.
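To make that concrete, here is a hedged, self-contained version (the function name and test values are made up): the %w modifier selects the 32-bit view and %x the 64-bit view of whichever register the compiler assigned to an operand.

#include <stdio.h>

long long smaddl_demo(int a, int b, long long acc)
{
    long long r;
    /* r = acc + (long long)a * (long long)b, in one smaddl */
    asm("smaddl %x0, %w1, %w2, %x3"
        : "=r"(r)
        : "r"(a), "r"(b), "r"(acc));
    return r;
}

int main(void)
{
    printf("%lld\n", smaddl_demo(3, 5, 7));  /* prints 22 */
    return 0;
}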
See also aarch64-inline-asm.c for a few more inline asm examples.

Using ARM NEON intrinsics to add alpha and permute

I'm developing an iOS app that needs to convert images from RGB -> BGRA fairly quickly. I would like to use NEON intrinsics if possible. Is there a faster way than simply assigning the components?
#include <arm_neon.h>

void neonPermuteRGBtoBGRA(unsigned char* src, unsigned char* dst, int numPix)
{
    numPix /= 8; //process 8 pixels at a time
    uint8x8_t alpha = vdup_n_u8(0xff);
    for (int i = 0; i < numPix; i++)
    {
        uint8x8x3_t rgb = vld3_u8(src);
        uint8x8x4_t bgra;
        bgra.val[0] = rgb.val[2]; //these lines are slow
        bgra.val[1] = rgb.val[1]; //these lines are slow
        bgra.val[2] = rgb.val[0]; //these lines are slow
        bgra.val[3] = alpha;
        vst4_u8(dst, bgra);
        src += 8*3;
        dst += 8*4;
    }
}
The ARMCC disassembly isn't that fast either:
It isn't using the most appropriate instructions.
It mixes VFP instructions with NEON ones, which causes huge hiccups every time.
Try this:
mov r2, r2, lsr #3
vmov.u8 d3, #0xff
loop:
vld3.8 {d0-d2}, [r0]!
subs r2, r2, #1
vswp d0, d2
vst4.8 {d0-d3}, [r1]!
bgt loop
bx lr
My suggested code isn't fully optimized either, but further "real" optimizations would seriously harm readability, so I stop here. (If you want to keep it callable from C, one way to embed this loop is sketched below.)
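A hedged sketch of wrapping that loop in GCC extended inline asm (assuming ARMv7 NEON; the function name is made up), with operands and clobbers declared so the compiler knows what the asm reads, writes, and trashes:

void neonPermuteRGBtoBGRA_asm(unsigned char *src, unsigned char *dst,
                              int numPix)
{
    asm volatile(
        "lsr     %2, %2, #3       \n"  /* 8 pixels per iteration */
        "vmov.i8 d3, #0xff        \n"  /* constant alpha         */
        "1:                       \n"
        "vld3.8  {d0-d2}, [%0]!   \n"  /* deinterleave R,G,B     */
        "subs    %2, %2, #1       \n"
        "vswp    d0, d2           \n"  /* swap R and B           */
        "vst4.8  {d0-d3}, [%1]!   \n"  /* interleave B,G,R,A     */
        "bgt     1b               \n"
        : "+r"(src), "+r"(dst), "+r"(numPix)
        :
        : "d0", "d1", "d2", "d3", "cc", "memory");
}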
This depends on the compiler. For example, when I compile the original intrinsics code above with armcc (5.01) and disassemble it, what I get looks like this (I'm only showing the loop, and I moved the alpha assignment outside of it):
18: f420440d vld3.8 {d4-d6}, [r0]!
1c: e2822001 add r2, r2, #1 ; 0x1
20: eeb01b45 fcpyd d1, d5
24: eeb00b46 fcpyd d0, d6
28: eeb02b44 fcpyd d2, d4
2c: f401000d vst4.8 {d0-d3}, [r1]!
30: e1520003 cmp r2, r3
34: bafffff7 blt 18 <neonPermuteRGBtoBGRA_armcc+0x18>
If I compile the code with gcc (4.4.3) and disassemble it again, I get:
40: f967 040f vld3.8 {d16-d18}, [r7]
44: 46d6 mov lr, sl
46: ecca 0b06 vstmia sl, {d16-d18}
4a: 9d02 ldr r5, [sp, #8]
4c: ed8d 8b1a vstr d8, [sp, #104]
50: 3718 adds r7, #24
52: e8be 000f ldmia.w lr!, {r0, r1, r2, r3}
56: f108 0801 add.w r8, r8, #1 ; 0x1
5a: c50f stmia r5!, {r0, r1, r2, r3}
5c: eddd 0b24 vldr d16, [sp, #144]
60: e89e 0003 ldmia.w lr, {r0, r1}
64: edcd 0b16 vstr d16, [sp, #88]
68: eddd 0b22 vldr d16, [sp, #136]
6c: edcd 0b18 vstr d16, [sp, #96]
70: e885 0003 stmia.w r5, {r0, r1}
74: ed9d 0b26 vldr d0, [sp, #152]
78: 9d03 ldr r5, [sp, #12]
7a: ed8d 0b14 vstr d0, [sp, #80]
7e: cd0f ldmia r5!, {r0, r1, r2, r3}
80: 46ae mov lr, r5
82: 465d mov r5, fp
84: c50f stmia r5!, {r0, r1, r2, r3}
86: e89e 000f ldmia.w lr, {r0, r1, r2, r3}
8a: e885 000f stmia.w r5, {r0, r1, r2, r3}
8e: 9501 str r5, [sp, #4]
90: 465d mov r5, fp
92: 2100 movs r1, #0
94: 2220 movs r2, #32
96: 4620 mov r0, r4
98: f7ff fffe bl 0 <memset>
9c: cd0f ldmia r5!, {r0, r1, r2, r3}
9e: 4625 mov r5, r4
a0: c50f stmia r5!, {r0, r1, r2, r3}
a2: f8dd c004 ldr.w ip, [sp, #4]
a6: e89c 000f ldmia.w ip, {r0, r1, r2, r3}
aa: e885 000f stmia.w r5, {r0, r1, r2, r3}
ae: ecd4 0b08 vldmia r4, {d16-d19}
b2: f946 000f vst4.8 {d16-d19}, [r6]
b6: 3620 adds r6, #32
b8: 45c8 cmp r8, r9
ba: dbc1 blt.n 40 <memset+0x40>
And the execution time was 10 times faster with armcc.
If I compile the armcc-produced assembly code for the function (it looks like alpha is back in the loop now :)) with gcc as inline assembly:
void neonPermuteRGBtoBGRA_gas(unsigned char* src, unsigned char* dst,
int numPix) {
asm(
" ASR r3,r2,#31\n"
" VMOV.I8 d1,#0xff\n"
" ADD r2,r2,r3,LSR #29\n"
" ASR r3,r2,#3\n"
" MOV r2,#0\n"
" CMP r3,#0\n"
" BLE end\n"
"loop:\n"
" VLD3.8 {d4,d5,d6},[r0]!\n"
" ADD r2,r2,#1\n"
" CMP r3,r2\n"
" VMOV.F64 d3,d5\n"
" VMOV.F64 d2,d6\n"
" VMOV.F64 d5,d1\n"
" VMOV.F64 d0,d4\n"
" VST4.8 {d2,d3,d4,d5},[r1]!\n"
" BGT loop\n"
"end:\n"
);
}
I get the same execution time with gcc as well.
In the end, what I suggest is to either disassemble your binary and check whether the compiler produces what you want, or write the assembly yourself.
Btw, if you want to improve the execution time of this function even further, I suggest you look into:
ARM's PLD (preload data) instruction (a sketch follows this list)
utilizing all the possible NEON instructions in the loop, e.g. via loop unrolling (you'll notice that the bottleneck is actually the data load time from memory)
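A hedged sketch of the prefetch suggestion, using GCC's __builtin_prefetch (which lowers to PLD on ARM); the look-ahead distance of three 64-byte cache lines is an assumption to tune per device:

#include <arm_neon.h>

void neonPermuteRGBtoBGRA_pld(const unsigned char *src,
                              unsigned char *dst, int numPix)
{
    uint8x8x4_t bgra;
    bgra.val[3] = vdup_n_u8(0xff);         /* constant alpha     */
    for (int i = 0; i < numPix / 8; i++) {
        __builtin_prefetch(src + 3 * 64);  /* warm a future line */
        uint8x8x3_t rgb = vld3_u8(src);
        bgra.val[0] = rgb.val[2];
        bgra.val[1] = rgb.val[1];
        bgra.val[2] = rgb.val[0];
        vst4_u8(dst, bgra);
        src += 8 * 3;
        dst += 8 * 4;
    }
}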
