Convert function to Arm Neon - c

I'm a beginner in Arm Neon, and I'm trying to vectorise this loop:
float ans=0.0;
for (i=0; i<numdims; i++)
ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
I'm trying to convert this loop to Neon with prefetch instructions and loop unrolling:
int iter= numdims/4*4;
float result[3];
float ans=0.0;
asm volatile(
"mov x1, #0\n\t"
"mov x2, %[pt1]\n\t"
"mov x3, %[pt2]\n\t"
"movi v3.4s, #0\n\t"
".loop_neon%=:\n\t"
"prfm PLDL1STRM, [x2, #64]\n\t"
"prfm PLDL1STRM, [x3, #64]\n\t"
"ldr q1, [x2, #16]\n\t"
"ldr q2, [x3, #16]\n\t"
"fsub v4.4s, v1.4s, v2.4s\n\t"
"fmla v3.4s, v4.4s, v4.4s\n\t"
"add x1,x1, #16\n\t"
"cmp x1, %[iter]\n\t"
"b.lt .loop_neon%=\n\t"
"str q3, [%[result]]\n\t"
:
: [iter] "r" (iter),[pt1] "r" (pt1),[pt2] "r" (pt2), [result] "r" (result)
: "x1","x2","x3","memory","v0","v1","v2","v3","v4"
);
ans = result[0] + result[1] + result[2] + result[3];
//final iterations of the loop
for (int i=iter; i<numdims; i++)
ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
This code compiles and runs, but the output is not correct.

Short answer: add x1, x1, #4
Your code is far from optimal:
There are lots of pipeline hazards; unroll deeper.
You should always count the loop counter down.
You should avoid unnecessary memory accesses (result).
You should avoid unnecessary mov operations.
Provided iter is a multiple of 16, the code below is suggested:
.func
// extern float sumDiffSquare(float *pA, float *pB, uint32_t length);
// assert(length >= 16);
// assert((length & 15) == 0);
pA .req x0
pB .req x1
length .req x2
sumDiffSquare:
movi v0.16b, #0
.balign 64
1:
ldp q16, q17, [pA], #32
ldp q20, q21, [pB], #32
ldp q18, q19, [pA], #32
ldp q22, q23, [pB], #32
subs length, length, #16
fsub v16.4s, v20.4s, v16.4s
fsub v17.4s, v21.4s, v17.4s
fsub v18.4s, v22.4s, v18.4s
fsub v19.4s, v23.4s, v19.4s
fmla v0.4s, v16.4s, v16.4s
fmla v0.4s, v17.4s, v17.4s
fmla v0.4s, v18.4s, v18.4s
fmla v0.4s, v19.4s, v19.4s
b.gt 1b
faddp v0.4s, v0.4s, v0.4s
faddp v0.2s, v0.2s, v0.2s
ret
.endfunc
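For reference, a rough intrinsics version of the same reduction in C might look like the sketch below (the function name and the scalar tail are mine; whether it beats the hand-written loop above depends on the compiler):
#include <arm_neon.h>

float sum_diff_square_c(const float *pt1, const float *pt2, int numdims)
{
    float32x4_t acc = vdupq_n_f32(0.0f);
    int i = 0;
    for (; i + 4 <= numdims; i += 4) {
        float32x4_t d = vsubq_f32(vld1q_f32(pt1 + i), vld1q_f32(pt2 + i));
        acc = vfmaq_f32(acc, d, d);              // acc += d * d
    }
    float ans = vaddvq_f32(acc);                 // horizontal sum (AArch64)
    for (; i < numdims; i++)                     // leftover elements
        ans += (pt1[i] - pt2[i]) * (pt1[i] - pt2[i]);
    return ans;
}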

Related

Why (or why not) pass Neon intrinsics datatypes as input/output function parameters?

This is a small test I built. Here we have two scenarios:
Scenario 1: Two functions (scenario1a and scenario1b) whose inputs and outputs are uint16_t*, and which load/store to/from the Neon datatype (uint16x8x4_t) internally.
Scenario 2: The same functions as Scenario 1 (in this case scenario2a and scenario2b), but the inputs and outputs are uint16x8x4_t*, and the load and store are done in the main function.
(Below the C code I include the disassembly generated after compiling with -O3.)
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
void scenario1a(uint16_t* resultArray, const uint16_t* X);
void scenario1b(uint16_t* resultArray, const uint16_t* X);
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp);
void scenario1a(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X);
result.val[0] = vextq_u16(temp.val[0], vmulq_n_u16(temp.val[1], -1), 2);
result.val[1] = vextq_u16(temp.val[1], vmulq_n_u16(temp.val[2], -1), 2);
result.val[2] = vextq_u16(temp.val[2], vmulq_n_u16(temp.val[3], -1), 2);
result.val[3] = vextq_u16(temp.val[3], vmulq_n_u16(temp.val[0], -1), 2);
vst1q_u16_x4(resultArray, result);
}
void scenario1b(uint16_t* resultArray, const uint16_t* X) {
uint16x8x4_t temp, result;
temp = vld1q_u16_x4(X);
result.val[0] = vaddq_u16(temp.val[0], temp.val[1]);
result.val[1] = vmulq_n_u16(temp.val[1], -2);
result.val[2] = vaddq_u16(temp.val[2], temp.val[3]);
result.val[3] = vmulq_n_u16(temp.val[3], -2);
vst1q_u16_x4(resultArray, result);
}
void scenario2a(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vextq_u16(temp->val[0], vmulq_n_u16(temp->val[1], -1), 2);
result->val[1] = vextq_u16(temp->val[1], vmulq_n_u16(temp->val[2], -1), 2);
result->val[2] = vextq_u16(temp->val[2], vmulq_n_u16(temp->val[3], -1), 2);
result->val[3] = vextq_u16(temp->val[3], vmulq_n_u16(temp->val[0], -1), 2);
}
void scenario2b(uint16x8x4_t* result, const uint16x8x4_t* temp) {
result->val[0] = vaddq_u16(temp->val[0], temp->val[1]);
result->val[1] = vmulq_n_u16(temp->val[1], -2);
result->val[2] = vaddq_u16(temp->val[2], temp->val[3]);
result->val[3] = vmulq_n_u16(temp->val[3], -2);
}
int main(void) {
uint16_t input[32] = {15,3,1,85,44,156,32,97,3,54,97,17,0,55,9,17,163,23,74,85,96,14,25,36,95,84,76,51,42,63,58,74};
// Scenario 01: Input and output are uint16_t*
uint16_t result01a[32];
uint16_t result01_final[32];
scenario1a(result01a, input);
scenario1b(result01_final, result01a);
// Scenario 02: Input and output are uint16x8x4_t
uint16_t result02_final[32];
uint16x8x4_t temp, result02a, result02b;
temp = vld1q_u16_x4(input);
scenario2a(&result02a, &temp);
scenario2b(&result02b, &result02a);
vst1q_u16_x4(result02_final, result02b);
return 0;
}
Disassembly:
test: file format elf64-littleaarch64
Disassembly of section .init:
0000000000000658 <_init>:
658: a9bf7bfd stp x29, x30, [sp, #-16]!
65c: 910003fd mov x29, sp
660: 94000065 bl 7f4 <call_weak_fn>
664: a8c17bfd ldp x29, x30, [sp], #16
668: d65f03c0 ret
Disassembly of section .plt:
0000000000000670 <.plt>:
670: a9bf7bf0 stp x16, x30, [sp, #-16]!
674: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
678: f947c611 ldr x17, [x16, #3976]
67c: 913e2210 add x16, x16, #0xf88
680: d61f0220 br x17
684: d503201f nop
688: d503201f nop
68c: d503201f nop
0000000000000690 <__cxa_finalize#plt>:
690: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
694: f947ca11 ldr x17, [x16, #3984]
698: 913e4210 add x16, x16, #0xf90
69c: d61f0220 br x17
00000000000006a0 <__libc_start_main#plt>:
6a0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6a4: f947ce11 ldr x17, [x16, #3992]
6a8: 913e6210 add x16, x16, #0xf98
6ac: d61f0220 br x17
00000000000006b0 <__stack_chk_fail#plt>:
6b0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6b4: f947d211 ldr x17, [x16, #4000]
6b8: 913e8210 add x16, x16, #0xfa0
6bc: d61f0220 br x17
00000000000006c0 <__gmon_start__#plt>:
6c0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6c4: f947d611 ldr x17, [x16, #4008]
6c8: 913ea210 add x16, x16, #0xfa8
6cc: d61f0220 br x17
00000000000006d0 <abort#plt>:
6d0: 90000090 adrp x16, 10000 <__FRAME_END__+0xf3d8>
6d4: f947da11 ldr x17, [x16, #4016]
6d8: 913ec210 add x16, x16, #0xfb0
6dc: d61f0220 br x17
Disassembly of section .text:
00000000000006e0 <main>:
6e0: 90000085 adrp x5, 10000 <__FRAME_END__+0xf3d8>
6e4: a9a67bfd stp x29, x30, [sp, #-416]!
6e8: 910003fd mov x29, sp
6ec: 90000002 adrp x2, 0 <_init-0x658>
6f0: 91292042 add x2, x2, #0xa48
6f4: 910263e3 add x3, sp, #0x98
6f8: 910363e0 add x0, sp, #0xd8
6fc: 6f008434 mvni v20.8h, #0x1
700: f947f0a5 ldr x5, [x5, #4064]
704: aa0303e1 mov x1, x3
708: 910143e4 add x4, sp, #0x50
70c: a940344c ldp x12, x13, [x2]
710: a9412c4a ldp x10, x11, [x2, #16]
714: f94000a6 ldr x6, [x5]
718: f900cfe6 str x6, [sp, #408]
71c: d2800006 mov x6, #0x0 // #0
720: a9422448 ldp x8, x9, [x2, #32]
724: a9431c46 ldp x6, x7, [x2, #48]
728: 910463e2 add x2, sp, #0x118
72c: a909b7ec stp x12, x13, [sp, #152]
730: a90aafea stp x10, x11, [sp, #168]
734: a90ba7e8 stp x8, x9, [sp, #184]
738: a90c9fe6 stp x6, x7, [sp, #200]
73c: 94000069 bl 8e0 <scenario1a>
740: 4c402400 ld1 {v0.8h-v3.8h}, [x0]
744: 910043e1 add x1, sp, #0x10
748: aa0403e0 mov x0, x4
74c: 4c402470 ld1 {v16.8h-v19.8h}, [x3]
750: 4e619e85 mul v5.8h, v20.8h, v1.8h
754: 4e608424 add v4.8h, v1.8h, v0.8h
758: 4e628466 add v6.8h, v3.8h, v2.8h
75c: 4e639e87 mul v7.8h, v20.8h, v3.8h
760: 4c002030 st1 {v16.16b-v19.16b}, [x1]
764: 4c002444 st1 {v4.8h-v7.8h}, [x2]
768: 94000072 bl 930 <scenario2a>
76c: ad409885 ldp q5, q6, [x4, #16]
770: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
774: 910563e2 add x2, sp, #0x158
778: 3dc00c84 ldr q4, [x4, #48]
77c: 3dc017e7 ldr q7, [sp, #80]
780: f947f021 ldr x1, [x1, #4064]
784: 4e749c83 mul v3.8h, v4.8h, v20.8h
788: 4e668482 add v2.8h, v4.8h, v6.8h
78c: 4e749ca1 mul v1.8h, v5.8h, v20.8h
790: 4e6784a0 add v0.8h, v5.8h, v7.8h
794: 4c002440 st1 {v0.8h-v3.8h}, [x2]
798: f940cfe0 ldr x0, [sp, #408]
79c: f9400022 ldr x2, [x1]
7a0: eb020000 subs x0, x0, x2
7a4: d2800002 mov x2, #0x0 // #0
7a8: 54000081 b.ne 7b8 <main+0xd8> // b.any
7ac: 52800000 mov w0, #0x0 // #0
7b0: a8da7bfd ldp x29, x30, [sp], #416
7b4: d65f03c0 ret
7b8: 97ffffbe bl 6b0 <__stack_chk_fail#plt>
00000000000007bc <_start>:
7bc: d280001d mov x29, #0x0 // #0
7c0: d280001e mov x30, #0x0 // #0
7c4: aa0003e5 mov x5, x0
7c8: f94003e1 ldr x1, [sp]
7cc: 910023e2 add x2, sp, #0x8
7d0: 910003e6 mov x6, sp
7d4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7d8: f947f800 ldr x0, [x0, #4080]
7dc: 90000083 adrp x3, 10000 <__FRAME_END__+0xf3d8>
7e0: f947f463 ldr x3, [x3, #4072]
7e4: 90000084 adrp x4, 10000 <__FRAME_END__+0xf3d8>
7e8: f947e084 ldr x4, [x4, #4032]
7ec: 97ffffad bl 6a0 <__libc_start_main#plt>
7f0: 97ffffb8 bl 6d0 <abort#plt>
00000000000007f4 <call_weak_fn>:
7f4: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
7f8: f947ec00 ldr x0, [x0, #4056]
7fc: b4000040 cbz x0, 804 <call_weak_fn+0x10>
800: 17ffffb0 b 6c0 <__gmon_start__#plt>
804: d65f03c0 ret
808: d503201f nop
80c: d503201f nop
0000000000000810 <deregister_tm_clones>:
810: b0000080 adrp x0, 11000 <__data_start>
814: 91004000 add x0, x0, #0x10
818: b0000081 adrp x1, 11000 <__data_start>
81c: 91004021 add x1, x1, #0x10
820: eb00003f cmp x1, x0
824: 540000c0 b.eq 83c <deregister_tm_clones+0x2c> // b.none
828: 90000081 adrp x1, 10000 <__FRAME_END__+0xf3d8>
82c: f947e421 ldr x1, [x1, #4040]
830: b4000061 cbz x1, 83c <deregister_tm_clones+0x2c>
834: aa0103f0 mov x16, x1
838: d61f0200 br x16
83c: d65f03c0 ret
0000000000000840 <register_tm_clones>:
840: b0000080 adrp x0, 11000 <__data_start>
844: 91004000 add x0, x0, #0x10
848: b0000081 adrp x1, 11000 <__data_start>
84c: 91004021 add x1, x1, #0x10
850: cb000021 sub x1, x1, x0
854: d37ffc22 lsr x2, x1, #63
858: 8b810c41 add x1, x2, x1, asr #3
85c: 9341fc21 asr x1, x1, #1
860: b40000c1 cbz x1, 878 <register_tm_clones+0x38>
864: 90000082 adrp x2, 10000 <__FRAME_END__+0xf3d8>
868: f947fc42 ldr x2, [x2, #4088]
86c: b4000062 cbz x2, 878 <register_tm_clones+0x38>
870: aa0203f0 mov x16, x2
874: d61f0200 br x16
878: d65f03c0 ret
87c: d503201f nop
0000000000000880 <__do_global_dtors_aux>:
880: a9be7bfd stp x29, x30, [sp, #-32]!
884: 910003fd mov x29, sp
888: f9000bf3 str x19, [sp, #16]
88c: b0000093 adrp x19, 11000 <__data_start>
890: 39404260 ldrb w0, [x19, #16]
894: 35000140 cbnz w0, 8bc <__do_global_dtors_aux+0x3c>
898: 90000080 adrp x0, 10000 <__FRAME_END__+0xf3d8>
89c: f947e800 ldr x0, [x0, #4048]
8a0: b4000080 cbz x0, 8b0 <__do_global_dtors_aux+0x30>
8a4: b0000080 adrp x0, 11000 <__data_start>
8a8: f9400400 ldr x0, [x0, #8]
8ac: 97ffff79 bl 690 <__cxa_finalize#plt>
8b0: 97ffffd8 bl 810 <deregister_tm_clones>
8b4: 52800020 mov w0, #0x1 // #1
8b8: 39004260 strb w0, [x19, #16]
8bc: f9400bf3 ldr x19, [sp, #16]
8c0: a8c27bfd ldp x29, x30, [sp], #32
8c4: d65f03c0 ret
8c8: d503201f nop
8cc: d503201f nop
00000000000008d0 <frame_dummy>:
8d0: 17ffffdc b 840 <register_tm_clones>
8d4: d503201f nop
8d8: d503201f nop
8dc: d503201f nop
00000000000008e0 <scenario1a>:
8e0: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
8e4: 6e60b833 neg v19.8h, v1.8h
8e8: 6e60b852 neg v18.8h, v2.8h
8ec: 6e60b871 neg v17.8h, v3.8h
8f0: 6e60b810 neg v16.8h, v0.8h
8f4: 6e132004 ext v4.16b, v0.16b, v19.16b, #4
8f8: 6e122025 ext v5.16b, v1.16b, v18.16b, #4
8fc: 6e112046 ext v6.16b, v2.16b, v17.16b, #4
900: 6e102067 ext v7.16b, v3.16b, v16.16b, #4
904: 4c002404 st1 {v4.8h-v7.8h}, [x0]
908: d65f03c0 ret
90c: d503201f nop
0000000000000910 <scenario1b>:
910: 4c402420 ld1 {v0.8h-v3.8h}, [x1]
914: 6f008430 mvni v16.8h, #0x1
918: 4e619e05 mul v5.8h, v16.8h, v1.8h
91c: 4e608424 add v4.8h, v1.8h, v0.8h
920: 4e628466 add v6.8h, v3.8h, v2.8h
924: 4e639e07 mul v7.8h, v16.8h, v3.8h
928: 4c002404 st1 {v4.8h-v7.8h}, [x0]
92c: d65f03c0 ret
0000000000000930 <scenario2a>:
930: ad400025 ldp q5, q0, [x1]
934: ad408423 ldp q3, q1, [x1, #16]
938: 3dc00c24 ldr q4, [x1, #48]
93c: 6e60b800 neg v0.8h, v0.8h
940: 4ea11c22 mov v2.16b, v1.16b
944: 6e60b821 neg v1.8h, v1.8h
948: 6e0020a5 ext v5.16b, v5.16b, v0.16b, #4
94c: 4ea41c80 mov v0.16b, v4.16b
950: 6e60b884 neg v4.8h, v4.8h
954: 6e012063 ext v3.16b, v3.16b, v1.16b, #4
958: 3d800005 str q5, [x0]
95c: 3dc00021 ldr q1, [x1]
960: 6e042042 ext v2.16b, v2.16b, v4.16b, #4
964: ad008803 stp q3, q2, [x0, #16]
968: 6e60b821 neg v1.8h, v1.8h
96c: 6e012000 ext v0.16b, v0.16b, v1.16b, #4
970: 3d800c00 str q0, [x0, #48]
974: d65f03c0 ret
978: d503201f nop
97c: d503201f nop
0000000000000980 <scenario2b>:
980: ad401022 ldp q2, q4, [x1]
984: 6f008420 mvni v0.8h, #0x1
988: ad410c21 ldp q1, q3, [x1, #32]
98c: 4e609c85 mul v5.8h, v4.8h, v0.8h
990: 4e648442 add v2.8h, v2.8h, v4.8h
994: 4e609c60 mul v0.8h, v3.8h, v0.8h
998: 4e638421 add v1.8h, v1.8h, v3.8h
99c: ad001402 stp q2, q5, [x0]
9a0: ad010001 stp q1, q0, [x0, #32]
9a4: d65f03c0 ret
00000000000009a8 <__libc_csu_init>:
9a8: a9bc7bfd stp x29, x30, [sp, #-64]!
9ac: 910003fd mov x29, sp
9b0: a90153f3 stp x19, x20, [sp, #16]
9b4: 90000094 adrp x20, 10000 <__FRAME_END__+0xf3d8>
9b8: 9135c294 add x20, x20, #0xd70
9bc: a9025bf5 stp x21, x22, [sp, #32]
9c0: 90000095 adrp x21, 10000 <__FRAME_END__+0xf3d8>
9c4: 9135a2b5 add x21, x21, #0xd68
9c8: cb150294 sub x20, x20, x21
9cc: 2a0003f6 mov w22, w0
9d0: a90363f7 stp x23, x24, [sp, #48]
9d4: aa0103f7 mov x23, x1
9d8: aa0203f8 mov x24, x2
9dc: 97ffff1f bl 658 <_init>
9e0: eb940fff cmp xzr, x20, asr #3
9e4: 54000160 b.eq a10 <__libc_csu_init+0x68> // b.none
9e8: 9343fe94 asr x20, x20, #3
9ec: d2800013 mov x19, #0x0 // #0
9f0: f8737aa3 ldr x3, [x21, x19, lsl #3]
9f4: aa1803e2 mov x2, x24
9f8: 91000673 add x19, x19, #0x1
9fc: aa1703e1 mov x1, x23
a00: 2a1603e0 mov w0, w22
a04: d63f0060 blr x3
a08: eb13029f cmp x20, x19
a0c: 54ffff21 b.ne 9f0 <__libc_csu_init+0x48> // b.any
a10: a94153f3 ldp x19, x20, [sp, #16]
a14: a9425bf5 ldp x21, x22, [sp, #32]
a18: a94363f7 ldp x23, x24, [sp, #48]
a1c: a8c47bfd ldp x29, x30, [sp], #64
a20: d65f03c0 ret
a24: d503201f nop
0000000000000a28 <__libc_csu_fini>:
a28: d65f03c0 ret
Disassembly of section .fini:
0000000000000a2c <_fini>:
a2c: a9bf7bfd stp x29, x30, [sp, #-16]!
a30: 910003fd mov x29, sp
a34: a8c17bfd ldp x29, x30, [sp], #16
a38: d65f03c0 ret
Questions
Normally people load the data from a pointer (using vld1q_u16_x4), operate on the Neon datatypes, and store back to another pointer (using vst1q_u16_x4), and don't use an approach like the one I used in Scenario 2 (passing the Neon datatypes as inputs/outputs). Is there a general reason why this is?
I checked the disassembly of Scenario 1a (starts at address 8e0) vs. Scenario 2a (starts at address 930). It seems Scenario 2a has more data movement. Will this happen in all scenarios? So is it faster to do what I asked in question 1? If so, why doesn't this happen in Scenario 1b vs. 2b (addresses 910 and 980, respectively)?
In the main function, there are some add/mul instructions after both Scenario 1a and 2a (at addresses 750, 754, 758, 75c and 784, 788, 78c, 790), but my main function has no multiplications or additions. Why is this happening? (I'm just curious.)
Thank you for all your help!
There is absolutely no reason to use pointers to Neon datatypes as parameters. Memory doesn't care about datatypes. Compilers are very conservative and bureaucratic; they simply have to be. It's like filing an application with the authorities: one wrong check mark and your application lands in the wrong hands, causing tremendous unnecessary trouble.
Short: keep it simple. Don't try to impress compilers or reviewers in any way.
I told you last time to be explicit about memory loads and stores. You are computing directly from/to memory in scenario2. Never do this. Stick to load -> compute -> store. Local variables are your best friends. (The __restrict qualifier might help.)
Again, do not try to impress compilers or reviewers. Your scenario2 is just asking for trouble. A sheer disaster. The reviewer will raise a red flag immediately and keep an eye on you and all your code, if you are lucky enough not to get fired on the spot.
You shouldn't put callees in the same file as the caller. More often than not, the compiler will inline short non-static callees into their caller, which makes profiling harder.
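As a minimal illustration of the load -> compute -> store pattern with plain pointers (the function name and the particular arithmetic are made up for the example, not taken from the question):
#include <arm_neon.h>
#include <stdint.h>

void add_blocks(uint16_t *__restrict dst, const uint16_t *__restrict src)
{
    uint16x8x4_t t = vld1q_u16_x4(src);        // explicit load into local variables
    uint16x8x4_t r;
    r.val[0] = vaddq_u16(t.val[0], t.val[1]);  // compute purely on registers
    r.val[1] = vaddq_u16(t.val[1], t.val[2]);
    r.val[2] = vaddq_u16(t.val[2], t.val[3]);
    r.val[3] = vaddq_u16(t.val[3], t.val[0]);
    vst1q_u16_x4(dst, r);                      // one explicit store at the end
}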

Why does gcc, with -O3, unnecessarily clear a local ARM NEON array?

Consider the following code (Compiler Explorer link), compiled under gcc and clang with -O3 optimization:
#include <arm_neon.h>
void bug(int8_t *out, const int8_t *in) {
for (int i = 0; i < 2; i++) {
int8x16x4_t x;
x.val[0] = vld1q_s8(&in[16 * i]);
x.val[1] = x.val[2] = x.val[3] = vshrq_n_s8(x.val[0], 7);
vst4q_s8(&out[64 * i], x);
}
}
NOTE: this is a minimally reproducible version of an issue that is popping up in many different functions of my actual, much more complex code, filled with arithmetic/logical/permutation instructions performing a totally different operation from above. Please refrain from criticizing and/or suggesting different ways of doing what the code above does, unless it has an effect on the code generation issue discussed below.
clang generates sane code:
bug(signed char*, signed char const*): // #bug(signed char*, signed char const*)
ldr q0, [x1]
sshr v1.16b, v0.16b, #7
mov v2.16b, v1.16b
mov v3.16b, v1.16b
st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
ldr q0, [x1, #16]
sshr v1.16b, v0.16b, #7
mov v2.16b, v1.16b
mov v3.16b, v1.16b
st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
ret
As for gcc, it inserts a lot of unnecessary operations, apparently zeroing out the registers that will eventually be input to the st4 instruction:
bug(signed char*, signed char const*):
sub sp, sp, #128
# mov x9, 0
# mov x8, 0
# mov x7, 0
# mov x6, 0
# mov x5, 0
# mov x4, 0
# mov x3, 0
# stp x9, x8, [sp]
# mov x2, 0
# stp x7, x6, [sp, 16]
# stp x5, x4, [sp, 32]
# str x3, [sp, 48]
ldr q0, [x1]
# stp x2, x9, [sp, 56]
# stp x8, x7, [sp, 72]
sshr v4.16b, v0.16b, 7
# str q0, [sp]
# ld1 {v0.16b - v3.16b}, [sp]
# stp x6, x5, [sp, 88]
mov v1.16b, v4.16b
# stp x4, x3, [sp, 104]
mov v2.16b, v4.16b
# str x2, [sp, 120]
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0], 64
### ldr q4, [x1, 16]
### add x1, sp, 64
### str q4, [sp, 64]
sshr v4.16b, v4.16b, 7
### ld1 {v0.16b - v3.16b}, [x1]
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0]
add sp, sp, 128
ret
I manually prefixed with # all instructions that could be safely taken out, without affecting the result of the function.
In addition, the instructions prefixed with ### perform an unnecessary trip to memory and back (and anyway, the mov instructions following ### ld1 ... overwrite 3 out of 4 registers loaded by that ld1 instruction), and could be replaced by a single load straight to v0.16b -- and the sshr instruction in the middle of the block would then use v0.16b as its source register.
As far as I know, x, being a local variable, can be used uninitialized; and even if it couldn't, all registers are properly initialized, so there's no point in zeroing them out just to immediately overwrite them with values.
I'm inclined to think this is a gcc bug, but before reporting it, I'm curious whether I missed something. Maybe there's a compilation flag, an __attribute__ or something else that I could use to make gcc generate sane code.
Thus, my question: is there anything I can do to generate sane code, or is this a bug I need to report to gcc?
Code generation on a fairly current development version of gcc appears to have improved immensely, at least for this case.
After installing the gcc-snapshot package (dated 20210918), gcc generates the following code:
bug:
ldr q5, [x1]
sshr v4.16b, v5.16b, 7
mov v0.16b, v5.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0], 64
ldr q4, [x1, 16]
mov v0.16b, v4.16b
sshr v4.16b, v4.16b, 7
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0]
ret
Not ideal yet -- at least two mov instructions could be removed per iteration by changing the destination registers of ldr and sshr, but considerably better than before.
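One source-level variant that is sometimes worth trying (purely a sketch, with no guarantee it changes GCC's register allocation for any given version) is to keep the lanes in individual int8x16_t variables and only form the x4 aggregate right before the store:
#include <arm_neon.h>
#include <stdint.h>

void bug_alt(int8_t *out, const int8_t *in) {   // name is illustrative
    for (int i = 0; i < 2; i++) {
        int8x16_t v = vld1q_s8(&in[16 * i]);
        int8x16_t s = vshrq_n_s8(v, 7);
        int8x16x4_t x = { { v, s, s, s } };     // aggregate built right at the store
        vst4q_s8(&out[64 * i], x);
    }
}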
Short answer: welcome to GCC. Do not bother optimizing anything while you are using it. And Clang isn't better either.
Secret tip: add the ARM and ARM64 components to Visual Studio, and you'd be surprised how well it works. The problem, however, is that it generates COFF binaries, not ELF, and I haven't been able to find a converter.
You can use IDA Pro or dumpbin to generate a disassembly file, and it looks like this:
; void __fastcall bug(char *out, const char *in)
EXPORT bug
bug
MOV W10, #0
MOV W9, #0
$LL4 ; CODE XREF: bug+30↓j
ADD X8, X1, W9,SXTW
ADD W9, W9, #0x10
CMP W9, #0x20 ; ' '
LD1 {V0.16B}, [X8]
ADD X8, X0, W10,SXTW
ADD W10, W10, #0x40 ; '#'
SSHR V1.16B, V0.16B, #7
MOV V2.16B, V1.16B
MOV V3.16B, V1.16B
ST4 {V0.16B-V3.16B}, [X8]
B.LT $LL4
RET
; End of function bug
You can copy-paste the disassembly into a GCC assembly file.
And don't bother with reporting the "bug" either. If they were listening, GCC wouldn't be this bad in the first place.

How to help different C(++) compilers to generate clean SIMD code?

I've been experimenting a bit with different compilers to see how much they can be used to write efficient SIMD code (aarch64 and x64) without ugly intrinsics or hand-written asm. And so far only Clang seems to do a correct job for aarch64. I think I might be missing some compiler hints or compilation flags.
I've tried to use __restrict keyword and I used the following compilation flags on GCC and Clang:
-force-vector-width=8 -ffast-math -ftree-vectorize
Here is the code:
typedef union
{
float coords[3];
struct { float x; float y; float z; };
} vec3; // simple 3D vector
vec3 AddVectors(vec3 a, vec3 b)
{
vec3 x = { { a.x + b.x, a.y + b.y, a.z + b.z } };
return x;
}
void Add4Vectors(vec3* __restrict a, vec3* __restrict b)
{
a[0] = AddVectors(a[0], b[0]);
a[1] = AddVectors(a[1], b[1]);
a[2] = AddVectors(a[2], b[2]);
a[3] = AddVectors(a[3], b[3]);
}
void Add16Vectors(vec3* __restrict a, vec3* __restrict b)
{
Add4Vectors(a, b);
Add4Vectors(a + 4, b + 4);
Add4Vectors(a + 8, b + 8);
Add4Vectors(a + 12, b + 12);
}
void Add64Vectors(vec3* __restrict a, vec3* __restrict b)
{
Add16Vectors(a, b);
Add16Vectors(a + 16, b + 16);
Add16Vectors(a + 32, b + 32);
Add16Vectors(a + 48, b + 48);
}
You can toy with it on GodBolt.org:
https://godbolt.org/z/SW3aH-
Here is the result I get from clang 8 (flags: -O3 -target aarch64)
AddVectors(vec3, vec3): // #AddVectors(vec3, vec3)
fadd s0, s0, s3
fadd s1, s1, s4
fadd s2, s2, s5
ret
Add4Vectors(vec3*, vec3*): // #Add4Vectors(vec3*, vec3*)
ldp q0, q1, [x0]
ldp q2, q3, [x1]
ldr q4, [x0, #32]
ldr q5, [x1, #32]
fadd v0.4s, v0.4s, v2.4s
fadd v1.4s, v1.4s, v3.4s
fadd v2.4s, v4.4s, v5.4s
stp q0, q1, [x0]
str q2, [x0, #32]
ret
Add16Vectors(vec3*, vec3*): // #Add16Vectors(vec3*, vec3*)
ldp q0, q1, [x0]
ldp q2, q3, [x1]
ldp q4, q5, [x0, #32]
ldp q6, q7, [x1, #32]
ldp q16, q17, [x0, #64]
ldp q18, q19, [x1, #64]
ldp q20, q21, [x0, #96]
ldp q22, q23, [x1, #96]
fadd v0.4s, v0.4s, v2.4s
fadd v1.4s, v1.4s, v3.4s
ldp q2, q3, [x0, #128]
fadd v4.4s, v4.4s, v6.4s
fadd v5.4s, v5.4s, v7.4s
ldp q6, q7, [x1, #128]
fadd v16.4s, v16.4s, v18.4s
fadd v17.4s, v17.4s, v19.4s
ldp q18, q19, [x0, #160]
fadd v20.4s, v20.4s, v22.4s
fadd v21.4s, v21.4s, v23.4s
ldp q22, q23, [x1, #160]
fadd v2.4s, v2.4s, v6.4s
fadd v3.4s, v3.4s, v7.4s
stp q0, q1, [x0]
fadd v6.4s, v18.4s, v22.4s
fadd v7.4s, v19.4s, v23.4s
stp q4, q5, [x0, #32]
stp q16, q17, [x0, #64]
stp q20, q21, [x0, #96]
stp q2, q3, [x0, #128]
stp q6, q7, [x0, #160]
ret
Add64Vectors(vec3*, vec3*): // #Add64Vectors(vec3*, vec3*)
stp x20, x19, [sp, #-32]! // 8-byte Folded Spill
stp x29, x30, [sp, #16] // 8-byte Folded Spill
add x29, sp, #16 // =16
mov x19, x1
mov x20, x0
bl Add16Vectors(vec3*, vec3*)
add x0, x20, #192 // =192
add x1, x19, #192 // =192
bl Add16Vectors(vec3*, vec3*)
add x0, x20, #384 // =384
add x1, x19, #384 // =384
bl Add16Vectors(vec3*, vec3*)
ldp x29, x30, [sp, #16] // 8-byte Folded Reload
add x0, x20, #576 // =576
add x1, x19, #576 // =576
ldp x20, x19, [sp], #32 // 8-byte Folded Reload
b Add16Vectors(vec3*, vec3*)
This is pretty neat if you ask me, especially the Add16Vectors routine; I do not think I would write
a better version by hand.
But still, something is bugging me with the Add64Vectors routine.
The register saves on the stack do not seem necessary.
And it does not seem complicated at all for the compiler to avoid them; there are plenty of free registers.
Here is how I would write it:
Am I missing something?
Add64Vectors(vec3*, vec3*):
mov x3, x1
mov x2, x0
bl Add16Vectors(vec3*, vec3*)
add x0, x2, #192
add x1, x3, #192
bl Add16Vectors(vec3*, vec3*)
add x0, x2, #384
add x1, x3, #384
bl Add16Vectors(vec3*, vec3*)
add x0, x2, #576
add x1, x3, #576
b Add16Vectors(vec3*, vec3*)
Now, I am nitpicking a bit with this, but here is the ugly part.
I do not want to post too long a message here, so please try this on your side to see the results (on https://godbolt.org).
But the aarch64 version produced by Clang is the only one where I think the compiler is doing a good job.
GCC (8.2) for ARM64 uses about twice as many instructions for the same routines; MSVC for ARM64 is even worse.
But maybe this is because the target is not as mainstream as x64?
Nope.
Results for x64 are appalling, and even if Clang seems to also be the winner here, I do not understand why the x64 version is twice as long
as the aarch64 version...
I am thinking of alignment issues or missing compilation flags, maybe?

What is the most efficient way to reorder a contiguous strided pixel array?

I am working on a highly performance-critical image processing pipeline on a Jetson TX2 (with an ARM processor), which involves reading a set of images and then performing deep learning based object detection through Darknet. Darknet, written in C, has its own representation of how images are stored, which is different from how OpenCV's IplImage or a Python numpy array would store the images.
In my application, I am required to interface with Darknet through Python. So, as of now, I am reading a 'batch' of images (usually 16) into a numpy array and then passing it to Darknet as a contiguous array using ctypes. Within Darknet, I then have to rearrange the ordering of the pixels to go from the numpy format to Darknet's format.
While the input array is one contiguous block arranged column-wise, then row-wise, then channel-wise, and then by image, the Darknet format needs to be arranged by channel first, then by column, then by row: and contains one row per image in the batch instead of a contiguous block. The picture below tries to demonstrate the difference. In this example, I assume a single ixj image. (0,0), (0,1) etc. indicate (row, col), whereas in the top, C0, C1, C2.. etc indicate the column in the corresponding row. Note that in the case of multiple images as part of a batch, the input format arranges them sequentially one after the other, but Darknet needs them to be on separate rows: each row containing data from only one image.
As of now, my code in C that converts the input array to the Darknet format looks like this, where it iteratively hits every pixel in every channel and puts it in a different place, while also normalizing the pixels along the way.
matrix ndarray_to_matrix(unsigned char* src, long* shape, long* strides)
{
int nb = shape[0]; // Batch size
int h = shape[1]; // Height of each image
int w = shape[2]; // Width of each image
int c = shape[3]; // No. of channels in each image
matrix X = make_matrix(nb, h*w*c); // Output array format: 2D
int step_b = strides[0];
int step_h = strides[1];
int step_w = strides[2];
int step_c = strides[3];
int b, i, j, k;
int index1, index2 = 0;
for(b = 0; b < nb ; ++b) {
for(i = 0; i < h; ++i) {
for(k= 0; k < c; ++k) {
for(j = 0; j < w; ++j) {
index1 = k*w*h + i*w + j;
index2 = step_b*b + step_h*i + step_w*j + step_c*k;
X.vals[b][index1] = src[index2]/255.;
}
}
}
}
return X;
}
Is there a more efficient way of doing this rearranging and normalization in C?
I am using the Jetson TX2, which contains an ARM processor and an NVIDIA GPU, so I have access to NEON and CUDA as well as OpenMP.
The image dimensions are fixed and can be hardcoded: only the batch size can change.
The function below will be almost as fast as memcpy:
/*
* Created on: 2018. 5. 5.
* Author: Jake 'Alquimista' Lee
*/
.arch armv8-a
.text
.global alquimista_ndarray_to_matrix
// void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
pDst .req x0
pRed .req x1
pGrn .req x2
pBlu .req x3
count .req w4
.balign 64
.func
alquimista_ndarray_to_matrix:
mov x16, #(640*360) & 0xffff
str q8, [sp, #-16]!
movk x16, #((640*360)>>16), lsl #16
mov count, #(640*360)/128
add pGrn, pRed, x16
add pBlu, pRed, x16, lsl #1
b 1f
.balign 64
1:
ldp q0, q3, [pRed], #32
ldp q1, q4, [pGrn], #32
ldp q2, q5, [pBlu], #32
ldp q6, q16, [pRed], #32
ldp q7, q17, [pGrn], #32
ldp q8, q18, [pBlu], #32
ldp q19, q22, [pRed], #32
ldp q20, q23, [pGrn], #32
ldp q21, q24, [pBlu], #32
ldp q25, q28, [pRed], #32
ldp q26, q29, [pGrn], #32
ldp q27, q30, [pBlu], #32
subs count, count, #1
st3 {v0.16b, v1.16b, v2.16b}, [pDst], #48
st3 {v3.16b, v4.16b, v5.16b}, [pDst], #48
st3 {v6.16b, v7.16b, v8.16b}, [pDst], #48
st3 {v16.16b, v17.16b, v18.16b}, [pDst], #48
st3 {v19.16b, v20.16b, v21.16b}, [pDst], #48
st3 {v22.16b, v23.16b, v24.16b}, [pDst], #48
st3 {v25.16b, v26.16b, v27.16b}, [pDst], #48
st3 {v28.16b, v29.16b, v30.16b}, [pDst], #48
b.gt 1b
.balign 8
ldr q8, [sp], #16
ret
.endfunc
.end
For maximum performance and minimum power consumption, you might want to align the source pointer to 32 bytes and the destination to 16 bytes.
The function prototype is:
void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
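For comparison, a rough intrinsics counterpart of the uint8 routine might look like the sketch below. It assumes, as the assembly above does, that the source holds the three colour planes back to back; the function name and the tail policy are mine:
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

// planar R,G,B -> interleaved RGB, 16 pixels per iteration
void planar_to_interleaved_u8(uint8_t *dst, const uint8_t *src, size_t plane)
{
    const uint8_t *r = src, *g = src + plane, *b = src + 2 * plane;
    for (size_t i = 0; i + 16 <= plane; i += 16) {
        uint8x16x3_t v;
        v.val[0] = vld1q_u8(r + i);
        v.val[1] = vld1q_u8(g + i);
        v.val[2] = vld1q_u8(b + i);
        vst3q_u8(dst + 3 * i, v);   // interleave on store, like st3 above
    }
    // a scalar tail would be needed if plane is not a multiple of 16
}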
Below is the function that does the conversion to float on the fly.
I added the batch number as a parameter so that you don't have to do a function call for every image.
/*
* Created on: 2018. 5. 5.
* Copyright: Jake 'Alquimista' Lee. All rights reserved
*/
.arch armv8-a
.text
.global alquimista_ndarray_to_matrix_float
// void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);
pDst .req x0
pRed .req x1
batch .req w2
pGrn .req x3
pBlu .req x4
stride .req x5
count .req w7
.balign 64
.func
alquimista_ndarray_to_matrix_float:
mov stride, #((640*360)<<1) & 0xffff
stp q8, q15, [sp, #-32]!
movk stride, #((640*360)>>15), lsl #16
mov count, #(640*360)/32
add pGrn, pRed, stride, lsr #1
add pBlu, pRed, stride
b 1f
.balign 64
1:
ldp q0, q1, [pRed], #32
ldp q2, q3, [pGrn], #32
ldp q4, q5, [pBlu], #32
subs count, count, #1
ushll v20.8h, v0.8b, #7
ushll2 v23.8h, v0.16b, #7
ushll v26.8h, v1.8b, #7
ushll2 v29.8h, v1.16b, #7
ushll v21.8h, v2.8b, #7
ushll2 v24.8h, v2.16b, #7
ushll v27.8h, v3.8b, #7
ushll2 v30.8h, v3.16b, #7
ushll v22.8h, v4.8b, #7
ushll2 v25.8h, v4.16b, #7
ushll v28.8h, v5.8b, #7
ushll2 v31.8h, v5.16b, #7
ursra v20.8h, v20.8h, #8
ursra v21.8h, v21.8h, #8
ursra v22.8h, v22.8h, #8
ursra v23.8h, v23.8h, #8
ursra v24.8h, v24.8h, #8
ursra v25.8h, v25.8h, #8
ursra v26.8h, v26.8h, #8
ursra v27.8h, v27.8h, #8
ursra v28.8h, v28.8h, #8
ursra v29.8h, v29.8h, #8
ursra v30.8h, v30.8h, #8
ursra v31.8h, v31.8h, #8
uxtl v0.4s, v20.4h
uxtl v1.4s, v21.4h
uxtl v2.4s, v22.4h
uxtl2 v3.4s, v20.8h
uxtl2 v4.4s, v21.8h
uxtl2 v5.4s, v22.8h
uxtl v6.4s, v23.4h
uxtl v7.4s, v24.4h
uxtl v8.4s, v25.4h
uxtl2 v15.4s, v23.8h
uxtl2 v16.4s, v24.8h
uxtl2 v17.4s, v25.8h
uxtl v18.4s, v26.4h
uxtl v19.4s, v27.4h
uxtl v20.4s, v28.4h
uxtl2 v21.4s, v26.8h
uxtl2 v22.4s, v27.8h
uxtl2 v23.4s, v28.8h
uxtl v24.4s, v29.4h
uxtl v25.4s, v30.4h
uxtl v26.4s, v31.4h
uxtl2 v27.4s, v29.8h
uxtl2 v28.4s, v30.8h
uxtl2 v29.4s, v31.8h
ucvtf v0.4s, v0.4s, #15
ucvtf v1.4s, v1.4s, #15
ucvtf v2.4s, v2.4s, #15
ucvtf v3.4s, v3.4s, #15
ucvtf v4.4s, v4.4s, #15
ucvtf v5.4s, v5.4s, #15
ucvtf v6.4s, v6.4s, #15
ucvtf v7.4s, v7.4s, #15
ucvtf v8.4s, v8.4s, #15
ucvtf v15.4s, v15.4s, #15
ucvtf v16.4s, v16.4s, #15
ucvtf v17.4s, v17.4s, #15
ucvtf v18.4s, v18.4s, #15
ucvtf v19.4s, v19.4s, #15
ucvtf v20.4s, v20.4s, #15
ucvtf v21.4s, v21.4s, #15
ucvtf v22.4s, v22.4s, #15
ucvtf v23.4s, v23.4s, #15
ucvtf v24.4s, v24.4s, #15
ucvtf v25.4s, v25.4s, #15
ucvtf v26.4s, v26.4s, #15
ucvtf v27.4s, v27.4s, #15
ucvtf v28.4s, v28.4s, #15
ucvtf v29.4s, v29.4s, #15
st3 {v0.4s - v2.4s}, [pDst], #48
st3 {v3.4s - v5.4s}, [pDst], #48
st3 {v6.4s - v8.4s}, [pDst], #48
st3 {v15.4s - v17.4s}, [pDst], #48
st3 {v18.4s - v20.4s}, [pDst], #48
st3 {v21.4s - v23.4s}, [pDst], #48
st3 {v24.4s - v26.4s}, [pDst], #48
st3 {v27.4s - v29.4s}, [pDst], #48
b.gt 1b
add pRed, pRed, stride
add pGrn, pGrn, stride
add pBlu, pBlu, stride
subs batch, batch, #1
mov count, #(640*360)/32
b.gt 1b
.balign 8
ldp q8, q15, [sp], #32
ret
.endfunc
.end
It's quite a long one, and it will take considerably longer than the uint8 one above.
Please note that it will scale extremely well to multi-core execution.
The function prototype is:
void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);
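If you prefer to stay in C, one straightforward (though not cycle-exact) way to express the uint8 -> normalized float widening is the sketch below; the helper name is mine and the 1.0f/255.0f factor matches the question's src/255. normalization, whereas the assembly uses a fixed-point approximation of the same scaling:
#include <arm_neon.h>
#include <stdint.h>

// widen 16 uint8 pixels to float32 in [0, 1]
static void u8x16_to_f32(float *dst, const uint8_t *src)
{
    uint8x16_t p = vld1q_u8(src);
    uint16x8_t lo = vmovl_u8(vget_low_u8(p));
    uint16x8_t hi = vmovl_u8(vget_high_u8(p));
    float32x4_t f0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(lo)));
    float32x4_t f1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(lo)));
    float32x4_t f2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(hi)));
    float32x4_t f3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(hi)));
    const float32x4_t s = vdupq_n_f32(1.0f / 255.0f);
    vst1q_f32(dst,      vmulq_f32(f0, s));
    vst1q_f32(dst + 4,  vmulq_f32(f1, s));
    vst1q_f32(dst + 8,  vmulq_f32(f2, s));
    vst1q_f32(dst + 12, vmulq_f32(f3, s));
}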

Compacting data in buffer from 16 bit per element to 12 bits

I'm wondering if there is any chance to improve the performance of such compacting. The idea is to saturate values higher than 4095 and place each value every 12 bits in a new contiguous buffer, like this:
Concept:
Convert:
Input buffer:  [0.0][0.1][0.2] ... [0.15] | [1.0][1.1][1.2] ... [1.15] | [2.0][2.1][2.2] ... [2.15] etc ...
to:
Output buffer: [0.0][0.1][0.2] ... [0.11] | [1.0][1.1][1.2] ... [1.11] | [2.0][2.1][2.2] ... [2.11] etc ...
The input and output buffers are defined as:
uint16_t input[76800] (its size equals 153600 bytes)
uint24_t output[38400] (its size equals 115200 bytes)
So I have reduced the data size by 1/4. This computation costs ~1 ms on a Cortex-A9 with a 792 MHz CPU clock and 2 cores.
I have to perform such "compression" because I transfer about 18 MB/s over Ethernet and that gives
me huge overhead. I've tested various compression algorithms such as Snappy and LZ4, and none of them
came close to the ~1 ms achieved with saturation and bit shifting.
I've written the following code:
#pragma pack(push, 1)
typedef struct {
union {
struct {
uint32_t value0_24x1:24;
};
struct {
uint32_t value0_12x1:12;
uint32_t value1_12x1:12;
};
struct {
uint32_t value0_8x1:8;
uint32_t value1_8x1:8;
uint32_t value3_8x1:8;
};
};
} uint24_t;
#pragma pack(pop)
static inline uint32_t __attribute__((always_inline)) saturate(uint32_t value)
{
register uint32_t result;
asm volatile("usat %0, %2, %1 \n\t" \
: [result] "=r" (result) \
: [value] "r" (value), [saturate] "I" (12) \
: \
);
return result;
}
void __attribute__((noinline, used)) compact(const uint16_t *input, uint24_t *output, uint32_t elements)
{
#if 0
/* More readable, but slower */
for (uint32_t i = 0; i < elements; ++i) {
output->value0_12x1 = saturate(*input++);
(output++)->value1_12x1 = saturate(*input++);
}
#else
/* Alternative - less readable but faster */
for (uint32_t i = 0; i < elements; ++i, input += 2)
(output++)->value0_24x1 = saturate(*input) | ((uint32_t)saturate(*(input+1))) << 12;
#endif
}
static uint16_t buffer_in[76800] = {0};
static uint24_t buffer_out[38400] = {0};
int main()
{
/* Dividing by 2 because we process two input values in a single loop inside compact() */
compact(buffer_in, buffer_out, sizeof(buffer_in) / sizeof(buffer_in[0]) / 2);
return 0;
}
And it's Assembly:
248 00008664 <compact>:
249 8664: e92d4010 push {r4, lr}
250 8668: e3a03000 mov r3, #0
251 866c: ea00000c b 86a4 <compact+0x40>
252 8670: e1d040b0 ldrh r4, [r0]
253 8674: e6ec4014 usat r4, #12, r4
254 8678: e1d0c0b2 ldrh ip, [r0, #2]
255 867c: e6ecc01c usat ip, #12, ip
256 8680: e184c60c orr ip, r4, ip, lsl #12
257 8684: e2833001 add r3, r3, #1
258 8688: e2800004 add r0, r0, #4
259 868c: e5c1c000 strb ip, [r1]
260 8690: e7e7445c ubfx r4, ip, #8, #8
261 8694: e7e7c85c ubfx ip, ip, #16, #8
262 8698: e5c14001 strb r4, [r1, #1]
263 869c: e5c1c002 strb ip, [r1, #2]
264 86a0: e2811003 add r1, r1, #3
265 86a4: e1530002 cmp r3, r2
266 86a8: 1afffff0 bne 8670 <compact+0xc>
267 86ac: e8bd8010 pop {r4, pc}
Compiled using GCC 4.6.3 with the following CFLAGS:
-Os (-O2 and -O3 do not give any noticeable improvements)
-march=armv7-a -mcpu=cortex-a9 -mtune=cortex-a9
-marm -mfloat-abi=softfp -mfpu=neon -funsafe-math-optimizations
Benchmarking has shown that we're using ~10.3 cycles per data conversion.
The questions are:
Can I use NEON to improve the performance?
Can someone give me some hints regarding NEON? What intrinsics should I use?
A code example would be very welcome, because I'm a complete beginner when it
comes to NEON.
Here are the answers:
Yes, it will be blazingly fast.
You should avoid intrinsics at all costs. It isn't worth the effort. Go for assembly.
I'll give you a sample implementation once I arrive home.
////////////////////////////////////////////////////
Ok, here it goes:
You want to pack 16 bits into 12 bits. That's a ratio of 4:3.
Therefore, it's wise to load the data 4-spread and store it 3-spread: vld4.16 -> vst3.16
/*
* void fanic_pack16to12(unsigned short * pDst, unsigned short * pSrc, unsigned int count);
* assert :
* count >= 64
* count % 4 == 0
*
* written by : Jake Lee
* part of FANIC project - Fastest ARM NEON Implementation Challenge
*/
pDst .req r0
pSrc .req r1
count .req r2
.text
.arm
.global fanic_pack16to12
.func
.align 5
fanic_pack16to12:
pld [pSrc]
pld [pSrc, #64]
pld [pSrc, #128]
pld [pSrc, #192]
pld [pSrc, #256]
sub count, count, #64
.align 5
1:
vld4.16 {d16, d18, d20, d22}, [pSrc]!
vld4.16 {d17, d19, d21, d23}, [pSrc]!
vld4.16 {d24, d26, d28, d30}, [pSrc]!
vld4.16 {d25, d27, d29, d31}, [pSrc]!
pld [pSrc, #128]
pld [pSrc, #192]
subs count, count, #64
vqshl.u16 q0, q8, #4
vqshl.u16 q3, q9, #4
vqshl.u16 q8, q10, #4
vqshl.u16 q9, q11, #4
vqshl.u16 q10, q12, #4
vqshl.u16 q13, q13, #4
vqshl.u16 q14, q14, #4
vqshl.u16 q15, q15, #4
vshl.u16 q1, q3, #4
vshl.u16 q2, q8, #8
vshl.u16 q11, q13, #4
vshl.u16 q12, q14, #8
vsri.16 q0, q3, #12
vsri.16 q1, q8, #8
vsri.16 q2, q9, #4
vsri.16 q10, q13, #12
vsri.16 q11, q14, #8
vsri.16 q12, q15, #4
vst3.16 {d0, d2, d4}, [pDst]!
vst3.16 {d1, d3, d5}, [pDst]!
vst3.16 {d20, d22, d24}, [pDst]!
vst3.16 {d21, d23, d25}, [pDst]!
bpl 1b
cmp count, #-64
add pDst, pDst, count
bxle lr
add pSrc, pSrc, count, lsl #1
add pDst, pDst, count, asr #1
b 1b
.endfunc
.end
Please note how many cycles and bandwidth are saved thanks to smart register allocation and loop control - practices that are simply impossible with intrinsics.
This implementation will run as fast as if it were done by dedicated hardware.
There is absolutely no pipeline hazard.
Roughly 50 cycles / iteration
= less than 1 cycle / data
Have fun!
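For those who would rather stay with intrinsics despite the advice above, a rough C sketch of the same vld4 -> saturate -> shift/merge -> vst3 idea could look like this (untested; the exact little-endian 12-bit layout and the requirement that count be a multiple of 32 are my assumptions):
#include <arm_neon.h>
#include <stdint.h>

void pack16to12_c(uint16_t *dst, const uint16_t *src, unsigned count)
{
    for (unsigned i = 0; i < count; i += 32) {
        uint16x8x4_t v = vld4q_u16(src + i);   // deinterleave: val[k] holds every 4th value
        // saturate to 12 bits: saturating shift left by 4, then shift right by 4
        uint16x8_t a = vshrq_n_u16(vqshlq_n_u16(v.val[0], 4), 4);
        uint16x8_t b = vshrq_n_u16(vqshlq_n_u16(v.val[1], 4), 4);
        uint16x8_t c = vshrq_n_u16(vqshlq_n_u16(v.val[2], 4), 4);
        uint16x8_t d = vshrq_n_u16(vqshlq_n_u16(v.val[3], 4), 4);
        uint16x8x3_t o;
        o.val[0] = vorrq_u16(a, vshlq_n_u16(b, 12));                 // a[11:0] | b[3:0]<<12
        o.val[1] = vorrq_u16(vshrq_n_u16(b, 4), vshlq_n_u16(c, 8));  // b[11:4] | c[7:0]<<8
        o.val[2] = vorrq_u16(vshrq_n_u16(c, 8), vshlq_n_u16(d, 4));  // c[11:8] | d<<4
        vst3q_u16(dst + (i / 4) * 3, o);       // re-interleave: 3 words per 4 inputs
    }
}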
//////////////////////////////////////////////////////
Ok, below is the unpacking function :
/*
* void fanic_unpack12to16(unsigned short *pDst, unsigned short *pSrc, unsigned int count);
* assert :
* count >=64
* count % 4 == 0
*
* written by : Jake Lee
* part of FANIC project - Fastest ARM NEON Implementation Challenge
*/
pDst .req r0
pSrc .req r1
count .req r2
.text
.arm
.global fanic_unpack12to16
.func
.align 5
fanic_unpack12to16:
pld [pSrc]
pld [pSrc, #64*1]
pld [pSrc, #64*2]
vpush {q4}
pld [pSrc, #64*3]
vmov.i16 q4, #0x0fff
pld [pSrc, #64*4]
sub count, count, #64
.align 5
1:
vld3.16 {d20, d22, d24}, [pSrc]!
vld3.16 {d21, d23, d25}, [pSrc]!
vld3.16 {d26, d28, d30}, [pSrc]!
vld3.16 {d27, d29, d31}, [pSrc]!
pld [pSrc, #128]
pld [pSrc, #192]
subs count, count, #64
vshr.u16 q1, q11, #8
vshr.u16 q2, q12, #12
vshr.u16 q0, q10, #4
vand q3, q12, q4
vshr.u16 q9, q14, #8
vsli.16 q1, q10, #8
vsli.16 q2, q11, #4
vshr.u16 q10, q15, #12
vsli.16 q9, q13, #8
vbic.i16 q1, q1, #0xf000
vbic.i16 q2, q2, #0xf000
vsli.16 q10, q14, #4
vshr.u16 q8, q13, #4
vbic.i16 q9, q9, #0xf000
vand q11, q15, q4
vbic.i16 q10, q10, #0xf000
vst4.16 {d0, d2, d4, d6}, [pDst]!
vst4.16 {d1, d3, d5, d7}, [pDst]!
vst4.16 {d16, d18, d20, d22}, [pDst]!
vst4.16 {d17, d19, d21, d23}, [pDst]!
bpl 1b
cmp count, #-64
add pSrc, pSrc, count
vpople {q4}
bxle lr
add pSrc, pSrc, count, asr #1
add pDst, pDst, count, lsl #1
b 1b
.endfunc
.end
Tweak points:
Force-align both src and dst to 64 bytes for maximum bandwidth efficiency.
Then guarantee the alignment annotations on all the memory instructions: 256-bit for the 4-spread, 64-bit for the 3-spread, like the following:
vld4.16 {d16, d18, d20, d22}, [pSrc,:256]!
..
vst3.16 {d0, d2, d4}, [pDst,:64]!
..
Make count a multiple of 64; otherwise, you'll have to write extra code dealing with the residual data (the current one would crash due to an alignment fault).
You may increase/decrease the pld offsets by 64 for a possibly increased cache hit rate.
This will improve the performance by a good margin, if not a huge one.
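Again purely as a sketch, the reverse direction with intrinsics could look like this (it matches the bit layout assumed in the packing sketch earlier; count is the number of output values and must be a multiple of 32 here):
#include <arm_neon.h>
#include <stdint.h>

void unpack12to16_c(uint16_t *dst, const uint16_t *src, unsigned count)
{
    const uint16x8_t m12 = vdupq_n_u16(0x0fff);
    for (unsigned i = 0; i < count; i += 32) {
        uint16x8x3_t w = vld3q_u16(src + (i / 4) * 3);  // 3 packed words per 4 values
        uint16x8x4_t v;
        v.val[0] = vandq_u16(w.val[0], m12);
        v.val[1] = vorrq_u16(vshrq_n_u16(w.val[0], 12),
                             vshlq_n_u16(vandq_u16(w.val[1], vdupq_n_u16(0x00ff)), 4));
        v.val[2] = vorrq_u16(vshrq_n_u16(w.val[1], 8),
                             vshlq_n_u16(vandq_u16(w.val[2], vdupq_n_u16(0x000f)), 8));
        v.val[3] = vshrq_n_u16(w.val[2], 4);
        vst4q_u16(dst + i, v);                          // re-interleave 4 values per group
    }
}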
Recently I wrote code for packing 16-bit data into 10 bits using SSE. Here is the code. I don't have a NEON machine right now, so I can't rewrite the SSE code for NEON at the moment.
I used the following sources:
ARM NEON Basic Tutorials
ARM-NEON-Intrinsics
ARM Compiler toolchain Compiler Reference - Using NEON Support
Hints for rewriting the code are as follows:
First of all, write a function to dump NEON variables and use it for debugging (a sketch follows after these hints).
Use the NEON way to load and store variables:
int16x8_t s;
s = vld1q_s16(ptr);
vst1q_s16(dst, s);
You can reinterpret an int16x8_t as a uint32x4_t (e.g. with vreinterpretq_u32_s16).
Saturation:
const int16x8_t shft0 = { 4, 4, 4, 4, 4, 4, 4, 4 };
const int16x8_t shft1 = { -4, -4, -4, -4, -4, -4, -4, -4 };
s0 = vrshlq_s16(s, shft0);
s1 = vrshlq_s16(s, shft1);
Shifts:
uint32x4_t vrshlq_u32 (uint32x4_t, int32x4_t) // _mm_srli_epi32
uint64x1_t vrshl_u64 (uint64x1_t, int64x1_t) // _mm_srli_epi64
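For the first hint, a debug helper could be as simple as the following (the name and output format are arbitrary):
#include <arm_neon.h>
#include <stdio.h>
#include <stdint.h>

// print the eight lanes of a uint16x8_t, useful while porting the SSE code
static void dump_u16x8(const char *name, uint16x8_t v)
{
    uint16_t tmp[8];
    vst1q_u16(tmp, v);
    printf("%s:", name);
    for (int i = 0; i < 8; i++)
        printf(" %04x", tmp[i]);
    printf("\n");
}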
The assembly looks tight enough; however, you can see that you are using 16-bit loads (ldrh) and byte stores (strb). Your ARM's native word size is 32 bits, so the real issue is probably the memory traffic of the input and output.
You should refactor your code to do 32-bit loads and stores, and it would get much faster. A rough sketch of that idea follows.
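A minimal sketch of the word-sized-access idea, assuming a little-endian target and a count that is a multiple of 4 (the function name and the uint8_t* output view are my own choices, not from the question):
#include <stdint.h>
#include <string.h>

static inline uint32_t sat12(uint32_t v) { return v > 4095 ? 4095 : v; }

/* Pack 4 input values (two 32-bit loads) into 6 output bytes
 * (one 32-bit plus one 16-bit store) per iteration. */
void compact_word(const uint16_t *input, uint8_t *output, uint32_t count)
{
    for (uint32_t i = 0; i < count; i += 4) {
        uint32_t w0, w1;
        memcpy(&w0, input + i, 4);         /* two values per 32-bit load */
        memcpy(&w1, input + i + 2, 4);
        uint64_t p = (uint64_t)sat12(w0 & 0xffff)
                   | (uint64_t)sat12(w0 >> 16) << 12
                   | (uint64_t)sat12(w1 & 0xffff) << 24
                   | (uint64_t)sat12(w1 >> 16) << 36;
        uint32_t lo = (uint32_t)p;
        uint16_t hi = (uint16_t)(p >> 32);
        memcpy(output, &lo, 4);            /* 32-bit store */
        memcpy(output + 4, &hi, 2);        /* 16-bit store */
        output += 6;
    }
}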

Resources