Compacting data in a buffer from 16 bits per element to 12 bits - C

I'm wondering if there is any way to improve the performance
of this compacting. The idea is to saturate values higher than 4095
and place each value every 12 bits in a new contiguous buffer, like this:
Concept:
Convert:
Input buffer: [0.0][0.1][0.2] ... [0.15] | [1.0][1.1][1.2] ... [1.15]
| [2.0][2.1][2.2] ... [2.15] etc ...
to:
Output buffer: [0.0][0.1][0.2] ... [0.11] | [1.0][1.1][1.2] ... [1.11]
| [2.0][2.1][2.2] ... [2.11] etc ...
The input and output buffers are defined as:
uint16_t input[76800] (its size is 153600 bytes)
uint24_t output[38400] (its size is 115200 bytes)
So I have reduced the data size by 1/4. This computation costs ~1 ms on a Cortex-A9 running at 792 MHz with 2 cores.
I have to perform such "compression" because I transfer about 18 MB/s over Ethernet, which gives
me huge overhead. I've tested various compression algorithms such as Snappy and LZ4, and none of them
came even close to the 1 ms achieved with saturation and bit shifting.
I've written the following code:
#pragma pack(push, 1)
typedef struct {
    union {
        struct {
            uint32_t value0_24x1:24;
        };
        struct {
            uint32_t value0_12x1:12;
            uint32_t value1_12x1:12;
        };
        struct {
            uint32_t value0_8x1:8;
            uint32_t value1_8x1:8;
            uint32_t value2_8x1:8;
        };
    };
} uint24_t;
#pragma pack(pop)
static inline uint32_t __attribute__((always_inline)) saturate(uint32_t value)
{
    register uint32_t result;
    asm volatile("usat %0, %2, %1 \n\t"
                 : [result] "=r" (result)
                 : [value] "r" (value), [saturate] "I" (12)
                 :
                );
    return result;
}
void __attribute__((noinline, used)) compact(const uint16_t *input, uint24_t *output, uint32_t elements)
{
#if 0
    /* More readable, but slower */
    for (uint32_t i = 0; i < elements; ++i) {
        output->value0_12x1 = saturate(*input++);
        (output++)->value1_12x1 = saturate(*input++);
    }
#else
    /* Alternative - less readable but faster */
    for (uint32_t i = 0; i < elements; ++i, input += 2)
        (output++)->value0_24x1 = saturate(*input) | ((uint32_t)saturate(*(input + 1))) << 12;
#endif
}
static uint16_t buffer_in[76800] = {0};
static uint24_t buffer_out[38400] = {0};

int main()
{
    /* Dividing by 2 because we process two input values in a single loop iteration inside compact() */
    compact(buffer_in, buffer_out, sizeof(buffer_in) / sizeof(buffer_in[0]) / 2);
    return 0;
}
And its assembly:
00008664 <compact>:
    8664:   e92d4010    push    {r4, lr}
    8668:   e3a03000    mov     r3, #0
    866c:   ea00000c    b       86a4 <compact+0x40>
    8670:   e1d040b0    ldrh    r4, [r0]
    8674:   e6ec4014    usat    r4, #12, r4
    8678:   e1d0c0b2    ldrh    ip, [r0, #2]
    867c:   e6ecc01c    usat    ip, #12, ip
    8680:   e184c60c    orr     ip, r4, ip, lsl #12
    8684:   e2833001    add     r3, r3, #1
    8688:   e2800004    add     r0, r0, #4
    868c:   e5c1c000    strb    ip, [r1]
    8690:   e7e7445c    ubfx    r4, ip, #8, #8
    8694:   e7e7c85c    ubfx    ip, ip, #16, #8
    8698:   e5c14001    strb    r4, [r1, #1]
    869c:   e5c1c002    strb    ip, [r1, #2]
    86a0:   e2811003    add     r1, r1, #3
    86a4:   e1530002    cmp     r3, r2
    86a8:   1afffff0    bne     8670 <compact+0xc>
    86ac:   e8bd8010    pop     {r4, pc}
Compiled using GCC 4.6.3 with the following CFLAGS:
-Os (-O2 and -O3 do not give any noticeable improvement)
-march=armv7-a -mcpu=cortex-a9 -mtune=cortex-a9
-marm -mfloat-abi=softfp -mfpu=neon -funsafe-math-optimizations
A benchmark has shown that we're using ~10.3 cycles per data conversion.
The questions are:
Can I use NEON to improve the performance?
Can someone give me some hints regarding NEON? What intrinsics should I use?
Some code example would be very welcome, because I'm a complete noob when it
comes to NEON.

Here are the answers:
Yes, it will be blazingly fast.
You should avoid intrinsics at all costs. It isn't worth the effort. Go for assembly.
I'll give you a sample implementation once I arrive home.
////////////////////////////////////////////////////
Ok, here it goes:
You want to pack 16 bits into 12 bits. That's a ratio of 4:3.
Therefore, it's wise to load the data 4-spread and store it 3-spread: vld4.16 -> vst3.16
/*
* void fanic_pack16to12(unsigned short * pDst, unsigned short * pSrc, unsigned int count);
* assert :
* count >= 64
* count % 4 == 0
*
* written by : Jake Lee
* part of FANIC project - Fastest ARM NEON Implementation Challenge
*/
pDst .req r0
pSrc .req r1
count .req r2
.text
.arm
.global fanic_pack16to12
.func
.align 5
fanic_pack16to12:
pld [pSrc]
pld [pSrc, #64]
pld [pSrc, #128]
pld [pSrc, #192]
pld [pSrc, #256]
sub count, count, #64
.align 5
1:
vld4.16 {d16, d18, d20, d22}, [pSrc]!
vld4.16 {d17, d19, d21, d23}, [pSrc]!
vld4.16 {d24, d26, d28, d30}, [pSrc]!
vld4.16 {d25, d27, d29, d31}, [pSrc]!
pld [pSrc, #128]
pld [pSrc, #192]
subs count, count, #64
vqshl.u16 q0, q8, #4
vqshl.u16 q3, q9, #4
vqshl.u16 q8, q10, #4
vqshl.u16 q9, q11, #4
vqshl.u16 q10, q12, #4
vqshl.u16 q13, q13, #4
vqshl.u16 q14, q14, #4
vqshl.u16 q15, q15, #4
vshl.u16 q1, q3, #4
vshl.u16 q2, q8, #8
vshl.u16 q11, q13, #4
vshl.u16 q12, q14, #8
vsri.16 q0, q3, #12
vsri.16 q1, q8, #8
vsri.16 q2, q9, #4
vsri.16 q10, q13, #12
vsri.16 q11, q14, #8
vsri.16 q12, q15, #4
vst3.16 {d0, d2, d4}, [pDst]!
vst3.16 {d1, d3, d5}, [pDst]!
vst3.16 {d20, d22, d24}, [pDst]!
vst3.16 {d21, d23, d25}, [pDst]!
bpl 1b
cmp count, #-64
add pDst, pDst, count
bxle lr
add pSrc, pSrc, count, lsl #1
add pDst, pDst, count, asr #1
b 1b
.endfunc
.end
Please note how many cycles and how much bandwidth are saved thanks to smart register allocation and loop control - practices that are simply impossible with intrinsics.
This implementation will run so fast, it's as if it were done by dedicated hardware.
There are absolutely no pipeline hazards.
Roughly 50 cycles / iteration
= less than 1 cycle / data element
Have fun!
//////////////////////////////////////////////////////
Ok, below is the unpacking function:
/*
* void fanic_unpack12to16(unsigned short *pDst, unsigned short *pSrc, unsigned int count);
* assert :
* count >=64
* count % 4 == 0
*
* written by : Jake Lee
* part of FANIC project - Fastest ARM NEON Implementation Challenge
*/
pDst .req r0
pSrc .req r1
count .req r2
.text
.arm
.global fanic_unpack12to16
.func
.align 5
fanic_unpack12to16:
pld [pSrc]
pld [pSrc, #64*1]
pld [pSrc, #64*2]
vpush {q4}
pld [pSrc, #64*3]
vmov.i16 q4, #0x0fff
pld [pSrc, #64*4]
sub count, count, #64
.align 5
1:
vld3.16 {d20, d22, d24}, [pSrc]!
vld3.16 {d21, d23, d25}, [pSrc]!
vld3.16 {d26, d28, d30}, [pSrc]!
vld3.16 {d27, d29, d31}, [pSrc]!
pld [pSrc, #128]
pld [pSrc, #192]
subs count, count, #64
vshr.u16 q1, q11, #8
vshr.u16 q2, q12, #12
vshr.u16 q0, q10, #4
vand q3, q12, q4
vshr.u16 q9, q14, #8
vsli.16 q1, q10, #8
vsli.16 q2, q11, #4
vshr.u16 q10, q15, #12
vsli.16 q9, q13, #8
vbic.i16 q1, q1, #0xf000
vbic.i16 q2, q2, #0xf000
vsli.16 q10, q14, #4
vshr.u16 q8, q13, #4
vbic.i16 q9, q9, #0xf000
vand q11, q15, q4
vbic.i16 q10, q10, #0xf000
vst4.16 {d0, d2, d4, d6}, [pDst]!
vst4.16 {d1, d3, d5, d7}, [pDst]!
vst4.16 {d16, d18, d20, d22}, [pDst]!
vst4.16 {d17, d19, d21, d23}, [pDst]!
bpl 1b
cmp count, #-64
add pSrc, pSrc, count
vpople {q4}
bxle lr
add pSrc, pSrc, count, asr #1
add pDst, pDst, count, lsl #1
b 1b
.endfunc
.end
Tweak points:
force-align both src and dst to 64 bytes for maximum bandwidth efficiency
then guarantee alignment hints on all the memory-related instructions: 256-bit for the 4-spread ones, 64-bit for the 3-spread ones, like the following:
vld4.16 {d16, d18, d20, d22}, [pSrc,:256]!
..
vst3.16 {d0, d2, d4}, [pDst,:64]!
..
make count a multiple of 64. Otherwise, you'll have to write extra code dealing with residual data (the current one would crash due to an alignment fault)
you may increase/decrease the pld offsets by 64 for a possibly increased cache hit rate
This will improve the performance by a good margin, if not a huge one.

Recently I wrote code for packing 16-bit data into 10 bits using SSE. Here is the code. I don't have a NEON machine right now, so I can't rewrite the SSE code to NEON at the moment.
I used the following sources:
ARM NEON Basic Tutorials
ARM-NEON-Intrinsics
ARM Compiler toolchain Compiler Reference - Using NEON Support
Hints for rewriting the code are as follows:
First of all, write a function to dump NEON variables and use it for debugging.
Use the NEON way to load and store variables:
int16x8_t s;
s = vld1q_s16(ptr);
vst1q_s16(dst, s);
You can cast (reinterpret) from int16x8_t to uint32x4_t.
Saturation:
const int16x8_t shft0 = { 4, 4, 4, 4, 4, 4, 4, 4 };
const int16x8_t shft1 = { -4, -4, -4, -4, -4, -4, -4, -4 };
s0 = vrshlq_s16(s, shft0);
s1 = vrshlq_s16(s, shft1);
Shifts:
uint32x4_t vrshlq_u32 (uint32x4_t, int32x4_t) // _mm_srli_epi32
uint64x1_t vrshl_u64 (uint64x1_t, int64x1_t) // _mm_srli_epi64
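Putting those hints together, a minimal intrinsics sketch of the 16-to-12 pack could look like the following (my own illustration, untested on the asker's hardware). It follows the same 4-spread load / 3-spread store idea as the assembly answer above, saturates with vminq_u16 instead of usat, and assumes a little-endian target, a 16-bit-aligned output buffer, and an element count that is a multiple of 32:
#include <arm_neon.h>
#include <stdint.h>

/* Sketch only: pack `elements` 16-bit values (saturated to 12 bits) into a
 * continuous 12-bit stream.  Assumes elements % 32 == 0 and little-endian. */
void pack16to12_intrinsics(uint16_t *dst, const uint16_t *src, uint32_t elements)
{
    const uint16x8_t max12 = vdupq_n_u16(4095);
    for (uint32_t i = 0; i < elements; i += 32) {
        /* de-interleave groups of 4: a = src[4k], b = src[4k+1], c = src[4k+2], d = src[4k+3] */
        uint16x8x4_t in = vld4q_u16(src + i);
        uint16x8_t a = vminq_u16(in.val[0], max12);   /* saturate to 0..4095 */
        uint16x8_t b = vminq_u16(in.val[1], max12);
        uint16x8_t c = vminq_u16(in.val[2], max12);
        uint16x8_t d = vminq_u16(in.val[3], max12);
        /* each group of 4 values becomes three 16-bit words of the packed stream */
        uint16x8x3_t out;
        out.val[0] = vorrq_u16(a, vshlq_n_u16(b, 12));                 /* a | low 4 bits of b  */
        out.val[1] = vorrq_u16(vshrq_n_u16(b, 4), vshlq_n_u16(c, 8));  /* high 8 of b | low 8 of c */
        out.val[2] = vorrq_u16(vshrq_n_u16(c, 8), vshlq_n_u16(d, 4));  /* high 4 of c | d      */
        vst3q_u16(dst, out);   /* re-interleave: 32 input values -> 48 output bytes */
        dst += 24;
    }
}
Whether this beats the hand-written assembly above depends on how well the compiler schedules the vld4q/vst3q pairs, so it's worth benchmarking both.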

The assembly looks tight enough; however, you can see you are using 16-bit loads (ldrh) and byte stores (strb). Your version of ARM has a native word size of 32 bits, so the real issue is probably the memory input and output.
You should refactor your code to do 32-bit loads and stores, and it would get much faster.
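To illustrate the suggestion, a plain-C sketch using only 32-bit loads and stores might look like this (my own illustration, not the answerer's code). It consumes 8 input values and emits three 32-bit output words per iteration, assuming a little-endian target, 4-byte-aligned buffers, and an element count that is a multiple of 8:
#include <stdint.h>

static inline uint32_t sat12(uint32_t v) { return v > 4095 ? 4095 : v; }

/* Sketch: same packing as compact(), but using only 32-bit loads and stores. */
void compact32(const uint16_t *input, void *output, uint32_t elements)
{
    const uint32_t *in32 = (const uint32_t *)input;   /* assumes 4-byte alignment */
    uint32_t *out32 = (uint32_t *)output;             /* assumes 4-byte alignment */
    for (uint32_t i = 0; i < elements; i += 8) {      /* assumes elements % 8 == 0 */
        uint32_t w0 = *in32++, w1 = *in32++, w2 = *in32++, w3 = *in32++;
        uint32_t v0 = sat12(w0 & 0xFFFF), v1 = sat12(w0 >> 16);
        uint32_t v2 = sat12(w1 & 0xFFFF), v3 = sat12(w1 >> 16);
        uint32_t v4 = sat12(w2 & 0xFFFF), v5 = sat12(w2 >> 16);
        uint32_t v6 = sat12(w3 & 0xFFFF), v7 = sat12(w3 >> 16);
        /* value n occupies bits [12n, 12n+12) of each 96-bit group (little-endian) */
        *out32++ = v0 | (v1 << 12) | (v2 << 24);
        *out32++ = (v2 >> 8) | (v3 << 4) | (v4 << 16) | (v5 << 28);
        *out32++ = (v5 >> 4) | (v6 << 8) | (v7 << 20);
    }
}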

Related

ARM NEON: Convert a binary 8-bit-per-pixel image (only 0/1) to 1-bit-per-pixel?

I am working on a task to convert a large binary label image, which has 8 bits (uint8_t) per pixel where each pixel can only be 0 or 1 (or 255), to an array of uint64_t numbers, where each bit in a uint64_t number represents a label pixel.
For example,
input array: 0 1 1 0 ... (00000000 00000001 00000001 00000000 ...)
or input array: 0 255 255 0 ... (00000000 11111111 11111111 00000000 ...)
output array (number): 6 (because after converting each uint8_t to a bit, it becomes 0110)
Currently the C code to achieve this is:
for (int j = 0; j < width >> 6; j++) {
    uint8_t* in_ptr = in + (j << 6);
    uint64_t out_bits = 0;
    if (in_ptr[0]) out_bits |= 0x0000000000000001;
    if (in_ptr[1]) out_bits |= 0x0000000000000002;
    .
    .
    .
    if (in_ptr[63]) out_bits |= 0x8000000000000000;
    *output = out_bits; output++;
}
Can ARM NEON optimize this functionality? Please help. Thank you!
Assuming the input value is either 0 or 255, below is the basic version which is rather straightforward, especially for people with Intel SSE/AVX experience.
void foo_basic(uint8_t *pDst, uint8_t *pSrc, intptr_t length)
{
    //assert(length >= 64);
    //assert(length % 8 == 0);
    uint8x16_t in0, in1, in2, in3;
    uint8x8_t out;
    const uint8x16_t mask = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
    length -= 64;
    do {
        do {
            in0 = vld1q_u8(pSrc); pSrc += 16;
            in1 = vld1q_u8(pSrc); pSrc += 16;
            in2 = vld1q_u8(pSrc); pSrc += 16;
            in3 = vld1q_u8(pSrc); pSrc += 16;
            in0 &= mask;
            in1 &= mask;
            in2 &= mask;
            in3 &= mask;
            in0 = vpaddq_u8(in0, in1);
            in2 = vpaddq_u8(in2, in3);
            in0 = vpaddq_u8(in0, in2);
            out = vpadd_u8(vget_low_u8(in0), vget_high_u8(in0));
            vst1_u8(pDst, out); pDst += 8;
            length -= 64;
        } while (length >= 0);
        pSrc += length;
        pDst += length >> 3;
    } while (length > -64);
}
Neon, however, has VERY user-friendly and efficient permutation and bit operation instructions that allow going "vertical":
void foo_advanced(uint8_t *pDst, uint8_t *pSrc, intptr_t length)
{
    //assert(length >= 128);
    //assert(length % 8 == 0);
    uint8x16x4_t in0, in1;
    uint8x16x2_t row04, row15, row26, row37;
    length -= 128;
    do {
        do {
            in0 = vld4q_u8(pSrc); pSrc += 64;
            in1 = vld4q_u8(pSrc); pSrc += 64;
            row04 = vuzpq_u8(in0.val[0], in1.val[0]);
            row15 = vuzpq_u8(in0.val[1], in1.val[1]);
            row26 = vuzpq_u8(in0.val[2], in1.val[2]);
            row37 = vuzpq_u8(in0.val[3], in1.val[3]);
            row04.val[0] = vsliq_n_u8(row04.val[0], row15.val[0], 1);
            row26.val[0] = vsliq_n_u8(row26.val[0], row37.val[0], 1);
            row04.val[1] = vsliq_n_u8(row04.val[1], row15.val[1], 1);
            row26.val[1] = vsliq_n_u8(row26.val[1], row37.val[1], 1);
            row04.val[0] = vsliq_n_u8(row04.val[0], row26.val[0], 2);
            row04.val[1] = vsliq_n_u8(row04.val[1], row26.val[1], 2);
            row04.val[0] = vsliq_n_u8(row04.val[0], row04.val[1], 4);
            vst1q_u8(pDst, row04.val[0]); pDst += 16;
            length -= 128;
        } while (length >= 0);
        pSrc += length;
        pDst += length >> 3;
    } while (length > -128);
}
The Neon-only advanced version is shorter and faster, but GCC is extremely bad at dealing with Neon-specific permutation instructions such as vtrn, vzip, and vuzp.
https://godbolt.org/z/bGdbohqKe
Clang isn't any better: it spams unnecessary vorr where GCC does the same with vmov.
.syntax unified
.arm
.arch armv7-a
.fpu neon
.global foo_asm
.text
.func
.balign 64
foo_asm:
sub r2, r2, #128
.balign 16
1:
vld4.8 {d16, d18, d20, d22}, [r1]!
vld4.8 {d17, d19, d21, d23}, [r1]!
vld4.8 {d24, d26, d28, d30}, [r1]!
vld4.8 {d25, d27, d29, d31}, [r1]!
subs r2, r2, #128
vuzp.8 q8, q12
vuzp.8 q9, q13
vuzp.8 q10, q14
vuzp.8 q11, q15
vsli.8 q8, q9, #1
vsli.8 q10, q11, #1
vsli.8 q12, q13, #1
vsli.8 q14, q15, #1
vsli.8 q8, q10, #2
vsli.8 q12, q14, #2
vsli.8 q8, q12, #4
vst1.8 {q8}, [r0]!
bpl 1b
add r1, r1, r2
cmp r2, #-128
add r0, r0, r2, asr #3
bgt 1b
.balign 8
bx lr
.endfunc
.end
The innermost loop consists of:
GCC: 32 instructions
Clang: 30 instructions
Asm: 18 instructions
It doesn't take rocket science to figure out which one is the fastest and by how much: Never trust compilers if you are about to do permutations.
Standing on the shoulders of Jake 'Alquimista' LEE, we can improve the unzipping and the algorithm as well by changing the order of the vuzp and vsli operations:
#define interleave_nibbles(top) \
    top.val[0] = vsliq_n_u8(top.val[0], top.val[1], 1); \
    top.val[2] = vsliq_n_u8(top.val[2], top.val[3], 1); \
    top.val[0] = vsliq_n_u8(top.val[0], top.val[2], 2);

void transpose_bits(uint8_t const *src, uint8_t *dst) {
    uint8x16x4_t top = vld4q_u8(src);
    uint8x16x4_t bot = vld4q_u8(src + 64); src += 128;
    interleave_nibbles(top);
    interleave_nibbles(bot);
    // now we have 4 bits correct in each of the 32 bytes left
    // top = 0to3 4to7 8to11 12to15 ...
    // bot = 64to67 68to71 ...
    uint8x16x2_t top_bot = vuzpq_u8(top.val[0], bot.val[0]);
    uint8x16_t result = vsliq_n_u8(top_bot.val[0], top_bot.val[1], 4);
    vst1q_u8(dst, result); dst += 16;
}
The assembly produced by clang now has only two extraneous movs (done via vorr), and the gcc output has four movs.
vld4.8 {d16, d18, d20, d22}, [r0]!
vld4.8 {d17, d19, d21, d23}, [r0]!
vld4.8 {d24, d26, d28, d30}, [r0]!
vsli.8 q10, q11, #1
vorr q0, q8, q8
vld4.8 {d25, d27, d29, d31}, [r0]
vsli.8 q0, q9, #1
vorr q2, q14, q14
vsli.8 q12, q13, #1
vsli.8 q2, q15, #1
vsli.8 q0, q10, #2
vsli.8 q12, q2, #2
vuzp.8 q0, q12
vsli.8 q0, q12, #4
vst1.8 {d0, d1}, [r1]
And the arm64 version looks perfect with only 12 instructions.
ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
ld4 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0]
sli v0.16b, v1.16b, #1
sli v2.16b, v3.16b, #1
sli v0.16b, v2.16b, #2
sli v4.16b, v5.16b, #1
sli v6.16b, v7.16b, #1
sli v4.16b, v6.16b, #2
uzp1 v16.16b, v0.16b, v4.16b
uzp2 v0.16b, v0.16b, v4.16b
sli v16.16b, v0.16b, #4
str q16, [x1]
You can do it more efficiently (especially for short arrays or single vectors) using something like this (in this example, turning one 128-bit register into one 16-bit mask):
// turn mask of bytes in v0 into mask of bits in w0
movmsk: adr x0, 0f // obtain address of literal
ld1r {v1.2d}, [x0] // load 80..01 mask twice into v1
and v0.16b, v0.16b, v1.16b // mask bytes from ff to single bits
mov d1, v0.d[1] // extract high 64 bit
zip1 v0.16b, v0.16b, v1.16b // interleave high and low bytes
addv h0, v0.8h // sum into bit mask
mov w0, v0.s[0] // move result to general register
ret
0: .quad 0x8040201008040201
The idea is to turn the contents of each byte into just one bit at the bit position it's going to end up at and to then sum up the bits using addv (8 bytes at a time, resulting in one byte of output).
Putting a loop around this code to have it traverse the entire array is left as an exercise to the reader.
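If intrinsics are acceptable, a looped version of the same idea might look like the following sketch (my own, AArch64-only since it uses vzip1q_u8 and vaddvq_u16); it assumes length is a multiple of 16 and that each input byte is either 0x00 or 0xff:
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

/* Sketch: convert `length` bytes (0x00 or 0xff each) into length/16 16-bit masks. */
void bytemask_to_bitmask(uint16_t *dst, const uint8_t *src, size_t length)
{
    /* one positional bit per byte; the pattern repeats every 8 bytes */
    const uint8x16_t bits = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
    for (size_t i = 0; i < length; i += 16) {
        uint8x16_t v = vandq_u8(vld1q_u8(src + i), bits);
        /* interleave low and high halves so each 16-bit lane holds (lo_i | hi_i << 8) */
        uint8x16_t z = vzip1q_u8(v, vextq_u8(v, v, 8));
        /* horizontal add of the 8 halfwords yields the 16-bit mask */
        *dst++ = vaddvq_u16(vreinterpretq_u16_u8(z));
    }
}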

Why does gcc, with -O3, unnecessarily clear a local ARM NEON array?

Consider the following code (Compiler Explorer link), compiled under gcc and clang with -O3 optimization:
#include <arm_neon.h>

void bug(int8_t *out, const int8_t *in) {
    for (int i = 0; i < 2; i++) {
        int8x16x4_t x;
        x.val[0] = vld1q_s8(&in[16 * i]);
        x.val[1] = x.val[2] = x.val[3] = vshrq_n_s8(x.val[0], 7);
        vst4q_s8(&out[64 * i], x);
    }
}
NOTE: this is a minimally reproducible version of an issue that is popping up in many different functions of my actual, much more complex code, filled with arithmetic/logical/permutation instructions performing a totally different operation from above. Please refrain from criticizing and/or suggesting different ways of doing what the code above does, unless it has an effect on the code generation issue discussed below.
clang generates sane code:
bug(signed char*, signed char const*): // #bug(signed char*, signed char const*)
ldr q0, [x1]
sshr v1.16b, v0.16b, #7
mov v2.16b, v1.16b
mov v3.16b, v1.16b
st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
ldr q0, [x1, #16]
sshr v1.16b, v0.16b, #7
mov v2.16b, v1.16b
mov v3.16b, v1.16b
st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
ret
As for gcc, it inserts a lot of unnecessary operations, apparently zeroing out the registers that will eventually be input to the st4 instruction:
bug(signed char*, signed char const*):
sub sp, sp, #128
# mov x9, 0
# mov x8, 0
# mov x7, 0
# mov x6, 0
# mov x5, 0
# mov x4, 0
# mov x3, 0
# stp x9, x8, [sp]
# mov x2, 0
# stp x7, x6, [sp, 16]
# stp x5, x4, [sp, 32]
# str x3, [sp, 48]
ldr q0, [x1]
# stp x2, x9, [sp, 56]
# stp x8, x7, [sp, 72]
sshr v4.16b, v0.16b, 7
# str q0, [sp]
# ld1 {v0.16b - v3.16b}, [sp]
# stp x6, x5, [sp, 88]
mov v1.16b, v4.16b
# stp x4, x3, [sp, 104]
mov v2.16b, v4.16b
# str x2, [sp, 120]
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0], 64
### ldr q4, [x1, 16]
### add x1, sp, 64
### str q4, [sp, 64]
sshr v4.16b, v4.16b, 7
### ld1 {v0.16b - v3.16b}, [x1]
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0]
add sp, sp, 128
ret
I manually prefixed with # all instructions that could be safely taken out, without affecting the result of the function.
In addition, the instructions prefixed with ### perform an unnecessary trip to memory and back (and anyway, the mov instructions following ### ld1 ... overwrite 3 out of 4 registers loaded by that ld1 instruction), and could be replaced by a single load straight to v0.16b -- and the sshr instruction in the middle of the block would then use v0.16b as its source register.
As far as I know, x, being a local variable, can be used uninitialized; and even if it couldn't, all the registers are properly initialized, so there's no point in zeroing them out just to immediately overwrite them with values.
I'm inclined to think this is a gcc bug, but before reporting it, I'm curious whether I missed something. Maybe there's a compilation flag, an __attribute__, or something else that I could use to make gcc generate sane code.
Thus, my question: is there anything I can do to generate sane code, or is this a bug I need to report to gcc?
Code generation on a fairly current development version of gcc appears to have improved immensely, at least for this case.
After installing the gcc-snapshot package (dated 20210918), gcc generates the following code:
bug:
ldr q5, [x1]
sshr v4.16b, v5.16b, 7
mov v0.16b, v5.16b
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0], 64
ldr q4, [x1, 16]
mov v0.16b, v4.16b
sshr v4.16b, v4.16b, 7
mov v1.16b, v4.16b
mov v2.16b, v4.16b
mov v3.16b, v4.16b
st4 {v0.16b - v3.16b}, [x0]
ret
Not ideal yet -- at least two mov instructions could be removed per iteration by changing the destination registers of ldr and sshr, but it is considerably better than before.
Short answer: welcome to GCC. Do not bother optimizing anything while you are using it. And Clang isn't better either.
Secret tip: add the ARM and ARM64 components to Visual Studio, and you'd be surprised how well it works. The problem, however, is that it generates COFF binaries, not ELF, and I haven't been able to find a converter.
You can use IDA Pro or dumpbin to generate a disassembly file, and it looks like:
; void __fastcall bug(char *out, const char *in)
EXPORT bug
bug
MOV W10, #0
MOV W9, #0
$LL4 ; CODE XREF: bug+30↓j
ADD X8, X1, W9,SXTW
ADD W9, W9, #0x10
CMP W9, #0x20 ; ' '
LD1 {V0.16B}, [X8]
ADD X8, X0, W10,SXTW
ADD W10, W10, #0x40 ; '#'
SSHR V1.16B, V0.16B, #7
MOV V2.16B, V1.16B
MOV V3.16B, V1.16B
ST4 {V0.16B-V3.16B}, [X8]
B.LT $LL4
RET
; End of function bug
You can copy-paste the disassembly into a GCC assembly file.
And don't bother with reporting the "bug" either. If they were listening, GCC wouldn't be this bad in first place.

Convert function to Arm Neon

I'm a beginner in Arm Neon, and I'm trying to vectorise this loop:
float ans = 0.0;
for (i = 0; i < numdims; i++)
    ans += (pt1[i] - pt2[i]) * (pt1[i] - pt2[i]);
I'm trying to convert this function to Neon with prefetch instructions and loop unrolling:
int iter= numdims/4*4;
float result[3];
float ans=0.0;
asm volatile(
"mov x1, #0\n\t"
"mov x2, %[pt1]\n\t"
"mov x3, %[pt2]\n\t"
"movi v3.4s, #0\n\t"
".loop_neon%=:\n\t"
"prfm PLDL1STRM, [x2, #64]\n\t"
"prfm PLDL1STRM, [x3, #64]\n\t"
"ldr q1, [x2, #16]\n\t"
"ldr q2, [x3, #16]\n\t"
"fsub v4.4s, v1.4s, v2.4s\n\t"
"fmla v3.4s, v4.4s, v4.4s\n\t"
"add x1,x1, #16\n\t"
"cmp x1, %[iter]\n\t"
"b.lt .loop_neon%=\n\t"
"str q3, [%[result]]\n\t"
:
: [iter] "r" (iter),[pt1] "r" (pt1),[pt2] "r" (pt2), [result] "r" (result)
: "x1","x2","x3","memory","v0","v1","v2","v3","v4"
);
ans = result[0] + result[1] + result[2] + result[3];
//final iterations of the loop
for (int i=iter; i<numdims; i++)
ans += (pt1[i]-pt2[i]) * (pt1[i]-pt2[i]);
This code compiles and runs, but the output is not correct.
Short answer: add x1, x1, #4
Your code is far from optimal:
there are lots of pipeline hazards - unroll deeper
you should always count the loop counter down
you should avoid unnecessary memory accesses (result)
you should avoid unnecessary mov operations
Provided iter is a multiple of 16, the code below is suggested:
.func
// extern float sumDiffSquare(float *pA, float *pB, uint32_t length);
// assert(length >= 16);
// assert(length & 15 == 0);
pA .req x0
pB .req x1
length .req x2
sumDiffSquare:
movi v0.16b, #0
.balign 64
1:
ldp q16, q17, [pA], #32
ldp q20, q21, [pB], #32
ldp q18, q19, [pA], #32
ldp q22, q23, [pB], #32
subs length, length, #16
fsub v16.4s, v20.4s, v16.4s
fsub v17.4s, v21.4s, v17.4s
fsub v18.4s, v22.4s, v18.4s
fsub v19.4s, v23.4s, v19.4s
fmla v0.4s, v16.4s, v16.4s
fmla v0.4s, v17.4s, v17.4s
fmla v0.4s, v18.4s, v18.4s
fmla v0.4s, v19.4s, v19.4s
b.gt 1b
faddp v0.4s, v0.4s, v0.4s
faddp v0.2s, v0.2s, v0.2s
ret
.endfunc
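For reference, an intrinsics counterpart of the routine above could look like the sketch below (my own illustration, not part of the answer). It keeps four independent accumulators so the multiply-accumulates do not form a single dependency chain, and assumes numdims is a multiple of 16; the tail can be handled with the scalar loop from the question.
#include <arm_neon.h>
#include <stdint.h>

/* Sketch: sum of squared differences over numdims floats (numdims % 16 == 0). */
float sum_diff_square(const float *pt1, const float *pt2, uint32_t numdims)
{
    float32x4_t acc0 = vdupq_n_f32(0.0f);
    float32x4_t acc1 = vdupq_n_f32(0.0f);
    float32x4_t acc2 = vdupq_n_f32(0.0f);
    float32x4_t acc3 = vdupq_n_f32(0.0f);
    for (uint32_t i = 0; i < numdims; i += 16) {
        float32x4_t d0 = vsubq_f32(vld1q_f32(pt1 + i),      vld1q_f32(pt2 + i));
        float32x4_t d1 = vsubq_f32(vld1q_f32(pt1 + i + 4),  vld1q_f32(pt2 + i + 4));
        float32x4_t d2 = vsubq_f32(vld1q_f32(pt1 + i + 8),  vld1q_f32(pt2 + i + 8));
        float32x4_t d3 = vsubq_f32(vld1q_f32(pt1 + i + 12), vld1q_f32(pt2 + i + 12));
        acc0 = vfmaq_f32(acc0, d0, d0);   /* four accumulators keep the fmla   */
        acc1 = vfmaq_f32(acc1, d1, d1);   /* operations independent of each    */
        acc2 = vfmaq_f32(acc2, d2, d2);   /* other instead of forming a chain  */
        acc3 = vfmaq_f32(acc3, d3, d3);
    }
    float32x4_t acc = vaddq_f32(vaddq_f32(acc0, acc1), vaddq_f32(acc2, acc3));
    return vaddvq_f32(acc);   /* horizontal sum (AArch64) */
}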

Armv8a NEON inline asm code: How to convert 16x8bit vector to four 4x32bit (integer) vectors?

I need to load an 8-bit array and then convert every element to a 32-bit integer using armv8a neon inline asm code. I have done it with armv7 but have no idea how to do it in v8a...
The code I used in v7 is
"pld [%1, #128] \n"
"vld1.u8 {d0,d1}, [%1]! \n"
"vmovl.u8 q8, d0 \n"
"vmovl.u8 q9, d1 \n"
"vmovl.u16 q0, d16 \n"
"vmovl.u16 q1, d17 \n"
"vmovl.u16 q2, d18 \n"
"vmovl.u16 q3, d19 \n"
How can I finish this using armv8a neon code? Or how can I convert the code above to armv8a? PS: In my case, I only need inline asm, not intrinsics...
Thanks for the help.
For unsigned elements, USHLL and USHLL2 with a shift amount of 0 will do the job.
ld1 {v0.16b}, [%1], #16
USHLL v16.8h, v0.8b, #0
USHLL2 v17.8h, v0.16b, #0
USHLL v0.4s, v16.4h, #0
USHLL2 v1.4s, v16.8h, #0
USHLL v2.4s, v17.4h, #0
USHLL2 v3.4s, v17.8h, #0
For signed elements - you guessed it - use SSHLL and SSHLL2 instead.
Similarly, there is no direct equivalent to VMOVN on aarch64 either.
--EDIT
There are XTN/XTN2 instructions that work exactly like VMOVN, on the other hand.
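Although the question asks for inline assembly only, the intrinsic equivalent is worth noting for comparison (my own sketch): vmovl_u8 and vmovl_u16 compile to exactly these USHLL/UXTL widening moves on AArch64.
#include <arm_neon.h>
#include <stdint.h>

/* Sketch: widen 16 unsigned bytes to 16 unsigned 32-bit integers. */
void widen_u8_to_u32(uint32_t *dst, const uint8_t *src)
{
    uint8x16_t v = vld1q_u8(src);
    uint16x8_t lo = vmovl_u8(vget_low_u8(v));    /* elements 0..7  -> 16 bit */
    uint16x8_t hi = vmovl_u8(vget_high_u8(v));   /* elements 8..15 -> 16 bit */
    vst1q_u32(dst + 0,  vmovl_u16(vget_low_u16(lo)));    /* elements 0..3   */
    vst1q_u32(dst + 4,  vmovl_u16(vget_high_u16(lo)));   /* elements 4..7   */
    vst1q_u32(dst + 8,  vmovl_u16(vget_low_u16(hi)));    /* elements 8..11  */
    vst1q_u32(dst + 12, vmovl_u16(vget_high_u16(hi)));   /* elements 12..15 */
}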

What is the most efficient way to reorder a contiguous strided pixel array?

I am working on a highly performance-critical image processing pipeline on a Jetson TX2 (with an ARM processor), which involves reading a set of images and then performing deep learning based object detection through Darknet. Darknet, written in C, has its own representation of how images are stored, which is different from how OpenCV's IplImage or a Python numpy array would store the images.
In my application, I am required to interface with Darknet through Python. So, as of now, I am reading a 'batch' of images (usually 16) into a numpy array and then passing it to Darknet as a contiguous array using ctypes. Within Darknet, I then have to rearrange the ordering of the pixels to go from the numpy format to Darknet's format.
While the input array is one contiguous block arranged column-wise, then row-wise, then channel-wise, and then by image, the Darknet format needs to be arranged by channel first, then by column, then by row: and contains one row per image in the batch instead of a contiguous block. The picture below tries to demonstrate the difference. In this example, I assume a single ixj image. (0,0), (0,1) etc. indicate (row, col), whereas in the top, C0, C1, C2.. etc indicate the column in the corresponding row. Note that in the case of multiple images as part of a batch, the input format arranges them sequentially one after the other, but Darknet needs them to be on separate rows: each row containing data from only one image.
As of now, my code in C that converts the input array to the Darknet format looks like this, where it iteratively hits every pixel in every channel and puts it in a different place, while also normalizing the pixels along the way.
matrix ndarray_to_matrix(unsigned char* src, long* shape, long* strides)
{
    int nb = shape[0];    // Batch size
    int h  = shape[1];    // Height of each image
    int w  = shape[2];    // Width of each image
    int c  = shape[3];    // No. of channels in each image
    matrix X = make_matrix(nb, h*w*c);    // Output array format: 2D
    int step_b = strides[0];
    int step_h = strides[1];
    int step_w = strides[2];
    int step_c = strides[3];
    int b, i, j, k;
    int index1, index2 = 0;
    for (b = 0; b < nb; ++b) {
        for (i = 0; i < h; ++i) {
            for (k = 0; k < c; ++k) {
                for (j = 0; j < w; ++j) {
                    index1 = k*w*h + i*w + j;
                    index2 = step_b*b + step_h*i + step_w*j + step_c*k;
                    X.vals[b][index1] = src[index2] / 255.;
                }
            }
        }
    }
    return X;
}
Is there a more efficient way of doing this rearranging and normalization in C?
I am using the Jetson TX2: which contains an ARM processor and an NVIDIA GPU, thus having access to NEON and CUDA as well as OpenMP.
The image dimensions are fixed and can be hardcoded: only the batch size can change.
The function below will be almost as fast as memcpy:
/*
* Created on: 2018. 5. 5.
* Author: Jake 'Alquimista' Lee
*/
.arch armv8-a
.text
.global alquimista_ndarray_to_matrix
// void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
pDst .req x0
pRed .req x1
pGrn .req x2
pBlu .req x3
count .req w4
.balign 64
.func
alquimista_ndarray_to_matrix:
mov x16, #(640*360) & 0xffff
str q8, [sp, #-16]!
movk x16, #((640*360)>>16), lsl #16
mov count, #(640*360)/128
add pGrn, pRed, x16
add pBlu, pRed, x16, lsl #1
b 1f
.balign 64
1:
ldp q0, q3, [pRed], #32
ldp q1, q4, [pGrn], #32
ldp q2, q5, [pBlu], #32
ldp q6, q16, [pRed], #32
ldp q7, q17, [pGrn], #32
ldp q8, q18, [pBlu], #32
ldp q19, q22, [pRed], #32
ldp q20, q23, [pGrn], #32
ldp q21, q24, [pBlu], #32
ldp q25, q28, [pRed], #32
ldp q26, q29, [pGrn], #32
ldp q27, q30, [pBlu], #32
subs count, count, #1
st3 {v0.16b, v1.16b, v2.16b}, [pDst], #48
st3 {v3.16b, v4.16b, v5.16b}, [pDst], #48
st3 {v6.16b, v7.16b, v8.16b}, [pDst], #48
st3 {v16.16b, v17.16b, v18.16b}, [pDst], #48
st3 {v19.16b, v20.16b, v21.16b}, [pDst], #48
st3 {v22.16b, v23.16b, v24.16b}, [pDst], #48
st3 {v25.16b, v26.16b, v27.16b}, [pDst], #48
st3 {v28.16b, v29.16b, v30.16b}, [pDst], #48
b.gt 1b
.balign 8
ldr q8, [sp], #16
ret
.endfunc
.end
For maximum performance and minimum power consumption, you might want to align the source pointer to 32 bytes and the destination to 16 bytes.
The function prototype is:
void alquimista_ndarray_to_matrix(uint8_t * pDst, uint8_t *pSrc);
Below is the function that does the conversion to float on the fly.
And I added the batch number as a parameter so that you don't have to do a function call for every image.
/*
* Created on: 2018. 5. 5.
* Copyright: Jake 'Alquimista' Lee. All rights reserved
*/
.arch armv8-a
.text
.global alquimista_ndarray_to_matrix_float
// void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);
pDst .req x0
pRed .req x1
batch .req w2
pGrn .req x3
pBlu .req x4
stride .req x5
count .req w7
.balign 64
.func
alquimista_ndarray_to_matrix_float:
mov stride, #((640*360)<<1) & 0xffff
stp q8, q15, [sp, #-32]!
movk stride, #((640*360)>>15), lsl #16
mov count, #(640*360)/32
add pGrn, pRed, stride, lsr #1
add pBlu, pRed, stride
b 1f
.balign 64
1:
ldp q0, q1, [pRed], #32
ldp q2, q3, [pGrn], #32
ldp q4, q5, [pBlu], #32
subs count, count, #1
ushll v20.8h, v0.8b, #7
ushll2 v23.8h, v0.16b, #7
ushll v26.8h, v1.8b, #7
ushll2 v29.8h, v1.16b, #7
ushll v21.8h, v2.8b, #7
ushll2 v24.8h, v2.16b, #7
ushll v27.8h, v3.8b, #7
ushll2 v30.8h, v3.16b, #7
ushll v22.8h, v4.8b, #7
ushll2 v25.8h, v4.16b, #7
ushll v28.8h, v5.8b, #7
ushll2 v31.8h, v5.16b, #7
ursra v20.8h, v20.8h, #8
ursra v21.8h, v21.8h, #8
ursra v22.8h, v22.8h, #8
ursra v23.8h, v23.8h, #8
ursra v24.8h, v24.8h, #8
ursra v25.8h, v25.8h, #8
ursra v26.8h, v26.8h, #8
ursra v27.8h, v27.8h, #8
ursra v28.8h, v28.8h, #8
ursra v29.8h, v29.8h, #8
ursra v30.8h, v30.8h, #8
ursra v31.8h, v31.8h, #8
uxtl v0.4s, v20.4h
uxtl v1.4s, v21.4h
uxtl v2.4s, v22.4h
uxtl2 v3.4s, v20.8h
uxtl2 v4.4s, v21.8h
uxtl2 v5.4s, v22.8h
uxtl v6.4s, v23.4h
uxtl v7.4s, v24.4h
uxtl v8.4s, v25.4h
uxtl2 v15.4s, v23.8h
uxtl2 v16.4s, v24.8h
uxtl2 v17.4s, v25.8h
uxtl v18.4s, v26.4h
uxtl v19.4s, v27.4h
uxtl v20.4s, v28.4h
uxtl2 v21.4s, v26.8h
uxtl2 v22.4s, v27.8h
uxtl2 v23.4s, v28.8h
uxtl v24.4s, v29.4h
uxtl v25.4s, v30.4h
uxtl v26.4s, v31.4h
uxtl2 v27.4s, v29.8h
uxtl2 v28.4s, v30.8h
uxtl2 v29.4s, v31.8h
ucvtf v0.4s, v0.4s, #15
ucvtf v1.4s, v1.4s, #15
ucvtf v2.4s, v2.4s, #15
ucvtf v3.4s, v3.4s, #15
ucvtf v4.4s, v4.4s, #15
ucvtf v5.4s, v5.4s, #15
ucvtf v6.4s, v6.4s, #15
ucvtf v7.4s, v7.4s, #15
ucvtf v8.4s, v8.4s, #15
ucvtf v15.4s, v15.4s, #15
ucvtf v16.4s, v16.4s, #15
ucvtf v17.4s, v17.4s, #15
ucvtf v18.4s, v18.4s, #15
ucvtf v19.4s, v19.4s, #15
ucvtf v20.4s, v20.4s, #15
ucvtf v21.4s, v21.4s, #15
ucvtf v22.4s, v22.4s, #15
ucvtf v23.4s, v23.4s, #15
ucvtf v24.4s, v24.4s, #15
ucvtf v25.4s, v25.4s, #15
ucvtf v26.4s, v26.4s, #15
ucvtf v27.4s, v27.4s, #15
ucvtf v28.4s, v28.4s, #15
ucvtf v29.4s, v29.4s, #15
st3 {v0.4s - v2.4s}, [pDst], #48
st3 {v3.4s - v5.4s}, [pDst], #48
st3 {v6.4s - v8.4s}, [pDst], #48
st3 {v15.4s - v17.4s}, [pDst], #48
st3 {v18.4s - v20.4s}, [pDst], #48
st3 {v21.4s - v23.4s}, [pDst], #48
st3 {v24.4s - v26.4s}, [pDst], #48
st3 {v27.4s - v29.4s}, [pDst], #48
b.gt 1b
add pRed, pRed, stride
add pGrn, pGrn, stride
add pBlu, pBlu, stride
subs batch, batch, #1
mov count, #(640*360)/32
b.gt 1b
.balign 8
ldp q8, q15, [sp], #32
ret
.endfunc
.end
It's quite a long one, and it will take considerably longer than the uint8 one above.
Please note that it will scale extremely well to multi-core execution.
The function prototype is:
void alquimista_ndarray_to_matrix_float(float *pDst, uint8_t *pSrc, uint32_t batch);
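As a footnote to the float version above, the normalisation trick it uses (ushll #7, ursra #8, ucvtf #15) approximates x/255 as (x*128 + round(x*128/256)) / 32768, so 255 maps exactly to 1.0. Below is a small intrinsics sketch of the same idea (my own illustration, not the answerer's code); it handles 8 pixels per call and assumes three separate R/G/B planes, as in the assembly.
#include <arm_neon.h>
#include <stdint.h>

/* Sketch: convert 8 pixels from planar R/G/B bytes to interleaved, normalised floats. */
static void rgb_planes_to_float8(float *dst, const uint8_t *r, const uint8_t *g, const uint8_t *b)
{
    /* widen to 16 bits while multiplying by 128 */
    uint16x8_t r16 = vshll_n_u8(vld1_u8(r), 7);
    uint16x8_t g16 = vshll_n_u8(vld1_u8(g), 7);
    uint16x8_t b16 = vshll_n_u8(vld1_u8(b), 7);
    /* rounding-add x*128/256, so that 255 maps exactly to 32768 */
    r16 = vrsraq_n_u16(r16, r16, 8);
    g16 = vrsraq_n_u16(g16, g16, 8);
    b16 = vrsraq_n_u16(b16, b16, 8);

    float32x4x3_t lo, hi;
    /* widen to 32 bits and convert as fixed point with 15 fractional bits (divide by 32768) */
    lo.val[0] = vcvtq_n_f32_u32(vmovl_u16(vget_low_u16(r16)), 15);
    lo.val[1] = vcvtq_n_f32_u32(vmovl_u16(vget_low_u16(g16)), 15);
    lo.val[2] = vcvtq_n_f32_u32(vmovl_u16(vget_low_u16(b16)), 15);
    hi.val[0] = vcvtq_n_f32_u32(vmovl_u16(vget_high_u16(r16)), 15);
    hi.val[1] = vcvtq_n_f32_u32(vmovl_u16(vget_high_u16(g16)), 15);
    hi.val[2] = vcvtq_n_f32_u32(vmovl_u16(vget_high_u16(b16)), 15);

    vst3q_f32(dst,      lo);   /* pixels 0..3, interleaved R,G,B */
    vst3q_f32(dst + 12, hi);   /* pixels 4..7, interleaved R,G,B */
}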
