Related
I have a program that needs to run a transpose operation on 8x8 float32 matrices many times. I want to transpose these using NEON SIMD intrinsics. I know that the array will always contain 8x8 float elements. I have a baseline non-intrinsic solution below:
/* Scalar 8x8 transpose: copy element (row, col) of the source into
 * element (col, row) of the destination. Buffers must not overlap. */
void transpose(float *matrix, float *matrixT) {
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 8; col++) {
            matrixT[col * 8 + row] = matrix[row * 8 + col];
        }
    }
}
I also created an intrinsic solution that transposes each 4x4 quadrant of the 8x8 matrix, and swaps the positions of the second and third quadrants. This solution looks like this:
// Transpose one 4x4 block of a row-major 8x8 float matrix.
//   matrix:      points at the top-left element of the source 4x4 block;
//                rows of the enclosing matrix are 8 floats apart, hence
//                the +8/+16/+24 load offsets.
//   matrixT:     base pointer of the destination 8x8 matrix.
//   store_index: element offset of the destination block's top-left corner.
void transpose_4x4(float *matrix, float *matrixT, int store_index) {
float32x4_t r0, r1, r2, r3, c0, c1, c2, c3;
// Load the four rows of the source block (stride = one full matrix row).
r0 = vld1q_f32(matrix);
r1 = vld1q_f32(matrix + 8);
r2 = vld1q_f32(matrix + 16);
r3 = vld1q_f32(matrix + 24);
// Interleave adjacent rows: c0 = [r0.0 r1.0 r0.1 r1.1], c1 = [r0.2 r1.2 r0.3 r1.3],
// and likewise c2/c3 for rows 2-3. This completes the 2x2-block transpose step.
c0 = vzip1q_f32(r0, r1);
c1 = vzip2q_f32(r0, r1);
c2 = vzip1q_f32(r2, r3);
c3 = vzip2q_f32(r2, r3);
// Recombine 64-bit halves to form the transposed rows, e.g.
// r0 = [r0.0 r1.0 r2.0 r3.0] = column 0 of the source block.
// NOTE(review): vget_low/high + vcombine round-trips through 64-bit halves;
// trn1/trn2 on f64-reinterpreted vectors would express the same shuffle
// with fewer intrinsic calls - verify against generated code if it matters.
r0 = vcombine_f32(vget_low_f32(c0), vget_low_f32(c2));
r1 = vcombine_f32(vget_high_f32(c0), vget_high_f32(c2));
r2 = vcombine_f32(vget_low_f32(c1), vget_low_f32(c3));
r3 = vcombine_f32(vget_high_f32(c1), vget_high_f32(c3));
// Store the four transposed rows into the destination block (stride 8 floats).
vst1q_f32(matrixT + store_index, r0);
vst1q_f32(matrixT + store_index + 8, r1);
vst1q_f32(matrixT + store_index + 16, r2);
vst1q_f32(matrixT + store_index + 24, r3);
}
// 8x8 transpose built from four 4x4 block transposes: the two diagonal
// quadrants stay in place, the two off-diagonal quadrants swap positions.
void transpose(float *matrix, float *matrixT) {
    static const int src_off[4] = {0, 4, 32, 36};
    static const int dst_off[4] = {0, 32, 4, 36};
    for (int q = 0; q < 4; q++) {
        transpose_4x4(matrix + src_off[q], matrixT, dst_off[q]);
    }
}
This solution however, results in a slower performance than the baseline non-intrinsic solution. I am struggling to see, if there is one, a faster solution that can transpose my 8x8 matrix. Any help would be greatly appreciated!
Edit: both solutions are compiled using the -O1 flag.
First off, you shouldn't expect a huge performance boost to start with:
there is actually no computation
you are dealing with 32-bit data, and thus there isn't much of a bandwidth constraint.
to sum it up, just a little bit saving in bandwidth by vectorizing - that's all
As for the 4x4 transpose, you don't even need a separate function, but just a macro:
#define TRANSPOSE4x4(pSrc,pDst) vst1q_f32_x4(pDst,vld4q_f32(pSrc))
will do the job since NEON does the 4x4 transpose on the fly when you load the data with vld4.
But you should ask yourself at this point if your approach - transposing all the matrices prior to the actual computation - is the right one if a 4x4 transpose costs virtually nothing. This step could end up being a pure waste of computation and bandwidth. Optimization shouldn't be limited to the final step, but should be considered from the designing phase.
8x8 transpose is a different animal though:
/*
 * 8x8 float transpose in two NEON permutation stages.
 *   Stage 1: 32-bit trn1/trn2 transposes each 2x2 block of floats.
 *   Stage 2: 64-bit trn1/trn2 transposes each 2x2 block of float PAIRS.
 * Each logical 8-float row is held as two float32x4_t halves:
 * "a" = columns 0-3, "b" = columns 4-7.
 *
 * pDst: destination 8x8 matrix (row-major, 64 floats), must not alias pSrc.
 * pSrc: source 8x8 matrix (row-major, 64 floats).
 */
void transpose8x8(float *pDst, float *pSrc)
{
    float32x4_t row0a, row0b, row1a, row1b, row2a, row2b, row3a, row3b, row4a, row4b, row5a, row5b, row6a, row6b, row7a, row7b;
    float32x4_t r0a, r0b, r1a, r1b, r2a, r2b, r3a, r3b, r4a, r4b, r5a, r5b, r6a, r6b, r7a, r7b;

    row0a = vld1q_f32(pSrc + 0);
    row0b = vld1q_f32(pSrc + 4);
    row1a = vld1q_f32(pSrc + 8);
    row1b = vld1q_f32(pSrc + 12);
    row2a = vld1q_f32(pSrc + 16);
    row2b = vld1q_f32(pSrc + 20);
    row3a = vld1q_f32(pSrc + 24);
    row3b = vld1q_f32(pSrc + 28);
    row4a = vld1q_f32(pSrc + 32);
    row4b = vld1q_f32(pSrc + 36);
    row5a = vld1q_f32(pSrc + 40);
    row5b = vld1q_f32(pSrc + 44);
    row6a = vld1q_f32(pSrc + 48);
    row6b = vld1q_f32(pSrc + 52);
    row7a = vld1q_f32(pSrc + 56);
    row7b = vld1q_f32(pSrc + 60);

    /* Stage 1: e.g. r0a = [A00 A10 A02 A12], r1a = [A01 A11 A03 A13]. */
    r0a = vtrn1q_f32(row0a, row1a);
    r0b = vtrn1q_f32(row0b, row1b);
    r1a = vtrn2q_f32(row0a, row1a);
    r1b = vtrn2q_f32(row0b, row1b);
    r2a = vtrn1q_f32(row2a, row3a);
    r2b = vtrn1q_f32(row2b, row3b);
    r3a = vtrn2q_f32(row2a, row3a);
    r3b = vtrn2q_f32(row2b, row3b);
    r4a = vtrn1q_f32(row4a, row5a);
    r4b = vtrn1q_f32(row4b, row5b);
    r5a = vtrn2q_f32(row4a, row5a);
    r5b = vtrn2q_f32(row4b, row5b);
    r6a = vtrn1q_f32(row6a, row7a);
    r6b = vtrn1q_f32(row6b, row7b);
    r7a = vtrn2q_f32(row6a, row7a);
    r7b = vtrn2q_f32(row6b, row7b);

    /* vtrn1q_f64/vtrn2q_f64 operate on float64x2_t, so the float32x4_t data
     * must be moved through vreinterpretq casts (bit-pattern reinterpretation,
     * no instructions emitted). */
#define TRN1_64(a, b) vreinterpretq_f32_f64(vtrn1q_f64(vreinterpretq_f64_f32(a), vreinterpretq_f64_f32(b)))
#define TRN2_64(a, b) vreinterpretq_f32_f64(vtrn2q_f64(vreinterpretq_f64_f32(a), vreinterpretq_f64_f32(b)))
    /* Stage 2 consumes the stage-1 results r*. (The original code re-read the
     * untransposed row* values - leaving every r* unused - and also
     * overwrote row0a/row1a/... before their last use, so it produced a
     * wrong result and used non-existent f64 trn overloads on f32 vectors.) */
    row0a = TRN1_64(r0a, r2a); /* output row 0, cols 0-3 = [A00 A10 A20 A30] */
    row0b = TRN1_64(r0b, r2b); /* output row 4, cols 0-3 */
    row1a = TRN1_64(r1a, r3a); /* output row 1, cols 0-3 */
    row1b = TRN1_64(r1b, r3b); /* output row 5, cols 0-3 */
    row2a = TRN2_64(r0a, r2a); /* output row 2, cols 0-3 */
    row2b = TRN2_64(r0b, r2b); /* output row 6, cols 0-3 */
    row3a = TRN2_64(r1a, r3a); /* output row 3, cols 0-3 */
    row3b = TRN2_64(r1b, r3b); /* output row 7, cols 0-3 */
    row4a = TRN1_64(r4a, r6a); /* output row 0, cols 4-7 = [A40 A50 A60 A70] */
    row4b = TRN1_64(r4b, r6b); /* output row 4, cols 4-7 */
    row5a = TRN1_64(r5a, r7a); /* output row 1, cols 4-7 */
    row5b = TRN1_64(r5b, r7b); /* output row 5, cols 4-7 */
    row6a = TRN2_64(r4a, r6a); /* output row 2, cols 4-7 */
    row6b = TRN2_64(r4b, r6b); /* output row 6, cols 4-7 */
    row7a = TRN2_64(r5a, r7a); /* output row 3, cols 4-7 */
    row7b = TRN2_64(r5b, r7b); /* output row 7, cols 4-7 */
#undef TRN1_64
#undef TRN2_64

    /* Store output rows 0..7; each row is an (a-half, b-half) pair. */
    vst1q_f32(pDst + 0,  row0a);
    vst1q_f32(pDst + 4,  row4a);
    vst1q_f32(pDst + 8,  row1a);
    vst1q_f32(pDst + 12, row5a);
    vst1q_f32(pDst + 16, row2a);
    vst1q_f32(pDst + 20, row6a);
    vst1q_f32(pDst + 24, row3a);
    vst1q_f32(pDst + 28, row7a);
    vst1q_f32(pDst + 32, row0b);
    vst1q_f32(pDst + 36, row4b);
    vst1q_f32(pDst + 40, row1b);
    vst1q_f32(pDst + 44, row5b);
    vst1q_f32(pDst + 48, row2b);
    vst1q_f32(pDst + 52, row6b);
    vst1q_f32(pDst + 56, row3b);
    vst1q_f32(pDst + 60, row7b);
}
It boils down to: 16 loads + 32 trns + 16 stores vs. 64 loads + 64 stores.
Now we can clearly see it really isn't worth it. The neon routine above might be a little faster, but I doubt it will make a difference in the end.
No, you can't optimize it any further. Nobody can. Just make sure the pointers are 64byte aligned, test it, and decide for yourself.
// AArch64 NEON 8x8 float transpose: x0 = pDst, x1 = pSrc.
// Stage 1 (trn1/trn2 .4s) transposes 2x2 blocks of floats;
// stage 2 (trn1/trn2 .2d) transposes 2x2 blocks of float PAIRS.
// Each logical 8-float row occupies two q registers (cols 0-3, cols 4-7).
ld1 {v0.4s-v3.4s}, [x1], #64
ld1 {v4.4s-v7.4s}, [x1], #64
ld1 {v16.4s-v19.4s}, [x1], #64
ld1 {v20.4s-v23.4s}, [x1]
trn1 v24.4s, v0.4s, v2.4s // row0
trn1 v25.4s, v1.4s, v3.4s
trn2 v26.4s, v0.4s, v2.4s // row1
trn2 v27.4s, v1.4s, v3.4s
trn1 v28.4s, v4.4s, v6.4s // row2
trn1 v29.4s, v5.4s, v7.4s
trn2 v30.4s, v4.4s, v6.4s // row3
trn2 v31.4s, v5.4s, v7.4s
trn1 v0.4s, v16.4s, v18.4s // row4
trn1 v1.4s, v17.4s, v19.4s
trn2 v2.4s, v16.4s, v18.4s // row5
trn2 v3.4s, v17.4s, v19.4s
trn1 v4.4s, v20.4s, v22.4s // row6
trn1 v5.4s, v21.4s, v23.4s
trn2 v6.4s, v20.4s, v22.4s // row7
trn2 v7.4s, v21.4s, v23.4s
trn1 v16.2d, v24.2d, v28.2d // row0a
trn1 v17.2d, v0.2d, v4.2d // row0b
trn1 v18.2d, v26.2d, v30.2d // row1a
trn1 v19.2d, v2.2d, v6.2d // row1b
trn2 v20.2d, v24.2d, v28.2d // row2a
trn2 v21.2d, v0.2d, v4.2d // row2b
trn2 v22.2d, v26.2d, v30.2d // row3a
trn2 v23.2d, v2.2d, v6.2d // row3b
st1 {v16.4s-v19.4s}, [x0], #64
st1 {v20.4s-v23.4s}, [x0], #64
trn1 v16.2d, v25.2d, v29.2d // row4a
trn1 v17.2d, v1.2d, v5.2d // row4b
trn1 v18.2d, v27.2d, v31.2d // row5a
trn1 v19.2d, v3.2d, v7.2d // row5b
trn2 v20.2d, v25.2d, v29.2d // row6a (comment fixed: trn2 yields row 6, not row 4)
trn2 v21.2d, v1.2d, v5.2d // row6b (comment fixed)
trn2 v22.2d, v27.2d, v31.2d // row7a (comment fixed: trn2 yields row 7, not row 5)
trn2 v23.2d, v3.2d, v7.2d // row7b (comment fixed)
st1 {v16.4s-v19.4s}, [x0], #64
st1 {v20.4s-v23.4s}, [x0]
ret
Above is the hand-optimized assembly version - most probably as short as it can get - but not meaningfully faster than:
Below is the pure C version that I'd settle with:
/* Scalar 8x8 transpose, scatter form: each source row is written into
 * destination slots 8 elements apart (one per destination row). */
void transpose8x8(float *pDst, float *pSrc)
{
    for (int col = 0; col < 8; col++) {
        for (int row = 0; row < 8; row++) {
            pDst[col + 8 * row] = pSrc[8 * col + row];
        }
    }
}
or
/* Scalar 8x8 transpose, gather form: each destination row is filled from
 * one source column (source elements 8 apart). */
void transpose8x8(float *pDst, float *pSrc)
{
    for (int row = 0; row < 8; row++) {
        for (int col = 0; col < 8; col++) {
            *pDst++ = pSrc[col * 8 + row];
        }
    }
}
PS: It could bring some gain in performance/power consumption if you declared pDst and pSrc as uint32_t *, because the compiler would then generate pure integer machine code, which has the most varied addressing modes, and would use only w registers instead of s ones. Just typecast float * to uint32_t * (note that this kind of type pun formally violates C's strict-aliasing rule; a memcpy-based copy is the portable alternative).
PS2: Clang already utilizes w registers instead of s ones while GCC is being GCC.... When will GNU-shills finally admit the fact that GCC is an extremely bad choice for ARM?
godbolt
PS3: Below is the non-neon version in assembly (zero latency) since I was very disappointed (even shocked) in both Clang and GCC above:
.arch armv8-a
.global transpose8x8
.text
.balign 64
.func
// void transpose8x8(float *pDst /* x0 */, float *pSrc /* x1 */)
// Scalar AArch64 version: each iteration gathers one source COLUMN
// (elements 32 bytes = one 8-float row apart) and stores it as one
// contiguous destination row. Integer w registers are used - a float is
// just 4 bytes to move, no FP pipeline involvement needed.
transpose8x8:
mov w10, #8
// Bias pDst by -8 so the final "stp ... [x0, #32]!" writeback advances
// x0 by exactly one 32-byte output row per iteration while the four stp
// offsets (#8/#16/#24/#32) still cover the row contiguously.
sub x0, x0, #8
.balign 16
1:
ldr w2, [x1, #0]
ldr w3, [x1, #32]
ldr w4, [x1, #64]
ldr w5, [x1, #96]
ldr w6, [x1, #128]
ldr w7, [x1, #160]
ldr w8, [x1, #192]
ldr w9, [x1, #224]
// Loop bookkeeping is interleaved with the stores to hide latency.
subs w10, w10, #1
stp w2, w3, [x0, #8]
// Advance to the next source column (4 bytes).
add x1, x1, #4
stp w4, w5, [x0, #16]
stp w6, w7, [x0, #24]
stp w8, w9, [x0, #32]!
b.ne 1b
.balign 16
ret
.endfunc
.end
It's arguably the best version you will ever get if you still insist on doing pure 8x8 transpose. It might be a little slower than the neon assembly version, but consume considerably less power.
It's possible to optimise the 8x8 neon code presented in the other answer; 8x8 transpose can be not only thought of as recursive version of [A B;C D]' == [A' C'; B' D'] but also as repeated application of zip or unzip.
a b c d
e f g h
i j k l
m n o p == a b c d e f g h i j k l m n o p
zip(first_half, last_half) ==
zip(...) == a i b j c k d l e m f n g o h p
zip(...) == a e i m b f j n c g k o d h l p == transpose
For an 8x8 matrix we need to apply this algorithm 3 times, and by reading the data with vld4, two of those passes have already been done.
// Passes 1+2 of the 3-pass zip decomposition: vld4 de-interleaves by 4
// while loading, which is equivalent to two of the three zip passes of
// the 8x8 transpose.
float32x4x4_t d0 = vld4q_f32(input);
float32x4x4_t d1 = vld4q_f32(input + 16);
float32x4x4_t d2 = vld4q_f32(input + 32);
float32x4x4_t d3 = vld4q_f32(input + 48);
// Pass 3: zip corresponding vectors of the two matrix halves.
// e0/e1 take the low zip results (.val[0]), e2/e3 the high ones (.val[1]);
// the textually duplicated vzipq_f32 calls below should be folded into
// single zip1/zip2 instructions by the compiler.
float32x4x4_t e0 = {
vzipq_f32(d0.val[0], d2.val[0]).val[0],
vzipq_f32(d0.val[1], d2.val[1]).val[0],
vzipq_f32(d0.val[2], d2.val[2]).val[0],
vzipq_f32(d0.val[3], d2.val[3]).val[0]
};
float32x4x4_t e1 = {
vzipq_f32(d1.val[0], d3.val[0]).val[0],
vzipq_f32(d1.val[1], d3.val[1]).val[0],
vzipq_f32(d1.val[2], d3.val[2]).val[0],
vzipq_f32(d1.val[3], d3.val[3]).val[0]
};
float32x4x4_t e2 = {
vzipq_f32(d0.val[0], d2.val[0]).val[1],
vzipq_f32(d0.val[1], d2.val[1]).val[1],
vzipq_f32(d0.val[2], d2.val[2]).val[1],
vzipq_f32(d0.val[3], d2.val[3]).val[1]
};
float32x4x4_t e3 = {
vzipq_f32(d1.val[0], d3.val[0]).val[1],
vzipq_f32(d1.val[1], d3.val[1]).val[1],
vzipq_f32(d1.val[2], d3.val[2]).val[1],
vzipq_f32(d1.val[3], d3.val[3]).val[1]
};
// Multi-register stores write the 16 transposed vectors back contiguously.
vst1q_f32_x4(output, e0);
vst1q_f32_x4(output + 16, e1);
vst1q_f32_x4(output + 32, e2);
vst1q_f32_x4(output + 48, e3);
One should be able to perform the transpose also by starting with vld1q_f32_x4, then uzpq and finish with vst4q_f32.
This question already has answers here:
ARM Assembler Regular Bubble Sort
(1 answer)
RAM store binary numbers and bubble sort in assembly language
(1 answer)
Closed 1 year ago.
I am attempting to write a program to search for the median of an array after first sorting the array from smallest integer to largest integer. I am referencing a C program for finding the median of an array to translate it into the ARM assembly equivalent. I am having trouble figuring out how to swap a[j] with a[j+1] if a[j] > a[j+1].
Here is the C program:
/* Exchange the two integers pointed to by p and q. */
void swap(int *p,int *q) {
    int tmp = *p;
    *p = *q;
    *q = tmp;
}

/* Bubble sort a[0..n-1] ascending. After pass number `pass` the largest
 * remaining element has bubbled to position n-1-pass, so each pass scans
 * one element fewer than the previous one. */
void sort(int a[],int n) {
    for (int pass = 0; pass < n - 1; pass++) {
        for (int pos = 0; pos + 1 < n - pass; pos++) {
            if (a[pos] > a[pos + 1]) {
                swap(&a[pos], &a[pos + 1]);
            }
        }
    }
}
int main() {
int a[] = {6,3,8,5,1};
int n = 5;
int sum,i;
sort(a,n);
n = (n+1) / 2 - 1; // -1 as array indexing in C starts from 0
and here is the program I have written so far:
.syntax unified
.cpu cortex-m4
.fpu softvfp
.thumb
.section .data
// NOTE(review): .balign with no argument may not align as intended;
// give it an explicit power of two, e.g. ".balign 4".
.balign
array: .word 81,75,90,94,79,86,89,54,75,98, -1
size: .word 9
.section .text
.balign
.global main
// Median: intended to bubble-sort `array` then pick the middle element.
// NOTE(review): as written there are no loops yet - i (r1) and j (r3) are
// set to 0 once and never incremented or compared against the size.
Median:
ldr r8, =array
// NOTE(review): "=size" loads the ADDRESS of size, not the value 9;
// an additional "ldr r9, [r9]" is needed to read the value itself.
ldr r9, =size //hard coded size of the array -1 is not meant to be included
mov r1, #0 // i = 0
mov r3, #0 // j = 0
// NOTE(review): both loads index with r1 (i), not r3 (j), and "lsl #3"
// scales by 8 bytes (skipping every other word). a[j+1] is 4 bytes past
// a[j]; the usual pattern is: add r5, r3, #1 ; ldr r4, [r8, r5, lsl #2].
ldr r2, [r8, r1, lsl #2] // getting a[j]
ldr r4, [r8, r1, lsl #3] //getting a[j+1]
cmp r2, r4
// NOTE(review): "bgt swap" branches to the very next instruction, and when
// the branch is NOT taken execution still falls through into swap: - so the
// swap runs unconditionally. Branch AROUND it on the inverse condition
// instead, e.g. "ble no_swap".
bgt swap
swap: // swap a[j] with a[j+1]
// NOTE(review): r4 holds the VALUE of a[j+1], not its address - storing
// through it faults. Keep the element addresses (or offsets) and store
// both swapped values back with two str instructions.
str r2, [r4]
forever:
b forever
I'm trying to pass a matrix multiplication written in C to ARM assembly manually. The matrices are always square matrices defined by TAM and I'm using unidimensional arrays like you can see here. The original C code is:
void multiply(int *a, int *b, int *c){
int i = 0;
while(i < TAM){
int j = 0;
while(j < TAM){
int k = 0;
while(k < TAM){
c[i*TAM + j] = c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j];
k++;
}
j++;
}
i++;
}
}
And the ARM code is getting segfault in store instruction but I cannot find the cause:
// 10x10 int matrix multiply: matriz3 += matriz1 * matriz2.
// x0 = a, x1 = b, x2 = c, x3 = TAM, x4 = i, x5 = j, x6 = k.
ldr x0, =matriz1
ldr x1, =matriz2
ldr x2, =matriz3
mov x3, #10 //TAM = 10
mov x4, #0 //i = 0
loop_i: //while(i<TAM){
mov x5, #0 //j = 0
loop_j: //while(j<TAM){
mov x6, #0 //k = 0
loop_k: //while(k<TAM){
madd x7, x3, x4, x6 //x7 <- i*TAM + k (ELEMENT index)
// The arrays hold 32-bit ints, so the element index must be scaled by
// 4 bytes (lsl #2) and the accesses must be 32-bit (w registers).
// The original code used unscaled indices with 64-bit ldr/str x, which
// read/wrote 8 bytes at wrong addresses and ran past the arrays - that
// is what caused the segfault.
ldr w7, [x0, x7, lsl #2] //a[i*TAM + k]
madd x8, x6, x3, x5 //x8 <- k*TAM + j
ldr w8, [x1, x8, lsl #2] //b[k*TAM + j]
madd x9, x3, x4, x5 //x9 <- i*TAM + j
lsl x9, x9, #2 //byte offset of c[i*TAM + j], reused for the store below
ldr w10, [x2, x9] //c[i*TAM + j]
madd w10, w7, w8, w10 //c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j]
str w10, [x2, x9] //c[i*TAM + j] = c[i*TAM + j] + a[i*TAM + k]*b[k*TAM + j]
end_loop_k:
add x6, x6, #1 //k++
cmp x6, x3 //compare against TAM (x3) rather than a duplicated literal
blt loop_k // }
end_loop_j:
add x5, x5, #1 //j++
cmp x5, x3
blt loop_j // }
end_loop_i:
add x4, x4, #1 //i++
cmp x4, x3
blt loop_i //}
The comments are related to the original C code.
Do you have any idea why the segfault?
A CUDA kernel with some local, fixed-size array may get compiled so that the array resides in the thread's "local memory", or - if NVCC can determine the position of each array access at compile time, and there are enough registers available - the array might be broken up with its elements residing in registers.
Is it possible to check or to ensure, either via the code or as part of the build process, that a specific array, or all local arrays in a kernel, have been fit into registers? Is doing so supported by any tool?
At runtime
You may use the CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES as a hint on whether your array has been registrified; using CUDA driver API function cuFuncGetAttribute. But for some use cases, runtime may be too late.
At compile time
You want to have a look at the generated ptx file (using --keep option in nvcc).
The local data declation is identified as .local in the ptx. Here is a small example, with a kernel.
#define ww 65
// Example kernel whose local array can NOT be registrified: the ring-buffer
// write below uses "j % ww", an index unknown at compile time, so NVCC must
// keep `buffer` addressable and places it in thread-local memory (visible as
// ".local ... __local_depot" in the generated PTX).
__global__ void kernel(int W, int H, const int *a, int *b)
{
int buffer[ww];
for (int i = threadIdx.x; i < H; i += blockDim.x)
{
// Initial fill: fully unrolled, all indices constant.
#pragma unroll
for (int w = 0; w < ww; ++w)
buffer[w] = a[i + w * W];
for (int j = 5; j < H - 5; ++j)
{
// Dynamic index -> forces the whole array into local memory.
buffer[j % ww] = a[i + (j + 6) * W];
int s = 0;
#pragma unroll
for (int w = 0; w < ww; ++w)
s += buffer[w];
b[i + (j + 6) * W] = s;
}
}
}
When compiled there is a declaration of a local variable:
.visible .entry _Z6kerneliiPKiPi(
.param .u32 _Z6kerneliiPKiPi_param_0,
.param .u32 _Z6kerneliiPKiPi_param_1,
.param .u64 _Z6kerneliiPKiPi_param_2,
.param .u64 _Z6kerneliiPKiPi_param_3
)
{
.local .align 4 .b8 __local_depot0[260];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<5>;
.reg .b32 %r<219>;
.reg .b64 %rd<81>;
However, when rolling the buffer, buffer is always accessed with known indices and registers may be obtained - no local storage:
#define ww 65
// Register-friendly variant: the ring-buffer rotation is replaced by an
// explicitly unrolled shift, so every access to `buffer` uses a
// compile-time-constant index and the array can be broken up into
// registers (no ".local" depot in the generated PTX - though, as noted in
// the surrounding text, that is necessary but not sufficient; check the
// SASS for STL instructions to be certain).
__global__ void kernel(int W, int H, const int *a, int *b)
{
int buffer[ww];
for (int i = threadIdx.x; i < H; i += blockDim.x)
{
#pragma unroll
for (int w = 0; w < ww; ++w)
buffer[w] = a[i + w * W];
for (int j = 5; j < H - 5; ++j)
{
// Shift the window by one: unrolled, constant indices only.
#pragma unroll
for (int w = 0; w < ww-1; ++w)
buffer[w] = buffer[w + 1];
buffer[ww - 1] = a[i + (j + 6) * W];
int s = 0;
#pragma unroll
for (int w = 0; w < ww; ++w)
s += buffer[w];
b[i + (j + 6) * W] = s;
}
}
}
Yields the following ptx:
.visible .entry _Z6kerneliiPKiPi(
.param .u32 _Z6kerneliiPKiPi_param_0,
.param .u32 _Z6kerneliiPKiPi_param_1,
.param .u64 _Z6kerneliiPKiPi_param_2,
.param .u64 _Z6kerneliiPKiPi_param_3
)
{
.reg .pred %p<5>;
.reg .b32 %r<393>;
.reg .b64 %rd<240>;
Note that depending on the number of registers available, the number of required registers may not fit. These are virtual registers (which has somehow changed in recent versions of CUDA). Meaning that the absence of .local .align 4 .b8 __local_depot is a prerequisite, but not sufficient.
You need to look at the SASS then. Using nvdisasm on your generated .cubin, you want to search for STL instruction which stands for STore Local, as described briefly here. Here are parts of the two disassembled cubins compiled with two different values of --maxrregcount compiler switch - first for 32 (see the many occurrences of STL):
//--------------------- .text._Z6kerneliiPKiPi --------------------------
.section .text._Z6kerneliiPKiPi,"ax",#progbits
.sectioninfo #"SHI_REGISTERS=32"
.align 32
.global _Z6kerneliiPKiPi
.type _Z6kerneliiPKiPi,#function
.size _Z6kerneliiPKiPi,(.L_25 - _Z6kerneliiPKiPi)
.other _Z6kerneliiPKiPi,#"STO_CUDA_ENTRY STV_DEFAULT"
_Z6kerneliiPKiPi:
.text._Z6kerneliiPKiPi:
/*0008*/ MOV R1, c[0x0][0x20];
/*0010*/ { IADD32I R1, R1, -0x180;
/*0018*/ S2R R0, SR_TID.X; }
/*0028*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x144], PT;
/*0030*/ NOP;
/*0038*/ NOP;
/*0048*/ #P0 EXIT;
.L_3:
/*0050*/ IADD R2, R0, c[0x0][0x140];
/*0058*/ MOV R30, c[0x0][0x140];
/*0068*/ ISCADD R5.CC, R2.reuse, c[0x0][0x148], 0x2;
/*0070*/ { SHR R3, R2, 0x1e;
/*0078*/ STL [R1+0x14], R5; }
/*0088*/ ISCADD R2, R30.reuse, R0.reuse, 0x1;
/*0090*/ ISCADD R4, R30.reuse, R0.reuse, 0x2;
/*0098*/ ISCADD R20, R30, R0, 0x3;
/*00a8*/ IADD.X R5, R3, c[0x0][0x14c];
/*00b0*/ { SHR R3, R2.reuse, 0x1e;
/*00b8*/ STL [R1+0x10], R5; }
/*00c8*/ ISCADD R2.CC, R2, c[0x0][0x148], 0x2;
/*00d0*/ STL [R1+0x8], R2;
/*00d8*/ SHR R5, R4, 0x1e;
/*00e8*/ IADD.X R2, R3, c[0x0][0x14c];
/*00f0*/ { ISCADD R4.CC, R4, c[0x0][0x148], 0x2;
/*00f8*/ STL [R1+0x4], R2; }
Then for 255 - no occurence of STL:
//--------------------- .text._Z6kerneliiPKiPi --------------------------
.section .text._Z6kerneliiPKiPi,"ax",#progbits
.sectioninfo #"SHI_REGISTERS=124"
.align 32
.global _Z6kerneliiPKiPi
.type _Z6kerneliiPKiPi,#function
.size _Z6kerneliiPKiPi,(.L_25 - _Z6kerneliiPKiPi)
.other _Z6kerneliiPKiPi,#"STO_CUDA_ENTRY STV_DEFAULT"
_Z6kerneliiPKiPi:
.text._Z6kerneliiPKiPi:
/*0008*/ MOV R1, c[0x0][0x20];
/*0010*/ S2R R0, SR_TID.X;
/*0018*/ ISETP.GE.AND P0, PT, R0, c[0x0][0x144], PT;
/*0028*/ NOP;
/*0030*/ NOP;
/*0038*/ #P0 EXIT;
/*0048*/ MOV R46, c[0x0][0x144];
/*0050*/ IADD R47, RZ, -c[0x0][0x140];
/*0058*/ IADD32I R46, R46, -0x5;
/*0068*/ SHL R47, R47, 0x2;
.L_3:
/*0070*/ ISETP.LT.AND P0, PT, R46, 0x6, PT;
/*0078*/ #P0 BRA `(.L_1);
/*0088*/ MOV R2, c[0x0][0x140];
/*0090*/ ISCADD R2, R2, R0, 0x6;
/*0098*/ SHR R27, R2.reuse, 0x1e;
/*00a8*/ ISCADD R26.CC, R2, c[0x0][0x148], 0x2;
/*00b0*/ SHR R48, R47, 0x1f;
/*00b8*/ IADD.X R27, R27, c[0x0][0x14c];
/*00c8*/ { IADD R44.CC, R47.reuse, R26;
/*00d0*/ LDG.E R49, [R26]; }
/*00d8*/ IADD.X R45, R48.reuse, R27;
/*00e8*/ { IADD R42.CC, R47.reuse, R44 SLOT 0;
/*00f0*/ LDG.E R44, [R44] SLOT 1; }
/*00f8*/ IADD.X R43, R48.reuse, R45;
/*0108*/ { IADD R38.CC, R47, R42 SLOT 0;
/*0110*/ LDG.E R42, [R42] SLOT 1; }
Very much like you I assume, I wish that all of this was better documented.
I have the following code which I would like to optimise using ARM NEON instructions. How can I implement it?
Thanks for the answers
unsigned char someVector[] = {1, 2, 4, 1, 2, 0, 8, 100};
unsigned char maxVal = 0, minVal = 255;
/* Scan for both the minimum and the maximum value. */
for (int i = 0; i < sizeof(someVector); i++)
{
    if (someVector[i] < minVal)
    {
        minVal = someVector[i];
    }
    /* Use an independent `if`, NOT `else if`: with `else if`, an element
     * that updates the minimum can never also update the maximum, so e.g.
     * a strictly decreasing sequence would leave maxVal stuck at 0. */
    if (someVector[i] > maxVal)
    {
        maxVal = someVector[i];
    }
}
Below is a highly optimized example of how to find the min and max in a large array. The function simply returns if size is smaller than 128:
/*
* minmax.S
*
* Created on: 2014. 10. 29.
* Author: Jake Lee
*/
// unsigned int minmax(unsigned char *pSrc, unsigned int size);
// Returns min in bits [7:0] and max in bits [23:16] of r0.
// Processes 128 bytes per loop iteration; returns immediately (result
// registers untouched) if size < 128.
.text
.arm
.global minmax
pSrc .req r0
size .req r1
qmin1 .req q0
dmina .req d0
dminb .req d1
qmax1 .req q1
dmaxa .req d2
dmaxb .req d3
qmin2 .req q2
qmax2 .req q3
.align 5
.func
minmax:
// Pre-decrement by one full iteration; bail out if size < 128.
subs size, size, #128
bxmi lr
// Two independent min/max accumulator pairs (qmin1/qmin2, qmax1/qmax2)
// break the dependency chain and hide vmin/vmax latency.
vmov.i8 qmin1, #0xff
vmov.i8 qmax1, #0
vmov.i8 qmin2, #0xff
vmov.i8 qmax2, #0
.align 5
1:
// Main loop: load 128 bytes, fold them into the four accumulators.
vld1.8 {q8, q9}, [pSrc]!
vld1.8 {q10, q11}, [pSrc]!
vld1.8 {q12, q13}, [pSrc]!
vld1.8 {q14, q15}, [pSrc]!
subs size, size, #128
// Prefetch a few cache lines ahead of the loads.
pld [pSrc, #64*3]
pld [pSrc, #64*4]
vmin.u8 qmin1, q8
vmax.u8 qmax1, q8
vmin.u8 qmin2, q9
vmax.u8 qmax2, q9
vmin.u8 qmin1, q10
vmax.u8 qmax1, q10
vmin.u8 qmin2, q11
vmax.u8 qmax2, q11
vmin.u8 qmin1, q12
vmax.u8 qmax1, q12
vmin.u8 qmin2, q13
vmax.u8 qmax2, q13
vmin.u8 qmin1, q14
vmax.u8 qmax1, q14
vmin.u8 qmin2, q15
vmax.u8 qmax2, q15
bpl 1b
// deal width residuals (size % 128)
// Rewind pSrc by the (negative) remainder so the FINAL 128 bytes are
// processed as one overlapping chunk - re-examining bytes is harmless
// for min/max, so no scalar tail loop is needed.
cmp size, #-128
addgt pSrc, pSrc, size
bgt 1b
// shrink to sixteen
vmin.u8 qmin1, qmin2
vmax.u8 qmax1, qmax2
// shrink to eight
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to four
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to two
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// shrink to one
vpmin.u8 dmina, dmina, dminb
vpmax.u8 dmaxa, dmaxa, dmaxb
// Pack the two lane-0 results into r0 as (max << 16) | min.
vmov r0, dmina[0]
vmov r1, dmaxa[0]
and r0, r0, #0xff
and r1, r1, #0xff
orr r0, r0, r1, lsl #16
bx lr
.endfunc
.end
The return value is an unsigned int. The lower 16 bits contain min and higher ones max :
result = minmax(pSrc, size);
min = result & 0xff;
max = result >> 16;
GCC will auto-vectorize this, with only small modifications.
unsigned char someVector[256] = { 1, 2, 4, 1, 2, 0, 8, 100 };
unsigned char maxVal = 0, minVal = 255;

/* Scan someVector and publish its min/max into the globals.
 * The running extrema live in locals, and the two comparisons are
 * independent `if`s - the shape GCC's auto-vectorizer recognizes. */
void f(void)
{
    unsigned char lo = 255, hi = 0;
    for (int idx = 0; idx < sizeof(someVector); idx++) {
        unsigned char v = someVector[idx];
        if (v < lo) {
            lo = v;
        }
        if (v > hi) {
            hi = v;
        }
    }
    maxVal = hi;
    minVal = lo;
}
compile with
$ arm-unknown-linux-gnueabihf-gcc -O3 -std=c11 -mfpu=neon -c test.c
or
$ arm-unknown-linux-gnueabihf-gcc -O2 -ftree-vectorize -std=c11 -mfpu=neon -c test.c
You can do better than GCC if you write NEON intrinsics or assembler.