Neon optimization of interlaced YUYV to gray - c

I have the following C code that converts an interlaced webcam YUYV to gray:
void convert_yuyv_to_y(const void *src, char *dest) {
    int x, y;
    const char *Y;
    char *gray;
    // get only the Y component for grayscale from (Y1)(U1,2)(Y2)(V1,2)
    for (y = 0; y < CAM_HEIGHT; y++) {
        Y = (const char *)src + (CAM_WIDTH * 2 * y);
        gray = dest + (CAM_WIDTH * y);
        for (x = 0; x < CAM_WIDTH; x += 2) {
            gray[x] = *Y;
            Y += 2;
            gray[x + 1] = *Y;
            Y += 2;
        }
    }
}
Is there a way to optimize such a function with some NEON instructions?

Here is a starting point. From here you can do cache preloads, loop unrolling, etc. The best performance will happen when more NEON registers are involved to prevent data stalls.
.equ CAM_HEIGHT, 480        @ fill in the correct values
.equ CAM_WIDTH, 640
@
@ Call from C as convert_yuyv_to_y(const void *src, char *dest);
@
convert_yuyv_to_y:
    mov r2, #CAM_HEIGHT
cvtyuyv_top_y:
    mov r3, #CAM_WIDTH
cvtyuyv_top_x:
    vld2.8 {d0, d1}, [r0]!  @ assumes the width is a multiple of 8 pixels
    vst1.8 {d0}, [r1]!      @ work with 8 pixels (16 source bytes) at a time
    subs r3, r3, #8         @ x += 8
    bgt cvtyuyv_top_x
    subs r2, r2, #1         @ y++
    bgt cvtyuyv_top_y
    bx lr

(Promoting my comment to answer)
De-interleaving data on the NEON architecture takes the fewest instructions with the sequence:
vld2.8 { d0, d1 }, [r0]!
vst1.8 { d0 }, [r1]!
Here r0 is the source pointer, which advances by 16 each time and r1 is the destination pointer, which advances by 8.
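For reference, the same de-interleave can also be written with intrinsics. This is only a sketch: the function name and the width/height parameters are mine, it assumes the row width is a multiple of 16 pixels, and it handles 16 pixels per iteration where the assembly above handles 8.

#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

void convert_yuyv_to_y_neon(const uint8_t *src, uint8_t *dest, int width, int height)
{
    for (int y = 0; y < height; y++) {
        const uint8_t *in = src + (size_t)width * 2 * y;  /* YUYV row: 2 bytes per pixel */
        uint8_t *out = dest + (size_t)width * y;          /* gray row: 1 byte per pixel  */
        for (int x = 0; x < width; x += 16) {             /* assumes width % 16 == 0     */
            uint8x16x2_t yuyv = vld2q_u8(in + 2 * x);     /* val[0] = Y, val[1] = U/V    */
            vst1q_u8(out + x, yuyv.val[0]);               /* keep only the Y bytes       */
        }
    }
}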
Loop unrolling, the ability to load up to 4 registers at once, and keeping a second source pointer at a fixed offset can give a slightly larger maximum throughput. Coupled with alignment hints:
start:
    vld4.8 { d0, d1, d2, d3 }, [r0:256]     @ 32 bytes from r0
    subs r3, r3, #1
    vld4.8 { d4, d5, d6, d7 }, [r1:256]     @ 32 bytes from r1 = r0 + 32
    add r0, r0, #64
    add r1, r0, #32                         @ keep r1 trailing r0 by 32 bytes
    vst2.8 { d0, d2 }, [r2:128]!            @ 16 Y bytes
    vst2.8 { d4, d6 }, [r2:128]!            @ 16 more Y bytes
    bgt start
(I can't remember whether the form vstx.y {regs}, [rx], ro exists -- where ro is an offset register that post-increments rx.)
While memory transfer optimizations can be useful, it is still better to consider whether the copy can be skipped altogether or merged with some calculation. This could also be the place to consider a planar pixel format, which would avoid the copying task completely.

ARM NEON: Convert a binary 8-bit-per-pixel image (only 0/1) to 1-bit-per-pixel?

I am working on a task to convert a large binary label image, which has 8 bits (uint8_t) per pixel where each pixel is either 0 or 1 (or 255), into an array of uint64_t numbers in which each bit represents a label pixel.
For example,
input array: 0 1 1 0 ... (00000000 00000001 00000001 00000000 ...)
or input array: 0 255 255 0 ... (00000000 11111111 11111111 00000000 ...)
output array (number): 6 (because after converting each uint8_t to a bit, it becomes 0110)
Currently the C code to achieve this is:
for (int j = 0; j < width >> 6; j++) {
    uint8_t *in_ptr = in + (j << 6);
    uint64_t out_bits = 0;
    if (in_ptr[0]) out_bits |= 0x0000000000000001;
    if (in_ptr[1]) out_bits |= 0x0000000000000002;
    .
    .
    .
    if (in_ptr[63]) out_bits |= 0x8000000000000000;
    *output = out_bits; output++;
}
Can ARM NEON optimize this functionality? Please help. Thank you!
Assuming the input value is either 0 or 255, below is the basic version which is rather straightforward, especially for people with Intel SSE/AVX experience.
void foo_basic(uint8_t *pDst, uint8_t *pSrc, intptr_t length)
{
    //assert(length >= 64);
    //assert((length & 7) == 0);
    uint8x16_t in0, in1, in2, in3;
    uint8x8_t out;
    const uint8x16_t mask = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
    length -= 64;
    do {
        do {
            in0 = vld1q_u8(pSrc); pSrc += 16;
            in1 = vld1q_u8(pSrc); pSrc += 16;
            in2 = vld1q_u8(pSrc); pSrc += 16;
            in3 = vld1q_u8(pSrc); pSrc += 16;
            in0 = vandq_u8(in0, mask);
            in1 = vandq_u8(in1, mask);
            in2 = vandq_u8(in2, mask);
            in3 = vandq_u8(in3, mask);
            in0 = vpaddq_u8(in0, in1);
            in2 = vpaddq_u8(in2, in3);
            in0 = vpaddq_u8(in0, in2);
            out = vpadd_u8(vget_low_u8(in0), vget_high_u8(in0));
            vst1_u8(pDst, out); pDst += 8;
            length -= 64;
        } while (length >= 0);
        pSrc += length;      // rewind to process the overlapping tail
        pDst += length >> 3;
    } while (length > -64);
}
Neon, however, has VERY user-friendly and efficient permutation and bit operation instructions that allow you to go "vertical":
void foo_advanced(uint8_t *pDst, uint8_t *pSrc, intptr_t length)
{
    //assert(length >= 128);
    //assert((length & 7) == 0);
    uint8x16x4_t in0, in1;
    uint8x16x2_t row04, row15, row26, row37;
    length -= 128;
    do {
        do {
            in0 = vld4q_u8(pSrc); pSrc += 64;
            in1 = vld4q_u8(pSrc); pSrc += 64;
            row04 = vuzpq_u8(in0.val[0], in1.val[0]);
            row15 = vuzpq_u8(in0.val[1], in1.val[1]);
            row26 = vuzpq_u8(in0.val[2], in1.val[2]);
            row37 = vuzpq_u8(in0.val[3], in1.val[3]);
            row04.val[0] = vsliq_n_u8(row04.val[0], row15.val[0], 1);
            row26.val[0] = vsliq_n_u8(row26.val[0], row37.val[0], 1);
            row04.val[1] = vsliq_n_u8(row04.val[1], row15.val[1], 1);
            row26.val[1] = vsliq_n_u8(row26.val[1], row37.val[1], 1);
            row04.val[0] = vsliq_n_u8(row04.val[0], row26.val[0], 2);
            row04.val[1] = vsliq_n_u8(row04.val[1], row26.val[1], 2);
            row04.val[0] = vsliq_n_u8(row04.val[0], row04.val[1], 4);
            vst1q_u8(pDst, row04.val[0]); pDst += 16;
            length -= 128;
        } while (length >= 0);
        pSrc += length;      // rewind to process the overlapping tail
        pDst += length >> 3;
    } while (length > -128);
}
The Neon-only advanced version is shorter and faster, but GCC is extremely bad at dealing with Neon-specific permutation instructions such as vtrn, vzip, and vuzp.
https://godbolt.org/z/bGdbohqKe
Clang isn't any better: it spams unnecessary vorr instructions where GCC does the same with vmov.
.syntax unified
.arm
.arch armv7-a
.fpu neon
.global foo_asm
.text
.func
.balign 64
foo_asm:
sub r2, r2, #128
.balign 16
1:
vld4.8 {d16, d18, d20, d22}, [r1]!
vld4.8 {d17, d19, d21, d23}, [r1]!
vld4.8 {d24, d26, d28, d30}, [r1]!
vld4.8 {d25, d27, d29, d31}, [r1]!
subs r2, r2, #128
vuzp.8 q8, q12
vuzp.8 q9, q13
vuzp.8 q10, q14
vuzp.8 q11, q15
vsli.8 q8, q9, #1
vsli.8 q10, q11, #1
vsli.8 q12, q13, #1
vsli.8 q14, q15, #1
vsli.8 q8, q10, #2
vsli.8 q12, q14, #2
vsli.8 q8, q12, #4
vst1.8 {q8}, [r0]!
bpl 1b
add r1, r1, r2
cmp r2, #-128
add r0, r0, r2, asr #3
bgt 1b
.balign 8
bx lr
.endfunc
.end
The innermost loop consists of:
GCC: 32 instructions
Clang: 30 instructions
Asm: 18 instructions
It doesn't take rocket science to figure out which one is the fastest and by how much: Never trust compilers if you are about to do permutations.
Standing on the shoulders of Jake 'Alquimista' LEE, we can improve the unzipping and the algorithm as well by changing the order of the uzp and vsli operations:
#define interleave_nibbles(top) \
    top.val[0] = vsliq_n_u8(top.val[0], top.val[1], 1); \
    top.val[2] = vsliq_n_u8(top.val[2], top.val[3], 1); \
    top.val[0] = vsliq_n_u8(top.val[0], top.val[2], 2);

void transpose_bits(uint8_t const *src, uint8_t *dst) {
    uint8x16x4_t top = vld4q_u8(src);
    uint8x16x4_t bot = vld4q_u8(src + 64); src += 128;
    interleave_nibbles(top);
    interleave_nibbles(bot);
    // now we have 4 bits correct in each of the 32 bytes left
    // top = 0to3 4to7 8to11 12to15 ...
    // bot = 64to67 68to71 ...
    uint8x16x2_t top_bot = vuzpq_u8(top.val[0], bot.val[0]);
    uint8x16_t result = vsliq_n_u8(top_bot.val[0], top_bot.val[1], 4);
    vst1q_u8(dst, result); dst += 16;
}
The assembly produced by clang now has only two extraneous movs (done via vorr), and the gcc output has four movs.
vld4.8 {d16, d18, d20, d22}, [r0]!
vld4.8 {d17, d19, d21, d23}, [r0]!
vld4.8 {d24, d26, d28, d30}, [r0]!
vsli.8 q10, q11, #1
vorr q0, q8, q8
vld4.8 {d25, d27, d29, d31}, [r0]
vsli.8 q0, q9, #1
vorr q2, q14, q14
vsli.8 q12, q13, #1
vsli.8 q2, q15, #1
vsli.8 q0, q10, #2
vsli.8 q12, q2, #2
vuzp.8 q0, q12
vsli.8 q0, q12, #4
vst1.8 {d0, d1}, [r1]
And the arm64 version looks perfect with only 12 instructions.
ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
ld4 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0]
sli v0.16b, v1.16b, #1
sli v2.16b, v3.16b, #1
sli v0.16b, v2.16b, #2
sli v4.16b, v5.16b, #1
sli v6.16b, v7.16b, #1
sli v4.16b, v6.16b, #2
uzp1 v16.16b, v0.16b, v4.16b
uzp2 v0.16b, v0.16b, v4.16b
sli v16.16b, v0.16b, #4
str q16, [x1]
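If it helps, a possible driver loop around transpose_bits could look like the sketch below; transpose_row is my name, and it assumes the number of input bytes is a multiple of 128 and that transpose_bits from above is in scope.

#include <stdint.h>
#include <stddef.h>

void transpose_row(const uint8_t *src, uint8_t *dst, size_t length)
{
    /* transpose_bits consumes 128 input bytes and emits 128 bits (16 bytes) */
    for (size_t i = 0; i < length; i += 128)
        transpose_bits(src + i, dst + (i >> 3));
}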
You can do it more efficiently (especially for short arrays or single vectors) using something like this (in this example, turning one 128 bit register into one 16 bit mask):
// turn mask of bytes in v0 into mask of bits in w0
movmsk: adr x0, 0f // obtain address of literal
ld1r {v1.2d}, [x0] // load 80..01 mask twice into v1
and v0.16b, v0.16b, v1.16b // mask bytes from ff to single bits
mov d1, v0.d[1] // extract high 64 bit
zip1 v0.8b, v0.8b, v1.8b // interleave high and low bytes
addv h0, v0.8h // sum into bit mask
mov w0, v0.s[0] // move result to general register
ret
0: .quad 0x8040201008040201
The idea is to turn the contents of each byte into just one bit at the bit position it's going to end up at and to then sum up the bits using addv (8 bytes at a time, resulting in one byte of output).
Putting a loop around this code to have it traverse the entire array is left as an exercise to the reader.
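The same idea can be sketched with AArch64 intrinsics; the function name movemask_u8 is mine, it assumes the input bytes are 0x00 or 0xff as above, and it materializes the constant with vdupq_n_u64 instead of the literal load used in the assembly.

#include <arm_neon.h>
#include <stdint.h>

static inline uint16_t movemask_u8(uint8x16_t bytes)
{
    /* one distinct bit per byte: 0x01, 0x02, ..., 0x80, repeated for the high half */
    const uint8x16_t bitsel = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL));
    uint8x16_t bits = vandq_u8(bytes, bitsel);
    /* interleave low and high halves so each halfword holds one low-half bit and one high-half bit */
    uint8x8x2_t z = vzip_u8(vget_low_u8(bits), vget_high_u8(bits));
    uint16x8_t pairs = vreinterpretq_u16_u8(vcombine_u8(z.val[0], z.val[1]));
    /* horizontal add: the low byte collects lanes 0..7, the high byte lanes 8..15 */
    return vaddvq_u16(pairs);
}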

VLA prototype and multidimensional array argument

I created a C99 VLA function as such:
void create_polygon(int n, int faces[][n]);
I want to call this function in another function where I would allocate my two-dimensional array:
void parse_faces()
{
    int faces[3][6];
    create_polygon(6, faces);
}
When I pass a two-dimensional array as an argument, it passes a pointer to an array of 6 ints, referencing the stack memory in the calling function.
The VLA argument here only acts as a type declaration (not allocating any actual memory), telling the compiler to access the data in row-major order with ((int*)faces)[i * 6 + j] instead of faces[i][j].
What is the difference between declaring the function with a VLA argument and with a fixed size?
faces[i][j] is always equivalent to *(*(faces + i) + j), no matter whether VLA or not.
Now let's compare two variants (leaving aside that you actually need the outer dimension as well, to avoid exceeding the array bounds while iterating):
void create_polygon1(int faces[][6]);
void create_polygon2(int n, int faces[][n]);
It doesn't matter whether the arrays passed in were originally created as classic arrays or as VLAs: the first function accepts arrays with rows of exactly 6 elements, the second accepts rows of arbitrary length (assuming this is clear so far...).
faces[i][j] will now be translated to:
*((int*)faces + (i * 6 + j)) // (1)
*((int*)faces + (i * n + j)) // (2)
The difference still looks marginal, but it gets more obvious at the assembly level (assuming all variables are stored on the stack; assuming sizeof(int) == 4):
// fixed size (1):
LD R1, i;
LD R2, j;
MUL R1, R1, 24; // using a constant! 24: 6 * sizeof(int)
MUL R2, R2, 4;  // sizeof(int)
ADD R1, R1, R2; // index stored in R1 register

// VLA (2):
LD R1, i;
LD R2, j;
LD R3, n;       // need to load n from the stack
MUL R3, R3, 4;  // need to multiply with sizeof(int) yet
MUL R1, R1, R3; // can now use n from register R3
MUL R2, R2, 4;  // ...
ADD R1, R1, R2; // ...
Real assembly code might vary, of course, especially if you use a calling convention that passes some parameters in registers (then loading n into R3 might be unnecessary).
For completeness (added due to comments, unrelated to the original question): there is also the int* array[] case: representation by an array of pointers to rows.
*((int*)faces + (i * ??? + j))
doesn't work any more, as faces in this case is not contiguous memory (well, the pointers themselves are in contiguous memory, of course, but not all the faces[i][j]). We are forced to do:
*(*(faces + i) + j)
as we need to dereference the actual pointer in the array before we can apply the next index. Assembly code (for comparison, the more complete variant of the pointer-to-2D-array case comes first):
// pointer to 2D array (VLA or fixed size):
LD R1, faces;
LD R2, i;
LD R3, j;
LD R4, n;       // or skip, if no VLA
MUL R4, R4, 4;  // or skip, if no VLA
MUL R2, R2, R4; // constant instead of R4, if no VLA
MUL R3, R3, 4;
ADD R2, R2, R3; // byte offset stored in R2 register
ADD R1, R1, R2; // offset from base pointer
LD R1, [R1];    // loading value of faces[i][j] into register

// array of pointers to rows:
LD R1, faces;
LD R2, i;
LD R3, j;
MUL R2, R2, 8;  // sizeof(void*) (any pointer)
MUL R3, R3, 4;  // sizeof(int)
ADD R1, R1, R2; // address of faces[i]
LD R1, [R1];    // now need to load the address, i.e. dereferencing faces[i]
ADD R1, R1, R3; // offset within the row
LD R1, [R1];    // loading value of faces[i][j] into register
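For illustration, a minimal C sketch of that third representation (all names are mine):

#include <stdio.h>

int main(void)
{
    int row0[6] = {0}, row1[6] = {0}, row2[6] = {0};
    int *faces_p[3] = { row0, row1, row2 };   /* rows need not be contiguous */

    faces_p[1][2] = 42;   /* loads the pointer faces_p[1] first, then indexes into that row */
    printf("%d\n", faces_p[1][2]);
    return 0;
}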
I disassembled this code:
void create_polygon(int n, int faces[][6])
{
    int a = sizeof(faces[0]);
    (void)a;
}
With a VLA argument:
movl %edi, -4(%rbp) # 6
movq %rsi, -16(%rbp) # faces
movl %edi, %esi
shlq $2, %rsi # 6 << 2 = 24
movl %esi, %edi
With fixed size:
movl %edi, -4(%rbp)
movq %rsi, -16(%rbp)
movl $24, %edi # 24
As Aconcagua pointed out, in the first example, using a VLA, the size is computed at run time by multiplying the size of an int by the size of the second dimension, which is the argument stored in rsi and then moved into edi.
In the second example, the size is computed at compile time and placed directly into edi. The main advantage of the fixed-size declaration is that the compiler can reject a pointer argument with a different row size, thus avoiding a potential crash.
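To illustrate that last point, here is a small sketch (caller and the array names are mine) of what the compiler can and cannot check:

void create_polygon1(int faces[][6]);
void create_polygon2(int n, int faces[][n]);

void caller(void)
{
    int a[3][6];
    int b[3][5];

    create_polygon1(a);        /* fine: rows of 6 ints */
    /* create_polygon1(b); */  /* rejected: int (*)[5] is incompatible with int (*)[6] */
    create_polygon2(5, b);     /* accepted: the row length is taken from n at run time */
}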

Compare two 64 bit variables on 32 bit microcontroller

I have the following issue: I have two 64-bit variables and they have to be compared as quickly as possible; my microcontroller is only 32-bit.
My thought is that it is necessary to split the 64-bit variable into two 32-bit variables, like this
uint64_t var = 0xAAFFFFFFABCDELL;
uint32_t hiPart = (uint32_t)((var & 0xFFFFFFFF00000000LL) >> 32);
uint32_t loPart = (uint32_t)(var & 0xFFFFFFFFLL);
and then to compare the hiParts and loParts, but I am sure that this approach is slow and that there is a much better solution.
The first rule should be: write your program so that it is readable to a human.
When in doubt, don't assume anything, but measure it. Let's see what godbolt gives us.
#include <stdint.h>
#include <stdbool.h>

bool foo(uint64_t a, uint64_t b) {
    return a == b;
}

bool foo2(uint64_t a, uint64_t b) {
    uint32_t ahiPart = (uint32_t)((a & 0xFFFFFFFF00000000ULL) >> 32);
    uint32_t aloPart = (uint32_t)(a & 0xFFFFFFFFULL);
    uint32_t bhiPart = (uint32_t)((b & 0xFFFFFFFF00000000ULL) >> 32);
    uint32_t bloPart = (uint32_t)(b & 0xFFFFFFFFULL);
    return ahiPart == bhiPart && aloPart == bloPart;
}
foo:
eor r1, r1, r3
eor r0, r0, r2
orr r0, r0, r1
rsbs r1, r0, #0
adc r0, r0, r1
bx lr
foo2:
eor r1, r1, r3
eor r0, r0, r2
orr r0, r0, r1
rsbs r1, r0, #0
adc r0, r0, r1
bx lr
As you can see, they result in the exact same assembly code, but you decide which one is less error-prone and easier to read.
There was a time, some years ago, when you needed tricks to be smarter than the compiler. But in 99.999% of cases the compiler will be smarter than you.
And your variables are unsigned, so use ULL instead of LL.
The fastest way is to let the compiler do it. Most compilers are much better than humans at micro-optimization.
uint64_t var = …, other_var = …;
if (var == other_var) …
There aren't many ways to go about it. Under the hood, the compiler will arrange to load the upper 32 bits and the lower 32 bits of each variable into registers, and compare the two registers that contain the upper 32 bits and the two registers that contain the lower 32 bits. The assembly code might look something like this:
load 32 bits from &var into r0
load 32 bits from &other_var into r1
if r0 != r1: goto different
load 32 bits from &var + 4 into r2
load 32 bits from &other_var + 4 into r3
if r2 != r3: goto different
// code for if-equal
different:
// code for if-not-equal
Here are some things the compiler knows better than you:
Which registers to use, based on the needs of the surrounding code.
Whether to reuse the same registers to compare the upper and lower parts, or to use different registers.
Whether to process one part and then the other (as above), or to load one variable then the other. The best order depends on the pressure on registers and on the memory access times and pipelining of the particular processor model.
If you work with a union, you can compare the high and low parts without any extra calculations:
typedef union
{
    struct
    {
        uint32_t loPart;
        uint32_t hiPart;
    };
    uint64_t complete;
} uint64T;

uint64T var = { .complete = 0xAAFFFFFFABCDEULL };
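A usage sketch (the function name equal_u64 is mine); note that which half is loPart vs hiPart depends on endianness, but for a pure equality test that does not matter:

#include <stdbool.h>
#include <stdint.h>

static bool equal_u64(uint64T a, uint64T b)
{
    /* two 32-bit compares instead of one 64-bit compare */
    return a.hiPart == b.hiPart && a.loPart == b.loPart;
}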

Convert ARM 32-bit neon to ARM 64-bit neon

I have the following 32-bit neon code that simply extracts an image:
void extractY8ImageARM(unsigned char *from, unsigned char *to, int left, int top, int width, int height, int stride);
from: pointer to the original image
to: pointer to the destination extracted image
left, top: position where to extract in the original image
width, height: size of the extracted image
stride: width of the original image
and here is the assembly code:
.text
.arch armv7-a
.fpu neon
.type extractY8ImageARM, STT_FUNC
.global extractY8ImageARM
extractY8ImageARM:
from .req r0
to .req r1
left .req r2
top .req r3
width .req r4
height .req r5
stride .req r6
tmp .req r7
push {r0-r7, lr}
//Let's get back the arguments
ldr width, [sp, #(9 * 4)]
ldr height, [sp, #(10 * 4)]
ldr stride, [sp, #(11 * 4)]
//Update the from pointer. Advance left + stride * top
add from, from, left
mul tmp, top, stride
add from, from, tmp
.loopV:
//We will copy width
mov tmp, width
.loopH:
//Read and store data
pld [from]
vld1.u8 { d0, d1, d2, d3 }, [from]!
pld [to]
vst1.u8 { d0, d1, d2, d3 }, [to]!
subs tmp, tmp, #32
bgt .loopH
//We advance the from pointer for the next line
add from, from, stride
sub from, from, width
subs height, height, #1
bgt .loopV
pop {r0-r7, pc}
.unreq from
.unreq to
.unreq left
.unreq top
.unreq width
.unreq height
.unreq stride
.unreq tmp
I need to port it to 64-bit neon. Can anyone help me with the translation? I have read this white paper http://malideveloper.arm.com/downloads/Porting%20to%20ARM%2064-bit.pdf so I understand more or less the differences.
My code is simple and it would be a good example of how to pass arguments and load/store data in a 64-bit neon assembly file. I prefer to avoid intrinsics.
The whole code looks like this:
.text
.arch armv8-a
.type extractY8ImageARM, STT_FUNC
.global extractY8ImageARM
extractY8ImageARM:
from .req x0
to .req x1
left .req x2
top .req x3
width .req x4
height .req x5
stride .req x6
tmp .req x9
// All seven arguments already arrive in registers (AAPCS64), so nothing is fetched from the stack.
// Update the from pointer: advance by left + stride * top
add from, from, left
mul tmp, top, stride
add from, from, tmp
.loopV:
// We will copy width bytes per row
mov tmp, width
.loopH:
// Read and store 64 bytes at a time
ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [from], #64
st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [to], #64
subs tmp, tmp, #64
bgt .loopH
// Advance the from pointer to the next line
add from, from, stride
sub from, from, width
subs height, height, #1
bgt .loopV
ret
.unreq from
.unreq to
.unreq left
.unreq top
.unreq width
.unreq height
.unreq stride
.unreq tmp
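For completeness, the matching C-side declaration and a call could look like the sketch below; the prototype is the one from the question, while the wrapper, the frame geometry, and the buffer names are mine. Note that the routine copies 64 bytes per inner iteration, so the extracted width should be a multiple of 64.

extern void extractY8ImageARM(unsigned char *from, unsigned char *to,
                              int left, int top, int width, int height, int stride);

void extract_example(unsigned char *frame, unsigned char *out)
{
    /* pull a 640x480 window starting at (16, 32) out of a 1280-pixel-wide Y8 frame */
    extractY8ImageARM(frame, out, 16, 32, 640, 480, 1280);
}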

Creating Nested If-Statements in ARM Assembly

I am interested in converting a Fibonacci sequence code in C++ into ARM assembly language. The code in C++ is as follows:
#include <iostream>
using namespace std;

int main()
{
    int range, first = 0, second = 1, fibonacci;
    cout << "Enter range for the Fibonacci Sequence" << endl;
    cin >> range;
    for (int i = 0; i < range; i++)
    {
        if (i <= 1)
        {
            fibonacci = i;
        }
        else
        {
            fibonacci = first + second;
            first = second;
            second = fibonacci;
        }
    }
    cout << fibonacci << endl;
    return 0;
}
My attempt at converting this to assembly is as follows:
ldr r0, =0x00000000 ;loads 0 into r0
ldr r1, =0x00000001 ;loads 1 into r1
ldr r2, =0x00000002 ;loads 2 into r2; this will be the equivalent of 'n' in the C++ code, but I will force the value of 'n' when writing this code
ldr r3, =0x00000000 ;r3 will be used as a counter in the loop
                    ;r4 will be used as 'fibonacci'
loop:
cmp r3, #2          ;Compares r3 with the value 2
it lt
movlt r4, r3        ;If r3 is less than 2, r4 will equal r3. This means r4 will only ever be 0 or 1.
it eq               ;If r3 is equal to 2, run through these instructions
addeq r4, r0, r1
moveq r0, r1
mov r1, r4
adds r3, r3, #1     ;Increases the counter by one
it gt               ;Similarly, if r3 is greater than 2, run through these instructions
addgt r4, r0, r1
movgt r0, r1
mov r1, r4
adds r3, r3, #1
I'm not entirely sure if that is how you do if-statements in assembly, but that will be a secondary concern for me at this point. What I am more interested in is how I can incorporate an if statement to test the initial condition where the 'counter' is compared to the 'range'. If counter < range, it should go into the main body of the code where the Fibonacci statement is iterated. It will then continue to loop until counter = range.
I am not sure how to do the following:
cmp r3, r2
;If r3 < r2
{
<code>
}
;else, stop
Also, in order for this to loop correctly, am I able to add:
cmp r3, r2
bne loop
So that the loop iterates until r3 = r2?
Thanks in advance :)
It's not wise to put if-statements inside a loop. Get rid of them.
An optimized (kinda) standalone Fibonacci function would look like this:
unsigned int fib(unsigned int n)
{
    unsigned int first = 0;
    unsigned int second = 1;
    unsigned int temp;

    if (n > 47) return 0xffffffff; // overflow check: fib(48) doesn't fit in 32 bits
    if (n < 2) return n;

    while (1)
    {
        n -= 1;
        if (n == 0) return second;
        temp = first + second;
        first = second;
        second = temp;
    }
}
Much like factorial, optimizing the Fibonacci sequence is somewhat pointless in real-world computing, because the values exceed the 32-bit barrier really soon: at n = 12 for factorial and n = 47 for Fibonacci.
If you really need them, you are best served with very short lookup tables.
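A sketch of that lookup-table idea (fib_table, init_fib_table, and fib_lookup are my names), keeping the same overflow convention as the function above:

#include <stdint.h>

static uint32_t fib_table[48];

static void init_fib_table(void)
{
    fib_table[0] = 0;
    fib_table[1] = 1;
    for (unsigned int i = 2; i < 48; i++)
        fib_table[i] = fib_table[i - 1] + fib_table[i - 2];   /* fib(47) still fits in 32 bits */
}

static uint32_t fib_lookup(unsigned int n)
{
    return (n > 47) ? 0xffffffffu : fib_table[n];
}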
If you need this function fully implemented for larger values:
https://www.nayuki.io/page/fast-fibonacci-algorithms
Last but not least, here is the function above in assembly:
cmp r0, #47   // r0 is n
movhi r0, #-1 // overflow check
bxhi lr
cmp r0, #2
bxlo lr       // return n as-is for n < 2
mov r2, r0    // r2 is the counter now
mov r1, #0    // r1 is first
mov r0, #1    // r0 is second
loop:
subs r2, r2, #1   // n -= 1
add r12, r0, r1   // temp = first + second
mov r1, r0        // first = second
bxeq lr           // return second when the counter reaches zero
mov r0, r12       // second = temp
b loop
Please note that the last bxeq lr could be placed immediately after the subs, which might seem more logical, but with the multiple-issue capability of the Cortex series in mind, it's better in this order.
It might not be exactly the answer you were looking for, but keep this in mind: a single if statement inside a loop can seriously cripple the performance, and a nested one even more so.
And there are almost always ways of avoiding them. You just have to look for them.
Conditionals compile to conditional jumps in almost every assembly language:
if (condition)
    ..iftrue..
else
    ..iffalse..

becomes

    eval condition
    conditional_jump_if_true truelabel
    ..iffalse..
    unconditional_jump endlabel
truelabel:
    ..iftrue..
endlabel:
or the other way around (exchange false and true).
ARM supports conditional execution to eliminate these jumps when compiling the innermost conditionals: http://www.davespace.co.uk/arm/introduction-to-arm/conditional.html
IT... is a Thumb-2 instruction: http://en.wikipedia.org/wiki/ARM_architecture#Thumb-2 to support unified assemblies. See http://www.keil.com/support/man/docs/armasm/armasm_BABJGFDD.htm for more details.
Your code for looping (cmp and bne) is fine.
In general, try to rewrite your code using gotos instead of loops and else parts.
An else can remain only at the deepest nesting level.
Then you can convert this semi-assembly code to actual assembly much more easily.
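As a sketch of that advice applied to your counter/range loop (the function and variable names are mine), the goto form maps almost one-to-one onto cmp/branch instructions:

unsigned int fib_goto(int range)
{
    unsigned int first = 0, second = 1, fibonacci = 0;
    int i = 0;
loop:
    if (i >= range) goto done;      /* cmp r3, r2 ; bge done */
    if (i <= 1) { fibonacci = i; goto next; }
    fibonacci = first + second;
    first = second;
    second = fibonacci;
next:
    i++;
    goto loop;                       /* b loop */
done:
    return fibonacci;
}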
HTH
