I wonder if my code can be more optimized - c

I'm student and taking Microprocessor class.
Because of COVID-19, I had gotten a simple mid-term assignment writing inline assembly code of QuickSort.
Today, I got a score of this assignment and the score was almost the lowest because of "operation speed". (scored by rank, that mean my code works much slower than anyone else in class)
I tried to optimize my code before submission. So I have no idea why does my code work slowly.
TA said it is common for 85 ms to come out, but my code has an execution speed of 400ms.
I guess my code load a lot of unnecessary memory.
So my questions are:
Do I load unnecessary memory? (ie. Is my code use ldr, str instruction in unnecessary way?)
Does the number of memory loads affect the actual operation time?
Adding comment on "optimization", We only learned that the access speed of memory is slower then the register in the class, So I only tried to minimize the memory load and reduce unnecessary construction.
That mean someone can use "awesome tricks" to solve this problem but it is not general in my class.
Development environment:
GNU C Compiller(GCC) in Arm cortex M0 processor and nano Editor
Here is my code:
void QuickSort(int* arr, int size, int pivot, int end) {
int Lsize, Lstart, Lend, Rsize, Rstart, Rend;
//Note that all variables are mean index of Array, excep for temp
//so in inline assembly, We have to change them into byte
asm
(
/*check Condition*/
"MOV r10, #2\n\t"
"CMP r1, r10\n\t"
"BLT FINISH\n\t"
/*Initialize*/
"MOV r10, #4\n\t" //r10 is temp value
//"LDR r1, %[size]\n\t" //r1 is size of arr
//"LDR r2, %[pivot]\n\t" //r2 is pivot's address(start)
"MUL r2, r2, r10\n\t"
//"LDR r3, %[end]\n\t" //r3 is end's address
"MUL r3, r3, r10\n\t"
"MOV r4, r2\n\t" //r4 is low's address
"ADD r4, r4, #4\n\t" //low = pivot +1
"MOV r5, r3\n\t" //r5 is high's address
/*Initialize for loop*/
"LDR r6, [r0, r4]\n\t" //let r6 as value of low
"LDR r7, [r0, r2]\n\t" //let r7 as value of pivot
"LDR r8, [r0, r5]\n\t" //let r8 as value of high
/*Start Loop*/
"B L6\n\t" //Check Condition First
"LOOP2:\n\t" //while(low <= high)
"B L7\n\t" //check condition First
"LOOP3:\n\t" //while(arr[low]<=arr[pivot])
"ADDS r4, r4, #4\n\t" //low++
"LDR r6, [r0,r4]\n\t" //and update low's value
"L7: CMP r6, r7\n\t"
"BLE LOOP3\n\t"
"B L8\n\t" //check condition First
"LOOP4:\n\t" //while(arr[high]>=arr[pivot])
"SUBS r5, r5, #4\n\t" //high--
"LDR r8, [r0,r5]\n\t" //and update high's value
"L8: CMP r8, r7\n\t"
"BGE LOOP4\n\t"
"CMP r5, r2\n\t" //if(high<pivot)
"BGE L9\n\t"
"MOVS r5, r2\n\t" //high = pivot
"L9:\n\t"
"CMP r4, r5\n\t" //if(low >= high)
"BGE END\n\t" //break LOOP2
"LDR r9, [r0, r4]\n\t" //store r9 value of low
"LDR r10, [r0, r5]\n\t" //store r10 value of high
"STR r9, [r0, r5]\n\t"
"STR r10, [r0, r4]\n\t"
"MOV r6, r10\n\t"
"MOV r8, r9\n\t"
"L6: CMP r4, r5\n\t" //Compare whether low <= high
"BLE LOOP2\n\t" //If so, back to loop2
"END: \n\t" //This is end of loop2
/*swap pivot and high*/
"LDR r9, [r0, r5]\n\t" //store r9 vlaue of high
"LDR r10, [r0, r2]\n\t" //store r10 value of pivot
"STR r9, [r0, r2]\n\t"
"STR r10, [r0,r5]\n\t"
/*Set variables again*/
//Note that we have to divide them in 4
//Use r9 as quotient and r10 as zero
"MOV r9, #0\n\t"
"MOV r10, #0\n\t"
"B D1\n\t"
"sLoop1:\n\t"
"ADD r9, r9, #1\n\t"
"SUB r2, r2, #4\n\t"
"D1: CMP r2, r10\n\t"
"BGT sLoop1\n\t"
"MOV r2, r9\n\t" //pivot
"MOV r9, #0\n\t"
"B D2\n\t"
"sLoop2:\n\t"
"ADD r9, r9, #1\n\t"
"SUB r3, r3, #4\n\t"
"D2: CMP r3, r10\n\t"
"BGT sLoop2\n\t"
"MOV r3, r9\n\t" //end
"MOV r9, #0\n\t"
"B D4\n\t"
"sLoop4:\n\t"
"ADD r9, r9, #1\n\t"
"SUB r5, r5, #4\n\t"
"D4: CMP r5, r10\n\t"
"BGT sLoop4\n\t"
"MOV r5, r9\n\t" //high
/*Let r10 a temp of argument*/
"SUB r10, r5, r2\n\t" //Lsize = high-pivot
"STR r10, %[Lsize]\n\t"
"STR r2, %[Lstart]\n\t" //Lstart = pivot
"SUB r10, r5, #1\n\t" //Lend=high-1
"STR r10, %[Lend]\n\t"
"SUB r10, r3, r5\n\t" //Rsize = end-high
"STR r10, %[Rsize]\n\t"
"ADD r10, r5, #1\n\t" //Rstart=high+1
"STR r10, %[Rstart]\n\t"
"STR r3, %[Rend]\n\t" //Rend = end
://There is no output operands
:[arr] "r"(arr), [size] "m"(size) ,[pivot] "m"(pivot),[end] "m"(end), [Lsize] "m"(Lsize), [Lstart] "m"(Lstart), [Lend] "m"(Lend), [Rsize] "m"(Rsize), [Rstart] "m"(Rstart), [Rend] "m"(Rend)
:"r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10"
);

Related

ARM64 Backtrace from link register

I am currently trying to get backtrace based on stack pointer and link register on ARM64 device using C program.
Below is example of objdump
bar() calls foo() with 240444: ebfffd68 bl 23f9ec <foo##Base>
I can get link register (lr) and from that getting 23f9ec, save it to backtrace list as last routine.
My question: From below assembly code with current lr 0023f9ec <foo##Base>:, how to calculate to get previous routine with lr is 0023fe14 <bar##Base> using C language?
here is my implementation, but getting wrong previous lr
int bt(void** backtrace, int max_size) {
unsigned long* sp = __get_SP();
unsigned long* ra = __get_LR();
int* funcbase = (int*)(int)&bt;
int spofft = (short)((*funcbase));
sp = (char*)sp-spofft;
unsigned long* wra = (unsigned long*)ra;
int spofft;
int depth = 0;
while(ra) {
wra = ra;
while((*wra >> 16) != 0xe92d) {
wra--;
}
if(wra == 0)
return 0;
spofft = (short)(*wra & 0xffff);
if(depth < max_size)
backtrace[depth] = ra;
else
break;
ra =(unsigned long *)((unsigned long)ra + spofft);
sp =(unsigned long *)((unsigned long)sp + spofft);
depth++;
}
return 1;
}
0023f9ec <foo##Base>:
23f9ec: e92d42f3 push {r0, r1, r4, r5, r6, r7, r9, lr}
23f9f0: e1a09001 mov r9, r1
23f9f4: e1a07000 mov r7, r0
23f9f8: ebfffff9 bl 23f9e4 <__get_SP##Base>
23f9fc: e59f4060 ldr r4, [pc, #96] ; 23fa64 <foo##Base+0x78>
23fa00: e08f4004 add r4, pc, r4
23fa04: e1a05000 mov r5, r0
23fa08: ebfffff3 bl 23f9dc <__get_LR##Base>
23fa0c: e59f3054 ldr r3, [pc, #84] ; 23fa68 <foo##Base+0x7c>
23fa10: e3002256 movw r2, #598 ; 0x256
23fa14: e59f1050 ldr r1, [pc, #80] ; 23fa6c <foo##Base+0x80>
23fa18: e7943003 ldr r3, [r4, r3]
23fa1c: e08f1001 add r1, pc, r1
23fa20: e5934000 ldr r4, [r3]
23fa24: e1a03005 mov r3, r5
23fa28: e6bf4074 sxth r4, r4
23fa2c: e58d4004 str r4, [sp, #4]
23fa30: e1a06000 mov r6, r0
23fa34: e58d0000 str r0, [sp]
23fa38: e59f0030 ldr r0, [pc, #48] ; 23fa70 <foo##Base+0x84>
23fa3c: e08f0000 add r0, pc, r0
23fa40: ebfd456d bl 190ffc <printf#plt>
23fa44: e1a03009 mov r3, r9
23fa48: e1a02007 mov r2, r7
23fa4c: e1a01006 mov r1, r6
23fa50: e0640005 rsb r0, r4, r5
23fa54: ebffff70 bl 23f81c <get_prev_sp_ra2##Base>
23fa58: e3a00000 mov r0, #0
23fa5c: e28dd008 add sp, sp, #8
23fa60: e8bd82f0 pop {r4, r5, r6, r7, r9, pc}
23fa64: 003d5be0 eorseq r5, sp, r0, ror #23
23fa68: 000026c8 andeq r2, r0, r8, asr #13
23fa6c: 002b7ba6 eoreq r7, fp, r6, lsr #23
23fa70: 002b73e5 eoreq r7, fp, r5, ror #7
0023fe14 <bar##Base>:
23fe14: e92d4ef0 push {r4, r5, r6, r7, r9, sl, fp, lr}
23fe18: e24dde16 sub sp, sp, #352 ; 0x160
23fe1c: e59f76a8 ldr r7, [pc, #1704] ; 2404cc <bar##Base+0x6b8>
23fe20: e1a04000 mov r4, r0
23fe24: e59f66a4 ldr r6, [pc, #1700] ; 2404d0 <bar##Base+0x6bc>
23fe28: e1a03000 mov r3, r0
23fe2c: e59f26a0 ldr r2, [pc, #1696] ; 2404d4 <bar##Base+0x6c0>
23fe30: e08f7007 add r7, pc, r7
23fe34: e08f6006 add r6, pc, r6
23fe38: e3a00000 mov r0, #0
23fe3c: e08f2002 add r2, pc, r2
23fe40: e1a05001 mov r5, r1
23fe44: e3a01003 mov r1, #3
23fe48: e59f9688 ldr r9, [pc, #1672] ; 2404d8 <bar##Base+0x6c4>
.....................................................................
24043c: e3a0100f mov r1, #15
240440: e1a0000a mov r0, sl
240444: ebfffd68 bl 23f9ec <foo##Base>
240448: e59f2108 ldr r2, [pc, #264] ; 240558 <bar##Base+0x744>
24044c: e3a01003 mov r1, #3
240450: e08f2002 add r2, pc, r2
240454: e1a05000 mov r5, r0
240458: e1a03000 mov r3, r0
24045c: e3a00000 mov r0, #0
I don't think there's an easy way to do this.
Normally the register ABI of any operating system contains a "frame pointer" register. For example, on Apple's armv7 ABI, this is r7:
0x10006fc0 b0b5 push {r4, r5, r7, lr}
0x10006fc2 02af add r7, sp, 8
0x10006fc4 0448 ldr r0, [0x10006fd8]
0x10006fc6 d0e90c45 ldrd r4, r5, [r0, 0x30]
0x10006fca 0020 movs r0, 0
0x10006fcc fff7a6ff bl 0x10006f1c
0x10006fd0 0019 adds r0, r0, r4
0x10006fd2 6941 adcs r1, r5
0x10006fd4 b0bd pop {r4, r5, r7, pc}
If you dereference r7 there, you get to a pair of pointers, the second of which is lr, and the first of which is the r7 of the calling function, allowing you to repeat this process until you reach the bottom of the stack.
Judging by the assembly you posted, the codebase you're looking at doesn't have that. This means that the only way to obtain the return address is the same way that the code itself does: step forward through each instruction and parse/interpret them until you reach something that loads into pc. This is of course imperfect, since there may be functions in your call stack that do not ever return, but there's not much you can do about that.
It may be tempting to search backwards instead, and while you can do a heuristic approach and probably reach quite reasonable results with it, that is even less reliable than searching forward, since you have absolutely no way of telling whether you arrived at address X by stepping forward from the previous instruction or by explicitly jumping there from somewhere else.

Quick sort using ARM assembly - segmentation error

I'm trying to make a Quick Sort function using ARM assembly (Raspberry pi),
but it shows me segmentation error.
I think recursion process makes that error, while storing or loading with stacks.
Can you tell me how can I fix it?
I used ARM assembly code in https://en.wikibooks.org/wiki/Algorithm_Implementation/Sorting/Quicksort#ARM_Assembly
here,
I just typed it same. Just changing registers like 'r3'->'r2', 'r2'->'r1', 'r1'->'r0' ...
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE 32
int main()
{
int arr[SIZE];
int max, min;
int i;
for (i = 0; i < SIZE; i++) {
arr[i] = rand() % 100;
}
asm(
"mov r0, #0\n\t"
"mov r1, #128\n\t"
"Loop3:\n\t"
"stmfd sp!, {r3, r5, lr}\n\t"
"mov r5, r1\n\t"
"Loop4:\n\t"
"sub r6, r5, r0\n\t"
"cmp r6, #4\n\t"
"ldmlefd sp!, {r3, r5, pc}\n\t"
"ldr r6, [%[arr],r0]\n\t"
"add r1, r0, #4\n\t"
"mov r3, r5\n\t"
"Loop5:\n\t"
"ldr r2, [%[arr],r1]\n\t"
"cmp r2, r6\n\t"
"addle r1, r1, #4\n\t"
"ble Loop6\n\t"
"sub r3, r3, #4\n\t"
"ldr r4, [%[arr],r3]\n\t"
"str r4, [%[arr],r1]\n\t"
"str r2, [%[arr],r3]\n\t"
"Loop6:\n\t"
"cmp r1, r3\n\t"
"blt Loop5\n\t"
"Loop7:\n\t"
"sub r1, r1, #4\n\t"
"ldr r2, [%[arr],r1]\n\t"
"str r2, [%[arr],r0]\n\t"
"str r6, [%[arr],r1]\n\t"
"bl Loop3\n\t"
"mov r0, r3\n\t"
"b Loop4\n\t"
:
:
[arr] "r"(arr)
:
"r0", "r1", "r2", "r3", "r4", "r5", "r6"
);
return 0;
}
You inline asm can never reach the end of the asm template. Presumably you're trying to return out of the C function, not just the internal recursive calls. That's obviously unsafe because there's zero guarantee about stack layout or the contents of LR, and that will change with/without optimization.
Don't write a whole recursive in the middle of a C function.
Use a debugger to single-step the resulting program and see where your code breaks the compiler-generated asm that surrounds it.
Also your inline asm is broken: you dereference arr without specifying it as a memory read/write input or a "memory" clobber. A pointer input does not imply that the pointed-to memory is also an operand.

Debugging ARM Assembly Context Switch

I'm working on a context switch in ARM v6 assembly. I posted about writing the switch in C, but assembly seems to be safer and more reliable. I've spent a while checking all the offsets and being careful not to delete data from registers, but the context switch just doesn't seem to work properly. I have set up and tested timer interrupts without switching context.
Here's my code:
interrupt_asm:
//store basic interrupt stuff
sub lr, lr, #4
//call the interrupt vector
push { r0-r12 }
mov r0, lr # Pass old pc
bl interrupt_vector # C function
pop { r0-r12 }
# save_current_thread:
//remember r1 so you can use it for r0
push {r1}
mov r1, r0 //store r0 so it can be restored
push {r2, r3}
bl get_current_thread //r0 now has the address of CURRENT_THREAD
pop {r2, r3}
add r0, r0, #4 // r0 = &CURRENT_THREAD.r0
str r1, [r0] // save what the r0 was
pop {r1} // restore r1
add r0, r0, #4 // r0 = &CURRENT_THREAD.r1
str r1, [r0] // save r1
//r2
add r0, r0, #4
str r2, [r0]
//r3
add r0, r0, #4
str r3, [r0]
//r4
add r0, r0, #4
str r4, [r0]
//r5
add r0, r0, #4
str r5, [r0]
//r6
add r0, r0, #4
str r6, [r0]
//r7
add r0, r0, #4
str r7, [r0]
//r8
add r0, r0, #4
str r8, [r0]
//r9
add r0, r0, #4
str r9, [r0]
//r10
add r0, r0, #4
str r10, [r0]
//r11
add r0, r0, #4
str r11, [r0]
//r12
add r0, r0, #4
str r12, [r0]
//store SVC sp, lr, and pc
mrs r1, cpsr
bic r1, r1, #0x1F
orr r1, r1, #0x13
msr cpsr_c, r1
//sp
add r0, r0, #4
str sp, [r0]
//lr
add r0, r0, #4
str lr, [r0]
//back to IRQ land
mrs r1, cpsr
bic r1, r1, #0x1F
orr r1, r1, #0x12
msr cpsr_c, r1
//pc THIS NEEDS TO BE LR
add r0, r0, #4
str lr, [r0]
//cpsr
add r0, r0, #4
mrs r1, cpsr
str r1, [r0]
//spsr
add r0, r0, #4
mrs r1, spsr
str r1, [r0]
push {r2, r3}
bl increment_thread //r0 now has the address of our next thread
pop {r2, r3}
# restore_thread:
//were pushing in order, so from the bottom up our stack is: r0, ... r12, sp, lr, pc, spsr
add r0, r0, #4 //r0 = &CURRENT_THREAD.r0
ldr r1, [r0] //r1 = thread.r0
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r1
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r2
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r3
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r4
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r5
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r6
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r7
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r8
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r9
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r10
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r11
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r12
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.sp
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.lr
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.pc
push {r1}
add r0, r0, #8 //skip cpsr - agreed
ldr r1, [r0] //r1 = thread.spsr
push {r1}
//Our stack now looks like spsr, pc, lr, sp, r12, ... r0 (in order of popping)
pop {r1} //this was the spsr - SPSR
pop {lr} //this was the pc - PC (from thread)
pop {r2} //this was the lr - LR (from thread)
pop {r3} //this was the sp - SP (from thread)
//switch to SVC
mrs r0, cpsr
bic r0, r0, #0x1F
orr r0, r0, #0x13
msr cpsr_c, r0
msr spsr, r1 //restore spsr
mov lr, r2 //restore lr to be old lr
mov sp, r3 //restore sp
//switch to IRQ
// mrs r0, cpsr
// bic r0, r0, #0x1F
// orr r0, r0, #0x12
// msr cpsr_c, r0
cps #0x12
//our stack now just has the normal registers in it. Restore them
pop {r12}
pop {r11}
pop {r10}
pop {r9}
pop {r8}
pop {r7}
pop {r6}
pop {r5}
pop {r4}
pop {r3}
pop {r2}
pop {r2}
pop {r1}
pop {r0}
push {lr}
ldm sp!, {pc}^
A thread looks like this:
typedef struct __attribute__((packed, aligned(8))) {
void (*run)();
unsigned r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, sp, lr, pc;
unsigned cpsr, spsr;
unsigned id;
unsigned priority;
thread_status status;
wait_event wait_status;
} thread_t;
Do you have any advice about what's going on? An interrupt occurs and it never goes back to a new thread. I've debugged with GDB simulator but can't seem to nail down the issue.
The C function interrupt_vector just does this:
void interrupt_vector(unsigned pc) {
CURRENT_THREAD.pc = pc;
printf(" interrupt vector (pc = 0x%08x | thread.r0 = 0x%08x)\n", pc, CURRENT_THREAD.r0);
armtimer_clear_interrupt();
}
My other C functions are literally one line, and I've looked at their disassembly:
void increment_thread(){
// Not trying to actually increment yet
__asm__ volatile("mov r0, %0" : : "r" ((unsigned) &CURRENT_THREAD));
}
void get_current_thread(){
__asm__ volatile("mov r0, %0" : : "r" ((unsigned) &CURRENT_THREAD));
}

Symmetric difference in ARM

I am trying to make a program on ARM that finds the symmetric difference between two sets and stores it in a separate set and I am not sure what I am doing wrong, can somebody help?
Here is what I have:
start
LDR R0, = ASize ;load number of elements in A
LDR R0, [R0]
LDR R1, = BSize ;load number of elements in B
LDR R1, [R1]
LDR R2, = CSize ;load number of elements in C
LDR R2, [R2]
LDR R3, = AElems ;load elements in A
LDR R4, = BElems ;load elements in B
LDR R5, = CElems ;load elements in C
LDR R8, = '?'
while
CMP R0,#0
BEQ endwh
while2
CMP R1,#0
BEQ endwh
LDR R6, [R3]
LDR R7, [R4]
CMP R6,R7
BEQ endwh
STR R7, [R5]
ADD R2,R2,#1
ADD R4,R4,#4
STR R8, [R4]
SUB R1,R1,#1
B while
STR R6, [R5]
ADD R2,R2,#1
STR R8, [R3]
SUB R0,R0,#1
ADD R3,R3,#4
B while2
endwh
stop B stop
I managed to figure it out thanks for the help!
here is the solution I came up with
start
LDR R0, = ASize
LDR R0, [R0]
LDR R1, = BSize
LDR R1, [R1]
LDR R2, = CSize
LDR R2, [R2]
LDR R3, = AElems
LDR R4, = BElems
LDR R5, = CElems
LDR R8, = '?'
while
CMP R0,#0
BEQ endwh1
LDR R6, [R3]
LDR R7, [R4]
CMP R6,R7
BNE endwh2
STR R8, [R3]
STR R8, [R4]
ADD R3,R3,#4
SUB R0,R0,#1
B while
endwh2
CMP R1,#0
BEQ endwh3
ADD R4,R4,#4
SUB R1,R1,#1
B while
endwh3
STR R6, [R5]
ADD R5,R5,#4
ADD R2,R2,#1
ADD R3,R3,#4
SUB R0,R0,#1
LDR R4, = BElems
LDR R1, = BSize
LDR R1, [R1]
B while
endwh1
while2
CMP R1,#0
BEQ endwh
LDR R7, [R4]
CMP R7,#'?'
BEQ endwh4
STR R7, [R5]
ADD R2,R2,#1
ADD R5,R5,#4
ADD R4,R4,#4
SUB R1,R1,#1
B while2
endwh4
ADD R4,R4,#4
SUB R1,R1,#1
B while2
endwh
stop B stop

simple ADD/ADC ARM assemlby fails

I have the following C and ASM version of the (supposedly) same code. What it does is load 2 128bit ints represented by 2 64bit ints each to registers (first 4*lower 32bit, then 4*higher 32bit) and ADD/ADC to them. It is simple enough code and the ARM/ST manuals actually give the same example with 96bit (3 ADD/ADCs).
For simple calls both versions work (repeatedly adding (1 << x++) or 1..x). But for the longer testsuite the ARM assembly fails (board hangs). ATM I have no ability to trap/debug that and cannot use any printf() or the likes to find the test failing, which is irrelevant anyways, because there must be some basic fault in the ASM version, as the C version works as expected.
I don't get it, it's simple enough and very close to the C assembly output (sans branching). I tried the "memory" constraint (shouldn't be needed), I tried saving the carry between lower and upper 64bit in a register and adding that later, using ADD(C).W, alignment, using two LDR/STR instead of LDRD/STRD, etc.. I assume the board faults because some addition goes wrong and results in a divide by 0 or something like that.
The GCC ASM is below and uses similar basic technique, so I don't see the problem.
I'm really just looking for the fastest way to do the add, not to fix that code specifically. It's a shame you have to use constant register names because there is no constraint for specifying rX and rX+1. Also it's impossible to use as many registers as GCC as it will run out of them during compilation.
typedef struct I128 {
int64_t high;
uint64_t low;
} I128;
I128 I128add(I128 a, const I128 b) {
#if defined(USEASM) && defined(ARMx)
__asm(
"LDRD %%r2, %%r3, %[alo]\n"
"LDRD %%r4, %%r5, %[blo]\n"
"ADDS %%r2, %%r2, %%r4\n"
"ADCS %%r3, %%r3, %%r5\n"
"STRD %%r2, %%r3, %[alo]\n"
"LDRD %%r2, %%r3, %[ahi]\n"
"LDRD %%r4, %%r5, %[bhi]\n"
"ADCS %%r2, %%r2, %%r4\n"
"ADC %%r3, %%r3, %%r5\n"
"STRD %%r2, %%r3, %[ahi]\n"
: [alo] "+m" (a.low), [ahi] "+m" (a.high)
: [blo] "m" (b.low), [bhi] "m" (b.high)
: "r2", "r3", "r4", "r5", "cc"
);
return a;
#else
// faster to use temp than saving low and adding to a directly
I128 r = {a.high + b.high, a.low + b.low};
// check for overflow of low 64 bits, add carry to high
// avoid conditionals
//r.high += r.low < a.low || r.low < b.low;
// actually gcc produces faster code with conditionals
if(r.low < a.low || r.low < b.low) ++r.high;
return r;
}
GCC C version using " armv7m-none-eabi-gcc-4.7.2 -O3 -ggdb -fomit-frame-pointer -falign-functions=16 -std=gnu99 -march=armv7e-m":
b082 sub sp, #8
e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
a908 add r1, sp, #32
e881 000c stmia.w r1, {r2, r3}
e9dd 890e ldrd r8, r9, [sp, #56] ; 0x38
e9dd 670a ldrd r6, r7, [sp, #40] ; 0x28
e9dd 2308 ldrd r2, r3, [sp, #32]
e9dd 450c ldrd r4, r5, [sp, #48] ; 0x30
eb16 0a08 adds.w sl, r6, r8
eb47 0b09 adc.w fp, r7, r9
1912 adds r2, r2, r4
eb43 0305 adc.w r3, r3, r5
45bb cmp fp, r7
bf08 it eq
45b2 cmpeq sl, r6
d303 bcc.n 8012c9a <I128add+0x3a>
45cb cmp fp, r9
bf08 it eq
45c2 cmpeq sl, r8
d204 bcs.n 8012ca4 <I128add+0x44>
2401 movs r4, #1
2500 movs r5, #0
1912 adds r2, r2, r4
eb43 0305 adc.w r3, r3, r5
e9c0 2300 strd r2, r3, [r0]
e9c0 ab02 strd sl, fp, [r0, #8]
e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
b002 add sp, #8
4770 bx lr
My ASM version that fails:
b082 sub sp, #8
b430 push {r4, r5}
a902 add r1, sp, #8
e881 000c stmia.w r1, {r2, r3}
e9dd 2304 ldrd r2, r3, [sp, #16]
e9dd 4508 ldrd r4, r5, [sp, #32]
1912 adds r2, r2, r4
416b adcs r3, r5
e9cd 2304 strd r2, r3, [sp, #16]
e9dd 2302 ldrd r2, r3, [sp, #8]
e9dd 4506 ldrd r4, r5, [sp, #24]
4162 adcs r2, r4
eb43 0305 adc.w r3, r3, r5
e9cd 2302 strd r2, r3, [sp, #8]
4604 mov r4, r0
c90f ldmia r1, {r0, r1, r2, r3}
e884 000f stmia.w r4, {r0, r1, r2, r3}
4620 mov r0, r4
bc30 pop {r4, r5}
b002 add sp, #8
4770 bx lr
I am not getting a hang from your code, but it isnt working either, not sure why. But it was very easy to patch the compiler generated code to handle the carry:
I128 I128add(I128 a, const I128 b) {
I128 r = {a.high + b.high, a.low + b.low};
return r;
}
becomes
000001e4 <I128add>:
1e4: b082 sub sp, #8
1e6: b4f0 push {r4, r5, r6, r7}
1e8: e9dd 4506 ldrd r4, r5, [sp, #24]
1ec: a904 add r1, sp, #16
1ee: e881 000c stmia.w r1, {r2, r3}
1f2: e9dd 230a ldrd r2, r3, [sp, #40] ; 0x28
1f6: 1912 adds r2, r2, r4
1f8: eb43 0305 adc.w r3, r3, r5
1fc: e9dd 6704 ldrd r6, r7, [sp, #16]
200: e9dd 4508 ldrd r4, r5, [sp, #32]
204: 1936 adds r6, r6, r4
206: eb47 0705 adc.w r7, r7, r5
20a: e9c0 6700 strd r6, r7, [r0]
20e: e9c0 2302 strd r2, r3, [r0, #8]
212: bcf0 pop {r4, r5, r6, r7}
214: b002 add sp, #8
216: 4770 bx lr
fixed up the adds
.thumb_func
.globl test2
test2:
sub sp, #8
push {r4, r5, r6, r7}
ldrd r4, r5, [sp, #24]
add r1, sp, #16
stmia r1, {r2, r3}
ldrd r2, r3, [sp, #40]
add r2, r4
adc r3, r5
ldrd r6, r7, [sp, #16]
ldrd r4, r5, [sp, #32]
adc r6, r4
adc r7, r5
strd r6, r7, [r0]
strd r2, r3, [r0, #8]
pop {r4, r5, r6, r7}
add sp, #8
bx lr
final result
00000024 <test2>:
24: b082 sub sp, #8
26: b4f0 push {r4, r5, r6, r7}
28: e9dd 4506 ldrd r4, r5, [sp, #24]
2c: a904 add r1, sp, #16
2e: c10c stmia r1!, {r2, r3}
30: e9dd 230a ldrd r2, r3, [sp, #40] ; 0x28
34: 1912 adds r2, r2, r4
36: 416b adcs r3, r5
38: e9dd 6704 ldrd r6, r7, [sp, #16]
3c: e9dd 4508 ldrd r4, r5, [sp, #32]
40: 4166 adcs r6, r4
42: 416f adcs r7, r5
44: e9c0 6700 strd r6, r7, [r0]
48: e9c0 2302 strd r2, r3, [r0, #8]
4c: bcf0 pop {r4, r5, r6, r7}
4e: b002 add sp, #8
50: 4770 bx lr
Notice the fewer number of thumb2 instructions, unless you are on a cortex-A that has thumb2 support, those fetches from flash (cortex-m) are (can be) slow. I see you are trying to save the push and pop of two more registers but you cost yourself more fetches. You could take the above and still re-arrange the loads and stores and save those two registers.
minimal testing so far. printfs show the upper words adding, where I didnt see that with your code. I am stilll trying to unwind the calling convention (please document your code more for us), looks like r0 is prepped by the caller to place the result, rest is on the stack. I am using a stellaris launchpad (cortex-m4).

Resources