Related
I'm currently having issues translating a C program to ARM assembly. The C program is as follows:
/* Loop state is kept in globals so that it maps one-to-one onto data words. */
int i = 1;
int j = 0;
int x = 0;

/* Adds i + j to x for every odd i below 10 and every j with i <= j < 10,
   then returns the accumulated x. */
int main(){
    while (i < 10) {
        j = i;
        while (j < 10) {
            x += i + j;
            j = j + 1;
        }
        i = i + 2;
    }
    return x;
}
This code will output 240.
What I have currently is as follows:
# Attempted translation of the C loop nest. Register roles: r2 = i, r3 = j, r1 = x.
# The variables are loaded once and never stored back; the sum is returned in r0.
.data
i: .word 1
j: .word 0
x: .word 0
.text
.global main
main:
LDR r6, addrJ # r6 = &j (read from the literal word at addrJ)
LDR r5, addrI # r5 = &i
LDR r4, addrX # r4 = &x
LDR r3, [r6] # r3 = j (overwritten by "j = i" before it is read)
LDR r2, [r5] # r2 = i
LDR r1, [r4] # r1 = x
b loop_outer
loop_outer:
CMP r2, #10 # i < 10 ?
BGE done # no: leave the outer loop
MOV r3, r2 # j = i
loop_inner:
CMP r3, #10 # j < 10
BGE inner_done
ADD r1, r1, r2 # x+=i
ADD r1, r1, r3 # x+=j
ADD r3, r3, #1 # j++
# NOTE(review): nothing branches back to loop_inner here, so execution falls
# through into inner_done and the inner body runs at most once per outer pass.
inner_done:
ADD r2, r2, #2 # i+=2
b loop_outer
b done
# NOTE(review): the "b done" above follows an unconditional branch and is never executed.
done:
MOV r0, r1 # return value = x
bx lr
# Literal words holding the addresses of the three variables.
addrI: .word i
addrX: .word x
addrJ: .word j
This code currently outputs 50. I have tried debugging myself but I have been having a hard time with GDB.
You're missing the b loop_inner to repeat the inner loop.
And b done is not needed, since it's after the unconditional b loop_outer, so it will never be executed.
# Corrected loop nest. Register roles: r1 = x, r2 = i, r3 = j.
loop_outer:
CMP r2, #10 # i < 10 ?
BGE done # no: every outer iteration is finished
MOV r3, r2 # j = i
loop_inner:
CMP r3, #10 # j < 10
BGE inner_done
ADD r1, r1, r2 # x+=i
ADD r1, r1, r3 # x+=j
ADD r3, r3, #1 # j++
b loop_inner # repeat the inner loop until j reaches 10
inner_done:
ADD r2, r2, #2 # i+=2
b loop_outer
done:
MOV r0, r1 # return x in r0
bx lr
I am trying to implement insertion sort in ARMv8-A assembly. More specifically, I tried to translate the following C code:
/* Sorts the first n elements of arr into ascending order.
   The first n-1 elements are sorted recursively, then arr[n-1] is inserted
   into its place among them. */
void insertionSortRecursive(int arr[], int n)
{
    /* Zero or one element is already sorted. */
    if (n > 1) {
        int key;
        int slot;

        insertionSortRecursive(arr, n - 1);

        key = arr[n - 1];
        /* slot is the index where key will land; shift larger elements right. */
        for (slot = n - 1; slot > 0 && arr[slot - 1] > key; slot--) {
            arr[slot] = arr[slot - 1];
        }
        arr[slot] = key;
    }
}
I have tried to translate it as it is, but my implementation gets stuck in an infinite loop at the label loop_insertion, as the debugger shows:
// Reads four integers from test1.txt into my_Table, calls insertion_sort on them, then prints them.
.data
my_Table: .space 16 // 16 bytes: room for four 32-bit integers
size: .word 4 // NOTE(review): not referenced by the code shown; the count 4 is hard-coded instead
FileOpenMode: .string "r"
FileName: .string "test1.txt"
fscanf_str: .string "%d"
printf_str: .string "%d " // NOTE(review): not referenced by the code shown (out_message_str is used)
out_message_str: .string "%d "
.text
.global main
main:
stp x29,x30,[sp,-32]! // open a 32-byte frame; FP/LR are saved at its base
add x29,sp,0 // x29 = frame pointer
// NOTE(review): x20-x23 are callee-saved under AAPCS64 but are overwritten below without being saved.
adr x1,FileOpenMode
adr x0,FileName
bl fopen // fopen(FileName, FileOpenMode)
mov x20,x0 // x20 = fopen result (not checked for NULL)
adr x0,my_Table
mov x22,x0 //x22 adr of table
mov x21,4 // x21 = number of values still to read
mov x23,0 // x23 = byte offset of the next table slot
//**************** READ FROM FILE ******************
loop:
add x2,x29,28 // x2 = address of a 4-byte scratch slot inside the frame
adr x1,fscanf_str
mov x0,x20
bl fscanf // fscanf(file, "%d", scratch slot)
ldr w1,[x29,28] // w1 = the value just read
mov x0,x22
str w1,[x0,x23] // store it at table + x23
add x23,x23,4 // advance to the next slot
add w21,w21,-1
cbnz w21,loop // repeat until four values have been stored
//********************************************
mov x0,x22 //adr of table
mov x1,4 // n = 4
bl insertion_sort
//**************** PRINT TO SCREEN FROM TABLE *****************
mov x21,0 // x21 = byte offset of the element to print
mov x23,4 // x23 = number of elements still to print
loop_print:
adr x0, out_message_str
ldr w1,[x22,x21] // w1 = element at table + x21
bl printf // printf("%d ", element)
add x21,x21,4
sub x23,x23,1
cbnz x23,loop_print
//***********************************************************
ldp x29, x30, [sp], 32 // restore FP/LR and release the frame
ret
// insertion_sort: recursive insertion sort. In: x0 = table address, w1 = n.
// Frame slots: [x29,16] = table address, [x29,24] = n, [x29,32] = saved x19.
insertion_sort:
stp x29,x30,[sp,-64]! // open a 64-byte frame; FP/LR are saved at its base
add x29,sp,0
str x19,[x29,32] //save callee-saved x19 (used below as the element size)
str x0,[x29,16] //save the address of the table
str w1,[x29,24] //save n
mov x19,4 // x19 = 4, bytes per element
cmp w1,1
b.le exit_ins // n <= 1: nothing to sort
sub w1,w1,1
bl insertion_sort // sort the first n-1 elements (x0 still holds the table address)
ldr w9,[x29,24] //reload this call's n
sub w9,w9,1 //n-1
mul w9,w9,w19 // byte offset of arr[n-1]
ldr x10,[x29,16] //adr table
ldr w11,[x10,x9] //last
udiv w9,w9,w19 // back to the index n-1
sub w12,w9,1 //j=n-2
loop_insertion:
// NOTE(review): this reload replaces the j computed above with the contents of the n slot,
// so the first pass starts from n rather than n-2.
ldr w12,[x29,24]
cmp w12,0
b.lt label1 // j < 0: leave the loop (w12 is still an index on this path)
mul w12,w12,w19 // w12 = byte offset of arr[j]
ldr w13,[x10,x12] // w13=arr[j]
cmp w13,w11
b.le label1 // arr[j] <= last: leave the loop (w12 is a byte offset on this path)
add w12,w12,w19 // w12 = byte offset of arr[j+1]
str w13,[x10,x12] //arr[j+1]=w13
udiv w12,w12,w19 // w12 = j+1
sub w12,w12,1 // w12 = j again, so the index has not decreased
str w12,[x29,24] // NOTE(review): the unchanged j is written back, so the loop never counts down
b loop_insertion
label1:
// NOTE(review): w12 is an index on one path into label1 and a byte offset on the other.
add w12,w12,1
mul w12,w12,w19
str w11,[x10,x12] // intended as arr[j+1] = last
exit_ins:
ldr x19,[x29,32] //restore the caller's x19
ldp x29, x30, [sp], 64
ret
I did some modifications such as loading and storing the value of n-2 inside the insertion_loop function but that does not make any change.
You need to comment your code better, especially if you want others to help.
I am guessing that instead of preserving j in w12 you are doing calculations with it and then try to get the original value back but fail. Since you have done add w12,w12,w19 to get arr[j+1], the value of w12 after udiv w12,w12,w19 will be j+1 and when you subtract one from that you end up with j again, hence the infinite loop. You have tons of registers, just use a different one for the j+1.
You should have been able to see this in your debugger.
I'm trying to translate a simple insertion sort algorithm to assembly, but something about this particular configuration is causing the program to get an invalid pointer error.
Here's the C version that I'm using:
int n, array[100], c, d, t;
/* Insertion sort by adjacent swaps: each array[c] is moved left until it is in order. */
/* NOTE(review): with `c < n - 1` the outer loop never starts from index n - 1. */
for (c = 1; c < n - 1; c++) {
d = c;
/* Swap array[d] with its left neighbour while the pair is out of order. */
while (d > 0 && array[d] < array[d - 1]) {
t = array[d];
array[d] = array[d - 1];
array[d - 1] = t;
d--;
}
}
This is a C struct that is being used:
typedef struct {
int *list; /* pointer to int storage; presumably the elements to sort - TODO confirm */
int size; /* presumably the current element count - TODO confirm */
int maxSize; /* presumably the allocated capacity - TODO confirm */
} list;
Here is my assembly file:
.syntax unified
.text
# NOTE(review): on ARM targets GNU as treats the .align operand as a power of two,
# so this requests 256-byte alignment - confirm that is intended.
.align 8
.global insert_ARM
.func insert_ARM, insert_ARM
.type insert_ARM, %function
# insert_ARM: insertion sort by adjacent swaps. In: r0 = pointer to the struct.
# Registers: r4 = n-1, r5 = c, r6 = byte offset of array[d] from r0,
# r10 = byte offset of array[d-1] from r0, r8/r9 = the two elements being compared.
insert_ARM:
push {r4-r11, ip, lr}
# setup
ldr r4, [r0, #4] # r4 = word at offset 4 of the struct (presumably the size field - confirm layout)
sub r4, r4, 1 # r4 = n-1
mov r5, #1 # c=1
mov r6, #16 # d=0, which starts at #16
mov r7, #0 # t=0 (r7 is not used again in this routine)
for:
# d = c ; needs these lines to do the assembly equivalent, which is * 4.
mov r6, r5 # d = c
LSL r6, #2 # uses logical shift left: multiplies r6 by 4 to get the correct index
add r6, r6, 16 # add 16 because that's where the array starts
while:
# condition 1: d > 0
# NOTE(review): r6 already includes the +16 base, so index 0 corresponds to 16, not 0;
# comparing against #0 lets the loop keep walking below the start of the array.
cmp r6, #0 # if d <= 0, get out of there
ble forLoopStatements
# condition 2: array[d] < array[d-1]
# first, I need to define array[d] and array[d-1]
# r8 = array[d] and r9 = array[d-1]
sub r10, r6, #4 # r10 = d-1
ldr r9, [r0, r10] # r9 = array[d-1]
ldr r8, [r0, r6] # r8 = array[d]
cmp r9, r8 # comparing array[d-1] with array[d]
# NOTE(review): bge leaves the loop when array[d-1] >= array[d], which is the case in which
# the C loop keeps swapping - the sense looks inverted; confirm.
bge forLoopStatements # leave the loop if array[d-1] >= array[d]
# while effects
# note that r8 should still be array[d] here.
str r9, [r0, r6] # array[d] = array[d-1]
str r8, [r0, r10] # array[d-1] = t # BUG HERE.
sub r6, r6, #4 # d--; // does -4 for ARM
bal while # repeat loop
forLoopStatements:
# (c<n-1; c++)
add r5, r5, #1 # c++
cmp r5, r4 # compares c with n-1
blt for # if c < n-1, loop again
end:
mov r0, r10 # return whatever offset is left in r10
pop {r4-r11, ip, lr}
BX lr
.endfunc
.end
It seems to be
str r8, [r0, r10] # array[d-1] = t
that causes a trip at some point.
Edit: I found out that r8's numbers during this instruction are somehow incorrect, since immediately using something like
mov r8, #4
before the store prevents the error (but of course makes the results incorrect).
Upon examining the contents of r0, it happens that the update is going off range because other members of the struct are being modified in the process. Array index 0 is at +16.
You found the problem in the translation to assembly. Note however the following problems:
The outer loop should run all the way to c < n instead of c < n - 1. As coded, the last element of the array is never moved.
it would be more readable to use 2 nested for loops:
int n, array[100], c, d, t;
/* Insertion sort over all n elements: c runs from 1 up to n - 1 inclusive. */
for (c = 1; c < n; c++) {
/* Move array[c] left by adjacent swaps while it is smaller than its left neighbour. */
for (d = c; d > 0 && array[d] < array[d - 1]; d--) {
t = array[d];
array[d] = array[d - 1];
array[d - 1] = t;
}
}
Everyone has a different approach to writing code. Mine is different from yours, but I would like to share my ideas. I would start as simply as possible to get something working and build from there. Here is some sample code for a for loop.
/* forloop.s */
/* Skeleton of the outer for loop of the insertion sort; the while loop is only sketched.
Note that c and n are kept as byte offsets (4 bytes per word), not as element indices.
int n, array[100], c, d, t;
for (c=1; c<n-1; c++)
address of array = r0 = .word ( Raspbian Jessie = 32 bits )
n = r4 = array size
c = r5 = 1word = 4memory_bytes = index into array
d = r6 = c = address in array
array[d] = r10 = data
*/
.data
.balign 4
array:
.word 6, 3, 7, 8, 5, 2, 1, 9, 4
size:
.word (size - array)
.text
.global main
main:
push {r4-r12, lr} # save registers for OS
ldr r0, =array # load address of array in r0
ldr r4, =size # load address of size in r4
ldr r4, [r4] # load size in bytes (size - array) in r4
sub r4, #4 # subtract 1 word from r4: r4 = byte offset of the last element
mov r5, #4 # move 4 in r5 (c = 1 word = 4 bytes, i.e. element 1)
for: # (c=1; c<n-1; c++)
add r6, r0, r5 # d (r6) = array address (r0) + c = address of array[d]
# while: # while loop would go here
ldr r10, [r6], #-4 # r10 = array[d], then r6 -= 4 (post-indexed)
ldr r11, [r6] # r11 = array[d-1]
#... # while code
cmp r0, r6 # is d > 0 ... (the result is not used in this skeleton)
#... #continue while loop code
# back to forloop code
cmp r5, r4 # compare c (r5) with the last element's offset (r4)
add r5, #4 # add 1 word to r5 (c++); add without an s suffix, so the cmp flags are kept
blt for # loop again if the old c was below r4
end:
mov r0, #0 # set exit code
pop {r4-r12, lr} # restore environment for return to OS
bx lr # return to OS
Assemble and link the code, then run it and check the exit status.
as -o forloop.o forloop.s
gcc -o forloop forloop.o
./forloop; echo $?
It works for me on the Raspberry Pi. I don't know much about gdb, but this may help as suggested by Jester. (See middle section "Commands" at http://cs107e.github.io/guides/gdb/ for more information.)
pi#RPi0:~/pgm/Asm $ gdb -tui forloop # Text User Interface
---Type <return> to continue, or q <return> to quit--- [Enter]
(gdb) layout asm
(gdb) start # start is required
(gdb) layout reg
(gdb) Ctl-x o # Selects registers as Up & Down arrow to see all
(gdb) si # single step
(gdb) [Enter] # repeat single step
(gdb) run # run program to end
(gdb) q # quit gdb
Move the down arrow to see the cpsr register. The left most number is the flags 8=Negative, 6=Zero&Carry, 4=Zero, 2=Carry, 1=oVerflow.
Another approach to debugging assembly program on arm is to use the linux printf command. Here is myprint.s.
/* myprint.s */
.data
.balign 4
format:
.asciz " %2d %2d %2d %2d %2d %2d %2d %2d %2d\n"
.balign 4
array:
.word 6, 3, 7, 8, 5, 2, 1, 9, 4
size:
.word (size - array)
.text
.global main
print: # --- a printf function to print the value in the array ---
# Debug helper: prints array[0..8] with printf and returns with every register
# and the condition flags exactly as they were on entry.
push {r0-r12, lr} # save all registers the caller may be using (14 words)
mrs r10, cpsr # save flag settings
ldr r11, =array # r11 = address of array
ldm r11, {r1-r9} # r1-r9 = array[0..8]
# printf takes array[0..2] from r1-r3 and array[3..8] (r4-r9) from the stack.
# r10 (saved flags) and r11 are pushed as well so that an even number of words
# goes on the stack and sp stays 8-byte aligned at the call, as the AAPCS
# requires (given an 8-byte aligned sp on entry to print).
push {r4-r11}
ldr r0, =format # r0 = format string
bl printf
pop {r4-r11} # drop the stack arguments, restore r10 (flags)
msr cpsr_f, r10 # restore saved flags
pop {r0-r12, pc} # restore reg and return
# main: insertion sort of `array` by adjacent swaps, printing the array before the
# sort and after every outer iteration. c (r5) and the bound (r4) are byte offsets;
# r6 is a pointer into the array.
main:
push {r4-r12, lr} # save registers for OS
bl print # --- can be placed anywhere in code ---
ldr r0, =array # load address of array in r0
ldr r4, =size # load address of size in r4
ldr r4, [r4] # load size in bytes (size - array) in r4
sub r4, #4 # subtract 1 word from r4: r4 = byte offset of the last element
mov r5, #4 # move 4 in r5 (c = 1 word = 4 bytes, i.e. element 1)
for: # (c=1; c<n-1; c++)
add r6, r0, r5 # d=r6 = array address (r0) + c = address of array[d]
while: # inner loop: move array[d] left while it is smaller than its neighbour
ldr r10, [r6], #-4 # r10 = array[d], then r6 = address of array[d-1]
ldr r11, [r6] # r11 = array[d-1]
cmp r10, r11 # is array[d] < array[d-1]
bge forloop_code # if not, continue forloop code
mov r7, r11 # move array[d-1] into t (r7)
str r10, [r6], #4 # store array[d] into array[d-1], then r6 = address of array[d]
str r7, [r6], #-4 # store t (old array[d-1]) into array[d], then r6 = address of array[d-1]
cmp r6, r0 # is d>0 (addr(array[d-1]) > addr(array[0]))?
# NOTE(review): bgt is a signed comparison; for addresses the unsigned bhi is the usual choice - confirm.
bgt while # yes, check if array[d-1] < array[d-2]
forloop_code: # back to forloop code
bl print # --- can be placed anywhere in code ---
cmp r5, r4 # compare c (r5) with the last element's offset (r4)
add r5, #4 # add 1 word to r5 (c++); add without an s suffix, so the cmp flags are kept
blt for # loop again if the old c was below r4
end:
pop {r4-r12, lr} # restore registers for OS
mov r0, #0 # set exit code
bx lr # return to OS
as -o myprint.o myprint.s
gcc -o myprint myprint.o
./myprint; echo $?
6 3 7 8 5 2 1 9 4
3 6 7 8 5 2 1 9 4
3 6 7 8 5 2 1 9 4
3 6 7 8 5 2 1 9 4
3 5 6 7 8 2 1 9 4
2 3 5 6 7 8 1 9 4
1 2 3 5 6 7 8 9 4
1 2 3 5 6 7 8 9 4
1 2 3 4 5 6 7 8 9
0
Another thought would be to compile your C code to assembly and use gdb to see how the C code looks in assembly. This was an interesting project; I did not know about insertion sort.
I figured it out. Aside from cleaning up my code, I just needed to translate
while ( d > 0 )
as
cmp r6, #16 # if d <= 0, get out of there
ble forLoopStatements
instead of
cmp r6, #0 # if d <= 0, get out of there
ble forLoopStatements
to keep the minimum index at 0.
I made a bubble sort program in C and checked its assembly listing file, but I cannot work out where the for loops are. Could you show me where the for loops and the if statement are?
#include<stdio.h>
/* Bubble-sorts a fixed five-element array into ascending order and prints it. */
int main()
{
int arr[5]={2,4,5,6,1};
int i,j,tmp;
/* Outer loop: five full passes over the array. */
for(i=0;i<5;i++)
{
/* Inner loop: one pass over the four adjacent pairs arr[j], arr[j+1]. */
for(j=0;j<4;j++)
{
if(arr[j]>arr[j+1])
{
/* The pair is out of order: swap it. */
tmp=arr[j];
arr[j]=arr[j+1];
arr[j+1]=tmp;
}
}
}
/* Print the elements on one line, each preceded by a space. */
for(i=0;i<5;i++){
printf(" %d",arr[i]);
}
printf("\n");
return 0;
}
And here is assembly listing file.
; generated by ARM C Compiler, ADS1.2 [Build 848]
; commandline [-O1 -browseinfo "0xff
" -S -g+ -fk -J:cw:]
CODE32
AREA ||.text||, CODE, READONLY
main PROC
|L1.0|
STMFD sp!,{r4,lr} ; save r4 and the return address
SUB sp,sp,#0x18 ; reserve 24 bytes of stack; arr is addressed as sp+4
MOV r2,#0x14 ; r2 = 20 bytes to copy
LDR r1,|L1.148| ; r1 = address of the initialiser words
ADD r0,sp,#4 ; r0 = arr
BL __rt_memcpy_w ; copy the initialiser into arr
MOV r12,#0 ; i = 0
|L1.28| ; top of the outer for loop
MOV r0,#0 ; j = 0
|L1.32| ; top of the inner for loop
ADD r1,sp,#4 ; r1 = arr
ADD r2,sp,#4 ; r2 = arr
ADD r3,r2,r0,LSL #2 ; r3 = &arr[j]
LDR r1,[r1,r0,LSL #2] ; r1 = arr[j]
LDR r2,[r3,#4] ; r2 = arr[j+1]
CMP r1,r2 ; the if statement: compare arr[j] with arr[j+1]
BLE |L1.72| ; arr[j] <= arr[j+1]: skip the swap
ADD lr,sp,#4 ; lr = arr (lr is used as a scratch register here)
STR r2,[lr,r0,LSL #2] ; arr[j] = old arr[j+1]
STR r1,[r3,#4] ; arr[j+1] = old arr[j]
|L1.72|
ADD r0,r0,#1 ; j++
CMP r0,#4 ; j < 4 ?
BLT |L1.32| ; yes: next inner iteration
ADD r12,r12,#1 ; i++
CMP r12,#5 ; i < 5 ?
BLT |L1.28| ; yes: next outer iteration
MOV r4,#0 ; i = 0 for the print loop
|L1.100| ; top of the print loop
ADD r0,sp,#4 ; r0 = arr
LDR r1,[r0,r4,LSL #2] ; r1 = arr[i]
ADR r0,|L1.152| ; r0 = address of the " %d" format string
BL _printf
ADD r4,r4,#1 ; i++
CMP r4,#5 ; i < 5 ?
BLT |L1.100| ; yes: print the next element
ADR r0,|L1.160| ; r0 = address of the newline string
BL _printf
MOV r0,#0 ; return value 0
ADD sp,sp,#0x18 ; release the 24 bytes reserved above
LDMFD sp!,{r4,pc} ; restore r4 and return
|L1.148|
DCD ||.constdata$1|| ; address of the array initialiser in the constant data area
|L1.152|
DCB " %d"
DCB "\0\0\0\0"
|L1.160|
DCB "\n\0\0\0"
ENDP
AREA ||.constdata||, DATA, READONLY, ALIGN=2
||.constdata$1||
DCD 0x00000002
DCD 0x00000004
DCD 0x00000005
DCD 0x00000006
DCD 0x00000001
EXPORT main
IMPORT _main
IMPORT __main
IMPORT _printf
IMPORT __rt_memcpy_w
IMPORT ||Lib$$Request$$armlib||, WEAK
KEEP
Here you go. If you just looked for CMP and the constants, you should have been able to find them:
LDR r1,[r1,r0,LSL #2] ; r1 = arr[j]
LDR r2,[r3,#4] ; r2 = arr[j+1]
CMP r1,r2 ; if statement: is arr[j] <= arr[j+1]?
BLE |L1.72| ; yes: skip the swap
ADD lr,sp,#4 ; lr = arr
STR r2,[lr,r0,LSL #2] ; arr[j] = arr[j+1]
STR r1,[r3,#4] ; arr[j+1] = old arr[j] (tmp in the C code)
|L1.72|
ADD r0,r0,#1 ; inner for: j++
CMP r0,#4 ; inner for: j < 4
BLT |L1.32| ; jump to inner loop if yes
ADD r12,r12,#1 ; outer for: i++
CMP r12,#5 ; outer for: i < 5
BLT |L1.28| ; jump to outer loop if yes
I am interested in converting a Fibonacci sequence code in C++ into ARM assembly language. The code in C++ is as follows:
#include <iostream>
using namespace std;
// Reads a count from standard input, generates that many Fibonacci terms
// (0, 1, 1, 2, 3, ...) and prints the last one generated.
int main()
{
    int range, first = 0, second = 1;
    int fibonacci = 0; // gives a defined output when range <= 0 and the loop never runs
    cout << "Enter range for the Fibonacci Sequence" << endl;
    cin >> range;
    for (int i = 0; i < range; i++)
    {
        if (i <= 1)
        {
            // The first two terms are 0 and 1.
            fibonacci = i;
        }
        else
        {
            // Each later term is the arithmetic sum of the two previous terms.
            // The operator must be +; the keyword `and` is a logical AND.
            fibonacci = first + second;
            first = second;
            second = fibonacci;
        }
    }
    cout << fibonacci << endl;
    return 0;
}
My attempt at converting this to assembly is as follows:
ldr r0, =0x00000000 ;loads 0 in r0 ('first')
ldr r1, =0x00000001 ;loads 1 into r1 ('second')
ldr r2, =0x00000002 ;loads 2 into r2, this will be the equivalent of 'n' in C++ code,
but I will force the value of 'n' when writing this code
ldr r3, =0x00000000 ;r3 will be used as a counter in the loop
;r4 will be used as 'fibonacci'
loop:
cmp r3, #2 ;Compares the counter r3 with 2
it lt
movlt r4, r3 ;If r3 is less than 2, r4 will equal r3. This means r4 will only ever be
0 or 1.
it eq ;If r3 is equal to 2, run through these instructions
addeq r4, r0, r1 ;fibonacci = first + second
moveq r0,r1 ;first = second (NOTE(review): a bare 'it' appears to cover only the next instruction - confirm)
mov r1, r4 ;second = fibonacci (NOTE(review): unconditional, so it also runs when r3 is not 2)
adds r3, r3, #1 ;Increases the counter by one; the 's' suffix also rewrites the flags
it gt ;NOTE(review): this tests the flags left by the adds above, not the earlier cmp r3, #2
addgt r4, r0, r1 ;fibonacci = first + second
movgt r0, r1 ;first = second
mov r1, r4 ;second = fibonacci (unconditional)
adds r3, r3, #1 ;Increases the counter again; no branch back to loop follows in the code shown
I'm not entirely sure if that is how you do if statements in Assembly, but that will be a secondary concern for me at this point. What I am more interested in, is how I can incorporate an if statement in order to test for the initial condition where the 'counter' is compared to the 'range'. If counter < range, then it should go into the main body of the code where the fibonacci statement will be iterated. It will then continue to loop until counter = range.
I am not sure how to do the following:
cmp r3, r2
;If r3 < r2
{
<code>
}
;else, stop
Also, in order for this to loop correctly, am I able to add:
cmp r3, r2
bne loop
So that the loop iterates until r3 = r2?
Thanks in advance :)
It's not wise to put if-statements inside a loop. Get rid of it.
An optimized(kinda) standalone Fibonacci function should be like this:
/* Returns the n-th Fibonacci number, with fib(0) == 0 and fib(1) == 1.
   Returns 0xffffffff when n > 47, because the result would not fit in
   32 bits (fib(47) is the largest value that does). */
unsigned int fib(unsigned int n)
{
    unsigned int first = 0;   /* F(k-1) */
    unsigned int second = 1;  /* F(k), starting at k = 1 */
    unsigned int temp;

    if (n > 47) return 0xffffffff; // overflow check
    if (n < 2) return n;

    /* The decrement inside the loop runs n times in total. Before it reaches
       zero, second has been advanced n - 1 times, from F(1) to F(n). */
    while (1)
    {
        n -= 1;
        if (n == 0) return second;
        temp = first + second;
        first = second;
        second = temp;
    }
}
Much like factorial, optimizing Fibonacci sequence is somewhat nonsense in real world computing, because they exceed the 32-bit barrier really soon: It's 12 with factorial and 47 with Fibonacci.
If you really need them, you are served the best with very short lookup tables.
If you need this function fully implemented for larger values:
https://www.nayuki.io/page/fast-fibonacci-algorithms
Last but not least, here is the function above in assembly:
// Fibonacci: in r0 = n, out r0 = F(n) (or 0xffffffff when n > 47). Clobbers r1, r2, r12 and the flags.
cmp r0, #47 // r0 is n
movhi r0, #-1 // overflow check
bxhi lr
cmp r0, #2
bxlo lr // n < 2: F(n) = n is already in r0
mov r2, r0 // r2 is the counter now: the subs below must run n times
mov r1, #0 // r1 is first
mov r0, #1 // r0 is second
loop:
subs r2, r2, #1 // n -= 1
add r12, r0, r1 // temp = first + second (does not touch the flags)
mov r1, r0 // first = second (does not touch the flags)
bxeq lr // return second when the counter has reached zero
mov r0, r12 // second = temp
b loop
Please note that the last bxeq lr can be placed immediately after subs which might seem more logical, but with the multiple issuing capability of the Cortex series in mind, it's better in this order.
It might not be exactly the answer you were looking for, but keep this in mind: a single if statement inside a loop can seriously hurt performance - a nested one even more so.
And there are almost always ways avoiding these. You just have to look for them.
Conditionals compile to conditional jumps in almost all assembly language:
if (condition)
..iftrue..
else
..iffalse..
becomes
eval condition
conditional_jump_if_true truelabel
..iffalse..
unconditional_jump endlabel
truelabel:
..iftrue..
endlabel:
or the other way around (exchange false and true).
ARM supports conditional execution to eliminate these jumps when compiling the innermost conditionals: http://www.davespace.co.uk/arm/introduction-to-arm/conditional.html
IT... is a Thumb-2 instruction: http://en.wikipedia.org/wiki/ARM_architecture#Thumb-2 to support unified assemblies. See http://www.keil.com/support/man/docs/armasm/armasm_BABJGFDD.htm for more details.
Your code for looping (cmp and bne) is fine.
In general, first try rewriting your code using gotos in place of loops and else branches.
else can remain only at the deepest nesting level.
Then you can convert this semi-assembly code to assembly much more easily.
HTH