ARM ASSEMBLY loop through argv arguments - loops

I am trying to pass arguments to my program at the command line using argv, i figured out how to point to the first argv address, but i cannot seem to loop to the next one.
here is my code, but I think that what is relevant is in the first subroutine:
.text
.global _start
.equ exit, 1
.equ write, 4
.equ stdout, 1
_start:
ldr r5, [sp] #argc value
ldr r6, =1
mov r8, #8 #argv address
0: ldr r4, [sp, r8]
add r8, r8, #4
mov r1, r4
adr r10, isbn10
adr r11, valid
adr r12, invalid
adr r13, isbn13
bl strlen
cmp r0, #13
beq 1f
cmp r0, #10
beq 2f
1: bl check_13
cmp r2,#0
bleq print13v
blne print13i
add r6, r6, #1
cmp r5,r6
bne 0b
mov r0, #0 # success exit code
mov r7, #exit
svc 0
2: bl check_10
cmp r2,#0
bleq print10v
blne print10i
add r6, r6, #1
cmp r5,r6
bne 0b
mov r0, #0 # success exit code
mov r7, #exit
svc 0 # return to os
val:.asciz "9780306406157"
isbn10:.asciz "\nisbn-10 : "
isbn13:.asciz "\nisbn-13 : "
valid:.asciz ": valid"
invalid:.asciz ": invalid"
.align 2
strlen:
mov r0, #0
# length to return
0:
ldrb r2, [r1], #1 # get current char and advance
cmp r2, #0 # are we at the end of the string?
addne r0, #1
bne 0b
mov pc, lr
#######################
check_13: #sum at r2
mov r1, r4
mov r3,#1 #toggle
mov r2,#0 #sum
0:
ldrb r0,[r1], #1
cmp r0, #0
beq 9f
cmp r0, #'0
blo 1f
cmp r0, #'9
bhi 1f
sub r0,r0,#'0
add r2,r2,r0
cmp r2, #10
subge r2,r2,#10
eors r3,r3,#1 #toggled?
addne r2,r2,r0,lsl#1
cmp r2, #10
subge r2,r2,#10
cmp r2, #10
subge r2,r2,#10
bal 0b
1: mov r2, #22 #returns r2=22 if invalid
mov pc,lr
9: mov pc,lr
##################
check_10: #sum at r2
mov r1, r4
mov r3,#0 #t
mov r2,#0 #sum
0:
ldrb r0,[r1], #1
cmp r0, #0 #end?
beq 9f
cmp r0, #'0
blo 1f
cmp r0, #'9
bhi 2f
sub r0,r0,#'0
bal 3f
3: add r3,r3,r0
cmp r3, #11
subge r3, r3, #11
add r2,r2,r3
cmp r2, #11
subge r2, r2, #11
bal 0b
2: and r0,r0, #0xdf # x becomes x
cmp r0, #0x58 # x?
bne 1f
mov r0,#10
bal 3b
1: mov r2, #22 #returns r2=22 if invalid
mov pc,lr
9: mov pc,lr
######################
invalid:
mov r2, #22 #returns r2=22 if invalid
mov pc,lr
#######################
print10v:
mov r9,lr
mov r1,r10
bl strlen
mov r2,r0
mov r1,r10
mov r0,#stdout
mov r7, #write #herehere
svc 0
mov r1,r4
bl strlen
mov r1,r4
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov r1,r11
bl strlen
mov r1,r11
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov pc,r9
#######################
print10i:
mov r9,lr
mov r1,r10
bl strlen
mov r2,r0
mov r1,r10
mov r0,#stdout
mov r7, #write #herehere
svc 0
mov r1,r4
bl strlen
mov r1,r4
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov r1,r12
bl strlen
mov r1,r12
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov pc,r9
#######################
print13v:
mov r9,lr
mov r1,r13
bl strlen
mov r2,r0
mov r1,r13
mov r0,#stdout
mov r7, #write #herehere
svc 0
mov r1,r4
bl strlen
mov r1,r4
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov r1,r11
bl strlen
mov r1,r11
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov pc,r9
#######################
print13i:
mov r9,lr
mov r1,r13
bl strlen
mov r2,r0
mov r1,r13
mov r0,#stdout
mov r7, #write #herehere
svc 0
mov r1,r4
bl strlen
mov r1,r4
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov r1,r12
bl strlen
mov r1,r12
mov r2,r0
mov r0,#stdout
mov r7, #write
svc 0
mov pc,r9
after i assemble and link it
i run it using ./validate 9780306406157 1234567890
ISBN-13 : 9780306406157: VALID
ISBN-13 : 306406157: INVALID[Inferior 1 (process 22221) exited normally]
meaning that r4 at the second time through the loop got 306406157, i wanted it to get 1234567890...
after doing the suggested editing i ran the program and it gave me a segmentation on line 60, when i try to read a byte (a char) from the new argument, i ran gdb and i noticed that the value of r4 (supposed to be argv[2] on the second time through the loop) is very far from the value in the first time through the loop
14 mov r1, r4
(gdb) i r
r0 0x0 0
r1 0x0 0
r2 0x0 0
r3 0x0 0
r4 0xbefff8d6 3204446422
r5 0x3 3
r6 0x1 1
r7 0x0 0
r8 0xc 12
r9 0x0 0
r10 0x0 0
r11 0x0 0
r12 0x0 0
sp 0xbefff790 0xbefff790
lr 0x0 0
pc 0x8068 0x8068 <_start+20>
cpsr 0x10 16
(gdb) c
Continuing.
isbn-13 : 9780306406157: valid
Breakpoint 1, _start () at validate.s:12
12 0: ldr r4, [sp, r8]
(gdb) stepi
13 add r8, r8, #4
(gdb)
14 mov r1, r4
(gdb) i r
r0 0x7 7
r1 0x8106 33030
r2 0x7 7
r3 0x0 0
r4 0x6176203a 1635131450
r5 0x3 3
r6 0x2 2
r7 0x4 4
r8 0x10 16
r9 0x809c 32924
r10 0x80ee 33006
r11 0x8106 33030
r12 0x81f4 33268
sp 0x80fa 0x80fa
lr 0x82f8 33528
pc 0x8068 0x8068 <_start+20>
cpsr 0x20000010 536870928
any help?

What you get when you do ldr r4, [sp, #8] is argv[1] (argv[0] which is at [sp, #4] is the name of the executing program).
So addne r4, r4, #4 will just move 4 bytes ahead within argv[1]. What you should do to load argv[2], argv[3], etc., is to read from [sp, #0xC], [sp, #0x10], etc.
Something like this:
mov r8, #8 # Offset of argv[1]
0: ldr r4, [sp, r8] r4 = argv[n]
add r8, r8, #4 n++
mov r1, r4

Related

Convert C to ARM Assembly program

I have this C program:
void Move1Disk(int fm, int to);
void Hanoi(int num, int fm, int to, int aux)
{
if (num > 1) Hanoi(num - 1, fm, aux, to) ;
Move1Disk(fm, to) ;
if (num > 1) Hanoi(num - 1, aux, to, fm) ;
}
I have written this but can not compile, can anyone please tell me the issue?
Hanoi(int, int, int, int):
cmp r0, #1
push {r4, r5, r6, r7, r8, lr}
mov r5, r1
mov r7, r2
movgt r4, r0
movgt r6, r3
ble .L9
.L3:
sub r4, r4, #1
mov r0, r4
mov r3, r7
mov r2, r6
mov r1, r5
bl Hanoi(int, int, int, int)
mov r1, r7
mov r0, r5
bl Move1Disk(int, int)
cmp r4, #1
beq .L2
mov r3, r5
mov r5, r6
mov r6, r3
b .L3
.L9:
mov r6, r1
.L2:
mov r1, r7
mov r0, r6
pop {r4, r5, r6, r7, r8, lr}
b Move1Disk(int, int)
here I have included the main main method
and a picture of the error message:
On the very first line:
Hanoi(int, int, int, int):
C functions don't have their argument types as part of their names. If you really are trying to duplicate a C program, this should just be Hanoi:. The same for all other instances of that and of Move1Disk.

ARM assembly output isn't functioning correctly

This is supposed to output the contents of each line in arm assembly. Though line 18 add r4, r5, r4, lsl #1 isn't being outputted correctly and I am not sure why.
.data
str1: .asciz "%d and %d are the results \n"
n: .word word 1
.text
.global main
main: stmfd sp!, {lr}
ldr r4,=n
ldr r4, [r4]
add r4,r4, #1
mov r1, r4
ldr r0, =str1
bl printf
mov r5, r4
mov r1, r4
mov r2, r5
ldr r0, =str1
bl printf
add r4, r5, r4, lsl #1
mov r1, r4
ldr r0, = str1
bl printf
ldmfd sp!, {lr}
mov r0, #0
mov PC, or
.end

Self written simple memset not working with -03 eabi gcc on ARMv7

I wrote a very simple memset in c that works fine up to -O2 but not with -O3...
memset:
void * memset(void * blk, int c, size_t n)
{
unsigned char * dst = blk;
while (n-- > 0)
*dst++ = (unsigned char)c;
return blk;
}
...which compiles to this assembly when using -O2:
20000430 <memset>:
20000430: e3520000 cmp r2, #0 # compare param 'n' with zero
20000434: 012fff1e bxeq lr # if equal return to caller
20000438: e6ef1071 uxtb r1, r1 # else zero extend (extract byte from) param 'c'
2000043c: e0802002 add r2, r0, r2 # add pointer 'blk' to 'n'
20000440: e1a03000 mov r3, r0 # move pointer 'blk' to r3
20000444: e4c31001 strb r1, [r3], #1 # store value of 'c' to address of r3, increment r3 for next pass
20000448: e1530002 cmp r3, r2 # compare current store address to calculated max address
2000044c: 1afffffc bne 20000444 <memset+0x14> # if not equal store next byte
20000450: e12fff1e bx lr # else back to caller
This makes sense to me. I annotated what happens here.
When I compile it with -O3 the program crashes. My memset calls itself repeatedly until it ate the whole stack:
200005e4 <memset>:
200005e4: e3520000 cmp r2, #0 # compare param 'n' with zero
200005e8: e92d4010 push {r4, lr} # ? (1)
200005ec: e1a04000 mov r4, r0 # move pointer 'blk' to r4 (temp to hold return value)
200005f0: 0a000001 beq 200005fc <memset+0x18> # if equal (first line compare) jump to epilogue
200005f4: e6ef1071 uxtb r1, r1 # zero extend (extract byte from) param 'c'
200005f8: ebfffff9 bl 200005e4 <memset> # call myself ? (2)
200005fc: e1a00004 mov r0, r4 # epilogue start. move return value to r0
20000600: e8bd8010 pop {r4, pc} # restore r4 and back to caller
I can't figure out how this optimised version is supposed to work without any strb or similar. It doesn't matter if I try to set the memory to '0' or something else so the function is not only called on .bss (zero initialised) variables.
(1) This is a problem. This push gets endlessly repeated without a matching pop as it's called by (2) when the function doesn't early-exit because of 'n' being zero. I verified this with uart prints. Also r2 is never touched so why should the compare to zero ever become true?
Please help me understand what's happening here. Is the compiler assuming prerequisites that I may not fulfill?
Background: I'm using external code that requires memset in my baremetal project so I rolled my own. It's only used once on startup and not performance critical.
/edit: The compiler is called with these options:
arm-none-eabi-gcc -O3 -Wall -Wextra -fPIC -nostdlib -nostartfiles -marm -fstrict-volatile-bitfields -march=armv7-a -mcpu=cortex-a9 -mfloat-abi=hard -mfpu=neon-vfpv3
Your first question (1). That is per the calling convention if you are going to make a nested function call you need to preserve the link register, and you need to be 64 bit aligned. The code uses r4 so that is the extra register saved. No magic there.
Your second question (2) it is not calling your memset it is optimizing your code because it sees it as an inefficient memset. Fuz has provided the answers to your question.
Rename the function
00000000 <xmemset>:
0: e3520000 cmp r2, #0
4: e92d4010 push {r4, lr}
8: e1a04000 mov r4, r0
c: 0a000001 beq 18 <xmemset+0x18>
10: e6ef1071 uxtb r1, r1
14: ebfffffe bl 0 <memset>
18: e1a00004 mov r0, r4
1c: e8bd8010 pop {r4, pc}
and you can see this.
If you were to use -ffreestanding as Fuz recommended then you see this or something like it
00000000 <xmemset>:
0: e3520000 cmp r2, #0
4: 012fff1e bxeq lr
8: e92d41f0 push {r4, r5, r6, r7, r8, lr}
c: e2426001 sub r6, r2, #1
10: e3560002 cmp r6, #2
14: e6efe071 uxtb lr, r1
18: 9a00002a bls c8 <xmemset+0xc8>
1c: e3a0c000 mov r12, #0
20: e3520023 cmp r2, #35 ; 0x23
24: e7c7c01e bfi r12, lr, #0, #8
28: e1a04122 lsr r4, r2, #2
2c: e7cfc41e bfi r12, lr, #8, #8
30: e7d7c81e bfi r12, lr, #16, #8
34: e7dfcc1e bfi r12, lr, #24, #8
38: 9a000024 bls d0 <xmemset+0xd0>
3c: e2445009 sub r5, r4, #9
40: e1a03000 mov r3, r0
44: e3c55007 bic r5, r5, #7
48: e3a07000 mov r7, #0
4c: e2851008 add r1, r5, #8
50: e1570005 cmp r7, r5
54: f5d3f0a0 pld [r3, #160] ; 0xa0
58: e1a08007 mov r8, r7
5c: e583c000 str r12, [r3]
60: e583c004 str r12, [r3, #4]
64: e2877008 add r7, r7, #8
68: e583c008 str r12, [r3, #8]
6c: e2833020 add r3, r3, #32
70: e503c014 str r12, [r3, #-20] ; 0xffffffec
74: e503c010 str r12, [r3, #-16]
78: e503c00c str r12, [r3, #-12]
7c: e503c008 str r12, [r3, #-8]
80: e503c004 str r12, [r3, #-4]
84: 1afffff1 bne 50 <xmemset+0x50>
88: e2811001 add r1, r1, #1
8c: e483c004 str r12, [r3], #4
90: e1540001 cmp r4, r1
94: 8afffffb bhi 88 <xmemset+0x88>
98: e3c23003 bic r3, r2, #3
9c: e1520003 cmp r2, r3
a0: e0466003 sub r6, r6, r3
a4: e0803003 add r3, r0, r3
a8: 08bd81f0 popeq {r4, r5, r6, r7, r8, pc}
ac: e3560000 cmp r6, #0
b0: e5c3e000 strb lr, [r3]
b4: 08bd81f0 popeq {r4, r5, r6, r7, r8, pc}
b8: e3560001 cmp r6, #1
bc: e5c3e001 strb lr, [r3, #1]
c0: 15c3e002 strbne lr, [r3, #2]
c4: e8bd81f0 pop {r4, r5, r6, r7, r8, pc}
c8: e1a03000 mov r3, r0
cc: eafffff6 b ac <xmemset+0xac>
d0: e1a03000 mov r3, r0
d4: e3a01000 mov r1, #0
d8: eaffffea b 88 <xmemset+0x88>
which appears like it simply inlined memset, the one it knows not your code (the faster one).
So if you want it to use your code then stick with -O2. Yours is pretty inefficient so not sure why you need to push it any further than it was.
20000444: e4c31001 strb r1, [r3], #1 # store value of 'c' to address of r3, increment r3 for next pass
20000448: e1530002 cmp r3, r2 # compare current store address to calculated max address
2000044c: 1afffffc bne 20000444 <memset+0x14> # if not equal store next byte
It isn't going to get any better than that without replacing your code with something else.
Fuz already answered the question:
Compile with -fno-builtin-memset. The compiler recognises that the function implements memset and thus replaces it with a call to memset. You should in general compile with -ffreestanding when writing bare-metal code. I believe this fixes this sort of problem, too
It is replacing your code with memset, if you want it not to do that use -ffreestanding.
If you wish to go beyond that and wonder why -fno-builtin-memset didn't work that is a question for the gcc folks, file a ticket, let us know what they say (or just look at the compiler source code).

objdump produces wrong branch opcode interpretation

See the following objdump line of a specific object file of a specific function (func):
3c: e03a b.n 78 <func+0x78>
Now, the opcode e03a in the target system (ARMv6-M) says jump to the location of PC + 0x78. A correct interpretation will be:
3c: e03a b.n B4 <func+0xB4>
Every other function and file contains proper b.n interpretations with proper values calculations in their objdump dump. For some reason, only this function causes objdump to be "confused".
Note: funcstarts at 0x0.
I could not think of any reason for this situation. And since I have tools that parse and uses the objdump dump, this causes great problem for me.
Is there any reasonable reason for that?
toolchain: gcc-arm-none-eabi-4_9-2015q3
platform running this toolchain: Ubuntu 16.04.2 LTS
EDIT: I'm attaching partial dump:
Disassembly of section i.func:
00000000 <func>:
0: b531 push {r0, r4, r5, lr}
2: b088 sub sp, #32
4: 2100 movs r1, #0
6: 9106 str r1, [sp, #24]
8: 482c ldr r0, [pc, #176] ; (bc <func+0xbc>)
a: 6800 ldr r0, [r0, #0]
c: 6840 ldr r0, [r0, #4]
e: 9103 str r1, [sp, #12]
10: 1c40 adds r0, r0, #1
12: 9002 str r0, [sp, #8]
14: 492a ldr r1, [pc, #168] ; (c0 <func+0xc0>)
16: 2000 movs r0, #0
18: 9104 str r1, [sp, #16]
1a: 9005 str r0, [sp, #20]
1c: a802 add r0, sp, #8
1e: f7ff fffe bl 0 <random_func>
22: f7ff fffe bl 0 <random_func2>
26: 4604 mov r4, r0
28: 4d26 ldr r5, [pc, #152] ; (c4 <func+0xc4>)
2a: 42ac cmp r4, r5
2c: d007 beq.n 3e <func+0x3e>
2e: a326 add r3, pc, #152 ; (adr r3, c8 <func+0xc8>)
30: 22ee movs r2, #238 ; 0xee
32: 492c ldr r1, [pc, #176] ; (e4 <func+0xe4>)
34: 2000 movs r0, #0
36: 9400 str r4, [sp, #0]
38: f7ff fffe bl 0 <log_func>
3c: e03a b.n 78 <func+0x78> <---- PROBLEM IS HERE
3e: f7ff fffe bl 0 <func>
42: 9006 str r0, [sp, #24]
44: f3bf 8f5f dmb sy
48: a808 add r0, sp, #32
4a: 7800 ldrb r0, [r0, #0]
4c: 2800 cmp r0, #0
4e: d00f beq.n 70 <func+0x70>
50: 9806 ldr r0, [sp, #24]
52: 2803 cmp r0, #3
54: d016 beq.n 84 <func+0x84>
56: f7ff fffe bl 0 <some_hw_func>
5a: 4604 mov r4, r0
5c: 42ac cmp r4, r5
5e: d01a beq.n 96 <func+0x96>
60: a321 add r3, pc, #132 ; (adr r3, e8 <func+0xe8>)
62: 22fa movs r2, #250 ; 0xfa
64: 491f ldr r1, [pc, #124] ; (e4 <func+0xe4>)
66: 2000 movs r0, #0
68: 9400 str r4, [sp, #0]
6a: f7ff fffe bl 0 <log_func>
6e: e021 b.n 46 <random_delay+0x46> <--- ALSO HERE SAME PROBLEM
70: f7ff fffe bl 0 <random_delay>
74: 2800 cmp r0, #0
76: d003 beq.n 80 <func+0x80>
78: a808 add r0, sp, #32
7a: 7800 ldrb r0, [r0, #0]
7c: 2800 cmp r0, #0
7e: d018 beq.n b2 <func+0xb2>
80: f7ff fffe bl 0 <some_hw_func2>
84: f7ff fffe bl 0 <random_delay>
88: 2800 cmp r0, #0
8a: d002 beq.n 92 <func+0x92>
8c: 9806 ldr r0, [sp, #24]
8e: 2803 cmp r0, #3
90: d00f beq.n b2 <func+0xb2>
92: f7ff fffe bl 0 <some_hw_func2>
96: f7ff fffe bl 0 <func>
9a: 4604 mov r4, r0
9c: 42ac cmp r4, r5
9e: d008 beq.n b2 <func+0xb2>
a0: 22ff movs r2, #255 ; 0xff
a2: a318 add r3, pc, #96 ; (adr r3, 104 <func+0x104>)
a4: 3201 adds r2, #1
a6: 490f ldr r1, [pc, #60] ; (e4 <func+0xe4>)
a8: 2000 movs r0, #0
aa: 9400 str r4, [sp, #0]
ac: f7ff fffe bl 0 <log_func>
b0: e000 b.n b4 <func+0xb4>
b2: 462c mov r4, r5
b4: 4620 mov r0, r4
Looks like a bug; each time the jump is between jumps, that are subject to relocation like here
38: f7ff fffe bl 0 <log_func>
3c: e03a b.n 78 <func+0x78> <---- PROBLEM IS HERE
3e: f7ff fffe bl 0 <func>
or here
6a: f7ff fffe bl 0 <log_func>
6e: e021 b.n 46 <random_delay+0x46>
70: f7ff fffe bl 0 <random_delay>
the calculation is wrong.
There is no legitimate reason for this; a report to the bugtracking system http://www.sourceware.org/bugzilla/ is probably appropriate (after verifying, that the latest versions also suffer from this bug)
EDIT: I had some time to look deeper into this bug.
The problem is, that if the instruction before the b.n is any 32-bit instruction and the instruction after the b.n is subject to relocation, objdump falsely assumes that the b.n instruction has a relocation associated with it and sets the relative pc to 0 for the offset calculation.
This code part from binutils/objdump.c is the culprit:
bfd_signed_vma distance_to_rel;
distance_to_rel = (**relppp)->address
- (rel_offset + addr_offset);
/* Check to see if the current reloc is associated with
the instruction that we are about to disassemble. */
if (distance_to_rel == 0
/* FIXME: This is wrong. We are trying to catch
relocs that are addressed part way through the
current instruction, as might happen with a packed
VLIW instruction. Unfortunately we do not know the
length of the current instruction since we have not
disassembled it yet. Instead we take a guess based
upon the length of the previous instruction. The
proper solution is to have a new target-specific
disassembler function which just returns the length
of an instruction at a given address without trying
to display its disassembly. */
|| (distance_to_rel > 0
&& distance_to_rel < (bfd_signed_vma) (previous_octets/ opb)))
{
inf->flags |= INSN_HAS_RELOC;
aux->reloc = **relppp;
}
The comment says it all: this parser guesses from the previous 32-bit instruction, that the next instruction is also 32-bit (which it isn't!). The relocation is targeted for 3e and the disassembler assumes, that the next instruction is from 3c to 3f, so the b.n is flagged with INSN_HAS_RELOC, which in turn leads to the incorrect offset calculation. Looks, like this will not be easy to fix up.
However, you could try and patch your objdump like this:
if (distance_to_rel == 0) {
inf->flags |= INSN_HAS_RELOC;
aux->reloc = **relppp;
}
This might produce inaccuracies the other way round, but that should be rare cases and maybe that is better acceptable for you.

comprehending how "volatile" keyword and comparison work

If a variable is not specified with the keyword volatile, the compiler likely does caching. The variable must be accessed from memory always otherwise until its transaction unit ends. The point I wonder lies in assembly part.
int main() {
/* volatile */ int lock = 999;
while (lock);
}
On x86-64-clang-3.0.0 compiler, its assembly code is following.
main: # #main
mov DWORD PTR [RSP - 4], 0
mov DWORD PTR [RSP - 8], 999
.LBB0_1: # =>This Inner Loop Header: Depth=1
cmp DWORD PTR [RSP - 8], 0
je .LBB0_3
jmp .LBB0_1
.LBB0_3:
mov EAX, DWORD PTR [RSP - 4]
ret
When volatile keyword is commented in, it turns out the following.
main: # #main
mov DWORD PTR [RSP - 4], 0
mov DWORD PTR [RSP - 8], 999
.LBB0_1: # =>This Inner Loop Header: Depth=1
mov EAX, DWORD PTR [RSP - 8]
cmp EAX, 0
je .LBB0_3
jmp .LBB0_1
.LBB0_3:
mov EAX, DWORD PTR [RSP - 4]
ret
The points I wonder and don't understand,
cmp DWORD PTR [RSP - 8], 0 . <---
Why is the comparison done with 0 whilst DWORD PTR [RSP - 8] holds 999 within ?
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
It looks like you forgot to enable optimization. -O0 treats all variables (except register variables) pretty similarly to volatile for consistent debugging.
With optimization enabled, compilers can hoist non-volatile loads out of loops. while(locked); will compile similarly to source like
if (locked) {
while(1){}
}
Or since locked has a compile-time-constant initializer, the whole function should compile to jmp main (an infinite loop).
See MCU programming - C++ O2 optimization breaks while loop for more details.
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
Some compilers are worse at folding loads into memory operands for other instructions when you use volatile. I think that's why you're getting a separate mov load here; it's just a missed optimization.
(Although cmp [mem], imm might be less efficient. I forget if it can macro-fuse with a JCC or something. With a RIP-relative addressing mode it couldn't micro-fuse the load, but a register base is ok.)
cmp EAX, 0 is weird, I guess clang with optimization disabled doesn't look for test eax,eax as a peephole optimization for comparing against zero.
As #user3386109 commented, locked in a boolean context is equivalent to locked != 0 in C / C++.
The compiler doesn't know about caching, it is not a caching thing, it tells the compiler that the value may change between accesses. So to functionally implement our code it needs to perform the accesses we ask for in the order we ask them. Can't optimize out.
void fun1 ( void )
{
/* volatile */ int lock = 999;
while (lock) continue;
}
void fun2 ( void )
{
volatile int lock = 999;
while (lock) continue;
}
volatile int vlock;
int ulock;
void fun3 ( void )
{
while(vlock) continue;
}
void fun4 ( void )
{
while(ulock) continue;
}
void fun5 ( void )
{
vlock=3;
vlock=4;
}
void fun6 ( void )
{
ulock=3;
ulock=4;
}
I find it easier to see in arm... doesn't really matter.
Disassembly of section .text:
00001000 <fun1>:
1000: eafffffe b 1000 <fun1>
00001004 <fun2>:
1004: e59f3018 ldr r3, [pc, #24] ; 1024 <fun2+0x20>
1008: e24dd008 sub sp, sp, #8
100c: e58d3004 str r3, [sp, #4]
1010: e59d3004 ldr r3, [sp, #4]
1014: e3530000 cmp r3, #0
1018: 1afffffc bne 1010 <fun2+0xc>
101c: e28dd008 add sp, sp, #8
1020: e12fff1e bx lr
1024: 000003e7 andeq r0, r0, r7, ror #7
00001028 <fun3>:
1028: e59f200c ldr r2, [pc, #12] ; 103c <fun3+0x14>
102c: e5923000 ldr r3, [r2]
1030: e3530000 cmp r3, #0
1034: 012fff1e bxeq lr
1038: eafffffb b 102c <fun3+0x4>
103c: 00002000
00001040 <fun4>:
1040: e59f3014 ldr r3, [pc, #20] ; 105c <fun4+0x1c>
1044: e5933000 ldr r3, [r3]
1048: e3530000 cmp r3, #0
104c: 012fff1e bxeq lr
1050: e3530000 cmp r3, #0
1054: 012fff1e bxeq lr
1058: eafffffa b 1048 <fun4+0x8>
105c: 00002004
00001060 <fun5>:
1060: e3a01003 mov r1, #3
1064: e3a02004 mov r2, #4
1068: e59f3008 ldr r3, [pc, #8] ; 1078 <fun5+0x18>
106c: e5831000 str r1, [r3]
1070: e5832000 str r2, [r3]
1074: e12fff1e bx lr
1078: 00002000
0000107c <fun6>:
107c: e3a02004 mov r2, #4
1080: e59f3004 ldr r3, [pc, #4] ; 108c <fun6+0x10>
1084: e5832000 str r2, [r3]
1088: e12fff1e bx lr
108c: 00002004
Disassembly of section .bss:
00002000 <vlock>:
2000: 00000000
00002004 <ulock>:
2004: 00000000
First one is the most telling:
00001000 <fun1>:
1000: eafffffe b 1000 <fun1>
Being a local variable that is initialized, and non volatile then the compiler can assume it won't change value between accesses so it can never change in the while loop, so this is essentially a while 1 loop. If the initial value had been zero this would be a simple return as it can never be non-zero, being non-volatile.
fun2 being a local variable a stack frame needs to be built then.
It does what one assumes the code was trying to do, wait for this shared variable, one that can change during the loop
1010: e59d3004 ldr r3, [sp, #4]
1014: e3530000 cmp r3, #0
1018: 1afffffc bne 1010 <fun2+0xc>
so it samples it and tests what it samples each time through the loop.
fun3 and fun4 same deal but more realistic, as external to the function code isnt going to change lock, being non-global doesn't make much sense for your while loop.
102c: e5923000 ldr r3, [r2]
1030: e3530000 cmp r3, #0
1034: 012fff1e bxeq lr
1038: eafffffb b 102c <fun3+0x4>
For the volatile fun3 case the variable has to be read and tested each loop
1044: e5933000 ldr r3, [r3]
1048: e3530000 cmp r3, #0
104c: 012fff1e bxeq lr
1050: e3530000 cmp r3, #0
1054: 012fff1e bxeq lr
1058: eafffffa b 1048 <fun4+0x8>
For the non-volatile being global it has to sample it once, very interesting what the compiler did here, have to think about why it would do that, but either way you can see that the "loop" retests the value read stored in a register (not cached) which will never change with a proper program. Functionally we asked it to only read the variable once by using non-volatile then it tests that value indefinitely.
fun5 and fun6 further demonstrate that volatile requires the compiler perform the accesses to the variable in its storage place before moving on to the next operation/access in the code. So when volatile we are asking the compiler to perform two assignments, two stores. When non-volatile the compiler can optimize out the first store and only do the last one as if you look at the code as a whole this function (fun6) leaves the variable set to 4, so the function leaves the variable set to 4.
The x86 solution is equally interesting repz retq is all over it (with the compiler on my computer), not hard to find out what that is all about.
Neither aarch64, x86, mips, riscv, msp430, pdp11 backends do the double check on fun3().
pdp11 is actually the easier code to read (no surprise there)
00000000 <_fun1>:
0: 01ff br 0 <_fun1>
00000002 <_fun2>:
2: 65c6 fffe add $-2, sp
6: 15ce 03e7 mov $1747, (sp)
a: 1380 mov (sp), r0
c: 02fe bne a <_fun2+0x8>
e: 65c6 0002 add $2, sp
12: 0087 rts pc
00000014 <_fun3>:
14: 1dc0 0026 mov $3e <_vlock>, r0
18: 02fd bne 14 <_fun3>
1a: 0087 rts pc
0000001c <_fun4>:
1c: 1dc0 001c mov $3c <_ulock>, r0
20: 0bc0 tst r0
22: 02fe bne 20 <_fun4+0x4>
24: 0087 rts pc
00000026 <_fun5>:
26: 15f7 0003 0012 mov $3, $3e <_vlock>
2c: 15f7 0004 000c mov $4, $3e <_vlock>
32: 0087 rts pc
00000034 <_fun6>:
34: 15f7 0004 0002 mov $4, $3c <_ulock>
3a: 0087 rts pc
(this is the not linked version)
cmp DWORD PTR [RSP - 8], 0 . <--- Why is the comparison done with 0 whilst DWORD PTR [RSP - 8] holds 999 within ?
while does a true false comparison meaning is it equal to zero or not equal to zero
Why is DWORD PTR [RSP - 8] copied into EAX and again why is the comparison done between 0 and EAX?
mov -0x8(%rsp),%eax
cmp 0,%eax
cmp 0,-0x8(%rsp)
as so.s -o so.o
so.s: Assembler messages:
so.s:3: Error: too many memory references for `cmp'
compare wants a register. So it reads into a register so it can do the compare as it can't do the compare between the immediate and the memory access in one instruction. If they could have done it in one instruction they would have.

Resources