Access to unaligned memory location? - arm

I'm doing an assignment where I create another array by reading a given array. Below is the relevant code:
@ Intended (per the description above) to build array_sink from array_source,
@ adding add_amount to each element. The code is kept exactly as posted; the
@ NOTE(review) comments mark the lines that lead to the reported fault.
.data
array_source:
.word 42, 67, 81, 90, 124, -5
array_sink:
.word 0, 0, 0, 0, 0, 0
array_length:
.word 6
add_amount:
.word 5
.text
.global _start
_start:
ldr r8,=array_source        @ r8 = address of array_source
ldr r0,=array_length        @ r0 = address of array_length
ldr r3, [r3]                @ NOTE(review): loads through r3, which has not been set; the length address is in r0
ldr r4, =array_sink         @ r4 = address of array_sink (write pointer)
ldr r5, =add_amount
ldr r5, [r5]                @ r5 = value stored at add_amount (5)
mov r6, #1                  @ r6 = loop counter; NOTE(review): starting at 1 and exiting on r6 == r3 would skip the last element -- confirm intent
begin_loop:
cmp r6, r3
beq end_of_loop             @ end_of_loop is not part of the posted excerpt
ldr r8, [r8]                @ NOTE(review): overwrites the source pointer in r8 with the element it points to (42 on the first pass)
add r8, r8, r5              @ r8 = 42 + 5 = 47 = 0x2f on the first pass
str r8, [r4]                @ store the result at the current sink position
add r6, r6, #1
add r0, r0, #4              @ NOTE(review): advances r0 (address of array_length); nothing advances a source pointer
add r4, r4, #4              @ advance the sink pointer by one word
b begin_loop                @ on the second pass `ldr r8, [r8]` reads from address 0x2f, which is the reported unaligned address
ARMSim# gives me the error "access to unaligned memory location, bad address = 0000002f" on the second loop through add r8, r8, r5 and I don't know why. Any advice would be appreciated.

Related

GCC is not finding the built-in 'memcpy'

I am doing work on an ARM Cortex-M microcontroller using the arm-none-eabi version of GCC. I am also using -nostdlib and -nostdinc.
In my code I am using memcpy and strlen. Both of these functions are built-in functions as per the GCC manual. When I use these functions as is or as __builtin_..., I get undefined reference to ....
Why is GCC not generating the code as expected?
Builtins are not real functions. The compiler is free to replace them with a "normal" function call (on ARM, a b or bl to memcpy), as in this example:
/* Copy with a size known only at run time. */
void *m(void *a, void *b, size_t size)
{
return __builtin_memcpy(a,b,size);
}
/* Copy with a small compile-time constant size (16 bytes). */
void *m1(void *a, void *b)
{
return __builtin_memcpy(a,b,16);
}
/* Copy with a larger compile-time constant size (200 bytes). */
void *m2(void *a, void *b)
{
return __builtin_memcpy(a,b,200);
}
/* Buffers used as copy source/destination in main. */
volatile int a[1000],b[10000], c[1000];
int main(void)
{
/* 16-byte copy from b to a. */
m((void *)a,(void *)b,16);
/* Emits the text ":::m" into the assembly output; it appears as a marker between the two copies in the listing. */
__asm(":::m");
/* 400-byte copy from c to a. */
m((void *)a,(void *)c,400);
}
The resulting code will depend on the ARM architecture (in particular, on whether unaligned accesses are legal).
CORTEX-M4 CORTEX-M0
m: m:
b memcpy push {r4, lr}
m1: bl memcpy
push {r4, r5} pop {r4, pc}
ldr r5, [r1] # unaligned m1:
ldr r4, [r1, #4] # unaligned push {r4, lr}
ldr r2, [r1, #8] # unaligned movs r2, #16
ldr r1, [r1, #12] # unaligned bl memcpy
str r1, [r0, #12] # unaligned pop {r4, pc}
str r5, [r0] # unaligned m2:
str r4, [r0, #4] # unaligned push {r4, lr}
str r2, [r0, #8] # unaligned movs r2, #200
pop {r4, r5} bl memcpy
bx lr pop {r4, pc}
m2: main:
movs r2, #200 ldr r0, .L6
b memcpy push {r4, r5, r6, lr}
main: movs r2, r0
push {r4, lr} ldr r3, .L6+4
ldr r3, .L8 ldmia r3!, {r1, r4, r5}
ldr r4, .L8+4 stmia r2!, {r1, r4, r5}
ldm r3, {r0, r1, r2, r3} ldr r3, [r3]
stm r4, {r0, r1, r2, r3} str r3, [r2]
:::m :::m
mov r2, #400 movs r2, #200
mov r0, r4 ldr r1, .L6+8
ldr r1, .L8+8 lsls r2, r2, #1
bl memcpy bl memcpy
movs r0, #0 movs r0, #0
pop {r4, pc} pop {r4, r5, r6, pc}
.L8: .L6:
.word b .word a
.word a .word b
.word c .word c
https://godbolt.org/z/fh68cv

ARM assembly output isn't functioning correctly

This ARM assembly program is supposed to print the result of each step. However, the result of line 18, add r4, r5, r4, lsl #1, isn't being printed correctly and I am not sure why.
@ Prints the intermediate results of a short arithmetic sequence with printf.
@ Register use: r4 = running value, r5 = copy of r4 taken before the shifted add.
@ Two lines of the posted text could not assemble and are repaired here:
@   `n: .word word 1` -> `n: .word 1`   and   `mov PC, or` -> `mov pc, lr`.
@ Everything else behaves as posted; NOTE(review) comments mark the likely
@ causes of the unexpected output.
        .data
str1:   .asciz  "%d and %d are the results \n"
n:      .word   1
        .text
        .global main
main:   stmfd   sp!, {lr}               @ keep the return address across the printf calls
                                        @ NOTE(review): r4/r5 are modified without being saved -- check against the calling convention
        ldr     r4, =n
        ldr     r4, [r4]                @ r4 = n
        add     r4, r4, #1              @ r4 = n + 1
        mov     r1, r4
        ldr     r0, =str1
        bl      printf                  @ NOTE(review): str1 has two %d but only r1 is set; r2 holds whatever it had before

        mov     r5, r4                  @ r5 = r4
        mov     r1, r4
        mov     r2, r5
        ldr     r0, =str1
        bl      printf                  @ both %d arguments supplied (r1 = r4, r2 = r5)

        add     r4, r5, r4, lsl #1      @ r4 = r5 + (r4 << 1)
        mov     r1, r4
        ldr     r0, =str1
        bl      printf                  @ NOTE(review): r2 is not reloaded after the previous call, so the second %d is unspecified

        ldmfd   sp!, {lr}               @ restore the return address
        mov     r0, #0                  @ return value 0
        mov     pc, lr                  @ return to the caller
        .end

Self-written simple memset not working with -O3 eabi gcc on ARMv7

I wrote a very simple memset in c that works fine up to -O2 but not with -O3...
memset:
/*
 * Byte-wise memset replacement for a bare-metal build.
 *
 * blk: start of the region to fill
 * c:   fill value; only its low byte is stored (cast to unsigned char)
 * n:   number of bytes to write; n == 0 writes nothing
 * Returns blk unchanged.
 *
 * NOTE: in the -O3 listing shown below, this loop is compiled into a
 * `bl memset`, i.e. a call back into this same function. The answer
 * suggests building with -ffreestanding to avoid that.
 */
void * memset(void * blk, int c, size_t n)
{
unsigned char * dst = blk;
/* Post-decrement: the body runs exactly n times. */
while (n-- > 0)
*dst++ = (unsigned char)c;
return blk;
}
...which compiles to this assembly when using -O2:
20000430 <memset>:
20000430: e3520000 cmp r2, #0 # compare param 'n' with zero
20000434: 012fff1e bxeq lr # if equal return to caller
20000438: e6ef1071 uxtb r1, r1 # else zero extend (extract byte from) param 'c'
2000043c: e0802002 add r2, r0, r2 # add pointer 'blk' to 'n'
20000440: e1a03000 mov r3, r0 # move pointer 'blk' to r3
20000444: e4c31001 strb r1, [r3], #1 # store value of 'c' to address of r3, increment r3 for next pass
20000448: e1530002 cmp r3, r2 # compare current store address to calculated max address
2000044c: 1afffffc bne 20000444 <memset+0x14> # if not equal store next byte
20000450: e12fff1e bx lr # else back to caller
This makes sense to me. I annotated what happens here.
When I compile it with -O3 the program crashes. My memset calls itself repeatedly until it ate the whole stack:
200005e4 <memset>:
200005e4: e3520000 cmp r2, #0 # compare param 'n' with zero
200005e8: e92d4010 push {r4, lr} # ? (1)
200005ec: e1a04000 mov r4, r0 # move pointer 'blk' to r4 (temp to hold return value)
200005f0: 0a000001 beq 200005fc <memset+0x18> # if equal (first line compare) jump to epilogue
200005f4: e6ef1071 uxtb r1, r1 # zero extend (extract byte from) param 'c'
200005f8: ebfffff9 bl 200005e4 <memset> # call myself ? (2)
200005fc: e1a00004 mov r0, r4 # epilogue start. move return value to r0
20000600: e8bd8010 pop {r4, pc} # restore r4 and back to caller
I can't figure out how this optimised version is supposed to work without any strb or similar. It doesn't matter if I try to set the memory to '0' or something else so the function is not only called on .bss (zero initialised) variables.
(1) This is a problem. This push gets endlessly repeated without a matching pop as it's called by (2) when the function doesn't early-exit because of 'n' being zero. I verified this with uart prints. Also r2 is never touched so why should the compare to zero ever become true?
Please help me understand what's happening here. Is the compiler assuming prerequisites that I may not fulfill?
Background: I'm using external code that requires memset in my baremetal project so I rolled my own. It's only used once on startup and not performance critical.
/edit: The compiler is called with these options:
arm-none-eabi-gcc -O3 -Wall -Wextra -fPIC -nostdlib -nostartfiles -marm -fstrict-volatile-bitfields -march=armv7-a -mcpu=cortex-a9 -mfloat-abi=hard -mfpu=neon-vfpv3
Your first question (1): that is per the calling convention. If you are going to make a nested function call, you need to preserve the link register, and the stack needs to stay 64-bit aligned. The code uses r4, so that is the extra register saved. No magic there.
Your second question (2): it is not deliberately calling your memset; the compiler is optimizing your code because it recognizes the loop as an (inefficient) memset and replaces it with a call to memset. Fuz has provided the answers to your question.
Rename the function
00000000 <xmemset>:
0: e3520000 cmp r2, #0
4: e92d4010 push {r4, lr}
8: e1a04000 mov r4, r0
c: 0a000001 beq 18 <xmemset+0x18>
10: e6ef1071 uxtb r1, r1
14: ebfffffe bl 0 <memset>
18: e1a00004 mov r0, r4
1c: e8bd8010 pop {r4, pc}
and you can see this.
If you were to use -ffreestanding as Fuz recommended then you see this or something like it
00000000 <xmemset>:
0: e3520000 cmp r2, #0
4: 012fff1e bxeq lr
8: e92d41f0 push {r4, r5, r6, r7, r8, lr}
c: e2426001 sub r6, r2, #1
10: e3560002 cmp r6, #2
14: e6efe071 uxtb lr, r1
18: 9a00002a bls c8 <xmemset+0xc8>
1c: e3a0c000 mov r12, #0
20: e3520023 cmp r2, #35 ; 0x23
24: e7c7c01e bfi r12, lr, #0, #8
28: e1a04122 lsr r4, r2, #2
2c: e7cfc41e bfi r12, lr, #8, #8
30: e7d7c81e bfi r12, lr, #16, #8
34: e7dfcc1e bfi r12, lr, #24, #8
38: 9a000024 bls d0 <xmemset+0xd0>
3c: e2445009 sub r5, r4, #9
40: e1a03000 mov r3, r0
44: e3c55007 bic r5, r5, #7
48: e3a07000 mov r7, #0
4c: e2851008 add r1, r5, #8
50: e1570005 cmp r7, r5
54: f5d3f0a0 pld [r3, #160] ; 0xa0
58: e1a08007 mov r8, r7
5c: e583c000 str r12, [r3]
60: e583c004 str r12, [r3, #4]
64: e2877008 add r7, r7, #8
68: e583c008 str r12, [r3, #8]
6c: e2833020 add r3, r3, #32
70: e503c014 str r12, [r3, #-20] ; 0xffffffec
74: e503c010 str r12, [r3, #-16]
78: e503c00c str r12, [r3, #-12]
7c: e503c008 str r12, [r3, #-8]
80: e503c004 str r12, [r3, #-4]
84: 1afffff1 bne 50 <xmemset+0x50>
88: e2811001 add r1, r1, #1
8c: e483c004 str r12, [r3], #4
90: e1540001 cmp r4, r1
94: 8afffffb bhi 88 <xmemset+0x88>
98: e3c23003 bic r3, r2, #3
9c: e1520003 cmp r2, r3
a0: e0466003 sub r6, r6, r3
a4: e0803003 add r3, r0, r3
a8: 08bd81f0 popeq {r4, r5, r6, r7, r8, pc}
ac: e3560000 cmp r6, #0
b0: e5c3e000 strb lr, [r3]
b4: 08bd81f0 popeq {r4, r5, r6, r7, r8, pc}
b8: e3560001 cmp r6, #1
bc: e5c3e001 strb lr, [r3, #1]
c0: 15c3e002 strbne lr, [r3, #2]
c4: e8bd81f0 pop {r4, r5, r6, r7, r8, pc}
c8: e1a03000 mov r3, r0
cc: eafffff6 b ac <xmemset+0xac>
d0: e1a03000 mov r3, r0
d4: e3a01000 mov r1, #0
d8: eaffffea b 88 <xmemset+0x88>
which looks like it simply inlined memset: the one the compiler knows (the faster one), not your code.
So if you want it to use your code then stick with -O2. Yours is pretty inefficient so not sure why you need to push it any further than it was.
20000444: e4c31001 strb r1, [r3], #1 # store value of 'c' to address of r3, increment r3 for next pass
20000448: e1530002 cmp r3, r2 # compare current store address to calculated max address
2000044c: 1afffffc bne 20000444 <memset+0x14> # if not equal store next byte
It isn't going to get any better than that without replacing your code with something else.
Fuz already answered the question:
Compile with -fno-builtin-memset. The compiler recognises that the function implements memset and thus replaces it with a call to memset. You should in general compile with -ffreestanding when writing bare-metal code. I believe this fixes this sort of problem, too
It is replacing your code with memset, if you want it not to do that use -ffreestanding.
If you wish to go beyond that and wonder why -fno-builtin-memset didn't work that is a question for the gcc folks, file a ticket, let us know what they say (or just look at the compiler source code).

arm7tdmi assembly explanation + crash debugging

I'm currently investigating a crash in code compiled with gcc 4.2.1 for the arm7tdmi architecture (I could use 4.9.3 on demand). I'm using an LPC2387 and I'm getting wdog resets. Instead of wdog resets I'm using wdog interrupts, so when it would otherwise reset, it gets into my handler, which saves state and prints a whole memory dump (64k only). So basically I know the registers before the wdog reset and have a stack showing all of the call history.
On the stack I can see loads of references to the end of the function, and I see many instructions as data in the memory region. Which I think will become the reason for the halt and then the consequent wdog interrupt. Any ideas what might be happening?
I guess reasons can be when dereferencing a function pointer, but my function seems to be quite straight forward. It is touching many hardware registers (interrupt, peripheral enable/disable).
Like this:
2015/05/27 04:45:30: addr: 4000BF2C value:7FE00390 -->this is "svcvc 0x00e00390" according to gcc 4.2.1 and ".word 0x7fe00390" according to 4.9.3.
Also at the end of the function I see this in gcc 4.9.3
191d4: e89d6ff8 ldm sp, {r3, r4, r5, r6, r7, r8, r9, sl, fp, sp, lr}
191d8: e12fff1e bx lr
191dc: 7fe00390 .word 0x7fe00390
191e0: 40000044 .word 0x40000044
191e4: 00064de5 .word 0x00064de5
191e8: 00064dfb .word 0x00064dfb
191ec: 4000107c .word 0x4000107c
191f0: e0028000 .word 0xe0028000
191f4: e01fc000 .word 0xe01fc000
191f8: 40001084 .word 0x40001084
191fc: 4000113c .word 0x4000113c
19200: 3800b010 .word 0x3800b010
19204: 40002a78 .word 0x40002a78
19208: 40002ab4 .word 0x40002ab4
1920c: 40002aa0 .word 0x40002aa0
19210: 40001080 .word 0x40001080
19214: 400001a9 .word 0x400001a9
19218: e002c000 .word 0xe002c000
1921c: 40001134 .word 0x40001134
19220: 00064e0e .word 0x00064e0e
It used to look like this on gcc 4.2.1:
1953c: 7fe00390 svcvc 0x00e00390
19540: 40000044 andmi r0, r0, r4, asr #32
19544: 0006d74c andeq sp, r6, ip, asr #14
19548: 0006d764 andeq sp, r6, r4, ror #14
1954c: 400012d0 ldrmid r1, [r0], -r0
19550: e0028000 and r8, r2, r0
19554: e01fc000 ands ip, pc, r0
19558: 40001390 mulmi r0, r0, r3
1955c: 40001394 mulmi r0, r4, r3
19560: e002c040 and ip, r2, r0, asr #32
19564: 40002e54 andmi r2, r0, r4, asr lr
19568: e002c068 and ip, r2, r8, rrx
1956c: e002c000 and ip, r2, r0
19570: 40002e90 mulmi r0, r0, lr
19574: e002c02c and ip, r2, ip, lsr #32
19578: 3fffc000 svccc 0x00ffc000
1957c: 40002e7c andmi r2, r0, ip, ror lr
19580: 3fffc0a0 svccc 0x00ffc0a0
19584: 400012d4 ldrmid r1, [r0], -r4
19588: 400001a1 andmi r0, r0, r1, lsr #3
1958c: 400012d8 ldrmid r1, [r0], -r8
19590: 0006d778 andeq sp, r6, r8, ror r7
Can someone explain to me what is at the end of the function? What are the .word regions? Why would I see pointers to this area on the stack?
Thanks,
Peter
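The bytes after the end of the function are usually data: a literal pool holding constants and addresses that the function loads with PC-relative LDR instructions. That is why the newer disassembler shows them as .word, while the older one tries to decode them as instructions.
e.g. if I have void *somePtr = 0xABCDEF12; then you will typically get an LDR instruction that loads the value from this pool into a register and, assuming little-endian operation, you'll see the byte sequence 12 EF CD AB in hex.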

calculating the address of global offset table in arm literal pool

I am trying to understand the arm assembly code for writing the Literal Pool and Global OFFSET table
Compiling the C code with GNU ARM GCC
/* Global defined in another translation unit; in PIC code it is reached through the GOT. */
extern int i;
/* Stores j into the global i and returns the value i held before the store. */
int foo(int j)
{
int t = i;
i = j;
return t;
}
GCC generates following code:
@ Compiler output for foo(): locates the global i through the GOT,
@ returns its old value and stores the argument (r0) into it.
foo:
ldr r3, .L2                 @ r3 = _GLOBAL_OFFSET_TABLE_ - (.LPIC0+4), from the literal pool
ldr r2, .L2+4               @ r2 = offset of i's entry inside the GOT
.LPIC0:
add r3, pc                  @ pc reads as .LPIC0+4 here, so r3 = address of the GOT
ldr r3, [r3, r2]            @ r3 = address of i, read from its GOT entry
# sp needed for prologue
ldr r2, [r3]                @ r2 = old value of i (t)
str r0, [r3]                @ i = j
mov r0, r2                  @ return t
bx lr
.L3:
.align 2
@ Literal pool: the two words loaded by the ldr instructions above.
.L2:
.word _GLOBAL_OFFSET_TABLE_-(.LPIC0+4)
.word i(GOT)
I want to manually handle the global offset table in arm assembly.
Now I am facing difficulty to understand the above code.
Can any one please describe the literal pool calculation following lines of code?
.L2:
.word _GLOBAL_OFFSET_TABLE_-(.LPIC0+4)
.word i(GOT)
When compiled as PIC (position-independent code), global variables need to be accessed through relocations.
foo:
ldr r3, .L2
ldr r2, .L2+4
.LPIC0:
add r3, pc
ldr r3, [r3, r2]
Notice add r3, pc: in this instruction pc reads as .LPIC0+4, so adding it to the literal _GLOBAL_OFFSET_TABLE_-(.LPIC0+4) held in r3 yields _GLOBAL_OFFSET_TABLE_, the base address of the GOT.
.L2+4 is i(GOT), the offset of the variable i's entry in the GOT.
Looking at the output of objdump makes this more intuitive.
00000450 <foo>:
450: 4b03 ldr r3, [pc, #12] ; (460 <foo+0x10>)
452: 4a04 ldr r2, [pc, #16] ; (464 <foo+0x14>)
454: 447b add r3, pc
456: 589b ldr r3, [r3, r2]
458: 681a ldr r2, [r3, #0]
45a: 6018 str r0, [r3, #0]
45c: 4610 mov r0, r2
45e: 4770 bx lr
460: 00008ba8 andeq r8, r0, r8, lsr #23
464: 0000001c andeq r0, r0, ip, lsl r0
468: f3af 8000 nop.w
46c: f3af 8000 nop.w
In the disassembly, .L2 and .L2+4 are replaced with specific offsets. The result of add r3, pc is 0x8ba8 + 0x458 = 0x9000 (pc reads as 0x454 + 4). Then ldr r3, [r3, r2] loads from address 0x9000 + 0x1c = 0x901c. Look up these addresses in the section headers:
[Nr] Name Type Addr Off Size ES Flg Lk Inf Al
...
[17] .got PROGBITS 00009000 001000 000024 04 WA 0 0 4
...
The address 0x9000 is the start of the global offset table, and 0x901c is also in this section. The symbol for 0x901c can be found in the .rel.dyn section:
Relocation section '.rel.dyn' at offset 0x348 contains 7 entries:
Offset Info Type Sym.Value Sym. Name
...
00009018 00000415 R_ARM_GLOB_DAT 00000000 _Jv_RegisterClasses
0000901c 00000515 R_ARM_GLOB_DAT 00000000 i
00009020 00000615 R_ARM_GLOB_DAT 00000000 __cxa_finalize

Resources