I am trying to get interrupts to work on an ARM processor using QEMU. I have tried Google and various attempts to solve it, but I cannot get my IRQ handler to be called.
I also tried keyboard interrupts, which did not work either. I think the book I have may have a few mistakes, as the UART driver ended up being slightly different than described.
My assembly code:
.code 32
.global start, vectors_start, vectors_end, lock, unlock
start:
LDR sp, =stack_top
BL copy_vector
MSR cpsr, #0x92
LDR sp, =irq_stack_top
MSR cpsr, #0x13
bl unlock
BL main
B .
irq_handler:
sub lr, lr, #4
stmfd sp!, {r0-r12, lr}
bl IRQ_handler
ldmfd sp!, {r0-r12, pc}^
vectors_start:
LDR PC, reset_handler_addr
LDR PC, undef_handler_addr
LDR PC, swi_handler_addr
LDR PC, prefetch_abort_handler_addr
LDR PC, data_abort_handler_addr
B .
LDR PC, irq_handler_addr
LDR PC, fiq_handler_addr
reset_handler_addr: .word start
undef_handler_addr: .word undef_handler
swi_handler_addr: .word swi_handler
prefetch_abort_handler_addr: .word prefetch_abort_handler
data_abort_handler_addr: .word data_abort_handler
irq_handler_addr: .word irq_handler
fiq_handler_addr: .word fiq_handler
vectors_end:
lock:
MRS r0, CPSR
ORR r0,r0,#0x80
MSR CPSR, r0
mov PC, lr
unlock:
MRS r0,CPSR
BIC r0,r0,#0x80
MSR CPSR, r0
mov PC, lr
Relevant C functions:
void copy_vector(void){
extern u32 vectors_start, vectors_end;
u32 *vectors_src = &vectors_start;
u32 *vectors_dst = (u32 *) 0;
while (vectors_src<&vectors_end){
*vectors_dst++ = *vectors_src++;
}
}
// this function has been modified somewhat because the book was incomplete
// the print statement never prints
void IRQ_handler(void){
int vicstatus = 0;
uprints(uarts,"in interupt");
// int vicstatus = 0;
if(1){
if(*(tp[0]->base+TVALUE)==0){
timer_handler(0);
}
if(*(tp[1]->base+TVALUE)==0){
timer_handler(1);
}
}
if( 1){
if(*(tp[2]->base+TVALUE)==0){
timer_handler(2);
}
if(*(tp[3]->base+TVALUE)==0){
timer_handler(3);
}
}
}
void timer_init()
{
// timers[0].base = (u32*) 0x101E2000;
// timers[1].base = (u32*) 0x101E2020;
// timers[2].base = (u32*) 0x101E3000;
// timers[3].base = (u32*) 0x101E3020;
int i; TIMER * tp;
for (i = 0; i<4; ++i){
tp = &timers[i];
if(i==0) tp->base = (u32*) 0x101E2000;
if(i==1) tp->base = (u32*) 0x101E2020;
if (i == 2) tp->base = (u32*) 0x101E3000;
if (i == 3) tp->base = (u32*) 0x101E3020;
*(tp->base+TLOAD) = 0x0;
*(tp->base+TVALUE) = 0xFFFFFFFF;
*(tp->base+TRIS) = 0x0;
*(tp->base+TMIS) = 0x0;
*(tp->base+TLOAD) = 0x100;
*(tp->base+TCNTL) = 0x66;
*(tp->base+TBGLOAD) = 0x1C00;
*(tp->base ) |= (unsigned int)(1<<6);
tp->tick = 0;
///uprints(uarts,"timer init");
}
}
When I reset the machine in QEMU I get the message Timer with delta zero disabling.
I can get the message to disappear by changing: *(tp->base+TLOAD) = 0x0;
to *(tp->base+TLOAD) = 0x1; but the timer still doesn't work.
Lastly, my build script is:
arm-none-eabi-as -o start3.o start3.s
arm-none-eabi-gcc -c main3.c
arm-none-eabi-ld -T t.ld -o start3.elf start3.o main3.o
arm-none-eabi-nm start3.elf
arm-none-eabi-objcopy -O binary start3.elf start3.bin
qemu-system-arm -M versatilepb -m 128M -kernel start3.bin
edit: in addition here is the disassembled code from my main.c file (which includes a few other c files for the drivers)
the board is the versatilepb which according to the docs emulates the following:
ARM926E, ARM1136 or Cortex-A8 CPU
00000000 <uart_init>:
0: e52db004 push {fp} ; (str fp, [sp, #-4]!)
4: e28db000 add fp, sp, #0
8: e24dd00c sub sp, sp, #12
c: e3a03000 mov r3, #0
10: e50b3008 str r3, [fp, #-8]
14: ea000011 b 60 <uart_init+0x60>
18: e51b3008 ldr r3, [fp, #-8]
1c: e1a03183 lsl r3, r3, #3
20: e59f2064 ldr r2, [pc, #100] ; 8c <uart_init+0x8c>
24: e0833002 add r3, r3, r2
28: e50b300c str r3, [fp, #-12]
2c: e51b2008 ldr r2, [fp, #-8]
30: e59f3058 ldr r3, [pc, #88] ; 90 <uart_init+0x90>
34: e0823003 add r3, r2, r3
38: e1a03603 lsl r3, r3, #12
3c: e1a02003 mov r2, r3
40: e51b300c ldr r3, [fp, #-12]
44: e5832000 str r2, [r3]
48: e51b300c ldr r3, [fp, #-12]
4c: e51b2008 ldr r2, [fp, #-8]
50: e5832004 str r2, [r3, #4]
54: e51b3008 ldr r3, [fp, #-8]
58: e2833001 add r3, r3, #1
5c: e50b3008 str r3, [fp, #-8]
60: e51b3008 ldr r3, [fp, #-8]
64: e3530003 cmp r3, #3
68: daffffea ble 18 <uart_init+0x18>
6c: e59f3018 ldr r3, [pc, #24] ; 8c <uart_init+0x8c>
70: e59f201c ldr r2, [pc, #28] ; 94 <uart_init+0x94>
74: e5832018 str r2, [r3, #24]
78: e1a00000 nop ; (mov r0, r0)
7c: e1a00003 mov r0, r3
80: e28bd000 add sp, fp, #0
84: e49db004 pop {fp} ; (ldr fp, [sp], #4)
88: e12fff1e bx lr
8c: 00000000 .word 0x00000000
90: 000101f1 .word 0x000101f1
94: 10009000 .word 0x10009000
00000098 <ugetc>:
98: e52db004 push {fp} ; (str fp, [sp, #-4]!)
9c: e28db000 add fp, sp, #0
a0: e24dd00c sub sp, sp, #12
a4: e50b0008 str r0, [fp, #-8]
a8: e1a00000 nop ; (mov r0, r0)
ac: e51b3008 ldr r3, [fp, #-8]
b0: e5933000 ldr r3, [r3]
b4: e2833018 add r3, r3, #24
b8: e5d33000 ldrb r3, [r3]
bc: e2033010 and r3, r3, #16
c0: e3530000 cmp r3, #0
c4: 1afffff8 bne ac <ugetc+0x14>
c8: e51b3008 ldr r3, [fp, #-8]
cc: e5933000 ldr r3, [r3]
d0: e5d33000 ldrb r3, [r3]
d4: e1a00003 mov r0, r3
d8: e28bd000 add sp, fp, #0
dc: e49db004 pop {fp} ; (ldr fp, [sp], #4)
e0: e12fff1e bx lr
000000e4 <uputc>:
e4: e52db004 push {fp} ; (str fp, [sp, #-4]!)
e8: e28db000 add fp, sp, #0
ec: e24dd00c sub sp, sp, #12
f0: e50b0008 str r0, [fp, #-8]
f4: e1a03001 mov r3, r1
f8: e54b3009 strb r3, [fp, #-9]
fc: e1a00000 nop ; (mov r0, r0)
100: e51b3008 ldr r3, [fp, #-8]
104: e5933000 ldr r3, [r3]
108: e2833018 add r3, r3, #24
10c: e5d33000 ldrb r3, [r3]
110: e2033020 and r3, r3, #32
114: e3530000 cmp r3, #0
118: 1afffff8 bne 100 <uputc+0x1c>
11c: e51b3008 ldr r3, [fp, #-8]
120: e5933000 ldr r3, [r3]
124: e55b2009 ldrb r2, [fp, #-9]
128: e5c32000 strb r2, [r3]
12c: e1a00000 nop ; (mov r0, r0)
130: e1a00003 mov r0, r3
134: e28bd000 add sp, fp, #0
138: e49db004 pop {fp} ; (ldr fp, [sp], #4)
13c: e12fff1e bx lr
00000140 <ugets>:
140: e92d4800 push {fp, lr}
144: e28db004 add fp, sp, #4
148: e24dd008 sub sp, sp, #8
14c: e50b0008 str r0, [fp, #-8]
150: e50b100c str r1, [fp, #-12]
154: ea000007 b 178 <ugets+0x38>
158: e51b300c ldr r3, [fp, #-12]
15c: e5d33000 ldrb r3, [r3]
160: e1a01003 mov r1, r3
164: e51b0008 ldr r0, [fp, #-8]
168: ebfffffe bl e4 <uputc>
16c: e51b300c ldr r3, [fp, #-12]
170: e2833001 add r3, r3, #1
174: e50b300c str r3, [fp, #-12]
178: e51b0008 ldr r0, [fp, #-8]
17c: ebfffffe bl 98 <ugetc>
180: e1a03000 mov r3, r0
184: e20320ff and r2, r3, #255 ; 0xff
188: e51b300c ldr r3, [fp, #-12]
18c: e5c32000 strb r2, [r3]
190: e51b300c ldr r3, [fp, #-12]
194: e5d33000 ldrb r3, [r3]
198: e353000d cmp r3, #13
19c: 1affffed bne 158 <ugets+0x18>
1a0: e51b300c ldr r3, [fp, #-12]
1a4: e3a02000 mov r2, #0
1a8: e5c32000 strb r2, [r3]
1ac: e1a00000 nop ; (mov r0, r0)
1b0: e1a00003 mov r0, r3
1b4: e24bd004 sub sp, fp, #4
1b8: e8bd4800 pop {fp, lr}
1bc: e12fff1e bx lr
000001c0 <uprints>:
1c0: e92d4800 push {fp, lr}
1c4: e28db004 add fp, sp, #4
1c8: e24dd008 sub sp, sp, #8
1cc: e50b0008 str r0, [fp, #-8]
1d0: e50b100c str r1, [fp, #-12]
1d4: ea000006 b 1f4 <uprints+0x34>
1d8: e51b300c ldr r3, [fp, #-12]
1dc: e2832001 add r2, r3, #1
1e0: e50b200c str r2, [fp, #-12]
1e4: e5d33000 ldrb r3, [r3]
1e8: e1a01003 mov r1, r3
1ec: e51b0008 ldr r0, [fp, #-8]
1f0: ebfffffe bl e4 <uputc>
1f4: e51b300c ldr r3, [fp, #-12]
1f8: e5d33000 ldrb r3, [r3]
1fc: e3530000 cmp r3, #0
200: 1afffff4 bne 1d8 <uprints+0x18>
204: e1a00000 nop ; (mov r0, r0)
208: e1a00003 mov r0, r3
20c: e24bd004 sub sp, fp, #4
210: e8bd4800 pop {fp, lr}
214: e12fff1e bx lr
00000218 <fbuf_init>:
218: e52db004 push {fp} ; (str fp, [sp, #-4]!)
21c: e28db000 add fp, sp, #0
220: e59f3060 ldr r3, [pc, #96] ; 288 <fbuf_init+0x70>
224: e3a02602 mov r2, #2097152 ; 0x200000
228: e5832000 str r2, [r3]
22c: e59f3058 ldr r3, [pc, #88] ; 28c <fbuf_init+0x74>
230: e59f2058 ldr r2, [pc, #88] ; 290 <fbuf_init+0x78>
234: e5832000 str r2, [r3]
238: e59f3054 ldr r3, [pc, #84] ; 294 <fbuf_init+0x7c>
23c: e59f2054 ldr r2, [pc, #84] ; 298 <fbuf_init+0x80>
240: e5832000 str r2, [r3]
244: e59f3050 ldr r3, [pc, #80] ; 29c <fbuf_init+0x84>
248: e59f2050 ldr r2, [pc, #80] ; 2a0 <fbuf_init+0x88>
24c: e5832000 str r2, [r3]
250: e59f304c ldr r3, [pc, #76] ; 2a4 <fbuf_init+0x8c>
254: e59f204c ldr r2, [pc, #76] ; 2a8 <fbuf_init+0x90>
258: e5832000 str r2, [r3]
25c: e59f3048 ldr r3, [pc, #72] ; 2ac <fbuf_init+0x94>
260: e3a02602 mov r2, #2097152 ; 0x200000
264: e5832000 str r2, [r3]
268: e59f3040 ldr r3, [pc, #64] ; 2b0 <fbuf_init+0x98>
26c: e59f2040 ldr r2, [pc, #64] ; 2b4 <fbuf_init+0x9c>
270: e5832000 str r2, [r3]
274: e1a00000 nop ; (mov r0, r0)
278: e1a00003 mov r0, r3
27c: e28bd000 add sp, fp, #0
280: e49db004 pop {fp} ; (ldr fp, [sp], #4)
284: e12fff1e bx lr
288: 00000000 .word 0x00000000
28c: 1000001c .word 0x1000001c
290: 00002c77 .word 0x00002c77
294: 10120000 .word 0x10120000
298: 3f1f3f9c .word 0x3f1f3f9c
29c: 10120004 .word 0x10120004
2a0: 090b61df .word 0x090b61df
2a4: 10120008 .word 0x10120008
2a8: 071f1800 .word 0x071f1800
2ac: 10120010 .word 0x10120010
2b0: 10120018 .word 0x10120018
2b4: 0000082b .word 0x0000082b
000002b8 <draw_box>:
2b8: e52db004 push {fp} ; (str fp, [sp, #-4]!)
2bc: e28db000 add fp, sp, #0
2c0: e24dd01c sub sp, sp, #28
2c4: e50b0010 str r0, [fp, #-16]
2c8: e50b1014 str r1, [fp, #-20] ; 0xffffffec
2cc: e50b2018 str r2, [fp, #-24] ; 0xffffffe8
2d0: e50b301c str r3, [fp, #-28] ; 0xffffffe4
2d4: e3a03000 mov r3, #0
2d8: e50b3008 str r3, [fp, #-8]
2dc: ea00001c b 354 <draw_box+0x9c>
2e0: e3a03000 mov r3, #0
2e4: e50b300c str r3, [fp, #-12]
2e8: ea000012 b 338 <draw_box+0x80>
2ec: e59f3084 ldr r3, [pc, #132] ; 378 <draw_box+0xc0>
2f0: e5932000 ldr r2, [r3]
2f4: e51b100c ldr r1, [fp, #-12]
2f8: e51b3014 ldr r3, [fp, #-20] ; 0xffffffec
2fc: e0813003 add r3, r1, r3
300: e59f1074 ldr r1, [pc, #116] ; 37c <draw_box+0xc4>
304: e5911000 ldr r1, [r1]
308: e0010193 mul r1, r3, r1
30c: e51b3008 ldr r3, [fp, #-8]
310: e0811003 add r1, r1, r3
314: e51b3010 ldr r3, [fp, #-16]
318: e0813003 add r3, r1, r3
31c: e1a03103 lsl r3, r3, #2
320: e0823003 add r3, r2, r3
324: e51b201c ldr r2, [fp, #-28] ; 0xffffffe4
328: e5832000 str r2, [r3]
32c: e51b300c ldr r3, [fp, #-12]
330: e2833001 add r3, r3, #1
334: e50b300c str r3, [fp, #-12]
338: e51b200c ldr r2, [fp, #-12]
33c: e51b3018 ldr r3, [fp, #-24] ; 0xffffffe8
340: e1520003 cmp r2, r3
344: baffffe8 blt 2ec <draw_box+0x34>
348: e51b3008 ldr r3, [fp, #-8]
34c: e2833001 add r3, r3, #1
350: e50b3008 str r3, [fp, #-8]
354: e51b2008 ldr r2, [fp, #-8]
358: e51b3018 ldr r3, [fp, #-24] ; 0xffffffe8
35c: e1520003 cmp r2, r3
360: baffffde blt 2e0 <draw_box+0x28>
364: e1a00000 nop ; (mov r0, r0)
368: e1a00000 nop ; (mov r0, r0)
36c: e28bd000 add sp, fp, #0
370: e49db004 pop {fp} ; (ldr fp, [sp], #4)
374: e12fff1e bx lr
...
00000380 <timer_init>:
380: e52db004 push {fp} ; (str fp, [sp, #-4]!)
384: e28db000 add fp, sp, #0
388: e24dd00c sub sp, sp, #12
38c: e3a03000 mov r3, #0
390: e50b3008 str r3, [fp, #-8]
394: ea000048 b 4bc <timer_init+0x13c>
398: e51b3008 ldr r3, [fp, #-8]
39c: e1a03183 lsl r3, r3, #3
3a0: e59f2134 ldr r2, [pc, #308] ; 4dc <timer_init+0x15c>
3a4: e0833002 add r3, r3, r2
3a8: e50b300c str r3, [fp, #-12]
3ac: e51b3008 ldr r3, [fp, #-8]
3b0: e3530000 cmp r3, #0
3b4: 1a000002 bne 3c4 <timer_init+0x44>
3b8: e51b300c ldr r3, [fp, #-12]
3bc: e59f211c ldr r2, [pc, #284] ; 4e0 <timer_init+0x160>
3c0: e5832000 str r2, [r3]
3c4: e51b3008 ldr r3, [fp, #-8]
3c8: e3530001 cmp r3, #1
3cc: 1a000002 bne 3dc <timer_init+0x5c>
3d0: e51b300c ldr r3, [fp, #-12]
3d4: e59f2108 ldr r2, [pc, #264] ; 4e4 <timer_init+0x164>
3d8: e5832000 str r2, [r3]
3dc: e51b3008 ldr r3, [fp, #-8]
3e0: e3530002 cmp r3, #2
3e4: 1a000002 bne 3f4 <timer_init+0x74>
3e8: e51b300c ldr r3, [fp, #-12]
3ec: e59f20f4 ldr r2, [pc, #244] ; 4e8 <timer_init+0x168>
3f0: e5832000 str r2, [r3]
3f4: e51b3008 ldr r3, [fp, #-8]
3f8: e3530003 cmp r3, #3
3fc: 1a000002 bne 40c <timer_init+0x8c>
400: e51b300c ldr r3, [fp, #-12]
404: e59f20e0 ldr r2, [pc, #224] ; 4ec <timer_init+0x16c>
408: e5832000 str r2, [r3]
40c: e51b300c ldr r3, [fp, #-12]
410: e5933000 ldr r3, [r3]
414: e3a02000 mov r2, #0
418: e5832000 str r2, [r3]
41c: e51b300c ldr r3, [fp, #-12]
420: e5933000 ldr r3, [r3]
424: e2833004 add r3, r3, #4
428: e3e02000 mvn r2, #0
42c: e5832000 str r2, [r3]
430: e51b300c ldr r3, [fp, #-12]
434: e5933000 ldr r3, [r3]
438: e2833010 add r3, r3, #16
43c: e3a02000 mov r2, #0
440: e5832000 str r2, [r3]
444: e51b300c ldr r3, [fp, #-12]
448: e5933000 ldr r3, [r3]
44c: e2833014 add r3, r3, #20
450: e3a02000 mov r2, #0
454: e5832000 str r2, [r3]
458: e51b300c ldr r3, [fp, #-12]
45c: e5933000 ldr r3, [r3]
460: e3a02c01 mov r2, #256 ; 0x100
464: e5832000 str r2, [r3]
468: e51b300c ldr r3, [fp, #-12]
46c: e5933000 ldr r3, [r3]
470: e2833008 add r3, r3, #8
474: e3a02066 mov r2, #102 ; 0x66
478: e5832000 str r2, [r3]
47c: e51b300c ldr r3, [fp, #-12]
480: e5933000 ldr r3, [r3]
484: e2833018 add r3, r3, #24
488: e3a02b07 mov r2, #7168 ; 0x1c00
48c: e5832000 str r2, [r3]
490: e51b300c ldr r3, [fp, #-12]
494: e5933000 ldr r3, [r3]
498: e5932000 ldr r2, [r3]
49c: e3822040 orr r2, r2, #64 ; 0x40
4a0: e5832000 str r2, [r3]
4a4: e51b300c ldr r3, [fp, #-12]
4a8: e3a02000 mov r2, #0
4ac: e5832004 str r2, [r3, #4]
4b0: e51b3008 ldr r3, [fp, #-8]
4b4: e2833001 add r3, r3, #1
4b8: e50b3008 str r3, [fp, #-8]
4bc: e51b3008 ldr r3, [fp, #-8]
4c0: e3530003 cmp r3, #3
4c4: daffffb3 ble 398 <timer_init+0x18>
4c8: e1a00000 nop ; (mov r0, r0)
4cc: e1a00000 nop ; (mov r0, r0)
4d0: e28bd000 add sp, fp, #0
4d4: e49db004 pop {fp} ; (ldr fp, [sp], #4)
4d8: e12fff1e bx lr
4dc: 00000000 .word 0x00000000
4e0: 101e2000 .word 0x101e2000
4e4: 101e2020 .word 0x101e2020
4e8: 101e3000 .word 0x101e3000
4ec: 101e3020 .word 0x101e3020
000004f0 <timer_clearInterupt>:
4f0: e52db004 push {fp} ; (str fp, [sp, #-4]!)
4f4: e28db000 add fp, sp, #0
4f8: e24dd014 sub sp, sp, #20
4fc: e50b0010 str r0, [fp, #-16]
500: e51b3010 ldr r3, [fp, #-16]
504: e1a03183 lsl r3, r3, #3
508: e59f2028 ldr r2, [pc, #40] ; 538 <timer_clearInterupt+0x48>
50c: e0833002 add r3, r3, r2
510: e50b3008 str r3, [fp, #-8]
514: e51b3008 ldr r3, [fp, #-8]
518: e5933000 ldr r3, [r3]
51c: e283300c add r3, r3, #12
520: e3e02000 mvn r2, #0
524: e5832000 str r2, [r3]
528: e1a00000 nop ; (mov r0, r0)
52c: e28bd000 add sp, fp, #0
530: e49db004 pop {fp} ; (ldr fp, [sp], #4)
534: e12fff1e bx lr
538: 00000000 .word 0x00000000
0000053c <timer_handler>:
53c: e92d4800 push {fp, lr}
540: e28db004 add fp, sp, #4
544: e24dd010 sub sp, sp, #16
548: e50b0010 str r0, [fp, #-16]
54c: e51b3010 ldr r3, [fp, #-16]
550: e1a03183 lsl r3, r3, #3
554: e59f203c ldr r2, [pc, #60] ; 598 <timer_handler+0x5c>
558: e0833002 add r3, r3, r2
55c: e50b3008 str r3, [fp, #-8]
560: e51b3008 ldr r3, [fp, #-8]
564: e5933004 ldr r3, [r3, #4]
568: e2832001 add r2, r3, #1
56c: e51b3008 ldr r3, [fp, #-8]
570: e5832004 str r2, [r3, #4]
574: e51b0010 ldr r0, [fp, #-16]
578: ebfffffe bl 4f0 <timer_clearInterupt>
57c: e59f1018 ldr r1, [pc, #24] ; 59c <timer_handler+0x60>
580: e59f0018 ldr r0, [pc, #24] ; 5a0 <timer_handler+0x64>
584: ebfffffe bl 1c0 <uprints>
588: e1a00000 nop ; (mov r0, r0)
58c: e24bd004 sub sp, fp, #4
590: e8bd4800 pop {fp, lr}
594: e12fff1e bx lr
...
000005a4 <timer_start>:
5a4: e92d4800 push {fp, lr}
5a8: e28db004 add fp, sp, #4
5ac: e24dd010 sub sp, sp, #16
5b0: e50b0010 str r0, [fp, #-16]
5b4: e59f1044 ldr r1, [pc, #68] ; 600 <timer_start+0x5c>
5b8: e59f0044 ldr r0, [pc, #68] ; 604 <timer_start+0x60>
5bc: ebfffffe bl 1c0 <uprints>
5c0: e51b3010 ldr r3, [fp, #-16]
5c4: e1a03183 lsl r3, r3, #3
5c8: e59f2038 ldr r2, [pc, #56] ; 608 <timer_start+0x64>
5cc: e0833002 add r3, r3, r2
5d0: e50b3008 str r3, [fp, #-8]
5d4: e51b3008 ldr r3, [fp, #-8]
5d8: e5933000 ldr r3, [r3]
5dc: e2832008 add r2, r3, #8
5e0: e5922000 ldr r2, [r2]
5e4: e2833008 add r3, r3, #8
5e8: e3822080 orr r2, r2, #128 ; 0x80
5ec: e5832000 str r2, [r3]
5f0: e1a00000 nop ; (mov r0, r0)
5f4: e24bd004 sub sp, fp, #4
5f8: e8bd4800 pop {fp, lr}
5fc: e12fff1e bx lr
600: 00000010 .word 0x00000010
...
0000060c <timer_stop>:
60c: e52db004 push {fp} ; (str fp, [sp, #-4]!)
610: e28db000 add fp, sp, #0
614: e24dd014 sub sp, sp, #20
618: e50b0010 str r0, [fp, #-16]
61c: e51b3010 ldr r3, [fp, #-16]
620: e1a03183 lsl r3, r3, #3
624: e59f2028 ldr r2, [pc, #40] ; 654 <timer_stop+0x48>
628: e0833002 add r3, r3, r2
62c: e50b3008 str r3, [fp, #-8]
630: e51b3008 ldr r3, [fp, #-8]
634: e5933000 ldr r3, [r3]
638: e2833008 add r3, r3, #8
63c: e3a0207f mov r2, #127 ; 0x7f
640: e5832000 str r2, [r3]
644: e1a00000 nop ; (mov r0, r0)
648: e28bd000 add sp, fp, #0
64c: e49db004 pop {fp} ; (ldr fp, [sp], #4)
650: e12fff1e bx lr
654: 00000000 .word 0x00000000
00000658 <undef_handler>:
658: e92d4800 push {fp, lr}
65c: e28db004 add fp, sp, #4
660: e59f1014 ldr r1, [pc, #20] ; 67c <undef_handler+0x24>
664: e59f0014 ldr r0, [pc, #20] ; 680 <undef_handler+0x28>
668: ebfffffe bl 1c0 <uprints>
66c: e1a00000 nop ; (mov r0, r0)
670: e24bd004 sub sp, fp, #4
674: e8bd4800 pop {fp, lr}
678: e12fff1e bx lr
67c: 00000020 .word 0x00000020
680: 00000000 .word 0x00000000
00000684 <swi_handler>:
684: e92d4800 push {fp, lr}
688: e28db004 add fp, sp, #4
68c: e59f1014 ldr r1, [pc, #20] ; 6a8 <swi_handler+0x24>
690: e59f0014 ldr r0, [pc, #20] ; 6ac <swi_handler+0x28>
694: ebfffffe bl 1c0 <uprints>
698: e1a00000 nop ; (mov r0, r0)
69c: e24bd004 sub sp, fp, #4
6a0: e8bd4800 pop {fp, lr}
6a4: e12fff1e bx lr
6a8: 00000020 .word 0x00000020
6ac: 00000000 .word 0x00000000
000006b0 <prefetch_abort_handler>:
6b0: e92d4800 push {fp, lr}
6b4: e28db004 add fp, sp, #4
6b8: e59f1014 ldr r1, [pc, #20] ; 6d4 <prefetch_abort_handler+0x24>
6bc: e59f0014 ldr r0, [pc, #20] ; 6d8 <prefetch_abort_handler+0x28>
6c0: ebfffffe bl 1c0 <uprints>
6c4: e1a00000 nop ; (mov r0, r0)
6c8: e24bd004 sub sp, fp, #4
6cc: e8bd4800 pop {fp, lr}
6d0: e12fff1e bx lr
6d4: 00000020 .word 0x00000020
6d8: 00000000 .word 0x00000000
000006dc <data_abort_handler>:
6dc: e92d4800 push {fp, lr}
6e0: e28db004 add fp, sp, #4
6e4: e59f1014 ldr r1, [pc, #20] ; 700 <data_abort_handler+0x24>
6e8: e59f0014 ldr r0, [pc, #20] ; 704 <data_abort_handler+0x28>
6ec: ebfffffe bl 1c0 <uprints>
6f0: e1a00000 nop ; (mov r0, r0)
6f4: e24bd004 sub sp, fp, #4
6f8: e8bd4800 pop {fp, lr}
6fc: e12fff1e bx lr
700: 00000020 .word 0x00000020
704: 00000000 .word 0x00000000
00000708 <fiq_handler>:
708: e92d4800 push {fp, lr}
70c: e28db004 add fp, sp, #4
710: e59f1014 ldr r1, [pc, #20] ; 72c <fiq_handler+0x24>
714: e59f0014 ldr r0, [pc, #20] ; 730 <fiq_handler+0x28>
718: ebfffffe bl 1c0 <uprints>
71c: e1a00000 nop ; (mov r0, r0)
720: e24bd004 sub sp, fp, #4
724: e8bd4800 pop {fp, lr}
728: e12fff1e bx lr
72c: 00000020 .word 0x00000020
730: 00000000 .word 0x00000000
00000734 <copy_vector>:
734: e52db004 push {fp} ; (str fp, [sp, #-4]!)
738: e28db000 add fp, sp, #0
73c: e24dd00c sub sp, sp, #12
740: e59f3050 ldr r3, [pc, #80] ; 798 <copy_vector+0x64>
744: e50b3008 str r3, [fp, #-8]
748: e3a03000 mov r3, #0
74c: e50b300c str r3, [fp, #-12]
750: ea000007 b 774 <copy_vector+0x40>
754: e51b2008 ldr r2, [fp, #-8]
758: e2823004 add r3, r2, #4
75c: e50b3008 str r3, [fp, #-8]
760: e51b300c ldr r3, [fp, #-12]
764: e2831004 add r1, r3, #4
768: e50b100c str r1, [fp, #-12]
76c: e5922000 ldr r2, [r2]
770: e5832000 str r2, [r3]
774: e51b3008 ldr r3, [fp, #-8]
778: e59f201c ldr r2, [pc, #28] ; 79c <copy_vector+0x68>
77c: e1530002 cmp r3, r2
780: 3afffff3 bcc 754 <copy_vector+0x20>
784: e1a00000 nop ; (mov r0, r0)
788: e1a00000 nop ; (mov r0, r0)
78c: e28bd000 add sp, fp, #0
790: e49db004 pop {fp} ; (ldr fp, [sp], #4)
794: e12fff1e bx lr
...
000007a0 <kbd_init>:
7a0: e52db004 push {fp} ; (str fp, [sp, #-4]!)
7a4: e28db000 add fp, sp, #0
7a8: e24dd00c sub sp, sp, #12
7ac: e59f302c ldr r3, [pc, #44] ; 7e0 <kbd_init+0x40>
7b0: e50b3008 str r3, [fp, #-8]
7b4: e51b3008 ldr r3, [fp, #-8]
7b8: e3a02014 mov r2, #20
7bc: e5c32000 strb r2, [r3]
7c0: e51b3008 ldr r3, [fp, #-8]
7c4: e283300c add r3, r3, #12
7c8: e3a02008 mov r2, #8
7cc: e5c32000 strb r2, [r3]
7d0: e1a00000 nop ; (mov r0, r0)
7d4: e28bd000 add sp, fp, #0
7d8: e49db004 pop {fp} ; (ldr fp, [sp], #4)
7dc: e12fff1e bx lr
7e0: 10006000 .word 0x10006000
000007e4 <IRQ_handler>:
7e4: e92d4800 push {fp, lr}
7e8: e28db004 add fp, sp, #4
7ec: e24dd008 sub sp, sp, #8
7f0: e3a03000 mov r3, #0
7f4: e50b3008 str r3, [fp, #-8]
7f8: e59f10a4 ldr r1, [pc, #164] ; 8a4 <IRQ_handler+0xc0>
7fc: e59f00a4 ldr r0, [pc, #164] ; 8a8 <IRQ_handler+0xc4>
800: ebfffffe bl 1c0 <uprints>
804: e59f30a0 ldr r3, [pc, #160] ; 8ac <IRQ_handler+0xc8>
808: e5933000 ldr r3, [r3]
80c: e5933000 ldr r3, [r3]
810: e2833004 add r3, r3, #4
814: e5933000 ldr r3, [r3]
818: e3530000 cmp r3, #0
81c: 1a000001 bne 828 <IRQ_handler+0x44>
820: e3a00000 mov r0, #0
824: ebfffffe bl 53c <timer_handler>
828: e59f307c ldr r3, [pc, #124] ; 8ac <IRQ_handler+0xc8>
82c: e5933004 ldr r3, [r3, #4]
830: e5933000 ldr r3, [r3]
834: e2833004 add r3, r3, #4
838: e5933000 ldr r3, [r3]
83c: e3530000 cmp r3, #0
840: 1a000001 bne 84c <IRQ_handler+0x68>
844: e3a00001 mov r0, #1
848: ebfffffe bl 53c <timer_handler>
84c: e59f3058 ldr r3, [pc, #88] ; 8ac <IRQ_handler+0xc8>
850: e5933008 ldr r3, [r3, #8]
854: e5933000 ldr r3, [r3]
858: e2833004 add r3, r3, #4
85c: e5933000 ldr r3, [r3]
860: e3530000 cmp r3, #0
864: 1a000001 bne 870 <IRQ_handler+0x8c>
868: e3a00002 mov r0, #2
86c: ebfffffe bl 53c <timer_handler>
870: e59f3034 ldr r3, [pc, #52] ; 8ac <IRQ_handler+0xc8>
874: e593300c ldr r3, [r3, #12]
878: e5933000 ldr r3, [r3]
87c: e2833004 add r3, r3, #4
880: e5933000 ldr r3, [r3]
884: e3530000 cmp r3, #0
888: 1a000001 bne 894 <IRQ_handler+0xb0>
88c: e3a00003 mov r0, #3
890: ebfffffe bl 53c <timer_handler>
894: e1a00000 nop ; (mov r0, r0)
898: e24bd004 sub sp, fp, #4
89c: e8bd4800 pop {fp, lr}
8a0: e12fff1e bx lr
8a4: 00000030 .word 0x00000030
...
000008b0 <main>:
8b0: e92d4800 push {fp, lr}
8b4: e28db004 add fp, sp, #4
8b8: e24dd010 sub sp, sp, #16
8bc: ebfffffe bl 218 <fbuf_init>
8c0: ebfffffe bl 0 <uart_init>
8c4: ebfffffe bl 380 <timer_init>
8c8: ebfffffe bl 7a0 <kbd_init>
8cc: e3a03000 mov r3, #0
8d0: e50b3008 str r3, [fp, #-8]
8d4: ea00000b b 908 <main+0x58>
8d8: e51b3008 ldr r3, [fp, #-8]
8dc: e1a03183 lsl r3, r3, #3
8e0: e59f20dc ldr r2, [pc, #220] ; 9c4 <main+0x114>
8e4: e0832002 add r2, r3, r2
8e8: e59f10d8 ldr r1, [pc, #216] ; 9c8 <main+0x118>
8ec: e51b3008 ldr r3, [fp, #-8]
8f0: e7812103 str r2, [r1, r3, lsl #2]
8f4: e51b0008 ldr r0, [fp, #-8]
8f8: ebfffffe bl 5a4 <timer_start>
8fc: e51b3008 ldr r3, [fp, #-8]
900: e2833001 add r3, r3, #1
904: e50b3008 str r3, [fp, #-8]
908: e51b3008 ldr r3, [fp, #-8]
90c: e3530003 cmp r3, #3
910: dafffff0 ble 8d8 <main+0x28>
914: e59f10b0 ldr r1, [pc, #176] ; 9cc <main+0x11c>
918: e59f00b0 ldr r0, [pc, #176] ; 9d0 <main+0x120>
91c: ebfffffe bl 1c0 <uprints>
920: e59f309c ldr r3, [pc, #156] ; 9c4 <main+0x114>
924: e5931004 ldr r1, [r3, #4]
928: e59f30a4 ldr r3, [pc, #164] ; 9d4 <main+0x124>
92c: e0c32391 smull r2, r3, r1, r3
930: e1a021c3 asr r2, r3, #3
934: e1a03fc1 asr r3, r1, #31
938: e0422003 sub r2, r2, r3
93c: e1a03002 mov r3, r2
940: e1a03183 lsl r3, r3, #3
944: e0433002 sub r3, r3, r2
948: e1a03183 lsl r3, r3, #3
94c: e0433002 sub r3, r3, r2
950: e1a03103 lsl r3, r3, #2
954: e0412003 sub r2, r1, r3
958: e1a03002 mov r3, r2
95c: e54b3009 strb r3, [fp, #-9]
960: e55b3009 ldrb r3, [fp, #-9]
964: e2833037 add r3, r3, #55 ; 0x37
968: e20330ff and r3, r3, #255 ; 0xff
96c: e1a01003 mov r1, r3
970: e59f0058 ldr r0, [pc, #88] ; 9d0 <main+0x120>
974: ebfffffe bl e4 <uputc>
978: e55b3009 ldrb r3, [fp, #-9]
97c: e383386e orr r3, r3, #7208960 ; 0x6e0000
980: e3833001 orr r3, r3, #1
984: e50b3010 str r3, [fp, #-16]
988: e51b3010 ldr r3, [fp, #-16]
98c: e3a02064 mov r2, #100 ; 0x64
990: e3a0100a mov r1, #10
994: e3a0000a mov r0, #10
998: ebfffffe bl 2b8 <draw_box>
99c: e59f3020 ldr r3, [pc, #32] ; 9c4 <main+0x114>
9a0: e5933004 ldr r3, [r3, #4]
9a4: e3530000 cmp r3, #0
9a8: 0affffdc beq 920 <main+0x70>
9ac: e3a03b32 mov r3, #51200 ; 0xc800
9b0: e3a02032 mov r2, #50 ; 0x32
9b4: e3a010fa mov r1, #250 ; 0xfa
9b8: e3a000c8 mov r0, #200 ; 0xc8
9bc: ebfffffe bl 2b8 <draw_box>
9c0: eaffffd6 b 920 <main+0x70>
...
9cc: 0000003c .word 0x0000003c
9d0: 00000000 .word 0x00000000
9d4: 094f2095 .word 0x094f2095
also my linker script:
ENTRY(start)
SECTIONS{
. = 0x10000;
.text : {*(.text)}
.data : {*(.data)}
.bss : {*(.bss)}
. =ALIGN(8);
. =. + 0x1000;
stack_top = .;
. = . + 0x1000;
irq_stack_top = .;
}
So I do not have a complete example but something looks wrong, really wrong that you will need to sort out. And maybe you are already through all of these basic steps to get to this point.
qemu-system-arm -M versatilepb -m 128M -kernel notmain.bin
Implies a binary memory image file as in objcopy -O binary is being used. And there is no addressing information, so some default built into that machine or simulation is being used. What is that address....
strap.s
mov sp,#0x10000
bl notmain
b .
.globl GETPC
GETPC:
mov r0,pc
bx lr
.globl PUT32
PUT32:
str r1,[r0]
bx lr
notmain.c
void PUT32 ( unsigned int, unsigned int );
unsigned int GETPC ( void );
#define UART_BASE 0x101F1000
#define UARTDR (UART_BASE+0x000)
static void uart_send ( unsigned int x )
{
PUT32(UARTDR,x);
}
static void hexstrings ( unsigned int d )
{
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_send(rc);
if(rb==0) break;
}
uart_send(0x20);
}
static void hexstring ( unsigned int d )
{
hexstrings(d);
uart_send(0x0D);
uart_send(0x0A);
}
int notmain ( void )
{
hexstring(GETPC());
return(0);
}
uart address culled from the source code to qemu. so far I find you can cheat and just write to the tx buffer, it does not actually emulate the time it takes to send out a character and overflow a fifo, etc. So I cheat.
/* memmap */
MEMORY
{
ram : ORIGIN = 0x00000000, LENGTH = 32K
}
SECTIONS
{
.text : { *(.text*) } > ram
.bss : { *(.text*) } > ram
}
build
arm-none-eabi-as --warn --fatal-warnings -march=armv4t strap.s -o strap.o
arm-none-eabi-gcc -c -Wall -O2 -ffreestanding -march=armv4t notmain.c -o notmain.o
arm-none-eabi-ld -nostdlib -nostartfiles strap.o notmain.o -T memmap -o notmain.elf
arm-none-eabi-objdump -D notmain.elf > notmain.list
arm-none-eabi-objcopy notmain.elf -O binary notmain.bin
examine output
Disassembly of section .text:
00000000 <GETPC-0xc>:
0: e3a0d801 mov sp, #65536 ; 0x10000
4: eb000004 bl 1c <notmain>
8: eafffffe b 8 <GETPC-0x4>
0000000c <GETPC>:
c: e1a0000f mov r0, pc
10: e12fff1e bx lr
00000014 <PUT32>:
14: e5801000 str r1, [r0]
18: e12fff1e bx lr
0000001c <notmain>:
1c: e92d4070 push {r4, r5, r6, lr}
20: ebfffff9 bl c <GETPC>
24: e3a04020 mov r4, #32
my entry point is where I want it. I am not loading an elf I am loading a binary image. If I wanted the entry point to be different and load an elf it may be possible that using ENTRY() in the linker script then having the entry point wherever might work.
qemu-system-arm -M versatilepb -m 128M -kernel notmain.bin
then view the serial0 output of 00010014 so they are loading us at 0x10000 so change memmap
ram : ORIGIN = 0x00010000, LENGTH = 32K
Now your linker script implies it should have put your code at 0x10000 but
clearly it has not based on your disassembly, so how are you running anything at all. Also your entry point using a binary image is not the bootstrap it is not the first instructions you want to be executing. maybe you disassembled an object not the final binary?
So for fun before changing the memory map to 0x10000 if I instead
qemu-system-arm -M versatilepb -m 128M -kernel notmain.elf
Then I get 00000014 on the output. it used the address defined in the elf file.
If I add this
b .
.globl _start
_start:
mov sp,#0x10000
bl notmain
b .
and this
/* memmap */
ENTRY(_start)
MEMORY
{
ram : ORIGIN = 0x00000000, LENGTH = 32K
}
SECTIONS
{
.text : { *(.text*) } > ram
.bss : { *(.text*) } > ram
}
and execute the elf I do get output of 00000018 so it is entering at the entry point not the first word of the loadable image.
run the .bin and you get no serial output as expected.
Change memmap to 0x10000 and run the elf you get 0x10018 as expected.
So that was a simple way to see how qemu loads our program.
so same build and linker script (with the 0x10000)
.globl _start
_start:
ldr pc,reset_handler
ldr pc,undefined_handler
ldr pc,swi_handler
ldr pc,prefetch_handler
ldr pc,data_handler
ldr pc,unused_handler
ldr pc,irq_handler
ldr pc,fiq_handler
reset_handler: .word reset
undefined_handler: .word hang
swi_handler: .word swi
prefetch_handler: .word hang
data_handler: .word hang
unused_handler: .word hang
irq_handler: .word hang
fiq_handler: .word hang
reset:
mov r0,#0x10000
mov r1,#0x00000
ldmia r0!,{r2,r3,r4,r5,r6,r7,r8,r9}
stmia r1!,{r2,r3,r4,r5,r6,r7,r8,r9}
ldmia r0!,{r2,r3,r4,r5,r6,r7,r8,r9}
stmia r1!,{r2,r3,r4,r5,r6,r7,r8,r9}
;# FIQ 110 10001
msr cpsr_c,#0xD1
mov sp,#0x1000
;# IRQ 110 10010
msr cpsr_c,#0xD2
mov sp,#0x2000
;# SVC 110 10011
msr cpsr_c,#0xD3
mov sp,#0x3000
;# ABT 110 10111
msr cpsr_c,#0xD7
mov sp,#0x4000
;# UND 110 11011
msr cpsr_c,#0xDB
mov sp,#0x5000
;# SYS 110 11111
msr cpsr_c,#0xDF
mov sp,#0x6000
;# SVC 110 10011
msr cpsr_c,#0xD3
bl notmain
hang:
b hang
swi:
push {r4-r12,lr}
bl swi_main
pop {r4-r12,lr}
movs pc,lr
.globl DO_SWI
DO_SWI:
push {lr}
svc 0
pop {lr}
bx lr
.globl GO_USER
GO_USER:
;# USER 110 10000
msr cpsr_c,#0xD0
bl gouser
b .
gouser:
bx r0
.globl GETMODE
GETMODE:
mrs r0,cpsr
bx lr
.globl GETPC
GETPC:
mov r0,pc
bx lr
.globl GETSP
GETSP:
mov r0,sp
bx lr
.globl PUT32
PUT32:
str r1,[r0]
bx lr
.globl GET32
GET32:
ldr r0,[r0]
bx lr
and
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
unsigned int GETPC ( void );
unsigned int GETSP ( void );
unsigned int GETMODE ( void );
unsigned int GO_USER ( unsigned int );
unsigned int DO_SWI ( unsigned int, unsigned int, unsigned int, unsigned int );
#define UART_BASE 0x101F1000
#define UARTDR (UART_BASE+0x000)
#define UARTFR (UART_BASE+0x018)
static void uart_send ( unsigned int x )
{
while(1)
{
if(GET32(UARTFR)&(1<<7)) break;
}
PUT32(UARTDR,x);
}
static void hexstrings ( unsigned int d )
{
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_send(rc);
if(rb==0) break;
}
uart_send(0x20);
}
static void hexstring ( unsigned int d )
{
hexstrings(d);
uart_send(0x0D);
uart_send(0x0A);
}
unsigned int swi_main ( unsigned int a, unsigned int b, unsigned int c, unsigned int d )
{
hexstring(0xAABBCCDD);
switch(a)
{
case 0:
{
return(~b);
}
case 1:
{
return(GETMODE());
}
}
return(0);
}
void user_main ( void )
{
hexstring(GETMODE());
hexstring(GETSP());
hexstring(DO_SWI(0,0x1234,0,0));
hexstring(DO_SWI(1,0x1234,0,0));
}
int notmain ( void )
{
hexstring(GETMODE());
hexstring(GETSP());
GO_USER((unsigned int)user_main);
return(0);
}
get even a trivial swi in place first before attempting an interrupt. getting the exception table in place, the banked stack registers, etc. seems trivial but even with experience those things bite you and you end up chasing the wrong thing.
400001D3
00002FF8
200001D0
00005FF8
AABBCCDD
FFFFEDCB
AABBCCDD
600001D3
this was a bit overkill with mode changes and such, but it still has the swi handler in the right place. The code is linked for
notmain.elf: file format elf32-littlearm
Disassembly of section .text:
00010000 <_start>:
10000: e59ff018 ldr pc, [pc, #24] ; 10020 <reset_handler>
10004: e59ff018 ldr pc, [pc, #24] ; 10024 <undefined_handler>
10008: e59ff018 ldr pc, [pc, #24] ; 10028 <swi_handler>
1000c: e59ff018 ldr pc, [pc, #24] ; 1002c <prefetch_handler>
10010: e59ff018 ldr pc, [pc, #24] ; 10030 <data_handler>
10014: e59ff018 ldr pc, [pc, #24] ; 10034 <unused_handler>
10018: e59ff018 ldr pc, [pc, #24] ; 10038 <irq_handler>
1001c: e59ff018 ldr pc, [pc, #24] ; 1003c <fiq_handler>
00010020 <reset_handler>:
10020: 00010040 andeq r0, r1, r0, asr #32
00010024 <undefined_handler>:
10024: 00010090 muleq r1, r0, r0
packing the exception table with its pc-relative addresses and letting the toolchain do all the work of sorting out addresses for you, you can then simply copy it to 0x00000000 from 0x10000. It is probably fine but in your case I would at least put some level of alignment before your table, something makes me want to think it wants to be aligned. However you wish to copy it is your deal, I would not use C in a bootstrap, the habit creates problems from time to time that are easily avoided.
Knowing what we know of course if we linked for address 0x00000000 and used the elf file then we do not have to copy/move the exception table. It lands in the right place on load.
Newer cores have a VTOR register and you can simply choose a different exception or vector table address, but it does need to have a level of alignment that you cannot forget.
I did not go and research this timer, but I assume long before you attempted an interrupt you wrote very many throwaway programs that used polling to fully understand the timer and how to clear its interrupt, etc? Depending on the chip (not necessarily core) then there may also be arm side registers you can poll, disable interrupts into the core, poll the gvic or vpic or whatever it is called and see the interrupt hit the edge of the core, be able to turn it off, etc. Then.....enable it into the core now that you have mastered that peripheral. Sometimes leaving nothing left to work on, for some cores though there is some interrupt stuff that can only be done in an isr (how to detect and clear the interrupt source) this virtual machine predates that by a long time though, so no worries there.
so...
If that is your linker script then why is your output zero based, and why is the uart function up front and not the bootstrap entry point? If that is the disassembly of the final binary then there are all kinds of problems going on there.
I was seeing this as baremetal but forgetting qemu is loading linux-like so at some address like 0x8000, 0x10000, or 0x80000 are some examples, so yes you either need a core with a VTOR or copy the exception table over, and letting the tools build the addresses and pc-relative loads is the way to go. That copy is probably fine if you are actually booting right and executing it.
Then have you completely mastered that peripheral?
This is an emulated core so it may resemble one/some of the products from arm but remember they are not all identical some come with interrupt controllers some do not, some folks like the raspi folks disable that interrupt controller, etc. The base address of add on peripherals for real arm cores is based on the chip vendor not arm. Etc....Best to dig through the source code for the specific machine emulation and see what it is doing to truly get a qemu machine to work.
So maybe you have done all of these things and it still does not work. The question as written has some issues, the disassembly does not make sense for example. Need to resolve that. What if you try with an elf file first and an entry point then do the extra work to be able to run without the elf file and the bin file instead. Do not re-write but add more content to the question if you need further help. Unless someone else sees the issue with what has been presented.
Related
I have been writing a kernel for the Raspberry Pi 2 using C. To do so I have been following the Valvers and Baking Pi (written in Assembly) tutorials to do so. But each time I try to port the function to set a pin to output from the Baking Pi OK03 tutorial to C the led stops blinking (but the code compiles just fine). I have rewritten the function several times but I cannot get it to work.
Here is my code:
main.c:
#include "gpio.h"
int main(void) __attribute__((naked));
int main(void)
{
gpio = (unsigned int*)GPIO_BASE;
/* Write 1 to the GPIO16 init nibble in the Function Select 1 GPIO
peripheral register to enable GPIO16 as an output */
// gpio[LED_GPFSEL] |= (1 << LED_GPFBIT);
pinMode(47, 1);
// Never return from here
while(1)
{
for(tim = 0; tim < 500000; tim++)
;
/* Set the LED GPIO pin low ( Turn OK LED on for original Pi, and off
for plus models )*/
gpio[LED_GPCLR] = (1 << LED_GPIO_BIT);
for(tim = 0; tim < 500000; tim++)
;
/* Set the LED GPIO pin high ( Turn OK LED off for original Pi, and on
for plus models )*/
gpio[LED_GPSET] = (1 << LED_GPIO_BIT);
}
}
gpio.h:
#ifndef GPIO_H
#define GPIO_H
#ifdef RPI2
#define GPIO_BASE 0x3F200000UL
#else
#define GPIO_BASE 0x20200000UL
#endif
#if defined( RPIBPLUS ) || defined( RPI2 )
#define LED_GPFSEL GPIO_GPFSEL4
#define LED_GPFBIT 21
#define LED_GPSET GPIO_GPSET1
#define LED_GPCLR GPIO_GPCLR1
#define LED_GPIO_BIT 15
#else
#define LED_GPFSEL GPIO_GPFSEL1
#define LED_GPFBIT 18
#define LED_GPSET GPIO_GPSET0
#define LED_GPCLR GPIO_GPCLR0
#define LED_GPIO_BIT 16
#endif
#define GPIO_GPFSEL0 0
#define GPIO_GPFSEL1 1
#define GPIO_GPFSEL2 2
#define GPIO_GPFSEL3 3
#define GPIO_GPFSEL4 4
#define GPIO_GPFSEL5 5
#define GPIO_GPSET0 7
#define GPIO_GPSET1 8
#define GPIO_GPCLR0 10
#define GPIO_GPCLR1 11
#define GPIO_GPLEV0 13
#define GPIO_GPLEV1 14
#define GPIO_GPEDS0 16
#define GPIO_GPEDS1 17
#define GPIO_GPREN0 19
#define GPIO_GPREN1 20
#define GPIO_GPFEN0 22
#define GPIO_GPFEN1 23
#define GPIO_GPHEN0 25
#define GPIO_GPHEN1 26
#define GPIO_GPLEN0 28
#define GPIO_GPLEN1 29
#define GPIO_GPAREN0 31
#define GPIO_GPAREN1 32
#define GPIO_GPAFEN0 34
#define GPIO_GPAFEN1 35
#define GPIO_GPPUD 37
#define GPIO_GPPUDCLK0 38
#define GPIO_GPPUDCLK1 39
/** GPIO Register set */
volatile unsigned int* gpio;
/** Simple loop variable */
volatile unsigned int tim;
// Function to change a pin's mode
int pinMode(int pinnum, int mode);
#endif
gpio.c:
#include "gpio.h"
/* Docs: The Pi has 54 GPIO pins and 6 Function Selec Registers (FSR). Each FSR
controls 10 GPIO pins and each FSR is made of 33 bits (each pin is controlled by
3 bits of the FSR). To know which pins of the FSR control each GPIO pin the formula
3n is used (where n is the pin number). In order for this to work, the pin number must be minor or equal to 9. Therefore if the pin is higher than 9, 10 units are subtracted from the pin number and one unit is added to the fsr variable. This loops until the pin number is lower or equal to 9. */
int pinMode(int pinnum, int mode) {
// Variable declaration and initialization
int fsr = 0;
int fsrbit;
// Let's check the pin does exist
if (pinnum < 0 || pinnum > 53) {
// Abort, there is no such pin.
return 1;
}
else if (mode < 0 || mode > 1) {
// Abort, invalid mode (Actually there are 7 modes but we will only use 2)
return 1;
}
// Create a pointer to the GPIO perhiperal register so we can speak to it
gpio = (unsigned int*)GPIO_BASE;
// And calculate wich FSR we should use
/* do {
if (pinnum > 9) {
pinnum -= 10;
fsr++;
}
} while (pinnum > 9); */
if (pinnum > 9) {
while (pinnum > 9) {
pinnum -= 10;
fsr++;
}
}
// Then we calculate the bytes of the fsreg to use
fsrbit = pinnum * 3;
// Finally let's set the pin to the desired mode
gpio[fsr] |= (mode << fsrbit);
return 0;
}
Please help, I have been stuck with this problem for weeks.
PS: In case you want the kernel disassembly:
kernel_disassembly.asm:
./kernel.elf: file format elf32-littlearm
Disassembly of section .text:
00008000 <main>:
8000: e3082200 movw r2, #33280 ; 0x8200
8004: e3402001 movt r2, #1
8008: e3a03000 mov r3, #0
800c: e3433f20 movt r3, #16160 ; 0x3f20
8010: e5823000 str r3, [r2]
8014: e3a01001 mov r1, #1
8018: e3a0002f mov r0, #47 ; 0x2f
801c: eb000032 bl 80ec <pinMode>
8020: e30831fc movw r3, #33276 ; 0x81fc
8024: e3403001 movt r3, #1
8028: e3a02000 mov r2, #0
802c: e5832000 str r2, [r3]
8030: ea000006 b 8050 <main+0x50>
8034: e30831fc movw r3, #33276 ; 0x81fc
8038: e3403001 movt r3, #1
803c: e5933000 ldr r3, [r3]
8040: e2832001 add r2, r3, #1
8044: e30831fc movw r3, #33276 ; 0x81fc
8048: e3403001 movt r3, #1
804c: e5832000 str r2, [r3]
8050: e30831fc movw r3, #33276 ; 0x81fc
8054: e3403001 movt r3, #1
8058: e5932000 ldr r2, [r3]
805c: e30a311f movw r3, #41247 ; 0xa11f
8060: e3403007 movt r3, #7
8064: e1520003 cmp r2, r3
8068: 9afffff1 bls 8034 <main+0x34>
806c: e3083200 movw r3, #33280 ; 0x8200
8070: e3403001 movt r3, #1
8074: e5933000 ldr r3, [r3]
8078: e283302c add r3, r3, #44 ; 0x2c
807c: e3a02902 mov r2, #32768 ; 0x8000
8080: e5832000 str r2, [r3]
8084: e30831fc movw r3, #33276 ; 0x81fc
8088: e3403001 movt r3, #1
808c: e3a02000 mov r2, #0
8090: e5832000 str r2, [r3]
8094: ea000006 b 80b4 <main+0xb4>
8098: e30831fc movw r3, #33276 ; 0x81fc
809c: e3403001 movt r3, #1
80a0: e5933000 ldr r3, [r3]
80a4: e2832001 add r2, r3, #1
80a8: e30831fc movw r3, #33276 ; 0x81fc
80ac: e3403001 movt r3, #1
80b0: e5832000 str r2, [r3]
80b4: e30831fc movw r3, #33276 ; 0x81fc
80b8: e3403001 movt r3, #1
80bc: e5932000 ldr r2, [r3]
80c0: e30a311f movw r3, #41247 ; 0xa11f
80c4: e3403007 movt r3, #7
80c8: e1520003 cmp r2, r3
80cc: 9afffff1 bls 8098 <main+0x98>
80d0: e3083200 movw r3, #33280 ; 0x8200
80d4: e3403001 movt r3, #1
80d8: e5933000 ldr r3, [r3]
80dc: e2833020 add r3, r3, #32
80e0: e3a02902 mov r2, #32768 ; 0x8000
80e4: e5832000 str r2, [r3]
80e8: eaffffcc b 8020 <main+0x20>
000080ec <pinMode>:
80ec: e52db004 push {fp} ; (str fp, [sp, #-4]!)
80f0: e28db000 add fp, sp, #0
80f4: e24dd014 sub sp, sp, #20
80f8: e50b0010 str r0, [fp, #-16]
80fc: e50b1014 str r1, [fp, #-20] ; 0xffffffec
8100: e3a03000 mov r3, #0
8104: e50b3008 str r3, [fp, #-8]
8108: e51b3010 ldr r3, [fp, #-16]
810c: e3530000 cmp r3, #0
8110: ba000002 blt 8120 <pinMode+0x34>
8114: e51b3010 ldr r3, [fp, #-16]
8118: e3530035 cmp r3, #53 ; 0x35
811c: da000001 ble 8128 <pinMode+0x3c>
8120: e3a03001 mov r3, #1
8124: ea000030 b 81ec <pinMode+0x100>
8128: e51b3014 ldr r3, [fp, #-20] ; 0xffffffec
812c: e3530000 cmp r3, #0
8130: ba000002 blt 8140 <pinMode+0x54>
8134: e51b3014 ldr r3, [fp, #-20] ; 0xffffffec
8138: e3530001 cmp r3, #1
813c: da000001 ble 8148 <pinMode+0x5c>
8140: e3a03001 mov r3, #1
8144: ea000028 b 81ec <pinMode+0x100>
8148: e3082200 movw r2, #33280 ; 0x8200
814c: e3402001 movt r2, #1
8150: e3a03000 mov r3, #0
8154: e3433f20 movt r3, #16160 ; 0x3f20
8158: e5823000 str r3, [r2]
815c: e51b3010 ldr r3, [fp, #-16]
8160: e3530009 cmp r3, #9
8164: da000009 ble 8190 <pinMode+0xa4>
8168: ea000005 b 8184 <pinMode+0x98>
816c: e51b3010 ldr r3, [fp, #-16]
8170: e243300a sub r3, r3, #10
8174: e50b3010 str r3, [fp, #-16]
8178: e51b3008 ldr r3, [fp, #-8]
817c: e2833001 add r3, r3, #1
8180: e50b3008 str r3, [fp, #-8]
8184: e51b3010 ldr r3, [fp, #-16]
8188: e3530009 cmp r3, #9
818c: cafffff6 bgt 816c <pinMode+0x80>
8190: e51b3010 ldr r3, [fp, #-16]
8194: e3a02003 mov r2, #3
8198: e0030392 mul r3, r2, r3
819c: e50b300c str r3, [fp, #-12]
81a0: e3083200 movw r3, #33280 ; 0x8200
81a4: e3403001 movt r3, #1
81a8: e5932000 ldr r2, [r3]
81ac: e51b3008 ldr r3, [fp, #-8]
81b0: e1a03103 lsl r3, r3, #2
81b4: e0822003 add r2, r2, r3
81b8: e3083200 movw r3, #33280 ; 0x8200
81bc: e3403001 movt r3, #1
81c0: e5931000 ldr r1, [r3]
81c4: e51b3008 ldr r3, [fp, #-8]
81c8: e1a03103 lsl r3, r3, #2
81cc: e0813003 add r3, r1, r3
81d0: e5933000 ldr r3, [r3]
81d4: e51b0014 ldr r0, [fp, #-20] ; 0xffffffec
81d8: e51b100c ldr r1, [fp, #-12]
81dc: e1a01110 lsl r1, r0, r1
81e0: e1833001 orr r3, r3, r1
81e4: e5823000 str r3, [r2]
81e8: e3a03000 mov r3, #0
81ec: e1a00003 mov r0, r3
81f0: e24bd000 sub sp, fp, #0
81f4: e49db004 pop {fp} ; (ldr fp, [sp], #4)
81f8: e12fff1e bx lr
Disassembly of section .bss:
000181fc <__bss_start>:
181fc: 00000000 andeq r0, r0, r0
00018200 <gpio>:
18200: 00000000 andeq r0, r0, r0
Disassembly of section .comment:
00000000 <.comment>:
0: 3a434347 bcc 10d0d24 <_stack+0x1050d24>
4: 4e472820 cdpmi 8, 4, cr2, cr7, cr0, {1}
8: 6f542055 svcvs 0x00542055
c: 20736c6f rsbscs r6, r3, pc, ror #24
10: 20726f66 rsbscs r6, r2, r6, ror #30
14: 204d5241 subcs r5, sp, r1, asr #4
18: 65626d45 strbvs r6, [r2, #-3397]! ; 0xfffff2bb
1c: 64656464 strbtvs r6, [r5], #-1124 ; 0xfffffb9c
20: 6f725020 svcvs 0x00725020
24: 73736563 cmnvc r3, #415236096 ; 0x18c00000
28: 2973726f ldmdbcs r3!, {r0, r1, r2, r3, r5, r6, r9, ip, sp, lr}^
2c: 342e3520 strtcc r3, [lr], #-1312 ; 0xfffffae0
30: 3220312e eorcc r3, r0, #-2147483637 ; 0x8000000b
34: 30363130 eorscc r3, r6, r0, lsr r1
38: 20393139 eorscs r3, r9, r9, lsr r1
3c: 6c657228 sfmvs f7, 2, [r5], #-160 ; 0xffffff60
40: 65736165 ldrbvs r6, [r3, #-357]! ; 0xfffffe9b
44: 415b2029 cmpmi fp, r9, lsr #32
48: 652f4d52 strvs r4, [pc, #-3410]! ; fffff2fe <_stack+0xfff7f2fe>
4c: 6465626d strbtvs r6, [r5], #-621 ; 0xfffffd93
50: 2d646564 cfstr64cs mvdx6, [r4, #-400]! ; 0xfffffe70
54: 72622d35 rsbvc r2, r2, #3392 ; 0xd40
58: 68636e61 stmdavs r3!, {r0, r5, r6, r9, sl, fp, sp, lr}^
5c: 76657220 strbtvc r7, [r5], -r0, lsr #4
60: 6f697369 svcvs 0x00697369
64: 3432206e ldrtcc r2, [r2], #-110 ; 0xffffff92
68: 36393430 ; <UNDEFINED> instruction: 0x36393430
6c: Address 0x000000000000006c is out of bounds.
Disassembly of section .debug_aranges:
00000000 <.debug_aranges>:
0: 0000001c andeq r0, r0, ip, lsl r0
4: 00000002 andeq r0, r0, r2
8: 00040000 andeq r0, r4, r0
c: 00000000 andeq r0, r0, r0
10: 00008000 andeq r8, r0, r0
14: 000000ec andeq r0, r0, ip, ror #1
...
20: 0000001c andeq r0, r0, ip, lsl r0
24: 00760002 rsbseq r0, r6, r2
28: 00040000 andeq r0, r4, r0
2c: 00000000 andeq r0, r0, r0
30: 000080ec andeq r8, r0, ip, ror #1
34: 00000110 andeq r0, r0, r0, lsl r1
...
Disassembly of section .debug_info:
00000000 <.debug_info>:
0: 00000072 andeq r0, r0, r2, ror r0
4: 00000004 andeq r0, r0, r4
8: 01040000 mrseq r0, (UNDEF: 4)
c: 0000003f andeq r0, r0, pc, lsr r0
10: 0000340c andeq r3, r0, ip, lsl #8
14: 00000d00 andeq r0, r0, r0, lsl #26
18: 00800000 addeq r0, r0, r0
1c: 0000ec00 andeq lr, r0, r0, lsl #24
20: 00000000 andeq r0, r0, r0
24: 00d10200 sbcseq r0, r1, r0, lsl #4
28: 04010000 streq r0, [r1], #-0
2c: 0000003a andeq r0, r0, sl, lsr r0
30: 00008000 andeq r8, r0, r0
34: 000000ec andeq r0, r0, ip, ror #1
38: 04039c01 streq r9, [r3], #-3073 ; 0xfffff3ff
3c: 746e6905 strbtvc r6, [lr], #-2309 ; 0xfffff6fb
40: 002f0400 eoreq r0, pc, r0, lsl #8
44: 42020000 andmi r0, r2, #0
48: 00000052 andeq r0, r0, r2, asr r0
4c: 82000305 andhi r0, r0, #335544320 ; 0x14000000
50: 04050001 streq r0, [r5], #-1
54: 0000005f andeq r0, r0, pc, asr r0
58: 00070406 andeq r0, r7, r6, lsl #8
5c: 07000000 streq r0, [r0, -r0]
60: 00000058 andeq r0, r0, r8, asr r0
64: 6d697408 cfstrdvs mvd7, [r9, #-32]! ; 0xffffffe0
68: 5f450200 svcpl 0x00450200
6c: 05000000 streq r0, [r0, #-0]
70: 0181fc03 orreq pc, r1, r3, lsl #24
74: 00af0000 adceq r0, pc, r0
78: 00040000 andeq r0, r4, r0
7c: 00000076 andeq r0, r0, r6, ror r0
80: 003f0104 eorseq r0, pc, r4, lsl #2
84: dd0c0000 stcle 0, cr0, [ip, #-0]
88: 0d000000 stceq 0, cr0, [r0, #-0]
8c: ec000000 stc 0, cr0, [r0], {-0}
90: 10000080 andne r0, r0, r0, lsl #1
94: 61000001 tstvs r0, r1
98: 02000000 andeq r0, r0, #0
9c: 000000ef andeq r0, r0, pc, ror #1
a0: 00770b01 rsbseq r0, r7, r1, lsl #22
a4: 80ec0000 rschi r0, ip, r0
a8: 01100000 tsteq r0, r0
ac: 9c010000 stcls 0, cr0, [r1], {-0}
b0: 00000077 andeq r0, r0, r7, ror r0
b4: 0000d603 andeq sp, r0, r3, lsl #12
b8: 770b0100 strvc r0, [fp, -r0, lsl #2]
bc: 02000000 andeq r0, r0, #0
c0: f7036c91 ; <UNDEFINED> instruction: 0xf7036c91
c4: 01000000 mrseq r0, (UNDEF: 0)
c8: 0000770b andeq r7, r0, fp, lsl #14
cc: 68910200 ldmvs r1, {r9}
d0: 72736604 rsbsvc r6, r3, #4, 12 ; 0x400000
d4: 770d0100 strvc r0, [sp, -r0, lsl #2]
d8: 02000000 andeq r0, r0, #0
dc: e8057491 stmda r5, {r0, r4, r7, sl, ip, sp, lr}
e0: 01000000 mrseq r0, (UNDEF: 0)
e4: 0000770e andeq r7, r0, lr, lsl #14
e8: 70910200 addsvc r0, r1, r0, lsl #4
ec: 05040600 streq r0, [r4, #-1536] ; 0xfffffa00
f0: 00746e69 rsbseq r6, r4, r9, ror #28
f4: 00002f07 andeq r2, r0, r7, lsl #30
f8: 8f420200 svchi 0x00420200
fc: 05000000 streq r0, [r0, #-0]
100: 01820003 orreq r0, r2, r3
104: 9c040800 stcls 8, cr0, [r4], {-0}
108: 09000000 stmdbeq r0, {} ; <UNPREDICTABLE>
10c: 00000704 andeq r0, r0, r4, lsl #14
110: 950a0000 strls r0, [sl, #-0]
114: 0b000000 bleq 11c <main-0x7ee4>
118: 006d6974 rsbeq r6, sp, r4, ror r9
11c: 009c4502 addseq r4, ip, r2, lsl #10
120: 03050000 movweq r0, #20480 ; 0x5000
124: 000181fc strdeq r8, [r1], -ip
...
Disassembly of section .debug_abbrev:
00000000 <.debug_abbrev>:
0: 25011101 strcs r1, [r1, #-257] ; 0xfffffeff
4: 030b130e movweq r1, #45838 ; 0xb30e
8: 110e1b0e tstne lr, lr, lsl #22
c: 10061201 andne r1, r6, r1, lsl #4
10: 02000017 andeq r0, r0, #23
14: 193f002e ldmdbne pc!, {r1, r2, r3, r5} ; <UNPREDICTABLE>
18: 0b3a0e03 bleq e8382c <_stack+0xe0382c>
1c: 19270b3b stmdbne r7!, {r0, r1, r3, r4, r5, r8, r9, fp}
20: 01111349 tsteq r1, r9, asr #6
24: 18400612 stmdane r0, {r1, r4, r9, sl}^
28: 00194296 mulseq r9, r6, r2
2c: 00240300 eoreq r0, r4, r0, lsl #6
30: 0b3e0b0b bleq f82c64 <_stack+0xf02c64>
34: 00000803 andeq r0, r0, r3, lsl #16
38: 03003404 movweq r3, #1028 ; 0x404
3c: 3b0b3a0e blcc 2ce87c <_stack+0x24e87c>
40: 3f13490b svccc 0x0013490b
44: 00180219 andseq r0, r8, r9, lsl r2
48: 000f0500 andeq r0, pc, r0, lsl #10
4c: 13490b0b movtne r0, #39691 ; 0x9b0b
50: 24060000 strcs r0, [r6], #-0
54: 3e0b0b00 vmlacc.f64 d0, d11, d0
58: 000e030b andeq r0, lr, fp, lsl #6
5c: 00350700 eorseq r0, r5, r0, lsl #14
60: 00001349 andeq r1, r0, r9, asr #6
64: 03003408 movweq r3, #1032 ; 0x408
68: 3b0b3a08 blcc 2ce890 <_stack+0x24e890>
6c: 3f13490b svccc 0x0013490b
70: 00180219 andseq r0, r8, r9, lsl r2
74: 11010000 mrsne r0, (UNDEF: 1)
78: 130e2501 movwne r2, #58625 ; 0xe501
7c: 1b0e030b blne 380cb0 <_stack+0x300cb0>
80: 1201110e andne r1, r1, #-2147483645 ; 0x80000003
84: 00171006 andseq r1, r7, r6
88: 012e0200 ; <UNDEFINED> instruction: 0x012e0200
8c: 0e03193f mcreq 9, 0, r1, cr3, cr15, {1}
90: 0b3b0b3a bleq ec2d80 <_stack+0xe42d80>
94: 13491927 movtne r1, #39207 ; 0x9927
98: 06120111 ; <UNDEFINED> instruction: 0x06120111
9c: 42971840 addsmi r1, r7, #64, 16 ; 0x400000
a0: 00130119 andseq r0, r3, r9, lsl r1
a4: 00050300 andeq r0, r5, r0, lsl #6
a8: 0b3a0e03 bleq e838bc <_stack+0xe038bc>
ac: 13490b3b movtne r0, #39739 ; 0x9b3b
b0: 00001802 andeq r1, r0, r2, lsl #16
b4: 03003404 movweq r3, #1028 ; 0x404
b8: 3b0b3a08 blcc 2ce8e0 <_stack+0x24e8e0>
bc: 0213490b andseq r4, r3, #180224 ; 0x2c000
c0: 05000018 streq r0, [r0, #-24] ; 0xffffffe8
c4: 0e030034 mcreq 0, 0, r0, cr3, cr4, {1}
c8: 0b3b0b3a bleq ec2db8 <_stack+0xe42db8>
cc: 18021349 stmdane r2, {r0, r3, r6, r8, r9, ip}
d0: 24060000 strcs r0, [r6], #-0
d4: 3e0b0b00 vmlacc.f64 d0, d11, d0
d8: 0008030b andeq r0, r8, fp, lsl #6
dc: 00340700 eorseq r0, r4, r0, lsl #14
e0: 0b3a0e03 bleq e838f4 <_stack+0xe038f4>
e4: 13490b3b movtne r0, #39739 ; 0x9b3b
e8: 1802193f stmdane r2, {r0, r1, r2, r3, r4, r5, r8, fp, ip}
ec: 0f080000 svceq 0x00080000
f0: 490b0b00 stmdbmi fp, {r8, r9, fp}
f4: 09000013 stmdbeq r0, {r0, r1, r4}
f8: 0b0b0024 bleq 2c0190 <_stack+0x240190>
fc: 0e030b3e vmoveq.16 d3[0], r0
100: 350a0000 strcc r0, [sl, #-0]
104: 00134900 andseq r4, r3, r0, lsl #18
108: 00340b00 eorseq r0, r4, r0, lsl #22
10c: 0b3a0803 bleq e82120 <_stack+0xe02120>
110: 13490b3b movtne r0, #39739 ; 0x9b3b
114: 1802193f stmdane r2, {r0, r1, r2, r3, r4, r5, r8, fp, ip}
118: Address 0x0000000000000118 is out of bounds.
Disassembly of section .debug_line:
00000000 <.debug_line>:
0: 0000005d andeq r0, r0, sp, asr r0
4: 002b0002 eoreq r0, fp, r2
8: 01020000 mrseq r0, (UNDEF: 2)
c: 000d0efb strdeq r0, [sp], -fp
10: 01010101 tsteq r1, r1, lsl #2
14: 01000000 mrseq r0, (UNDEF: 0)
18: 73010000 movwvc r0, #4096 ; 0x1000
1c: 00006372 andeq r6, r0, r2, ror r3
20: 6e69616d powvsez f6, f1, #5.0
24: 0100632e tsteq r0, lr, lsr #6
28: 70670000 rsbvc r0, r7, r0
2c: 682e6f69 stmdavs lr!, {r0, r3, r5, r6, r8, r9, sl, fp, sp, lr}
30: 00000100 andeq r0, r0, r0, lsl #2
34: 02050000 andeq r0, r5, #0
38: 00008000 andeq r8, r0, r0
3c: 6ba31316 blvs fe8c4c9c <_stack+0xfe844c9c>
40: 03040200 movweq r0, #16896 ; 0x4200
44: 02009e06 andeq r9, r0, #6, 28 ; 0x60
48: 06d60104 ldrbeq r0, [r6], r4, lsl #2
4c: 0200bcdb andeq fp, r0, #56064 ; 0xdb00
50: 9e060304 cdpls 3, 0, cr0, cr6, cr4, {0}
54: 01040200 mrseq r0, R12_usr
58: bbdb06d6 bllt ff6c1bb8 <_stack+0xff641bb8>
5c: 01000202 tsteq r0, r2, lsl #4
60: 00005f01 andeq r5, r0, r1, lsl #30
64: 2b000200 blcs 86c <main-0x7794>
68: 02000000 andeq r0, r0, #0
6c: 0d0efb01 vstreq d15, [lr, #-4]
70: 01010100 mrseq r0, (UNDEF: 17)
74: 00000001 andeq r0, r0, r1
78: 01000001 tsteq r0, r1
7c: 00637273 rsbeq r7, r3, r3, ror r2
80: 69706700 ldmdbvs r0!, {r8, r9, sl, sp, lr}^
84: 00632e6f rsbeq r2, r3, pc, ror #28
88: 67000001 strvs r0, [r0, -r1]
8c: 2e6f6970 mcrcs 9, 3, r6, cr15, cr0, {3}
90: 00010068 andeq r0, r1, r8, rrx
94: 05000000 streq r0, [r0, #-0]
98: 0080ec02 addeq lr, r0, r2, lsl #24
9c: 010a0300 mrseq r0, (UNDEF: 58)
a0: 02004da0 andeq r4, r0, #160, 26 ; 0x2800
a4: 66060104 strvs r0, [r6], -r4, lsl #2
a8: 004c6806 subeq r6, ip, r6, lsl #16
ac: 06010402 streq r0, [r1], -r2, lsl #8
b0: 4d680666 stclmi 6, cr0, [r8, #-408]! ; 0xfffffe68
b4: 672f67a6 strvs r6, [pc, -r6, lsr #15]!
b8: 02846c64 addeq r6, r4, #100, 24 ; 0x6400
bc: 022f1424 eoreq r1, pc, #36, 8 ; 0x24000000
c0: 01010008 tsteq r1, r8
Disassembly of section .debug_frame:
00000000 <.debug_frame>:
0: 0000000c andeq r0, r0, ip
4: ffffffff ; <UNDEFINED> instruction: 0xffffffff
8: 7c020001 stcvc 0, cr0, [r2], {1}
c: 000d0c0e andeq r0, sp, lr, lsl #24
10: 0000000c andeq r0, r0, ip
14: 00000000 andeq r0, r0, r0
18: 00008000 andeq r8, r0, r0
1c: 000000ec andeq r0, r0, ip, ror #1
20: 0000000c andeq r0, r0, ip
24: ffffffff ; <UNDEFINED> instruction: 0xffffffff
28: 7c020001 stcvc 0, cr0, [r2], {1}
2c: 000d0c0e andeq r0, sp, lr, lsl #24
30: 0000001c andeq r0, r0, ip, lsl r0
34: 00000020 andeq r0, r0, r0, lsr #32
38: 000080ec andeq r8, r0, ip, ror #1
3c: 00000110 andeq r0, r0, r0, lsl r1
40: 8b040e42 blhi 103950 <_stack+0x83950>
44: 0b0d4201 bleq 350850 <_stack+0x2d0850>
48: 0d0d8002 stceq 0, cr8, [sp, #-8]
4c: 000ecb42 andeq ip, lr, r2, asr #22
Disassembly of section .debug_str:
00000000 <.debug_str>:
0: 69736e75 ldmdbvs r3!, {r0, r2, r4, r5, r6, r9, sl, fp, sp, lr}^
4: 64656e67 strbtvs r6, [r5], #-3687 ; 0xfffff199
8: 746e6920 strbtvc r6, [lr], #-2336 ; 0xfffff6e0
c: 73552f00 cmpvc r5, #0, 30
10: 2f737265 svccs 0x00737265
14: 6f63614a svcvs 0x0063614a
18: 66666f53 uqsaxvs r6, r6, r3
1c: 7365442f cmnvc r5, #788529152 ; 0x2f000000
20: 706f746b rsbvc r7, pc, fp, ror #8
24: 6964452f stmdbvs r4!, {r0, r1, r2, r3, r5, r8, sl, lr}^
28: 2d6e6f73 stclcs 15, cr6, [lr, #-460]! ; 0xfffffe34
2c: 67005452 smlsdvs r0, r2, r4, r5
30: 006f6970 rsbeq r6, pc, r0, ror r9 ; <UNPREDICTABLE>
34: 2f637273 svccs 0x00637273
38: 6e69616d powvsez f6, f1, #5.0
3c: 4700632e strmi r6, [r0, -lr, lsr #6]
40: 4320554e ; <UNDEFINED> instruction: 0x4320554e
44: 35203131 strcc r3, [r0, #-305]! ; 0xfffffecf
48: 312e342e ; <UNDEFINED> instruction: 0x312e342e
4c: 31303220 teqcc r0, r0, lsr #4
50: 31393036 teqcc r9, r6, lsr r0
54: 72282039 eorvc r2, r8, #57 ; 0x39
58: 61656c65 cmnvs r5, r5, ror #24
5c: 20296573 eorcs r6, r9, r3, ror r5
60: 4d52415b ldfmie f4, [r2, #-364] ; 0xfffffe94
64: 626d652f rsbvs r6, sp, #197132288 ; 0xbc00000
68: 65646465 strbvs r6, [r4, #-1125]! ; 0xfffffb9b
6c: 2d352d64 ldccs 13, cr2, [r5, #-400]! ; 0xfffffe70
70: 6e617262 cdpvs 2, 6, cr7, cr1, cr2, {3}
74: 72206863 eorvc r6, r0, #6488064 ; 0x630000
78: 73697665 cmnvc r9, #105906176 ; 0x6500000
7c: 206e6f69 rsbcs r6, lr, r9, ror #30
80: 34303432 ldrtcc r3, [r0], #-1074 ; 0xfffffbce
84: 205d3639 subscs r3, sp, r9, lsr r6
88: 70666d2d rsbvc r6, r6, sp, lsr #26
8c: 656e3d75 strbvs r3, [lr, #-3445]! ; 0xfffff28b
90: 762d6e6f strtvc r6, [sp], -pc, ror #28
94: 34767066 ldrbtcc r7, [r6], #-102 ; 0xffffff9a
98: 666d2d20 strbtvs r2, [sp], -r0, lsr #26
9c: 74616f6c strbtvc r6, [r1], #-3948 ; 0xfffff094
a0: 6962612d stmdbvs r2!, {r0, r2, r3, r5, r8, sp, lr}^
a4: 7261683d rsbvc r6, r1, #3997696 ; 0x3d0000
a8: 6d2d2064 stcvs 0, cr2, [sp, #-400]! ; 0xfffffe70
ac: 68637261 stmdavs r3!, {r0, r5, r6, r9, ip, sp, lr}^
b0: 6d72613d ldfvse f6, [r2, #-244]! ; 0xffffff0c
b4: 612d3776 ; <UNDEFINED> instruction: 0x612d3776
b8: 746d2d20 strbtvc r2, [sp], #-3360 ; 0xfffff2e0
bc: 3d656e75 stclcc 14, cr6, [r5, #-468]! ; 0xfffffe2c
c0: 74726f63 ldrbtvc r6, [r2], #-3939 ; 0xfffff09d
c4: 612d7865 ; <UNDEFINED> instruction: 0x612d7865
c8: 672d2037 ; <UNDEFINED> instruction: 0x672d2037
cc: 304f2d20 subcc r2, pc, r0, lsr #26
d0: 69616d00 stmdbvs r1!, {r8, sl, fp, sp, lr}^
d4: 6970006e ldmdbvs r0!, {r1, r2, r3, r5, r6}^
d8: 6d756e6e ldclvs 14, cr6, [r5, #-440]! ; 0xfffffe48
dc: 63727300 cmnvs r2, #0, 6
e0: 6970672f ldmdbvs r0!, {r0, r1, r2, r3, r5, r8, r9, sl, sp, lr}^
e4: 00632e6f rsbeq r2, r3, pc, ror #28
e8: 62727366 rsbsvs r7, r2, #-1744830463 ; 0x98000001
ec: 70007469 andvc r7, r0, r9, ror #8
f0: 6f4d6e69 svcvs 0x004d6e69
f4: 6d006564 cfstr32vs mvfx6, [r0, #-400] ; 0xfffffe70
f8: 0065646f rsbeq r6, r5, pc, ror #8
Disassembly of section .ARM.attributes:
00000000 <_stack-0x80000>:
0: 00003441 andeq r3, r0, r1, asr #8
4: 61656100 cmnvs r5, r0, lsl #2
8: 01006962 tsteq r0, r2, ror #18
c: 0000002a andeq r0, r0, sl, lsr #32
10: 412d3705 ; <UNDEFINED> instruction: 0x412d3705
14: 070a0600 streq r0, [sl, -r0, lsl #12]
18: 09010841 stmdbeq r1, {r0, r6, fp}
1c: 0c050a02 stceq 10, cr0, [r5], {2}
20: 14041202 strne r1, [r4], #-514 ; 0xfffffdfe
24: 17011501 strne r1, [r1, -r1, lsl #10]
28: 19011803 stmdbne r1, {r0, r1, fp, ip}
2c: 1c011a01 stcne 10, cr1, [r1], {1}
30: 22061e01 andcs r1, r6, #1, 28
34: Address 0x0000000000000034 is out of bounds.
Thanks in advance
i think the tim=500000 can cause problems because the LED is switched on too short to be recognizable by human eye. RPi's processor has a speed around 1 GHz ( https://en.wikipedia.org/wiki/Raspberry_Pi ) what is 1.000.000.000 Hz
so the switched-on time of the LED is approx 0.5 ms. maybe try tim=5000000000 to try approx. 5s visibility or try other timings like delay(), sleep(),... ( implement time delay in c )
very slight modifications, the naked attributes are not required (what tools are you using?)
asm(".globl _start; _start: nop\n");
#ifndef GPIO_H
#define GPIO_H
#ifdef RPI2
#define GPIO_BASE 0x3F200000UL
#else
#define GPIO_BASE 0x20200000UL
#endif
#if defined( RPIBPLUS ) || defined( RPI2 )
#define LED_GPFSEL GPIO_GPFSEL4
#define LED_GPFBIT 21
#define LED_GPSET GPIO_GPSET1
#define LED_GPCLR GPIO_GPCLR1
#define LED_GPIO_BIT 15
#else
#define LED_GPFSEL GPIO_GPFSEL1
#define LED_GPFBIT 18
#define LED_GPSET GPIO_GPSET0
#define LED_GPCLR GPIO_GPCLR0
#define LED_GPIO_BIT 16
#endif
#define GPIO_GPFSEL0 0
#define GPIO_GPFSEL1 1
#define GPIO_GPFSEL2 2
#define GPIO_GPFSEL3 3
#define GPIO_GPFSEL4 4
#define GPIO_GPFSEL5 5
#define GPIO_GPSET0 7
#define GPIO_GPSET1 8
#define GPIO_GPCLR0 10
#define GPIO_GPCLR1 11
#define GPIO_GPLEV0 13
#define GPIO_GPLEV1 14
#define GPIO_GPEDS0 16
#define GPIO_GPEDS1 17
#define GPIO_GPREN0 19
#define GPIO_GPREN1 20
#define GPIO_GPFEN0 22
#define GPIO_GPFEN1 23
#define GPIO_GPHEN0 25
#define GPIO_GPHEN1 26
#define GPIO_GPLEN0 28
#define GPIO_GPLEN1 29
#define GPIO_GPAREN0 31
#define GPIO_GPAREN1 32
#define GPIO_GPAFEN0 34
#define GPIO_GPAFEN1 35
#define GPIO_GPPUD 37
#define GPIO_GPPUDCLK0 38
#define GPIO_GPPUDCLK1 39
/** GPIO Register set */
volatile unsigned int* gpio;
/** Simple loop variable */
volatile unsigned int tim;
// Function to change a pin's mode
static int pinMode(int pinnum, int mode);
#endif
static int pinMode(int pinnum, int mode) {
// Variable declaration and initialization
int fsr = 0;
int fsrbit;
// Let's check the pin does exist
if (pinnum < 0 || pinnum > 53) {
// Abort, there is no such pin.
return 1;
}
else if (mode < 0 || mode > 1) {
// Abort, invalid mode (Actually there are 7 modes but we will only use 2)
return 1;
}
// Create a pointer to the GPIO perhiperal register so we can speak to it
gpio = (unsigned int*)GPIO_BASE;
// And calculate wich FSR we should use
/* do {
if (pinnum > 9) {
pinnum -= 10;
fsr++;
}
} while (pinnum > 9); */
if (pinnum > 9) {
while (pinnum > 9) {
pinnum -= 10;
fsr++;
}
}
// Then we calculate the bytes of the fsreg to use
fsrbit = pinnum * 3;
// Finally let's set the pin to the desired mode
gpio[fsr] |= (mode << fsrbit);
return 0;
}
int main(void)
{
gpio = (unsigned int*)GPIO_BASE;
/* Write 1 to the GPIO16 init nibble in the Function Select 1 GPIO
peripheral register to enable GPIO16 as an output */
// gpio[LED_GPFSEL] |= (1 << LED_GPFBIT);
pinMode(47, 1);
// Never return from here
while(1)
{
for(tim = 0; tim < 500000; tim++)
;
/* Set the LED GPIO pin low ( Turn OK LED on for original Pi, and off
for plus models )*/
gpio[LED_GPCLR] = (1 << LED_GPIO_BIT);
for(tim = 0; tim < 500000; tim++)
;
/* Set the LED GPIO pin high ( Turn OK LED off for original Pi, and on
for plus models )*/
gpio[LED_GPSET] = (1 << LED_GPIO_BIT);
}
}
along with some hackery to get it to link, not something you can use directly but main wont need to change, just bootstrap and linking.
00001000 <main>:
1000: e52de004 push {lr} ; (str lr, [sp, #-4]!)
1004: e3a00801 mov r0, #65536 ; 0x10000
1008: e3a0e000 mov lr, #0
100c: e59f3078 ldr r3, [pc, #120] ; 108c <main+0x8c>
1010: e59f2078 ldr r2, [pc, #120] ; 1090 <main+0x90>
1014: e5823000 str r3, [r2]
1018: e5932010 ldr r2, [r3, #16]
101c: e3822602 orr r2, r2, #2097152 ; 0x200000
1020: e1a0c003 mov r12, r3
1024: e5832010 str r2, [r3, #16]
1028: e59f1064 ldr r1, [pc, #100] ; 1094 <main+0x94>
102c: e59f3064 ldr r3, [pc, #100] ; 1098 <main+0x98>
1030: e583e000 str lr, [r3]
1034: e5932000 ldr r2, [r3]
1038: e1520001 cmp r2, r1
103c: 8a000005 bhi 1058 <main+0x58>
1040: e5932000 ldr r2, [r3]
1044: e2822001 add r2, r2, #1
1048: e5832000 str r2, [r3]
104c: e5932000 ldr r2, [r3]
1050: e1520001 cmp r2, r1
1054: 9afffff9 bls 1040 <main+0x40>
1058: e58c0028 str r0, [r12, #40] ; 0x28
105c: e583e000 str lr, [r3]
1060: e5932000 ldr r2, [r3]
1064: e1520001 cmp r2, r1
1068: 8a000005 bhi 1084 <main+0x84>
106c: e5932000 ldr r2, [r3]
1070: e2822001 add r2, r2, #1
1074: e5832000 str r2, [r3]
1078: e5932000 ldr r2, [r3]
107c: e1520001 cmp r2, r1
1080: 9afffff9 bls 106c <main+0x6c>
1084: e58c001c str r0, [r12, #28]
1088: eaffffe8 b 1030 <main+0x30>
108c: 20200000 eorcs r0, r0, r0
1090: 000110a4 andeq r1, r1, r4, lsr #1
1094: 0007a11f andeq r10, r7, pc, lsl r1
1098: 000110a0 andeq r1, r1, r0, lsr #1
0000109c <_start>:
109c: e1a00000 nop ; (mov r0, r0)
Disassembly of section .bss:
000110a0 <tim>:
110a0: 00000000 andeq r0, r0, r0
000110a4 <gpio>:
110a4: 00000000 andeq r0, r0, r0
so the first problem is here which comes straight from your code
fsrbit = pinnum * 3;
gpio[fsr] |= (mode << fsrbit);
100c: e59f3078 ldr r3, [pc, #120] ; 108c <main+0x8c>
1018: e5932010 ldr r2, [r3, #16]
101c: e3822602 orr r2, r2, #2097152 ; 0x200000
1024: e5832010 str r2, [r3, #16]
doing a read-modify-write is correct, but unless you knew the bits were zeros to start with it is best to zero them first
gpio[fsr] &= (~(7<<fsrbit));
gpio[fsr] |= (mode<<fsrbit);
by declaring tim as volatile the code as you can see actually counts takes a few instructions, yes it might be a 1GHZ processor but you are not running that fast, you are fetching from dram (slow), even with the cache, and you have pipe hazards, etc.
As pointed out though maybe make your loop count number larger. Another thing to try is to swap things around on/off state.
first this:
while(1)
{
gpio[LED_GPCLR] = (1 << LED_GPIO_BIT);
for(tim = 0; tim < 500000; tim++) continue;
gpio[LED_GPSET] = (1 << LED_GPIO_BIT);
for(tim = 0; tim < 500000; tim++) continue;
}
is it on, does it glow? then try this
while(1)
{
gpio[LED_GPSET] = (1 << LED_GPIO_BIT);
for(tim = 0; tim < 500000; tim++) continue;
gpio[LED_GPCLR] = (1 << LED_GPIO_BIT);
for(tim = 0; tim < 500000; tim++) continue;
}
if the led looks the same then your count may be too small, make it larger, control the bit then do the delay not the other way around. If it still glows and doesnt blink with set first then clear or clear first then set? then maybe too fast still, just do this
gpio[LED_GPSET] = (1 << LED_GPIO_BIT);
with no loop
or
gpio[LED_GPCLR] = (1 << LED_GPIO_BIT);
with no loop.
can you make it go on and stay on? can you make it go off and stay off? If not then there is something wrong in the code you use to make it go on and off, if so then there maybe something wrong with your delay. but examination of optimized code shown above the volatile is taking care of that and burning cycles.
your I solved with a naked answer, was deleted, so does that mean you didnt solve it? The baking pi tutorials are nice, glad they are there, but the baremetal forum at raspberry pi has over time been littered with it (baking pi) doesnt work questions, mostly makefile/directory issues, but perhaps others. Looks like though you get the gist of it, your gpio pointer/array style is interesting, I wouldnt go that way but so far it is working for you.
Ahh, wait and another bug...you didnt define RPI2 but clearly you had on the command line perhaps?
800c: e3433f20 movt r3, #16160 ; 0x3f20
so lets try my smaller build again
1034: e5804020 str r4, [r0, #32] ; 0x20
1048: e580e02c str lr, [r0, #44] ; 0x2c
much better GPSET1 and GPCLR1 where 47 is found...so no bug there.
actually instead of hardcoding 47 you should have finished out your header file and used some LED_GPIO_BIT in the pinMode() call insuring you matched the LED_otherstuff to the gpio pin.
thanks to other posts here, I managed to build and install Valgrind for ARM:
make clean
make distclean
export PATH=$PATH:/usr/local/angstrom/arm/bin
export CROSS_COMPILE=/usr/local/angstrom/arm/bin/arm-angstrom-linux-gnueabi-
export CC=${CROSS_COMPILE}gcc
export CPP=${CROSS_COMPILE}cpp
export CXX=${CROSS_COMPILE}g++
export LD=${CROSS_COMPILE}ld
export AR=${CROSS_COMPILE}ar
export RANLIB=${CROSS_COMPILE}ranlib
sed -i -e "s#armv7#arm#g" configure
./configure --host=arm-none-linux-gnueabi \
--host=arm-none-linux-gnueabi \
--prefix=/opt/valgrind/ \
CFLAGS=-static
make
make install
and copied it into the same path on the ARMv5 machine.
I am running an ARMv5 and apparently Valgrind is still not supporting ARMv5.
When starting Valgrind, I'll get
Illegal instruction
and I suppose this is, because Valgrind needs the ARMv7 instruction set?
So:
did I make any error in configure when cross-compiling valgrind?
is there any way to get Valgrind up and running on ARMv5 or even only the vgdb or Memcheck tool included in Valgrind?
Is there any other tool to do memchecks on ARMv5, I have heard about
Adress Sanitizer or
Malloc_Check
and are they recommendable for ARMv5?
Thanks a lot for help and support!
BR Florian
I have created a core dump and run gdb, unfortunately without debugging symbols:
What command should i used?
[New Thread 10495]
Core was generated by `/opt/valgrind/bin/valgrind'.
Program terminated with signal 4, Illegal instruction.
#0 0x38035804 in ?? ()
(gdb) list
305 platform ? platform : "unknown");
306
307 return platform;
308 }
309
310 /* Where we expect to find all our aux files */
311 static const char *valgrind_lib = VG_LIBDIR;
312
313 int main(int argc, char** argv, char** envp)
314 {
(gdb) info frame
Stack level 0, frame at 0x38ac7894:
pc = 0x38035804; saved pc 0x380425e0
called by frame at 0x38ac7894
Cannot access memory at address 0x0
Arglist at 0x38ac7894, args:
Locals at 0x38ac7894, Previous frame's sp is 0x38ac7894
(gdb) list main
326
327 /* Start the debugging-log system ASAP. First find out how many
328 "-d"s were specified. This is a pre-scan of the command line.
329 At the same time, look for the tool name. */
330 loglevel = 0;
331 for (i = 1; i < argc; i++) {
332 if (argv[i][0] != '-') {
333 clientname = argv[i];
334 break;
335 }
(gdb) disassemble main
Dump of assembler code for function main:
0x000087b4 <+0>: cmp r0, #1
0x000087b8 <+4>: push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
0x000087bc <+8>: mov r10, r0
0x000087c0 <+12>: sub sp, sp, #12
0x000087c4 <+16>: mov r11, r1
0x000087c8 <+20>: mov r9, r2
0x000087cc <+24>: ble 0x8c90 <main+1244>
0x000087d0 <+28>: ldr r4, [r1, #4]
0x000087d4 <+32>: ldrb r3, [r4]
0x000087d8 <+36>: cmp r3, #45 ; 0x2d
0x000087dc <+40>: movne r8, #0
0x000087e0 <+44>: movne r7, r8
0x000087e4 <+48>: bne 0x8820 <main+108>
0x000087e8 <+52>: ldr r0, [pc, #1372] ; 0x8d4c <main+1432>
0x000087ec <+56>: bl 0x29d20 <strlen>
0x000087f0 <+60>: mov r8, #0
0x000087f4 <+64>: mov r5, #1
0x000087f8 <+68>: mov r7, r8
0x000087fc <+72>: mov r6, r0
0x00008800 <+76>: cmp r6, #3
0x00008804 <+80>: bhi 0x8b70 <main+956>
0x00008808 <+84>: cmp r6, #0
0x0000880c <+88>: bne 0x8a00 <main+588>
0x00008810 <+92>: add r0, r5, #1
0x00008814 <+96>: cmp r10, r0
0x00008818 <+100>: ble 0x8b90 <main+988>
0x0000881c <+104>: ldr r4, [r11, r0, lsl #2]
0x00008820 <+108>: mov r0, r8
0x00008824 <+112>: ldr r1, [pc, #1316] ; 0x8d50 <main+1436>
0x00008828 <+116>: bl 0xa258 <vgPlain_debugLog_startup>
0x0000882c <+120>: cmp r7, #0
0x00008830 <+124>: mov r10, r7
0x00008834 <+128>: beq 0x8bac <main+1016>
0x00008838 <+132>: mov r3, r7
0x0000883c <+136>: mov r0, #1
0x00008840 <+140>: ldr r1, [pc, #1292] ; 0x8d54 <main+1440>
0x00008844 <+144>: ldr r2, [pc, #1292] ; 0x8d58 <main+1444>
0x00008848 <+148>: bl 0x9ff0 <vgPlain_debugLog>
0x0000884c <+152>: cmp r4, #0
0x00008850 <+156>: beq 0x8bc8 <main+1044>
0x00008854 <+160>: mov r0, r4
0x00008858 <+164>: bl 0x8278 <select_platform>
0x0000885c <+168>: subs r8, r0, #0
0x00008860 <+172>: beq 0x8c6c <main+1208>
0x00008864 <+176>: mov r0, #1
0x00008868 <+180>: ldr r1, [pc, #1252] ; 0x8d54 <main+1440>
0x0000886c <+184>: ldr r2, [pc, #1256] ; 0x8d5c <main+1448>
0x00008870 <+188>: mov r3, r8
0x00008874 <+192>: bl 0x9ff0 <vgPlain_debugLog>
0x00008878 <+196>: mov r0, #0
0x0000887c <+200>: mov r5, r0
0x00008880 <+204>: add r4, r0, #500 ; 0x1f4
0x00008884 <+208>: mov r1, r4
0x00008888 <+212>: mov r0, r5
0x0000888c <+216>: bl 0x290ec <realloc>
0x00008890 <+220>: subs r5, r0, #0
0x00008894 <+224>: beq 0x8c88 <main+1236>
0x00008898 <+228>: ldr r0, [pc, #1216] ; 0x8d60 <main+1452>
0x0000889c <+232>: mov r1, r5
0x000088a0 <+236>: mov r2, r4
0x000088a4 <+240>: bl 0x2d760 <readlink>
0x000088a8 <+244>: cmn r0, #1
0x000088ac <+248>: beq 0x8c44 <main+1168>
0x000088b0 <+252>: cmp r0, r4
0x000088b4 <+256>: beq 0x8880 <main+204>
0x000088b8 <+260>: movcc r3, #0
0x000088bc <+264>: movcc r6, r5
0x000088c0 <+268>: strbcc r3, [r5, r0]
0x000088c4 <+272>: bcs 0x8d38 <main+1412>
0x000088c8 <+276>: ldr r0, [pc, #1172] ; 0x8d64 <main+1456>
0x000088cc <+280>: bl 0x29d20 <strlen>
0x000088d0 <+284>: mov r4, r0
0x000088d4 <+288>: mov r0, r6
0x000088d8 <+292>: bl 0x29d20 <strlen>
0x000088dc <+296>: add r4, r4, r0
0x000088e0 <+300>: add r0, r4, #2
0x000088e4 <+304>: bl 0x28928 <malloc>
0x000088e8 <+308>: subs r7, r0, #0
0x000088ec <+312>: beq 0x8ca4 <main+1264>
0x000088f0 <+316>: ldr r1, [pc, #1132] ; 0x8d64 <main+1456>
0x000088f4 <+320>: bl 0x29b00 <strcpy>
0x000088f8 <+324>: ldr r1, [pc, #1128] ; 0x8d68 <main+1460>
0x000088fc <+328>: mov r0, r7
0x00008900 <+332>: bl 0x29994 <strcat>
0x00008904 <+336>: mov r0, r7
0x00008908 <+340>: mov r1, r6
0x0000890c <+344>: bl 0x29994 <strcat>
0x00008910 <+348>: ldr r3, [r9]
0x00008914 <+352>: cmp r3, #0
0x00008918 <+356>: moveq r4, r3
0x0000891c <+360>: moveq r5, #2
0x00008920 <+364>: moveq r0, #8
0x00008924 <+368>: beq 0x8944 <main+400>
0x00008928 <+372>: mov r4, #0
0x0000892c <+376>: add r4, r4, #1
0x00008930 <+380>: ldr r3, [r9, r4, lsl #2]
0x00008934 <+384>: cmp r3, #0
0x00008938 <+388>: bne 0x892c <main+376>
0x0000893c <+392>: add r5, r4, #2
0x00008940 <+396>: lsl r0, r5, #2
0x00008944 <+400>: bl 0x28928 <malloc>
0x00008948 <+404>: subs r6, r0, #0
0x0000894c <+408>: beq 0x8cac <main+1272>
0x00008950 <+412>: cmp r4, #0
0x00008954 <+416>: moveq r1, r4
0x00008958 <+420>: moveq r0, #4
0x0000895c <+424>: moveq r2, #2
0x00008960 <+428>: beq 0x8994 <main+480>
0x00008964 <+432>: mov r1, #0
0x00008968 <+436>: mov r2, r1
0x0000896c <+440>: ldr r3, [r9, r2]
0x00008970 <+444>: add r1, r1, #1
0x00008974 <+448>: cmp r4, r1
0x00008978 <+452>: str r3, [r6, r2]
0x0000897c <+456>: add r2, r2, #4
0x00008980 <+460>: bgt 0x896c <main+440>
0x00008984 <+464>: add r3, r4, #1
0x00008988 <+468>: add r2, r3, #1
0x0000898c <+472>: lsl r1, r4, #2
0x00008990 <+476>: lsl r0, r3, #2
0x00008994 <+480>: cmp r2, r5
0x00008998 <+484>: str r7, [r6, r1]
0x0000899c <+488>: mov r3, #0
0x000089a0 <+492>: str r3, [r6, r0]
0x000089a4 <+496>: bne 0x8cb4 <main+1280>
0x000089a8 <+500>: ldr r0, [pc, #956] ; 0x8d6c <main+1464>
0x000089ac <+504>: bl 0x12650 <getenv>
0x000089b0 <+508>: subs r3, r0, #0
0x000089b4 <+512>: ldrne r0, [pc, #948] ; 0x8d70 <main+1468>
0x000089b8 <+516>: ldreq r0, [pc, #944] ; 0x8d70 <main+1468>
0x000089bc <+520>: strne r3, [r0]
0x000089c0 <+524>: ldr r0, [r0]
0x000089c4 <+528>: bl 0x29d20 <strlen>
0x000089c8 <+532>: mov r4, r0
0x000089cc <+536>: mov r0, r10
0x000089d0 <+540>: bl 0x29d20 <strlen>
0x000089d4 <+544>: mov r5, r0
0x000089d8 <+548>: mov r0, r8
0x000089dc <+552>: bl 0x29d20 <strlen>
0x000089e0 <+556>: add r4, r4, r5
0x000089e4 <+560>: add r4, r4, #3
0x000089e8 <+564>: add r0, r4, r0
0x000089ec <+568>: bl 0x28928 <malloc>
0x000089f0 <+572>: subs r4, r0, #0
0x000089f4 <+576>: bne 0x8cc8 <main+1300>
0x000089f8 <+580>: ldr r0, [pc, #884] ; 0x8d74 <main+1472>
0x000089fc <+584>: bl 0x8234 <barf>
0x00008a00 <+588>: ldrb r2, [r4, #1]
0x00008a04 <+592>: cmp r6, #1
0x00008a08 <+596>: movls r3, #0
0x00008a0c <+600>: movhi r3, #1
0x00008a10 <+604>: cmp r2, #45 ; 0x2d
0x00008a14 <+608>: movne r3, #0
0x00008a18 <+612>: cmp r3, #0
0x00008a1c <+616>: sub r0, r2, #45 ; 0x2d
0x00008a20 <+620>: beq 0x8a50 <main+668>
0x00008a24 <+624>: ldrb r0, [r4, #2]
0x00008a28 <+628>: cmp r6, #2
0x00008a2c <+632>: movls r3, #0
0x00008a30 <+636>: movhi r3, #1
0x00008a34 <+640>: cmp r0, #0
0x00008a38 <+644>: movne r3, #0
0x00008a3c <+648>: cmp r3, #0
0x00008a40 <+652>: ldrne r1, [pc, #772] ; 0x8d4c <main+1432>
0x00008a44 <+656>: ldrbne r2, [r4, #3]
0x00008a48 <+660>: ldrbne r3, [r1, #3]
0x00008a4c <+664>: rsbne r0, r3, r2
0x00008a50 <+668>: cmp r0, #0
0x00008a54 <+672>: beq 0x8810 <main+92>
0x00008a58 <+676>: ldr r0, [pc, #792] ; 0x8d78 <main+1476>
0x00008a5c <+680>: bl 0x29d20 <strlen>
0x00008a60 <+684>: cmp r0, #3
0x00008a64 <+688>: bhi 0x8c10 <main+1116>
0x00008a68 <+692>: cmp r0, #0
0x00008a6c <+696>: beq 0x8c08 <main+1108>
0x00008a70 <+700>: ldrb r2, [r4, #1]
0x00008a74 <+704>: cmp r0, #1
0x00008a78 <+708>: movls r3, #0
0x00008a7c <+712>: movhi r3, #1
0x00008a80 <+716>: cmp r2, #100 ; 0x64
0x00008a84 <+720>: movne r3, #0
0x00008a88 <+724>: cmp r3, #0
0x00008a8c <+728>: sub r2, r2, #100 ; 0x64
0x00008a90 <+732>: beq 0x8ac0 <main+780>
0x00008a94 <+736>: ldrb r2, [r4, #2]
0x00008a98 <+740>: cmp r0, #2
0x00008a9c <+744>: movls r3, #0
0x00008aa0 <+748>: movhi r3, #1
0x00008aa4 <+752>: cmp r2, #0
0x00008aa8 <+756>: movne r3, #0
0x00008aac <+760>: cmp r3, #0
0x00008ab0 <+764>: ldrne r1, [pc, #704] ; 0x8d78 <main+1476>
0x00008ab4 <+768>: ldrbne r2, [r4, #3]
0x00008ab8 <+772>: ldrbne r3, [r1, #3]
0x00008abc <+776>: rsbne r2, r3, r2
0x00008ac0 <+780>: cmp r2, #0
0x00008ac4 <+784>: beq 0x8c08 <main+1108>
0x00008ac8 <+788>: ldr r0, [pc, #684] ; 0x8d7c <main+1480>
0x00008acc <+792>: bl 0x29d20 <strlen>
0x00008ad0 <+796>: cmp r0, #6
0x00008ad4 <+800>: mov r1, r0
0x00008ad8 <+804>: bhi 0x8be4 <main+1072>
0x00008adc <+808>: cmp r0, #3
0x00008ae0 <+812>: bhi 0x8c24 <main+1136>
0x00008ae4 <+816>: cmp r0, #0
0x00008ae8 <+820>: beq 0x8b80 <main+972>
0x00008aec <+824>: ldrb r3, [r4, #1]
0x00008af0 <+828>: cmp r3, #45 ; 0x2d
0x00008af4 <+832>: movne r0, #0
0x00008af8 <+836>: moveq r0, #1
0x00008afc <+840>: cmp r1, #1
0x00008b00 <+844>: movls r2, #0
0x00008b04 <+848>: andhi r2, r0, #1
0x00008b08 <+852>: cmp r2, #0
0x00008b0c <+856>: beq 0x8b44 <main+912>
0x00008b10 <+860>: ldrb r3, [r4, #2]
0x00008b14 <+864>: cmp r3, #116 ; 0x74
0x00008b18 <+868>: movne r0, #0
0x00008b1c <+872>: moveq r0, #1
0x00008b20 <+876>: cmp r1, #2
0x00008b24 <+880>: movls r2, #0
0x00008b28 <+884>: andhi r2, r0, #1
0x00008b2c <+888>: cmp r2, #0
0x00008b30 <+892>: beq 0x8b44 <main+912>
0x00008b34 <+896>: ldrb r3, [r4, #3]
0x00008b38 <+900>: cmp r3, #111 ; 0x6f
0x00008b3c <+904>: movne r0, #0
0x00008b40 <+908>: moveq r0, #1
0x00008b44 <+912>: cmp r0, #0
0x00008b48 <+916>: bne 0x8b80 <main+972>
0x00008b4c <+920>: add r5, r5, #1
0x00008b50 <+924>: cmp r10, r5
0x00008b54 <+928>: ble 0x8b90 <main+988>
0x00008b58 <+932>: ldr r4, [r11, r5, lsl #2]
0x00008b5c <+936>: ldrb r3, [r4]
0x00008b60 <+940>: cmp r3, #45 ; 0x2d
0x00008b64 <+944>: bne 0x8820 <main+108>
0x00008b68 <+948>: cmp r6, #3
0x00008b6c <+952>: bls 0x8808 <main+84>
0x00008b70 <+956>: mov r0, r4
0x00008b74 <+960>: ldr r1, [pc, #464] ; 0x8d4c <main+1432>
0x00008b78 <+964>: bl 0x29acc <strcmp>
0x00008b7c <+968>: b 0x8a50 <main+668>
0x00008b80 <+972>: add r5, r5, #1
0x00008b84 <+976>: cmp r10, r5
0x00008b88 <+980>: add r7, r4, #7
0x00008b8c <+984>: bgt 0x8b58 <main+932>
0x00008b90 <+988>: mov r0, r8
0x00008b94 <+992>: ldr r1, [pc, #436] ; 0x8d50 <main+1436>
0x00008b98 <+996>: bl 0xa258 <vgPlain_debugLog_startup>
0x00008b9c <+1000>: cmp r7, #0
0x00008ba0 <+1004>: mov r10, r7
0x00008ba4 <+1008>: mov r4, #0
0x00008ba8 <+1012>: bne 0x8838 <main+132>
0x00008bac <+1016>: mov r0, #1
0x00008bb0 <+1020>: ldr r1, [pc, #412] ; 0x8d54 <main+1440>
0x00008bb4 <+1024>: ldr r2, [pc, #452] ; 0x8d80 <main+1484>
0x00008bb8 <+1028>: bl 0x9ff0 <vgPlain_debugLog>
0x00008bbc <+1032>: cmp r4, #0
0x00008bc0 <+1036>: ldr r10, [pc, #444] ; 0x8d84 <main+1488>
0x00008bc4 <+1040>: bne 0x8854 <main+160>
0x00008bc8 <+1044>: mov r0, #1
0x00008bcc <+1048>: ldr r1, [pc, #384] ; 0x8d54 <main+1440>
0x00008bd0 <+1052>: ldr r2, [pc, #432] ; 0x8d88 <main+1492>
0x00008bd4 <+1056>: ldr r3, [pc, #432] ; 0x8d8c <main+1496>
0x00008bd8 <+1060>: bl 0x9ff0 <vgPlain_debugLog>
0x00008bdc <+1064>: ldr r8, [pc, #424] ; 0x8d8c <main+1496>
0x00008be0 <+1068>: b 0x8878 <main+196>
0x00008be4 <+1072>: mov r0, r4
0x00008be8 <+1076>: ldr r1, [pc, #396] ; 0x8d7c <main+1480>
0x00008bec <+1080>: mov r2, #7
0x00008bf0 <+1084>: bl 0x29e90 <strncmp>
0x00008bf4 <+1088>: rsbs r0, r0, #1
0x00008bf8 <+1092>: movcc r0, #0
0x00008bfc <+1096>: cmp r0, #0
0x00008c00 <+1100>: beq 0x8b4c <main+920>
0x00008c04 <+1104>: b 0x8b80 <main+972>
0x00008c08 <+1108>: add r8, r8, #1
0x00008c0c <+1112>: b 0x8ac8 <main+788>
0x00008c10 <+1116>: mov r0, r4
0x00008c14 <+1120>: ldr r1, [pc, #348] ; 0x8d78 <main+1476>
0x00008c18 <+1124>: bl 0x29acc <strcmp>
0x00008c1c <+1128>: mov r2, r0
0x00008c20 <+1132>: b 0x8ac0 <main+780>
0x00008c24 <+1136>: mov r0, r4
0x00008c28 <+1140>: ldr r1, [pc, #332] ; 0x8d7c <main+1480>
0x00008c2c <+1144>: bl 0x29acc <strcmp>
0x00008c30 <+1148>: rsbs r0, r0, #1
0x00008c34 <+1152>: movcc r0, #0
0x00008c38 <+1156>: cmp r0, #0
0x00008c3c <+1160>: beq 0x8b4c <main+920>
0x00008c40 <+1164>: b 0x8b80 <main+972>
0x00008c44 <+1168>: ldr r4, [pc, #324] ; 0x8d90 <main+1500>
0x00008c48 <+1172>: ldr r1, [pc, #324] ; 0x8d94 <main+1504>
0x00008c4c <+1176>: ldr r2, [pc, #268] ; 0x8d60 <main+1452>
0x00008c50 <+1180>: ldr r0, [r4]
0x00008c54 <+1184>: bl 0x1b4c0 <fprintf>
0x00008c58 <+1188>: ldr r0, [r4]
0x00008c5c <+1192>: ldr r1, [pc, #308] ; 0x8d98 <main+1508>
0x00008c60 <+1196>: bl 0x1b4c0 <fprintf>
0x00008c64 <+1200>: ldr r6, [pc, #304] ; 0x8d9c <main+1512>
0x00008c68 <+1204>: b 0x88c8 <main+276>
0x00008c6c <+1208>: mov r0, #1
0x00008c70 <+1212>: ldr r1, [pc, #220] ; 0x8d54 <main+1440>
0x00008c74 <+1216>: ldr r2, [pc, #292] ; 0x8da0 <main+1516>
0x00008c78 <+1220>: ldr r3, [pc, #268] ; 0x8d8c <main+1496>
0x00008c7c <+1224>: bl 0x9ff0 <vgPlain_debugLog>
0x00008c80 <+1228>: ldr r8, [pc, #260] ; 0x8d8c <main+1496>
0x00008c84 <+1232>: b 0x8878 <main+196>
0x00008c88 <+1236>: ldr r0, [pc, #276] ; 0x8da4 <main+1520>
0x00008c8c <+1240>: bl 0x8234 <barf>
0x00008c90 <+1244>: mov r0, #0
0x00008c94 <+1248>: ldr r1, [pc, #180] ; 0x8d50 <main+1436>
0x00008c98 <+1252>: bl 0xa258 <vgPlain_debugLog_startup>
0x00008c9c <+1256>: mov r4, #0
0x00008ca0 <+1260>: b 0x8bac <main+1016>
0x00008ca4 <+1264>: ldr r0, [pc, #252] ; 0x8da8 <main+1524>
0x00008ca8 <+1268>: bl 0x8234 <barf>
0x00008cac <+1272>: ldr r0, [pc, #248] ; 0x8dac <main+1528>
0x00008cb0 <+1276>: bl 0x8234 <barf>
0x00008cb4 <+1280>: ldr r0, [pc, #244] ; 0x8db0 <main+1532>
0x00008cb8 <+1284>: ldr r1, [pc, #244] ; 0x8db4 <main+1536>
0x00008cbc <+1288>: movw r2, #463 ; 0x1cf
0x00008cc0 <+1292>: ldr r3, [pc, #240] ; 0x8db8 <main+1540>
0x00008cc4 <+1296>: bl 0xd5dc <__assert_fail>
0x00008cc8 <+1300>: ldr r3, [pc, #160] ; 0x8d70 <main+1468>
0x00008ccc <+1304>: ldr r1, [pc, #232] ; 0x8dbc <main+1544>
0x00008cd0 <+1308>: ldr r2, [r3]
0x00008cd4 <+1312>: mov r3, r10
0x00008cd8 <+1316>: str r8, [sp]
0x00008cdc <+1320>: bl 0x1b4e0 <sprintf>
0x00008ce0 <+1324>: mov r3, r4
0x00008ce4 <+1328>: mov r0, #1
0x00008ce8 <+1332>: ldr r1, [pc, #100] ; 0x8d54 <main+1440>
0x00008cec <+1336>: ldr r2, [pc, #204] ; 0x8dc0 <main+1548>
0x00008cf0 <+1340>: bl 0x9ff0 <vgPlain_debugLog>
0x00008cf4 <+1344>: mov r1, r11
0x00008cf8 <+1348>: mov r2, r6
0x00008cfc <+1352>: mov r0, r4
0x00008d00 <+1356>: bl 0x2ca60 <execve>
0x00008d04 <+1360>: ldr r3, [pc, #132] ; 0x8d90 <main+1500>
0x00008d08 <+1364>: ldr r4, [r3]
0x00008d0c <+1368>: bl 0xd5c0 <__errno_location>
0x00008d10 <+1372>: ldr r0, [r0]
0x00008d14 <+1376>: bl 0x29b60 <strerror>
0x00008d18 <+1380>: mov r2, r10
0x00008d1c <+1384>: mov r3, r8
0x00008d20 <+1388>: ldr r1, [pc, #156] ; 0x8dc4 <main+1552>
0x00008d24 <+1392>: str r0, [sp]
0x00008d28 <+1396>: mov r0, r4
0x00008d2c <+1400>: bl 0x1b4c0 <fprintf>
0x00008d30 <+1404>: mov r0, #1
0x00008d34 <+1408>: bl 0x12768 <exit>
0x00008d38 <+1412>: ldr r0, [pc, #136] ; 0x8dc8 <main+1556>
0x00008d3c <+1416>: ldr r1, [pc, #112] ; 0x8db4 <main+1536>
0x00008d40 <+1420>: movw r2, #438 ; 0x1b6
0x00008d44 <+1424>: ldr r3, [pc, #108] ; 0x8db8 <main+1540>
0x00008d48 <+1428>: bl 0xd5dc <__assert_fail>
0x00008d4c <+1432>: ; <UNDEFINED> instruction: 0x000672b4
0x00008d50 <+1436>: andeq r7, r6, r4, asr #5
0x00008d54 <+1440>: andeq r7, r6, r0, asr #2
0x00008d58 <+1444>: andeq r7, r6, r12, asr #5
0x00008d5c <+1448>: andeq r7, r6, r12, lsl #5
0x00008d60 <+1452>: andeq r7, r6, r0, lsr #7
0x00008d64 <+1456>: andeq r7, r6, r8, lsr r4
0x00008d68 <+1460>: andeq r8, r7, r0, asr #24
0x00008d6c <+1464>: muleq r6, r0, r4
0x00008d70 <+1468>: strdeq r4, [r8], -r0
0x00008d74 <+1472>: andeq r7, r6, r0, lsr #9
0x00008d78 <+1476>: ; <UNDEFINED> instruction: 0x000672b8
0x00008d7c <+1480>: ; <UNDEFINED> instruction: 0x000672bc
0x00008d80 <+1484>: andeq r7, r6, r4, ror #5
0x00008d84 <+1488>: andeq r7, r6, r4, lsl r3
0x00008d88 <+1492>: andeq r7, r6, r0, lsr #6
0x00008d8c <+1496>: andeq r7, r6, r8, lsl #4
0x00008d90 <+1500>: andeq r4, r8, r8, lsr #20
0x00008d94 <+1504>: ; <UNDEFINED> instruction: 0x000673b0
0x00008d98 <+1508>: andeq r7, r6, r8, ror #7
0x00008d9c <+1512>: andeq r9, r7, r8, lsl #29
0x00008da0 <+1516>: andeq r7, r6, r4, asr r3
0x00008da4 <+1520>: andeq r7, r6, r8, lsl #7
0x00008da8 <+1524>: andeq r7, r6, r12, asr #8
0x00008dac <+1528>: andeq r7, r6, r8, ror #8
0x00008db0 <+1532>: andeq r7, r6, r4, lsl #9
0x00008db4 <+1536>: muleq r6, r0, r1
0x00008db8 <+1540>: andeq r7, r6, r0, lsr r5
0x00008dbc <+1544>: ; <UNDEFINED> instruction: 0x000674bc
0x00008dc0 <+1548>: andeq r7, r6, r8, asr #9
0x00008dc4 <+1552>: ldrdeq r7, [r6], -r8
0x00008dc8 <+1556>: andeq r7, r6, r12, lsr #8
There are many topics here whether a direct access or an indirect access (via pointer) are faster when accessing structure members, in C.
One example: C pointers vs direct member access for structs
The general opinion is direct access will be faster (at least theoretically) since pointer dereferencing is not used.
So I gave it a try with a chunk of code in my system: GNU Embedded Tools GCC 4.7.4, generating code for ARM (actually ARM-Cortex-A15).
Surprisingly, direct access was much slower. Then I generated the assembly codes for the object file.
Direct access code has 114 lines of assembly code, and indirect access code has 33 lines of assembly code. What is going on here?
Below are the C code and generated assembly code of the functions. The structures are all map to external memory and the structure members are all one-byte word long (unsigned char type).
First function, with indirect access:
void sub_func_1(unsigned int num_of, struct file_s *__restrict__ first_file_ptr, struct file_s *__restrict__ second_file_ptr, struct output_s *__restrict__ output_ptr)
{
if(LIKELY(num_of == 0))
{
output_ptr->curr_id = UNUSED;
output_ptr->curr_cnt = output_ptr->cnt;
output_ptr->curr_mode = output_ptr->_mode;
output_ptr->curr_type = output_ptr->type;
output_ptr->curr_size = output_ptr->size;
output_ptr->curr_allocation_type = output_ptr->allocation_type;
output_ptr->curr_allocation_localized = output_ptr->allocation_localized;
output_ptr->curr_mode_enable = output_ptr->mode_enable;
if(output_ptr->curr_cnt == 1)
{
first_file_ptr->status = BLOCK_IDLE;
first_file_ptr->type = USER_DATA_TYPE;
first_file_ptr->index = FIRST__WORD;
first_file_ptr->layer_cnt = output_ptr->layer_cnt;
second_file_ptr->status = DISABLED;
second_file_ptr->index = 0;
second_file_ptr->redundancy_version = 1;
output_ptr->total_layer_cnt = first_file_ptr->layer_cnt;
}
}
}
00000000 <sub_func_1>:
0: e3500000 cmp r0, #0
4: e92d01f0 push {r4, r5, r6, r7, r8}
8: 1a00001b bne 7c <sub_func_1+0x7c>
c: e5d34007 ldrb r4, [r3, #7]
10: e3a05008 mov r5, #8
14: e5d3c003 ldrb ip, [r3, #3]
18: e5d38014 ldrb r8, [r3, #20]
1c: e5c35001 strb r5, [r3, #1]
20: e5d37015 ldrb r7, [r3, #21]
24: e5d36018 ldrb r6, [r3, #24]
28: e5c34008 strb r4, [r3, #8]
2c: e5d35019 ldrb r5, [r3, #25]
30: e35c0001 cmp ip, #1
34: e5c3c005 strb ip, [r3, #5]
38: e5d34012 ldrb r4, [r3, #18]
3c: e5c38010 strb r8, [r3, #16]
40: e5c37011 strb r7, [r3, #17]
44: e5c3601a strb r6, [r3, #26]
48: e5c3501b strb r5, [r3, #27]
4c: e5c34013 strb r4, [r3, #19]
50: 1a000009 bne 7c <sub_func_1+0x7c>
54: e5d3400b ldrb r4, [r3, #11]
58: e3a05005 mov r5, #5
5c: e5c1c000 strb ip, [r1]
60: e5c10002 strb r0, [r1, #2]
64: e5c15001 strb r5, [r1, #1]
68: e5c20000 strb r0, [r2]
6c: e5c14003 strb r4, [r1, #3]
70: e5c20005 strb r0, [r2, #5]
74: e5c2c014 strb ip, [r2, #20]
78: e5c3400f strb r4, [r3, #15]
7c: e8bd01f0 pop {r4, r5, r6, r7, r8}
80: e12fff1e bx lr
Second function, with direct access:
void sub_func_2(unsigned int output_index, unsigned int cc_index, unsigned int num_of)
{
if(LIKELY(num_of == 0))
{
output_file[output_index].curr_id = UNUSED;
output_file[output_index].curr_cnt = output_file[output_index].cnt;
output_file[output_index].curr_mode = output_file[output_index]._mode;
output_file[output_index].curr_type = output_file[output_index].type;
output_file[output_index].curr_size = output_file[output_index].size;
output_file[output_index].curr_allocation_type = output_file[output_index].allocation_type;
output_file[output_index].curr_allocation_localized = output_file[output_index].allocation_localized;
output_file[output_index].curr_mode_enable = output_file[output_index].mode_enable;
if(output_file[output_index].curr_cnt == 1)
{
output_file[output_index].cc_file[cc_index].file[0].status = BLOCK_IDLE;
output_file[output_index].cc_file[cc_index].file[0].type = USER_DATA_TYPE;
output_file[output_index].cc_file[cc_index].file[0].index = FIRST__WORD;
output_file[output_index].cc_file[cc_index].file[0].layer_cnt = output_file[output_index].layer_cnt;
output_file[output_index].cc_file[cc_index].file[1].status = DISABLED;
output_file[output_index].cc_file[cc_index].file[1].index = 0;
output_file[output_index].cc_file[cc_index].file[1].redundancy_version = 1;
output_file[output_index].total_layer_cnt = output_file[output_index].cc_file[cc_index].file[0].layer_cnt;
}
}
}
00000084 <sub_func_2>:
84: e92d0ff0 push {r4, r5, r6, r7, r8, r9, sl, fp}
88: e3520000 cmp r2, #0
8c: e24dd018 sub sp, sp, #24
90: e58d2004 str r2, [sp, #4]
94: 1a000069 bne 240 <sub_func_2+0x1bc>
98: e3a03d61 mov r3, #6208 ; 0x1840
9c: e30dc0c0 movw ip, #53440 ; 0xd0c0
a0: e340c001 movt ip, #1
a4: e3002000 movw r2, #0
a8: e0010193 mul r1, r3, r1
ac: e3402000 movt r2, #0
b0: e3067490 movw r7, #25744 ; 0x6490
b4: e3068488 movw r8, #25736 ; 0x6488
b8: e3a0b008 mov fp, #8
bc: e3066498 movw r6, #25752 ; 0x6498
c0: e02c109c mla ip, ip, r0, r1
c4: e082c00c add ip, r2, ip
c8: e28c3b19 add r3, ip, #25600 ; 0x6400
cc: e08c4007 add r4, ip, r7
d0: e5d39083 ldrb r9, [r3, #131] ; 0x83
d4: e08c5006 add r5, ip, r6
d8: e5d3a087 ldrb sl, [r3, #135] ; 0x87
dc: e5c3b081 strb fp, [r3, #129] ; 0x81
e0: e5c39085 strb r9, [r3, #133] ; 0x85
e4: e2833080 add r3, r3, #128 ; 0x80
e8: e7cca008 strb sl, [ip, r8]
ec: e5d4a004 ldrb sl, [r4, #4]
f0: e7cca007 strb sl, [ip, r7]
f4: e5d47005 ldrb r7, [r4, #5]
f8: e5c47001 strb r7, [r4, #1]
fc: e7dc6006 ldrb r6, [ip, r6]
100: e5d5c001 ldrb ip, [r5, #1]
104: e5c56002 strb r6, [r5, #2]
108: e5c5c003 strb ip, [r5, #3]
10c: e5d4c002 ldrb ip, [r4, #2]
110: e5c4c003 strb ip, [r4, #3]
114: e5d33005 ldrb r3, [r3, #5]
118: e3530001 cmp r3, #1
11c: 1a000047 bne 240 <sub_func_2+0x1bc>
120: e30dc0c0 movw ip, #53440 ; 0xd0c0
124: e30db0c0 movw fp, #53440 ; 0xd0c0
128: e1a0700c mov r7, ip
12c: e7dfc813 bfi ip, r3, #16, #16
130: e1a05007 mov r5, r7
134: e1a0900b mov r9, fp
138: e02c109c mla ip, ip, r0, r1
13c: e1a04005 mov r4, r5
140: e1a0a00b mov sl, fp
144: e7df9813 bfi r9, r3, #16, #16
148: e7dfb813 bfi fp, r3, #16, #16
14c: e1a06007 mov r6, r7
150: e7dfa813 bfi sl, r3, #16, #16
154: e58dc008 str ip, [sp, #8]
158: e7df6813 bfi r6, r3, #16, #16
15c: e1a0c004 mov ip, r4
160: e7df4813 bfi r4, r3, #16, #16
164: e02b109b mla fp, fp, r0, r1
168: e7df5813 bfi r5, r3, #16, #16
16c: e0291099 mla r9, r9, r0, r1
170: e7df7813 bfi r7, r3, #16, #16
174: e7dfc813 bfi ip, r3, #16, #16
178: e0261096 mla r6, r6, r0, r1
17c: e0241094 mla r4, r4, r0, r1
180: e082b00b add fp, r2, fp
184: e0829009 add r9, r2, r9
188: e02a109a mla sl, sl, r0, r1
18c: e28bbc65 add fp, fp, #25856 ; 0x6500
190: e58d600c str r6, [sp, #12]
194: e2899c65 add r9, r9, #25856 ; 0x6500
198: e3a06005 mov r6, #5
19c: e58d4010 str r4, [sp, #16]
1a0: e59d4008 ldr r4, [sp, #8]
1a4: e0251095 mla r5, r5, r0, r1
1a8: e5cb3000 strb r3, [fp]
1ac: e082a00a add sl, r2, sl
1b0: e59db00c ldr fp, [sp, #12]
1b4: e5c96001 strb r6, [r9, #1]
1b8: e59d6004 ldr r6, [sp, #4]
1bc: e28aac65 add sl, sl, #25856 ; 0x6500
1c0: e58d5014 str r5, [sp, #20]
1c4: e0271097 mla r7, r7, r0, r1
1c8: e0825004 add r5, r2, r4
1cc: e30d40c0 movw r4, #53440 ; 0xd0c0
1d0: e02c109c mla ip, ip, r0, r1
1d4: e0855008 add r5, r5, r8
1d8: e7df4813 bfi r4, r3, #16, #16
1dc: e5ca6002 strb r6, [sl, #2]
1e0: e5d59003 ldrb r9, [r5, #3]
1e4: e082600b add r6, r2, fp
1e8: e59db014 ldr fp, [sp, #20]
1ec: e0201094 mla r0, r4, r0, r1
1f0: e2866c65 add r6, r6, #25856 ; 0x6500
1f4: e59d1010 ldr r1, [sp, #16]
1f8: e306a53c movw sl, #25916 ; 0x653c
1fc: e0827007 add r7, r2, r7
200: e2877c65 add r7, r7, #25856 ; 0x6500
204: e082c00c add ip, r2, ip
208: e5c69003 strb r9, [r6, #3]
20c: e59d6004 ldr r6, [sp, #4]
210: e28ccc65 add ip, ip, #25856 ; 0x6500
214: e082500b add r5, r2, fp
218: e0820000 add r0, r2, r0
21c: e0824001 add r4, r2, r1
220: e085500a add r5, r5, sl
224: e0808008 add r8, r0, r8
228: e7c4600a strb r6, [r4, sl]
22c: e5c56005 strb r6, [r5, #5]
230: e5c73050 strb r3, [r7, #80] ; 0x50
234: e5dc3003 ldrb r3, [ip, #3]
238: e287704c add r7, r7, #76 ; 0x4c
23c: e5c83007 strb r3, [r8, #7]
240: e28dd018 add sp, sp, #24
244: e8bd0ff0 pop {r4, r5, r6, r7, r8, r9, sl, fp}
248: e12fff1e bx lr
And last part, my compile options are:
# Compile options.
C_OPTS = -Wall \
-std=gnu99 \
-fgnu89-inline \
-Wcast-align \
-Werror=uninitialized \
-Werror=maybe-uninitialized \
-Werror=overflow \
-mcpu=cortex-a15 \
-mtune=cortex-a15 \
-mabi=aapcs \
-mfpu=neon \
-ftree-vectorize \
-ftree-slp-vectorize \
-ftree-vectorizer-verbose=4 \
-mfloat-abi=hard \
-O3 \
-flto \
-marm \
-ffat-lto-objects \
-fno-gcse \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
-fno-strict-overflow \
-fuse-linker-plugin \
-falign-functions=4 \
-falign-loops=4 \
-falign-labels=4 \
-falign-jumps=4
Update:
Note: I deleted the structure definitions because there was differences with the version of my own program. It is actually a huge structure and it is not efficient to put here completely.
As suggested, I get rid of -fno-gcse, and the generated asm is not huge as before.
Without -fno-gcse, sub_func_1 generates the same code as above.
For sub_func_2:
00000084 <sub_func_2>:
84: e3520000 cmp r2, #0
88: e92d0070 push {r4, r5, r6}
8c: 1a000030 bne 154 <sub_func_2+0xd0>
90: e30d30c0 movw r3, #53440 ; 0xd0c0
94: e3a06008 mov r6, #8
98: e3403001 movt r3, #1
9c: e0030093 mul r3, r3, r0
a0: e3a00d61 mov r0, #6208 ; 0x1840
a4: e0213190 mla r1, r0, r1, r3
a8: e59f30ac ldr r3, [pc, #172] ; 15c <sub_func_2+0xd8>
ac: e0831001 add r1, r3, r1
b0: e2813b19 add r3, r1, #25600 ; 0x6400
b4: e5d34083 ldrb r4, [r3, #131] ; 0x83
b8: e1a00003 mov r0, r3
bc: e5d35087 ldrb r5, [r3, #135] ; 0x87
c0: e5c36081 strb r6, [r3, #129] ; 0x81
c4: e5c34085 strb r4, [r3, #133] ; 0x85
c8: e3064488 movw r4, #25736 ; 0x6488
cc: e2833080 add r3, r3, #128 ; 0x80
d0: e7c15004 strb r5, [r1, r4]
d4: e5d05094 ldrb r5, [r0, #148] ; 0x94
d8: e0844006 add r4, r4, r6
dc: e7c15004 strb r5, [r1, r4]
e0: e5d04095 ldrb r4, [r0, #149] ; 0x95
e4: e5d0c092 ldrb ip, [r0, #146] ; 0x92
e8: e5c04091 strb r4, [r0, #145] ; 0x91
ec: e3064498 movw r4, #25752 ; 0x6498
f0: e7d15004 ldrb r5, [r1, r4]
f4: e5c0c093 strb ip, [r0, #147] ; 0x93
f8: e5d04099 ldrb r4, [r0, #153] ; 0x99
fc: e5c0509a strb r5, [r0, #154] ; 0x9a
100: e5c0409b strb r4, [r0, #155] ; 0x9b
104: e5d33005 ldrb r3, [r3, #5]
108: e3530001 cmp r3, #1
10c: 1a000010 bne 154 <sub_func_2+0xd0>
110: e281cc65 add ip, r1, #25856 ; 0x6500
114: e3a06005 mov r6, #5
118: e2810b19 add r0, r1, #25600 ; 0x6400
11c: e1a0500c mov r5, ip
120: e5cc3000 strb r3, [ip]
124: e1a0400c mov r4, ip
128: e5cc6001 strb r6, [ip, #1]
12c: e5cc2002 strb r2, [ip, #2]
130: e5d0608b ldrb r6, [r0, #139] ; 0x8b
134: e5cc6003 strb r6, [ip, #3]
138: e306c53c movw ip, #25916 ; 0x653c
13c: e7c1200c strb r2, [r1, ip]
140: e5c52041 strb r2, [r5, #65] ; 0x41
144: e285503c add r5, r5, #60 ; 0x3c
148: e5c43050 strb r3, [r4, #80] ; 0x50
14c: e284404c add r4, r4, #76 ; 0x4c
150: e5c0608f strb r6, [r0, #143] ; 0x8f
154: e8bd0070 pop {r4, r5, r6}
158: e12fff1e bx lr
15c: 00000000 .word 0x00000000
TL:DR: can't reproduce that insane compiler output. Maybe the surrounding code + LTO did it?
I do have suggestions to improve the code: see the stuff below about copying whole structs instead of copying many individual members.
The question you linked is about accessing a value-type global directly vs. through a global pointer. On ARM, where it takes multiple instructions or a load from a nearby constant to get an arbitrary 32bit pointer into a register, passing around pointers is better than having each function reference a global directly.
See this example on the Godbolt Compiler Explorer (ARM gcc 4.8.2 -O3)
struct example {
int a, b, c;
} global_example;
int load_global(void) { return global_example.c; }
movw r3, #:lower16:global_example # tmp113,
movt r3, #:upper16:global_example # tmp113,
ldr r0, [r3, #8] #, global_example.c
bx lr #
int load_pointer(struct example *p) { return p->c; }
ldr r0, [r0, #8] #, p_2(D)->c
bx lr #
(Apparently gcc is horrible at passing structs by val as function args, see the code for byval(struct example by_val) on the godbolt link.)
Even worse is if you have a global pointer: first you have to load the value of the pointer, then another load to dereference it. This is the indirection overhead that was being discussed in the question you linked. If both loads miss in cache, you're paying the round-trip latency twice. The load address for the 2nd load isn't available until the first load completes, so no pipelining of those memory requests is possible even on an out-of-order CPU.
If you already have a pointer as an arg, it will be in a register. Dereferencing it is the same as loading from a global. (But better, because you don't need to get the global's address into a register yourself.)
Your real code
I can't reproduce your massive asm output with ARM gcc 4.8.2 on Godbolt, or locally with ARM gcc 5.2.1. I'm not using LTO, though, since I don't have a complete test program.
All I can see is just slightly larger code to do some index math.
bfi is Bitfield Insert. I think 144: e7df9813 bfi r9, r3, #16, #16 is setting the top half of r9 = low half of r3. I don't see how that and mla (integer mul-accumulate) make much sense. Other than perverse results from -ftree-vectorize, all I can think of is maybe -fno-gcse has a really bad impact for the version of gcc you tested.
Is it manipulating constants that are going to be stored? The code you actually posted #defines everything to 0, which gcc takes advantage of. (It also takes advantage of the fact that it already has 1 in a register if curr_cnt == 1, and stores that register for the second_file_ptr->redundancy_version = 1;). ARM doesn't have a str [mem], immediate or anything like x86's mov [mem], imm.
If your compiler output is from code with different values for those constants, the compiler would be doing more work to store different things.
Unfortunately gcc is bad at merging narrow stores into a single wider store (long-standing missed-optimization bug). For x86, clang does this in at least one case, storing 0x0100 (256) instead of a 0 and a 1. (check on godbolt by flipping the compiler to clang 3.7.1 or something, and removing the ARM-specific compiler args. There's a mov word ptr \[rsi\], 256 where gcc uses
mov BYTE PTR [rsi], 0 # *first_file_ptr_23(D).status,
mov BYTE PTR [rsi+1], 1 # *first_file_ptr_23(D).type,
If you arranged your structs carefully, there would be more opportunities for copying 4B blocks in this function.
It might also help to have two identical sub-structs of curr and not-curr, instead of curr_size and size. You might have to declare it packed to avoid padding after the sub-structs, though. Your two groups of members aren't in exactly the same order, which prevents compilers from doing much block-copying anyway when you do a bunch of assignments.
It helps gcc and clang copy multiple bytes at once if you do:
struct output_s_optimized {
struct __attribute__((packed)) stuff {
unsigned char cnt,
mode,
type,
size,
allocation_type,
allocation_localized,
mode_enable;
} curr; // 7B
unsigned char curr_id; // no non-curr id?
struct stuff non_curr;
unsigned char layer_cnt;
// Another 8 byte boundary here
unsigned char total_layer_cnt;
struct cc_file_s cc_file[128];
};
void foo(struct output_s_optimized *p) {
p->curr_id = 0;
p->non_curr = p->curr;
}
void bar(struct output_s_optimized *output_ptr) {
output_ptr->curr_id = 0;
output_ptr->curr.cnt = output_ptr->non_curr.cnt;
output_ptr->curr.mode = output_ptr->non_curr.mode;
output_ptr->curr.type = output_ptr->non_curr.type;
output_ptr->curr.size = output_ptr->non_curr.size;
output_ptr->curr.allocation_type = output_ptr->non_curr.allocation_type;
output_ptr->curr.allocation_localized = output_ptr->non_curr.allocation_localized;
output_ptr->curr.mode_enable = output_ptr->non_curr.mode_enable;
}
gcc 4.8.2 compiles foo() to three copies: byte, 2B, and 4B, even on ARM. It compiles bar() to eight 1B copies, and so does clang-3.8 on x86. So copying whole structs can help your compiler a lot (as well as making sure the data to be copied is arranged in the same order in both locations).
the same code on x86: nothing new
You can use -fverbose-asm to put comments on each line. For x86, the compiler output from gcc 6.1 -O3 is very similar between versions, as you can see on the Godbolt Compiler Explorer. x86 addressing modes can index a global variable directly, so you see stuff like
movzx edi, BYTE PTR [rcx+10] # *output_ptr_7(D)._mode
# where rcx is the output_ptr arg, used directly
vs.
movzx ecx, BYTE PTR output_file[rdi+10] # output_file[output_index_7(D)]._mode
# where rdi = output_index * 1297 (sizeof(output_file[0])), calculated once at the start
(gcc apparently doesn't care that each instruction has a 4B displacement as part of the addressing mode, but this is an ARM question so I won't go tradeoffs between code-size and insn count with x86's variable-length insns.)
In broad (architecture-agnostic) terms, this is what your instructions do:
global_structure_pointer->field = value;
loads the value of global_structure_pointer into an addressing register.
adds the offset represented by field to the addressing register.
stores value into the memory location addressed by the addressing register.
global_structure[index].field = value;
loads the address of global_structure into an addressing register.
loads the value of index into an arithmetic register.
multiplies the arithmetic register by the size of global_structure.
adds the arithmetic register to the addressing register.
stores value into the memory location addressed by the addressing register.
Your confusion seems to be due to a misunderstanding of what "direct access" actually is.
THIS is direct access:
global_structure.field = value;
What you thought of as direct access is in fact indexed access.
This question already has an answer here:
Rustc/LLVM generates faulty code for aarch64 with opt-level=0
(1 answer)
Closed 7 years ago.
I'm trying to get a small ARM kernel up and running on QEMU (Versatile Express for Cortex-A15). Currently it simply sets sp to the top of a small stack and sends a single character to UART0.
_start.arm:
.set stack_size, 0x10000
.comm stack, stack_size
.global _start
_start:
ldr sp, =stack+stack_size
bl start
1:
b 1b
.size _start, . - _start
start.c:
/* UART_0 is a struct overlaid on 0x1c090000 */
void printChar(char c)
{
while (UART_0->flags & TRANSMIT_FULL);
UART_0->data = c;
}
void start()
{
while (UART_0->flags & TRANSMIT_FULL);
UART_0->data = 'A';
printChar('a');
}
From GDB, I know that execution progresses through _start into start and successfully sends 'A' to UART_0. printChar gets called and completes, but doesn't seem to print anything to the serial port . When running without GDB, the kernel repeatedly prints 'A', though I'm not sure if this is the processor resetting or jumping incorrectly.
From objdump:
Disassembly of section .stub:
00010000 <_start>:
10000: e59fd004 ldr sp, [pc, #4] ; 1000c <__STACK_SIZE+0xc>
10004: eb000016 bl 10064 <start>
10008: eafffffe b 10008 <_start+0x8>
1000c: 000200d0 .word 0x000200d0
Disassembly of section .text:
00010010 <printChar>:
10010: e52db004 push {fp} ; (str fp, [sp, #-4]!)
10014: e28db000 add fp, sp, #0
10018: e24dd00c sub sp, sp, #12
1001c: e1a03000 mov r3, r0
10020: e54b3005 strb r3, [fp, #-5]
10024: e1a00000 nop ; (mov r0, r0)
10028: e3a03000 mov r3, #0
1002c: e3413c09 movt r3, #7177 ; 0x1c09
10030: e1d331ba ldrh r3, [r3, #26]
10034: e6ff3073 uxth r3, r3
10038: e2033020 and r3, r3, #32
1003c: e3530000 cmp r3, #0
10040: 1afffff8 bne 10028 <printChar+0x18>
10044: e3a03000 mov r3, #0
10048: e3413c09 movt r3, #7177 ; 0x1c09
1004c: e55b2005 ldrb r2, [fp, #-5]
10050: e6ff2072 uxth r2, r2
10054: e1c320b2 strh r2, [r3, #2]
10058: e24bd000 sub sp, fp, #0
1005c: e49db004 pop {fp} ; (ldr fp, [sp], #4)
10060: e12fff1e bx lr
00010064 <start>:
10064: e52db008 str fp, [sp, #-8]!
10068: e58de004 str lr, [sp, #4]
1006c: e28db004 add fp, sp, #4
10070: e1a00000 nop ; (mov r0, r0)
10074: e3a03000 mov r3, #0
10078: e3413c09 movt r3, #7177 ; 0x1c09
1007c: e1d331ba ldrh r3, [r3, #26]
10080: e6ff3073 uxth r3, r3
10084: e2033020 and r3, r3, #32
10088: e3530000 cmp r3, #0
1008c: 1afffff8 bne 10074 <start+0x10>
10090: e3a03000 mov r3, #0
10094: e3413c09 movt r3, #7177 ; 0x1c09
10098: e5d32002 ldrb r2, [r3, #2]
1009c: e3a02000 mov r2, #0
100a0: e3822041 orr r2, r2, #65 ; 0x41
100a4: e5c32002 strb r2, [r3, #2]
100a8: e5d32003 ldrb r2, [r3, #3]
100ac: e3a02000 mov r2, #0
100b0: e5c32003 strb r2, [r3, #3]
100b4: e3a00061 mov r0, #97 ; 0x61
100b8: ebffffd4 bl 10010 <printChar>
100bc: e24bd004 sub sp, fp, #4
100c0: e59db000 ldr fp, [sp]
100c4: e28dd004 add sp, sp, #4
100c8: e49df004 pop {pc} ; (ldr pc, [sp], #4)
000100cc <UART_0>:
100cc: 1c090000 ....
I may have missed something, but I am not seeing where you have enabled interrupts, or poll to see if you can send the next character. If you have enabled the interrupts and set up the UART hardware correctly, your driver my have a bug. If you have not setup the UART hardware correctly, it may not be generating interrupts, or it may not be doing the FIFO correctly, or any number of other problems.
I have a few place in my code that could really use some speed up, when I try to use CM4 SIMD instructions the result is always slower than scalar version, for example, this is an alpha blending function I'm using a lot, it's not really slow but it serves as an example:
for (int y=0; y<h; y++) {
i=y*w;
for (int x=0; x<w; x++) {
uint spix = *srcp++;
uint dpix = dstp[i+x];
uint r=(alpha*R565(spix)+(256-alpha)*R565(dpix))>>8;
uint g=(alpha*G565(spix)+(256-alpha)*G565(dpix))>>8;
uint b=(alpha*B565(spix)+(256-alpha)*B565(dpix))>>8;
dstp[i+x]= RGB565(r, g, b);
}
}
R565, G565, B565 and RGB565 are macros that extract and pack RGB565 respectively, please ignore
Now I tried using __SMUAD and see if anything changes, the result was slower (or same speed as the original code) even tried loop unrolling, with no luck:
uint v0, vr, vg, vb;
v0 = (alpha<<16)|(256-alpha);
for (int y=0; y<h; y++) {
i=y*w;
for (int x=0; x<w; x++) {
spix = *srcp++;
dpix = dstp[i+x];
uint vr = R565(spix)<<16 | R565(dpix);
uint vg = G565(spix)<<16 | G565(dpix);
uint vb = B565(spix)<<16 | B565(dpix);
uint r = __SMUAD(v0, vr)>>8;
uint g = __SMUAD(v0, vg)>>8;
uint b = __SMUAD(v0, vb)>>8;
dstp[i+x]= RGB565(r, g, b);
}
}
I know this has been asked before, but given the architectural differences, and the fact that none of the answers really solve my problem, I'm asking again. Thanks!
Update
Scalar disassembly:
Disassembly of section .text.blend:
00000000 <blend>:
0: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
4: 6846 ldr r6, [r0, #4]
6: 68c4 ldr r4, [r0, #12]
8: b086 sub sp, #24
a: 199e adds r6, r3, r6
c: 9601 str r6, [sp, #4]
e: 9200 str r2, [sp, #0]
10: 68ca ldr r2, [r1, #12]
12: f89d 5038 ldrb.w r5, [sp, #56] ; 0x38
16: 9204 str r2, [sp, #16]
18: 9a01 ldr r2, [sp, #4]
1a: 426e negs r6, r5
1c: 4293 cmp r3, r2
1e: b2f6 uxtb r6, r6
20: da5b bge.n da <blend+0xda>
22: 8809 ldrh r1, [r1, #0]
24: 6802 ldr r2, [r0, #0]
26: 9102 str r1, [sp, #8]
28: fb03 fb01 mul.w fp, r3, r1
2c: 9900 ldr r1, [sp, #0]
2e: 4411 add r1, r2
30: 9103 str r1, [sp, #12]
32: 0052 lsls r2, r2, #1
34: 9205 str r2, [sp, #20]
36: 9903 ldr r1, [sp, #12]
38: 9a00 ldr r2, [sp, #0]
3a: 428a cmp r2, r1
3c: fa1f fb8b uxth.w fp, fp
40: da49 bge.n d6 <blend+0xd6>
42: 4610 mov r0, r2
44: 4458 add r0, fp
46: f100 4000 add.w r0, r0, #2147483648 ; 0x80000000
4a: 9a04 ldr r2, [sp, #16]
4c: f8dd a014 ldr.w sl, [sp, #20]
50: 3801 subs r0, #1
52: eb02 0040 add.w r0, r2, r0, lsl #1
56: 44a2 add sl, r4
58: f834 1b02 ldrh.w r1, [r4], #2
5c: 8842 ldrh r2, [r0, #2]
5e: f3c1 07c4 ubfx r7, r1, #3, #5
62: f3c2 09c4 ubfx r9, r2, #3, #5
66: f001 0c07 and.w ip, r1, #7
6a: f3c1 2804 ubfx r8, r1, #8, #5
6e: fb07 f705 mul.w r7, r7, r5
72: 0b49 lsrs r1, r1, #13
74: fb06 7709 mla r7, r6, r9, r7
78: ea41 01cc orr.w r1, r1, ip, lsl #3
7c: f3c2 2904 ubfx r9, r2, #8, #5
80: f002 0c07 and.w ip, r2, #7
84: fb08 f805 mul.w r8, r8, r5
88: 0b52 lsrs r2, r2, #13
8a: fb01 f105 mul.w r1, r1, r5
8e: 097f lsrs r7, r7, #5
90: fb06 8809 mla r8, r6, r9, r8
94: ea42 02cc orr.w r2, r2, ip, lsl #3
98: fb06 1202 mla r2, r6, r2, r1
9c: f007 07f8 and.w r7, r7, #248 ; 0xf8
a0: f408 58f8 and.w r8, r8, #7936 ; 0x1f00
a4: 0a12 lsrs r2, r2, #8
a6: ea48 0107 orr.w r1, r8, r7
aa: ea41 3142 orr.w r1, r1, r2, lsl #13
ae: f3c2 02c2 ubfx r2, r2, #3, #3
b2: 430a orrs r2, r1
b4: 4554 cmp r4, sl
b6: f820 2f02 strh.w r2, [r0, #2]!
ba: d1cd bne.n 58 <blend+0x58>
bc: 9902 ldr r1, [sp, #8]
be: 448b add fp, r1
c0: 9901 ldr r1, [sp, #4]
c2: 3301 adds r3, #1
c4: 428b cmp r3, r1
c6: fa1f fb8b uxth.w fp, fp
ca: d006 beq.n da <blend+0xda>
cc: 9a00 ldr r2, [sp, #0]
ce: 9903 ldr r1, [sp, #12]
d0: 428a cmp r2, r1
d2: 4654 mov r4, sl
d4: dbb5 blt.n 42 <blend+0x42>
d6: 46a2 mov sl, r4
d8: e7f0 b.n bc <blend+0xbc>
da: b006 add sp, #24
dc: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
e0: 4770 bx lr
e2: bf00 nop
SIMD disassembly:
sassembly of section .text.blend:
00000000 <blend>:
0: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
4: 6846 ldr r6, [r0, #4]
6: 68c4 ldr r4, [r0, #12]
8: b086 sub sp, #24
a: 199e adds r6, r3, r6
c: 9601 str r6, [sp, #4]
e: 9200 str r2, [sp, #0]
10: 68ca ldr r2, [r1, #12]
12: f89d 5038 ldrb.w r5, [sp, #56] ; 0x38
16: 9204 str r2, [sp, #16]
18: 9a01 ldr r2, [sp, #4]
1a: f5c5 7680 rsb r6, r5, #256 ; 0x100
1e: 4293 cmp r3, r2
20: ea46 4505 orr.w r5, r6, r5, lsl #16
24: da5d bge.n e2 <blend+0xe2>
26: 8809 ldrh r1, [r1, #0]
28: 6802 ldr r2, [r0, #0]
2a: 9102 str r1, [sp, #8]
2c: fb03 fb01 mul.w fp, r3, r1
30: 9900 ldr r1, [sp, #0]
32: 4411 add r1, r2
34: 9103 str r1, [sp, #12]
36: 0052 lsls r2, r2, #1
38: 9205 str r2, [sp, #20]
3a: 9903 ldr r1, [sp, #12]
3c: 9a00 ldr r2, [sp, #0]
3e: 428a cmp r2, r1
40: fa1f fb8b uxth.w fp, fp
44: da4b bge.n de <blend+0xde>
46: 4610 mov r0, r2
48: 4458 add r0, fp
4a: f100 4000 add.w r0, r0, #2147483648 ; 0x80000000
4e: 9a04 ldr r2, [sp, #16]
50: f8dd a014 ldr.w sl, [sp, #20]
54: 3801 subs r0, #1
56: eb02 0040 add.w r0, r2, r0, lsl #1
5a: 44a2 add sl, r4
5c: f834 2b02 ldrh.w r2, [r4], #2
60: 8841 ldrh r1, [r0, #2]
62: f3c2 07c4 ubfx r7, r2, #3, #5
66: f3c1 06c4 ubfx r6, r1, #3, #5
6a: ea46 4707 orr.w r7, r6, r7, lsl #16
6e: fb25 f707 smuad r7, r5, r7
72: f001 0907 and.w r9, r1, #7
76: ea4f 3c51 mov.w ip, r1, lsr #13
7a: f002 0607 and.w r6, r2, #7
7e: ea4f 3852 mov.w r8, r2, lsr #13
82: ea4c 0cc9 orr.w ip, ip, r9, lsl #3
86: ea48 06c6 orr.w r6, r8, r6, lsl #3
8a: ea4c 4606 orr.w r6, ip, r6, lsl #16
8e: fb25 f606 smuad r6, r5, r6
92: f3c1 2104 ubfx r1, r1, #8, #5
96: f3c2 2204 ubfx r2, r2, #8, #5
9a: ea41 4202 orr.w r2, r1, r2, lsl #16
9e: fb25 f202 smuad r2, r5, r2
a2: f3c6 260f ubfx r6, r6, #8, #16
a6: 097f lsrs r7, r7, #5
a8: f3c6 01c2 ubfx r1, r6, #3, #3
ac: f007 07f8 and.w r7, r7, #248 ; 0xf8
b0: 430f orrs r7, r1
b2: f402 52f8 and.w r2, r2, #7936 ; 0x1f00
b6: ea47 3646 orr.w r6, r7, r6, lsl #13
ba: 4316 orrs r6, r2
bc: 4554 cmp r4, sl
be: f820 6f02 strh.w r6, [r0, #2]!
c2: d1cb bne.n 5c <blend+0x5c>
c4: 9902 ldr r1, [sp, #8]
c6: 448b add fp, r1
c8: 9901 ldr r1, [sp, #4]
ca: 3301 adds r3, #1
cc: 428b cmp r3, r1
ce: fa1f fb8b uxth.w fp, fp
d2: d006 beq.n e2 <blend+0xe2>
d4: 9a00 ldr r2, [sp, #0]
d6: 9903 ldr r1, [sp, #12]
d8: 428a cmp r2, r1
da: 4654 mov r4, sl
dc: dbb3 blt.n 46 <blend+0x46>
de: 46a2 mov sl, r4
e0: e7f0 b.n c4 <blend+0xc4>
e2: b006 add sp, #24
e4: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
e8: 4770 bx lr
ea: bf00 nop
If you want to truly optimize a function, you should check the assembler output of the compiler. You'll then learn how it is transforming your code, and then you can learn how to write code to help the compiler produce better output, or write the necessary assembler.
One of the easy wins you'd hit on quickly on your alpha blending loop is that division is slow.
Instead of x / 100, use
x * 65536 / 65536 / 100
-> x * (65536 / 100) / 65536
-> x * 655.36 >> 16
-> x * 656 >> 16
An even better alternative would be to use alpha values between 0 -> 256 so that you can just bitshift the result without even needing to do this trick.
One reason why smuad might not giving any benefit is that you're having to move data into a format specifically for this command.
I'm not sure whether you'll be able to do better in general, but I thought I'd point out a way to avoid the division in your sample routine. Also, if you inspect the assembly, you may discover that there is code generation that you don't expect that can be eliminated.
Community Wiki Answer
The major change to accommodate SIMD type instruction is to transform the loading. The SMUAD instruction can be looked at as a 'C' instruction like,
/* a,b are register vectors/arrays of 16 bits */
SMUAD = a[0] * b[0] + a[1] * b[1];
It is very easy to transform these. Instead of,
u16 *srcp, dstp;
uint spix = *srcp++;
uint dpix = dstp[i+x];
Use the full bus and get 32bits at a time,
uint *srcp, *dstp; /* These are two 16 bit values. */
uint spix = *srcp++;
uint dpix = dstp[i+x];
/* scale `dpix` and `spix` by alpha */
spix /= (alpha << 16 | alpha); /* Precompute, reduce strength, etc. */
dpix /= (1-alpha << 16 | 1-alpha); /* Precompute, reduce strength, etc. */
/* Hint, you can use SMUAD here? or maybe not.
You could if you scale at the same time*/
It looks like SMUL is a good fit for the alpha scaling; you don't want to add the two halves.
Now, spix and dpix contain two pixels. The vr synthetic is not needed. You may do two operations at one time.
uint rb = (dpix + spix) & ~GMASK; /* GMASK is x6xx6x bits. */
uint g = (dpix + spix) & GMASK;
/* Maybe you don't care about overflow?
A dual 16bit add helps, if the M4 has it? */
dstp[i+x]= rb | g; /* write 32bits or two pixels at a time. */
Mainly just making better use of the BUS by loading 32bits at a time will definitely speed up your routine. Standard 32 bit integer math may work most of the time if you are careful of the ranges and don't overflow the lower 16bit value in to the upper one.
For blitter code, Bit blog and Bit hacks are useful for extraction and manipulation of RGB565 values; whether SIMD or straight Thumb2 code.
Mainly, it is never a simple re-compile to use SIMD. It can be weeks of work to transform an algorithm. If done properly, SIMD speed ups are significant when the algorithm is not memory bandwidth bound and don't involve many conditionals.
Now with the disassembly posted: You'll see that both the scalar and the simd version have 29 instructions, and the SIMD version actually takes more code space. (Scalar is 0x58 -> 0xba for the inner loop, vs SIMD is (0x5c -> 0xc2))
You can see a lot of instructions are used getting the data into the right format for both loops.. maybe you can improve the performance more by working on the RGB bit unpacking/repacking rather than the alpha blend calculation!
Edit: You may also want to consider processing pairs of pixels at a time.