I'm working with 2 memories on my device, DDR and SRAM. The device is running a pre-OS code written in C and ARM.
I would like to conduct a DDR calibration, for that I need to copy a few functions to the SRAM, jump to it, run the calibration code, and go back to DDR when done.
In order to do so, I've modify my scatter file (.lds) so the relevant functions will be mapped to SRAM (instructions, data etc.).
After compiling the image, he is copied into the DDR and start running from there.
My problem is as follows:
How can I locate the starting address and size of these functions on DDR, so I'll be able to copy them to the SRAM and jump there?
Thanks you all in advance!
I am assuming you are talking about ARM architecture:
compile the code with __attribute__((always_inline)); on all related functions and compile with -fpic -fPIC read here for more info.
disassembling it and put it as-is on SRAM, e.g at adress 0xd1001000
reserve {r4-r15} on SRAM.
set pc to 0xd1001000 and sp properly to point the stack.
restore {r4-r15}
jump back to DDR.
You can look at here for a good resource of how to use the right gcc flags.
here is a refernce from uboot - it doesn't jump back to the initial place:
/*
* void relocate_code (addr_sp, gd, addr_moni)
*
* This "function" does not return, instead it continues in RAM
* after relocating the monitor code.
*
*/
.globl relocate_code
relocate_code:
mov r4, r0 /* save addr_sp */
mov r5, r1 /* save addr of gd */
mov r6, r2 /* save addr of destination */
/* Set up the stack */
stack_setup:
mov sp, r4
adr r0, _start
cmp r0, r6
moveq r9, #0 /* no relocation. relocation offset(r9) = 0 */
beq clear_bss /* skip relocation */
mov r1, r6 /* r1 <- scratch for copy_loop */
ldr r3, _image_copy_end_ofs
add r2, r0, r3 /* r2 <- source end address */
copy_loop:
ldmia r0!, {r9-r10} /* copy from source address [r0] */
stmia r1!, {r9-r10} /* copy to target address [r1] */
cmp r0, r2 /* until source end address [r2] */
blo copy_loop
#ifndef CONFIG_SPL_BUILD
/*
* fix .rel.dyn relocations
*/
ldr r0, _TEXT_BASE /* r0 <- Text base */
sub r9, r6, r0 /* r9 <- relocation offset */
ldr r10, _dynsym_start_ofs /* r10 <- sym table ofs */
add r10, r10, r0 /* r10 <- sym table in FLASH */
ldr r2, _rel_dyn_start_ofs /* r2 <- rel dyn start ofs */
add r2, r2, r0 /* r2 <- rel dyn start in FLASH */
ldr r3, _rel_dyn_end_ofs /* r3 <- rel dyn end ofs */
add r3, r3, r0 /* r3 <- rel dyn end in FLASH */
fixloop:
ldr r0, [r2] /* r0 <- location to fix up, IN FLASH! */
add r0, r0, r9 /* r0 <- location to fix up in RAM */
ldr r1, [r2, #4]
and r7, r1, #0xff
cmp r7, #23 /* relative fixup? */
beq fixrel
cmp r7, #2 /* absolute fixup? */
beq fixabs
/* ignore unknown type of fixup */
b fixnext
fixabs:
/* absolute fix: set location to (offset) symbol value */
mov r1, r1, LSR #4 /* r1 <- symbol index in .dynsym */
add r1, r10, r1 /* r1 <- address of symbol in table */
ldr r1, [r1, #4] /* r1 <- symbol value */
add r1, r1, r9 /* r1 <- relocated sym addr */
b fixnext
fixrel:
/* relative fix: increase location by offset */
ldr r1, [r0]
add r1, r1, r9
fixnext:
str r1, [r0]
add r2, r2, #8 /* each rel.dyn entry is 8 bytes */
cmp r2, r3
blo fixloop
b clear_bss
_rel_dyn_start_ofs:
.word __rel_dyn_start - _start
_rel_dyn_end_ofs:
.word __rel_dyn_end - _start
_dynsym_start_ofs:
.word __dynsym_start - _start
#endif /* #ifndef CONFIG_SPL_BUILD */
clear_bss:
#ifdef CONFIG_SPL_BUILD
/* No relocation for SPL */
ldr r0, =__bss_start
ldr r1, =__bss_end__
#else
ldr r0, _bss_start_ofs
ldr r1, _bss_end_ofs
mov r4, r6 /* reloc addr */
add r0, r0, r4
add r1, r1, r4
#endif
mov r2, #0x00000000 /* clear */
clbss_l:str r2, [r0] /* clear loop... */
add r0, r0, #4
cmp r0, r1
bne clbss_l
/*
* We are done. Do not return, instead branch to second part of board
* initialization, now running from RAM.
*/
jump_2_ram:
/*
* If I-cache is enabled invalidate it
*/
#ifndef CONFIG_SYS_ICACHE_OFF
mcr p15, 0, r0, c7, c5, 0 # invalidate icache
mcr p15, 0, r0, c7, c10, 4 # DSB
mcr p15, 0, r0, c7, c5, 4 # ISB
#endif
ldr r0, _board_init_r_ofs
adr r1, _start
add lr, r0, r1
add lr, lr, r9
/* setup parameters for board_init_r */
mov r0, r5 /* gd_t */
mov r1, r6 /* dest_addr */
/* jump to it ... */
mov pc, lr
_board_init_r_ofs:
.word board_init_r - _start
Related
I am writing a program for an Arm Cortex M7 based microcontroller, where I am facing an issue with (I guess) compiler optimization.
I compile using arm-none-eabi-gcc v10.2.1 with -mcpu=cortex-m7 -std=gnu11 -Ofast -mthumb.
I extracted the relevant parts, they are as below. I copy some data from a global variable into a local one, and from there the data is written to a hardware
CRC engine to compute the CRC over the data. Before writing it to the engine, the byte order is to be reversed.
#include <stddef.h>
#include <stdint.h>
#define MEMORY_BARRIER asm volatile("" : : : "memory")
#define __REV(x) __builtin_bswap32(x) // Reverse byte order
// Dummy struct, similiar as it is used in our program
typedef struct {
float n1, n2, n3, dn1, dn2, dn3;
} test_struct_t;
// CRC Engine Registers
typedef struct {
volatile uint32_t DR; // CRC Data register Address offset: 0x00
} CRC_TypeDef;
#define CRC ((CRC_TypeDef*) (0x40000000UL))
// Write data to CRC engine
static inline void CRC_write(uint32_t* data, size_t size)
{
for(int index = 0; index < (size / 4); index++) {
CRC->DR = __REV(*(data + index));
}
}
// Global variable holding some data, similiar as it is used in our program
volatile test_struct_t global_var = { 0 };
int main()
{
test_struct_t local_tmp;
MEMORY_BARRIER;
// Copy the current data into a local variable
local_tmp = global_var;
// Now, write the data to the CRC engine to compute CRC over the data
CRC_write((uint32_t*) &local_tmp, sizeof(local_tmp));
MEMORY_BARRIER;
return 0;
}
This compiles to:
main:
push {r4, r5, lr}
sub sp, sp, #28
ldr r4, .L4
mov ip, sp
ldr r3, [sp] ; load first float into r3
mov lr, #1073741824 ; load address of CRC engine DR register into lr
rev r5, r3 ; reverse byte order of first float and store in r5
ldmia r4!, {r0, r1, r2, r3} ; load first 4 floats into r0-r3
stmia ip!, {r0, r1, r2, r3} ; copy first 4 floats into local variable
ldm r4, {r0, r1} ; load float 5 and 6 into r0 and r1
stm ip, {r0, r1} ; copy float 5 and 6 into local variable
str r5, [lr] ; write reversed bytes from r5 to CRC engine
ldr r3, [sp, #4]
rev r3, r3
str r3, [lr]
ldr r3, [sp, #8]
rev r3, r3
str r3, [lr]
ldr r3, [sp, #12]
rev r3, r3
str r3, [lr]
ldr r3, [sp, #16]
rev r3, r3
str r3, [lr]
ldr r3, [sp, #20]
rev r3, r3
str r3, [lr]
movs r0, #0
add sp, sp, #28
pop {r4, r5, pc}
.L4:
.word .LANCHOR0
global_var:
The problem is now that the first float is loaded and its byte order is reversed (rev r5, r3) before the data has been copied from the global variable (happens with the ldmia/sdmia).
This is clearly not intended, leads to a wrong CRC, and I wonder why the compiler does not recognize this data dependency. Am I doing anything wrong or is there a bug in GCC?
Code in compiler explorer: https://godbolt.org/z/ErWThc5dx
I'm trying to port RTEMS on the SAME54P20A. I managed to make a basic BSP that compile and a basic application.
I compile the application using
./waf configure --rtems=$HOME/rtems/5 --rtems-bsp=arm/same54p20a
./waf
But when running the application on the target, it seems that it never gets to the application task.
I identified the issue by stepping through the assembly code. A call to no existing function is made during __libc_init_array.
Here is the assembly for the function :
00006ae4 <__libc_init_array>:
6ae4: b570 push {r4, r5, r6, lr}
6ae6: 4e0d ldr r6, [pc, #52] ; (6b1c <__libc_init_array+0x38>)
6ae8: 4d0d ldr r5, [pc, #52] ; (6b20 <__libc_init_array+0x3c>)
6aea: 1b76 subs r6, r6, r5
6aec: 10b6 asrs r6, r6, #2
6aee: d006 beq.n 6afe <__libc_init_array+0x1a>
6af0: 2400 movs r4, #0
6af2: 3401 adds r4, #1
6af4: f855 3b04 ldr.w r3, [r5], #4
6af8: 4798 blx r3
6afa: 42a6 cmp r6, r4
6afc: d1f9 bne.n 6af2 <__libc_init_array+0xe>
6afe: 4e09 ldr r6, [pc, #36] ; (6b24 <__libc_init_array+0x40>)
6b00: 4d09 ldr r5, [pc, #36] ; (6b28 <__libc_init_array+0x44>)
6b02: 1b76 subs r6, r6, r5
6b04: f000 fe46 bl 7794 <_init>
6b08: 10b6 asrs r6, r6, #2
6b0a: d006 beq.n 6b1a <__libc_init_array+0x36>
6b0c: 2400 movs r4, #0
6b0e: 3401 adds r4, #1
6b10: f855 3b04 ldr.w r3, [r5], #4
6b14: 4798 blx r3
6b16: 42a6 cmp r6, r4
6b18: d1f9 bne.n 6b0e <__libc_init_array+0x2a>
6b1a: bd70 pop {r4, r5, r6, pc}
6b1c: 00008914 andeq r8, r0, r4, lsl r9
6b20: 00008914 andeq r8, r0, r4, lsl r9
6b24: 00008918 andeq r8, r0, r8, lsl r9
6b28: 00008914 andeq r8, r0, r4, lsl r9
When reaching address 0x6b14 the register r3 contains 0xffffffff which eventually trigger a reset of the board.
Here are the registers states :
r0 0x200016c8 536876744 r1 0xa010001 167837697 │
│r2 0x0 0 r3 0xffffffff -1 │
│r4 0x1 1 r5 0x8918 35096 │
│r6 0x1 1 r7 0x0 0 │
│r8 0x0 0 r9 0x0 0 │
│r10 0x0 0 r11 0x0 0 │
│r12 0x20000400 536871936 sp 0x20005888 0x20005888 │
│lr 0x6b09 27401 pc 0x6b14 0x6b14 <__libc_init_array+48> │
│xPSR 0x1000000 16777216 fpscr 0x0 0 │
│msp 0x20004888 0x20004888 <_ISR_Stack_area_begin+4072> psp 0x20005888 0x20005888 │
│primask 0x0 0 basepri 0x0 0 │
│faultmask 0x0 0 control 0x2 2
I can't figure out the reason of this.
Here is the application code :
/*
* app.c
*/
#include <rtems.h>
#include <stdlib.h>
#include "bsp/same54p20a.h"
#include "bsp/port.h"
rtems_task Init(
rtems_task_argument ignored
)
{
/* Set pin PC21 as output to control LED1 */
PORT_REGS->GROUP[2].PORT_DIR |= PORT_PC21;
/* Turn on the led */
PORT_REGS->GROUP[2].PORT_OUTCLR |= PORT_PC21;
exit( 0 );
}
/*
* init.c
*/
#define CONFIGURE_APPLICATION_NEEDS_ZERO_DRIVER
#define CONFIGURE_APPLICATION_DOES_NOT_NEED_CLOCK_DRIVER
#define CONFIGURE_MAXIMUM_TASKS 1
#define CONFIGURE_RTEMS_INIT_TASKS_TABLE
#define CONFIGURE_INIT
#include <rtems/confdefs.h>
EDIT (After Tom V comment)
Register r3 is loaded by the line
6b10: f855 3b04 ldr.w r3, [r5], #4
So it is loaded with the litteral at address 0x8914 which is (from the dump):
00008914 <__frame_dummy_init_array_entry>:
8914: 000003e9 andeq r0, r0, r9, ror #7
If i'm not mistaken, shouldn't r3 take the value 0x3e9 ?
The value of r3 is loaded from the address in the literal at 0x6b28.
The literal is address 0x00008914. What is at address 0x00008914 in your disassembly is the word 0x000003e9. This means the value of r3 should be 0x000003e9, and it should call the function at 0x000003e8 (the difference of 1 is because this is an interworking address, the 1 in the LSB means that the target uses the thumb instruction set).
If your debbugger shows that the value of r3 that gets called is 0xffffffff, then the value in memory does not match your disassembly dump.
If this memory is flash ROM then something is wrong with your static linking process or your link script. A relocation might not have been applied to the image that you programmed, but then somehow does get applied in your dump.
If this memory is RAM, then maybe something overwrote that value. If you are using a runtime linker (unusual on a microcontroller) then maybe it never applied the relocation in the first place.
In addition to Tom V analysis :
The issue was that some unwanted sections were keep while creating the .hex file.
The solution is to add the switch --strip-unneeded to the command objcopy.
I am trying to figure out how arrays work in ARM assembly, but I am just overwhelmed. I want to initialize an array of size 20 to 0, 1, 2 and so on.
A[0] = 0
A[1] = 1
I can't even figure out how to print what I have to see if I did it correctly. This is what I have so far:
.data
.balign 4 # Memory location divisible by 4
string: .asciz "a[%d] = %d\n"
a: .skip 80 # allocates 20
.text
.global main
.extern printf
main:
push {ip, lr} # return address + dummy register
ldr r1, =a # set r1 to index point of array
mov r2, #0 # index r2 = 0
loop:
cmp r2, #20 # 20 elements?
beq end # Leave loop if 20 elements
add r3, r1, r2, LSL #2 # r3 = r1 + (r2*4)
str r2, [r3] # r3 = r2
add r2, r2, #1 # r2 = r2 + 1
b loop # branch to next loop iteration
print:
push {lr} # store return address
ldr r0, =string # format
bl printf # c printf
pop {pc} # return address
ARM confuses me enough as it is, I don't know what i'm doing wrong. If anyone could help me better understand how this works that would be much appreciated.
This might help down the line for others who want to know about how to allocate memory for array in arm assembly language
here is a simple example to add corresponding array elements and store in the third array.
.global _start
_start:
MOV R0, #5
LDR R1,=first_array # loading the address of first_array[0]
LDR R2,=second_array # loading the address of second_array[0]
LDR R7,=final_array # loading the address of final_array[0]
MOV R3,#5 # len of array
MOV R4,#0 # to store sum
check:
cmp R3,#1 # like condition in for loop for i>1
BNE loop # if R3 is not equal to 1 jump to the loop label
B _exit # else exit
loop:
LDR R5,[R1],#4 # loading the values and storing in registers and base register gets updated automatically R1 = R1 + 4
LDR R6,[R2],#4 # similarly
add R4,R5,R6
STR R4,[R7],#4 # storing the values back to the final array
SUB R3,R3,#1 # decrment value just like i-- in for loop
B check
_exit:
LDR R7,=final_array # before exiting checking the values stored
LDR R1, [R7] # R1 = 60
LDR R2, [R7,#4] # R2 = 80
LDR R3, [R7,#8] # R3 = 100
LDR R4, [R7,#12] # R4 = 120
MOV R7, #1 # terminate syscall, 1
SWI 0 # execute syscall
.data
first_array: .word 10,20,30,40
second_array: .word 50,60,70,80
final_array: .word 0,0,0,0,0
as mentioned your printf has problems, you can use the toolchain itself to see what the calling convention is, and then conform to that.
#include <stdio.h>
unsigned int a,b;
void notmain ( void )
{
printf("a[%d] = %d\n",a,b);
}
giving
00001008 <notmain>:
1008: e59f2010 ldr r2, [pc, #16] ; 1020 <notmain+0x18>
100c: e59f3010 ldr r3, [pc, #16] ; 1024 <notmain+0x1c>
1010: e5921000 ldr r1, [r2]
1014: e59f000c ldr r0, [pc, #12] ; 1028 <notmain+0x20>
1018: e5932000 ldr r2, [r3]
101c: eafffff8 b 1004 <printf>
1020: 0000903c andeq r9, r0, ip, lsr r0
1024: 00009038 andeq r9, r0, r8, lsr r0
1028: 0000102c andeq r1, r0, ip, lsr #32
Disassembly of section .rodata:
0000102c <.rodata>:
102c: 64255b61 strtvs r5, [r5], #-2913 ; 0xb61
1030: 203d205d eorscs r2, sp, sp, asr r0
1034: 000a6425 andeq r6, sl, r5, lsr #8
Disassembly of section .bss:
00009038 <b>:
9038: 00000000 andeq r0, r0, r0
0000903c <a>:
903c:
the calling convention is generally first parameter in r0, second in r1, third in r2 up to r3 then use the stack. There are many exceptions to this, but we can see here that the compiler which normally works fine with a printf call, wants the address of the format string in r0. the value of a then the value of b in r1 and r2 respectively.
Your printf has the string in r0, but a printf call with that format string needs three parameters.
The code above used a tail optimization and branch to printf rather than called it and returned from. The arm convention these days prefers the stack to be aligned on 64 bit boundaries, so you can put some register, you dont necessarily care to preserve on the push/pop in order to keep that alignment
push {r3,lr}
...
pop {r3,pc}
It certainly wont hurt you to do this, it may or may not hurt to not do it depending on what downstream assumes.
Your setup and loop should function just fine assuming that r1 (label a) is a word aligned address. Which it may or may not be if you mess with your string, should put a first then the string or put another alignment statement before a to insure the array is aligned. There are instruction set features that can simply the code, but it appears functional as is.
I'm still learning ARM and I couldn't understand what this function is supposed to do.
Can you guys help me out explaining how it works?
.text:0006379C EXPORT _nativeD2AB
.text:0006379C _nativeD2AB
.text:0006379C var_28 = -0x28
.text:0006379C
.text:0006379C STMFD SP!, {R4-R11,LR}
.text:000637A0 SUB SP, SP, #0x3A4
.text:000637A4 STMFA SP, {R0-R3}
.text:000637A8 LDR R0, =(_GLOBAL_OFFSET_ - 0x637B8)
.text:000637AC LDR R1, =(__stack_chk - 0x134EAC)
.text:000637B0 ADD R0, PC, R0 ; _GLOBAL_OFFSET_
.text:000637B4 LDR R0, [R1,R0] ; __stack_chk
.text:000637B8 LDR R0, [R0]
.text:000637BC STR R0, [SP,#0x3C8+var_28]
.text:000637C0 MOV R0, #1
.text:000637C4 ADR R1, sub_637D0
.text:000637C8 MUL R0, R1, R0
.text:000637CC MOV PC, R0
.text:000637CC ; End of function _nativeD2AB
.
.got:00134EAC _GLOBAL_OFFSET_TABLE_ DCD 0
.
.got:00134B0C AREA .got, DATA
.got:00134B0C __stack_chk DCD __stack_chkA
.
Found the rest of the function. If I understood some of it correctly, it seems to be scrambling the data, though that may be just a wild guess:
.text:000637D0 sub_637D0
.text:000637D0 MOV R0, #1
.text:000637D4 ADR R1, sub_637E0
.text:000637D8 MUL R0, R1, R0
.text:000637DC MOV PC, R0
.text:000637DC ; End of function sub_637D0
.text:000637E0 sub_637E0
.text:000637E0
.text:000637E0 arg_14 = 0x14
.text:000637E0
.text:000637E0 STR R2, [SP,#arg_14]
.text:000637E4 MOV R0, #1
.text:000637E8 ADR R1, loc_637F4
.text:000637EC MUL R0, R1, R0
.text:000637F0 MOV PC, R0
.text:000637F0 ; End of function sub_637E0
.text:000637F4 loc_637F4
.text:000637F4 STR R2, [SP,#0x28]
.text:000637F8 STR R0, [SP,#0x18]
.text:000637FC MOV R1, #2
.text:00063800 STR R2, [SP,#0x1C]
.text:00063804 STR R0, [SP,#0x20]
.text:00063808 STR R0, [SP,#0x24]
The function has several parts:
Store registers to the stacj and reserve space (Strangely, not restored)
Load to R0 the address of GLOBAL_OFFSET (Once added with PC), to actually access __stack_chk (When added to GLOBAL_OFFSET). This is done in a very strange way.
Load the data at __stack_chk and store it in the stack
Load to R0 the value of sub_637D0, by doing a multiplication by 1. This is the value returned by the function.
So in my opinion, this does not seem to do anything useful...
I'm starting to write a toy OS for the BeagleBone Black, which uses an ARM Cortex-A8-based TI Sitara AM3359 SoC and the U-Boot bootloader. I've got a simple standalone hello world app writing to UART0 that I can load through U-Boot so far, and now I'm trying to move on to interrupt handlers, but I can't get SWI to do anything but hang the device.
According to the AM335x TRM (starting on page 4099, if you're interested), the interrupt vector table is mapped in ROM at 0x20000. The ROM SWI handler branches to RAM at 0x4030ce08, which branches to the address stored at 0x4030ce28. (Initially, this is a unique dead loop at 0x20084.)
My code sets up all the ARM processor modes' SP to their own areas at the top of RAM, and enables interrupts in the CPSR, then executes an SWI instruction, which always hangs. (Perhaps jumping to some dead-loop instruction?) I've looked at a bunch of samples, and read whatever documentation I could find, and I don't see what I'm missing.
Currently my only interaction with the board is over serial connection on UART0 with my linux box. U-Boot initializes UART0, and allows loading of the binary over the serial connection.
Here's the relevant assembly:
.arm
.section ".text.boot"
.equ usr_mode, 0x10
.equ fiq_mode, 0x11
.equ irq_mode, 0x12
.equ svc_mode, 0x13
.equ abt_mode, 0x17
.equ und_mode, 0x1b
.equ sys_mode, 0x1f
.equ swi_vector, 0x4030ce28
.equ rom_swi_b_addr, 0x20008
.equ rom_swi_addr, 0x20028
.equ ram_swi_b_addr, 0x4030CE08
.equ ram_swi_addr, 0x4030CE28
.macro setup_mode mode, stackpointer
mrs r0, cpsr
mov r1, r0
and r1, r1, #0x1f
bic r0, r0, #0x1f
orr r0, r0, #\mode
msr cpsr_csfx, r0
ldr sp, =\stackpointer
bic r0, r0, #0x1f
orr r0, r0, r1
msr cpsr_csfx, r0
.endm
.macro disable_interrupts
mrs r0, cpsr
orr r0, r0, #0x80
msr cpsr_c, r0
.endm
.macro enable_interrupts
mrs r0, cpsr
bic r0, r0, #0x80
msr cpsr_c, r0
.endm
.global _start
_start:
// Initial SP
ldr r3, =_C_STACK_TOP
mov sp, r3
// Set up all the modes' stacks
setup_mode fiq_mode, _FIQ_STACK_TOP
setup_mode irq_mode, _IRQ_STACK_TOP
setup_mode svc_mode, _SVC_STACK_TOP
setup_mode abt_mode, _ABT_STACK_TOP
setup_mode und_mode, _UND_STACK_TOP
setup_mode sys_mode, _C_STACK_TOP
// Clear out BSS
ldr r0, =_bss_start
ldr r1, =_bss_end
mov r5, #0
mov r6, #0
mov r7, #0
mov r8, #0
b _clear_bss_check$
_clear_bss$:
stmia r0!, {r5-r8}
_clear_bss_check$:
cmp r0, r1
blo _clear_bss$
// Load our SWI handler's address into
// the vector table
ldr r0, =_swi_handler
ldr r1, =swi_vector
str r0, [r1]
// Debug-print out these SWI addresses
ldr r0, =rom_swi_b_addr
bl print_mem
ldr r0, =rom_swi_addr
bl print_mem
ldr r0, =ram_swi_b_addr
bl print_mem
ldr r0, =ram_swi_addr
bl print_mem
enable_interrupts
swi_call$:
swi #0xCC
bl kernel_main
b _reset
.global _swi_handler
_swi_handler:
// Get the SWI parameter into r0
ldr r0, [lr, #-4]
bic r0, r0, #0xff000000
// Save lr onto the stack
stmfd sp!, {lr}
bl print_uint32
ldmfd sp!, {pc}
Those debugging prints produce the expected values:
00020008: e59ff018
00020028: 4030ce08
4030ce08: e59ff018
4030ce28: 80200164
(According to objdump, 0x80200164 is indeed _swi_handler. 0xe59ff018 is the instruction "ldr pc, [pc, #0x20]".)
What am I missing? It seems like this should work.
The firmware on the board changes the ARM execution mode and the locations of
the vector tables associated with the various modes. In my own case (a bare-metal
snippet code executed at Privilege Level 1 and launched by BBB's uBoot) the active vector table is at address 0x9f74b000.
In general, you might use something like the following function to locate the
active vector table:
static inline unsigned int *get_vectors_address(void)
{
unsigned int v;
/* read SCTLR */
__asm__ __volatile__("mrc p15, 0, %0, c1, c0, 0\n"
: "=r" (v) : : );
if (v & (1<<13))
return (unsigned int *) 0xffff0000;
/* read VBAR */
__asm__ __volatile__("mrc p15, 0, %0, c12, c0, 0\n"
: "=r" (v) : : );
return (unsigned int *) v;
}
change
ldr r0, [lr, #-4]
bic r0, r0, #0xff000000
stmfd sp!, {lr}
bl print_uint32
ldmfd sp!, {pc}
to
stmfd sp!, {r0-r3, r12, lr}
ldr r0, [lr, #-4]
bic r0, r0, #0xff000000
bl print_uint32
ldmfd sp!, {r0-r3, r12, pc}^
PS: You don't restore SPSR into CPSR of interrupted task AND you also scratch registers which are not banked by the cpu mode switch.