Related
I am currently trying to get backtrace based on stack pointer and link register on ARM64 device using C program.
Below is example of objdump
bar() calls foo() with 240444: ebfffd68 bl 23f9ec <foo##Base>
I can get link register (lr) and from that getting 23f9ec, save it to backtrace list as last routine.
My question: From below assembly code with current lr 0023f9ec <foo##Base>:, how to calculate to get previous routine with lr is 0023fe14 <bar##Base> using C language?
here is my implementation, but getting wrong previous lr
int bt(void** backtrace, int max_size) {
unsigned long* sp = __get_SP();
unsigned long* ra = __get_LR();
int* funcbase = (int*)(int)&bt;
int spofft = (short)((*funcbase));
sp = (char*)sp-spofft;
unsigned long* wra = (unsigned long*)ra;
int spofft;
int depth = 0;
while(ra) {
wra = ra;
while((*wra >> 16) != 0xe92d) {
wra--;
}
if(wra == 0)
return 0;
spofft = (short)(*wra & 0xffff);
if(depth < max_size)
backtrace[depth] = ra;
else
break;
ra =(unsigned long *)((unsigned long)ra + spofft);
sp =(unsigned long *)((unsigned long)sp + spofft);
depth++;
}
return 1;
}
0023f9ec <foo##Base>:
23f9ec: e92d42f3 push {r0, r1, r4, r5, r6, r7, r9, lr}
23f9f0: e1a09001 mov r9, r1
23f9f4: e1a07000 mov r7, r0
23f9f8: ebfffff9 bl 23f9e4 <__get_SP##Base>
23f9fc: e59f4060 ldr r4, [pc, #96] ; 23fa64 <foo##Base+0x78>
23fa00: e08f4004 add r4, pc, r4
23fa04: e1a05000 mov r5, r0
23fa08: ebfffff3 bl 23f9dc <__get_LR##Base>
23fa0c: e59f3054 ldr r3, [pc, #84] ; 23fa68 <foo##Base+0x7c>
23fa10: e3002256 movw r2, #598 ; 0x256
23fa14: e59f1050 ldr r1, [pc, #80] ; 23fa6c <foo##Base+0x80>
23fa18: e7943003 ldr r3, [r4, r3]
23fa1c: e08f1001 add r1, pc, r1
23fa20: e5934000 ldr r4, [r3]
23fa24: e1a03005 mov r3, r5
23fa28: e6bf4074 sxth r4, r4
23fa2c: e58d4004 str r4, [sp, #4]
23fa30: e1a06000 mov r6, r0
23fa34: e58d0000 str r0, [sp]
23fa38: e59f0030 ldr r0, [pc, #48] ; 23fa70 <foo##Base+0x84>
23fa3c: e08f0000 add r0, pc, r0
23fa40: ebfd456d bl 190ffc <printf#plt>
23fa44: e1a03009 mov r3, r9
23fa48: e1a02007 mov r2, r7
23fa4c: e1a01006 mov r1, r6
23fa50: e0640005 rsb r0, r4, r5
23fa54: ebffff70 bl 23f81c <get_prev_sp_ra2##Base>
23fa58: e3a00000 mov r0, #0
23fa5c: e28dd008 add sp, sp, #8
23fa60: e8bd82f0 pop {r4, r5, r6, r7, r9, pc}
23fa64: 003d5be0 eorseq r5, sp, r0, ror #23
23fa68: 000026c8 andeq r2, r0, r8, asr #13
23fa6c: 002b7ba6 eoreq r7, fp, r6, lsr #23
23fa70: 002b73e5 eoreq r7, fp, r5, ror #7
0023fe14 <bar##Base>:
23fe14: e92d4ef0 push {r4, r5, r6, r7, r9, sl, fp, lr}
23fe18: e24dde16 sub sp, sp, #352 ; 0x160
23fe1c: e59f76a8 ldr r7, [pc, #1704] ; 2404cc <bar##Base+0x6b8>
23fe20: e1a04000 mov r4, r0
23fe24: e59f66a4 ldr r6, [pc, #1700] ; 2404d0 <bar##Base+0x6bc>
23fe28: e1a03000 mov r3, r0
23fe2c: e59f26a0 ldr r2, [pc, #1696] ; 2404d4 <bar##Base+0x6c0>
23fe30: e08f7007 add r7, pc, r7
23fe34: e08f6006 add r6, pc, r6
23fe38: e3a00000 mov r0, #0
23fe3c: e08f2002 add r2, pc, r2
23fe40: e1a05001 mov r5, r1
23fe44: e3a01003 mov r1, #3
23fe48: e59f9688 ldr r9, [pc, #1672] ; 2404d8 <bar##Base+0x6c4>
.....................................................................
24043c: e3a0100f mov r1, #15
240440: e1a0000a mov r0, sl
240444: ebfffd68 bl 23f9ec <foo##Base>
240448: e59f2108 ldr r2, [pc, #264] ; 240558 <bar##Base+0x744>
24044c: e3a01003 mov r1, #3
240450: e08f2002 add r2, pc, r2
240454: e1a05000 mov r5, r0
240458: e1a03000 mov r3, r0
24045c: e3a00000 mov r0, #0
I don't think there's an easy way to do this.
Normally the register ABI of any operating system contains a "frame pointer" register. For example, on Apple's armv7 ABI, this is r7:
0x10006fc0 b0b5 push {r4, r5, r7, lr}
0x10006fc2 02af add r7, sp, 8
0x10006fc4 0448 ldr r0, [0x10006fd8]
0x10006fc6 d0e90c45 ldrd r4, r5, [r0, 0x30]
0x10006fca 0020 movs r0, 0
0x10006fcc fff7a6ff bl 0x10006f1c
0x10006fd0 0019 adds r0, r0, r4
0x10006fd2 6941 adcs r1, r5
0x10006fd4 b0bd pop {r4, r5, r7, pc}
If you dereference r7 there, you get to a pair of pointers, the second of which is lr, and the first of which is the r7 of the calling function, allowing you to repeat this process until you reach the bottom of the stack.
Judging by the assembly you posted, the codebase you're looking at doesn't have that. This means that the only way to obtain the return address is the same way that the code itself does: step forward through each instruction and parse/interpret them until you reach something that loads into pc. This is of course imperfect, since there may be functions in your call stack that do not ever return, but there's not much you can do about that.
It may be tempting to search backwards instead, and while you can do a heuristic approach and probably reach quite reasonable results with it, that is even less reliable than searching forward, since you have absolutely no way of telling whether you arrived at address X by stepping forward from the previous instruction or by explicitly jumping there from somewhere else.
I am doing work on an ARM Cortex-M microcontroller using the arm-none-eabi version of GCC. I am also using -fnostdlib and -fnostdin.
In my code I am using memcpy and strlen. Both of these functions are built-in functions as per the GCC manual. When I use these function as is or as __buitin_..., I get undefined reference to ....
Why is GCC not generating the code as expected?
builtins are not real functions. Compiler is free to replace them with the "normal" function call. ARM b as in this example:
void *m(void *a, void *b, size_t size)
{
return __builtin_memcpy(a,b,size);
}
void *m1(void *a, void *b)
{
return __builtin_memcpy(a,b,16);
}
void *m2(void *a, void *b)
{
return __builtin_memcpy(a,b,200);
}
volatile int a[1000],b[10000], c[1000];
int main(void)
{
m((void *)a,(void *)b,16);
__asm(":::m");
m((void *)a,(void *)c,400);
}
The resulting code will depend on the ARM architecture (if not aligned accesses are legal).
CORTEX-M4 CORTEX-M0
m: m:
b memcpy push {r4, lr}
m1: bl memcpy
push {r4, r5} pop {r4, pc}
ldr r5, [r1] # unaligned m1:
ldr r4, [r1, #4] # unaligned push {r4, lr}
ldr r2, [r1, #8] # unaligned movs r2, #16
ldr r1, [r1, #12] # unaligned bl memcpy
str r1, [r0, #12] # unaligned pop {r4, pc}
str r5, [r0] # unaligned m2:
str r4, [r0, #4] # unaligned push {r4, lr}
str r2, [r0, #8] # unaligned movs r2, #200
pop {r4, r5} bl memcpy
bx lr pop {r4, pc}
m2: main:
movs r2, #200 ldr r0, .L6
b memcpy push {r4, r5, r6, lr}
main: movs r2, r0
push {r4, lr} ldr r3, .L6+4
ldr r3, .L8 ldmia r3!, {r1, r4, r5}
ldr r4, .L8+4 stmia r2!, {r1, r4, r5}
ldm r3, {r0, r1, r2, r3} ldr r3, [r3]
stm r4, {r0, r1, r2, r3} str r3, [r2]
:::m :::m
mov r2, #400 movs r2, #200
mov r0, r4 ldr r1, .L6+8
ldr r1, .L8+8 lsls r2, r2, #1
bl memcpy bl memcpy
movs r0, #0 movs r0, #0
pop {r4, pc} pop {r4, r5, r6, pc}
.L8: .L6:
.word b .word a
.word a .word b
.word c .word c
https://godbolt.org/z/fh68cv
I am using IAR to compile routines, but run error on ARM A7; then i got the question below when i open the .lst file generated by IAR.
It is a ISR, first push {r3, r4, r5, lr}, but POP {r0, r4, r5, lr} when return, the R0 value is changed to the value of R3 before push. So R0 is wrong when returned from irqHandler which lead to error in follow routines.
why ?
void irqHandler(void)
{
878: e92d4038 push {r3, r4, r5, lr}
volatile u32 *pt = (u32 *)AM_INTC_BASE;
87c: e3a044b0 mov r4, #176, 8 ; 0xb0000000
u32 id_spin;
id_spin = *(pt+0x200c/4) & 0x3ff;
880: e302000c movw r0, #8204 ; 0x200c
884: e7900004 ldr r0, [r0, r4]
888: e1b00b00 lsls r0, r0, #22
88c: e1b00b20 lsrs r0, r0, #22
890: e1b05000 movs r5, r0
if(id_spin<32)
894: e3550020 cmp r5, #32
898: 2a000000 bcs 8a0 <irqHandler+0x28>
{
#ifdef WHOLECHIPSIM
print("id_spid<32 error...\r\n",0);
#endif
while(1);
89c: eafffffe b 89c <irqHandler+0x24>
}
else
{
(pFuncIrq[id_spin-32])();
8a0: e59f0010 ldr r0, [pc, #16] ; 8b8 <.text_8>
8a4: e1b01105 lsls r1, r5, #2
8a8: e0910000 adds r0, r1, r0
8ac: e5100080 ldr r0, [r0, #-128] ; 0x80
8b0: e12fff30 blx r0
}
}
8b4: e8bd8031 pop {r0, r4, r5, pc}
The abi requires a 64 bit aligned stack, so the push of r3 simply facilitates that. Could have chosen any register not already specified. Likewise on the pop they need to clean up the stack the function is prototyped as void so the return (r0) is a dont care and r0-r3 are not expected to be preserved so no reason to match the r3 on each end nor match an r0 on each end.
had they chose a register numbered above r3 (r6 for example) on the push then that would have needed to be matched on the pop. Otherwise the pop would have to be one of r0-r3 to not trash a non-volatile register. (couldnt push r3 then pop r6 that would trash r6)
It does not matter as R0-R3, R12, LR, PC, xPSR are saved on the stack automaticly when the hardware invokes the interrupt vector routine. When bx, ldm, pop, or ldr with PC is invoked hardware executes interrupt routine exit poping those registers.
Do not check your compiler. It knows what it does. Check tour wrong logic - especially printing strings in the interrupt handler.
assemble code with the keyword __irq __arm is below:
__irq __arm void irqHandler(void)
{
878: e24ee004 sub lr, lr, #4
87c: e92d503f push {r0, r1, r2, r3, r4, r5, ip, lr}
volatile u32 *pt = (u32 *)AM_INTC_BASE;
880: e3a044b0 mov r4, #176, 8 ; 0xb0000000
u32 id_spin;
id_spin = *(pt+0x200c/4) & 0x3ff;
884: e302000c movw r0, #8204 ; 0x200c
888: e7900004 ldr r0, [r0, r4]
88c: e1b00b00 lsls r0, r0, #22
890: e1b00b20 lsrs r0, r0, #22
894: e1b05000 movs r5, r0
if(id_spin<32)
898: e3550020 cmp r5, #32
89c: 2a000000 bcs 8a4 <irqHandler+0x2c>
{
#ifdef WHOLECHIPSIM
print("id_spid<32 error...\r\n",0);
#endif
while(1);
8a0: eafffffe b 8a0 <irqHandler+0x28>
}
else
{
(pFuncIrq[id_spin-32])();
8a4: e59f0010 ldr r0, [pc, #16] ; 8bc <.text_8>
8a8: e1b01105 lsls r1, r5, #2
8ac: e0910000 adds r0, r1, r0
8b0: e5100080 ldr r0, [r0, #-128] ; 0x80
8b4: e12fff30 blx r0
}
}
8b8: e8fd903f ldm sp!, {r0, r1, r2, r3, r4, r5, ip, pc}^
Cortex A7 PUSH log ,it just push 7 register, so 32bit aligned is ok
follow link is the log info:
http://img.blog.csdn.net/20170819120758443?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvcmFpbmJvd2JpcmRzX2Flcw==/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center
I'm working on a context switch in ARM v6 assembly. I posted about writing the switch in C, but assembly seems to be safer and more reliable. I've spent a while checking all the offsets and being careful not to delete data from registers, but the context switch just doesn't seem to work properly. I have set up and tested timer interrupts without switching context.
Here's my code:
interrupt_asm:
//store basic interrupt stuff
sub lr, lr, #4
//call the interrupt vector
push { r0-r12 }
mov r0, lr # Pass old pc
bl interrupt_vector # C function
pop { r0-r12 }
# save_current_thread:
//remember r1 so you can use it for r0
push {r1}
mov r1, r0 //store r0 so it can be restored
push {r2, r3}
bl get_current_thread //r0 now has the address of CURRENT_THREAD
pop {r2, r3}
add r0, r0, #4 // r0 = &CURRENT_THREAD.r0
str r1, [r0] // save what the r0 was
pop {r1} // restore r1
add r0, r0, #4 // r0 = &CURRENT_THREAD.r1
str r1, [r0] // save r1
//r2
add r0, r0, #4
str r2, [r0]
//r3
add r0, r0, #4
str r3, [r0]
//r4
add r0, r0, #4
str r4, [r0]
//r5
add r0, r0, #4
str r5, [r0]
//r6
add r0, r0, #4
str r6, [r0]
//r7
add r0, r0, #4
str r7, [r0]
//r8
add r0, r0, #4
str r8, [r0]
//r9
add r0, r0, #4
str r9, [r0]
//r10
add r0, r0, #4
str r10, [r0]
//r11
add r0, r0, #4
str r11, [r0]
//r12
add r0, r0, #4
str r12, [r0]
//store SVC sp, lr, and pc
mrs r1, cpsr
bic r1, r1, #0x1F
orr r1, r1, #0x13
msr cpsr_c, r1
//sp
add r0, r0, #4
str sp, [r0]
//lr
add r0, r0, #4
str lr, [r0]
//back to IRQ land
mrs r1, cpsr
bic r1, r1, #0x1F
orr r1, r1, #0x12
msr cpsr_c, r1
//pc THIS NEEDS TO BE LR
add r0, r0, #4
str lr, [r0]
//cpsr
add r0, r0, #4
mrs r1, cpsr
str r1, [r0]
//spsr
add r0, r0, #4
mrs r1, spsr
str r1, [r0]
push {r2, r3}
bl increment_thread //r0 now has the address of our next thread
pop {r2, r3}
# restore_thread:
//were pushing in order, so from the bottom up our stack is: r0, ... r12, sp, lr, pc, spsr
add r0, r0, #4 //r0 = &CURRENT_THREAD.r0
ldr r1, [r0] //r1 = thread.r0
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r1
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r2
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r3
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r4
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r5
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r6
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r7
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r8
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r9
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r10
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r11
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.r12
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.sp
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.lr
push {r1}
add r0, r0, #4
ldr r1, [r0] //r1 = thread.pc
push {r1}
add r0, r0, #8 //skip cpsr - agreed
ldr r1, [r0] //r1 = thread.spsr
push {r1}
//Our stack now looks like spsr, pc, lr, sp, r12, ... r0 (in order of popping)
pop {r1} //this was the spsr - SPSR
pop {lr} //this was the pc - PC (from thread)
pop {r2} //this was the lr - LR (from thread)
pop {r3} //this was the sp - SP (from thread)
//switch to SVC
mrs r0, cpsr
bic r0, r0, #0x1F
orr r0, r0, #0x13
msr cpsr_c, r0
msr spsr, r1 //restore spsr
mov lr, r2 //restore lr to be old lr
mov sp, r3 //restore sp
//switch to IRQ
// mrs r0, cpsr
// bic r0, r0, #0x1F
// orr r0, r0, #0x12
// msr cpsr_c, r0
cps #0x12
//our stack now just has the normal registers in it. Restore them
pop {r12}
pop {r11}
pop {r10}
pop {r9}
pop {r8}
pop {r7}
pop {r6}
pop {r5}
pop {r4}
pop {r3}
pop {r2}
pop {r2}
pop {r1}
pop {r0}
push {lr}
ldm sp!, {pc}^
A thread looks like this:
typedef struct __attribute__((packed, aligned(8))) {
void (*run)();
unsigned r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, sp, lr, pc;
unsigned cpsr, spsr;
unsigned id;
unsigned priority;
thread_status status;
wait_event wait_status;
} thread_t;
Do you have any advice about what's going on? An interrupt occurs and it never goes back to a new thread. I've debugged with GDB simulator but can't seem to nail down the issue.
The C function interrupt_vector just does this:
void interrupt_vector(unsigned pc) {
CURRENT_THREAD.pc = pc;
printf(" interrupt vector (pc = 0x%08x | thread.r0 = 0x%08x)\n", pc, CURRENT_THREAD.r0);
armtimer_clear_interrupt();
}
My other C functions are literally one line, and I've looked at their disassembly:
void increment_thread(){
// Not trying to actually increment yet
__asm__ volatile("mov r0, %0" : : "r" ((unsigned) &CURRENT_THREAD));
}
void get_current_thread(){
__asm__ volatile("mov r0, %0" : : "r" ((unsigned) &CURRENT_THREAD));
}
I'm still learning ARM and I couldn't understand what this function is supposed to do.
Can you guys help me out explaining how it works?
.text:0006379C EXPORT _nativeD2AB
.text:0006379C _nativeD2AB
.text:0006379C var_28 = -0x28
.text:0006379C
.text:0006379C STMFD SP!, {R4-R11,LR}
.text:000637A0 SUB SP, SP, #0x3A4
.text:000637A4 STMFA SP, {R0-R3}
.text:000637A8 LDR R0, =(_GLOBAL_OFFSET_ - 0x637B8)
.text:000637AC LDR R1, =(__stack_chk - 0x134EAC)
.text:000637B0 ADD R0, PC, R0 ; _GLOBAL_OFFSET_
.text:000637B4 LDR R0, [R1,R0] ; __stack_chk
.text:000637B8 LDR R0, [R0]
.text:000637BC STR R0, [SP,#0x3C8+var_28]
.text:000637C0 MOV R0, #1
.text:000637C4 ADR R1, sub_637D0
.text:000637C8 MUL R0, R1, R0
.text:000637CC MOV PC, R0
.text:000637CC ; End of function _nativeD2AB
.
.got:00134EAC _GLOBAL_OFFSET_TABLE_ DCD 0
.
.got:00134B0C AREA .got, DATA
.got:00134B0C __stack_chk DCD __stack_chkA
.
Found the rest of the function. If I understood some of it correctly, it seems to be scrambling the data, though that may be just a wild guess:
.text:000637D0 sub_637D0
.text:000637D0 MOV R0, #1
.text:000637D4 ADR R1, sub_637E0
.text:000637D8 MUL R0, R1, R0
.text:000637DC MOV PC, R0
.text:000637DC ; End of function sub_637D0
.text:000637E0 sub_637E0
.text:000637E0
.text:000637E0 arg_14 = 0x14
.text:000637E0
.text:000637E0 STR R2, [SP,#arg_14]
.text:000637E4 MOV R0, #1
.text:000637E8 ADR R1, loc_637F4
.text:000637EC MUL R0, R1, R0
.text:000637F0 MOV PC, R0
.text:000637F0 ; End of function sub_637E0
.text:000637F4 loc_637F4
.text:000637F4 STR R2, [SP,#0x28]
.text:000637F8 STR R0, [SP,#0x18]
.text:000637FC MOV R1, #2
.text:00063800 STR R2, [SP,#0x1C]
.text:00063804 STR R0, [SP,#0x20]
.text:00063808 STR R0, [SP,#0x24]
The function has several parts:
Store registers to the stacj and reserve space (Strangely, not restored)
Load to R0 the address of GLOBAL_OFFSET (Once added with PC), to actually access __stack_chk (When added to GLOBAL_OFFSET). This is done in a very strange way.
Load the data at __stack_chk and store it in the stack
Load to R0 the value of sub_637D0, by doing a multiplication by 1. This is the value returned by the function.
So in my opinion, this does not seem to do anything useful...