Does arm-none-eabi-gcc produce slower code than Keil uVision

Does arm-none-eabi-gcc produce slower code than Keil uVision - c

I have a simple blinking led program running on STM32f103C8 (without initialization boilerplate):
void soft_delay(void) {
for (volatile uint32_t i=0; i<2000000; ++i) { }
}
uint32_t iters = 0;
while (1)
{
LL_GPIO_TogglePin(LED_GPIO_Port, LED_Pin);
soft_delay();
++iters;
}
It was compiled with both Keil uVision v.5 (default compiler) and CLion using arm-none-eabi-gcc compiler.
The surprise is that arm-none-eabi-gcc program runs 50% slower in Release mode (-O2 -flto) and 100% slower in Debug mode.
I suspect 3 reasons:
Keil over-optimization (unlikely, because the code is very simple)
arm-none-eabi-gcc under-optimization due to wrong compiler flags (I use CLion Embedded plugins` CMakeLists.txt)
A bug in the initialization so that chip has lower clock frequency with arm-none-eabi-gcc (to be investigated)
I have not yet dived into the jungles of optimization and disassembling,
I hope that there are many experienced embedded developers who already encountered this issue and have the answer.
UPDATE 1
Playing around with different optimization levels of Keil ArmCC, I see
how it affects the generated code. And it affects drastically, especially execution time. Here are the benchmarks and disassembly of soft_delay() function for each optimization level (RAM and Flash amounts include initialization code).
-O0: RAM: 1032, Flash: 1444, Execution Time (20 iterations): 18.7 sec
soft_delay PROC
PUSH {r3,lr}
MOVS r0,#0
STR r0,[sp,#0]
B |L6.14|
|L6.8|
LDR r0,[sp,#0]
ADDS r0,r0,#1
STR r0,[sp,#0]
|L6.14|
LDR r1,|L6.24|
LDR r0,[sp,#0]
CMP r0,r1
BCC |L6.8|
POP {r3,pc}
ENDP
-O1: RAM: 1032, Flash: 1216, Execution Time (20 iterations): 13.3 sec
soft_delay PROC
PUSH {r3,lr}
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,|L6.24|
B |L6.16|
|L6.10|
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
|L6.16|
LDR r1,[sp,#0]
CMP r1,r0
BCC |L6.10|
POP {r3,pc}
ENDP
-O2 -Otime: RAM: 1032, Flash: 1136, Execution Time (20 iterations): 9.8 sec
soft_delay PROC
SUB sp,sp,#4
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,|L4.24|
|L4.8|
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
CMP r1,r0
BCC |L4.8|
ADD sp,sp,#4
BX lr
ENDP
-O3: RAM: 1032, Flash: 1176, Execution Time (20 iterations): 9.9 sec
soft_delay PROC
PUSH {r3,lr}
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,|L5.20|
|L5.8|
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
CMP r1,r0
BCC |L5.8|
POP {r3,pc}
ENDP
TODO: benchmarking and disassembly for arm-none-eabi-gcc.

This second answer is a demonstration of the kinds of things that would affect the performance results the OP may be seeing and examples of to possibly test for those STM32F103C8 blue pill.
Complete source code:
flash.ld
MEMORY
{
rom : ORIGIN = 0x08000000, LENGTH = 0x1000
ram : ORIGIN = 0x20000000, LENGTH = 0x1000
}
SECTIONS
{
.text : { *(.text*) } > rom
.rodata : { *(.rodata*) } > rom
.bss : { *(.bss*) } > ram
}
flash.s
.cpu cortex-m0
.thumb
.thumb_func
.global _start
_start:
stacktop: .word 0x20001000
.word reset
.word hang
.word hang
.thumb_func
reset:
bl notmain
b hang
.thumb_func
hang: b .
.align
.thumb_func
.globl PUT32
PUT32:
str r1,[r0]
bx lr
.thumb_func
.globl GET32
GET32:
ldr r0,[r0]
bx lr
.thumb_func
.globl dummy
dummy:
bx lr
test.s
.cpu cortex-m0
.thumb
.word 0,0,0
.word 0,0,0,0
.thumb_func
.globl TEST
TEST:
bx lr
notmain.c
//PA9 TX
//PA10 RX
void PUT32 ( unsigned int, unsigned int );
unsigned int GET32 ( unsigned int );
void dummy ( unsigned int );
#define USART1_BASE 0x40013800
#define USART1_SR (USART1_BASE+0x00)
#define USART1_DR (USART1_BASE+0x04)
#define USART1_BRR (USART1_BASE+0x08)
#define USART1_CR1 (USART1_BASE+0x0C)
#define USART1_CR2 (USART1_BASE+0x10)
#define USART1_CR3 (USART1_BASE+0x14)
//#define USART1_GTPR (USART1_BASE+0x18)
#define GPIOA_BASE 0x40010800
#define GPIOA_CRH (GPIOA_BASE+0x04)
#define RCC_BASE 0x40021000
#define RCC_APB2ENR (RCC_BASE+0x18)
#define STK_CSR 0xE000E010
#define STK_RVR 0xE000E014
#define STK_CVR 0xE000E018
#define STK_MASK 0x00FFFFFF
static void uart_init ( void )
{
//assuming 8MHz clock, 115200 8N1
unsigned int ra;
ra=GET32(RCC_APB2ENR);
ra|=1<<2; //GPIOA
ra|=1<<14; //USART1
PUT32(RCC_APB2ENR,ra);
//pa9 TX alternate function output push-pull
//pa10 RX configure as input floating
ra=GET32(GPIOA_CRH);
ra&=~(0xFF0);
ra|=0x490;
PUT32(GPIOA_CRH,ra);
PUT32(USART1_CR1,0x2000);
PUT32(USART1_CR2,0x0000);
PUT32(USART1_CR3,0x0000);
//8000000/16 = 500000
//500000/115200 = 4.34
//4 and 5/16 = 4.3125
//4.3125 * 16 * 115200 = 7948800
PUT32(USART1_BRR,0x0045);
PUT32(USART1_CR1,0x200C);
}
static void uart_putc ( unsigned int c )
{
while(1)
{
if(GET32(USART1_SR)&0x80) break;
}
PUT32(USART1_DR,c);
}
static void hexstrings ( unsigned int d )
{
//unsigned int ra;
unsigned int rb;
unsigned int rc;
rb=32;
while(1)
{
rb-=4;
rc=(d>>rb)&0xF;
if(rc>9) rc+=0x37; else rc+=0x30;
uart_putc(rc);
if(rb==0) break;
}
uart_putc(0x20);
}
static void hexstring ( unsigned int d )
{
hexstrings(d);
uart_putc(0x0D);
uart_putc(0x0A);
}
void soft_delay(void) {
for (volatile unsigned int i=0; i<2000000; ++i) { }
}
int notmain ( void )
{
PUT32(STK_CSR,4);
PUT32(STK_RVR,0x00FFFFFF);
PUT32(STK_CVR,0x00000000);
PUT32(STK_CSR,5);
uart_init();
hexstring(0x12345678);
hexstring(GET32(0xE000E018));
hexstring(GET32(0xE000E018));
return(0);
}
build
arm-none-eabi-as --warn --fatal-warnings -mcpu=cortex-m3 flash.s -o flash.o
arm-none-eabi-as --warn --fatal-warnings -mcpu=cortex-m3 test.s -o test.o
arm-none-eabi-gcc -Wall -Werror -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m0 -march=armv6-m -c notmain.c -o notmain.thumb.o
arm-none-eabi-ld -o notmain.thumb.elf -T flash.ld flash.o test.o notmain.thumb.o
arm-none-eabi-objdump -D notmain.thumb.elf > notmain.thumb.list
arm-none-eabi-objcopy notmain.thumb.elf notmain.thumb.bin -O binary
arm-none-eabi-gcc -Wall -Werror -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m3 -march=armv7-m -c notmain.c -o notmain.thumb2.o
arm-none-eabi-ld -o notmain.thumb2.elf -T flash.ld flash.o test.o notmain.thumb2.o
arm-none-eabi-objdump -D notmain.thumb2.elf > notmain.thumb2.list
arm-none-eabi-objcopy notmain.thumb2.elf notmain.thumb2.bin -O binary
uart output as shown
12345678
00FFE445
00FFC698
If I take your code, make it shorter, don't have all day.
void soft_delay(void) {
for (volatile unsigned int i=0; i<0x2000; ++i) { }
}
arm-none-eabi-gcc -c -O0 -mthumb -mcpu=cortex-m0 hello.c -o hello.o
yes I know this is an m3
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 5.4.0
gives
00000000 <soft_delay>:
0: b580 push {r7, lr}
2: b082 sub sp, #8
4: af00 add r7, sp, #0
6: 2300 movs r3, #0
8: 607b str r3, [r7, #4]
a: e002 b.n 12 <soft_delay+0x12>
c: 687b ldr r3, [r7, #4]
e: 3301 adds r3, #1
10: 607b str r3, [r7, #4]
12: 687b ldr r3, [r7, #4]
14: 4a03 ldr r2, [pc, #12] ; (24 <soft_delay+0x24>)
16: 4293 cmp r3, r2
18: d9f8 bls.n c <soft_delay+0xc>
1a: 46c0 nop ; (mov r8, r8)
1c: 46bd mov sp, r7
1e: b002 add sp, #8
20: bd80 pop {r7, pc}
22: 46c0 nop ; (mov r8, r8)
24: 00001fff
first check the test infrastructure
.cpu cortex-m0
.thumb
.align 8
.word 0,0
.thumb_func
.globl TEST
TEST:
push {r4,r5,r6,lr}
mov r4,r0
mov r5,r1
ldr r6,[r4]
inner:
bl soft_delay
sub r5,#1
bne inner
ldr r3,[r4]
sub r0,r6,r3
pop {r4,r5,r6,pc}
.align 8
soft_delay:
bx lr
in the openocd telnet window
reset halt
flash write_image erase notmain.thumb.elf
reset
gives
12345678
00001B59
7001 clocks, assuming the systick matches the cpu, thats 7001 arm clocks, 4 instructions per loop.
Step back note I aligned some things
08000108 <TEST>:
8000108: b570 push {r4, r5, r6, lr}
800010a: 1c04 adds r4, r0, #0
800010c: 1c0d adds r5, r1, #0
800010e: 6826 ldr r6, [r4, #0]
08000110 <inner>:
8000110: f000 f876 bl 8000200 <soft_delay>
8000114: 3d01 subs r5, #1
8000116: d1fb bne.n 8000110 <inner>
8000118: 6823 ldr r3, [r4, #0]
800011a: 1af0 subs r0, r6, r3
800011c: bd70 pop {r4, r5, r6, pc}
08000200 <soft_delay>:
8000200: 4770 bx lr
both loops are nicely aligned.
Now if I do this:
0800010a <TEST>:
800010a: b570 push {r4, r5, r6, lr}
800010c: 1c04 adds r4, r0, #0
800010e: 1c0d adds r5, r1, #0
8000110: 6826 ldr r6, [r4, #0]
08000112 <inner>:
8000112: f000 f875 bl 8000200 <soft_delay>
8000116: 3d01 subs r5, #1
8000118: d1fb bne.n 8000112 <inner>
800011a: 6823 ldr r3, [r4, #0]
800011c: 1af0 subs r0, r6, r3
800011e: bd70 pop {r4, r5, r6, pc}
Simply changing the alignment of the code that is supposed to be testing the code under test I now get:
00001F40
8000 ticks to do that loop 1000 times with that call with the code function under test still being aligned
08000200 <soft_delay>:
8000200: 4770 bx lr
The .align 8, in general don't use .align with a number on gnu its behavior does not translate across targets. .balign is better. Anyway I used it. The two words are because the align made TEST aligned, but inner is what I wanted aligned so I added two words to make it aligned.
.align 8
.word 0,0
nop
.thumb_func
.globl TEST
TEST:
push {r4,r5,r6,lr}
mov r4,r0
mov r5,r1
ldr r6,[r4]
inner:
bl soft_delay
sub r5,#1
bne inner
ldr r3,[r4]
sub r0,r6,r3
pop {r4,r5,r6,pc}
A little code review to make sure I didn't make a mistake here.
r0 is the systick current value register
r1 is the number of loops I want to run the code under test
The calling convention allows for r0-r3 to be clobbered so I need to move r0 and r1 to non-volatile registers (per the calling convention).
I want to sample the time the instruction before the loop and the instruction after.
so I need two registers for r0 and r1 and a register to store the begin time so r4,r5,r6 and that fits in nicely to have an even number of registers pushed on the stack. Have to preserve lr so we can return.
we can now safely call soft_delay in the loop, subtract the count, branch if not equal to inner, once the count is done read the timer in r3. from output above this is a down counter so subtract end from beginning, technically since this is a 24 bit counter I should and with 0x00FFFFFF to correctly do that subtraction, but because this isn't going to roll over I can assume out that operation. result/return value goes in r0, pop everything which includes popping the pc to do the return to the C calling function which prints out r0's value.
I think the test code is good.
reading the CPUID register
411FC231
So that means r1p1, while the TRM I am using is written for r2p1 you have to be very careful to use the right document but also sometimes use the current document or all the ones in between if available to see what changed.
ICode memory interface
Instruction fetches from Code memory space 0x00000000 to 0x1FFFFFFF
are performed over the 32-bit AHB-Lite bus. The Debugger cannot access
this interface. All fetches are word-wide. The number of instructions
fetched per word depends on the code running and the alignment of the
code in memory.
Sometimes in ARM TRMs you see the fetch info up top near the processor features, this tells me what I wanted to know.
08000112 <inner>:
8000112: f000 f875 bl 8000200 <soft_delay>
8000116: 3d01 subs r5, #1
8000118: d1fb bne.n 8000112 <inner>
this requires a fetch at 110, 114 and 118.
08000110 <inner>:
8000110: f000 f876 bl 8000200 <soft_delay>
8000114: 3d01 subs r5, #1
8000116: d1fb bne.n 8000110 <inner>
This a fetch at 110 and 114, but not one at 118, so that extra fetch could be our added clock. the m3 was the first publicly available one and it has a lot of features in the core that went away and similar ones came back. Some of the smaller cores fetch differently and you don't see this alignment issue. with bigger cores like full sized ones they fetch sometimes 4 or 8 instructions at a time and you have to change your alignment even more to hit the boundary but you can hit the boundary and since it is 2 or 4 clocks plus bus overhead for the extra fetch you can see those.
If I put two nops
nop
nop
.thumb_func
.globl TEST
TEST:
gives
08000114 <inner>:
8000114: f000 f874 bl 8000200 <soft_delay>
8000118: 3d01 subs r5, #1
800011a: d1fb bne.n 8000114 <inner>
800011c: 6823 ldr r3, [r4, #0]
800011e: 1af0 subs r0, r6, r3
8000120: bd70 pop {r4, r5, r6, pc}
gives
00001B59
So that's good we are back to that number, could try a few more to confirm but it appears that alignment is sensitive to our outer test loop, which is bad, but we can manage that, don't change it it won't affect the test. If I didn't care about alignment and had something like this:
void soft_delay(void) {
for (volatile unsigned int i=0; i<0x2000; ++i) { }
}
int notmain ( void )
{
unsigned int ra;
unsigned int beg;
unsigned int end;
PUT32(STK_CSR,4);
PUT32(STK_RVR,0x00FFFFFF);
PUT32(STK_CVR,0x00000000);
PUT32(STK_CSR,5);
uart_init();
hexstring(0x12345678);
beg=GET32(STK_CVR);
for(ra=0;ra<1000;ra++)
{
soft_delay();
}
end=GET32(STK_CVR);
hexstring((beg-end)&0x00FFFFFF);
return(0);
}
Then as I played with optimization options and I also played with using different compilers any change in the program/binary in front of the test loop would/could move the test loop changing its performance, in my simple example it was a 14% performance difference, that's massive if you are doing performance tests. letting the compiler take care of all this without us being in control the everything in front of the function under test could mess with the function under test, as written above the compiler might opt to inline the function rather than call it making an even more interesting situation as the test loop while probably not as clean as mine, certainly not if not optimized, but now the code under test is dynamic as options or alignments change.
I'm very happy you happened to be using this core/chip...
If I re-align inner and now mess with this
.align 8
nop
soft_delay:
bx lr
08000202 <soft_delay>:
8000202: 4770 bx lr
it's a single instruction which is fetched at 0x200 from what we have read and seem to be able to tell. wouldn't expect this to change anything and it didn't
00001B59
but now that we know what we know, we can use our experience to mess with this trivial Not interesting at all example.
.align 8
nop
soft_delay:
nop
bx lr
gives
00001F41
as expected. and we can have even more fun:
.align 8
.word 0,0
nop
.thumb_func
.globl TEST
TEST:
combined gives
08000112 <inner>:
8000112: f000 f876 bl 8000202 <soft_delay>
8000116: 3d01 subs r5, #1
8000118: d1fb bne.n 8000112 <inner>
08000202 <soft_delay>:
8000202: 46c0 nop ; (mov r8, r8)
8000204: 4770 bx lr
no surprise if you know what you are doing:
00002328
9000 clocks, 29% performance difference. we are literally talking about 5 (technically 6) instructions, same exact machine code and by simply changing alignment the performance can be 29% different, compiler and options have nothing to do with it, yet, have not even gotten there.
How can we expect to do any kind of performance evaluation of a program using the time the code a bunch of times in a loop method? We cant unless we know what we are doing, have an understanding of the architecture, etc.
Now as it should be obvious and reading the documentation I am using the internal 8Mhz clock, everything is derived from that so the systick times are not going to sometimes vary as you might see with dram for example. The LATENCY bits in the FLASH_ACR register should have defaulted to zero wait states for 0 < SYSCLK <- 24Mhz. If I were to bump up the clock above 24Mhz, the processor is running faster but the flash is now slower relative to the processor.
Without messing with the clocks and simply adding a wait state by changing the FLASH_ACR register to 0x31.
000032C6
12998 up from 9000, I didn't expect it to double necessarily and it didn't.
Hmm for fun make a PUT16 using strh, and
.thumb_func
.globl HOP
HOP:
bx r2
and
PUT16(0x2000010a,0xb570); // 800010a: b570 push {r4, r5, r6, lr}
PUT16(0x2000010c,0x1c04); // 800010c: 1c04 adds r4, r0, #0
PUT16(0x2000010e,0x1c0d); // 800010e: 1c0d adds r5, r1, #0
PUT16(0x20000110,0x6826); // 8000110: 6826 ldr r6, [r4, #0]
PUT16(0x20000112,0xf000); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000114,0xf876); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000116,0x3d01); // 8000116: 3d01 subs r5, #1
PUT16(0x20000118,0xd1fb); // 8000118: d1fb bne.n 8000112 <inner>
PUT16(0x2000011a,0x6823); // 800011a: 6823 ldr r3, [r4, #0]
PUT16(0x2000011c,0x1af0); // 800011c: 1af0 subs r0, r6, r3
PUT16(0x2000011e,0xbd70); // 800011e: bd70 pop {r4, r5, r6, pc}
PUT16(0x20000202,0x46c0); // 8000202: 46c0 nop ; (mov r8, r8)
PUT16(0x20000204,0x4770); // 8000204: 4770 bx lr
hexstring(HOP(STK_CVR,1000,0x2000010B));
gives
0000464B
and that was not at all expected. but is 18,000 basically
Putting ram to bed after this
PUT16(0x20000108,0xb570); // 800010a: b570 push {r4, r5, r6, lr}
PUT16(0x2000010a,0x1c04); // 800010c: 1c04 adds r4, r0, #0
PUT16(0x2000010c,0x1c0d); // 800010e: 1c0d adds r5, r1, #0
PUT16(0x2000010e,0x6826); // 8000110: 6826 ldr r6, [r4, #0]
PUT16(0x20000110,0xf000); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000112,0xf876); // 8000112: f000 f876 bl 8000202 <soft_delay>
PUT16(0x20000114,0x3d01); // 8000116: 3d01 subs r5, #1
PUT16(0x20000116,0xd1fb); // 8000118: d1fb bne.n 8000112 <inner>
PUT16(0x20000118,0x6823); // 800011a: 6823 ldr r3, [r4, #0]
PUT16(0x2000011a,0x1af0); // 800011c: 1af0 subs r0, r6, r3
PUT16(0x2000011c,0xbd70); // 800011e: bd70 pop {r4, r5, r6, pc}
PUT16(0x20000200,0x46c0); // 8000202: 46c0 nop ; (mov r8, r8)
PUT16(0x20000200,0x4770); // 8000204: 4770 bx lr
hexstring(HOP(STK_CVR,1000,0x20000109));
00002EDE
The machine code did not change because I moved both back by 2 so the relative address between them was the same. Note that bl is two separate instructions not one 32 bit one. You cant see this in the newer docs you need to go back to the original/early ARM ARM where it is explained. And it is easy to do experiments where you split the two instructions and put other stuff in between and they work just fine, because they are two separate instructions.
At this point the reader should be able to make a 2 instruction test loop, time it and dramatically change the performance of the execution of those two instructions on this platform using the same exact machine code.
So let's try the volatile loop that you wrote.
.align 8
soft_delay:
push {r7, lr}
sub sp, #8
add r7, sp, #0
mov r3, #0
str r3, [r7, #4]
b L12
Lc:
ldr r3, [r7, #4]
add r3, #1
str r3, [r7, #4]
L12:
ldr r3, [r7, #4]
ldr r2, L24
cmp r3, r2
bls Lc
nop
mov sp, r7
add sp, #8
pop {r7, pc}
nop
.align
L24: .word 0x1FFF
this is I believe the unoptimized -O0 version. starting off with one test loop
hexstring(TEST(STK_CVR,1));
experience, the times we are seeing will overflow our 24 bit counter and the results will be very strange or lead to false conclusions.
0001801F
98,000, quick check for safety:
.align
L24: .word 0x1F
0000019F
not bad that is on par with 256 times faster.
so we have some wiggle room in our test loop but not much try 10
hexstring(TEST(STK_CVR,10));
000F012D
98334 ticks per loop.
changing the alignment
08000202 <soft_delay>:
8000202: b580 push {r7, lr}
8000204: b082 sub sp, #8
gave the same result
000F012D
not unheard of, you can examine the differences if you want count through each instruction check fetch cycles, etc.
had I made the test:
soft_delay:
nop
nop
bx lr
its two fetch cycles no matter what the alignment or if I had left it bx lr with no nops as we saw so by simply having an odd number of instructions in the test then alignment won't affect the results on fetches along, but note that from what we know now had some other code in the program moved the outer timing/test loop that may have changed performance and the results may show a difference between two tests that were purely the timing code and not the code under test (read Michael Abrash).
The cortex-m3 is based on the armv7-m architecture. If I change the compiler from -mcpu=cortex-m0 (all cortex-m compatible so far) to -mcpu=cortex-m3 (not all cortex-m compatible will break on half of them) it produces a little bit less code.
.align 8
soft_delay:
push {r7}
sub sp, #12
add r7, sp, #0
movs r3, #0
str r3, [r7, #4]
b L12
Lc:
ldr r3, [r7, #4]
add r3, #1
str r3, [r7, #4]
L12:
ldr r3, [r7, #4]
/*14: f5b3 5f00 cmp.w r3, #8192 ; 0x2000*/
//cmp.w r3, #8192
.word 0x5f00f5b3
bcc Lc
nop
add r7, #12
mov sp, r7
pop {r7}
bx lr
000C80FB 81945 ticks for the code under test.
I hate unified syntax, that was a massive mistake, so I fumble along in legacy mode. thus the .word thing there in the middle.
As part of writing this I kinda messed up my system in order to demonstrate something. I was building a gcc 5.4.0 but overwrote my 9.2.0 so had to re-build both.
2.95 was the version I started using with arm and didn't support thumb gcc 3.x.x was the first to. And either gcc 4.x.x or gcc 5.x.x produced "slower" code for some of my projects, at work we are currently moving from ubuntu 16.04 to 18.04 for our build systems which if you use the apt-got cross compiler for arm that moves you from 5.x.x to 7.x.x and it is making larger binaries for the same source code and where we are tight on memory it is pushing us beyond what's available so we have to either remove some code (easiest to make the printed messages shorter, cut text out) or stick to the older compiler by building our own or apt-getting the older one. 19.10 does no longer offers the 5.x.x version.
So both are now built.
18: d3f8 bcc.n c <soft_delay+0xc>
1a: bf00 nop
1c: bf00 nop
1e: 370c adds r7, #12
these nops after bcc are baffling to me...
18: d3f8 bcc.n c <soft_delay+0xc>
1a: bf00 nop
1c: 370c adds r7, #12
gcc 5.4.0 is putting one, gcc 9.2.0 is putting two nops, ARM doesn't have the branch shadow thing of MIPS (MIPS doesn't currently either).
000C80FB gcc 5.4.0
000C8105 gcc 9.2.0
I call the function 10 times, the nop is outside the code under tests loop so has a lesser effect.
Optimized all cortex-m variants (to date) using gcc 9.2.0
soft_delay:
mov r3, #0
mov r2, #128
sub sp, #8
str r3, [sp, #4]
ldr r3, [sp, #4]
lsl r2, r2, #6
cmp r3, r2
bcs L1c
L10:
ldr r3, [sp, #4]
add r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
cmp r3, r2
bcc L10
L1c:
add sp, #8
bx lr
(also understand that not all say gcc 9.2.0 builds produce the same code when you build the compiler you have options and those options can affect the output making different builds of 9.2.0 possibly producing different results)
000C80B5
gcc 9.2.0 built for cortex-m3:
soft_delay:
mov r3, #0
sub sp, #8
str r3, [sp, #4]
ldr r3, [sp, #4]
/*8: f5b3 5f00 cmp.w r3, #8192 ; 0x2000*/
.word 0x5F00F5B3
bcs L1c
Le:
ldr r3, [sp, #4]
add r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
/*16: f5b3 5f00 cmp.w r3, #8192 ; 0x2000*/
.word 0x5F00F5B3
bcc Le
L1c:
add sp, #8
bx lr
000C80A1
That's in the noise. despite the code built has differences. they simply didn't gain in comparing the 0x2000 in fewer instructions. and note if you change that 0x2000 to some other number then that does not simply make the loop take that much longer it can change the generated code for architectures like this.
How I like to make these counted delay loops is to use a function outside the compile domain
extern void dummy ( unsigned int );
void soft_delay(void) {
for (unsigned int i=0; i<0x2000; ++i) { dummy(i); }
}
soft_delay:
push {r4, r5, r6, lr}
mov r5, #128
mov r4, #0
lsl r5, r5, #6
L8:
mov r0, r4
add r4, #1
bl dummy
cmp r4, r5
bne L8
pop {r4, r5, r6, pc}
the feature there is you don't need the overhead of what volatile does you do have a call and clearly there is overhead as well due to the call but not as much
000B40C9
or even better:
soft_delay:
sub r0,#1
bne soft_delay
bx lr
I would have to change the code wrapped around the code under test to make that function work.
Another note specific to these targets but also something you deal with
unsigned int more_fun ( unsigned int, unsigned int );
unsigned int fun ( unsigned int a, unsigned int b )
{
return(more_fun(a,b)+a+(b<<2));
}
00000000 <fun>:
0: b570 push {r4, r5, r6, lr}
2: 000c movs r4, r1
4: 0005 movs r5, r0
6: f7ff fffe bl 0 <more_fun>
a: 00a4 lsls r4, r4, #2
c: 1964 adds r4, r4, r5
e: 1820 adds r0, r4, r0
10: bd70 pop {r4, r5, r6, pc}
12: 46c0 nop ; (mov r8, r8)
a question repeated here at SO on a period basis. why is it pushing r6 it isn't using r6.
The compiler operates using what I call and used to be called a calling convention, now they use terms ABI, EABI, whatever either case it is the same thing it is a set of rules the compiler follows for a particular target. Arm added a rule to keep the stack aligned on a 64 bit address boundary instead of 32, this caused the extra item to keep the stack aligned, what register is used there can vary. If you use an older gcc vs a newer this can/will affect the performance of your code all by itself.

There are many factors at play here. Certainly if you have an optimizing compiler and you compare optimized vs not DEPENDING ON THE CODE you can see a large difference in execution speed. Using the volatile in the tiny loop here actually masks some of that, in both cases it should be read/written to memory every loop.
But the calling code the loop variable unoptimized would touch ram two or three times in that loop, optimized ideally would be in a register the whole time, making for a dramatic difference in execution performance even with zero wait state ram.
The toggle pin code is relatively large (talking to the peripheral directly would be less code), depending on whether that library was compiled separately with different options or at the same time with the same options makes a big difference with respect to performance.
Add that this is an mcu and running off of a flash which with the age of this part the flash might at best be half the clock rate of the cpu and worst a number of wait states and I don't remember off hand if ST had the caching in front of it at that time. so every instruction you add can add a clock, so just the loop variable alone can dramatically change the timing.
Being a high performance pipelined core I have demonstrated here and elsewhere that alignment can (not always) play a role, so if in one case the exact same machine code links to address 0x100 in one case and 0x102 in another it is possible that exact same machine code takes extra or fewer clocks to execute based on the nature of the pre-fetcher in the design, or the flash implementation, cache if any implementation, branch predictor, etc.
And then the biggest problem is how did you time this, it is not uncommon for there to be error in not using a clock correctly such that the clock/timing code itself varies and is creating some of the difference. Plus are there background things going on, interrupts/multitasking.
Michal Abrash wrote a wonderful book called The Zen of Assembly Language, you can get it for free in ePub or perhaps pdf form on GitHub. the 8088 was obsolete when the book was released but if you focus on that then you have missed the point, I bought it when it came out and have used what I learned on nearly a daily basis.
gcc is not a high performance compiler it is more of a general purpose compiler built Unix style where you can have different language front ends and different target backends. When I was in the position you are now trying to first understand these things I sampled many compilers for the same arm target and same C code and the results were vast. Later I wrote an instruction set simulator so I could count instructions and memory accesses to compare gnu vs llvm, as the latter has more optimization opportunities than gnu but for execution tests of code gcc was faster sometimes but not slower. That ended up being more of a long weekend having fun than something I used to analyze the differences.
It is easier to start with small-ish code like this and disassemble the two. Understand that fewer instructions doesn't mean faster, one distant memory access on a dram based system can take hundreds of clock cycles, that might be replaced with another solution that takes a handful/dozen of linearly fetched instructions to end up with the same result (do some math vs look up something in a rarely sampled table) and depending on the situation the dozen instructions execute much faster. at the same time the table solution can be much faster. it depends.
Examination of the disassembly often leads to incorrect conclusions (read abrash, not just that book, everything) as first off folks think less instructions means faster code. rearranging instructions in a pipelined processor can improve performance if you move an instruction into a time period that would have otherwise been wasted clocks. incrementing a register not related to a memory access in front of the memory access instead of after in a non-superscaler processor.
Ahh, back to a comment. This was years ago and competing compilers were more of a thing most folks just wrap their gui around gnu and the ide/gui is the product not the compiler. But there was the arm compiler itself, before the rvct tools, ads and I forget the other, those were "better" than gcc. I forget the names of the others but there was one that produced significantly faster code, granted this was Dhrystone so you will also find that they may tune optimizers for Dhrystone just to play benchmark games. Now that I can see how easy it is to manipulate benchmarks I consider them to in general be bu33zzit, can't be trusted. Kiel used to be a multi-target tool for mcus and similar, but then was purchased by arm and I thought at the time they were dropping all other targets, but have not checked in a while. I might have tried them once to get access to a free/demo version of rvct as when I was working at one job we had a budget to buy multi-thousand dollar tools, but that didn't include rvct (although I was on phone calls with the formerly Allant folks who were part of a purchase that became the rvct tools) which I was eager to try once they had finished the development/integration of that product, by then didn't have a budget for that and later couldn't afford or wasn't interested in buying even kiels tools, much less arms. Their early demos of rvct created an encrypted/obfuscated binary that was not arm machine code it only ran on their simulator so you couldn't use it to eval performance or to compare it to others, don't think they were willing to give us an un-obfuscated version and we weren't willing to reverse engineer it. now it is easier just to use gcc or clang and hand optimize where NEEDED. Likewise with experience can write C code that optimizes better based on experience examining compiler output.
You have to know the hardware, particularly in this case where you take processor IP and most of the chip is not related to the processor IP and most of the performance is not related to the processor IP (pretty much true for a lot of platforms today in particular your server/desktop/laptop). The Gameboy Advance for example used a lot of 16 bit buses instead of 32, thumb tools were barely being integrated, but thumb mode while counting instructions/or bytes was like 10% more code at the time, executed significantly faster on that chip. On other implementations both arm architecture and chip design thumb may have performance penalties, or not.
ST in general with the cortex-m products tends to put a cache in front of the flash, sometimes they document it and provide enable/disable control sometimes not so it can be difficult at best to get a real performance value as the typical thing is to run the code under test many times in a loop so you can get a better time measurement. other vendors don't necessarily do this and it is much easier to see the flash wait states and get a real, worst case, timing value that you can use to validate your design. caches in general as well as pipelines make it difficult at best to get good, repeatable, reliable numbers to validate your design. So for example you sometimes cannot do the alignment trick to mess with performance of the same machine code on an st but on say a ti with the same core you can. st might not give you the icache in a cortex-m7 where another vendor might since st has already covered that. Even within one brand name though don't expect the results of one chip/family to translate to another chip/family even if they use the same core. Also look at subtle comments in the arm documentation as to whether some cores offer a fetch size as an advertised option, single or multi-cycle multiply, etc. and I'll tell you that there are other compile time options for the core that are not shown in the technical reference manual that can affect performance so don't assume that all cortex-m3s are the same even if they are the same revision from arm. The chip vendor has the source so they can go even further and modify it, or for example a register bank to be implemented by the consumer they might change it from no protection to parity to ecc which might affect performance while retaining all of arms original code as is. When you look at an avr or pic though or even an msp430, while I cant prove it those designs appear more static not tiny vs Xmega vs regular old avr as there are definite differences there, but one tiny to another.
Your assumptions are a good start, there really isn't such a thing as over-optimization, more of a thing of missed optimizations, but there may be other factors you are not seeing in your assumptions that may or may not be see in a disassembly. there are obvious things that we would expect like one of the loop variables to be register based vs memory based. alignment, I wouldn't expect the clock settings to change if you used the same code, but a different set of experiments using timers or a scope you can measure the clock settings to see if they were configured the same. background tasks, interrupts and dumb luck as to how/when they hit the test. But bottom line, sometimes it is as simple as a missed optimization or subtle differences in how one compiler generates code to another, it is as often not those things and more of a system issue, memory speed, peripheral speed, caches or their architecture, how the various busses in the design operate, etc. For some of these cortex-ms (and many other processors) you can exploit their bus behavior to show a performance difference in something that the average person wouldn't expect to see.

Keil over-optimization (unlikely, because the code is very simple)
You cant over-optimize you can under/miss so if anything gcc missed something that Kiel didn't. Not the other way around
arm-none-eabi-gcc under-optimization due to wrong compiler flags (I use CLion Embedded plugins` CMakeLists.txt)
will see below but this is highly likely esp debug vs release, I never build for debug (never use a debugger) you have to test everything twice, and if you don't test as you go it makes it much harder to debug so the release version if it has issues takes a lot more work to figure out the issues.
A bug in the initialization so that chip has lower clock frequency with arm-none-eabi-gcc (to be investigated)
My guess is it isn't this, this would imply you made a really big mistake and didn't compile the same code on each tool so it wasn't a fair comparison.
Let's run it.
Using the systick timer, 24 bit (current value register address passed in r0)
.align 8
.thumb_func
.globl TEST
TEST:
push {r4,r5,r6,lr}
mov r4,r0
ldr r5,[r4]
bl soft_delay
ldr r3,[r4]
sub r0,r5,r3
pop {r4,r5,r6,pc}
to avoid overflowing the 24 bit timer the loops count to limited to 200000 times not 2000000 times. I assume the code you left out is 2000000 - 1. If not this still shows the relevant differences.
-O0 code
.align 8
soft_delay:
PUSH {r3,lr}
MOV r0,#0
STR r0,[sp,#0]
B L6.14
L6.8:
LDR r0,[sp,#0]
ADD r0,r0,#1
STR r0,[sp,#0]
L6.14:
LDR r1,L6.24
LDR r0,[sp,#0]
CMP r0,r1
BCC L6.8
POP {r3,pc}
.align
L6.24: .word 100000 - 1
08000200 <soft_delay>:
8000200: b508 push {r3, lr}
8000202: 2000 movs r0, #0
8000204: 9000 str r0, [sp, #0]
8000206: e002 b.n 800020e <L6.14>
08000208 <L6.8>:
8000208: 9800 ldr r0, [sp, #0]
800020a: 3001 adds r0, #1
800020c: 9000 str r0, [sp, #0]
0800020e <L6.14>:
800020e: 4902 ldr r1, [pc, #8] ; (8000218 <L6.24>)
8000210: 9800 ldr r0, [sp, #0]
8000212: 4288 cmp r0, r1
8000214: d3f8 bcc.n 8000208 <L6.8>
8000216: bd08 pop {r3, pc}
08000218 <L6.24>:
8000218: 0001869f
00124F8B systick timer ticks
-O1 code
soft_delay:
PUSH {r3,lr}
MOV r0,#0
STR r0,[sp,#0]
LDR r0,L6.24
B L6.16
L6.10:
LDR r1,[sp,#0]
ADD r1,r1,#1
STR r1,[sp,#0]
L6.16:
LDR r1,[sp,#0]
CMP r1,r0
BCC L6.10
POP {r3,pc}
.align
L6.24: .word 100000 - 1
08000200 <soft_delay>:
8000200: b508 push {r3, lr}
8000202: 2000 movs r0, #0
8000204: 9000 str r0, [sp, #0]
8000206: 4804 ldr r0, [pc, #16] ; (8000218 <L6.24>)
8000208: e002 b.n 8000210 <L6.16>
0800020a <L6.10>:
800020a: 9900 ldr r1, [sp, #0]
800020c: 3101 adds r1, #1
800020e: 9100 str r1, [sp, #0]
08000210 <L6.16>:
8000210: 9900 ldr r1, [sp, #0]
8000212: 4281 cmp r1, r0
8000214: d3f9 bcc.n 800020a <L6.10>
8000216: bd08 pop {r3, pc}
08000218 <L6.24>:
8000218: 0001869f
000F424E systicks
-O2 code
soft_delay:
SUB sp,sp,#4
MOVS r0,#0
STR r0,[sp,#0]
LDR r0,L4.24
L4.8:
LDR r1,[sp,#0]
ADDS r1,r1,#1
STR r1,[sp,#0]
CMP r1,r0
BCC L4.8
ADD sp,sp,#4
BX lr
.align
L4.24: .word 100000 - 1
08000200 <soft_delay>:
8000200: b081 sub sp, #4
8000202: 2000 movs r0, #0
8000204: 9000 str r0, [sp, #0]
8000206: 4804 ldr r0, [pc, #16] ; (8000218 <L4.24>)
08000208 <L4.8>:
8000208: 9900 ldr r1, [sp, #0]
800020a: 3101 adds r1, #1
800020c: 9100 str r1, [sp, #0]
800020e: 4281 cmp r1, r0
8000210: d3fa bcc.n 8000208 <L4.8>
8000212: b001 add sp, #4
8000214: 4770 bx lr
8000216: 46c0 nop ; (mov r8, r8)
08000218 <L4.24>:
8000218: 0001869f
000AAE65 systicks
-O3
soft_delay:
PUSH {r3,lr}
MOV r0,#0
STR r0,[sp,#0]
LDR r0,L5.20
L5.8:
LDR r1,[sp,#0]
ADD r1,r1,#1
STR r1,[sp,#0]
CMP r1,r0
BCC L5.8
POP {r3,pc}
.align
L5.20: .word 100000 - 1
08000200 <soft_delay>:
8000200: b508 push {r3, lr}
8000202: 2000 movs r0, #0
8000204: 9000 str r0, [sp, #0]
8000206: 4803 ldr r0, [pc, #12] ; (8000214 <L5.20>)
08000208 <L5.8>:
8000208: 9900 ldr r1, [sp, #0]
800020a: 3101 adds r1, #1
800020c: 9100 str r1, [sp, #0]
800020e: 4281 cmp r1, r0
8000210: d3fa bcc.n 8000208 <L5.8>
8000212: bd08 pop {r3, pc}
08000214 <L5.20>:
8000214: 0001869f
000AAE6A systicks
Interestingly alignment doesn't affect any of these results.
Comparing your results relative to each other and the above in a spreadsheet
18.7 1.000 00124F8B 1200011 1.000
13.3 0.711 000F424E 1000014 0.833
9.8 0.524 000AAE65 700005 0.583
9.9 0.529 000AAE6A 700010 0.583
It shows that the various stages as I have measured also show improvements and that -O3 is slightly slower.
Analyze what happened.
void soft_delay(void) {
for (volatile uint32_t i=0; i<2000000; ++i) { }
}
because this counts up AND is volatile the compiler cannot do the usual count down and save an instruction (subs then bne rather than add, cmp, bcc)
-O0 code
soft_delay:
PUSH {r3,lr} allocate space for i
MOV r0,#0 i = 0
STR r0,[sp,#0] i = 0
B L6.14
L6.8:
LDR r0,[sp,#0] read i from memory
ADD r0,r0,#1 increment i
STR r0,[sp,#0] save i to memory
L6.14:
LDR r1,L6.24 read max value
LDR r0,[sp,#0] read i from memory
CMP r0,r1 compare i and max value
BCC L6.8 branch if unsigned lower
POP {r3,pc} return
I should have examined the code first L6.24 should have been 2000000 not 2000000 - 1. You left this out of your question.
No optimization generally means just bang out the code in order as in the high level language.
r3 doesn't need to be preserved neither does LR but the variable is volatile so it needs space on the stack the compiler chose to do it this way for this optimization level pushing lr allows for it to pop pc at the end.
push is a pseudo instruction for stm (stmdb) so 8 is subtracted from the stack pointer then the registers are saved in order so if the sp was at 0x1008 then it changes to 0x1000 and writes r3 to 0x1000 and lr to 0x1004 so for the rest of this function it uses sp+0 which is 0x1000 in this example. The r3 and the push used in this way is to allocate a location for the variable i in the code.
-O1 version
soft_delay:
PUSH {r3,lr} allocate space
MOV r0,#0 i = 0
STR r0,[sp,#0] i = 0
LDR r0,L6.24 read max/test value
B L6.16
L6.10:
LDR r1,[sp,#0] load i from memory
ADD r1,r1,#1 increment i
STR r1,[sp,#0] save i to memory
L6.16:
LDR r1,[sp,#0] read i from memory
CMP r1,r0 compare i with test value
BCC L6.10 branch if unsigned lower
POP {r3,pc}
The primary difference between -O0 and -O1 in this case is the -O0 version reads the max value every time through the loop. The -O1 version reads it outside the loop one time.
-O0
08000208 <L6.8>:
8000208: 9800 ldr r0, [sp, #0]
800020a: 3001 adds r0, #1
800020c: 9000 str r0, [sp, #0]
800020e: 4902 ldr r1, [pc, #8] ; (8000218 <L6.24>)
8000210: 9800 ldr r0, [sp, #0]
8000212: 4288 cmp r0, r1
8000214: d3f8 bcc.n 8000208 <L6.8>
1200011 / 100000 = 12
The bulk of the time is in the above loop. 7 instructions three loads two stores. That is 12 things so perhaps its one clock per.
-O1 code
0800020a <L6.10>:
800020a: 9900 ldr r1, [sp, #0]
800020c: 3101 adds r1, #1
800020e: 9100 str r1, [sp, #0]
08000210 <L6.16>:
8000210: 9900 ldr r1, [sp, #0]
8000212: 4281 cmp r1, r0
8000214: d3f9 bcc.n 800020a <L6.10>
1000014 / 100000 = 10
0800020a <L6.10>:
800020a: 9900 ldr r1, [sp, #0]
800020c: 3101 adds r1, #1
800020e: 9100 str r1, [sp, #0]
8000210: 9900 ldr r1, [sp, #0]
8000212: 4281 cmp r1, r0
8000214: d3f9 bcc.n 800020a <L6.10>
6 instructions, two loads one store. 8 things 10 clocks. The difference here from -O0 is that the compare value is read before/outside the loop so that saves that instruction and that memory cycle.
-O2 code
08000208 <L4.8>:
8000208: 9900 ldr r1, [sp, #0]
800020a: 3101 adds r1, #1
800020c: 9100 str r1, [sp, #0]
800020e: 4281 cmp r1, r0
8000210: d3fa bcc.n 8000208 <L4.8>
700005 / 100000 = 7 ticks per loop
So by some folks definition, this isn't honoring the volatile, or is it? The compare value is outside the loop and the way this is written it should be 2000000 + 1, yes? It reads i from memory one time per loop rather than twice but does store it every time through the loop with the new value. Basically it removed the second load and that saved some time waiting on that read to finish.
-O3 code
08000208 <L5.8>:
8000208: 9900 ldr r1, [sp, #0]
800020a: 3101 adds r1, #1
800020c: 9100 str r1, [sp, #0]
800020e: 4281 cmp r1, r0
8000210: d3fa bcc.n 8000208 <L5.8>
The inner loop is the same as -O2.
-O2 does this
08000200 <soft_delay>:
8000200: b081 sub sp, #4
8000202: 2000 movs r0, #0
8000204: 9000 str r0, [sp, #0]
8000206: 4804 ldr r0, [pc, #16] ; (8000218 <L4.24>)
...
8000212: b001 add sp, #4
8000214: 4770 bx lr
-O3 does this
08000200 <soft_delay>:
8000200: b508 push {r3, lr}
8000202: 2000 movs r0, #0
8000204: 9000 str r0, [sp, #0]
8000206: 4803 ldr r0, [pc, #12] ; (8000214 <L5.20>)
8000212: bd08 pop {r3, pc}
Now that is fewer instructions yes, but the push and pop take longer they have memory cycle overhead, the subtract and add of the stack pointer instructions are faster than those memory cycles even with the fewer instructions. So the subtle difference in time is the push/pop outside the loop.
Now for GCC (9.2.0)
For starters I don't know if Kiel was targetted at thumb in general (all variants) the cortex-ms or the cortex-m3 specifically.
First -O0 code:
-O0
soft_delay:
push {r7, lr}
sub sp, sp, #8
add r7, sp, #0
movs r3, #0
str r3, [r7, #4]
b .L2
.L3:
ldr r3, [r7, #4]
adds r3, r3, #1
str r3, [r7, #4]
.L2:
ldr r3, [r7, #4]
ldr r2, .L4
cmp r3, r2
bls .L3
nop
nop
mov sp, r7
add sp, sp, #8
# sp needed
pop {r7}
pop {r0}
bx r0
.L5:
.align 2
.L4:
.word 199999
08000200 <soft_delay>:
8000200: b580 push {r7, lr}
8000202: b082 sub sp, #8
8000204: af00 add r7, sp, #0
8000206: 2300 movs r3, #0
8000208: 607b str r3, [r7, #4]
800020a: e002 b.n 8000212 <soft_delay+0x12>
800020c: 687b ldr r3, [r7, #4]
800020e: 3301 adds r3, #1
8000210: 607b str r3, [r7, #4]
8000212: 687b ldr r3, [r7, #4]
8000214: 4a04 ldr r2, [pc, #16] ; (8000228 <soft_delay+0x28>)
8000216: 4293 cmp r3, r2
8000218: d9f8 bls.n 800020c <soft_delay+0xc>
800021a: 46c0 nop ; (mov r8, r8)
800021c: 46c0 nop ; (mov r8, r8)
800021e: 46bd mov sp, r7
8000220: b002 add sp, #8
8000222: bc80 pop {r7}
8000224: bc01 pop {r0}
8000226: 4700 bx r0
8000228: 00030d3f andeq r0, r3, pc, lsr sp
00124F9F
Immediately we see two things, first the stack frame which Kiel was not building and second these mystery nops after the compare, gotta be some chip errata or something, need to look that up. From my other answer that may be deleted by now gcc 5.4.0 put one nop, tcc 9.2.0 put two. so this loop has
1200031 / 100000 = 12 ticks per loop
800020c: 687b ldr r3, [r7, #4]
800020e: 3301 adds r3, #1
8000210: 607b str r3, [r7, #4]
8000212: 687b ldr r3, [r7, #4]
8000214: 4a04 ldr r2, [pc, #16] ; (8000228 <soft_delay+0x28>)
8000216: 4293 cmp r3, r2
8000218: d9f8 bls.n 800020c <soft_delay+0xc>
The main loop where this code spends its time is also 12 ticks like Kiel its the same just different registers which don't matter. The subtle overall time difference is that the stack frame and the extra nops make the gcc version slightly longer.
arm-none-eabi-gcc -O0 -fomit-frame-pointer -c -mthumb -mcpu=cortex-m0 hello.c -o hello.o
arm-none-eabi-objdump -D hello.o > hello.list
arm-none-eabi-gcc -O0 -fomit-frame-pointer -S -mthumb -mcpu=cortex-m0 hello.c
If I build without a frame pointer then gcc -O0 becomes
soft_delay:
sub sp, sp, #8
movs r3, #0
str r3, [sp, #4]
b .L2
.L3:
ldr r3, [sp, #4]
adds r3, r3, #1
str r3, [sp, #4]
.L2:
ldr r3, [sp, #4]
ldr r2, .L4
cmp r3, r2
bls .L3
nop
nop
add sp, sp, #8
bx lr
.L5:
.align 2
.L4:
.word 99999
08000200 <soft_delay>:
8000200: b082 sub sp, #8
8000202: 2300 movs r3, #0
8000204: 9301 str r3, [sp, #4]
8000206: e002 b.n 800020e <soft_delay+0xe>
8000208: 9b01 ldr r3, [sp, #4]
800020a: 3301 adds r3, #1
800020c: 9301 str r3, [sp, #4]
800020e: 9b01 ldr r3, [sp, #4]
8000210: 4a03 ldr r2, [pc, #12] ; (8000220 <soft_delay+0x20>)
8000212: 4293 cmp r3, r2
8000214: d9f8 bls.n 8000208 <soft_delay+0x8>
8000216: 46c0 nop ; (mov r8, r8)
8000218: 46c0 nop ; (mov r8, r8)
800021a: b002 add sp, #8
800021c: 4770 bx lr
800021e: 46c0 nop ; (mov r8, r8)
8000220: 0001869f
00124F94
and saves 11 clocks over the other gcc version unlike Kiel gcc is not doing the push pop thing so saving some clocks over Kiel but the nops don't help.
Update: I had the wrong number of loops for Kiel because it used unsigned lower instead of unsigned lower or same as with gcc. Even the playing field, remove the nops fix the loops gcc is 00124F92 and Kiel 00124F97 5 clocks slower due to the push/pop vs sp math. gcc 5.4.0 also does the sp math thing, with the nop 00124F93. Being outside the loop stuff these differences while measurable are also in the noise when comparing these two (three) compilers.
gcc -O1
soft_delay:
sub sp, sp, #8
mov r3, #0
str r3, [sp, #4]
ldr r2, [sp, #4]
ldr r3, .L5
cmp r2, r3
bhi .L1
mov r2, r3
.L3:
ldr r3, [sp, #4]
add r3, r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
cmp r3, r2
bls .L3
.L1:
add sp, sp, #8
bx lr
.L6:
.align 2
.L5:
.word 99999
08000200 <soft_delay>:
8000200: b082 sub sp, #8
8000202: 2300 movs r3, #0
8000204: 9301 str r3, [sp, #4]
8000206: 9a01 ldr r2, [sp, #4]
8000208: 4b05 ldr r3, [pc, #20] ; (8000220 <soft_delay+0x20>)
800020a: 429a cmp r2, r3
800020c: d806 bhi.n 800021c <soft_delay+0x1c>
800020e: 1c1a adds r2, r3, #0
8000210: 9b01 ldr r3, [sp, #4]
8000212: 3301 adds r3, #1
8000214: 9301 str r3, [sp, #4]
8000216: 9b01 ldr r3, [sp, #4]
8000218: 4293 cmp r3, r2
800021a: d9f9 bls.n 8000210 <soft_delay+0x10>
800021c: b002 add sp, #8
800021e: 4770 bx lr
8000220: 0001869f muleq r1, pc, r6 ; <UNPREDICTABLE>
000F4251
10 ticks per loop
8000210: 9b01 ldr r3, [sp, #4]
8000212: 3301 adds r3, #1
8000214: 9301 str r3, [sp, #4]
8000216: 9b01 ldr r3, [sp, #4]
8000218: 4293 cmp r3, r2
800021a: d9f9 bls.n 8000210 <soft_delay+0x10>
Same as Kiel the load of the compare value is outside the loop now saving a little per loop. It was architected a little different. And I believe the nops after the bls are something else. I just saw someone asking about why gcc did something that another didn't what seemed to be an extra instruction. I would use the term missed optimization vs bug, but either way this one doesn't have the nops...
gcc -O2 code
soft_delay:
mov r3, #0
sub sp, sp, #8
str r3, [sp, #4]
ldr r3, [sp, #4]
ldr r2, .L7
cmp r3, r2
bhi .L1
.L3:
ldr r3, [sp, #4]
add r3, r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
cmp r3, r2
bls .L3
.L1:
add sp, sp, #8
bx lr
.L8:
.align 2
.L7:
.word 99999
08000200 <soft_delay>:
8000200: 2300 movs r3, #0
8000202: b082 sub sp, #8
8000204: 9301 str r3, [sp, #4]
8000206: 9b01 ldr r3, [sp, #4]
8000208: 4a05 ldr r2, [pc, #20] ; (8000220 <soft_delay+0x20>)
800020a: 4293 cmp r3, r2
800020c: d805 bhi.n 800021a <soft_delay+0x1a>
800020e: 9b01 ldr r3, [sp, #4]
8000210: 3301 adds r3, #1
8000212: 9301 str r3, [sp, #4]
8000214: 9b01 ldr r3, [sp, #4]
8000216: 4293 cmp r3, r2
8000218: d9f9 bls.n 800020e <soft_delay+0xe>
800021a: b002 add sp, #8
800021c: 4770 bx lr
800021e: 46c0 nop ; (mov r8, r8)
8000220: 0001869f
000F4251
No difference from -O1
800020e: 9b01 ldr r3, [sp, #4]
8000210: 3301 adds r3, #1
8000212: 9301 str r3, [sp, #4]
8000214: 9b01 ldr r3, [sp, #4]
8000216: 4293 cmp r3, r2
8000218: d9f9 bls.n 800020e <soft_delay+0xe>
gcc is not willing to take that second load out of the loop.
at the -O2 level Kiel is 70005 ticks and gcc 1000017. 42 percent more/slower.
gcc -O3 produced the same code as -O2.
So the key difference here is perhaps an interpretation of what volatile does, and there are some folks at SO that get upset about its use anyway, but let's just assume that it means everything you do with the variable needs to go to/from memory.
From what I normally see that means this
.L3:
ldr r3, [sp, #4]
add r3, r3, #1
str r3, [sp, #4]
ldr r3, [sp, #4]
cmp r3, r2
bls .L3
not this
.L3:
ldr r3, [sp, #4]
add r3, r3, #1
str r3, [sp, #4]
cmp r3, r2
bls .L3
Is that a Kiel bug? Do you want to use your over-optimization term here?
There are two operations an increment
ldr r3, [sp, #4]
add r3, r3, #1
str r3, [sp, #4]
and a compare
ldr r3, [sp, #4]
cmp r3, r2
bls .L3
arguably each should access the variable from memory not from a register. (in a pure debug version sense you should see code like this too btw, although the tool defines what it means by debug version)
When you figure out which gcc you have and how it was used it may account for even more code on the gcc side being the 100% slower not 40%.
I don't know that you could make this any tighter, I don't think re-arranging instructions will improve performance either.
Also, this was a missed optimization in gcc:
cmp r3, r2
bhi .L1
gcc knew that it was starting from zero and knew it was going to a bigger number so r3 would never be larger than r2 here.
We wish for the tool to make this:
soft_delay:
mov r3, #0
ldr r2, .L7
.L3:
add r3, r3, #1
cmp r3, r2
bls .L3
.L1:
bx lr
.L8:
.align 2
.L7:
.word 99999
00061A88
at 4 instructions per loop on average
but without the volatile it is dead code so the optimizer would simply remove it rather than make this code. A down count loop would be slightly smaller
soft_delay:
ldr r2, .L7
.L3:
sub r2, r2, #1
bne .L3
.L1:
bx lr
.L8:
.align 2
.L7:
.word 100000
000493E7
3 ticks per loop, removing the extra instruction helped.
Keil over-optimization (unlikely, because the code is very simple)
You might actually be right here, not because it is simple, but what does volatile really mean, and is it subject to interpretation by the compilers (I would have to find a spec). Is this a Kiel bug, did it over optimize?
There still isn't such a thing as over-optimization, there is a name for that, a compiler bug. So did Kiel interpret this wrong or Kiel and gcc disagree on the interpretation of volatile.
arm-none-eabi-gcc under-optimization due to wrong compiler flags (I use CLion Embedded plugins` CMakeLists.txt)
This could be it as well, for the same reason. Is this simply an "implementation defined" difference between compilers and both are right based on their definition?
Now gcc did miss an optimization here (or two), but it accounts for a small amount as it is outside the loop.

GCC ||| KEIL
|||
soft_delay: |||
mov r3, #0 |||
sub sp, sp, #8 |||
str r3, [sp, #4] |||
ldr r3, [sp, #4] |||
ldr r2, .L7 |||
cmp r3, r2 |||
bhi .L1 ||| soft_delay PROC
.L3: ||| PUSH {r3,lr}
ldr r3, [sp, #4] ||| MOVS r0,#0
add r3, r3, #1 ||| STR r0,[sp,#0]
str r3, [sp, #4] ||| LDR r0,|L5.20|
ldr r3, [sp, #4] ||| |L5.8|
cmp r3, r2 ||| LDR r1,[sp,#0]
bls .L3 ||| ADDS r1,r1,#1
.L1: ||| STR r1,[sp,#0]
add sp, sp, #8 ||| CMP r1,r0
bx lr ||| BCC |L5.8|
.L7: ||| POP {r3,pc}
.word 1999999 ||| ENDP
There is obvious bug in KEIL. volatile means that its value has to be loaded before every use and saved when changed. ? Keil is missing one load.
The variable is used 2 times: 1: when increased, 2: when compared. Two loads needed.

Related

Self written simple memset not working with -03 eabi gcc on ARMv7

I wrote a very simple memset in c that works fine up to -O2 but not with -O3...
memset:
void * memset(void * blk, int c, size_t n)
{
unsigned char * dst = blk;
while (n-- > 0)
*dst++ = (unsigned char)c;
return blk;
}
...which compiles to this assembly when using -O2:
20000430 <memset>:
20000430: e3520000 cmp r2, #0 # compare param 'n' with zero
20000434: 012fff1e bxeq lr # if equal return to caller
20000438: e6ef1071 uxtb r1, r1 # else zero extend (extract byte from) param 'c'
2000043c: e0802002 add r2, r0, r2 # add pointer 'blk' to 'n'
20000440: e1a03000 mov r3, r0 # move pointer 'blk' to r3
20000444: e4c31001 strb r1, [r3], #1 # store value of 'c' to address of r3, increment r3 for next pass
20000448: e1530002 cmp r3, r2 # compare current store address to calculated max address
2000044c: 1afffffc bne 20000444 <memset+0x14> # if not equal store next byte
20000450: e12fff1e bx lr # else back to caller
This makes sense to me. I annotated what happens here.
When I compile it with -O3 the program crashes. My memset calls itself repeatedly until it ate the whole stack:
200005e4 <memset>:
200005e4: e3520000 cmp r2, #0 # compare param 'n' with zero
200005e8: e92d4010 push {r4, lr} # ? (1)
200005ec: e1a04000 mov r4, r0 # move pointer 'blk' to r4 (temp to hold return value)
200005f0: 0a000001 beq 200005fc <memset+0x18> # if equal (first line compare) jump to epilogue
200005f4: e6ef1071 uxtb r1, r1 # zero extend (extract byte from) param 'c'
200005f8: ebfffff9 bl 200005e4 <memset> # call myself ? (2)
200005fc: e1a00004 mov r0, r4 # epilogue start. move return value to r0
20000600: e8bd8010 pop {r4, pc} # restore r4 and back to caller
I can't figure out how this optimised version is supposed to work without any strb or similar. It doesn't matter if I try to set the memory to '0' or something else so the function is not only called on .bss (zero initialised) variables.
(1) This is a problem. This push gets endlessly repeated without a matching pop as it's called by (2) when the function doesn't early-exit because of 'n' being zero. I verified this with uart prints. Also r2 is never touched so why should the compare to zero ever become true?
Please help me understand what's happening here. Is the compiler assuming prerequisites that I may not fulfill?
Background: I'm using external code that requires memset in my baremetal project so I rolled my own. It's only used once on startup and not performance critical.
/edit: The compiler is called with these options:
arm-none-eabi-gcc -O3 -Wall -Wextra -fPIC -nostdlib -nostartfiles -marm -fstrict-volatile-bitfields -march=armv7-a -mcpu=cortex-a9 -mfloat-abi=hard -mfpu=neon-vfpv3

Your first question (1). That is per the calling convention if you are going to make a nested function call you need to preserve the link register, and you need to be 64 bit aligned. The code uses r4 so that is the extra register saved. No magic there.
Your second question (2) it is not calling your memset it is optimizing your code because it sees it as an inefficient memset. Fuz has provided the answers to your question.
Rename the function
00000000 <xmemset>:
0: e3520000 cmp r2, #0
4: e92d4010 push {r4, lr}
8: e1a04000 mov r4, r0
c: 0a000001 beq 18 <xmemset+0x18>
10: e6ef1071 uxtb r1, r1
14: ebfffffe bl 0 <memset>
18: e1a00004 mov r0, r4
1c: e8bd8010 pop {r4, pc}
and you can see this.
If you were to use -ffreestanding as Fuz recommended then you see this or something like it
00000000 <xmemset>:
0: e3520000 cmp r2, #0
4: 012fff1e bxeq lr
8: e92d41f0 push {r4, r5, r6, r7, r8, lr}
c: e2426001 sub r6, r2, #1
10: e3560002 cmp r6, #2
14: e6efe071 uxtb lr, r1
18: 9a00002a bls c8 <xmemset+0xc8>
1c: e3a0c000 mov r12, #0
20: e3520023 cmp r2, #35 ; 0x23
24: e7c7c01e bfi r12, lr, #0, #8
28: e1a04122 lsr r4, r2, #2
2c: e7cfc41e bfi r12, lr, #8, #8
30: e7d7c81e bfi r12, lr, #16, #8
34: e7dfcc1e bfi r12, lr, #24, #8
38: 9a000024 bls d0 <xmemset+0xd0>
3c: e2445009 sub r5, r4, #9
40: e1a03000 mov r3, r0
44: e3c55007 bic r5, r5, #7
48: e3a07000 mov r7, #0
4c: e2851008 add r1, r5, #8
50: e1570005 cmp r7, r5
54: f5d3f0a0 pld [r3, #160] ; 0xa0
58: e1a08007 mov r8, r7
5c: e583c000 str r12, [r3]
60: e583c004 str r12, [r3, #4]
64: e2877008 add r7, r7, #8
68: e583c008 str r12, [r3, #8]
6c: e2833020 add r3, r3, #32
70: e503c014 str r12, [r3, #-20] ; 0xffffffec
74: e503c010 str r12, [r3, #-16]
78: e503c00c str r12, [r3, #-12]
7c: e503c008 str r12, [r3, #-8]
80: e503c004 str r12, [r3, #-4]
84: 1afffff1 bne 50 <xmemset+0x50>
88: e2811001 add r1, r1, #1
8c: e483c004 str r12, [r3], #4
90: e1540001 cmp r4, r1
94: 8afffffb bhi 88 <xmemset+0x88>
98: e3c23003 bic r3, r2, #3
9c: e1520003 cmp r2, r3
a0: e0466003 sub r6, r6, r3
a4: e0803003 add r3, r0, r3
a8: 08bd81f0 popeq {r4, r5, r6, r7, r8, pc}
ac: e3560000 cmp r6, #0
b0: e5c3e000 strb lr, [r3]
b4: 08bd81f0 popeq {r4, r5, r6, r7, r8, pc}
b8: e3560001 cmp r6, #1
bc: e5c3e001 strb lr, [r3, #1]
c0: 15c3e002 strbne lr, [r3, #2]
c4: e8bd81f0 pop {r4, r5, r6, r7, r8, pc}
c8: e1a03000 mov r3, r0
cc: eafffff6 b ac <xmemset+0xac>
d0: e1a03000 mov r3, r0
d4: e3a01000 mov r1, #0
d8: eaffffea b 88 <xmemset+0x88>
which appears like it simply inlined memset, the one it knows not your code (the faster one).
So if you want it to use your code then stick with -O2. Yours is pretty inefficient so not sure why you need to push it any further than it was.
20000444: e4c31001 strb r1, [r3], #1 # store value of 'c' to address of r3, increment r3 for next pass
20000448: e1530002 cmp r3, r2 # compare current store address to calculated max address
2000044c: 1afffffc bne 20000444 <memset+0x14> # if not equal store next byte
It isn't going to get any better than that without replacing your code with something else.
Fuz already answered the question:
Compile with -fno-builtin-memset. The compiler recognises that the function implements memset and thus replaces it with a call to memset. You should in general compile with -ffreestanding when writing bare-metal code. I believe this fixes this sort of problem, too
It is replacing your code with memset, if you want it not to do that use -ffreestanding.
If you wish to go beyond that and wonder why -fno-builtin-memset didn't work that is a question for the gcc folks, file a ticket, let us know what they say (or just look at the compiler source code).

C - occasional CPU stall during memcmp on Cortex-R5

I'm running some tests on a Cortex-R5 (Ultrascale MpSoC). It basically generates 2 random numbers with a hardware module and compares them at the end to ensure they're not 0, nor the same values.
uint32_t status;
const uint8_t zeros[32] = {0};
uint8_t bytes1[32] = {0};
uint8_t bytes2[32] = {0};
// (generate random numbers and put them in bytes1)
// (generate random numbers and put them in bytes2)
printf("memcmp 0\n");
status = !memcmp(bytes1, bytes2, 32);
printf("memcmp 1\n");
status |= !memcmp(bytes1, zeros, 32);
printf("memcmp 2\n");
status |= !memcmp(bytes2, zeros, 32);
Some tests are running fine. Some executions are stalled after printing "memcmp 0" (when it freezes, it's always at the first memcmp)...
I have tried several things:
When I print the values in bytes1 and 2, they are indeed random numbers not equal to 0 and not equal with each other.
Moving the memcmp at different places, or switching the memcmp's. It's always the first one which freezes.
Replacing memcmp with a custom function to do comparison => it never freezes.
The memcmp function is used at other places of the code and it freezes nowhere else. Perhaps the difference is that the random check is the only place where the memcmp expects different values (at other places it's to ensure a function produces expected output).
I couldn't find the definition of memcmp... I don't know where to look. The only thing I could find is the assembly code, but it'd be difficult to attach a debugger to know exactly which instruction can't complete.
000064d0 <memcmp>:
64d0: 2a03 cmp r2, #3
64d2: b470 push {r4, r5, r6}
64d4: d912 bls.n 64fc <memcmp+0x2c>
64d6: ea40 0501 orr.w r5, r0, r1
64da: 4604 mov r4, r0
64dc: 07ad lsls r5, r5, #30
64de: 460b mov r3, r1
64e0: d120 bne.n 6524 <memcmp+0x54>
64e2: 681d ldr r5, [r3, #0]
64e4: 4619 mov r1, r3
64e6: 6826 ldr r6, [r4, #0]
64e8: 4620 mov r0, r4
64ea: 3304 adds r3, #4
64ec: 3404 adds r4, #4
64ee: 42ae cmp r6, r5
64f0: d118 bne.n 6524 <memcmp+0x54>
64f2: 3a04 subs r2, #4
64f4: 4620 mov r0, r4
64f6: 2a03 cmp r2, #3
64f8: 4619 mov r1, r3
64fa: d8f2 bhi.n 64e2 <memcmp+0x12>
64fc: 1e54 subs r4, r2, #1
64fe: b172 cbz r2, 651e <memcmp+0x4e>
6500: 7802 ldrb r2, [r0, #0]
6502: 780b ldrb r3, [r1, #0]
6504: 429a cmp r2, r3
6506: bf08 it eq
6508: 1864 addeq r4, r4, r1
650a: d006 beq.n 651a <memcmp+0x4a>
650c: e00c b.n 6528 <memcmp+0x58>
650e: f810 2f01 ldrb.w r2, [r0, #1]!
6512: f811 3f01 ldrb.w r3, [r1, #1]!
6516: 429a cmp r2, r3
6518: d106 bne.n 6528 <memcmp+0x58>
651a: 42a1 cmp r1, r4
651c: d1f7 bne.n 650e <memcmp+0x3e>
651e: 2000 movs r0, #0
6520: bc70 pop {r4, r5, r6}
6522: 4770 bx lr
6524: 1e54 subs r4, r2, #1
6526: e7eb b.n 6500 <memcmp+0x30>
6528: 1ad0 subs r0, r2, r3
652a: bc70 pop {r4, r5, r6}
652c: 4770 bx lr
652e: bf00 nop
Where can I see the source code of memcmp for cortex R5? FYI, the used compiler is armr5-none-eabi-gcc.
Any idea what could cause a CPU stall with this function?
Thank you

qemu-arm branches to a seemingly abstract instruction

I am trying to build a binary translator for arm bear metal compiled code and I try to verify proper execution flow by comparing it to that of qemu-arm. I use the following command to dump the program flow:
qemu-arm -d in_asm,cpu -singlestep -D a.flow a.out
I noticed something strange, where the program seems to jump to an irrelevant instruction, since 0x000080b4 is not the branch nor the next instruction following 0x000093ec.
0x000093ec: 1afffff9 bne 0x93d8
R00=00000000 R01=00009c44 R02=00000002 R03=00000000
R04=00000001 R05=0001d028 R06=00000002 R07=00000000
R08=00000000 R09=00000000 R10=0001d024 R11=00000000
R12=f6ffed88 R13=f6ffed88 R14=000093e8 R15=000093ec
PSR=20000010 --C- A usr32
R00=00000000 R01=00009c44 R02=00000002 R03=00000000
R04=00000001 R05=0001d028 R06=00000002 R07=00000000
R08=00000000 R09=00000000 R10=0001d024 R11=00000000
R12=f6ffed88 R13=f6ffed88 R14=000093e8 R15=000093d8
PSR=20000010 --C- A usr32
----------------
IN:
0x000080b4: e59f3060 ldr r3, [pc, #96] ; 0x811c
The instruction that actually executes corresponds to the beggining of the <frame_dummy> tag in the disassembly. Can someone explain what actually happens within the emulator and is this behavior normal in the ARM architecture? The program was compiled with: arm-none-eabi-gcc --specs=rdimon.specs a.c
Here is the same segment of the program flow without the CPU state:
0x0000804c: e59f3018 ldr r3, [pc, #24] ; 0x806c
0x00008050: e3530000 cmp r3, #0 ; 0x0
0x00008054: 01a0f00e moveq pc, lr
----------------
IN: __libc_init_array
0x000093e8: e1560004 cmp r6, r4
0x000093ec: 1afffff9 bne 0x93d8
----------------
IN:
0x000080b4: e59f3060 ldr r3, [pc, #96] ; 0x811c
0x000080b8: e3530000 cmp r3, #0 ; 0x0
0x000080bc: 0a000009 beq 0x80e8
This is the disassembly of this part:
93d4: 0a000005 beq 93f0 <__libc_init_array+0x68>
93d8: e2844001 add r4, r4, #1
93dc: e4953004 ldr r3, [r5], #4
93e0: e1a0e00f mov lr, pc
93e4: e1a0f003 mov pc, r3
93e8: e1560004 cmp r6, r4
93ec: 1afffff9 bne 93d8 <__libc_init_array+0x50>
93f0: e8bd4070 pop {r4, r5, r6, lr}

It is a reverse jump to previously emitted TB, you don't even have to read that much back:
IN: __libc_init_array
0x000093d8: e2844001 add r4, r4, #1 ; 0x1
0x000093dc: e4953004 ldr r3, [r5], #4
0x000093e0: e1a0e00f mov lr, pc
0x000093e4: e1a0f003 mov pc, r3
----------------
IN: register_fini
0x0000804c: e59f3018 ldr r3, [pc, #24] ; 0x806c
0x00008050: e3530000 cmp r3, #0 ; 0x0
0x00008054: 01a0f00e moveq pc, lr
----------------
IN: __libc_init_array
0x000093e8: e1560004 cmp r6, r4
0x000093ec: 1afffff9 bne 0x93d8
So, qemu is not showing it again. Notice this loop is iterating function pointers, the first one points to register_fini and the second one to the magical 0x000080b4 address in question (no symbol for it). When this unnamed function conditionally returns with moveq pc, lr control is transferred back to __libc_init_array address 0x000093e8 which then determines that the array end has been reached and again just returns to its caller at 0x000093f0.

ARM-C Inter-working

I am trying out a simple program for ARM-C inter-working. Here is the code:
#include<stdio.h>
#include<stdlib.h>
int Double(int a);
extern int Start(void);
int main(){
int result=0;
printf("in C main\n");
result=Start();
printf("result=%d\n",result);
return 0;
}
int Double(int a)
{
printf("inside double func_argument_value=%d\n",a);
return (a*2);
}
The assembly file goes as-
.syntax unified
.cpu cortex-m3
.thumb
.align
.global Start
.global Double
.thumb_func
Start:
mov r10,lr
mov r0,#42
bl Double
mov lr,r10
mov r2,r0
mov pc,lr
During debugging on LPC1769(embedded artists board), I get an hardfault error on the instruction " result=Start(). " I am trying to do an arm-C internetworking here. the lr value during the execution of the above the statement(result=Start()) is 0x0000029F, where the faulting instruction is,and the pc value is 0x0000029E.
This is how I got the faulting instruction in r1
__asm("mrs r0,MSP\n"
"isb\n"
"ldr r1,[r0,#24]\n");
Can anybody please explain where I am going wrong? Any solution is appreciated.
Thank you in advance.
I am a beginner in cortex-m3 & am using the NXP LPCXpresso IDE powered by Code_Red.
Here is the disassembly of my code.
IntDefaultHandler:
00000269: push {r7}
0000026b: add r7, sp, #0
0000026d: b.n 0x26c <IntDefaultHandler+4>
0000026f: nop
00000271: mov r3, lr
00000273: mov.w r0, #42 ; 0x2a
00000277: bl 0x2c0 <Double>
0000027b: mov lr, r3
0000027d: mov r2, r0
0000027f: mov pc, lr
main:
00000281: push {r7, lr}
00000283: sub sp, #8
00000285: add r7, sp, #0
00000287: mov.w r3, #0
0000028b: str r3, [r7, #4]
0000028d: movw r3, #11212 ; 0x2bcc
00000291: movt r3, #0
00000295: mov r0, r3
00000297: bl 0xd64 <printf>
0000029b: bl 0x270 <Start>
0000029f: mov r3, r0
000002a1: str r3, [r7, #4]
000002a3: movw r3, #11224 ; 0x2bd8
000002a7: movt r3, #0
000002ab: mov r0, r3
000002ad: ldr r1, [r7, #4]
000002af: bl 0xd64 <printf>
000002b3: mov.w r3, #0
000002b7: mov r0, r3
000002b9: add.w r7, r7, #8
000002bd: mov sp, r7
000002bf: pop {r7, pc}
Double:
000002c0: push {r7, lr}
000002c2: sub sp, #8
000002c4: add r7, sp, #0
000002c6: str r0, [r7, #4]
000002c8: movw r3, #11236 ; 0x2be4
000002cc: movt r3, #0
000002d0: mov r0, r3
000002d2: ldr r1, [r7, #4]
000002d4: bl 0xd64 <printf>
000002d8: ldr r3, [r7, #4]
000002da: mov.w r3, r3, lsl #1
000002de: mov r0, r3
000002e0: add.w r7, r7, #8
000002e4: mov sp, r7
000002e6: pop {r7, pc}
As per your advice Dwelch, I have changed the r10 to r3.

I assume you mean interworking not internetworking? The LPC1769 is a cortex-m3 which is thumb/thumb2 only so it doesnt support arm instructions so there is no interworking available for that platform. Nevertheless, playing with the compiler to see what goes on:
Get the compiler to do it for you first, then try it yourself in asm...
start.s
.thumb
.globl _start
_start:
ldr r0,=hello
mov lr,pc
bx r0
hang : b hang
hello.c
extern unsigned int two ( unsigned int );
unsigned int hello ( unsigned int h )
{
return(two(h)+7);
}
two.c
unsigned int two ( unsigned int t )
{
return(t+5);
}
Makefile
hello.list : start.s hello.c two.c
arm-none-eabi-as -mthumb start.s -o start.o
arm-none-eabi-gcc -c -O2 hello.c -o hello.o
arm-none-eabi-gcc -c -O2 -mthumb two.c -o two.o
arm-none-eabi-ld -Ttext=0x1000 start.o hello.o two.o -o hello.elf
arm-none-eabi-objdump -D hello.elf > hello.list
clean :
rm -f *.o
rm -f *.elf
rm -f *.list
produces hello.list
Disassembly of section .text:
00001000 <_start>:
1000: 4801 ldr r0, [pc, #4] ; (1008 <hang+0x2>)
1002: 46fe mov lr, pc
1004: 4700 bx r0
00001006 <hang>:
1006: e7fe b.n 1006 <hang>
1008: 0000100c andeq r1, r0, ip
0000100c <hello>:
100c: e92d4008 push {r3, lr}
1010: eb000004 bl 1028 <__two_from_arm>
1014: e8bd4008 pop {r3, lr}
1018: e2800007 add r0, r0, #7
101c: e12fff1e bx lr
00001020 <two>:
1020: 3005 adds r0, #5
1022: 4770 bx lr
1024: 0000 movs r0, r0
...
00001028 <__two_from_arm>:
1028: e59fc000 ldr ip, [pc] ; 1030 <__two_from_arm+0x8>
102c: e12fff1c bx ip
1030: 00001021 andeq r1, r0, r1, lsr #32
1034: 00000000 andeq r0, r0, r0
hello.o disassembled by itself:
00000000 <hello>:
0: e92d4008 push {r3, lr}
4: ebfffffe bl 0 <two>
8: e8bd4008 pop {r3, lr}
c: e2800007 add r0, r0, #7
10: e12fff1e bx lr
the compiler uses bl assuming/hoping it will be calling arm from arm. but it didnt, so what they did was put a trampoline in there.
0000100c <hello>:
100c: e92d4008 push {r3, lr}
1010: eb000004 bl 1028 <__two_from_arm>
1014: e8bd4008 pop {r3, lr}
1018: e2800007 add r0, r0, #7
101c: e12fff1e bx lr
00001028 <__two_from_arm>:
1028: e59fc000 ldr ip, [pc] ; 1030 <__two_from_arm+0x8>
102c: e12fff1c bx ip
1030: 00001021 andeq r1, r0, r1, lsr #32
1034: 00000000 andeq r0, r0, r0
the bl to __two_from_arm is an arm mode to arm mode branch link. the address of the destination function (two) with the lsbit set, which tells bx to switch to thumb mode, is loaded into the disposable register ip (r12?) then the bx ip happens switching modes. the branch link had setup the return address in lr, which was an arm mode address no doubt (lsbit zero).
00001020 <two>:
1020: 3005 adds r0, #5
1022: 4770 bx lr
1024: 0000 movs r0, r0
the two() function does its thing and returns, note you have to use bx lr not mov pc,lr when interworking. Basically if you are not running an ARMv4 without the T, or an ARMv5 without the T, mov pc,lr is an okay habit. But anything ARMv4T or newer (ARMv5T or newer) use bx lr to return from a function unless you have a special reason not to. (avoid using pop {pc} as well for the same reason unless you really need to save that instruction and are not interworking). Now being on a cortex-m3 which is thumb+thumb2 only, well you cant interwork so you can use mov pc,lr and pop {pc}, but the code is not portable, and it is not a good habit as that habit will bite you when you switch back to arm programming.
So since hello was in arm mode when it used bl which is what set the link register, the bx in two_from_arm does not touch the link register, so when two() returns with a bx lr it is returning to arm mode after the bl __two_from_arm line in the hello() function.
Also note the extra 0x0000 after the thumb function, this was to align the program on a word boundary so that the following arm code was aligned...
to see how the compiler does thumb to arm change two as follows
unsigned int three ( unsigned int );
unsigned int two ( unsigned int t )
{
return(three(t)+5);
}
and put that function in hello.c
extern unsigned int two ( unsigned int );
unsigned int hello ( unsigned int h )
{
return(two(h)+7);
}
unsigned int three ( unsigned int t )
{
return(t+3);
}
and now we get another trampoline
00001028 <two>:
1028: b508 push {r3, lr}
102a: f000 f80b bl 1044 <__three_from_thumb>
102e: 3005 adds r0, #5
1030: bc08 pop {r3}
1032: bc02 pop {r1}
1034: 4708 bx r1
1036: 46c0 nop ; (mov r8, r8)
...
00001044 <__three_from_thumb>:
1044: 4778 bx pc
1046: 46c0 nop ; (mov r8, r8)
1048: eafffff4 b 1020 <three>
104c: 00000000 andeq r0, r0, r0
Now this is a very cool trampoline. the bl to three_from_thumb is in thumb mode and the link register is set to return to the two() function with the lsbit set no doubt to indicate to return to thumb mode.
The trampoline starts with a bx pc, pc is set to two instructions ahead and the pc internally always has the lsbit clear so a bx pc will always take you to arm mode if not already in arm mode, and in either mode two instructions ahead. Two instructions ahead of the bx pc is an arm instruction that branches (not branch link!) to the three function, completing the trampoline.
Notice how I wrote the call to hello() in the first place
_start:
ldr r0,=hello
mov lr,pc
bx r0
hang : b hang
that actually wont work will it? It will get you from arm to thumb but not from thumb to arm. I will leave that as an exercise for the reader.
If you change start.s to this
.thumb
.globl _start
_start:
bl hello
hang : b hang
the linker takes care of us:
00001000 <_start>:
1000: f000 f820 bl 1044 <__hello_from_thumb>
00001004 <hang>:
1004: e7fe b.n 1004 <hang>
...
00001044 <__hello_from_thumb>:
1044: 4778 bx pc
1046: 46c0 nop ; (mov r8, r8)
1048: eaffffee b 1008 <hello>
I would and do always disassemble programs like these to make sure the compiler and linker resolved these issues. Also note that for example __hello_from_thumb can be used from any thumb function, if I call hello from several places, some arm, some thumb, and hello was compiled for arm, then the arm calls would call hello directly (if they can reach it) and all the thumb calls would share the same hello_from_thumb (if they can reach it).
The compiler in these examples was assuming code that stays in the same mode (simple branch link) and the linker added the interworking code...
If you really meant inter-networking and not interworking, then please describe what that is and I will delete this answer.
EDIT:
You were using a register to preserve lr during the call to Double, that will not work, no register will work for that you need to use memory, and the easiest is the stack. See how the compiler does it:
00001008 <hello>:
1008: e92d4008 push {r3, lr}
100c: eb000009 bl 1038 <__two_from_arm>
1010: e8bd4008 pop {r3, lr}
1014: e2800007 add r0, r0, #7
1018: e12fff1e bx lr
r3 is pushed likely to align the stack on a 64 bit boundary (makes it faster). the thing to notice is the link register is preserved on the stack, but the pop does not pop to pc because this is not an ARMv4 build, so a bx is needed to return from the function. Because this is arm mode we can pop to lr and simply bx lr.
For thumb you can only push r0-r7 and lr directly and pop r0-r7 and pc directly you dont want to pop to pc because that only works if you are staying in the same mode (thumb or arm). this is fine for a cortex-m, or fine if you know what all of your callers are, but in general bad. So
00001024 <two>:
1024: b508 push {r3, lr}
1026: f000 f811 bl 104c <__three_from_thumb>
102a: 3005 adds r0, #5
102c: bc08 pop {r3}
102e: bc02 pop {r1}
1030: 4708 bx r1
same deal r3 is used as a dummy register to keep the stack aligned for performance (I used the default build for gcc 4.8.0 which is likely a platform with a 64 bit axi bus, specifying the architecture might remove that extra register). Because we cannot pop pc, I assume because r1 and r3 would be out of order and r3 was chosen (they could have chosen r2 and saved an instruction) there are two pops, one to get rid of the dummy value on the stack and the other to put the return value in a register so that they can bx to it to return.
Your Start function does not conform to the ABI and as a result when you mix it in with such large libraries as a printf call, no doubt you will crash. If you didnt it was dumb luck. Your assembly listing of main shows that neither r4 nor r10 were used and assuming main() is not called other than the bootstrap, then that is why you got away with either r4 or r10.
If this really is an LPC1769 this this whole discussion is irrelevant as it does not support ARM and does not support interworking (interworking = mixing of ARM mode code and thumb mode code). Your problem was unrelated to interworking, you are not interworking (note the pop {pc} at the end of the functions). Your problem was likely related to your assembly code.
EDIT2:
Changing the makefile to specify the cortex-m
00001008 <hello>:
1008: b508 push {r3, lr}
100a: f000 f805 bl 1018 <two>
100e: 3007 adds r0, #7
1010: bd08 pop {r3, pc}
1012: 46c0 nop ; (mov r8, r8)
00001014 <three>:
1014: 3003 adds r0, #3
1016: 4770 bx lr
00001018 <two>:
1018: b508 push {r3, lr}
101a: f7ff fffb bl 1014 <three>
101e: 3005 adds r0, #5
1020: bd08 pop {r3, pc}
1022: 46c0 nop ; (mov r8, r8)
first and foremost it is all thumb since there is no arm mode on a cortex-m, second the bx is not needed for function returns (Because there are no arm/thumb mode changes). So pop {pc} will work.
it is curious that the dummy register is still used on a push, I tried an arm7tdmi/armv4t build and it still did that, so there is some other flag to use to get rid of that behavior.
If your desire was to learn how to make an assembly function that you can call from C, you should have just done that. Make a C function that somewhat resembles the framework of the function you want to create in asm:
extern unsigned int Double ( unsigned int );
unsigned int Start ( void )
{
return(Double(42));
}
assemble then disassemble
00000000 <Start>:
0: b508 push {r3, lr}
2: 202a movs r0, #42 ; 0x2a
4: f7ff fffe bl 0 <Double>
8: bd08 pop {r3, pc}
a: 46c0 nop ; (mov r8, r8)
and start with that as you assembly function.
.globl Start
.thumb_func
Start:
push {lr}
mov r0, #42
bl Double
pop {pc}
That, or read the arm abi for gcc and understand what registers you can and cant use without saving them on the stack, what registers are used for passing and returning parameters.

_start function is the entry point of a C program which makes a call to main(). In order to debug _start function of any C program is in the assembly file. Actually the real entry point of a program on linux is not main(), but rather a function called _start(). The standard libraries normally provide a version of this that runs some initialization code, then calls main().
Try compiling this with gcc -nostdlib:

Local variable location in memory

For a homework assignment I have been given some c files, and compiled them using arm-linux-gcc (we will eventually be targeting gumstix boards, but for these exercises we have been working with qemu and ema).
One of the questions confuses me a bit-- we are told to:
Use arm-linux-objdump to find the location of variables declared in main() in the executable binary.
However, these variables are local and thus shouldn't have addresses until runtime, correct?
I'm thinking that maybe what I need to find is the offset in the stack frame, which can in fact be found using objdump (not that I know how).
Anyways, any insight into the matter would be greatly appreciated, and I would be happy to post the source code if necessary.

unsigned int one ( unsigned int, unsigned int );
unsigned int two ( unsigned int, unsigned int );
unsigned int myfun ( unsigned int x, unsigned int y, unsigned int z )
{
unsigned int a,b;
a=one(x,y);
b=two(a,z);
return(a+b);
}
compile and disassemble
arm-none-eabi-gcc -c fun.c -o fun.o
arm-none-eabi-objdump -D fun.o
code created by compiler
00000000 <myfun>:
0: e92d4800 push {fp, lr}
4: e28db004 add fp, sp, #4
8: e24dd018 sub sp, sp, #24
c: e50b0010 str r0, [fp, #-16]
10: e50b1014 str r1, [fp, #-20]
14: e50b2018 str r2, [fp, #-24]
18: e51b0010 ldr r0, [fp, #-16]
1c: e51b1014 ldr r1, [fp, #-20]
20: ebfffffe bl 0 <one>
24: e50b0008 str r0, [fp, #-8]
28: e51b0008 ldr r0, [fp, #-8]
2c: e51b1018 ldr r1, [fp, #-24]
30: ebfffffe bl 0 <two>
34: e50b000c str r0, [fp, #-12]
38: e51b2008 ldr r2, [fp, #-8]
3c: e51b300c ldr r3, [fp, #-12]
40: e0823003 add r3, r2, r3
44: e1a00003 mov r0, r3
48: e24bd004 sub sp, fp, #4
4c: e8bd4800 pop {fp, lr}
50: e12fff1e bx lr
Short answer is the memory is "allocated" both at compile time and at run time. At compile time in the sense that the compiler at compile time determines the size of the stack frame and who goes where. Run time in the sense that the memory itself is on the stack which is a dynamic thing. The stack frame is taken from stack memory at run time, almost like a malloc() and free().
It helps to know the calling convention, x enters in r0, y in r1, z in r2. then x has its home at fp-16, y at fp-20, and z at fp-24. then the call to one() needs x and y so it pulls those from the stack (x and y). the result of one() goes into a which is saved at fp-8 so that is the home for a. and so on.
the function one is not really at address 0, this is a disassembly of an object file not a linked binary. once an object is linked in with the rest of the objects and libraries, the missing parts, like where external functions are, are patched in by the linker and the calls to one() and two() will get real addresses. (and the program will likely not start at address 0).
I cheated here a little, I knew that with no optimizations enabled on the compiler and a relatively simple function like this there really is no reason for a stack frame:
compile with just a little optimization
arm-none-eabi-gcc -O1 -c fun.c -o fun.o
arm-none-eabi-objdump -D fun.o
and the stack frame is gone, the local variables remain in registers.
00000000 :
0: e92d4038 push {r3, r4, r5, lr}
4: e1a05002 mov r5, r2
8: ebfffffe bl 0
c: e1a04000 mov r4, r0
10: e1a01005 mov r1, r5
14: ebfffffe bl 0
18: e0800004 add r0, r0, r4
1c: e8bd4038 pop {r3, r4, r5, lr}
20: e12fff1e bx lr
what the compiler decided to do instead is give itself more registers to work with by saving them on the stack. Why it saved r3 is a mystery, but that is another topic...
entering the function r0 = x, r1 = y and r2 = z per the calling convention, we can leave r0 and r1 alone (try again with one(y,x) and see what happens) since they drop right into one() and are never used again. The calling convention says that r0-r3 can be destroyed by a function, so we need to preserve z for later so we save it in r5. The result of one() is r0 per the calling convention, since two() can destroy r0-r3 we need to save a for later, after the call to two() also we need r0 for the call to two anyway, so r4 now holds a. We saved z in r5 (was in r2 moved to r5) before the call to one, we need the result of one() as the first parameter to two(), and it is already there, we need z as the second so we move r5 where we had saved z to r1, then we call two(). the result of two() per the calling convention. Since b + a = a + b from basic math properties the final add before returning is r0 + r4 which is b + a, and the result goes in r0 which is the register used to return something from a function, per the convention. clean up the stack and restore the modified registers, done.
Since myfun() made calls to other functions using bl, bl modifies the link register (r14), in order to be able to return from myfun() we need the value in the link register to be preserved from the entry into the function to the final return (bx lr), so lr is pushed on the stack. The convention states that we can destroy r0-r3 in our function but not other registers so r4 and r5 are pushed on the stack because we used them. why r3 is pushed on the stack is not necessary from a calling convention perspective, I wonder if it was done in anticipation of a 64 bit memory system, making two full 64 bit writes is cheaper than one 64 bit write and one 32 bit right. but you would need to know the alignment of the stack going in so that is just a theory. There is no reason to preserve r3 in this code.
Now take this knowledge and disassemble the code assigned (arm-...-objdump -D something.something) and do the same kind of analysis. particularly with functions named main() vs functions not named main (I did not use main() on purpose) the stack frame can be a size that doesnt make sense, or less sense than other functions. In the non optimized case above we needed to store 6 things total, x,y,z,a,b and the link register 6*4 = 24 bytes which resulted in sub sp, sp, #24, I need to think about the stack pointer vs frame pointer
thing for a bit. I think there is a command line argument to tell the compiler not to use a frame pointer. -fomit-frame-pointer and it saves a couple of instructions
00000000 <myfun>:
0: e52de004 push {lr} ; (str lr, [sp, #-4]!)
4: e24dd01c sub sp, sp, #28
8: e58d000c str r0, [sp, #12]
c: e58d1008 str r1, [sp, #8]
10: e58d2004 str r2, [sp, #4]
14: e59d000c ldr r0, [sp, #12]
18: e59d1008 ldr r1, [sp, #8]
1c: ebfffffe bl 0 <one>
20: e58d0014 str r0, [sp, #20]
24: e59d0014 ldr r0, [sp, #20]
28: e59d1004 ldr r1, [sp, #4]
2c: ebfffffe bl 0 <two>
30: e58d0010 str r0, [sp, #16]
34: e59d2014 ldr r2, [sp, #20]
38: e59d3010 ldr r3, [sp, #16]
3c: e0823003 add r3, r2, r3
40: e1a00003 mov r0, r3
44: e28dd01c add sp, sp, #28
48: e49de004 pop {lr} ; (ldr lr, [sp], #4)
4c: e12fff1e bx lr
optimizing saves a whole lot more though...

It's going to depend on the program and how exactly they want the location of the variables. Does the question want what code section they're stored in? .const .bss etc? Does it want specific addresses? Either way a good start is using objdump -S flag
objdump -S myprogram > dump.txt
This is nice because it will print out an intermixing of your source code and the assembly with addresses. From here just do a search for your int main and that should get you started.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight