How do I get GCC to not delete reads from volatile pointers?

How do I get GCC to not delete reads from volatile pointers? - c

I have the following code in my source file:
void *hardware = AllocateHardwareArea(SIZE);
volatile uint32_t *reader = (uint32_t *) hardware;
unsigned x;
for (x = 0; x < SIZE / sizeof(u32); ++x)
(void) *reader++;
ReleaseHardwareArea(hardware);
But when I compile this on ARMv6-targeted GCC 4.9.2 with -O3, the compiler is deleting the entire for loop from the assembly language output:
STMFD SP!, {R3,LR}
MOV R0, #0
MOV R1, #0x10000
BL AllocateHardwareArea
LDMFD SP!, {R3,LR}
B ReleaseHardwareArea
Isn't volatile supposed to be for hardware register situations such as this?

I cannot replicate your results using GCC-4.9.3 (gcc-arm-none-eabi-4.9.3.2015q2-1trusty1 from Terry Guo's PPA for Ubuntu 14.04.2 LTS on x86_64). Starting with file.c,
void *AllocateHardwareArea(const unsigned int);
void ReleaseHardwareArea(void *);
void test(const unsigned int size)
{
void *hardware = AllocateHardwareArea(size);
volatile unsigned int *reader = hardware;
unsigned int x;
for (x = 0; x < size / sizeof *reader; x++)
(void)*reader++;
ReleaseHardwareArea(hardware);
}
using arm-none-eabi-gcc-4.9.3 -march=armv6 -mtune=arm6 -O3 -S file.c compiles to the following assembly:
.arch armv6
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "file.c"
.text
.align 2
.global test
.type test, %function
test:
# args = 0, pretend = 0, frame = 0
# frame_needed = 0, uses_anonymous_args = 0
stmfd sp!, {r4, lr}
mov r4, r0
bl AllocateHardwareArea
movs r2, r4, lsr #2
beq .L2
mov r3, r0
add r2, r0, r2, asl #2
.L3:
ldr r1, [r3]
add r3, r3, #4
cmp r3, r2
bne .L3
.L2:
ldmfd sp!, {r4, lr}
b ReleaseHardwareArea
.size test, .-test
or, compiled to object code using arm-none-eabi-gcc-4.9.3 -march=armv6 -mtune=arm6 -O3 -c file.c, the disassembly using arm-none-eabi-objdump -d file.o is
file.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <test>:
0: e92d4010 push {r4, lr}
4: e1a04000 mov r4, r0
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1b02124 lsrs r2, r4, #2
10: 0a000005 beq 2c <test+0x2c>
14: e1a03000 mov r3, r0
18: e0802102 add r2, r0, r2, lsl #2
1c: e5931000 ldr r1, [r3]
20: e2833004 add r3, r3, #4
24: e1530002 cmp r3, r2
28: 1afffffb bne 1c <test+0x1c>
2c: e8bd4010 pop {r4, lr}
30: eafffffe b 0 <ReleaseHardwareArea>
The allocated area is read, as it should be, in unsigned int-sized units. In the assembly source, the read loop is between labels .L3 and .L2. In the object code, the read loop is at 1c..28.
Edited to add: Olaf pointed out in a comment, that OP might use a constant size. Let's examine that case, too:
void *AllocateHardwareArea(const unsigned int);
void ReleaseHardwareArea(void *);
#define SIZE 32
void test(void)
{
void *hardware = AllocateHardwareArea(SIZE);
volatile unsigned int *reader = hardware;
unsigned int x;
for (x = 0; x < SIZE / sizeof *reader; x++)
(void)*reader++;
ReleaseHardwareArea(hardware);
}
The assembly is
.arch armv6
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "file2.c"
.text
.align 2
.global test
.type test, %function
test:
# args = 0, pretend = 0, frame = 0
# frame_needed = 0, uses_anonymous_args = 0
stmfd sp!, {r3, lr}
mov r0, #32
bl AllocateHardwareArea
mov r3, r0
ldr r2, [r0]
ldr r2, [r0, #4]
ldr r2, [r0, #8]
ldr r2, [r0, #12]
ldr r2, [r0, #16]
ldr r2, [r0, #20]
ldr r2, [r0, #24]
ldr r3, [r3, #28]
ldmfd sp!, {r3, lr}
b ReleaseHardwareArea
.size test, .-test
.ident "GCC: (GNU Tools for ARM Embedded Processors) 4.9.3 20150529 (release) [ARM/embedded-4_9-branch revision 224288]"
and the disassembly of the object code
00000000 <test>:
0: e92d4008 push {r3, lr}
4: e3a00020 mov r0, #32
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1a03000 mov r3, r0
10: e5902000 ldr r2, [r0]
14: e5902004 ldr r2, [r0, #4]
18: e5902008 ldr r2, [r0, #8]
1c: e590200c ldr r2, [r0, #12]
20: e5902010 ldr r2, [r0, #16]
24: e5902014 ldr r2, [r0, #20]
28: e5902018 ldr r2, [r0, #24]
2c: e593301c ldr r3, [r3, #28]
30: e8bd4008 pop {r3, lr}
34: eafffffe b 0 <ReleaseHardwareArea>
i.e. the loop is simply unrolled. Of course, if SIZE is less than 4, then the loop is optimized away. Unrolling occurs for SIZE <= 71. For SIZE = 72, the object code is
00000000 <test>:
0: e92d4008 push {r3, lr}
4: e3a00048 mov r0, #72 ; 0x48
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1a03000 mov r3, r0
10: e2802048 add r2, r0, #72 ; 0x48
14: e5931000 ldr r1, [r3]
18: e2833004 add r3, r3, #4
1c: e1530002 cmp r3, r2
20: 1afffffb bne 14 <test+0x14>
24: e8bd4008 pop {r3, lr}
28: eafffffe b 0 <ReleaseHardwareArea>
Since you are compiling with extreme optimizations (-O3), I recommend rewriting your code snippet, sprinkling const liberally, instead of assuming the compiler detects const-ness automatically. For example, using the same commands as above, the following version
void *AllocateHardwareArea(const unsigned int);
void ReleaseHardwareArea(void *);
void test(const unsigned int size)
{
void *const hardware = AllocateHardwareArea(size);
volatile unsigned int *const reader = hardware;
const unsigned int n = size / sizeof *reader;
unsigned int i;
for (i = 0; i < n; i++)
reader[i];
ReleaseHardwareArea(hardware);
}
performs the exact same task, but with one fewer instruction within the inner loop. The assembly is
.arch armv6
.fpu softvfp
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 1
.eabi_attribute 30, 2
.eabi_attribute 34, 1
.eabi_attribute 18, 4
.file "new.c"
.text
.align 2
.global test
.type test, %function
test:
# args = 0, pretend = 0, frame = 0
# frame_needed = 0, uses_anonymous_args = 0
stmfd sp!, {r4, lr}
mov r4, r0
bl AllocateHardwareArea
movs r2, r4, lsr #2
beq .L2
mov r3, r0
add r2, r0, r2, asl #2
.L3:
ldr r1, [r3], #4
cmp r3, r2
bne .L3
.L2:
ldmfd sp!, {r4, lr}
b ReleaseHardwareArea
.size test, .-test
.ident "GCC: (GNU Tools for ARM Embedded Processors) 4.9.3 20150529 (release) [ARM/embedded-4_9-branch revision 224288]"
and the object code
Disassembly of section .text:
00000000 <test>:
0: e92d4010 push {r4, lr}
4: e1a04000 mov r4, r0
8: ebfffffe bl 0 <AllocateHardwareArea>
c: e1b02124 lsrs r2, r4, #2
10: 0a000004 beq 28 <test+0x28>
14: e1a03000 mov r3, r0
18: e0802102 add r2, r0, r2, lsl #2
1c: e4931004 ldr r1, [r3], #4
20: e1530002 cmp r3, r2
24: 1afffffc bne 1c <test+0x1c>
28: e8bd4010 pop {r4, lr}
2c: eafffffe b 0 <ReleaseHardwareArea>
Perhaps you could test if your GCC compiles this latter version correctly? If not, we have a compiler bug at hand (assuming SIZE is at least 4), possibly/likely already fixed in later versions.

Related

How can I generate following arm assembler output using ARM gcc 7.3?

myfunction:
# Function supports interworking.
# args = 0, pretend = 0, frame = 0
# frame_needed = 0, uses_anonymous_args = 0
# link register save eliminated.
mul r3, r0, r0
mov r0, r3
mla r0, r1, r0, r2
bx lr
I am able to generate everything except for the mov instruction using following C function.
int myfunction(int r0, int r1, int r2, int r3)
{
r3 = r0*r0;
r0 = r3;
r3 = r0;
return (r1*r3)+r2;
}
How can I instruct r3 to be set to the address of r0 in assembly code?

unsigned int myfunction(unsigned int a, unsigned int b, unsigned int c)
{
return (a*a*b)+c;
}
Your choices are going to be something like this
00000000 <myfunction>:
0: e52db004 push {r11} ; (str r11, [sp, #-4]!)
4: e28db000 add r11, sp, #0
8: e24dd014 sub sp, sp, #20
c: e50b0008 str r0, [r11, #-8]
10: e50b100c str r1, [r11, #-12]
14: e50b2010 str r2, [r11, #-16]
18: e51b3008 ldr r3, [r11, #-8]
1c: e51b2008 ldr r2, [r11, #-8]
20: e0010392 mul r1, r2, r3
24: e51b200c ldr r2, [r11, #-12]
28: e0000291 mul r0, r1, r2
2c: e51b3010 ldr r3, [r11, #-16]
30: e0803003 add r3, r0, r3
34: e1a00003 mov r0, r3
38: e28bd000 add sp, r11, #0
3c: e49db004 pop {r11} ; (ldr r11, [sp], #4)
40: e12fff1e bx lr
or this
00000000 <myfunction>:
0: e0030090 mul r3, r0, r0
4: e0202391 mla r0, r1, r3, r2
8: e12fff1e bx lr
as you have probably figured out.
The mov should never be considered by the compiler backend as it just wastes an instruction. r3 goes into the mla no need to put it in r0 then do the mla. Not quite sure how to get the compiler to do more. Even this doesn't encourage it
unsigned int fun ( unsigned int a )
{
return(a*a);
}
unsigned int myfunction(unsigned int a, unsigned int b, unsigned int c)
{
return (fun(a)*b)+c;
}
giving
00000000 <fun>:
0: e1a03000 mov r3, r0
4: e0000093 mul r0, r3, r0
8: e12fff1e bx lr
0000000c <myfunction>:
c: e0030090 mul r3, r0, r0
10: e0202391 mla r0, r1, r3, r2
14: e12fff1e bx lr
Basically if you don't optimize you get nowhere near what you were after. If you optimize that mov shouldn't be there, should be easy to optimize out.
While some level of manipulation of writing high level code to encourage the compiler to output low level code is possible, trying to get this exact output is not something you should expect to be able to do.
Unless you use inline asm
asm
(
"mul r3, r0, r0\n"
"mov r0, r3\n"
"mla r0, r1, r0, r2\n"
"bx lr\n"
);
giving your result
Disassembly of section .text:
00000000 <.text>:
0: e0030090 mul r3, r0, r0
4: e1a00003 mov r0, r3
8: e0202091 mla r0, r1, r0, r2
c: e12fff1e bx lr
or real asm
mul r3, r0, r0
mov r0, r3
mla r0, r1, r0, r2
bx lr
and feed it into gcc rather than as (arm-whatever-gcc so.s -o so.o)
Disassembly of section .text:
00000000 <.text>:
0: e0030090 mul r3, r0, r0
4: e1a00003 mov r0, r3
8: e0202091 mla r0, r1, r0, r2
c: e12fff1e bx lr
so that technically you were using gcc on the command line but gcc does some preprocessing and then feeds it to as.
Unless you find a core or where Rd and Rs have to be the same register and can then specify that core/bug/whatever on the gcc command line, I don't see the mov happening, maybe, just maybe, with clang/llvm compile fun and myfunction separately to bytecode then combine them then optimize then output to the target then examine that. I would hope either in the optimization or the output that the mov would be optimized out but you might get lucky.
Edit
I made an error:
unsigned int myfunction(unsigned int a, unsigned int b, unsigned int c)
{
return (a*a*b)+c;
}
arm-linux-gnueabi-gcc --version
arm-linux-gnueabi-gcc (Ubuntu/Linaro 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
Disassembly of section .text:
00000000 <myfunction>:
0: e0030090 mul r3, r0, r0
4: e1a00003 mov r0, r3
8: e0202091 mla r0, r1, r0, r2
c: e12fff1e bx lr
but this
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 8.2.0
Copyright (C) 2018 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
arm-none-eabi-gcc -O2 -c so.c -o so.o
arm-none-eabi-objdump -D so.o
so.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <myfunction>:
0: e0030090 mul r3, r0, r0
4: e0202391 mla r0, r1, r3, r2
8: e12fff1e bx lr
I'll have to build a 7.3 or go find one. Somewhere between 5.x.x and 8.x.x the backend changed or...
Note you may need -mcpu=arm7tdmi or -mcpu=arm9tdmi or -march=armv4t or -march=armv5t on the command line depending on the default target (cpu/arch) built into your compiler. Or you might get something like this
Disassembly of section .text:
00000000 <myfunction>:
0: fb00 f000 mul.w r0, r0, r0
4: fb01 2000 mla r0, r1, r0, r2
8: 4770 bx lr
a: bf00 nop
this
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 7.3.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
produces
Disassembly of section .text:
00000000 <myfunction>:
0: e0030090 mul r3, r0, r0
4: e0202391 mla r0, r1, r3, r2
8: e12fff1e bx lr
So you may have to work backward to find the version where it changed, the source code change to gcc that caused it and modify 7.3.0 making something that is not really 7.3.0 but reports as 7.3.0 and outputs your desired code.

How to make bare metal ARM programs and run them on QEMU?

I am trying to get this tutorial to work as intended without success (Something fails after the bl main instruction).
According to the tutorial the command
(qemu) xp /1dw 0xa0000018
should result in the print 33 (But i get 0x00 instead)
a0000018: 33
This is the content of the registers after the main call (see startup.s)
(qemu) info registers
R00=a000001c R01=a000001c R02=00000006 R03=00000000
R04=00000000 R05=00000005 R06=00000006 R07=00000007
R08=00000008 R09=00000009 R10=00000000 R11=a3fffffc
R12=00000000 R13=00000000 R14=0000003c R15=00000004
PSR=800001db N--- A und32
FPSCR: 00000000
I have the following files
main.c
startup.s
lscript.ld
Makefile
And I am using the following toolchain
arm-2013.11-24-arm-none-eabi-i686-pc-linux-gnu
Makefile:
SRCS := main.c startup.s
LINKER_NAME := lscript.ld
ELF_NAME := program.elf
BIN_NAME := program.bin
FLASH_NAME := flash.bin
CC := arm-none-eabi
CFLAGS := -nostdlib
OBJFLAGS ?= -DS
QEMUFLAGS := -M connex -pflash $(FLASH_NAME) -nographic -serial /dev/null
# Allocate 16MB to use as a virtual flash for th qemu
# bs = blocksize -> 4KB
# count = number of block -> 4096
# totalsize = 16MB
setup:
dd if=/dev/zero of=$(FLASH_NAME) bs=4096 count=4096
# Compile srcs and write to virtual flash
all: clean setup
$(CC)-gcc $(CFLAGS) -o $(ELF_NAME) -T $(LINKER_NAME) $(SRCS)
$(CC)-objcopy -O binary $(ELF_NAME) $(BIN_NAME)
dd if=$(BIN_NAME) of=$(FLASH_NAME) bs=4096 conv=notrunc
objdump:
$(CC)-objdump $(OBJFLAGS) $(ELF_NAME)
mem-placement:
$(CC)-nm -n $(ELF_NAME)
qemu:
qemu-system-arm $(QEMUFLAGS)
clean:
rm -rf *.bin
rm -rf *.elf
main.c:
static int arr[] = { 1, 10, 4, 5, 6, 7 };
static int sum;
static const int n = sizeof(arr) / sizeof(arr[0]);
int main()
{
int i;
for (i = 0; i < n; i++){
sum += arr[i];
}
return 0;
}
startup.s:
.section "vectors"
reset: b _start
undef: b undef
swi: b swi
pabt: b pabt
dabt: b dabt
nop
irq: b irq
fiq: b fiq
.text
_start:
init:
## Copy data to RAM.
ldr r0, =flash_sdata
ldr r1, =ram_sdata
ldr r2, =data_size
## Handle data_size == 0
cmp r2, #0
beq init_bss
copy:
ldrb r4, [r0], #1
strb r4, [r1], #1
subs r2, r2, #1
bne copy
init_bss:
## Initialize .bss
ldr r0, =sbss
ldr r1, =ebss
ldr r2, =bss_size
## Handle bss_size == 0
cmp r2, #0
beq init_stack
mov r4, #0
zero:
strb r4, [r0], #1
subs r2, r2, #1
bne zero
init_stack:
## Initialize the stack pointer
ldr sp, =0xA4000000
## **this call dosent work as expected.. (r13/sp contains 0xA4000000)**
bl main
## Dosent return from main
## r0 should now contain 33
stop:
b stop
lscript.ld:
/*
* Linker for testing purposes
* (using 16 MB virtual flash = 0x0100_0000)
*/
MEMORY {
rom (rx) : ORIGIN = 0x00000000, LENGTH = 0x01000000
ram (rwx) : ORIGIN = 0xA0000000, LENGTH = 0x04000000
}
SECTIONS {
.text : {
* (vectors);
* (.text);
} > rom
.rodata : {
* (.rodata);
} > rom
flash_sdata = .;
ram_sdata = ORIGIN(ram);
.data : AT (flash_sdata) {
* (.data);
} > ram
ram_edata = .;
data_size = ram_edata - ram_sdata;
sbss = .;
.bss : {
* (.bss);
} > ram
ebss = .;
bss_size = ebss - sbss;
/DISCARD/ : {
*(.note*)
*(.comment)
*(.ARM*)
/*
*(.debug*)
*/
}
}
Disassembly of the executable (objdump):
program.elf: file format elf32-littlearm
Disassembly of section .text:
00000000 <reset>:
0: ea000023 b 94 <_start>
00000004 <undef>:
4: eafffffe b 4 <undef>
00000008 <swi>:
8: eafffffe b 8 <swi>
0000000c <pabt>:
c: eafffffe b c <pabt>
00000010 <dabt>:
10: eafffffe b 10 <dabt>
14: e320f000 nop {0}
00000018 <irq>:
18: eafffffe b 18 <irq>
0000001c <fiq>:
1c: eafffffe b 1c <fiq>
00000020 <main>:
20: e52db004 push {fp} ; (str fp, [sp, #-4]!)
24: e28db000 add fp, sp, #0
28: e24dd00c sub sp, sp, #12
2c: e3a03000 mov r3, #0
30: e50b3008 str r3, [fp, #-8]
34: ea00000d b 70 <main+0x50>
38: e3003000 movw r3, #0
3c: e34a3000 movt r3, #40960 ; 0xa000
40: e51b2008 ldr r2, [fp, #-8]
44: e7932102 ldr r2, [r3, r2, lsl #2]
48: e3003018 movw r3, #24
4c: e34a3000 movt r3, #40960 ; 0xa000
50: e5933000 ldr r3, [r3]
54: e0822003 add r2, r2, r3
58: e3003018 movw r3, #24
5c: e34a3000 movt r3, #40960 ; 0xa000
60: e5832000 str r2, [r3]
64: e51b3008 ldr r3, [fp, #-8]
68: e2833001 add r3, r3, #1
6c: e50b3008 str r3, [fp, #-8]
70: e3a02006 mov r2, #6
74: e51b3008 ldr r3, [fp, #-8]
78: e1530002 cmp r3, r2
7c: baffffed blt 38 <main+0x18>
80: e3a03000 mov r3, #0
84: e1a00003 mov r0, r3
88: e24bd000 sub sp, fp, #0
8c: e49db004 pop {fp} ; (ldr fp, [sp], #4)
90: e12fff1e bx lr
00000094 <_start>:
94: e59f004c ldr r0, [pc, #76] ; e8 <stop+0x4>
98: e59f104c ldr r1, [pc, #76] ; ec <stop+0x8>
9c: e59f204c ldr r2, [pc, #76] ; f0 <stop+0xc>
a0: e3520000 cmp r2, #0
a4: 0a000003 beq b8 <init_bss>
000000a8 <copy>:
a8: e4d04001 ldrb r4, [r0], #1
ac: e4c14001 strb r4, [r1], #1
b0: e2522001 subs r2, r2, #1
b4: 1afffffb bne a8 <copy>
000000b8 <init_bss>:
b8: e59f0034 ldr r0, [pc, #52] ; f4 <stop+0x10>
bc: e59f1034 ldr r1, [pc, #52] ; f8 <stop+0x14>
c0: e59f2034 ldr r2, [pc, #52] ; fc <stop+0x18>
c4: e3520000 cmp r2, #0
c8: 0a000003 beq dc <init_stack>
cc: e3a04000 mov r4, #0
000000d0 <zero>:
d0: e4c04001 strb r4, [r0], #1
d4: e2522001 subs r2, r2, #1
d8: 1afffffc bne d0 <zero>
000000dc <init_stack>:
dc: e3a0d329 mov sp, #-1543503872 ; 0xa4000000
e0: ebffffce bl 20 <main>
000000e4 <stop>:
e4: eafffffe b e4 <stop>
e8: 00000104 andeq r0, r0, r4, lsl #2
ec: a0000000 andge r0, r0, r0
f0: 00000018 andeq r0, r0, r8, lsl r0
f4: a0000018 andge r0, r0, r8, lsl r0
f8: a000001c andge r0, r0, ip, lsl r0
fc: 00000004 andeq r0, r0, r4
Disassembly of section .rodata:
00000100 <n>:
100: 00000006 andeq r0, r0, r6
Disassembly of section .data:
a0000000 <arr>:
a0000000: 00000001 andeq r0, r0, r1
a0000004: 0000000a andeq r0, r0, sl
a0000008: 00000004 andeq r0, r0, r4
a000000c: 00000005 andeq r0, r0, r5
a0000010: 00000006 andeq r0, r0, r6
a0000014: 00000007 andeq r0, r0, r7
Disassembly of section .bss:
a0000018 <sum>:
a0000018: 00000000 andeq r0, r0, r0
Can someone point me in the right direction to why this isn't working according to my expectations?
Thanks Henrik

Minimal examples that just work
https://github.com/cirosantilli/linux-kernel-module-cheat/tree/54e15e04338c0fecc0be139a0da2d0d972c21419#baremetal-setup-getting-started
The prompt.c example takes input from your host terminal and gives back output all through the simulated UART:
enter a character
got: a
new alloc of 1 bytes at address 0x0x4000a1c0
enter a character
got: b
new alloc of 2 bytes at address 0x0x4000a1c0
enter a character
It uses Newlib to expose a subset of the C standard library. This allows you to run existing programs written in C if the only use that restricted subset of the C standard library.
More details about Newlib at: https://electronics.stackexchange.com/questions/223929/c-standard-libraries-on-bare-metal/400077#400077
https://github.com/freedomtan/aarch64-bare-metal-qemu/tree/2ae937a2b106b43bfca49eec49359b3e30eac1b1 for -M virt, just the hello world on the repo. Compile with:
sudo apt-get install gcc-aarch64-linux-gnu
make CROSS_PREFIX=aarch64-linux-gnu-
Here is the example minimized to printing a single character from assembly: How to run a bare metal ELF file on QEMU?
https://github.com/bztsrc/raspi3-tutorial for -M raspi3. Quick getting started at: https://raspberrypi.stackexchange.com/questions/34733/how-to-do-qemu-emulation-for-bare-metal-raspberry-pi-images/85135#85135 Several other examples on the repo going to more advanced subjects.
Also does display output on 09_framebuffer.
Both write a hello world to the UART.
Tested in Ubuntu 18.04, gcc-aarch64-linux-gnu version 4:7.3.0-3ubuntu2.

Debugging!
First, look at the PC and PSR: You're in Undef mode, in the undefined instruction handler.
OK, in an exception mode, the LR tells you where you took the exception. There are some slightly complicated rules between the PC offset and the preferred return address determining exactly what it points at, but just eyeballing it it's clearly in the vicinity of the movw/movt pair.
The movw instruction effectively only exists in the ARMv7 ISA onwards. A brief investigation tells me the machine you're emulating is some old PXA255 thing, whose CPU only implements the ARMv5 ISA. Thus it's not surprising it faults on an instruction that it predates by many years.
Your compiler is apparently configured to target ARMv7 by default (which is not uncommon), so you need to add at least -march=armv5te to your CFLAGS to target the appropriate architecture version. The 'advanced' challenge would be to switch to a different, newer, machine, but that's going to involve adapting the linker script to a new memory map and rewriting any hardware-touching code for new peripherals, so I'd save that idea for the longer term, once you're comfortable with the basics of bare-metal code and slogging through hardware reference manuals.

for the same code on my ubuntu i got
arm-none-eabi-gcc -nostdlib -o sum.elf sum.lds startup.s -w
/usr/lib/gcc/arm-none-eabi/4.9.3/../../../arm-none-eabi/bin/ld: warning: cannot find entry symbol _start; defaulting to 00000000
/tmp/ccBthV7t.o: In function init_stack':
(.text+0x4c): undefined reference tomain'
collect2: error: ld returned 1 exit status

inserting assembly code for arm into a C function, how do I insert extern variables?

I have a C-file f1.c with several functions inside. I need to write one of those functions in assembly code for armv4 and then put it back inside f1.c.
So I extracted the function I wanted in assembly code into another file (called test1.c) and compiled it with:
arm-linux-gnueabi-gcc -S -march=armv4 test1.c
In test1.c I have:
extern long int li1, li2, li3;
int main()
{
iowrite32(li1, li2);
iowrite32(li3, li1);
return 0;
}
After that I get the following code test1.s:
.arch armv4
.eabi_attribute 27, 3
.fpu vfpv3-d16
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 6
.eabi_attribute 34, 0
.eabi_attribute 18, 4
.file "test1.c"
.text
.align 2
.global main
.type main, %function
main:
# Function supports interworking.
# args = 0, pretend = 0, frame = 0
# frame_needed = 1, uses_anonymous_args = 0
stmfd sp!, {fp, lr}
add fp, sp, #4
ldr r3, .L2
ldr r2, [r3, #0]
ldr r3, .L2+4
ldr r3, [r3, #0]
mov r0, r2
mov r1, r3
bl iowrite32
ldr r3, .L2+8
ldr r2, [r3, #0]
ldr r3, .L2
ldr r3, [r3, #0]
mov r0, r2
mov r1, r3
bl iowrite32
mov r3, #0
mov r0, r3
sub sp, fp, #4
ldmfd sp!, {fp, lr}
bx lr
.L3:
.align 2
.L2:
.word li1
.word li2
.word li3
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",%progbits
The problem is that I need to put that assembly code (t1.s) back to file f1.C, so I use:
asm volatile( "stmfd sp!, {fp, lr}\n\t"
"add fp, sp, #4\n\t"
[...]);
BUT I don't know how I can put back instructions such as:
ldr r3, .L2
Because they refer to the extern variables, which are under the label L2 and f1.c won't compile if I try
"ldr r3, .L2\n\t"
Could anyone tell me how I can do it?
Thank you.

In the end, I'm doing it this way:
Instead of:
"ldr r3, .L2+4\n\t"
I'm saving the direction of the variable in another register:
"ldr r4, =li2\n\t"
And then I load it in the first one:
"ldr r3, [r4]\n\t"
I don't know if it is the correct answer, but it compiles and works as expected

This assembly code makes nothing

I'm working in a C file, which is something like this:
#define v2 0x560000a0
int main(void)
{
long int v1;
v1 = ioread32(v2);
return 0;
}
and I've extracted this part to write it in assembly:
int main ()
{
extern v1,v2;
v1=ioread32(v2);
return 0;
}
I'm trying to write the value of v2 in v1 using assembly code for armv4. Using
arm-linux-gnueabi-gcc -S -march=armv4 assembly_file.c
I get this code:
.arch armv4
.eabi_attribute 27, 3
.fpu vfpv3-d16
.eabi_attribute 20, 1
.eabi_attribute 21, 1
.eabi_attribute 23, 3
.eabi_attribute 24, 1
.eabi_attribute 25, 1
.eabi_attribute 26, 2
.eabi_attribute 30, 6
.eabi_attribute 34, 0
.eabi_attribute 18, 4
.file "txind-rsi.c"
.text
.align 2
.global main
.type main, %function
main:
# Function supports interworking.
# args = 0, pretend = 0, frame = 0
# frame_needed = 1, uses_anonymous_args = 0
stmfd sp!, {fp, lr}
add fp, sp, #4
ldr r3, .L2
ldr r3, [r3, #0]
mov r0, r3
bl ioread32
mov r2, r0
ldr r3, .L2+4
str r2, [r3, #0]
mov r3, #0
mov r0, r3
sub sp, fp, #4
ldmfd sp!, {fp, lr}
bx lr
.L3:
.align 2
.L2:
.word v1
.word v2
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",%progbits
I use that code to put it back inside the C file this way:
asm volatile(
"stmfd sp!, {fp, lr}\n"
"add fp, sp, #4\n"
"ldr r3, =v1\n"
"ldr r3, [r3, #0]\n"
"mov r0, r2\n"
"ldr r3, =v2\n"
"str r2, [r3, #0]\n"
"sub sp, fp, #4\n"
"ldmfd sp!, {fp, lr}\n"
"bx lr"
);
The code doesn't do anything.
In fact, it stops the target working.
Does anyones know why?
EDITED:
After reading your answers I have another question:
How would I put a constant value in a register?. The code in C would be this:
#define v2 0x560000a0
int main(void)
{
long int value = 0x0000ffff;
long int v1;
v1 = ioread32(v2);
iowrite32(v2,value);
return 0;
}
I've tried this:
asm volatile("mov r3, #value");
and I get an assembler message: "value symbol is in a different section";
I've also tried
asm volatile("mov r3, #0x0000ffff);
and the assembler message is:
"invalid constant(ffff) after fixup". And after reading this: Invalid constant after fixup?
I don't know how I can put that value into r3, as it seems I can't do it with mov.I'm using armv4, not armv7. And this solution proposed in the link, doesn't work for me:
asm volatile("ldr r3, =#0000ffff\n");

There are no problems with your code, bx lr is the proper way to terminate main, no issue there. Your code is most likely crashing due to the address you are accessing, it is probably not an address you are allowed to access...
#define v2 0x560000a0
int main(void)
{
long int v1;
v1 = ioread32(v2);
return 0;
}
if you optimize on the C compile step you can see a cleaner, simpler version
00000000 <main>:
0: e92d4008 push {r3, lr}
4: e59f0008 ldr r0, [pc, #8] ; 14 <main+0x14>
8: ebfffffe bl 0 <ioread32>
c: e3a00000 mov r0, #0
10: e8bd8008 pop {r3, pc}
14: 560000a0 strpl r0, [r0], -r0, lsr #1
which is not hard to implement in asm.
.globl main
main:
push {r3,lr}
ldr r0,=0x560000A0
bl ioread32
mov r0,#0
pop {r3,pc}
assembled and disassembled
00000000 <main>:
0: e92d4008 push {r3, lr}
4: e59f0008 ldr r0, [pc, #8] ; 14 <main+0x14>
8: ebfffffe bl 0 <ioread32>
c: e3a00000 mov r0, #0
10: e8bd8008 pop {r3, pc}
14: 560000a0 strpl r0, [r0], -r0, lsr #1
you had simplified the code to the point that v1 becomes dead code, but the function call cannot be optimized out, so the return is discarded.
if you dont use main but create a separate function that returns
#define v2 0x560000a0
long int fun(void)
{
long int v1;
v1 = ioread32(v2);
return v1;
}
...damn...tail optimization:
00000000 <fun>:
0: e59f0000 ldr r0, [pc] ; 8 <fun+0x8>
4: eafffffe b 0 <ioread32>
8: 560000a0 strpl r0, [r0], -r0, lsr #1
Oh well. You are on the right path, see what the C compiler generates then mimic or modify that. I suspect it is your ioread that is the problem and not the outer structure (why are you doing a 64 bit thing with a 32 bit read, maybe that is the problem long it is most likely going to be implemented as 64 bit).

The bx lr at the end would try to return from the current function. You probably don't want that. I guess that is also the reason for the crash, given that you don't let the compiler undo whatever setup it has done in the function prologue.
Also, in your asm block you don't use any locals so you don't need to adjust the stack pointer, and you don't need a new stack frame either. The mov r0, r2 is also broken because you have loaded the value into r3. You can delete the mov if you just load directly into whatever register you want. Furthermore, in gcc inline asm you should tell the compiler what registers you modify. Not doing so can also lead to a crash because you might overwrite values the compiler relies upon.
Not sure what's the point in doing this in assembly. If you insist on that, the following should work somewhat better:
asm volatile(
"ldr r3, =v1\n"
"ldr r2, [r3, #0]\n"
"ldr r3, =v2\n"
"str r2, [r3, #0]\n" ::: "r2", "r3"
);

arm-none-eabi-gcc C Pointers

So working with C in the arm-none-eabi-gcc. I have been having an issue with pointers, they don't seem to exists. Perhaps I'm passing the wrong cmds to the compiler.
Here is an example.
unsigned int * gpuPointer = GetGPU_Pointer(framebufferAddress);
unsigned int color = 16;
int y = 768;
int x = 1024;
while(y >= 0)
{
while(x >= 0)
{
*gpuPointer = color;
color = color + 2;
x--;
}
color++;
y--;
x = 1024;
}
and the output from the disassembler.
81c8: ebffffc3 bl 80dc <GetGPU_Pointer>
81cc: e3a0c010 mov ip, #16 ; 0x10
81d0: e28c3b02 add r3, ip, #2048 ; 0x800
81d4: e2833002 add r3, r3, #2 ; 0x2
81d8: e1a03803 lsl r3, r3, #16
81dc: e1a01823 lsr r1, r3, #16
81e0: e1a0300c mov r3, ip
81e4: e1a02003 mov r2, r3
81e8: e2833002 add r3, r3, #2 ; 0x2
81ec: e1a03803 lsl r3, r3, #16
81f0: e1a03823 lsr r3, r3, #16
81f4: e1530001 cmp r3, r1
81f8: 1afffff9 bne 81e4 <setup_framebuffer+0x5c>
Shouldn't there be a str cmd around 81e4? To add further the GetGPU_Pointer is coming from an assembler file but there is a declaration as so.
extern unsigned int * GetGPU_Pointer(unsigned int framebufferAddress);
My gut feeling is its something absurdly simple but I'm missing it.

You never change the value of gpuPointer and you haven't declared it to point to a volatile. So from the compiler's perspective you are overwriting a single memory location (*gpuPointer) 768*1024 times, but since you never use the value you are writing into it, the compiler is entitled to optimize by doing a single write at the end of the loop.

Adding to rici's answer (upvote rici not me)...
It gets even better, taking what you offered and wrapping it
extern unsigned int * GetGPU_Pointer ( unsigned int );
void fun ( unsigned int framebufferAddress )
{
unsigned int * gpuPointer = GetGPU_Pointer(framebufferAddress);
unsigned int color = 16;
int y = 768;
int x = 1024;
while(y >= 0)
{
while(x >= 0)
{
*gpuPointer = color;
color = color + 2;
x--;
}
color++;
y--;
x = 1024;
}
}
Optimizes to
00000000 <fun>:
0: e92d4008 push {r3, lr}
4: ebfffffe bl 0 <GetGPU_Pointer>
8: e59f3008 ldr r3, [pc, #8] ; 18 <fun+0x18>
c: e5803000 str r3, [r0]
10: e8bd4008 pop {r3, lr}
14: e12fff1e bx lr
18: 00181110 andseq r1, r8, r0, lsl r1
because the code really doesnt do anything but that one store.
Now if you were to modify the pointer
while(x >= 0)
{
*gpuPointer = color;
gpuPointer++;
color = color + 2;
x--;
}
then you get the store you were looking for
00000000 <fun>:
0: e92d4010 push {r4, lr}
4: ebfffffe bl 0 <GetGPU_Pointer>
8: e59f403c ldr r4, [pc, #60] ; 4c <fun+0x4c>
c: e1a02000 mov r2, r0
10: e3a0c010 mov ip, #16
14: e2820a01 add r0, r2, #4096 ; 0x1000
18: e2801004 add r1, r0, #4
1c: e1a0300c mov r3, ip
20: e4823004 str r3, [r2], #4
24: e1520001 cmp r2, r1
28: e2833002 add r3, r3, #2
2c: 1afffffb bne 20 <fun+0x20>
30: e28ccb02 add ip, ip, #2048 ; 0x800
34: e28cc003 add ip, ip, #3
38: e15c0004 cmp ip, r4
3c: e2802004 add r2, r0, #4
40: 1afffff3 bne 14 <fun+0x14>
44: e8bd4010 pop {r4, lr}
48: e12fff1e bx lr
4c: 00181113 andseq r1, r8, r3, lsl r1
or if you make it volatile (and then dont have to modify it)
volatile unsigned int * gpuPointer = GetGPU_Pointer(framebufferAddress);
then
00000000 <fun>:
0: e92d4008 push {r3, lr}
4: ebfffffe bl 0 <GetGPU_Pointer>
8: e59fc02c ldr ip, [pc, #44] ; 3c <fun+0x3c>
c: e3a03010 mov r3, #16
10: e2831b02 add r1, r3, #2048 ; 0x800
14: e2812002 add r2, r1, #2
18: e5803000 str r3, [r0]
1c: e2833002 add r3, r3, #2
20: e1530002 cmp r3, r2
24: 1afffffb bne 18 <fun+0x18>
28: e2813003 add r3, r1, #3
2c: e153000c cmp r3, ip
30: 1afffff6 bne 10 <fun+0x10>
34: e8bd4008 pop {r3, lr}
38: e12fff1e bx lr
3c: 00181113 andseq r1, r8, r3, lsl r1
then you get your store
arm-none-eabi-gcc -O2 -c a.c -o a.o
arm-none-eabi-objdump -D a.o
arm-none-eabi-gcc (GCC) 4.8.2
Copyright (C) 2013 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
The problem is, as written, you didnt tell the compiler to update the pointer more than the one time. So as in my first example it has no reason to even implement the loop, it can pre-compute the answer and write it one time. In order to force the compiler to implement the loop and write to the pointer more than one time, you either need to make it volatile and/or modify it, depends on what you were really needing to do.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight