I want to compile a program with arm-none-eabi-gcc 9.2.1 using the libopencm3 project and run it on ARM Cortex-M4 processors. My program is composed of two files: main.c
#include "../common/stm32wrapper.h"
#include "test.h"
#include <stdio.h>
#include <string.h>
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
/*
 * Benchmark entry point.
 *
 * Runs the clock/GPIO/USART/flash setup helpers, switches on the DWT cycle
 * counter, times a single call to test() and sends the elapsed cycle count
 * together with the returned value over the USART as text.
 */
int main(void)
{
    clock_setup();
    gpio_setup();
    usart_setup(115200);
    flash_setup();

    /* Set TRCENA in DEMCR, zero the cycle counter, then set CYCCNTENA. */
    SCS_DEMCR |= SCS_DEMCR_TRCENA;
    DWT_CYCCNT = 0;
    DWT_CTRL |= DWT_CTRL_CYCCNTENA;

    u32 oldcount, newcount;
    u32 a = 0x75;
    u32 b = 0x14;
    /* Worst case "cycles: 4294967295, ffffffff" is 28 characters plus NUL. */
    char buffer[36];

    oldcount = DWT_CYCCNT;
    u32 c = test(a,b);
    /* Unsigned arithmetic: the difference stays valid across one wrap-around. */
    newcount = DWT_CYCCNT-oldcount;

    /* newcount is unsigned, so it must be printed with %u rather than %d. */
    sprintf(buffer, "cycles: %u, %08x", newcount, c);
    send_USART_str(buffer);
    return 0;
}
and test.c.
/*
 * Benchmark kernel.
 *
 * Recomputes c on each of 4096 iterations from a, b and the loop index and
 * returns the value produced by the last iteration. No iteration reads the
 * c written by an earlier one, so only the final pass determines the result.
 */
uint32_t test(uint32_t a, uint32_t b) {
uint32_t tmp0, tmp1;
uint32_t c;
for(int i = 0; i< 4096; i++) {
/* Low byte of each operand. */
tmp0 = a & 0xff;
tmp1 = b & 0xff;
/* Shift amounts depend on i: i/512 spans 0..7 and i/1024 spans 0..3. */
c = tmp0 ^ tmp1 ^ (a>>(i/512)) ^ (b >> (i/1024));
}
return c;
}
To compile my program, I use the following makefile:
.PHONY: all clean

# Toolchain. -v makes gcc print each sub-command (cc1, as, collect2) it runs.
PREFIX ?= arm-none-eabi
CC = $(PREFIX)-gcc -v
LD = $(PREFIX)-gcc -v
OBJCOPY = $(PREFIX)-objcopy
OBJDUMP = $(PREFIX)-objdump
GDB = $(PREFIX)-gdb

OPENCM3DIR = ../libopencm3
ARMNONEEABIDIR = /usr/arm-none-eabi
COMMONDIR = ../common

all: test_m4.bin

# Target-specific settings for the Cortex-M4 build.
test_m4.%: ARCH_FLAGS = -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16
test_m4.o: CFLAGS += -DSTM32F4
$(COMMONDIR)/stm32f4_wrapper.o: CFLAGS += -DSTM32F4
test_m4.elf: LDSCRIPT = $(COMMONDIR)/stm32f4-discovery.ld
test_m4.elf: LDFLAGS += -L$(OPENCM3DIR)/lib/ -lopencm3_stm32f4
test_m4.elf: OBJS += $(COMMONDIR)/stm32f4_wrapper.o
test_m4.elf: $(COMMONDIR)/stm32f4_wrapper.o $(OPENCM3DIR)/lib/libopencm3_stm32f4.a

CFLAGS += -O3 \
    -Wall -Wextra -Wimplicit-function-declaration \
    -Wredundant-decls -Wmissing-prototypes -Wstrict-prototypes \
    -Wundef -Wshadow \
    -I$(ARMNONEEABIDIR)/include -I$(OPENCM3DIR)/include \
    -fno-common $(ARCH_FLAGS) -MD \
    -ftime-report

LDFLAGS += --static -Wl,--start-group -lc -lgcc -lnosys -Wl,--end-group \
    -T$(LDSCRIPT) -nostartfiles -Wl,--gc-sections,--no-print-gc-sections \
    $(ARCH_FLAGS)

# NOTE(review): test.c is a C source listed among the link inputs, so it is
# compiled by the $(LD) command of the %.elf rule, which receives LDFLAGS but
# not CFLAGS; the -O level set in CFLAGS therefore never reaches test.c.
OBJS += test.c

# In the recipes below: $@ is the target, $< the first prerequisite and
# $^ the full prerequisite list.
%.bin: %.elf
	$(OBJCOPY) -Obinary $^ $@

%.elf: %.o $(OBJS) $(LDSCRIPT)
	$(LD) -o $@ $< $(OBJS) $(LDFLAGS)

# test_m4.o is built from main.c.
test%.o: main.c
	$(CC) $(CFLAGS) -o $@ -c $^

%.o: %.c
	$(CC) $(CFLAGS) -o $@ -c $^

clean:
	rm -f *.o *.d *.elf *.bin
I can compile and run my code using this makefile. By running make I get the following output:
arm-none-eabi-gcc -v -O3 -Wall -Wextra -Wimplicit-function-declaration -Wredundant-decls -Wmissing-prototypes -Wstrict-prototypes -Wundef -Wshadow -I/usr/arm-none-eabi/include -I../libopencm3/include -fno-common -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -MD -ftime-report -DSTM32F4 -o test_m4.o -c main.c
Using built-in specs.
COLLECT_GCC=arm-none-eabi-gcc
Target: arm-none-eabi
Configured with: /mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/src/gcc/configure --target=arm-none-eabi --prefix=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native --libexecdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/lib --infodir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/info --mandir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/man --htmldir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/html --pdfdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/pdf --enable-languages=c,c++ --enable-plugins --disable-decimal-float --disable-libffi --disable-libgomp --disable-libmudflap --disable-libquadmath --disable-libssp --disable-libstdcxx-pch --disable-nls --disable-shared --disable-threads --disable-tls --with-gnu-as --with-gnu-ld --with-newlib --with-headers=yes --with-python-dir=share/gcc-arm-none-eabi --with-sysroot=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/arm-none-eabi --build=x86_64-linux-gnu --host=x86_64-linux-gnu --with-gmp=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpfr=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpc=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-isl=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr 
--with-libelf=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic -lm' --with-pkgversion='GNU Tools for Arm Embedded Processors 9-2019-q4-major' --with-multilib-list=rmprofile
Thread model: single
gcc version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (GNU Tools for Arm Embedded Processors 9-2019-q4-major)
COLLECT_GCC_OPTIONS='-v' '-O3' '-Wall' '-Wextra' '-Wimplicit-function-declaration' '-Wredundant-decls' '-Wmissing-prototypes' '-Wstrict-prototypes' '-Wundef' '-Wshadow' '-I' '/usr/arm-none-eabi/include' '-I' '../libopencm3/include' '-fno-common' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-MD' '-ftime-report' '-D' 'STM32F4' '-o' 'test_m4.o' '-c' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/cc1 -quiet -v -I /usr/arm-none-eabi/include -I ../libopencm3/include -imultilib thumb/v7e-m+fp/hard -iprefix /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/ -isysroot /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi -MD test_m4.d -MQ test_m4.o -D__USES_INITFINI__ -D STM32F4 main.c -quiet -dumpbase main.c -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -march=armv7e-m+fp -auxbase-strip test_m4.o -O3 -Wall -Wextra -Wimplicit-function-declaration -Wredundant-decls -Wmissing-prototypes -Wstrict-prototypes -Wundef -Wshadow -version -fno-common -ftime-report -o /tmp/ccm5h1i9.s
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/local/include"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include-fixed"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/include"
ignoring nonexistent directory "/usr/arm-none-eabi/include"
#include "..." search starts here:
#include <...> search starts here:
../libopencm3/include
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include-fixed
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include
End of search list.
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 4381e146d4f016ae8e44a645dba65184
Time variable usr sys wall GGC
phase setup : 0.01 ( 8%) 0.01 ( 20%) 0.03 ( 17%) 3569 kB ( 62%)
phase parsing : 0.10 ( 83%) 0.04 ( 80%) 0.14 ( 78%) 2069 kB ( 36%)
phase opt and generate : 0.01 ( 8%) 0.00 ( 0%) 0.01 ( 6%) 120 kB ( 2%)
preprocessing : 0.03 ( 25%) 0.03 ( 60%) 0.03 ( 17%) 889 kB ( 15%)
lexical analysis : 0.04 ( 33%) 0.00 ( 0%) 0.05 ( 28%) 0 kB ( 0%)
parser (global) : 0.02 ( 17%) 0.00 ( 0%) 0.04 ( 22%) 1063 kB ( 18%)
parser struct body : 0.00 ( 0%) 0.00 ( 0%) 0.01 ( 6%) 41 kB ( 1%)
parser enumerator list : 0.01 ( 8%) 0.01 ( 20%) 0.01 ( 6%) 54 kB ( 1%)
tree gimplify : 0.00 ( 0%) 0.00 ( 0%) 0.01 ( 6%) 8 kB ( 0%)
initialize rtl : 0.01 ( 8%) 0.00 ( 0%) 0.00 ( 0%) 7 kB ( 0%)
TOTAL : 0.12 0.05 0.18 5767 kB
COLLECT_GCC_OPTIONS='-v' '-O3' '-Wall' '-Wextra' '-Wimplicit-function-declaration' '-Wredundant-decls' '-Wmissing-prototypes' '-Wstrict-prototypes' '-Wundef' '-Wshadow' '-I' '/usr/arm-none-eabi/include' '-I' '../libopencm3/include' '-fno-common' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-MD' '-ftime-report' '-D' 'STM32F4' '-o' 'test_m4.o' '-c' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/as -v -I /usr/arm-none-eabi/include -I ../libopencm3/include -march=armv7e-m -mfloat-abi=hard -mfpu=fpv4-sp-d16 -meabi=5 -o test_m4.o /tmp/ccm5h1i9.s
GNU assembler version 2.33.1 (arm-none-eabi) using BFD version (GNU Tools for Arm Embedded Processors 9-2019-q4-major) 2.33.1.20191025
COMPILER_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/
LIBRARY_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/
COLLECT_GCC_OPTIONS='-v' '-O3' '-Wall' '-Wextra' '-Wimplicit-function-declaration' '-Wredundant-decls' '-Wmissing-prototypes' '-Wstrict-prototypes' '-Wundef' '-Wshadow' '-I' '/usr/arm-none-eabi/include' '-I' '../libopencm3/include' '-fno-common' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-MD' '-ftime-report' '-D' 'STM32F4' '-o' 'test_m4.o' '-c' '-march=armv7e-m+fp'
arm-none-eabi-gcc -v -o test_m4.elf test_m4.o test.c ../common/stm32f4_wrapper.o --static -Wl,--start-group -lc -lgcc -lnosys -Wl,--end-group -T../common/stm32f4-discovery.ld -nostartfiles -Wl,--gc-sections,--no-print-gc-sections -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -L../libopencm3/lib/ -lopencm3_stm32f4
Using built-in specs.
COLLECT_GCC=arm-none-eabi-gcc
COLLECT_LTO_WRAPPER=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/lto-wrapper
Target: arm-none-eabi
Configured with: /mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/src/gcc/configure --target=arm-none-eabi --prefix=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native --libexecdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/lib --infodir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/info --mandir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/man --htmldir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/html --pdfdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/pdf --enable-languages=c,c++ --enable-plugins --disable-decimal-float --disable-libffi --disable-libgomp --disable-libmudflap --disable-libquadmath --disable-libssp --disable-libstdcxx-pch --disable-nls --disable-shared --disable-threads --disable-tls --with-gnu-as --with-gnu-ld --with-newlib --with-headers=yes --with-python-dir=share/gcc-arm-none-eabi --with-sysroot=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/arm-none-eabi --build=x86_64-linux-gnu --host=x86_64-linux-gnu --with-gmp=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpfr=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpc=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-isl=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr 
--with-libelf=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic -lm' --with-pkgversion='GNU Tools for Arm Embedded Processors 9-2019-q4-major' --with-multilib-list=rmprofile
Thread model: single
gcc version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (GNU Tools for Arm Embedded Processors 9-2019-q4-major)
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/cc1 -quiet -v -imultilib thumb/v7e-m+fp/hard -iprefix /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/ -isysroot /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi -D__USES_INITFINI__ test.c -quiet -dumpbase test.c -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -march=armv7e-m+fp -auxbase test -version -o /tmp/cc3yny6o.s
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/local/include"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include-fixed"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/include"
#include "..." search starts here:
#include <...> search starts here:
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include-fixed
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include
End of search list.
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 4381e146d4f016ae8e44a645dba65184
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/as -v -march=armv7e-m -mfloat-abi=hard -mfpu=fpv4-sp-d16 -meabi=5 -o /tmp/ccfflDpW.o /tmp/cc3yny6o.s
GNU assembler version 2.33.1 (arm-none-eabi) using BFD version (GNU Tools for Arm Embedded Processors 9-2019-q4-major) 2.33.1.20191025
COMPILER_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/
LIBRARY_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/collect2 -plugin /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/liblto_plugin.so -plugin-opt=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/lto-wrapper -plugin-opt=-fresolution=/tmp/cc4qN1Kt.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc --sysroot=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi -Bstatic -X -o test_m4.elf -L../libopencm3/lib/ -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/thumb/v7e-m+fp/hard -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/thumb/v7e-m+fp/hard -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/thumb/v7e-m+fp/hard -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1 -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib test_m4.o /tmp/ccfflDpW.o ../common/stm32f4_wrapper.o --start-group -lc -lgcc -lnosys --end-group --gc-sections --no-print-gc-sections -lopencm3_stm32f4 --start-group -lgcc -lc --end-group -T ../common/stm32f4-discovery.ld
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
arm-none-eabi-objcopy -Obinary test_m4.elf test_m4.bin
The problem is that the optimization flag does not seem to be taken into account: no matter what I put, the generated binary is always the same and the program always prints cycles: 196645, 00000063. By disassembling the binary I get the following output for both the -Os and -O3 optimization levels:
080001ac <main>:
80001ac: b570 push {r4, r5, r6, lr}
80001ae: b08a sub sp, #40 ; 0x28
80001b0: f006 fc06 bl 80069c0 <clock_setup>
80001b4: f006 fc1c bl 80069f0 <gpio_setup>
80001b8: f44f 30e1 mov.w r0, #115200 ; 0x1c200
80001bc: f006 fc32 bl 8006a24 <usart_setup>
80001c0: f006 fc52 bl 8006a68 <flash_setup>
80001c4: 490e ldr r1, [pc, #56] ; (8000200 <main+0x54>)
80001c6: 4c0f ldr r4, [pc, #60] ; (8000204 <main+0x58>)
80001c8: 680b ldr r3, [r1, #0]
80001ca: 4a0f ldr r2, [pc, #60] ; (8000208 <main+0x5c>)
80001cc: 2500 movs r5, #0
80001ce: f043 7380 orr.w r3, r3, #16777216 ; 0x1000000
80001d2: 600b str r3, [r1, #0]
80001d4: 6025 str r5, [r4, #0]
80001d6: 6813 ldr r3, [r2, #0]
80001d8: f043 0301 orr.w r3, r3, #1
80001dc: 6013 str r3, [r2, #0]
80001de: 6826 ldr r6, [r4, #0]
80001e0: f000 f816 bl 8000210 <test>
80001e4: 6822 ldr r2, [r4, #0]
80001e6: 4909 ldr r1, [pc, #36] ; (800020c <main+0x60>)
80001e8: 4603 mov r3, r0
80001ea: 1b92 subs r2, r2, r6
80001ec: a801 add r0, sp, #4
80001ee: f006 fca5 bl 8006b3c <sprintf>
80001f2: a801 add r0, sp, #4
80001f4: f006 fc48 bl 8006a88 <send_USART_str>
80001f8: 4628 mov r0, r5
80001fa: b00a add sp, #40 ; 0x28
80001fc: bd70 pop {r4, r5, r6, pc}
80001fe: bf00 nop
8000200: e000edfc .word 0xe000edfc
8000204: e0001004 .word 0xe0001004
8000208: e0001000 .word 0xe0001000
800020c: 0800c1e8 .word 0x0800c1e8
08000210 <test>:
8000210: b480 push {r7}
8000212: b087 sub sp, #28
8000214: af00 add r7, sp, #0
8000216: 2375 movs r3, #117 ; 0x75
8000218: 60fb str r3, [r7, #12]
800021a: 2314 movs r3, #20
800021c: 60bb str r3, [r7, #8]
800021e: 2300 movs r3, #0
8000220: 613b str r3, [r7, #16]
8000222: e020 b.n 8000266 <test+0x56>
8000224: 68fb ldr r3, [r7, #12]
8000226: b2db uxtb r3, r3
8000228: 607b str r3, [r7, #4]
800022a: 68bb ldr r3, [r7, #8]
800022c: b2db uxtb r3, r3
800022e: 603b str r3, [r7, #0]
8000230: 687a ldr r2, [r7, #4]
8000232: 683b ldr r3, [r7, #0]
8000234: 405a eors r2, r3
8000236: 693b ldr r3, [r7, #16]
8000238: 2b00 cmp r3, #0
800023a: da01 bge.n 8000240 <test+0x30>
800023c: f203 13ff addw r3, r3, #511 ; 0x1ff
8000240: 125b asrs r3, r3, #9
8000242: 4619 mov r1, r3
8000244: 68fb ldr r3, [r7, #12]
8000246: 40cb lsrs r3, r1
8000248: 405a eors r2, r3
800024a: 693b ldr r3, [r7, #16]
800024c: 2b00 cmp r3, #0
800024e: da01 bge.n 8000254 <test+0x44>
8000250: f203 33ff addw r3, r3, #1023 ; 0x3ff
8000254: 129b asrs r3, r3, #10
8000256: 4619 mov r1, r3
8000258: 68bb ldr r3, [r7, #8]
800025a: 40cb lsrs r3, r1
800025c: 4053 eors r3, r2
800025e: 617b str r3, [r7, #20]
8000260: 693b ldr r3, [r7, #16]
8000262: 3301 adds r3, #1
8000264: 613b str r3, [r7, #16]
8000266: 693b ldr r3, [r7, #16]
8000268: f5b3 5f80 cmp.w r3, #4096 ; 0x1000
800026c: dbda blt.n 8000224 <test+0x14>
800026e: 697b ldr r3, [r7, #20]
8000270: 4618 mov r0, r3
8000272: 371c adds r7, #28
8000274: 46bd mov sp, r7
8000276: f85d 7b04 ldr.w r7, [sp], #4
800027a: 4770 bx lr
It seems really weird to me, as the code can clearly be improved in terms of speed. For instance, a single uxtb could be executed instead of two (if executed after the eor), so I believe there is something wrong going on here. Why is the optimization flag not taken into account here? Is there something wrong with my makefile?
typedef unsigned int uint32_t;
/*
 * Function under test, copied from the question.
 *
 * Every one of the 4096 iterations overwrites c using only a, b and the
 * loop index; the value returned is the one computed when i == 4095.
 */
uint32_t test(uint32_t a, uint32_t b) {
uint32_t tmp0, tmp1;
uint32_t c;
for(int i = 0; i< 4096; i++) {
/* Keep only the low 8 bits of each argument. */
tmp0 = a & 0xff;
tmp1 = b & 0xff;
/* On the last pass the shift counts are 4095/512 == 7 and 4095/1024 == 3. */
c = tmp0 ^ tmp1 ^ (a>>(i/512)) ^ (b >> (i/1024));
}
return c;
}
/* Calls test() with the constant arguments 0x75 and 0x14 and returns its result. */
unsigned int hello(void)
{
    return test(0x75, 0x14);
}
9.3.0 vs 9.2.1 is not going to be much different, I can specifically get a 9.2.1 if you want to see it but you can just do this yourself.
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-O0
arm-none-eabi-gcc -O0 so.c -c -mthumb -mcpu=cortex-m4 -o so.o
Disassembly of section .text:
00000000 <test>:
0: b480 push {r7}
2: b087 sub sp, #28
4: af00 add r7, sp, #0
6: 6078 str r0, [r7, #4]
8: 6039 str r1, [r7, #0]
a: 2300 movs r3, #0
c: 613b str r3, [r7, #16]
e: e020 b.n 52 <test+0x52>
10: 687b ldr r3, [r7, #4]
12: b2db uxtb r3, r3
14: 60fb str r3, [r7, #12]
16: 683b ldr r3, [r7, #0]
18: b2db uxtb r3, r3
1a: 60bb str r3, [r7, #8]
1c: 68fa ldr r2, [r7, #12]
1e: 68bb ldr r3, [r7, #8]
20: 405a eors r2, r3
22: 693b ldr r3, [r7, #16]
24: 2b00 cmp r3, #0
26: da01 bge.n 2c <test+0x2c>
28: f203 13ff addw r3, r3, #511 ; 0x1ff
2c: 125b asrs r3, r3, #9
2e: 4619 mov r1, r3
30: 687b ldr r3, [r7, #4]
32: 40cb lsrs r3, r1
34: 405a eors r2, r3
36: 693b ldr r3, [r7, #16]
38: 2b00 cmp r3, #0
3a: da01 bge.n 40 <test+0x40>
3c: f203 33ff addw r3, r3, #1023 ; 0x3ff
40: 129b asrs r3, r3, #10
42: 4619 mov r1, r3
44: 683b ldr r3, [r7, #0]
46: 40cb lsrs r3, r1
48: 4053 eors r3, r2
4a: 617b str r3, [r7, #20]
4c: 693b ldr r3, [r7, #16]
4e: 3301 adds r3, #1
50: 613b str r3, [r7, #16]
52: 693b ldr r3, [r7, #16]
54: f5b3 5f80 cmp.w r3, #4096 ; 0x1000
58: dbda blt.n 10 <test+0x10>
5a: 697b ldr r3, [r7, #20]
5c: 4618 mov r0, r3
5e: 371c adds r7, #28
60: 46bd mov sp, r7
62: bc80 pop {r7}
64: 4770 bx lr
00000066 <hello>:
66: b580 push {r7, lr}
68: af00 add r7, sp, #0
6a: 2114 movs r1, #20
6c: 2075 movs r0, #117 ; 0x75
6e: f7ff fffe bl 0 <test>
72: 4603 mov r3, r0
74: 4618 mov r0, r3
76: bd80 pop {r7, pc}
-O1
arm-none-eabi-gcc -O1 so.c -c -mthumb -mcpu=cortex-m4 -o so.o
arm-none-eabi-objdump -D so.o
so.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <test>:
0: f44f 5380 mov.w r3, #4096 ; 0x1000
4: 3b01 subs r3, #1
6: d1fd bne.n 4 <test+0x4>
8: 08ca lsrs r2, r1, #3
a: ea82 12d0 eor.w r2, r2, r0, lsr #7
e: ea80 0301 eor.w r3, r0, r1
12: b2db uxtb r3, r3
14: ea82 0003 eor.w r0, r2, r3
18: 4770 bx lr
0000001a <hello>:
1a: b508 push {r3, lr}
1c: 2114 movs r1, #20
1e: 2075 movs r0, #117 ; 0x75
20: f7ff fffe bl 0 <test>
24: bd08 pop {r3, pc}
-O2
Disassembly of section .text:
00000000 <test>:
0: ea80 0301 eor.w r3, r0, r1
4: 08ca lsrs r2, r1, #3
6: ea82 10d0 eor.w r0, r2, r0, lsr #7
a: b2db uxtb r3, r3
c: 4058 eors r0, r3
e: 4770 bx lr
00000010 <hello>:
10: 2063 movs r0, #99 ; 0x63
12: 4770 bx lr
-O3
00000000 <test>:
0: ea80 0301 eor.w r3, r0, r1
4: 08ca lsrs r2, r1, #3
6: ea82 10d0 eor.w r0, r2, r0, lsr #7
a: b2db uxtb r3, r3
c: 4058 eors r0, r3
e: 4770 bx lr
00000010 <hello>:
10: 2063 movs r0, #99 ; 0x63
12: 4770 bx lr
-Os
00000000 <test>:
0: 08cb lsrs r3, r1, #3
2: ea83 13d0 eor.w r3, r3, r0, lsr #7
6: 4048 eors r0, r1
8: b2c0 uxtb r0, r0
a: 4058 eors r0, r3
c: 4770 bx lr
0000000e <hello>:
e: 2114 movs r1, #20
10: 2075 movs r0, #117 ; 0x75
12: f7ff bffe b.w 0 <test>
If all of these execute in the same amount of time then clearly yes you either have a build issue or there is an issue with your test. If as you claim -O1 and -O2 and -O3 etc all produce the same output, then you are not actually using those optimization levels.
There is no reason to assume that -Os produces a smaller binary than -O2 or -O3. It's just that you are hinting at that desire. You can create exceptions.
There is also no reason to assume that code compiled for size will execute faster, nor that -O3 will, etc. Especially on a platform like this (well, all modern platforms), where some percentage of the performance is not related directly to the number or sequence of instructions, but to the whole system.
You are on an STM32 Cortex-M4, so you have the ST flash cache thing, which you can't turn off; that is going to help all of the tests, but it is also going to hide things. You have a clock init and then later a flash setup; I wonder what is going on there: if you are upping your clock, then you have to slow the flash down first, not after, or you can crash. For a test like this there generally is no reason to bump the clocks. You want to measure in timer clock cycles, ideally system (as in CPU) clock cycles, and then do things like mess with the flash wait states. At the slower clock speeds (on some parts the full range) you can go with minimal flash wait states and then simply bump up the wait states for different tests, without upping the clock, to see how the flash affects it; unfortunately this is an STM32. To get around that you can run the test in SRAM.
Depending on the compile-time options for the core, some cores have different fetch features and other features, and you may have some core features you can mess with. A simple alignment change of a tight loop like this can have a dramatic effect: the same machine code starting at a different address, and how it lines up in the fetch lines and cache lines, can affect the benchmark results.
Note you can get the same results using the systick timer you need for the debugger timer. You can wrap the gathering of time in the code under test (not in the function, but when you lift the assembly language to make the code under test you can then add the time gathering just before and after), not incurring function call overhead, which itself can vary from test to test.
If you are seeing the same machine code come out of the compiler for different settings then you are not actually building with those settings, not actually re-building the application, or some other form of user error (building here and using a binary from there). As a result the same binary will give the same time plus or minus a clock ideally in a situation like this. But that depends also on how you are running or re-running the test. Do you want to see the cache effects or not, prime the cache then run the test, etc.
If you start to see different machine code, or if you really are seeing different machine code but getting the same time, then the error is in the measurement of time, which is an often overlooked issue with benchmarking. Your method seems fine so long as you are really seeing that timer, and have done tests to see that the timer is counting and in the direction you expect it to go. If this is a counter of instructions rather than of execution time, then you can still test it to see it is doing what you think. I have no use for those debug tools, so I don't dabble in them nor know them as intimately as I know other things about these systems.
Being an m4 there might be other features you can turn on/off to see performance differences based on code generated, branch prediction, caching, mmu-like-thing, etc.
It may be the order of the flags you are using (the reason for each of those flags being a first question) relative to the -O3, some may be negating other optimization features.
I am curious to know what the real goal is here. Understand that benchmarks are nonsense, because they are so easy to manipulate; the same high-level code is not expected to produce the same results on the same target with the same or different tools, for various reasons. Dumb down the command line and try clang/llvm vs gnu, or try gcc 4.x.x, 5.x.x, etc. on up. After 4.x.x the output started to get bloated and the compiler wasn't doing as good a job; for something like this, though, they should be pretty close, but at the same time one fewer or one more instruction, or a simple alignment difference, could make two tests vastly different in terms of results.
Then when you put back clock setups, that changes how things work, so you may be able to, let's say, use no wait states (flash probably runs at half the CPU rate, so there is a wait built in) up to 25 MHz as an example, then add a wait state up to say 50, and so on. It varies by design; on some newer parts the flash can run much faster than on older parts, but at 25 MHz vs 8, the same number of clocks is an overall smaller amount of wall-clock time. At the boundary you can arguably not bump the wait states if you create/modify the clock init code, and get that performance boost, but just over that boundary you get a performance hit from the flash wait state increase. So there is a performance balance there.
Summary
If the same code is coming out of the compiler then it is your command line, you can easily simplify the command line to see that the tools will produce different code. If your comparison is wrong and the code is different then the problem is how you are timing the code which is often where benchmarks go wrong, that and other factors not related to the compiler command line. Benchmarks are in general nonsense because they can be manipulated to show different results (even without changing the high level source code of the test).
Try simplifying the command line, examine each option you have there and justify why it is there for your specific application. Validate the timer the best you can or instruction counter whichever this is (and understand that counting instructions executed is not directly related to performance, you can have 100x instructions that execute faster than some other solution).
No reason to expect -Os to produce smaller code, one would hope but there are exceptions. Likewise -Os might execute faster than -O2 or -O3, no reason to expect the larger number optimization level to produce "faster" code.
You are compiling the code with -O0 flag.
It is clearly seen here:
https://godbolt.org/z/qZPYqJ
So the compiler is always right. No missed optimisations found.
Well, the real answer is not really easy, but before disassembling something one should know what optimization actually is and how compilers achieve their goals.
Considering gcc, there is barely any difference between -Os and -O3, since they turn on almost the same internal flags except loop unrolling for -Os.
In addition, with today's CPUs, having everything in cache is faster anyway.
(I am new to the ARM world. Excuse me if this is a dumb question.)
I am using below command line to generate assembly code for a C file.
The cpu is arm926ej-s, which is ARMv5 architecture.
arm-none-eabi-gcc -mcpu=arm926ej-s -mthumb -S t.c -o t_thumb.S
arm-none-eabi-gcc -mcpu=arm926ej-s -marm -S t.c -o t_arm.S
I am expecting the -marm and -mthumb options would generate different function prologues. But they give similar results:
for -marm:
# args = 0, pretend = 0, frame = 72
# frame_needed = 1, uses_anonymous_args = 0
push {fp, lr} #<========== push is used instead of stmfd
add fp, sp, #4
sub sp, sp, #72
bl uart_init
for -mthumb:
# args = 0, pretend = 0, frame = 72
# frame_needed = 1, uses_anonymous_args = 0
push {r7, lr} #<========== push is used as expected
sub sp, sp, #72
add r7, sp, #0
bl uart_init
So they both use the push instruction. But as I checked the ARMv5 arch spec, the push instruction only belongs to the Thumb instruction set. I was expecting stmfd for the -marm option.
Why is push chosen instead?
How can I generate pure ARM instructions?
ADD 1 - 5:21 PM 12/18/2019
Below is the disassembly of the .o file:
arm-none-eabi-gcc -mcpu=arm926ej-s -marm -g -c t.c -o build/t_arm.o
arm-none-eabi-objdump.exe -d build/t_arm.o > t_arm.dism
The disassembly:
000002a0 <main>:
2a0: e92d4800 push {fp, lr} <=============== push is used!
2a4: e28db004 add fp, sp, #4
2a8: e24dd048 sub sp, sp, #72 ; 0x48
2ac: ebfffffe bl 0 <uart_init>
2b0: e59f3168 ldr r3, [pc, #360] ; 420 <main+0x180>
2b4: e50b300c str r3, [fp, #-12]
2b8: e59f1164 ldr r1, [pc, #356] ; 424 <main+0x184>
2bc: e51b000c ldr r0, [fp, #-12]
ADD 2 - 5:34 PM 12/18/2019
Thanks to @Erlkoenig.
I just tried to disassemble a -mthumb binary:
arm-none-eabi-gcc -mcpu=arm926ej-s -mthumb -g -c t.c -o build/t_thumb.o
arm-none-eabi-objdump.exe -d build/t_thumb.o > t_thumb.dism
A totally different thumb disassembly is shown:
00000170 <main>:
170: b580 push {r7, lr} <====== though still push is shown, but the encoding is different.
172: b092 sub sp, #72 ; 0x48
174: af00 add r7, sp, #0
176: f7ff fffe bl 0 <uart_init>
17a: 4b3c ldr r3, [pc, #240] ; (26c <main+0xfc>)
17c: 643b str r3, [r7, #64] ; 0x40
17e: 4a3c ldr r2, [pc, #240] ; (270 <main+0x100>)
180: 6c3b ldr r3, [r7, #64] ; 0x40
The hex encoding of the raw instruction as shown by objdump -d indicates that this is a 32-bit ARM ("A32") instruction (0xe92d4800). The .S file generated by the -S flag to GCC and the objdump output both use the ARM UAL (Unified Assembler Language) syntax, which uses push as an alias for stmfd with sp as the base register, while the ARMv5T Architecture Reference Manual uses the old syntax, which has no push on A32. The instruction encoding matches the encoding of stmdb, for which stmfd is an alias. The encoding is shown on p. 339 in the ARMv5T Reference Manual.
A32 ("ARM") code can be easily recognized, as all instructions are 4 bytes wide and the first 4 bits are often hex E (which means that the condition code is AL, i.e. the instruction is always executed, unconditionally):
[e]92d4800
[e]28db004
[e]24dd048
[e]bfffffe
This is helpful when viewing raw binaries in a hex editor. Thumb ("T32") code has many 16bit instructions, some 32bit, and no "stacks" of Es:
b580
b092
af00
f7ff fffe
Of course, for a raw binary, it is not directly clear which 2- and 4-byte groups belong together as instructions.