I want to compile a program with arm-none-eabi-gcc 9.2.1 using the libopencm3 project and run it on ARM Cortex-M4 processors. My program is composed of two files: main.c
#include "../common/stm32wrapper.h"
#include "test.h"
#include <stdio.h>
#include <string.h>
typedef unsigned char u8;
typedef unsigned int u32;
typedef unsigned long long u64;
int main(void)
{
clock_setup();
gpio_setup();
usart_setup(115200);
flash_setup();
SCS_DEMCR |= SCS_DEMCR_TRCENA;
DWT_CYCCNT = 0;
DWT_CTRL |= DWT_CTRL_CYCCNTENA;
u32 oldcount, newcount;
u32 a = 0x75;
u32 b = 0x14;
char buffer[36];
oldcount = DWT_CYCCNT;
u32 c = test(a,b);
newcount = DWT_CYCCNT-oldcount;
sprintf(buffer, "cycles: %d, %08x", newcount, c);
send_USART_str(buffer);
return 0;
}
and test.c.
uint32_t test(uint32_t a, uint32_t b) {
uint32_t tmp0, tmp1;
uint32_t c;
for(int i = 0; i< 4096; i++) {
tmp0 = a & 0xff;
tmp1 = b & 0xff;
c = tmp0 ^ tmp1 ^ (a>>(i/512)) ^ (b >> (i/1024));
}
return c;
}
To compile my program, I use the following makefile:
.PHONY: all clean
PREFIX ?= arm-none-eabi
CC = $(PREFIX)-gcc -v
LD = $(PREFIX)-gcc -v
OBJCOPY = $(PREFIX)-objcopy
OBJDUMP = $(PREFIX)-objdump
GDB = $(PREFIX)-gdb
OPENCM3DIR = ../libopencm3
ARMNONEEABIDIR = /usr/arm-none-eabi
COMMONDIR = ../common
all: test_m4.bin
test_m4.%: ARCH_FLAGS = -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16
test_m4.o: CFLAGS += -DSTM32F4
$(COMMONDIR)/stm32f4_wrapper.o: CFLAGS += -DSTM32F4
test_m4.elf: LDSCRIPT = $(COMMONDIR)/stm32f4-discovery.ld
test_m4.elf: LDFLAGS += -L$(OPENCM3DIR)/lib/ -lopencm3_stm32f4
test_m4.elf: OBJS += $(COMMONDIR)/stm32f4_wrapper.o
test_m4.elf: $(COMMONDIR)/stm32f4_wrapper.o $(OPENCM3DIR)/lib/libopencm3_stm32f4.a
CFLAGS += -O3 \
-Wall -Wextra -Wimplicit-function-declaration \
-Wredundant-decls -Wmissing-prototypes -Wstrict-prototypes \
-Wundef -Wshadow \
-I$(ARMNONEEABIDIR)/include -I$(OPENCM3DIR)/include \
-fno-common $(ARCH_FLAGS) -MD \
-ftime-report
LDFLAGS += --static -Wl,--start-group -lc -lgcc -lnosys -Wl,--end-group \
-T$(LDSCRIPT) -nostartfiles -Wl,--gc-sections,--no-print-gc-sections \
$(ARCH_FLAGS)
OBJS += test.c
%.bin: %.elf
$(OBJCOPY) -Obinary $^ $#
%.elf: %.o $(OBJS) $(LDSCRIPT)
$(LD) -o $# $< $(OBJS) $(LDFLAGS)
test%.o: main.c
$(CC) $(CFLAGS) -o $# -c $^
%.o: %.c
$(CC) $(CFLAGS) -o $# -c $^
clean:
rm -f *.o *.d *.elf *.bin
I can compile and run my code using this makefile. By running make I get the following output:
arm-none-eabi-gcc -v -O3 -Wall -Wextra -Wimplicit-function-declaration -Wredundant-decls -Wmissing-prototypes -Wstrict-prototypes -Wundef -Wshadow -I/usr/arm-none-eabi/include -I../libopencm3/include -fno-common -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -MD -ftime-report -DSTM32F4 -o test_m4.o -c main.c
Using built-in specs.
COLLECT_GCC=arm-none-eabi-gcc
Target: arm-none-eabi
Configured with: /mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/src/gcc/configure --target=arm-none-eabi --prefix=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native --libexecdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/lib --infodir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/info --mandir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/man --htmldir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/html --pdfdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/pdf --enable-languages=c,c++ --enable-plugins --disable-decimal-float --disable-libffi --disable-libgomp --disable-libmudflap --disable-libquadmath --disable-libssp --disable-libstdcxx-pch --disable-nls --disable-shared --disable-threads --disable-tls --with-gnu-as --with-gnu-ld --with-newlib --with-headers=yes --with-python-dir=share/gcc-arm-none-eabi --with-sysroot=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/arm-none-eabi --build=x86_64-linux-gnu --host=x86_64-linux-gnu --with-gmp=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpfr=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpc=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-isl=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-libelf=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic -lm' --with-pkgversion='GNU Tools for Arm Embedded Processors 9-2019-q4-major' --with-multilib-list=rmprofile
Thread model: single
gcc version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (GNU Tools for Arm Embedded Processors 9-2019-q4-major)
COLLECT_GCC_OPTIONS='-v' '-O3' '-Wall' '-Wextra' '-Wimplicit-function-declaration' '-Wredundant-decls' '-Wmissing-prototypes' '-Wstrict-prototypes' '-Wundef' '-Wshadow' '-I' '/usr/arm-none-eabi/include' '-I' '../libopencm3/include' '-fno-common' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-MD' '-ftime-report' '-D' 'STM32F4' '-o' 'test_m4.o' '-c' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/cc1 -quiet -v -I /usr/arm-none-eabi/include -I ../libopencm3/include -imultilib thumb/v7e-m+fp/hard -iprefix /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/ -isysroot /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi -MD test_m4.d -MQ test_m4.o -D__USES_INITFINI__ -D STM32F4 main.c -quiet -dumpbase main.c -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -march=armv7e-m+fp -auxbase-strip test_m4.o -O3 -Wall -Wextra -Wimplicit-function-declaration -Wredundant-decls -Wmissing-prototypes -Wstrict-prototypes -Wundef -Wshadow -version -fno-common -ftime-report -o /tmp/ccm5h1i9.s
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/local/include"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include-fixed"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/include"
ignoring nonexistent directory "/usr/arm-none-eabi/include"
#include "..." search starts here:
#include <...> search starts here:
../libopencm3/include
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include-fixed
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include
End of search list.
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 4381e146d4f016ae8e44a645dba65184
Time variable usr sys wall GGC
phase setup : 0.01 ( 8%) 0.01 ( 20%) 0.03 ( 17%) 3569 kB ( 62%)
phase parsing : 0.10 ( 83%) 0.04 ( 80%) 0.14 ( 78%) 2069 kB ( 36%)
phase opt and generate : 0.01 ( 8%) 0.00 ( 0%) 0.01 ( 6%) 120 kB ( 2%)
preprocessing : 0.03 ( 25%) 0.03 ( 60%) 0.03 ( 17%) 889 kB ( 15%)
lexical analysis : 0.04 ( 33%) 0.00 ( 0%) 0.05 ( 28%) 0 kB ( 0%)
parser (global) : 0.02 ( 17%) 0.00 ( 0%) 0.04 ( 22%) 1063 kB ( 18%)
parser struct body : 0.00 ( 0%) 0.00 ( 0%) 0.01 ( 6%) 41 kB ( 1%)
parser enumerator list : 0.01 ( 8%) 0.01 ( 20%) 0.01 ( 6%) 54 kB ( 1%)
tree gimplify : 0.00 ( 0%) 0.00 ( 0%) 0.01 ( 6%) 8 kB ( 0%)
initialize rtl : 0.01 ( 8%) 0.00 ( 0%) 0.00 ( 0%) 7 kB ( 0%)
TOTAL : 0.12 0.05 0.18 5767 kB
COLLECT_GCC_OPTIONS='-v' '-O3' '-Wall' '-Wextra' '-Wimplicit-function-declaration' '-Wredundant-decls' '-Wmissing-prototypes' '-Wstrict-prototypes' '-Wundef' '-Wshadow' '-I' '/usr/arm-none-eabi/include' '-I' '../libopencm3/include' '-fno-common' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-MD' '-ftime-report' '-D' 'STM32F4' '-o' 'test_m4.o' '-c' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/as -v -I /usr/arm-none-eabi/include -I ../libopencm3/include -march=armv7e-m -mfloat-abi=hard -mfpu=fpv4-sp-d16 -meabi=5 -o test_m4.o /tmp/ccm5h1i9.s
GNU assembler version 2.33.1 (arm-none-eabi) using BFD version (GNU Tools for Arm Embedded Processors 9-2019-q4-major) 2.33.1.20191025
COMPILER_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/
LIBRARY_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/
COLLECT_GCC_OPTIONS='-v' '-O3' '-Wall' '-Wextra' '-Wimplicit-function-declaration' '-Wredundant-decls' '-Wmissing-prototypes' '-Wstrict-prototypes' '-Wundef' '-Wshadow' '-I' '/usr/arm-none-eabi/include' '-I' '../libopencm3/include' '-fno-common' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-MD' '-ftime-report' '-D' 'STM32F4' '-o' 'test_m4.o' '-c' '-march=armv7e-m+fp'
arm-none-eabi-gcc -v -o test_m4.elf test_m4.o test.c ../common/stm32f4_wrapper.o --static -Wl,--start-group -lc -lgcc -lnosys -Wl,--end-group -T../common/stm32f4-discovery.ld -nostartfiles -Wl,--gc-sections,--no-print-gc-sections -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -L../libopencm3/lib/ -lopencm3_stm32f4
Using built-in specs.
COLLECT_GCC=arm-none-eabi-gcc
COLLECT_LTO_WRAPPER=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/lto-wrapper
Target: arm-none-eabi
Configured with: /mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/src/gcc/configure --target=arm-none-eabi --prefix=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native --libexecdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/lib --infodir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/info --mandir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/man --htmldir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/html --pdfdir=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/share/doc/gcc-arm-none-eabi/pdf --enable-languages=c,c++ --enable-plugins --disable-decimal-float --disable-libffi --disable-libgomp --disable-libmudflap --disable-libquadmath --disable-libssp --disable-libstdcxx-pch --disable-nls --disable-shared --disable-threads --disable-tls --with-gnu-as --with-gnu-ld --with-newlib --with-headers=yes --with-python-dir=share/gcc-arm-none-eabi --with-sysroot=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/install-native/arm-none-eabi --build=x86_64-linux-gnu --host=x86_64-linux-gnu --with-gmp=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpfr=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-mpc=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-isl=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-libelf=/mnt/workspace/workspace/GCC-9-pipeline/jenkins-GCC-9-pipeline-100_20191030_1572397542/build-native/host-libs/usr --with-host-libstdcxx='-static-libgcc -Wl,-Bstatic,-lstdc++,-Bdynamic -lm' --with-pkgversion='GNU Tools for Arm Embedded Processors 9-2019-q4-major' --with-multilib-list=rmprofile
Thread model: single
gcc version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (GNU Tools for Arm Embedded Processors 9-2019-q4-major)
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/cc1 -quiet -v -imultilib thumb/v7e-m+fp/hard -iprefix /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/ -isysroot /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi -D__USES_INITFINI__ test.c -quiet -dumpbase test.c -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16 -march=armv7e-m+fp -auxbase test -version -o /tmp/cc3yny6o.s
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/local/include"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/include-fixed"
ignoring duplicate directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/../../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include"
ignoring nonexistent directory "/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/usr/include"
#include "..." search starts here:
#include <...> search starts here:
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/include-fixed
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/include
End of search list.
GNU C17 (GNU Tools for Arm Embedded Processors 9-2019-q4-major) version 9.2.1 20191025 (release) [ARM/arm-9-branch revision 277599] (arm-none-eabi)
compiled by GNU C version 4.8.4, GMP version 6.1.0, MPFR version 3.1.4, MPC version 1.0.3, isl version isl-0.18-GMP
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
Compiler executable checksum: 4381e146d4f016ae8e44a645dba65184
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/as -v -march=armv7e-m -mfloat-abi=hard -mfpu=fpv4-sp-d16 -meabi=5 -o /tmp/ccfflDpW.o /tmp/cc3yny6o.s
GNU assembler version 2.33.1 (arm-none-eabi) using BFD version (GNU Tools for Arm Embedded Processors 9-2019-q4-major) 2.33.1.20191025
COMPILER_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/bin/
LIBRARY_PATH=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/thumb/v7e-m+fp/hard/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/:/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/collect2 -plugin /usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/liblto_plugin.so -plugin-opt=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/lto-wrapper -plugin-opt=-fresolution=/tmp/cc4qN1Kt.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc --sysroot=/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi -Bstatic -X -o test_m4.elf -L../libopencm3/lib/ -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/thumb/v7e-m+fp/hard -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib/thumb/v7e-m+fp/hard -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib/thumb/v7e-m+fp/hard -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1 -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../lib/gcc/arm-none-eabi/9.2.1/../../../../arm-none-eabi/lib -L/usr/gcc-arm-none-eabi-9-2019-q4-major/bin/../arm-none-eabi/lib test_m4.o /tmp/ccfflDpW.o ../common/stm32f4_wrapper.o --start-group -lc -lgcc -lnosys --end-group --gc-sections --no-print-gc-sections -lopencm3_stm32f4 --start-group -lgcc -lc --end-group -T ../common/stm32f4-discovery.ld
COLLECT_GCC_OPTIONS='-v' '-o' 'test_m4.elf' '-static' '-T' '../common/stm32f4-discovery.ld' '-nostartfiles' '-mthumb' '-mcpu=cortex-m4' '-mfloat-abi=hard' '-mfpu=fpv4-sp-d16' '-L../libopencm3/lib/' '-march=armv7e-m+fp'
arm-none-eabi-objcopy -Obinary test_m4.elf test_m4.bin
The thing that it seems the optimization flag does not seem to be taken into account, as no matter what I put, the generated binary is always the same and the program always print cycles: 196645, 00000063. By disassembling the binary I get the following output for both -Os and -O3 optimizations:
080001ac <main>:
80001ac: b570 push {r4, r5, r6, lr}
80001ae: b08a sub sp, #40 ; 0x28
80001b0: f006 fc06 bl 80069c0 <clock_setup>
80001b4: f006 fc1c bl 80069f0 <gpio_setup>
80001b8: f44f 30e1 mov.w r0, #115200 ; 0x1c200
80001bc: f006 fc32 bl 8006a24 <usart_setup>
80001c0: f006 fc52 bl 8006a68 <flash_setup>
80001c4: 490e ldr r1, [pc, #56] ; (8000200 <main+0x54>)
80001c6: 4c0f ldr r4, [pc, #60] ; (8000204 <main+0x58>)
80001c8: 680b ldr r3, [r1, #0]
80001ca: 4a0f ldr r2, [pc, #60] ; (8000208 <main+0x5c>)
80001cc: 2500 movs r5, #0
80001ce: f043 7380 orr.w r3, r3, #16777216 ; 0x1000000
80001d2: 600b str r3, [r1, #0]
80001d4: 6025 str r5, [r4, #0]
80001d6: 6813 ldr r3, [r2, #0]
80001d8: f043 0301 orr.w r3, r3, #1
80001dc: 6013 str r3, [r2, #0]
80001de: 6826 ldr r6, [r4, #0]
80001e0: f000 f816 bl 8000210 <test>
80001e4: 6822 ldr r2, [r4, #0]
80001e6: 4909 ldr r1, [pc, #36] ; (800020c <main+0x60>)
80001e8: 4603 mov r3, r0
80001ea: 1b92 subs r2, r2, r6
80001ec: a801 add r0, sp, #4
80001ee: f006 fca5 bl 8006b3c <sprintf>
80001f2: a801 add r0, sp, #4
80001f4: f006 fc48 bl 8006a88 <send_USART_str>
80001f8: 4628 mov r0, r5
80001fa: b00a add sp, #40 ; 0x28
80001fc: bd70 pop {r4, r5, r6, pc}
80001fe: bf00 nop
8000200: e000edfc .word 0xe000edfc
8000204: e0001004 .word 0xe0001004
8000208: e0001000 .word 0xe0001000
800020c: 0800c1e8 .word 0x0800c1e8
08000210 <test>:
8000210: b480 push {r7}
8000212: b087 sub sp, #28
8000214: af00 add r7, sp, #0
8000216: 2375 movs r3, #117 ; 0x75
8000218: 60fb str r3, [r7, #12]
800021a: 2314 movs r3, #20
800021c: 60bb str r3, [r7, #8]
800021e: 2300 movs r3, #0
8000220: 613b str r3, [r7, #16]
8000222: e020 b.n 8000266 <test+0x56>
8000224: 68fb ldr r3, [r7, #12]
8000226: b2db uxtb r3, r3
8000228: 607b str r3, [r7, #4]
800022a: 68bb ldr r3, [r7, #8]
800022c: b2db uxtb r3, r3
800022e: 603b str r3, [r7, #0]
8000230: 687a ldr r2, [r7, #4]
8000232: 683b ldr r3, [r7, #0]
8000234: 405a eors r2, r3
8000236: 693b ldr r3, [r7, #16]
8000238: 2b00 cmp r3, #0
800023a: da01 bge.n 8000240 <test+0x30>
800023c: f203 13ff addw r3, r3, #511 ; 0x1ff
8000240: 125b asrs r3, r3, #9
8000242: 4619 mov r1, r3
8000244: 68fb ldr r3, [r7, #12]
8000246: 40cb lsrs r3, r1
8000248: 405a eors r2, r3
800024a: 693b ldr r3, [r7, #16]
800024c: 2b00 cmp r3, #0
800024e: da01 bge.n 8000254 <test+0x44>
8000250: f203 33ff addw r3, r3, #1023 ; 0x3ff
8000254: 129b asrs r3, r3, #10
8000256: 4619 mov r1, r3
8000258: 68bb ldr r3, [r7, #8]
800025a: 40cb lsrs r3, r1
800025c: 4053 eors r3, r2
800025e: 617b str r3, [r7, #20]
8000260: 693b ldr r3, [r7, #16]
8000262: 3301 adds r3, #1
8000264: 613b str r3, [r7, #16]
8000266: 693b ldr r3, [r7, #16]
8000268: f5b3 5f80 cmp.w r3, #4096 ; 0x1000
800026c: dbda blt.n 8000224 <test+0x14>
800026e: 697b ldr r3, [r7, #20]
8000270: 4618 mov r0, r3
8000272: 371c adds r7, #28
8000274: 46bd mov sp, r7
8000276: f85d 7b04 ldr.w r7, [sp], #4
800027a: 4770 bx lr
It seems really weird to me as the code can be clearly enhanced in terms of speed. For instance, a single uxtb can be computed instead of two (if executed after the eor), so I believe there is something wrong going here. Why is the optimization flag not taken into account here? Is there something wrong with my makefile?
typedef unsigned int uint32_t;
uint32_t test(uint32_t a, uint32_t b) {
uint32_t tmp0, tmp1;
uint32_t c;
for(int i = 0; i< 4096; i++) {
tmp0 = a & 0xff;
tmp1 = b & 0xff;
c = tmp0 ^ tmp1 ^ (a>>(i/512)) ^ (b >> (i/1024));
}
return c;
}
unsigned int hello ( void )
{
return(test(0x75,0x14));
}
9.3.0 vs 9.2.1 is not going to be much different, I can specifically get a 9.2.1 if you want to see it but you can just do this yourself.
arm-none-eabi-gcc --version
arm-none-eabi-gcc (GCC) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-O0
arm-none-eabi-gcc -O0 so.c -c -mthumb -mcpu=cortex-m4 -o so.o
Disassembly of section .text:
00000000 <test>:
0: b480 push {r7}
2: b087 sub sp, #28
4: af00 add r7, sp, #0
6: 6078 str r0, [r7, #4]
8: 6039 str r1, [r7, #0]
a: 2300 movs r3, #0
c: 613b str r3, [r7, #16]
e: e020 b.n 52 <test+0x52>
10: 687b ldr r3, [r7, #4]
12: b2db uxtb r3, r3
14: 60fb str r3, [r7, #12]
16: 683b ldr r3, [r7, #0]
18: b2db uxtb r3, r3
1a: 60bb str r3, [r7, #8]
1c: 68fa ldr r2, [r7, #12]
1e: 68bb ldr r3, [r7, #8]
20: 405a eors r2, r3
22: 693b ldr r3, [r7, #16]
24: 2b00 cmp r3, #0
26: da01 bge.n 2c <test+0x2c>
28: f203 13ff addw r3, r3, #511 ; 0x1ff
2c: 125b asrs r3, r3, #9
2e: 4619 mov r1, r3
30: 687b ldr r3, [r7, #4]
32: 40cb lsrs r3, r1
34: 405a eors r2, r3
36: 693b ldr r3, [r7, #16]
38: 2b00 cmp r3, #0
3a: da01 bge.n 40 <test+0x40>
3c: f203 33ff addw r3, r3, #1023 ; 0x3ff
40: 129b asrs r3, r3, #10
42: 4619 mov r1, r3
44: 683b ldr r3, [r7, #0]
46: 40cb lsrs r3, r1
48: 4053 eors r3, r2
4a: 617b str r3, [r7, #20]
4c: 693b ldr r3, [r7, #16]
4e: 3301 adds r3, #1
50: 613b str r3, [r7, #16]
52: 693b ldr r3, [r7, #16]
54: f5b3 5f80 cmp.w r3, #4096 ; 0x1000
58: dbda blt.n 10 <test+0x10>
5a: 697b ldr r3, [r7, #20]
5c: 4618 mov r0, r3
5e: 371c adds r7, #28
60: 46bd mov sp, r7
62: bc80 pop {r7}
64: 4770 bx lr
00000066 <hello>:
66: b580 push {r7, lr}
68: af00 add r7, sp, #0
6a: 2114 movs r1, #20
6c: 2075 movs r0, #117 ; 0x75
6e: f7ff fffe bl 0 <test>
72: 4603 mov r3, r0
74: 4618 mov r0, r3
76: bd80 pop {r7, pc}
-O1
arm-none-eabi-gcc -O1 so.c -c -mthumb -mcpu=cortex-m4 -o so.o
arm-none-eabi-objdump -D so.o
so.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <test>:
0: f44f 5380 mov.w r3, #4096 ; 0x1000
4: 3b01 subs r3, #1
6: d1fd bne.n 4 <test+0x4>
8: 08ca lsrs r2, r1, #3
a: ea82 12d0 eor.w r2, r2, r0, lsr #7
e: ea80 0301 eor.w r3, r0, r1
12: b2db uxtb r3, r3
14: ea82 0003 eor.w r0, r2, r3
18: 4770 bx lr
0000001a <hello>:
1a: b508 push {r3, lr}
1c: 2114 movs r1, #20
1e: 2075 movs r0, #117 ; 0x75
20: f7ff fffe bl 0 <test>
24: bd08 pop {r3, pc}
-O2
Disassembly of section .text:
00000000 <test>:
0: ea80 0301 eor.w r3, r0, r1
4: 08ca lsrs r2, r1, #3
6: ea82 10d0 eor.w r0, r2, r0, lsr #7
a: b2db uxtb r3, r3
c: 4058 eors r0, r3
e: 4770 bx lr
00000010 <hello>:
10: 2063 movs r0, #99 ; 0x63
12: 4770 bx lr
-O3
00000000 <test>:
0: ea80 0301 eor.w r3, r0, r1
4: 08ca lsrs r2, r1, #3
6: ea82 10d0 eor.w r0, r2, r0, lsr #7
a: b2db uxtb r3, r3
c: 4058 eors r0, r3
e: 4770 bx lr
00000010 <hello>:
10: 2063 movs r0, #99 ; 0x63
12: 4770 bx lr
-Os
00000000 <test>:
0: 08cb lsrs r3, r1, #3
2: ea83 13d0 eor.w r3, r3, r0, lsr #7
6: 4048 eors r0, r1
8: b2c0 uxtb r0, r0
a: 4058 eors r0, r3
c: 4770 bx lr
0000000e <hello>:
e: 2114 movs r1, #20
10: 2075 movs r0, #117 ; 0x75
12: f7ff bffe b.w 0 <test>
If all of these execute in the same amount of time then clearly yes you either have a build issue or there is an issue with your test. If as you claim -O1 and -O2 and -O3 etc all produce the same output, then you are not actually using those optimization levels.
There is no reason to assume that -Os produces a smaller binary than -O2 or -O3. Its just you are hinting at that desire. You can create exceptions.
Also no reason to assume that compiled for size will execute faster nor will -O3, etc. Especially on a platform like this (well all modern platforms) where some percentage of the performance is not related directly to the number or sequence of instructions, but the whole system.
You are on an stm32, cortex-m4 so you have the st flash cache thing which you cant turn off, now that is going to help all of the tests, but also going to hide things as well. You have a clock init and then later a flash setup, wonder what is going on there if you are upping your clock then you have to slow the flash down first not after or you can crash. For a test like this there generally is no reason to bump the clocks, you want to measure in timer clock cycles ideally system (as in cpu) clock cycles and then do things like mess with the flash wait states, at the slower clock speeds (some parts full range but) you can go with minimal flash wait states and then simply bump up the wait states for different tests without upping the clock to see how the flash affects it unfortunately this is an stm32. To get around that you can run the test in sram.
Depending on the compile time options for the core, some cores have different fetch features and other features and you may have some core features you can mess with a simple alignment change of a tight loop like this can have a dramatic affect, same machine code starting at a different address, how it lines up in the fetch lines and cache lines can affect the benchmark results.
Note you can get the same results using the systick timer you need for the debugger timer. Can wrap the gathering of time in the code under test (not in the function but when you lift the assembly language to make the code under test you can then add the time gathering just before and after, not incurring function call overhead which itself can vary from test to test.
If you are seeing the same machine code come out of the compiler for different settings then you are not actually building with those settings, not actually re-building the application, or some other form of user error (building here and using a binary from there). As a result the same binary will give the same time plus or minus a clock ideally in a situation like this. But that depends also on how you are running or re-running the test. Do you want to see the cache effects or not, prime the cache then run the test, etc.
If when you start to see different machine code or if you really are seeing different machine code but getting the same time then the error is in the measurement of time which is an often overlooked issue with benchmarking. your method seems fine so long as you are really seeing that timer, and have done tests to see that the timer is counting and in the direction you expect it to go. If this is a number of instructions counter not execution of time, then you can still test it to see it is doing what you think. I have no use for those debug tools so don't dabble in them nor know them as intimately as I know other things about these systems.
Being an m4 there might be other features you can turn on/off to see performance differences based on code generated, branch prediction, caching, mmu-like-thing, etc.
It may be the order of the flags you are using (the reason for each of those flags being a first question) relative to the -O3, some may be negating other optimization features.
Curious to know what the real goal is here. Understand that benchmarks are nonsense, because they are so easy to manipulate, the same high level code is not expected to produce the same results in the same target with the same or different tools for various reasons. dumb down the command line and try clang/llvm vs gnu or try gcc 4.x.x, 5.x.x, etc on up. after 4.x.x the output started to get bloated, the compiler wasn't doing as good of a job, for something like this though they should be pretty close but at the same time one fewer or more instruction, a simple alignment difference could make two tests vastly different in terms of results.
Then when you put back clock setups that changes how things work, so you may be able to let's say use no wait states (flash probably runs at have the CPU rate, so there is a wait built in) up to 25mhz as an example then add a wait state up to say 50 and so on. Varies by design some newer parts the flash can run much faster than older parts, but at 25mhz vs 8, the same number of clocks is an overall smaller number of time, wall clock time. At the boundary you can arguably not bump the wait states if you create/modify the clock init code and get that performance boost, but just over that boundary you get a performance hit with the flash wait state increase. So there is a performance balance there.
Summary
If the same code is coming out of the compiler then it is your command line, you can easily simplify the command line to see that the tools will produce different code. If your comparison is wrong and the code is different then the problem is how you are timing the code which is often where benchmarks go wrong, that and other factors not related to the compiler command line. Benchmarks are in general nonsense because they can be manipulated to show different results (even without changing the high level source code of the test).
Try simplifying the command line, examine each option you have there and justify why it is there for your specific application. Validate the timer the best you can or instruction counter whichever this is (and understand that counting instructions executed is not directly related to performance, you can have 100x instructions that execute faster than some other solution).
No reason to expect -Os to produce smaller code, one would hope but there are exceptions. Likewise -Os might execute faster than -O2 or -O3, no reason to expect the larger number optimization level to produce "faster" code.
You are compiling the code with -O0 flag.
It is clearly seen here:
https://godbolt.org/z/qZPYqJ
So the compiler is always right. No missed optimisations found.
Well the real answer is not really easy, but before disasembling something one should know what optimization actually is and how compilers achieves their goals.
Considering gcc there is barely no differences between Os and 03 since they turn on almost the same internal flags except loop unroling for Os.
In addition with nowaday cpu having everything in cache is anyway faster.
Comparing two Thumb-2 micros from two different manufacturers. One's a Cortex M3, one's an A5. Are they guaranteed to compile a particular piece of code to the same codesize?
so here goes
fun.c
unsigned int fun ( unsigned int x )
{
return(x);
}
addimm.c
extern unsigned int fun ( unsigned int );
unsigned int addimm ( unsigned int x )
{
return(fun(x)+0x123);
}
for demonstration purposes building for bare metal, not really a functional program but it compiles clean and demonstrates what I intend to demonstrate.
arm instructions
arm-none-eabi-gcc -Wall -O2 -nostdlib -nostartfiles -ffreestanding -mcpu=cortex-a5 -march=armv7-a -c addimm.c -o addimma.o
disassembly of the object, not linked
00000000 <addimm>:
0: e92d4008 push {r3, lr}
4: ebfffffe bl 0 <fun>
8: e2800e12 add r0, r0, #288 ; 0x120
c: e2800003 add r0, r0, #3
10: e8bd8008 pop {r3, pc}
thumb generic (armv4 or v5 whatever the default was for this compiler build)
arm-none-eabi-gcc -Wall -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -c addimm.c -o addimmt.o
00000000 <addimm>:
0: b508 push {r3, lr}
2: f7ff fffe bl 0 <fun>
6: 3024 adds r0, #36 ; 0x24
8: 30ff adds r0, #255 ; 0xff
a: bc08 pop {r3}
c: bc02 pop {r1}
e: 4708 bx r1
cortex-a5 specific
arm-none-eabi-gcc -Wall -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-a5 -march=armv7-a -c addimm.c -o addimma5.o
00000000 <addimm>:
0: b508 push {r3, lr}
2: f7ff fffe bl 0 <fun>
6: f200 1023 addw r0, r0, #291 ; 0x123
a: bd08 pop {r3, pc}
cortex-a5 is armv7-a which supports thumb-2 as far as the add immediate itself goes and related to binary size there is no optimization here, 32 bits for thumb and 32 bits for thumb2. But this is but one example there perhaps will be times that thumb2 produces smaller binaries than thumb.
cortex-m3
arm-none-eabi-gcc -Wall -O2 -nostdlib -nostartfiles -ffreestanding -mthumb -mcpu=cortex-m3 -march=armv7-m -c addimm.c -o addimmm3.o
00000000 <addimm>:
0: b508 push {r3, lr}
2: f7ff fffe bl 0 <fun>
6: f200 1023 addw r0, r0, #291 ; 0x123
a: bd08 pop {r3, pc}
produced the same result as cortex-a5. for this simple example the machine code for this object is the same, same size, when built for cortex-a5 and cortex-m3
Now if I add a bootstrap, a main, and call this function and fill in the function it calls to create a complete, linked, program
00000000 <_start>:
0: f000 f802 bl 8 <notmain>
4: e7fe b.n 4 <_start+0x4>
...
00000008 <notmain>:
8: 2005 movs r0, #5
a: f000 b801 b.w 10 <addimm>
e: bf00 nop
00000010 <addimm>:
10: b508 push {r3, lr}
12: f000 f803 bl 1c <fun>
16: f200 1023 addw r0, r0, #291 ; 0x123
1a: bd08 pop {r3, pc}
0000001c <fun>:
1c: 4770 bx lr
1e: 46c0 nop ; (mov r8, r8)
We get a result. The addimm function itself did not change in size. with a cortex-a5 you have to have some arm code that then switches to thumb, and likely when linking with libraries, etc you may get a mixture of arm and thumb, so
00000000 <_start>:
0: eb000000 bl 8 <notmain>
4: eafffffe b 4 <_start+0x4>
00000008 <notmain>:
8: e92d4008 push {r3, lr}
c: e3a00005 mov r0, #5
10: fa000001 blx 1c <addimm>
14: e8bd4008 pop {r3, lr}
18: e12fff1e bx lr
0000001c <addimm>:
1c: b508 push {r3, lr}
1e: f000 e804 blx 28 <fun>
22: f200 1023 addw r0, r0, #291 ; 0x123
26: bd08 pop {r3, pc}
00000028 <fun>:
28: e12fff1e bx lr
overall larger binary, the addimm part itself did not change in size though.
as far as linking changing the size of the object, look at this example
bootstrap.s
.thumb
.thumb_func
.globl _start
_start:
bl notmain
hang: b hang
.thumb_func
.globl dummy
dummy:
bx lr
.code 32
.globl bounce
bounce:
bx lr
hello.c
void dummy ( void );
void bounce ( void );
void notmain ( void )
{
dummy();
bounce();
}
looking at an arm build of notmain by itself, the object:
00000000 <notmain>:
0: e92d4800 push {fp, lr}
4: e28db004 add fp, sp, #4
8: ebfffffe bl 0 <dummy>
c: ebfffffe bl 0 <bounce>
10: e24bd004 sub sp, fp, #4
14: e8bd4800 pop {fp, lr}
18: e12fff1e bx lr
depending on what is calling it and what it calls, the linker may have to add more code to deal with items that are defined outside the object, from global variables to external functions
00008000 <_start>:
8000: f000 f818 bl 8034 <__notmain_from_thumb>
00008004 <hang>:
8004: e7fe b.n 8004 <hang>
00008006 <dummy>:
8006: 4770 bx lr
00008008 <bounce>:
8008: e12fff1e bx lr
0000800c <notmain>:
800c: e92d4800 push {fp, lr}
8010: e28db004 add fp, sp, #4
8014: eb000003 bl 8028 <__dummy_from_arm>
8018: ebfffffa bl 8008 <bounce>
801c: e24bd004 sub sp, fp, #4
8020: e8bd4800 pop {fp, lr}
8024: e12fff1e bx lr
00008028 <__dummy_from_arm>:
8028: e59fc000 ldr ip, [pc] ; 8030 <__dummy_from_arm+0x8>
802c: e12fff1c bx ip
8030: 00008007 andeq r8, r0, r7
00008034 <__notmain_from_thumb>:
8034: 4778 bx pc
8036: 46c0 nop ; (mov r8, r8)
8038: eafffff3 b 800c <notmain>
803c: 00000000 andeq r0, r0, r0
dummy_from_arm and notmain_from_thumb were both added, an increase in the size of the binary. each object did not change in size but the overall binary did. bounce() was an arm to arm function, no patching, dummy() arm to thumb and notmain() thumb to main.
so you might have a cortex-m3 object, and a cortex-a5 object that as far as the code in that object goes they are both identical. But dopending on what you link them with, which eventually something is dfferent between a cortex-m3 system and a cortex-a5 system, you may see more or less code added by the linker to account for the system differences, libraries, operating system specific, etc even so much as where in the binary you put the object, if it has to have a further reach than it can with a single instruction, then the linker will add even more code.
This is all gcc specific stuff, each toolchain is going to deal with each of these problems in its own way. It is the nature of the beast when you use an object and linker model, a very good model but the compiler, assembler, and linker have to work together to make sure that global resources can be properly accessed when linked. has nothing to do with ARM, this problem exists with many/most processor architectures and the toolchains deal with those problems per toolchain, per version, per target architecture. When I said change the size of the object what I really meant was the linker may add more code to the final binary in order to deal with that object and how it interacts with others.