ARM Deliberately Bloating Compiled Code?

While working on the issue in "Fastest Cortex M0+ Thumb 32x32=64 multiplication function?", I wrote the following C function to see how it would compile:
uint64_t lmul(uint32_t a, uint32_t b) {
    uint32_t hia = a >> 16,
             hib = b >> 16,
             loa = (uint32_t)(uint16_t)a,
             lob = (uint32_t)(uint16_t)b,
             low = loa * lob,
             mid1 = hia * lob,
             mid2 = loa * hib,
             mid = mid1 + mid2,
             high = hia * hib;
    if (mid < mid1)
        high += 0x10000;
    return ((uint64_t)high << 32) + ((uint64_t)mid << 16) + low;
}
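As a sanity check (not part of the target build), a quick host-side harness like this, assuming a hosted libc, can compare the function above against the compiler's native 64-bit multiply:
#include <stdint.h>
#include <stdio.h>

uint64_t lmul(uint32_t a, uint32_t b);  /* defined above */

int main(void) {
    uint32_t v[] = {0, 1, 0xFFFF, 0x10000, 0x12345678, 0xDEADBEEF, 0xFFFFFFFF};
    size_t n = sizeof v / sizeof v[0];
    for (size_t i = 0; i < n; i++)
        for (size_t j = 0; j < n; j++) {
            uint64_t want = (uint64_t)v[i] * v[j];  /* native multiply */
            if (lmul(v[i], v[j]) != want)
                printf("mismatch at %08x * %08x\n", (unsigned)v[i], (unsigned)v[j]);
        }
    return 0;
}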
After compiling it with ARM GCC 4.7.3 through CodeWarrior (the toolchain that came with the Freescale dev board I'm using) with size optimization, it turned into this:
00000eac <lmul>:
eac: b570 push {r4, r5, r6, lr}
eae: 0c06 lsrs r6, r0, #16
eb0: b280 uxth r0, r0
eb2: 0c0a lsrs r2, r1, #16
eb4: 1c04 adds r4, r0, #0
eb6: b289 uxth r1, r1
eb8: 434c muls r4, r1
eba: 4350 muls r0, r2
ebc: 4371 muls r1, r6
ebe: 1843 adds r3, r0, r1
ec0: 4356 muls r6, r2
ec2: 428b cmp r3, r1
ec4: d202 bcs.n ecc <lmul+0x20>
ec6: 2580 movs r5, #128 ; 0x80
ec8: 026a lsls r2, r5, #9
eca: 18b6 adds r6, r6, r2
ecc: 0c19 lsrs r1, r3, #16
ece: 0418 lsls r0, r3, #16
ed0: 1c22 adds r2, r4, #0
ed2: 2300 movs r3, #0
ed4: 1c04 adds r4, r0, #0
ed6: 1c0d adds r5, r1, #0
ed8: 18a4 adds r4, r4, r2
eda: 415d adcs r5, r3
edc: 1c31 adds r1, r6, #0
ede: 1c18 adds r0, r3, #0
ee0: 1c22 adds r2, r4, #0
ee2: 1c2b adds r3, r5, #0
ee4: 1812 adds r2, r2, r0
ee6: 414b adcs r3, r1
ee8: 1c10 adds r0, r2, #0
eea: 1c19 adds r1, r3, #0
eec: bd70 pop {r4, r5, r6, pc}
I cannot fathom what the compiler is doing in the last 40% of the function. It's like it's playing musical registers for no other purpose than to increase the size of the function. Is this something ARM is known to do, or is there some strange purpose to this that I lack the ARM assembly expertise to comprehend?
If I didn't make any mistakes in substitution, the last half of the function could be represented by:
ecc: 0c19 lsrs r1, r3, #16
ece: 0418 lsls r0, r3, #16
ed2: 2300 movs r3, #0
ed8: 18a4 adds r0, r0, r4
eda: 415d adcs r1, r3
ee6: 414b adds r1, r1, r6
eec: bd70 pop {r4, r5, r6, pc}

I haven't used the CodeWarrior toolchain, but I decided to try this with uVision using the ARMCC compiler v5.03.0.76. Optimizing for space is the default option (-Ospace) and the generated code was still pretty ugly... not too different from yours. When I compiled with -O2 it looked more like what you would expect:
0x0000008A B570 PUSH {r4-r6,lr}
0x0000008C 0C02 LSRS r2,r0,#16
0x0000008E 0C0C LSRS r4,r1,#16
0x00000090 B280 UXTH r0,r0
0x00000092 B289 UXTH r1,r1
0x00000094 4606 MOV r6,r0
0x00000096 4615 MOV r5,r2
0x00000098 434D MULS r5,r1,r5
0x0000009A 4360 MULS r0,r4,r0
0x0000009C 434E MULS r6,r1,r6
0x0000009E 182B ADDS r3,r5,r0
0x000000A0 4362 MULS r2,r4,r2
0x000000A2 42AB CMP r3,r5
0x000000A4 D202 BCS 0x000000AC
0x000000A6 2001 MOVS r0,#0x01
0x000000A8 0400 LSLS r0,r0,#16
0x000000AA 1812 ADDS r2,r2,r0
0x000000AC 2400 MOVS r4,#0x00
0x000000AE 0C19 LSRS r1,r3,#16
0x000000B0 0418 LSLS r0,r3,#16
0x000000B2 1900 ADDS r0,r0,r4
0x000000B4 4151 ADCS r1,r1,r2
0x000000B6 1980 ADDS r0,r0,r6
0x000000B8 4161 ADCS r1,r1,r4
0x000000BA BD70 POP {r4-r6,pc}
You can try compiling with different optimization options, but I would suggest you go with a newer compiler, as Marc Glisse states in his comment.

Related

ARM64 Backtrace from link register

I am currently trying to get a backtrace based on the stack pointer and link register on an ARM64 device, using a C program.
Below is an example of the objdump output.
bar() calls foo() with 240444: ebfffd68 bl 23f9ec <foo##Base>
I can get the link register (lr), and from that get 23f9ec, which I save to the backtrace list as the last routine.
My question: given the assembly below and a current lr of 0023f9ec <foo##Base>, how do I calculate the previous routine's lr, 0023fe14 <bar##Base>, in C?
Here is my implementation, but it computes the wrong previous lr:
int bt(void** backtrace, int max_size) {
    unsigned long* sp = __get_SP();
    unsigned long* ra = __get_LR();
    int* funcbase = (int*)&bt;
    int spofft = (short)(*funcbase);
    sp = (unsigned long*)((char*)sp - spofft);
    unsigned long* wra = (unsigned long*)ra;
    int depth = 0;
    while (ra) {
        wra = ra;
        while ((*wra >> 16) != 0xe92d) {
            wra--;
        }
        if (wra == 0)
            return 0;
        spofft = (short)(*wra & 0xffff);
        if (depth < max_size)
            backtrace[depth] = ra;
        else
            break;
        ra = (unsigned long*)((unsigned long)ra + spofft);
        sp = (unsigned long*)((unsigned long)sp + spofft);
        depth++;
    }
    return 1;
}
0023f9ec <foo##Base>:
23f9ec: e92d42f3 push {r0, r1, r4, r5, r6, r7, r9, lr}
23f9f0: e1a09001 mov r9, r1
23f9f4: e1a07000 mov r7, r0
23f9f8: ebfffff9 bl 23f9e4 <__get_SP##Base>
23f9fc: e59f4060 ldr r4, [pc, #96] ; 23fa64 <foo##Base+0x78>
23fa00: e08f4004 add r4, pc, r4
23fa04: e1a05000 mov r5, r0
23fa08: ebfffff3 bl 23f9dc <__get_LR##Base>
23fa0c: e59f3054 ldr r3, [pc, #84] ; 23fa68 <foo##Base+0x7c>
23fa10: e3002256 movw r2, #598 ; 0x256
23fa14: e59f1050 ldr r1, [pc, #80] ; 23fa6c <foo##Base+0x80>
23fa18: e7943003 ldr r3, [r4, r3]
23fa1c: e08f1001 add r1, pc, r1
23fa20: e5934000 ldr r4, [r3]
23fa24: e1a03005 mov r3, r5
23fa28: e6bf4074 sxth r4, r4
23fa2c: e58d4004 str r4, [sp, #4]
23fa30: e1a06000 mov r6, r0
23fa34: e58d0000 str r0, [sp]
23fa38: e59f0030 ldr r0, [pc, #48] ; 23fa70 <foo##Base+0x84>
23fa3c: e08f0000 add r0, pc, r0
23fa40: ebfd456d bl 190ffc <printf#plt>
23fa44: e1a03009 mov r3, r9
23fa48: e1a02007 mov r2, r7
23fa4c: e1a01006 mov r1, r6
23fa50: e0640005 rsb r0, r4, r5
23fa54: ebffff70 bl 23f81c <get_prev_sp_ra2##Base>
23fa58: e3a00000 mov r0, #0
23fa5c: e28dd008 add sp, sp, #8
23fa60: e8bd82f0 pop {r4, r5, r6, r7, r9, pc}
23fa64: 003d5be0 eorseq r5, sp, r0, ror #23
23fa68: 000026c8 andeq r2, r0, r8, asr #13
23fa6c: 002b7ba6 eoreq r7, fp, r6, lsr #23
23fa70: 002b73e5 eoreq r7, fp, r5, ror #7
0023fe14 <bar##Base>:
23fe14: e92d4ef0 push {r4, r5, r6, r7, r9, sl, fp, lr}
23fe18: e24dde16 sub sp, sp, #352 ; 0x160
23fe1c: e59f76a8 ldr r7, [pc, #1704] ; 2404cc <bar##Base+0x6b8>
23fe20: e1a04000 mov r4, r0
23fe24: e59f66a4 ldr r6, [pc, #1700] ; 2404d0 <bar##Base+0x6bc>
23fe28: e1a03000 mov r3, r0
23fe2c: e59f26a0 ldr r2, [pc, #1696] ; 2404d4 <bar##Base+0x6c0>
23fe30: e08f7007 add r7, pc, r7
23fe34: e08f6006 add r6, pc, r6
23fe38: e3a00000 mov r0, #0
23fe3c: e08f2002 add r2, pc, r2
23fe40: e1a05001 mov r5, r1
23fe44: e3a01003 mov r1, #3
23fe48: e59f9688 ldr r9, [pc, #1672] ; 2404d8 <bar##Base+0x6c4>
.....................................................................
24043c: e3a0100f mov r1, #15
240440: e1a0000a mov r0, sl
240444: ebfffd68 bl 23f9ec <foo##Base>
240448: e59f2108 ldr r2, [pc, #264] ; 240558 <bar##Base+0x744>
24044c: e3a01003 mov r1, #3
240450: e08f2002 add r2, pc, r2
240454: e1a05000 mov r5, r0
240458: e1a03000 mov r3, r0
24045c: e3a00000 mov r0, #0
I don't think there's an easy way to do this.
Normally the register ABI of any operating system contains a "frame pointer" register. For example, on Apple's armv7 ABI, this is r7:
0x10006fc0 b0b5 push {r4, r5, r7, lr}
0x10006fc2 02af add r7, sp, 8
0x10006fc4 0448 ldr r0, [0x10006fd8]
0x10006fc6 d0e90c45 ldrd r4, r5, [r0, 0x30]
0x10006fca 0020 movs r0, 0
0x10006fcc fff7a6ff bl 0x10006f1c
0x10006fd0 0019 adds r0, r0, r4
0x10006fd2 6941 adcs r1, r5
0x10006fd4 b0bd pop {r4, r5, r7, pc}
If you dereference r7 there, you get to a pair of pointers, the second of which is lr, and the first of which is the r7 of the calling function, allowing you to repeat this process until you reach the bottom of the stack.
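A minimal sketch of that walk, assuming such an ABI (r7 as on Apple's armv7) and a hypothetical get_fp() helper that reads the frame pointer register:
#include <stdint.h>

static inline uintptr_t get_fp(void) {
    uintptr_t fp;
    __asm__ volatile ("mov %0, r7" : "=r" (fp)); /* r7 on Apple's armv7 ABI */
    return fp;
}

int fp_backtrace(void **out, int max_depth) {
    uintptr_t fp = get_fp();
    int depth = 0;
    while (fp != 0 && depth < max_depth) {
        uintptr_t *frame = (uintptr_t *)fp;
        out[depth++] = (void *)frame[1];  /* second slot: saved lr */
        if (frame[0] <= fp)               /* first slot: caller's r7; must grow */
            break;
        fp = frame[0];
    }
    return depth;
}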
Judging by the assembly you posted, the codebase you're looking at doesn't have that. This means that the only way to obtain the return address is the same way that the code itself does: step forward through each instruction and parse/interpret them until you reach something that loads into pc. This is of course imperfect, since there may be functions in your call stack that do not ever return, but there's not much you can do about that.
It may be tempting to search backwards instead, and while you can do a heuristic approach and probably reach quite reasonable results with it, that is even less reliable than searching forward, since you have absolutely no way of telling whether you arrived at address X by stepping forward from the previous instruction or by explicitly jumping there from somewhere else.
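For a taste of what the forward scan involves, here is a crude sketch that assumes ARM (not Thumb) code and looks only for the pop {..., pc} / ldm sp!, {..., pc} encoding (0xe8bd with bit 15 set); real code would have to interpret every instruction it passes rather than pattern-match one:
#include <stdint.h>

/* Scan forward from 'insn' for an ARM LDMIA sp!, {..., pc}. */
static const uint32_t *find_return(const uint32_t *insn) {
    while ((*insn & 0xffff8000u) != 0xe8bd8000u)
        insn++;   /* will also happily match literal-pool data */
    return insn;
}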

C - occasional CPU stall during memcmp on Cortex-R5

I'm running some tests on a Cortex-R5 (UltraScale MPSoC). The test basically generates two 32-byte random values with a hardware module and compares them at the end to ensure they are neither zero nor equal to each other.
uint32_t status;
const uint8_t zeros[32] = {0};
uint8_t bytes1[32] = {0};
uint8_t bytes2[32] = {0};
// (generate random numbers and put them in bytes1)
// (generate random numbers and put them in bytes2)
printf("memcmp 0\n");
status = !memcmp(bytes1, bytes2, 32);
printf("memcmp 1\n");
status |= !memcmp(bytes1, zeros, 32);
printf("memcmp 2\n");
status |= !memcmp(bytes2, zeros, 32);
Some tests run fine. Some executions stall after printing "memcmp 0" (when it freezes, it's always at the first memcmp)...
I have tried several things:
When I print the values in bytes1 and bytes2, they are indeed random numbers, not equal to 0 and not equal to each other.
Moving the memcmp calls to different places, or swapping them: it's always the first one that freezes.
Replacing memcmp with a custom comparison function (a sketch of what I mean follows this list) => it never freezes.
The memcmp function is used in other places in the code and freezes nowhere else. Perhaps the difference is that the random check is the only place where memcmp expects different values (everywhere else it checks that a function produced the expected output).
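The custom function is, in shape, a plain byte-wise loop like the sketch below (not the exact code); the relevant property is that it only ever issues byte loads, never the word-sized LDR accesses that the library version makes:
#include <stddef.h>
#include <stdint.h>

int my_memcmp(const void *a, const void *b, size_t n) {
    const uint8_t *pa = a, *pb = b;
    for (size_t i = 0; i < n; i++) {
        if (pa[i] != pb[i])
            return (int)pa[i] - (int)pb[i];
    }
    return 0;
}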
I couldn't find the definition of memcmp, and I don't know where to look. The only thing I could find is the assembly code, but it would be difficult to attach a debugger to see exactly which instruction fails to complete.
000064d0 <memcmp>:
64d0: 2a03 cmp r2, #3
64d2: b470 push {r4, r5, r6}
64d4: d912 bls.n 64fc <memcmp+0x2c>
64d6: ea40 0501 orr.w r5, r0, r1
64da: 4604 mov r4, r0
64dc: 07ad lsls r5, r5, #30
64de: 460b mov r3, r1
64e0: d120 bne.n 6524 <memcmp+0x54>
64e2: 681d ldr r5, [r3, #0]
64e4: 4619 mov r1, r3
64e6: 6826 ldr r6, [r4, #0]
64e8: 4620 mov r0, r4
64ea: 3304 adds r3, #4
64ec: 3404 adds r4, #4
64ee: 42ae cmp r6, r5
64f0: d118 bne.n 6524 <memcmp+0x54>
64f2: 3a04 subs r2, #4
64f4: 4620 mov r0, r4
64f6: 2a03 cmp r2, #3
64f8: 4619 mov r1, r3
64fa: d8f2 bhi.n 64e2 <memcmp+0x12>
64fc: 1e54 subs r4, r2, #1
64fe: b172 cbz r2, 651e <memcmp+0x4e>
6500: 7802 ldrb r2, [r0, #0]
6502: 780b ldrb r3, [r1, #0]
6504: 429a cmp r2, r3
6506: bf08 it eq
6508: 1864 addeq r4, r4, r1
650a: d006 beq.n 651a <memcmp+0x4a>
650c: e00c b.n 6528 <memcmp+0x58>
650e: f810 2f01 ldrb.w r2, [r0, #1]!
6512: f811 3f01 ldrb.w r3, [r1, #1]!
6516: 429a cmp r2, r3
6518: d106 bne.n 6528 <memcmp+0x58>
651a: 42a1 cmp r1, r4
651c: d1f7 bne.n 650e <memcmp+0x3e>
651e: 2000 movs r0, #0
6520: bc70 pop {r4, r5, r6}
6522: 4770 bx lr
6524: 1e54 subs r4, r2, #1
6526: e7eb b.n 6500 <memcmp+0x30>
6528: 1ad0 subs r0, r2, r3
652a: bc70 pop {r4, r5, r6}
652c: 4770 bx lr
652e: bf00 nop
Where can I see the source code of memcmp for the Cortex-R5? FYI, the compiler used is armr5-none-eabi-gcc.
Any idea what could cause a CPU stall with this function?
Thank you

Why does the interrupt service routine PUSH {r3,r4,r5,lr} but POP {r0,r4,r5,lr}, which leads to an error?

I am using IAR to compile my routines, but they run incorrectly on an ARM A7; I found the issue below when I opened the .lst file generated by IAR.
It is an ISR. It first pushes {r3, r4, r5, lr}, but pops {r0, r4, r5, lr} on return, so R0 ends up holding the value R3 had before the push. R0 is therefore wrong when irqHandler returns, which leads to errors in the routines that follow.
Why?
void irqHandler(void)
{
878: e92d4038 push {r3, r4, r5, lr}
volatile u32 *pt = (u32 *)AM_INTC_BASE;
87c: e3a044b0 mov r4, #176, 8 ; 0xb0000000
u32 id_spin;
id_spin = *(pt+0x200c/4) & 0x3ff;
880: e302000c movw r0, #8204 ; 0x200c
884: e7900004 ldr r0, [r0, r4]
888: e1b00b00 lsls r0, r0, #22
88c: e1b00b20 lsrs r0, r0, #22
890: e1b05000 movs r5, r0
if(id_spin<32)
894: e3550020 cmp r5, #32
898: 2a000000 bcs 8a0 <irqHandler+0x28>
{
#ifdef WHOLECHIPSIM
print("id_spid<32 error...\r\n",0);
#endif
while(1);
89c: eafffffe b 89c <irqHandler+0x24>
}
else
{
(pFuncIrq[id_spin-32])();
8a0: e59f0010 ldr r0, [pc, #16] ; 8b8 <.text_8>
8a4: e1b01105 lsls r1, r5, #2
8a8: e0910000 adds r0, r1, r0
8ac: e5100080 ldr r0, [r0, #-128] ; 0x80
8b0: e12fff30 blx r0
}
}
8b4: e8bd8031 pop {r0, r4, r5, pc}
The ABI requires a 64-bit aligned stack, so the push of r3 simply facilitates that; the compiler could have chosen any register not already specified. Likewise, on the pop, they need to clean up the stack. The function is prototyped as void, so the return value (r0) is a don't-care, and r0-r3 are not expected to be preserved, so there is no reason to match the r3 on each end, nor to match an r0 on each end.
Had they chosen a register numbered above r3 (r6, for example) on the push, then that would have needed to be matched on the pop. Otherwise the pop would have to use one of r0-r3 so as not to trash a non-volatile register (you couldn't push r3 then pop r6; that would trash r6).
It does not matter, as R0-R3, R12, LR, PC, and xPSR are saved on the stack automatically when the hardware invokes the interrupt vector routine. When bx, ldm, pop, or ldr with PC is executed, the hardware performs the interrupt-routine exit, popping those registers.
Do not blame your compiler; it knows what it does. Check your own logic instead - especially printing strings in the interrupt handler.
The assembly generated with the keywords __irq __arm is below:
__irq __arm void irqHandler(void)
{
878: e24ee004 sub lr, lr, #4
87c: e92d503f push {r0, r1, r2, r3, r4, r5, ip, lr}
volatile u32 *pt = (u32 *)AM_INTC_BASE;
880: e3a044b0 mov r4, #176, 8 ; 0xb0000000
u32 id_spin;
id_spin = *(pt+0x200c/4) & 0x3ff;
884: e302000c movw r0, #8204 ; 0x200c
888: e7900004 ldr r0, [r0, r4]
88c: e1b00b00 lsls r0, r0, #22
890: e1b00b20 lsrs r0, r0, #22
894: e1b05000 movs r5, r0
if(id_spin<32)
898: e3550020 cmp r5, #32
89c: 2a000000 bcs 8a4 <irqHandler+0x2c>
{
#ifdef WHOLECHIPSIM
print("id_spid<32 error...\r\n",0);
#endif
while(1);
8a0: eafffffe b 8a0 <irqHandler+0x28>
}
else
{
(pFuncIrq[id_spin-32])();
8a4: e59f0010 ldr r0, [pc, #16] ; 8bc <.text_8>
8a8: e1b01105 lsls r1, r5, #2
8ac: e0910000 adds r0, r1, r0
8b0: e5100080 ldr r0, [r0, #-128] ; 0x80
8b4: e12fff30 blx r0
}
}
8b8: e8fd903f ldm sp!, {r0, r1, r2, r3, r4, r5, ip, pc}^
Cortex-A7 PUSH log: it just pushes the registers shown, so the 32-bit alignment is OK.
The following link has the log info:
http://img.blog.csdn.net/20170819120758443?watermark/2/text/aHR0cDovL2Jsb2cuY3Nkbi5uZXQvcmFpbmJvd2JpcmRzX2Flcw==/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70/gravity/Center
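For reference, GCC's rough equivalent of IAR's __irq __arm keywords is a function attribute; a sketch in GCC syntax (assuming a GCC toolchain rather than IAR):
/* The attribute makes GCC save every register the handler clobbers
 * and return with SUBS pc, lr, #4 instead of a plain BX lr. */
void __attribute__((interrupt("IRQ"))) irqHandler(void)
{
    /* ... same body as above ... */
}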

Replacing memcpy and .divsi3_skip_div0_test with smaller code on ARM MCU

My entry for https://hackaday.com/2016/11/21/step-up-to-the-1-kb-challenge/ includes a couple of huge functions which are not generated from any C code which I have written.
000004e4 <.divsi3_skip_div0_test>:
4e4: b410 push {r4}
4e6: 1c04 adds r4, r0, #0
4e8: 404c eors r4, r1
4ea: 46a4 mov ip, r4
4ec: 2301 movs r3, #1
4ee: 2200 movs r2, #0
4f0: 2900 cmp r1, #0
4f2: d500 bpl.n 4f6 <.divsi3_skip_div0_test+0x12>
4f4: 4249 negs r1, r1
4f6: 2800 cmp r0, #0
4f8: d500 bpl.n 4fc <.divsi3_skip_div0_test+0x18>
4fa: 4240 negs r0, r0
4fc: 4288 cmp r0, r1
4fe: d32c bcc.n 55a <.divsi3_skip_div0_test+0x76>
500: 2401 movs r4, #1
502: 0724 lsls r4, r4, #28
504: 42a1 cmp r1, r4
506: d204 bcs.n 512 <.divsi3_skip_div0_test+0x2e>
508: 4281 cmp r1, r0
50a: d202 bcs.n 512 <.divsi3_skip_div0_test+0x2e>
50c: 0109 lsls r1, r1, #4
50e: 011b lsls r3, r3, #4
510: e7f8 b.n 504 <.divsi3_skip_div0_test+0x20>
512: 00e4 lsls r4, r4, #3
514: 42a1 cmp r1, r4
516: d204 bcs.n 522 <.divsi3_skip_div0_test+0x3e>
518: 4281 cmp r1, r0
51a: d202 bcs.n 522 <.divsi3_skip_div0_test+0x3e>
51c: 0049 lsls r1, r1, #1
51e: 005b lsls r3, r3, #1
520: e7f8 b.n 514 <.divsi3_skip_div0_test+0x30>
522: 4288 cmp r0, r1
524: d301 bcc.n 52a <.divsi3_skip_div0_test+0x46>
526: 1a40 subs r0, r0, r1
528: 431a orrs r2, r3
52a: 084c lsrs r4, r1, #1
52c: 42a0 cmp r0, r4
52e: d302 bcc.n 536 <.divsi3_skip_div0_test+0x52>
530: 1b00 subs r0, r0, r4
532: 085c lsrs r4, r3, #1
534: 4322 orrs r2, r4
536: 088c lsrs r4, r1, #2
538: 42a0 cmp r0, r4
53a: d302 bcc.n 542 <.divsi3_skip_div0_test+0x5e>
53c: 1b00 subs r0, r0, r4
53e: 089c lsrs r4, r3, #2
540: 4322 orrs r2, r4
542: 08cc lsrs r4, r1, #3
544: 42a0 cmp r0, r4
546: d302 bcc.n 54e <.divsi3_skip_div0_test+0x6a>
548: 1b00 subs r0, r0, r4
54a: 08dc lsrs r4, r3, #3
54c: 4322 orrs r2, r4
54e: 2800 cmp r0, #0
550: d003 beq.n 55a <.divsi3_skip_div0_test+0x76>
552: 091b lsrs r3, r3, #4
554: d001 beq.n 55a <.divsi3_skip_div0_test+0x76>
556: 0909 lsrs r1, r1, #4
558: e7e3 b.n 522 <.divsi3_skip_div0_test+0x3e>
55a: 1c10 adds r0, r2, #0
55c: 4664 mov r4, ip
55e: 2c00 cmp r4, #0
560: d500 bpl.n 564 <.divsi3_skip_div0_test+0x80>
562: 4240 negs r0, r0
564: bc10 pop {r4}
566: 4770 bx lr
568: 2800 cmp r0, #0
56a: d006 beq.n 57a <.divsi3_skip_div0_test+0x96>
56c: db03 blt.n 576 <.divsi3_skip_div0_test+0x92>
56e: 2000 movs r0, #0
570: 43c0 mvns r0, r0
572: 0840 lsrs r0, r0, #1
574: e001 b.n 57a <.divsi3_skip_div0_test+0x96>
576: 2080 movs r0, #128 ; 0x80
578: 0600 lsls r0, r0, #24
57a: b407 push {r0, r1, r2}
57c: 4802 ldr r0, [pc, #8] ; (588 <.divsi3_skip_div0_test+0xa4>)
57e: a102 add r1, pc, #8 ; (adr r1, 588 <.divsi3_skip_div0_test+0xa4>)
580: 1840 adds r0, r0, r1
582: 9002 str r0, [sp, #8]
584: bd03 pop {r0, r1, pc}
586: 46c0 nop ; (mov r8, r8)
588: 00000019 .word 0x00000019
and:
000005a4 <memcpy>:
5a4: b5f0 push {r4, r5, r6, r7, lr}
5a6: 2a0f cmp r2, #15
5a8: d935 bls.n 616 <memcpy+0x72>
5aa: 1c03 adds r3, r0, #0
5ac: 430b orrs r3, r1
5ae: 079c lsls r4, r3, #30
5b0: d135 bne.n 61e <memcpy+0x7a>
5b2: 1c16 adds r6, r2, #0
5b4: 3e10 subs r6, #16
5b6: 0936 lsrs r6, r6, #4
5b8: 0135 lsls r5, r6, #4
5ba: 1945 adds r5, r0, r5
5bc: 3510 adds r5, #16
5be: 1c0c adds r4, r1, #0
5c0: 1c03 adds r3, r0, #0
5c2: 6827 ldr r7, [r4, #0]
5c4: 601f str r7, [r3, #0]
5c6: 6867 ldr r7, [r4, #4]
5c8: 605f str r7, [r3, #4]
5ca: 68a7 ldr r7, [r4, #8]
5cc: 609f str r7, [r3, #8]
5ce: 68e7 ldr r7, [r4, #12]
5d0: 3410 adds r4, #16
5d2: 60df str r7, [r3, #12]
5d4: 3310 adds r3, #16
5d6: 42ab cmp r3, r5
5d8: d1f3 bne.n 5c2 <memcpy+0x1e>
5da: 1c73 adds r3, r6, #1
5dc: 011b lsls r3, r3, #4
5de: 18c5 adds r5, r0, r3
5e0: 18c9 adds r1, r1, r3
5e2: 230f movs r3, #15
5e4: 4013 ands r3, r2
5e6: 2b03 cmp r3, #3
5e8: d91b bls.n 622 <memcpy+0x7e>
5ea: 1f1c subs r4, r3, #4
5ec: 08a4 lsrs r4, r4, #2
5ee: 3401 adds r4, #1
5f0: 00a4 lsls r4, r4, #2
5f2: 2300 movs r3, #0
5f4: 58ce ldr r6, [r1, r3]
5f6: 50ee str r6, [r5, r3]
5f8: 3304 adds r3, #4
5fa: 42a3 cmp r3, r4
5fc: d1fa bne.n 5f4 <memcpy+0x50>
5fe: 18ed adds r5, r5, r3
600: 18c9 adds r1, r1, r3
602: 2303 movs r3, #3
604: 401a ands r2, r3
606: d005 beq.n 614 <memcpy+0x70>
608: 2300 movs r3, #0
60a: 5ccc ldrb r4, [r1, r3]
60c: 54ec strb r4, [r5, r3]
60e: 3301 adds r3, #1
610: 4293 cmp r3, r2
612: d1fa bne.n 60a <memcpy+0x66>
614: bdf0 pop {r4, r5, r6, r7, pc}
616: 1c05 adds r5, r0, #0
618: 2a00 cmp r2, #0
61a: d1f5 bne.n 608 <memcpy+0x64>
61c: e7fa b.n 614 <memcpy+0x70>
61e: 1c05 adds r5, r0, #0
620: e7f2 b.n 608 <memcpy+0x64>
622: 1c1a adds r2, r3, #0
624: e7f8 b.n 618 <memcpy+0x74>
626: 46c0 nop ; (mov r8, r8)
I am guessing that I could write far smaller, but less time-efficient, replacements myself.
Is this likely?
Where will I find the source which I need to edit? I'm guessing that I should look for the source of a libc under gcc-arm-none-eabi/lib/gcc/arm-none-eabi/4.8.3/. I think that I've found the compiled symbols, but I can't find the source.
~/gcc-arm-none-eabi$ grep -R divsi3_skip_div0_test *
Binary file lib/gcc/arm-none-eabi/4.8.3/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/thumb/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv6-m/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/fpu/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv7-ar/thumb/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv7-ar/thumb/softfp/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv7-ar/thumb/fpu/libgcc.a matches
Alternatively, is there a way to tell gcc not to use memcpy when copying structures? (They are 10 bytes, so three Thumb instructions should do the job.) I've tried adding -mno-memcpy and -Wa,mno-memcpy but neither is recognised.
Update:
I've solved the memcpy part of this question - providing a partial, but sufficient, memcpy function of my own stops the library version from being pulled in.
size_t memcpy(uint8_t *restrict dst, uint8_t *restrict const src, size_t size) {
    int i;
    for (i = 0; i < size; i++) {
        dst[i] = src[i];
    }
    return i;
}
It's much smaller, but less efficient, and won't handle dst < src + size overlap (a memmove-style sketch follows the listing).
000003ec <memcpy>:
3ec: b510 push {r4, lr}
3ee: 2300 movs r3, #0
3f0: 4293 cmp r3, r2
3f2: d003 beq.n 3fc <memcpy+0x10>
3f4: 5ccc ldrb r4, [r1, r3]
3f6: 54c4 strb r4, [r0, r3]
3f8: 3301 adds r3, #1
3fa: e7f9 b.n 3f0 <memcpy+0x4>
3fc: 1c18 adds r0, r3, #0
3fe: bd10 pop {r4, pc}
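If overlap ever matters, a memmove-style variant only costs a second, backwards loop; a sketch:
#include <stddef.h>
#include <stdint.h>

void *my_memmove(uint8_t *dst, const uint8_t *src, size_t n) {
    if (dst < src)
        for (size_t i = 0; i < n; i++)  /* copy forwards */
            dst[i] = src[i];
    else
        while (n--)                     /* copy backwards */
            dst[n] = src[n];
    return dst;
}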
To clarify, I'm now only asking what I might do to replace the .divsi3_skip_div0_test code with less efficient, but smaller, code.
It is not clear to me where the source of this code is, or how to edit it. It looks to be more complicated to replace than memcpy: it does not look like a C function, as its name begins with a dot.
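If it helps: .divsi3_skip_div0_test is part of libgcc (built from the hand-written assembly in gcc's config/arm/lib1funcs.S, not from a libc), and GCC reaches it through the EABI entry point __aeabi_idiv. Assuming that is the symbol your code pulls in, defining your own __aeabi_idiv should keep the libgcc member out of the link, the same way the memcpy replacement did. A small, slow shift-and-subtract sketch (untested on this toolchain):
#include <stdint.h>

/* Restoring division; division by zero is left undefined,
 * just as the _skip_div0_test entry assumes. */
int __aeabi_idiv(int num, int den) {
    uint32_t n = num < 0 ? 0u - (uint32_t)num : (uint32_t)num;
    uint32_t d = den < 0 ? 0u - (uint32_t)den : (uint32_t)den;
    uint32_t q = 0;
    for (int s = 31; s >= 0; s--) {
        if ((n >> s) >= d) {   /* d << s still fits under n */
            n -= d << s;
            q |= 1u << s;
        }
    }
    return ((num < 0) != (den < 0)) ? -(int)q : (int)q;
}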

simple ADD/ADC ARM assembly fails

I have the following C and ASM versions of the (supposedly) same code. What it does is load two 128-bit ints, each represented by two 64-bit ints, into registers (first the four lower 32-bit words, then the four higher ones) and ADD/ADC them. It is simple enough code, and the ARM/ST manuals actually give the same example for 96 bits (3 ADD/ADCs).
For simple calls both versions work (repeatedly adding (1 << x++) or 1..x). But for the longer test suite the ARM assembly fails (the board hangs). At the moment I have no way to trap/debug it and cannot use printf() or the like to find the failing test, which is irrelevant anyway, because there must be some basic fault in the ASM version, as the C version works as expected.
I don't get it; it's simple enough and very close to the C assembly output (sans branching). I tried the "memory" constraint (shouldn't be needed), saving the carry between the lower and upper 64 bits in a register and adding it later, ADD(C).W, alignment, two LDR/STR instead of LDRD/STRD, etc. I assume the board faults because some addition goes wrong and results in a divide by 0 or something like that.
The GCC ASM is below and uses a similar basic technique, so I don't see the problem.
I'm really just looking for the fastest way to do the add, not to fix that code specifically. It's a shame you have to hard-code register names, because there is no constraint for specifying rX and rX+1. Also, it's impossible to use as many registers as GCC does, as the compiler runs out of them during compilation.
typedef struct I128 {
    int64_t high;
    uint64_t low;
} I128;

I128 I128add(I128 a, const I128 b) {
#if defined(USEASM) && defined(ARMx)
    __asm(
        "LDRD %%r2, %%r3, %[alo]\n"
        "LDRD %%r4, %%r5, %[blo]\n"
        "ADDS %%r2, %%r2, %%r4\n"
        "ADCS %%r3, %%r3, %%r5\n"
        "STRD %%r2, %%r3, %[alo]\n"
        "LDRD %%r2, %%r3, %[ahi]\n"
        "LDRD %%r4, %%r5, %[bhi]\n"
        "ADCS %%r2, %%r2, %%r4\n"
        "ADC %%r3, %%r3, %%r5\n"
        "STRD %%r2, %%r3, %[ahi]\n"
        : [alo] "+m" (a.low), [ahi] "+m" (a.high)
        : [blo] "m" (b.low), [bhi] "m" (b.high)
        : "r2", "r3", "r4", "r5", "cc"
    );
    return a;
#else
    // faster to use temp than saving low and adding to a directly
    I128 r = {a.high + b.high, a.low + b.low};
    // check for overflow of low 64 bits, add carry to high
    // avoid conditionals:
    //r.high += r.low < a.low || r.low < b.low;
    // actually gcc produces faster code with conditionals
    if (r.low < a.low || r.low < b.low) ++r.high;
    return r;
#endif
}
GCC C version using "armv7m-none-eabi-gcc-4.7.2 -O3 -ggdb -fomit-frame-pointer -falign-functions=16 -std=gnu99 -march=armv7e-m":
b082 sub sp, #8
e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
a908 add r1, sp, #32
e881 000c stmia.w r1, {r2, r3}
e9dd 890e ldrd r8, r9, [sp, #56] ; 0x38
e9dd 670a ldrd r6, r7, [sp, #40] ; 0x28
e9dd 2308 ldrd r2, r3, [sp, #32]
e9dd 450c ldrd r4, r5, [sp, #48] ; 0x30
eb16 0a08 adds.w sl, r6, r8
eb47 0b09 adc.w fp, r7, r9
1912 adds r2, r2, r4
eb43 0305 adc.w r3, r3, r5
45bb cmp fp, r7
bf08 it eq
45b2 cmpeq sl, r6
d303 bcc.n 8012c9a <I128add+0x3a>
45cb cmp fp, r9
bf08 it eq
45c2 cmpeq sl, r8
d204 bcs.n 8012ca4 <I128add+0x44>
2401 movs r4, #1
2500 movs r5, #0
1912 adds r2, r2, r4
eb43 0305 adc.w r3, r3, r5
e9c0 2300 strd r2, r3, [r0]
e9c0 ab02 strd sl, fp, [r0, #8]
e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp}
b002 add sp, #8
4770 bx lr
My ASM version that fails:
b082 sub sp, #8
b430 push {r4, r5}
a902 add r1, sp, #8
e881 000c stmia.w r1, {r2, r3}
e9dd 2304 ldrd r2, r3, [sp, #16]
e9dd 4508 ldrd r4, r5, [sp, #32]
1912 adds r2, r2, r4
416b adcs r3, r5
e9cd 2304 strd r2, r3, [sp, #16]
e9dd 2302 ldrd r2, r3, [sp, #8]
e9dd 4506 ldrd r4, r5, [sp, #24]
4162 adcs r2, r4
eb43 0305 adc.w r3, r3, r5
e9cd 2302 strd r2, r3, [sp, #8]
4604 mov r4, r0
c90f ldmia r1, {r0, r1, r2, r3}
e884 000f stmia.w r4, {r0, r1, r2, r3}
4620 mov r0, r4
bc30 pop {r4, r5}
b002 add sp, #8
4770 bx lr
I am not getting a hang from your code, but it isn't working either, not sure why. But it was very easy to patch the compiler-generated code to handle the carry:
I128 I128add(I128 a, const I128 b) {
    I128 r = {a.high + b.high, a.low + b.low};
    return r;
}
becomes
000001e4 <I128add>:
1e4: b082 sub sp, #8
1e6: b4f0 push {r4, r5, r6, r7}
1e8: e9dd 4506 ldrd r4, r5, [sp, #24]
1ec: a904 add r1, sp, #16
1ee: e881 000c stmia.w r1, {r2, r3}
1f2: e9dd 230a ldrd r2, r3, [sp, #40] ; 0x28
1f6: 1912 adds r2, r2, r4
1f8: eb43 0305 adc.w r3, r3, r5
1fc: e9dd 6704 ldrd r6, r7, [sp, #16]
200: e9dd 4508 ldrd r4, r5, [sp, #32]
204: 1936 adds r6, r6, r4
206: eb47 0705 adc.w r7, r7, r5
20a: e9c0 6700 strd r6, r7, [r0]
20e: e9c0 2302 strd r2, r3, [r0, #8]
212: bcf0 pop {r4, r5, r6, r7}
214: b002 add sp, #8
216: 4770 bx lr
fixed up the adds
.thumb_func
.globl test2
test2:
sub sp, #8
push {r4, r5, r6, r7}
ldrd r4, r5, [sp, #24]
add r1, sp, #16
stmia r1, {r2, r3}
ldrd r2, r3, [sp, #40]
add r2, r4
adc r3, r5
ldrd r6, r7, [sp, #16]
ldrd r4, r5, [sp, #32]
adc r6, r4
adc r7, r5
strd r6, r7, [r0]
strd r2, r3, [r0, #8]
pop {r4, r5, r6, r7}
add sp, #8
bx lr
final result
00000024 <test2>:
24: b082 sub sp, #8
26: b4f0 push {r4, r5, r6, r7}
28: e9dd 4506 ldrd r4, r5, [sp, #24]
2c: a904 add r1, sp, #16
2e: c10c stmia r1!, {r2, r3}
30: e9dd 230a ldrd r2, r3, [sp, #40] ; 0x28
34: 1912 adds r2, r2, r4
36: 416b adcs r3, r5
38: e9dd 6704 ldrd r6, r7, [sp, #16]
3c: e9dd 4508 ldrd r4, r5, [sp, #32]
40: 4166 adcs r6, r4
42: 416f adcs r7, r5
44: e9c0 6700 strd r6, r7, [r0]
48: e9c0 2302 strd r2, r3, [r0, #8]
4c: bcf0 pop {r4, r5, r6, r7}
4e: b002 add sp, #8
50: 4770 bx lr
Notice the smaller number of thumb2 instructions; unless you are on a Cortex-A that has full thumb2 support, those fetches from flash (Cortex-M) are (or can be) slow. I see you are trying to save the push and pop of two more registers, but you cost yourself more fetches. You could take the above, still rearrange the loads and stores, and save those two registers.
Minimal testing so far; printfs show the upper words adding, where I didn't see that with your code. I am still trying to unwind the calling convention (please document your code more for us); it looks like r0 is prepped by the caller for the result, and the rest is on the stack. I am using a Stellaris LaunchPad (Cortex-M4).
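As an aside, on the question's complaint that there is no constraint for an rX/rX+1 pair: if you hand GCC 64-bit operands, it allocates the register pairs itself, and the long-standing (though undocumented) %Q and %R template modifiers name the low and high words. A sketch, untested on your board:
#include <stdint.h>

typedef struct I128 { int64_t high; uint64_t low; } I128;

I128 I128add_pairs(I128 a, const I128 b) {
    __asm(
        "ADDS %Q0, %Q0, %Q2\n\t"   /* low 64:  lo words          */
        "ADCS %R0, %R0, %R2\n\t"   /* low 64:  hi words + carry  */
        "ADCS %Q1, %Q1, %Q3\n\t"   /* high 64: lo words + carry  */
        "ADC  %R1, %R1, %R3"       /* high 64: hi words + carry  */
        : "+r" (a.low), "+r" (a.high)
        : "r" (b.low), "r" (b.high)
        : "cc");
    return a;
}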
