I have some trouble with some inline assembly code. I know what should be done but I miss the "how" !
I have this checksum function that is "almost" working :
static unsigned long cksum_unroll( unsigned short **w, int *mlen)
{
int len;
unsigned short *w0;
unsigned long sum=0;
len = *mlen;
w0 = *w;
while( len >= 8) {
asm volatile (
"ldmia %[w0]!, {v1, v2}\n\t"
"adds %[sum], %[sum], v1\n\t"
"adcs %[sum], %[sum], v2\n\t"
"adcs %[sum], %[sum], #0"
: [sum] "+r" (sum) : [w0] "r" (w0)
);
len -= 8;
}
*mlen = len;
*w = w0;
return (sum);
}
My problem, I believe, is on the line ": [sum] "+r" (sum) : [w0] "r" (w0)"
On the first assembly line, w0 is processed correctly by ldmia (when the line is executed, data are in r4,r5 and w0 is incremented). But the incremented value of w0 is not saved somewhere and when the code loop, the original value of w0 is loaded again (see assembly code below).
My guess is that I should store the value of w0 on the line ": [sum] "+r" (sum) : [w0] "r" (w0)" but I don't know how...
Here's the disassembly code of the inline assembly part of the function :
Note that :
len is stored at r11, #-16
w0 is stored at r11, #-20
sum is stored at r11, #-24
Compiled, the following code :
asm volatile (
"ldmia %[w0]!, {v1, v2}\n\t"
"adds %[sum], %[sum], v1\n\t"
"adcs %[sum], %[sum], v2\n\t"
"adcs %[sum], %[sum], #0"
: [sum] "+r" (sum) : [w0] "r" (w0)
);
len -= 8;
Generate :
00031910: ldr r3, [r11, #-20]
00031914: ldr r2, [r11, #-24]
00031918: mov r4, r2
0003191c: ldm r3!, {r4, r5}
00031920: adds r4, r4, r4
00031924: adcs r4, r4, r5
00031928: adcs r4, r4, #0
0003192c: str r4, [r11, #-24]
00031930: ldr r3, [r11, #-16]
00031934: sub r3, r3, #8
00031938: str r3, [r11, #-16]
As you can see, I would like to add something like "str r3, [r11, #-20]" between lines 31928 and 3192c because when the program loop to the line 31910, r3 is loaded with the initial value of r3...
I think this is an easy one for the inline assembly expert of the stack overflow community!
By the way, I'm working on a ARM7TDMI processor (but this may not be relevant for this question...)
Thanks in advance!
EDIT:
To verify my idea, I tested the following :
asm volatile (
"ldmia %[w0]!, {v1, v2}\n\t"
"adds %[sum], %[sum], v1\n\t"
"adcs %[sum], %[sum], v2\n\t"
"adcs %[sum], %[sum], #0\n\t"
"str %[w0], [r11, #-20]"
: [sum] "+r" (sum) : [w0] "r" (w0)
);
And this works. Maybe this is the solution, but what do I use to replace "r11, #20" that will probably change if I modify the function?
The problem would seem to be that you specify w0 as an INPUT operand, when it actually should be a read-write OUTPUT operand, like sum. In addition, you need to specify that it clobbers v1 and v2 as you use those registers (otherwise, gcc might put some other var into these regs and expect them to be preserved.)
So you should have:
asm volatile (
"ldmia %[w0]!, {v1, v2}\n\t"
"adds %[sum], %[sum], v1\n\t"
"adcs %[sum], %[sum], v2\n\t"
"adcs %[sum], %[sum], #0"
: [sum] "+r" (sum) , [w0] "+r" (w0) : : "v1", "v2"
);
that is, two read-write input/output operands, no exclusively input operands, and two register clobbers
Related
I'm trying to make a Quick Sort function using ARM assembly (Raspberry pi),
but it shows me segmentation error.
I think recursion process makes that error, while storing or loading with stacks.
Can you tell me how can I fix it?
I used ARM assembly code in https://en.wikibooks.org/wiki/Algorithm_Implementation/Sorting/Quicksort#ARM_Assembly
here,
I just typed it same. Just changing registers like 'r3'->'r2', 'r2'->'r1', 'r1'->'r0' ...
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE 32
int main()
{
int arr[SIZE];
int max, min;
int i;
for (i = 0; i < SIZE; i++) {
arr[i] = rand() % 100;
}
asm(
"mov r0, #0\n\t"
"mov r1, #128\n\t"
"Loop3:\n\t"
"stmfd sp!, {r3, r5, lr}\n\t"
"mov r5, r1\n\t"
"Loop4:\n\t"
"sub r6, r5, r0\n\t"
"cmp r6, #4\n\t"
"ldmlefd sp!, {r3, r5, pc}\n\t"
"ldr r6, [%[arr],r0]\n\t"
"add r1, r0, #4\n\t"
"mov r3, r5\n\t"
"Loop5:\n\t"
"ldr r2, [%[arr],r1]\n\t"
"cmp r2, r6\n\t"
"addle r1, r1, #4\n\t"
"ble Loop6\n\t"
"sub r3, r3, #4\n\t"
"ldr r4, [%[arr],r3]\n\t"
"str r4, [%[arr],r1]\n\t"
"str r2, [%[arr],r3]\n\t"
"Loop6:\n\t"
"cmp r1, r3\n\t"
"blt Loop5\n\t"
"Loop7:\n\t"
"sub r1, r1, #4\n\t"
"ldr r2, [%[arr],r1]\n\t"
"str r2, [%[arr],r0]\n\t"
"str r6, [%[arr],r1]\n\t"
"bl Loop3\n\t"
"mov r0, r3\n\t"
"b Loop4\n\t"
:
:
[arr] "r"(arr)
:
"r0", "r1", "r2", "r3", "r4", "r5", "r6"
);
return 0;
}
You inline asm can never reach the end of the asm template. Presumably you're trying to return out of the C function, not just the internal recursive calls. That's obviously unsafe because there's zero guarantee about stack layout or the contents of LR, and that will change with/without optimization.
Don't write a whole recursive in the middle of a C function.
Use a debugger to single-step the resulting program and see where your code breaks the compiler-generated asm that surrounds it.
Also your inline asm is broken: you dereference arr without specifying it as a memory read/write input or a "memory" clobber. A pointer input does not imply that the pointed-to memory is also an operand.
I am trying to implement a function which multiplies 32-bit operand with 256-bit operand in ARM assembly on ARM Cortex-a8. The problem is I am running out of registers and I have no idea how I can reduce the number of used registers here. Here is my function:
typedef struct UN_256fe{
uint32_t uint32[8];
}UN_256fe;
typedef struct UN_288bite{
uint32_t uint32[9];
}UN_288bite;
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res){
asm (
"umull r3, r4, %9, %10;\n\t"
"mov %0, r3; \n\t"/*res->uint32[0] = r3*/
"umull r3, r5, %9, %11;\n\t"
"adds r6, r3, r4; \n\t"/*res->uint32[1] = r3 + r4*/
"mov %1, r6; \n\t"
"umull r3, r4, %9, %12;\n\t"
"adcs r6, r5, r3; \n\t"
"mov %2, r6; \n\t"/*res->uint32[2] = r6*/
"umull r3, r5, %9, %13;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %3, r6; \n\t"/*res->uint32[3] = r6*/
"umull r3, r4, %9, %14;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %4, r6; \n\t"/*res->uint32[4] = r6*/
"umull r3, r5, %9, %15;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %5, r6; \n\t"/*res->uint32[5] = r6*/
"umull r3, r4, %9, %16;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %6, r6; \n\t"/*res->uint32[6] = r6*/
"umull r3, r5, %9, %17;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %7, r6; \n\t"/*res->uint32[7] = r6*/
"adc r6, r5, #0 ; \n\t"
"mov %8, r6; \n\t"/*res->uint32[8] = r6*/
: "=r"(res->uint32[8]), "=r"(res->uint32[7]), "=r"(res->uint32[6]), "=r"(res->uint32[5]), "=r"(res->uint32[4]),
"=r"(res->uint32[3]), "=r"(res->uint32[2]), "=r"(res->uint32[1]), "=r"(res->uint32[0])
: "r"(A), "r"(B->uint32[7]), "r"(B->uint32[6]), "r"(B->uint32[5]),
"r"(B->uint32[4]), "r"(B->uint32[3]), "r"(B->uint32[2]), "r"(B->uint32[1]), "r"(B->uint32[0]), "r"(temp)
: "r3", "r4", "r5", "r6", "cc", "memory");
}
EDIT-1: I updated my clobber list based on the first comment, but I still get the same error
A simple solution is to break this up and don't use 'clobber'. Declare the variables as 'tmp1', etc. Try not to use any mov statements; let the compiler do this if it has to. The compiler will use an algorithm to figure out the best 'flow' of information. If you use 'clobber', it can not reuse registers. They way it is now, you make it load all the memory first before the assembler executes. This is bad as you want memory/CPU ALU to pipeline.
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res)
{
uint32_t mulhi1, mullo1;
uint32_t mulhi2, mullo2;
uint32_t tmp;
asm("umull %0, %1, %2, %3;\n\t"
: "=r" (mullo1), "=r" (mulhi1)
: "r"(A), "r"(B->uint32[7])
);
res->uint32[8] = mullo1; /* was 'mov %0, r3; */
volatile asm("umull %0, %1, %3, %4;\n\t"
"adds %2, %5, %6; \n\t"/*res->uint32[1] = r3 + r4*/
: "=r" (mullo2), "=r" (mulhi2), "=r" (tmp)
: "r"(A), "r"(B->uint32[6]), "r" (mullo1), "r"(mulhi1)
: "cc"
);
res->uint32[7] = tmp; /* was 'mov %1, r6; */
/* ... etc */
}
The whole purpose of the 'gcc inline assembler' is not to code assembler directly in a 'C' file. It is to use the register allocation logic of the compiler AND do something that can not be easily done in 'C'. The use of carry logic in your case.
By not making it one huge 'asm' clause, the compiler can schedule the loads from memory as it needs new registers. It will also pipeline your 'UMULL' ALU activity with the load/store unit.
You should only use clobber if an instruction implicitly clobbers a specific register. You may also use something like,
register int *p1 asm ("r0");
and use that as an output. However, I don't know of any ARM instructions like this besides those that might alter the stack and your code doesn't use these and the carry of course.
GCC knows that memory changes if it is listed as an input/output, so you don't need a memory clobber. In fact it is detrimental as the memory clobber is a compiler memory barrier and this will cause memory to be written when the compiler might be able to schedule that for latter.
The moral is use gcc inline assembler to work with the compiler. If you code in assembler and you have huge routines, the register use can become complex and confusing. Typical assembler coders will keep only one thing in a register per routine, but that is not always the best use of registers. The compiler will shuffle the data around in a fairly smart way that is difficult to beat (and not very satisfying to hand code IMO) when the code size gets larger.
You might want to look at the GMP library which has lots of ways to efficiently tackle some of the same issues it looks like your code has.
I am trying to implement a function which multiplies 32-bit operand with 256-bit operand in ARM assembly on ARM Cortex-a8. The problem is I am running out of registers and I have no idea how I can reduce the number of used registers here. Here is my function:
typedef struct UN_256fe{
uint32_t uint32[8];
}UN_256fe;
typedef struct UN_288bite{
uint32_t uint32[9];
}UN_288bite;
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res){
asm (
"umull r3, r4, %9, %10;\n\t"
"mov %0, r3; \n\t"/*res->uint32[0] = r3*/
"umull r3, r5, %9, %11;\n\t"
"adds r6, r3, r4; \n\t"/*res->uint32[1] = r3 + r4*/
"mov %1, r6; \n\t"
"umull r3, r4, %9, %12;\n\t"
"adcs r6, r5, r3; \n\t"
"mov %2, r6; \n\t"/*res->uint32[2] = r6*/
"umull r3, r5, %9, %13;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %3, r6; \n\t"/*res->uint32[3] = r6*/
"umull r3, r4, %9, %14;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %4, r6; \n\t"/*res->uint32[4] = r6*/
"umull r3, r5, %9, %15;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %5, r6; \n\t"/*res->uint32[5] = r6*/
"umull r3, r4, %9, %16;\n\t"
"adcs r6, r3, r5; \n\t"
"mov %6, r6; \n\t"/*res->uint32[6] = r6*/
"umull r3, r5, %9, %17;\n\t"
"adcs r6, r3, r4; \n\t"
"mov %7, r6; \n\t"/*res->uint32[7] = r6*/
"adc r6, r5, #0 ; \n\t"
"mov %8, r6; \n\t"/*res->uint32[8] = r6*/
: "=r"(res->uint32[8]), "=r"(res->uint32[7]), "=r"(res->uint32[6]), "=r"(res->uint32[5]), "=r"(res->uint32[4]),
"=r"(res->uint32[3]), "=r"(res->uint32[2]), "=r"(res->uint32[1]), "=r"(res->uint32[0])
: "r"(A), "r"(B->uint32[7]), "r"(B->uint32[6]), "r"(B->uint32[5]),
"r"(B->uint32[4]), "r"(B->uint32[3]), "r"(B->uint32[2]), "r"(B->uint32[1]), "r"(B->uint32[0]), "r"(temp)
: "r3", "r4", "r5", "r6", "cc", "memory");
}
EDIT-1: I updated my clobber list based on the first comment, but I still get the same error
A simple solution is to break this up and don't use 'clobber'. Declare the variables as 'tmp1', etc. Try not to use any mov statements; let the compiler do this if it has to. The compiler will use an algorithm to figure out the best 'flow' of information. If you use 'clobber', it can not reuse registers. They way it is now, you make it load all the memory first before the assembler executes. This is bad as you want memory/CPU ALU to pipeline.
void multiply32x256(uint32_t A, UN_256fe* B, UN_288bite* res)
{
uint32_t mulhi1, mullo1;
uint32_t mulhi2, mullo2;
uint32_t tmp;
asm("umull %0, %1, %2, %3;\n\t"
: "=r" (mullo1), "=r" (mulhi1)
: "r"(A), "r"(B->uint32[7])
);
res->uint32[8] = mullo1; /* was 'mov %0, r3; */
volatile asm("umull %0, %1, %3, %4;\n\t"
"adds %2, %5, %6; \n\t"/*res->uint32[1] = r3 + r4*/
: "=r" (mullo2), "=r" (mulhi2), "=r" (tmp)
: "r"(A), "r"(B->uint32[6]), "r" (mullo1), "r"(mulhi1)
: "cc"
);
res->uint32[7] = tmp; /* was 'mov %1, r6; */
/* ... etc */
}
The whole purpose of the 'gcc inline assembler' is not to code assembler directly in a 'C' file. It is to use the register allocation logic of the compiler AND do something that can not be easily done in 'C'. The use of carry logic in your case.
By not making it one huge 'asm' clause, the compiler can schedule the loads from memory as it needs new registers. It will also pipeline your 'UMULL' ALU activity with the load/store unit.
You should only use clobber if an instruction implicitly clobbers a specific register. You may also use something like,
register int *p1 asm ("r0");
and use that as an output. However, I don't know of any ARM instructions like this besides those that might alter the stack and your code doesn't use these and the carry of course.
GCC knows that memory changes if it is listed as an input/output, so you don't need a memory clobber. In fact it is detrimental as the memory clobber is a compiler memory barrier and this will cause memory to be written when the compiler might be able to schedule that for latter.
The moral is use gcc inline assembler to work with the compiler. If you code in assembler and you have huge routines, the register use can become complex and confusing. Typical assembler coders will keep only one thing in a register per routine, but that is not always the best use of registers. The compiler will shuffle the data around in a fairly smart way that is difficult to beat (and not very satisfying to hand code IMO) when the code size gets larger.
You might want to look at the GMP library which has lots of ways to efficiently tackle some of the same issues it looks like your code has.
I've implemented a simple delay loop macro in a C program for the Cortex-M4:
#define DELAY_CYCLES (F_CPU / 3000000) //F_CPU is 72000000
#define delayUS(n) __asm__ volatile( \
"1: subs %0, #1 \n" \
"bne 1b \n" \
: /* no outputs */ \
: "r" (n * DELAY_CYCLES) /* input */ \
: "0" /* clobbers */ \
)
This delays for n microseconds (assuming interrupts are disabled). Mostly, it works fine. However, I've found that it doesn't work correctly in a function that uses it twice:
static void test(uint8_t num) {
digitalWrite(12, 1);
delayUS(10);
digitalWrite(13, 1);
delayUS(10);
digitalWrite(12, 0);
digitalWrite(13, 0);
}
(This was a function that actually uses num, but got stripped down to this while debugging this issue. It also gets inlined into main, hence the labels in the disassembly.)
What happens here is the second call to delayUS() never completes. Examining the generated assembly shows the problem:
528: 2701 movs r7, #1
52a: 6037 str r7, [r6, #0] ;digitalWrite(12, 1)
52c: 23f0 movs r3, #240 ;delayUS(10); 10 * DELAY_CYCLES = 240
52e: 3b01 subs r3, #1
530: d1fd bne.n 52e <main+0x4a>
532: 4c0d ldr r4, [pc, #52]
534: 6027 str r7, [r4, #0] ;digitalWrite(13, 1)
536: 3b01 subs r3, #1 ;delayUS(10), but r3 is still 0
538: d1fd bne.n 536 <main+0x52>
53a: 2300 movs r3, #0
53c: 6033 str r3, [r6, #0] ;digitalWrite(12, 0)
For some reason, gcc doesn't re-initialize r3 before using it in the second delay loop, so instead of delaying for 240 iterations (10µs), it delays for 2^32 (about 3 minutes).
With this variation, the issue disappears:
__attribute__((used)) static int dummy;
#define delayUS(n) __asm__ volatile( \
"1: subs %0, #1 \n" \
"bne 1b \n" \
: "=r" (dummy) /* no outputs */ \
: "0" (n * DELAY_CYCLES) /* input */ \
: "0" /* clobbers */ \
)
That generates more correct code:
528: 2701 movs r7, #1
52a: 23f0 movs r3, #240 ;r3 = 10 * DELAY_CYCLES
52c: 6037 str r7, [r6, #0] ;digitalWrite(12, 1)
52e: 461a mov r2, r3 ;r2 = r3
530: 3a01 subs r2, #1 ;delayUS(r2)
532: d1fd bne.n 530 <main+0x4c>
534: 4c0d ldr r4, [pc, #52]
536: 6027 str r7, [r4, #0] ;digitalWrite(13, 1)
538: 3b01 subs r3, #1 ;delayUS(r3)
53a: d1fd bne.n 538 <main+0x54>
53c: 4a0c ldr r2, [pc, #48]
53e: 6013 str r3, [r2, #0] ;digitalWrite(12, 0)
Here, it's correctly realized that the delay loop clobbers its input register, and so doesn't re-use r3 without initializing it (it uses r2 for one of the loops instead.)
So, why does gcc not recognize that the former version also clobbers its input, when it's listed in the clobber list?
The problem is that the 'clobbers' list is a list of register names, or the special strings "cc" and "memory". Since there is no register called "0", having this in the clobbers list is meaningless. Unfortuately gcc does not give you a warning about this. Instead, as the gcc docs note:
Warning: Do not modify the contents of input-only operands (except for inputs tied to outputs). The compiler assumes that on exit from the asm statement these operands contain the same values as they had before executing the statement. It is not possible to use clobbers to inform the compiler that the values in these inputs are changing. One common work-around is to tie the changing input variable to an output variable that never gets used.
This workaround is what your second example does, and is why it works. For correctness, you should probably also add "cc" to the clobbers list (as you modify the flags), and you might as well remove the "0", because it is meaningless.
Im trying to remove the stack dependency from the following code.
void myfunction(struct kprobe *p, struct pt_regs *regs)
{
register void *rregs asm("r1") = regs;
register void *rfn asm("lr") = p->ainsn.insn_fn;
__asm__ __volatile__ (
"stmdb sp!, {%[regs], r11} \n\t"
"ldmia %[regs], {r0-r12} \n\t"
"blx %[fn] \n\t"
"ldr lr, [sp], #4 \n\t" /* lr = regs */
"stmia lr, {r0-r12} \n\t"
"ldr r11, [sp], #4 \n\t"
: [regs] "=r" (rregs), [fn] "=r" (rfn)
: "" (rregs), "1" (rfn)
: "r0", "r2", "r3", "r4", "r5", "r6", "r7",
"r8", "r9", "r10", "r12", "memory", "cc"
);
}
In the above function, stmdb sp!, {%[regs], r11} pushes r1 and r11 into stack and later it retrives.
In my case, I should avoid using stack here. so I rewrote
void myfunction(struct kprobe *p, struct pt_regs *regs)
{
int r1_bk = 0, r11_bk = 0;
register void *rregs asm("r1") = regs;
register void *rfn asm("lr") = p->ainsn.insn_fn;
register void *r1b_c asm("r1") = &r1_bk;
register void *r11b_c asm("r11") = &r11_bk;
__asm__ __volatile__ (
"ldr %[r1b], r1 \n\t"
"ldr %[r11b], r11 \n\t"
"ldmia %[regs], {r0-r12} \n\t"
"blx %[fn] \n\t"
"ldr lr, %[r1b] \n\t" /* lr = regs */
"stmia lr, {r0-r12} \n\t"
"ldr r11, %[r11b] \n\t"
: [regs] "=r" (rregs), [fn] "=r" (rfn), [r1b] "=r" (r1b_c), [r11b] "=r" (r11b_c)
: "0" (rregs), "1" (rfn)
: "r0", "r2", "r3", "r4", "r5", "r6", "r7",
"r8", "r9", "r10", "r12", "memory", "cc"
);
}
When I compile, following error Im getting.
/tmp/ccJMefdC.s: Assembler messages:
/tmp/ccJMefdC.s:579: Error: internal_relocation (type: OFFSET_IMM) not fixed up
/tmp/ccJMefdC.s:580: Error: internal_relocation (type: OFFSET_IMM) not fixed up
/tmp/ccJMefdC.s:583: Error: internal_relocation (type: OFFSET_IMM) not fixed up
/tmp/ccJMefdC.s:585: Error: internal_relocation (type: OFFSET_IMM) not fixed up
I refered here internal relocation not fixed up. but it doesn't give clear idea. Please share your knowledge regarding this.
Your inline asm call clobbers almost all registers and it is explicitly told to compiler via volatile directive that it shouldn't skip or try to move the call around to optimize register usage. This means compiler while producing the equivalent instructions for myfunction needs to save registers to somewhere before emitting that inline assembly.
Let me prove it to you:
$ cat asm_vol.c
void f() {
asm volatile("" : : : "r0", "r2", "r3", "r4", "r5", "r6", "r7",
"r8", "r9", "r10", "r12", "memory", "cc");
}
$ arm-linux-gnueabihf-gcc -c -O2 asm_vol.c
$ arm-linux-gnueabihf-objdump -d asm_vol.o
asm_vol.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <f>:
0: e92d 07f0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl}
4: e8bd 07f0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl}
8: 4770 bx lr
a: bf00 nop
The reason for the error message is that ldr take register and a memory reference, you are providing the same register twice. The assembler then interprets the register name as a memory location, and therefore complains that it is not defined in the same file.
Since you have run out of registers you can only avoid stack use by using a global variable.