Is there a gcc pragma or something I can use to force gcc to generate branch-free instructions on a specific section of code?
I have a piece of code that I want gcc to compile to branch-free code using cmov instructions:
int foo(int *a, int n, int x) {
    int i = 0, j = n;
    while (i < n) {
#ifdef PREFETCH
        __builtin_prefetch(a + 16*i + 15);
#endif /* PREFETCH */
        j = (x <= a[i]) ? i : j;
        i = (x <= a[i]) ? 2*i + 1 : 2*i + 2;
    }
    return j;
}
and, indeed, it does so:
morin@soprano$ gcc -O4 -S -c test.c -o -
.file "test.c"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB0:
.cfi_startproc
testl %esi, %esi
movl %esi, %eax
jle .L2
xorl %r8d, %r8d
jmp .L3
.p2align 4,,10
.p2align 3
.L6:
movl %ecx, %r8d
.L3:
movslq %r8d, %rcx
movl (%rdi,%rcx,4), %r9d
leal (%r8,%r8), %ecx # put 2*i in ecx
leal 1(%rcx), %r10d # put 2*i+1 in r10d
addl $2, %ecx # put 2*i+2 in ecx
cmpl %edx, %r9d
cmovge %r10d, %ecx # put 2*i+1 in ecx if appropriate
cmovge %r8d, %eax # set j = i if appropriate
cmpl %esi, %ecx
jl .L6
.L2:
rep ret
.cfi_endproc
.LFE0:
.size foo, .-foo
.ident "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
.section .note.GNU-stack,"",#progbits
(Yes, I realize the loop is a branch, but I'm talking about the choice operators inside the loop.)
Unfortunately, when I enable the __builtin_prefetch call, gcc generates branchy code:
morin@soprano$ gcc -DPREFETCH -O4 -S -c test.c -o -
.file "test.c"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB0:
.cfi_startproc
testl %esi, %esi
movl %esi, %eax
jle .L7
xorl %ecx, %ecx
jmp .L5
.p2align 4,,10
.p2align 3
.L3:
movl %ecx, %eax # this is the x <= a[i] branch
leal 1(%rcx,%rcx), %ecx
cmpl %esi, %ecx
jge .L11
.L5:
movl %ecx, %r8d # this is the main branch
sall $4, %r8d # setup the prefetch
movslq %r8d, %r8 # setup the prefetch
prefetcht0 60(%rdi,%r8,4) # do the prefetch
movslq %ecx, %r8
cmpl %edx, (%rdi,%r8,4) # compare x with a[i]
jge .L3
leal 2(%rcx,%rcx), %ecx # this is the x > a[i] branch
cmpl %esi, %ecx
jl .L5
.L11:
rep ret
.L7:
.p2align 4,,5
rep ret
.cfi_endproc
.LFE0:
.size foo, .-foo
.ident "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
.section .note.GNU-stack,"",#progbits
I've tried using __attribute__((optimize("if-conversion2"))) on this function, but that has no effect.
The reason I care so much is that I have hand-edited the compiler-generated branch-free code (from the first example) to include the prefetcht0 instructions, and it runs considerably faster than either of the versions gcc produces.
If you really rely on that level of optimization, you have to write your own assembler stubs.
The reason is that even a modification elsewhere in the code can change the code the compiler emits (this is not gcc-specific). Likewise, a different version of gcc, or different options (e.g. -fomit-frame-pointer), can change the generated code dramatically.
You should really only do this if you have to. Other influences may have much more impact, such as cache configuration, memory allocation (DRAM page/bank), execution order relative to concurrently running programs, CPU affinity, and much more. Play with the compiler optimizations first; the command-line options are in the docs (you did not post the gcc version used, so I cannot be more specific).
A (serious) alternative would be to use clang/llvm. Or just help the gcc team improve their optimizers; you would not be the first. Note also that gcc has made massive improvements specifically for ARM over the last few versions.
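If you do go the hand-written route for this particular loop, the stub does not have to be a separate .s file. Below is a minimal sketch of mine (not a tested drop-in) that pins the two selects to cmov with GNU extended asm on x86-64; the operand constraints and the prefetch placement are assumptions you would want to check against your own measurements.
static inline int foo_cmov(const int *a, int n, int x)
{
    int i = 0, j = n;
    while (i < n) {
#ifdef PREFETCH
        __builtin_prefetch(a + 16*i + 15);
#endif
        int ai = a[i];
        int lo = 2*i + 1;   /* next i when x <= a[i] */
        int hi = 2*i + 2;   /* next i when x >  a[i] */
        __asm__ ("cmpl   %[ai], %[x]\n\t"
                 "cmovle %[i],  %[j]\n\t"    /* j  = i  if x <= a[i] */
                 "cmovle %[lo], %[hi]"       /* hi = lo if x <= a[i] */
                 : [j] "+r" (j), [hi] "+r" (hi)
                 : [x] "r" (x), [ai] "r" (ai), [i] "r" (i), [lo] "r" (lo)
                 : "cc");
        i = hi;
    }
    return j;
}
Because the cmov sequence is spelled out explicitly, enabling the prefetch can no longer talk the compiler back into branchy code; the cost is that gcc also cannot undo the if-conversion in cases where a branch would actually be cheaper.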
It looks like gcc has trouble generating branch-free code for variables used in the loop condition and post-condition, combined with the constraint of keeping temporary registers alive across a pseudo-function intrinsic call.
There is something suspicious: the code generated from your function changes when -funroll-all-loops and -fguess-branch-probability are used, and it produces many return instructions. It smells like a small bug in gcc, somewhere around the RTL passes or the simplification of basic blocks.
The following code compiles branch-free in both cases, which would be a good reason to submit a bug report to GCC: at -O3, GCC should generate the same (branch-free) code either way.
int foo(int *a, int n, int x) {
    int c, i = 0, j = n;
    while (i < n) {
#ifdef PREFETCH
        __builtin_prefetch(a + 16*i + 15);
#endif /* PREFETCH */
        c = (x > a[i]);
        j = c ? j : i;
        i = 2*i + 1 + c;
    }
    return j;
}
which generates this
.cfi_startproc
testl %esi, %esi
movl %esi, %eax
jle .L4
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L3:
movslq %ecx, %r8
cmpl %edx, (%rdi,%r8,4)
setl %r8b
cmovge %ecx, %eax
movzbl %r8b, %r8d
leal 1(%r8,%rcx,2), %ecx
cmpl %ecx, %esi
jg .L3
.L4:
rep ret
.cfi_endproc
and this
.cfi_startproc
testl %esi, %esi
movl %esi, %eax
jle .L5
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L4:
movl %ecx, %r8d
sall $4, %r8d
movslq %r8d, %r8
prefetcht0 60(%rdi,%r8,4)
movslq %ecx, %r8
cmpl %edx, (%rdi,%r8,4)
setl %r8b
testb %r8b, %r8b
movzbl %r8b, %r9d
cmove %ecx, %eax
leal 1(%r9,%rcx,2), %ecx
cmpl %ecx, %esi
jg .L4
.L5:
rep ret
.cfi_endproc
Related
I've been trying to translate this function to assembly:
void foo (int a[], int n) {
    int i;
    int s = 0;
    for (i = 0; i < n; i++) {
        s += a[i];
        if (a[i] == 0) {
            a[i] = s;
            s = 0;
        }
    }
}
But something is going wrong.
That's what I've done so far:
.section .text
.globl foo
foo:
.L1:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $0, -16(%rbp) /*s*/
movl $0, -8(%rbp) /*i*/
jmp .L2
.L2:
cmpl -8(%rbp), %esi
jle .L4
leave
ret
.L3:
addl $1, -8(%rbp)
jmp .L2
.L4:
movl -8(%rbp), %eax
imull $4, %eax
movslq %eax, %rax
addq %rdi, %rax
movl (%rax), %eax
addl %eax, -16(%rbp)
cmpl $0, %eax
jne .L3
/* if */
leaq (%rax), %rdx
movl -16(%rbp), %eax
movl %eax, (%rdx)
movl $0, -16(%rbp)
jmp .L3
I am compiling the .s module together with a .c module that passes in, for example, int nums[5] = {65, 23, 11, 0, 34}, and I'm getting back the same array instead of {65, 23, 11, 99, 34}.
Could someone help me?
Presumably you have a compiler that can generate AT&T syntax. It might be more instructive to look at what assembly output the compiler generates. Here's my re-formulation of your demo:
#include <stdio.h>

void foo (int a[], int n)
{
    for (int s = 0, i = 0; i < n; i++)
    {
        if (a[i] != 0)
            s += a[i];
        else
            a[i] = s, s = 0;
    }
}

int main (void)
{
    int nums[] = {65, 23, 11, 0, 34};
    int size = sizeof(nums) / sizeof(int);
    foo(nums, size);
    for (int i = 0; i < size; i++)
        fprintf(stdout, i < (size - 1) ? "%d, " : "%d\n", nums[i]);
    return (0);
}
Code compiled without optimizations enabled is typically harder to work through than optimized code, since it loads from and spills results to memory at every step. You won't learn much from it if you're investing time in learning how to write efficient assembly.
Compiling with the Godbolt compiler explorer with -O2 optimizations yields much more efficient code; it's also useful for cutting out unnecessary directives, labels, etc., that would be visual noise in this case.
In my experience, the -O2 optimizations are clever enough to make you rethink your use of registers, refactoring, etc. -O3 can sometimes optimize too aggressively (unrolling loops, vectorizing, etc.) to follow easily.
Finally, for the case you have presented, there's a perfect compromise: -Os, which enables many of the optimizations of -O2, but not at the expense of increased code size. I'll paste the assembly here just for comparative purposes:
foo:
xorl %eax, %eax
xorl %ecx, %ecx
.L2:
cmpl %eax, %esi
jle .L7
movl (%rdi,%rax,4), %edx
testl %edx, %edx
je .L3
addl %ecx, %edx
jmp .L4
.L3:
movl %ecx, (%rdi,%rax,4)
.L4:
incq %rax
movl %edx, %ecx
jmp .L2
.L7:
ret
Remember that the calling convention being used here passes the pointer a in %rdi and the count n in %rsi. Notice that your code never 'dereferences' or 'indexes' any element through %rdi. It's definitely worth stepping through the code, with pen and paper if it helps, to understand the branch conditions and how element a[i] is read and written.
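To make the indexing concrete, the movl (%rdi,%rax,4), %edx in the listing above is the load of a[i]; spelled out in C (an illustrative sketch of mine, not compiler output), the base + index * scale addressing mode computes:
/* What (%rdi,%rax,4) computes: base + index * scale,
 * with base = a (in %rdi), index = i (in %rax), scale = sizeof(int) = 4. */
int load_element(int *a, long i)
{
    return *(int *)((char *)a + i * 4);   /* same value as a[i] */
}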
Curiously, using the inner loop of your code:
s += a[i];
if (a[i] == 0)
    a[i] = s, s = 0;
appears to generate more efficient code with -Os than the inner loop I used:
foo:
xorl %eax, %eax
xorl %edx, %edx
.L2:
cmpl %eax, %esi
jle .L6
movl (%rdi,%rax,4), %ecx
addl %ecx, %edx
testl %ecx, %ecx
jne .L3
movl %edx, (%rdi,%rax,4)
xorl %edx, %edx
.L3:
incq %rax
jmp .L2
.L6:
ret
A reminder for me to keep things simple!
This is the assembly code I am supposed to translate:
f1:
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
Here's the C code I wrote that I think is equivalent:
int f1(int y){
    int x = y - 97;
    int i = 0;
    if (x <= 25) {
        x = i;
    }
    return x;
}
And here's what I get from compiling the C code:
_f1: ## #f1
.cfi_startproc
%bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
## kill: def %edi killed %edi def %rdi
leal -97(%rdi), %ecx
xorl %eax, %eax
cmpl $123, %edi
cmovgel %ecx, %eax
popq %rbp
retq
.cfi_endproc
I was wondering if this is correct (and what should be different), and if anyone could help explain how jmps work, as I am also trying to translate this assembly code and have gotten stuck:
f2:
cmpl $1, %edi
jle .L6
movl $2, %edx
movl $1, %eax
jmp .L5
.L8:
movl %ecx, %edx
.L5:
imull %edx, %eax
leal 1(%rdx), %ecx
cmpl %eax, %edi
jg .L8
.L4:
cmpl %edi, %eax
sete %al
movzbl %al, %eax
ret
.L6:
movl $1, %eax
jmp .L4
gcc8.3 -O3 emits exactly the asm in the question for this way of writing the range check using the unsigned-compare trick.
int is_ascii_lowercase_v2(int y){
    unsigned char x = y - 'a';
    return x <= (unsigned)('z' - 'a');
}
Narrowing to 8-bit after the int subtract matches the asm more exactly, but it's not necessary for correctness or even to convince compilers to use a 32-bit sub. For unsigned char y, the upper bytes of RDI are allowed to hold arbitrary garbage (x86-64 System V calling convention), but carry only propagates from low to high with sub and add.
The low 8 bits of the result (which is all the cmp reads) would be the same with sub $'a', %dil or sub $'a', %edi.
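A quick way to convince yourself of that low-byte argument (my own check, not part of the original answer): mask the upper bits before or after the subtraction and the byte that cmpb reads comes out the same either way.
#include <assert.h>

/* The low 8 bits of (y - 97) do not depend on bits 8..31 of y, because
 * carries in add/sub only propagate from low bits to high bits. */
unsigned char low_byte_of_sub(unsigned y)
{
    unsigned char narrow_first = (unsigned char)((y & 0xFF) - 97u); /* like sub $'a', %dil */
    unsigned char narrow_last  = (unsigned char)(y - 97u);          /* like sub $'a', %edi */
    assert(narrow_first == narrow_last);
    return narrow_last;
}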
Writing it as a normal range-check also gets gcc to emit identical code, because compilers know how to optimize range-checks. (And gcc chooses to use 32-bit operand-size for the sub, unlike clang which uses 8-bit.)
int is_ascii_lowercase_v3(char y){
    return (y >= 'a' && y <= 'z');
}
On the Godbolt compiler explorer, this and _v2 compile as follows:
## gcc8.3 -O3
is_ascii_lowercase_v3: # and _v2 is identical
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
Returning a compare result as an integer, instead of using an if, matches the asm much more naturally.
But even writing it "branchlessly" in C won't match the asm unless you enable optimization. The default code-gen from gcc/clang is -O0: anti-optimize for consistent debugging, storing/reloading everything to memory between statements. (And function args on function entry.) You need optimization, because -O0 code-gen is (intentionally) mostly braindead, and nasty looking. See How to remove "noise" from GCC/clang assembly output?
## gcc8.3 -O0
is_ascii_lowercase_v2:
pushq %rbp
movq %rsp, %rbp
movl %edi, -20(%rbp)
movl -20(%rbp), %eax
subl $97, %eax
movb %al, -1(%rbp)
cmpb $25, -1(%rbp)
setbe %al
movzbl %al, %eax
popq %rbp
ret
gcc and clang with optimization enabled will do if-conversion to branchless code when it's efficient. e.g.
int is_ascii_lowercase_branchy(char y){
    unsigned char x = y - 'a';
    if (x <= 25U) {
        return 1;
    }
    return 0;
}
still compiles to the same asm with GCC8.3 -O3
is_ascii_lowercase_branchy:
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
We can tell that the optimization level was at least gcc -O2. At -O1, gcc uses the less efficient setbe / movzx instead of xor-zeroing EAX ahead of setbe:
is_ascii_lowercase_v2:
subl $97, %edi
cmpb $25, %dil
setbe %al
movzbl %al, %eax
ret
I could never get clang to reproduce exactly the same sequence of instructions. It likes to use add $-97, %edi, and cmp with $26 / setb.
Or it will do really interesting (but sub-optimal) things like this:
# clang7.0 -O3
is_ascii_lowercase_v2:
addl $159, %edi # 256-97 = 8-bit version of -97
andl $254, %edi # 0xFE; I haven't figured out why it's clearing the low bit as well as the high bits
xorl %eax, %eax
cmpl $26, %edi
setb %al
retq
So this is something involving -(x-97), maybe using the 2's complement identity in there somewhere (-x = ~x + 1).
Here is an annotated version of the assembly:
# %edi is the first argument, we denote x
subl $97, %edi
# x -= 97
# %eax is the return value, we denote y
xorl %eax, %eax
# y = 0
# %dil is the least significant byte (lsb) of x
cmpb $25, %dil
# %al is lsb(y) which is already zeroed
setbe %al
# if lsb(x) <= 25 then lsb(y) = 1
# setbe is unsigned version, setle would be signed
ret
# return y
So a verbose C equivalent is:
int f(int x) {
    int y = 0;
    x -= 97;
    x &= 0xFF;             // x = lsb(x) using 0xFF as a bitmask
    y = (unsigned)x <= 25; // Section 6.5.8 of the C standard: comparisons yield 0 or 1
    return y;
}
We can shorten it by realizing y is unnecessary:
int f(int x) {
    x -= 97;
    x &= 0xFF;
    return (unsigned)x <= 25;
}
The assembly of this is an exact match on Godbolt Compiler Explorer (x86-64 gcc8.2 -O2): https://godbolt.org/z/fQ0LVR
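If you want to sanity-check that shortened version yourself, a tiny harness like the following (my own, hypothetical) exercises it at the range boundaries:
#include <assert.h>

int f(int x) {
    x -= 97;
    x &= 0xFF;
    return (unsigned)x <= 25;
}

int main(void) {
    assert(f('a') == 1);     /* lower boundary */
    assert(f('z') == 1);     /* upper boundary */
    assert(f('a' - 1) == 0); /* '`' is just below the range */
    assert(f('z' + 1) == 0); /* '{' is just above it */
    assert(f('A') == 0);     /* uppercase is rejected */
    return 0;
}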
I practiced assembler a long time ago, and I would like to understand a simple program (I generate the assembler code from C code) which adds two vectors (actually two arrays) and stores the result in another vector (an output array). My goal afterwards is to study vectorization. For this, I use gcc-4.9 under Debian Wheezy on an i7 processor.
Here is the C code snippet (non-vectorized version):
#include <stdio.h>

#define SIZE 10000

void test(double *a, double *b, double *c)
{
    int i;
    for (i = 0; i < SIZE; i++)
    {
        c[i] = a[i] + b[i];
    }
}

int main()
{
    int i;
    double tab1[SIZE];
    double tab2[SIZE];
    double tab3[SIZE];

    for (i = 0; i < SIZE; i++)
    {
        tab1[i] = i;
        tab2[i] = i;
        tab3[i] = 0;
    }

    test(tab1, tab2, tab3);

    for (i = 0; i < SIZE; i++)
        printf(" tab3[%d] = %f\n", i, tab3[i]);

    return 0;
}
I generate the assembly code (AT&T syntax) with:
gcc -std=c99 -c main_no_vectorized.c -O3 -S -o main_no_vectorized.s
Here is the assembly code:
.file "main_no_vectorized.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl test
.type test, #function
test:
.LFB3:
.cfi_startproc
leaq 16(%rdx), %rax
leaq 16(%rsi), %rcx
cmpq %rax, %rsi
setae %r8b
cmpq %rcx, %rdx
setae %cl
orb %cl, %r8b
je .L7
cmpq %rax, %rdi
leaq 16(%rdi), %rax
setae %cl
cmpq %rax, %rdx
setae %al
orb %al, %cl
je .L7
testb $8, %dil
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
je .L8
movsd (%rdi), %xmm0
movl $9998, %ebp
movl $4999, %r9d
movl $9999, %r12d
movl $1, %r8d
movl $1, %ebx
addsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L3:
salq $3, %r8
xorl %eax, %eax
xorl %ecx, %ecx
leaq (%rdi,%r8), %r11
leaq (%rsi,%r8), %r10
addq %rdx, %r8
.p2align 4,,10
.p2align 3
.L4:
movupd (%r10,%rax), %xmm0
addl $1, %ecx
addpd (%r11,%rax), %xmm0
movups %xmm0, (%r8,%rax)
addq $16, %rax
cmpl %r9d, %ecx
jb .L4
cmpl %ebp, %r12d
leal (%rbx,%rbp), %eax
je .L1
cltq
movsd (%rdi,%rax,8), %xmm0
addsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L1:
popq %rbx
.cfi_remember_state
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L8:
.cfi_restore_state
movl $10000, %ebp
movl $5000, %r9d
movl $10000, %r12d
xorl %r8d, %r8d
xorl %ebx, %ebx
jmp .L3
.L7:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L2:
movsd (%rdi,%rax), %xmm0
addsd (%rsi,%rax), %xmm0
movsd %xmm0, (%rdx,%rax)
addq $8, %rax
cmpq $80000, %rax
jne .L2
rep ret
.cfi_endproc
.LFE3:
.size test, .-test
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string " tab3[%d] = %f\n"
.section .text.unlikely
.LCOLDB4:
.section .text.startup,"ax",#progbits
.LHOTB4:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB4:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xorl %eax, %eax
subq $240016, %rsp
.cfi_def_cfa_offset 240032
movdqa .LC2(%rip), %xmm3
leaq 32(%rsp), %rcx
leaq 80032(%rsp), %rdx
movdqa .LC1(%rip), %xmm1
.p2align 4,,10
.p2align 3
.L21:
pshufd $238, %xmm1, %xmm0
cvtdq2pd %xmm1, %xmm2
paddd %xmm3, %xmm1
movaps %xmm2, 16(%rsp,%rax)
cvtdq2pd %xmm0, %xmm0
movaps %xmm2, 80016(%rsp,%rax)
movaps %xmm0, (%rcx,%rax)
movaps %xmm0, (%rdx,%rax)
addq $32, %rax
cmpq $80000, %rax
jne .L21
leaq 160016(%rsp), %rdi
movl $80000, %edx
xorl %esi, %esi
call memset
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L22:
movapd 16(%rsp,%rax), %xmm0
addpd 80016(%rsp,%rax), %xmm0
movaps %xmm0, 160016(%rsp,%rax)
addq $16, %rax
cmpq $80000, %rax
jne .L22
xorl %ebx, %ebx
.p2align 4,,10
.p2align 3
.L23:
movsd 160016(%rsp,%rbx,8), %xmm4
movl %ebx, %esi
movl $.LC3, %edi
movl $1, %eax
addq $1, %rbx
movapd %xmm4, %xmm0
movsd %xmm4, 8(%rsp)
call printf
cmpq $10000, %rbx
jne .L23
addq $240016, %rsp
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE4:
.size main, .-main
.section .text.unlikely
.LCOLDE4:
.section .text.startup
.LHOTE4:
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
.LC2:
.long 4
.long 4
.long 4
.long 4
.ident "GCC: (Debian 4.9.1-16) 4.9.1"
.section .note.GNU-stack,"",#progbits
Could you explain to me the main steps of the above assembly code in relation to the C code, in particular the "test" function, the initialization loop in the main function, the parameter passing (i.e. where are the push and pop instructions for the stack?), and the actual addition of the "a" and "b" arrays?
What do the .L2, .L3, ... labels correspond to? Is there a relation with the L2 cache and L3 cache?
Sorry for these basic questions, but I am just beginning with Intel x86_64 assembly.
Thanks for your help.
The generated assembly code is quite complicated. It first checks to see if the arrays a, b, and c overlap in a way that will cause an optimized loop to fail. For example, if you did this:
test(tab1, tab2, &tab1[1]);
then the overlap would be detected and cause the code to jump to L7 (the straightforward implementation). By the way, L stands for Label, and the label numbers are just generated by the compiler with no particular meaning. So L1, L2, L3, etc are just labels that are used for the code to branch to various places. The overlap checks start at .LFB3 and end at the last je .L7.
If no overlap is detected, then an optimized loop will be used. This optimized loop will try to add two doubles at a time instead of just one. The first thing the optimized loop does is find out if array a is aligned to a 16-byte boundary (the testb $8, %dil instruction). If it is, it will jump to L8 to load a set of constants (e.g. r9 = 5000). If the array is not aligned, it will fall through and load a different set of constants (e.g. r9 = 4999), and also handle the first element. This is because the unaligned case will need to do 4999 iterations two at a time and handle the first and last unaligned elements separately outside the loop. The aligned case will just do 5000 iterations.
Either way, the code reaches L3 next. The code at L3 and L4 is the optimized loop that does the adds two at a time using the addpd instruction (the nonoptimized loop at L7 used addsd to do one add at a time). After the L4 loop finishes, it checks to see if it needs to handle the last element (for the unaligned case). Then it returns with the ret instruction.
By the way, it helps to know that when test is called, a is in rdi, b is in rsi, and c is in rdx. That is the calling convention for 64-bit. Therefore, there are no arguments pushed on the stack. If you don't understand x86 assembly too well, concentrate on the code starting at L7. That is the non-optimized version and you should be able to figure that part out given that I said your three arguments were in rdi, rsi, and rdx.
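To tie that back to C, here is a rough sketch (mine, not anything the compiler produced) of what the optimized path does once the overlap checks have passed; the alignment peeling and the two-at-a-time body correspond to L8, L3, and L4 in the listing:
#include <stdint.h>

#define SIZE 10000

/* Hypothetical C-level equivalent of the optimized path in test():
 * peel one element if a is not 16-byte aligned, add two doubles per
 * iteration (what addpd does), then handle any leftover element. */
void test_sketch(double *a, double *b, double *c)
{
    int i = 0;
    if (((uintptr_t)a & 8) != 0) {   /* the testb $8, %dil alignment check */
        c[0] = a[0] + b[0];          /* scalar prologue (addsd) */
        i = 1;
    }
    for (; i + 1 < SIZE; i += 2) {   /* the .L4 loop: two adds at a time */
        c[i]     = a[i]     + b[i];
        c[i + 1] = a[i + 1] + b[i + 1];
    }
    if (i < SIZE)                    /* scalar epilogue for the odd element */
        c[i] = a[i] + b[i];
}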
The .L2 and such are labels; they are used to refer to the next instruction. They are pretty much exactly like labels in C, if you've used goto. The primary use of a label is with a jump or branch, to specify where the jump goes to.
For example, the .L2 label is the start of the body of your for (i = 0; i < SIZE; i++) loop in test(); it counts by 8 bytes (the size of a double) up to 8*10000. The last instruction in the loop is jne .L2, which jumps to .L2 if the previous comparison was not equal.
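As an illustration (my sketch, not the compiler's output), the label-and-jump structure of that .L2 loop maps onto a C goto loop like this:
/* The .L2 loop from the listing, spelled with a C goto: off plays the role
 * of the byte offset in %rax, and the jne .L2 becomes the conditional goto. */
void test_goto(double *a, double *b, double *c)
{
    long off = 0;
loop:                                            /* .L2 */
    *(double *)((char *)c + off) =
        *(double *)((char *)a + off) +
        *(double *)((char *)b + off);            /* movsd / addsd / movsd */
    off += 8;                                    /* addq $8, %rax */
    if (off != 80000) goto loop;                 /* cmpq $80000 / jne .L2 */
}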
You may find this reference (PDF) on x64 helpful.
With this program, I intend to find the GCD of two numbers, but the result I get is "Floating point exception (core dumped)". What is the problem?
The C code I am trying to generate assembly for is:
#include <stdio.h>

int main() {
    int sml, lrg, rem;
    scanf("%d", &sml);
    scanf("%d", &lrg);
    while (sml > 0) {
        rem = lrg % sml;
        lrg = sml;
        sml = rem;
    }
    printf("%d\n", lrg);
    return 0;
}
The assembly file I generated is:
.file "gcd.c"
.section .rodata
.LC0:
.string "%d"
.LC1:
.string "%d\n"
.text
.globl main
.type main, #function
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
leal -8(%ebp), %eax #scan a value
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
leal -12(%ebp), %eax #scan a value
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
.L2:
movl $0, %eax
cmpl -8(%ebp),%eax
jle .L0
jmp .L1
.L0:
movl -12(%ebp),%eax
movl -8(%ebp),%ecx
movl %eax,%edx
sarl $31, %edx
idivl %ecx
movl %edx,%eax
movl %eax, -16(%ebp)
movl -8(%ebp),%edx
movl %edx, -12(%ebp)
movl -16(%ebp),%edx
movl %edx, -8(%ebp)
jmp .L2
.L1:
movl -12(%ebp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %edx
movl $0, %eax #end of program
leave
ret
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.2.3 (4.2.3-6mnb1)"
.section .note.GNU-stack,"",#progbits
On the other hand, this assembly code works
.file "check.c"
.section .rodata
.LC0:
.string "%d"
.text
.globl main
.type main, #function
main:
.LFB0:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
andl $-16, %esp
subl $32, %esp
leal 20(%esp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
leal 24(%esp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call scanf
jmp .L2
.L3:
movl 24(%esp), %eax
movl 20(%esp), %ecx
movl %eax, %edx
sarl $31, %edx
idivl %ecx
movl %edx, 28(%esp)
movl 20(%esp), %eax
movl %eax, 24(%esp)
movl 28(%esp), %eax
movl %eax, 20(%esp)
.L2:
movl 20(%esp), %eax
testl %eax, %eax
jg .L3
movl 24(%esp), %eax
movl %eax, 4(%esp)
movl $.LC0, (%esp)
call printf
movl $0, %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",#progbits
You probably have a division by zero in this line:
idivl %ecx
with a 0 value in the ecx register.
Your comparison and jump are wrong. The working code has:
testl %eax, %eax
jg .L3
The broken code has:
movl $0, %eax
cmpl -8(%ebp),%eax
jle .L0
jmp .L1
The former compares %eax (which contains the most recently computed residue) to zero, and it continues the loop (jumps to .L3) if the residue is positive (greater than zero).
The latter compares 0 to -8(%ebp) (the most recently computed residue). Note that the order is different; it compares 0 to -8(%ebp), not -8(%ebp) to 0. The testl instruction compares the value (after performing an AND) with zero. If the value is positive, it is “greater than” zero. The cmpl instruction compares its second operand to its first operand; if the second operand exceeds the first, the result is “greater than”. This is because, in Intel’s manuals and assembly language, instructions are written with their operands in the reverse order. E.g., moving 3 into %eax would be “mov %eax, $3”. However, the assembler you are using reverses all the operands from Intel’s order (due to legacy reasons).
So, the broken code continues the loop (jumps to .L0) if 0 is less than or equal to the residue. Thus, if the residue is zero, the loop continues. You can change the jle to jl:
jl .L0
Alternately, you could eliminate the redundant unconditional jump:
movl $0, %eax
cmpl -8(%ebp),%eax
jge .L1
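In C terms (my own reading of the two listings, not code from the question), the difference between the two exit conditions is the difference between these two loops:
/* My reading of the broken listing: jle lets the body run once more with
 * sml == 0, so lrg % sml divides by zero on that last pass. */
int gcd_broken(int lrg, int sml) {
    while (sml >= 0) {          /* effect of the jle exit test */
        int rem = lrg % sml;    /* faults when sml == 0 */
        lrg = sml;
        sml = rem;
    }
    return lrg;
}

/* The intended loop, which is what jl (or a jge exit) expresses. */
int gcd_fixed(int lrg, int sml) {
    while (sml > 0) {
        int rem = lrg % sml;
        lrg = sml;
        sml = rem;
    }
    return lrg;
}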
Also, you probably want to change:
movl $.LC0, (%esp)
call printf
to:
movl $.LC1, (%esp)
call printf
so that you pass "%d\n" to printf instead of passing "%d".
Incidentally, the idivl instruction generates a divide error, not a floating-point exception. Your system is misreporting the error.
I guess you all heard of the 'swap problem'; SO is full of questions about it.
The version of the swap without use of a third variable is often considered to be faster since, well, you have one variable less. I wanted to know what was going on behind the curtains and wrote the following two programs:
int main () {
    int a = 9;
    int b = 5;
    int swap;

    swap = a;
    a = b;
    b = swap;

    return 0;
}
and the version without third variable:
int main () {
    int a = 9;
    int b = 5;

    a ^= b;
    b ^= a;
    a ^= b;

    return 0;
}
I generated the assembly code using clang and got this for the first version (that uses a third variable):
...
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -8(%rbp), %ecx
movl %ecx, -16(%rbp)
movl -12(%rbp), %ecx
movl %ecx, -8(%rbp)
movl -16(%rbp), %ecx
movl %ecx, -12(%rbp)
popq %rbp
ret
Leh_func_end0:
...
and this for the second version (that does not use a third variable):
...
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
movl -8(%rbp), %ecx
movl -12(%rbp), %edx
xorl %ecx, %edx
movl %edx, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
popq %rbp
ret
Leh_func_end0:
...
The second one is longer, but I don't know much about assembly code, so I have no idea whether that means it is slower. I'd like to hear the opinion of someone more knowledgeable about it.
Which of the above versions of a variable swap is faster and takes less memory?
Look at some optimised assembly. From
void swap_temp(int *restrict a, int *restrict b){
int temp = *a;
*a = *b;
*b = temp;
}
void swap_xor(int *restrict a, int *restrict b){
*a ^= *b;
*b ^= *a;
*a ^= *b;
}
gcc -O3 -std=c99 -S -o swapping.s swapping.c produced
.file "swapping.c"
.text
.p2align 4,,15
.globl swap_temp
.type swap_temp, #function
swap_temp:
.LFB0:
.cfi_startproc
movl (%rdi), %eax
movl (%rsi), %edx
movl %edx, (%rdi)
movl %eax, (%rsi)
ret
.cfi_endproc
.LFE0:
.size swap_temp, .-swap_temp
.p2align 4,,15
.globl swap_xor
.type swap_xor, #function
swap_xor:
.LFB1:
.cfi_startproc
movl (%rsi), %edx
movl (%rdi), %eax
xorl %edx, %eax
xorl %eax, %edx
xorl %edx, %eax
movl %edx, (%rsi)
movl %eax, (%rdi)
ret
.cfi_endproc
.LFE1:
.size swap_xor, .-swap_xor
.ident "GCC: (SUSE Linux) 4.5.1 20101208 [gcc-4_5-branch revision 167585]"
.section .comment.SUSE.OPTs,"MS",#progbits,1
.string "Ospwg"
.section .note.GNU-stack,"",#progbits
To me, swap_temp looks as efficient as can be.
The problem with the XOR swap trick is that it's strictly sequential. It may seem deceptively fast, but in reality, it is not. There's an instruction called XCHG that swaps two registers, but this can also be slower than simply using 3 MOVs, due to its atomic nature. The common technique with a temp is an excellent choice ;)
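If you want numbers rather than instruction counts, a rough harness like this (my own sketch, not from any of the answers; results will vary by CPU, compiler, and flags) times the two pointer-based routines from the earlier answer:
#include <stdio.h>
#include <time.h>

/* noinline plus the call through a function pointer should keep the
 * compiler from optimizing the swaps away entirely. */
__attribute__((noinline)) void swap_temp(int *restrict a, int *restrict b){
    int temp = *a; *a = *b; *b = temp;
}

__attribute__((noinline)) void swap_xor(int *restrict a, int *restrict b){
    *a ^= *b; *b ^= *a; *a ^= *b;
}

static double time_it(void (*swap)(int *restrict, int *restrict))
{
    int x = 9, y = 5;
    clock_t t0 = clock();
    for (long i = 0; i < 100000000L; i++)   /* 1e8 swaps */
        swap(&x, &y);
    return (double)(clock() - t0) / CLOCKS_PER_SEC;
}

int main(void)
{
    printf("temp swap: %.3f s\n", time_it(swap_temp));
    printf("xor swap:  %.3f s\n", time_it(swap_xor));
    return 0;
}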
To get an idea of the cost, imagine that every instruction has a cost to be performed and that indirect addressing has a cost of its own as well.
movl -12(%rbp), %ecx
This line will need something like one time unit for accessing the value in the ecx register, one time unit for accessing rbp, another one for applying the offset (-12), and a few more (say, arbitrarily, 3) for moving the value at the address -12(%rbp) into the ecx register.
If you count all the operations on every line, across all lines, the second method is for sure costlier than the first one.