Why GCC didn't optimize this tail call? - c

I have the code working with lined lists. I use tail calls. Unfortunately, GCC does not optimise the calls.
Here is C code of the function that recursively calculates length of the linked list:
size_t ll_length(const ll_t* list) {
return ll_length_rec(list, 0);
}
size_t ll_length_rec(const ll_t* list, size_t size_so_far)
{
if (list) {
return ll_length_rec(list->next, size_so_far + 1);
} else {
return size_so_far;
}
}
and here is the assembler code:
.globl _ll_length_rec
_ll_length_rec:
LFB8:
.loc 1 47 0
pushq %rbp
LCFI6:
movq %rsp, %rbp
LCFI7:
subq $32, %rsp
LCFI8:
movq %rdi, -8(%rbp)
movq %rsi, -16(%rbp)
.loc 1 48 0
cmpq $0, -8(%rbp)
je L8
.loc 1 49 0
movq -16(%rbp), %rsi
incq %rsi
movq -8(%rbp), %rax
movq 8(%rax), %rdi
call _ll_length_rec # < THIS SHOUD BE OPTIMIZED
movq %rax, -24(%rbp)
jmp L10
If GCC would optimize it, there would be no call in the asm. I compile it with:
gcc -S -fnested-functions -foptimize-sibling-calls \
-03 -g -Wall -o llist llist.c
and GCC version is:
i686-apple-darwin10-gcc-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5666) (dot 3)

If I add -O3 to your compilation line, it does not seem to generate the offending call, while without it, I get the unoptimised call. I don't know all gcc options in my head, but is -03 a typo for -O3 or intentional?
Ltmp2:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
jmp LBB1_1
.align 4, 0x90
LBB1_3:
addq $2, %rsi
Ltmp3:
movq (%rax), %rdi
Ltmp4:
LBB1_1:
Ltmp5:
testq %rdi, %rdi
je LBB1_5
Ltmp6:
movq (%rdi), %rax
testq %rax, %rax
jne LBB1_3
incq %rsi
LBB1_5:
movq %rsi, %rax
Ltmp7:
Ltmp8:
popq %rbp
ret

Most likely because neither of your functions are declared as static, which means that the symbols must be visible to the linker in case any other compilation units need them at link time. Try to compile with the -fwhole-program flag and see what happens.

Probably depends on the version of GCC and specific build. This is what I get from GCC 3.4.4 on Windows starting from -O2 and up
.globl _ll_length_rec
.def _ll_length_rec; .scl 2; .type 32; .endef
_ll_length_rec:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %edx
movl 12(%ebp), %eax
jmp L3
.p2align 4,,7
L6:
movl (%edx), %edx
incl %eax
L3:
testl %edx, %edx
jne L6
popl %ebp
ret

Related

Decrementing stack by 24 when only 8 bytes are needed?

I have the C code:
long fib(long n) {
if (n < 2) return 1;
return fib(n-1) + fib(n-2);
}
int main(int argc, char** argv) {
return 0;
}
which I compiled by running gcc -O0 -fno-optimize-sibling-calls -S file.c yielding assembly code that has not been optimized:
.file "long.c"
.text
.globl fib
.type fib, #function
fib:
.LFB5:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $24, %rsp
.cfi_offset 3, -24
movq %rdi, -24(%rbp)
cmpq $1, -24(%rbp)
jg .L2
movl $1, %eax
jmp .L3
.L2:
movq -24(%rbp), %rax
subq $1, %rax
movq %rax, %rdi
call fib
movq %rax, %rbx
movq -24(%rbp), %rax
subq $2, %rax
movq %rax, %rdi
call fib
addq %rbx, %rax
.L3:
addq $24, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE5:
.size fib, .-fib
.globl main
.type main, #function
main:
.LFB6:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movq %rsi, -16(%rbp)
movl $0, %eax
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size main, .-main
.ident "GCC: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0"
.section .note.GNU-stack,"",#progbits
My question is:
Why do we decrement the stack pointer by 24, subq $24, %rsp? As I see it, we store one element only, first argument n in %rdi, on the stack after the initial two pushes. So why don't we just decrement the stack pointer by 8 and then move n to -8(%rbp)? So
subq $8, %rsp
movq %rdi, -8(%rbp)
GCC does not fully optimize with -O0, not even its stack use. (This may aid in debugging by making some of its use of the stack more transparent to humans. For example, objects a, b, and c may share a single stack location if their active lifetimes (defined by uses in the program, not by the model of lifetime in the C standard) with -O3, but may have separately reserved places in the stack with -O0, and that makes it easier for a human to see where a, b, and c are used in the assembly code. The wasted 16 bytes may be a side effect of this, as those spaces may be reserved for some purpose that this small function did not happen to use, such as space to save certain registers if needed.)
Changing optimization to -O3 results in GCC subtracting only eight from the stack pointer.

using printf before and inside a loop x86-64 assembly

I'm having trouble figuring out how to use printf correctly in this function. So the function is called multInts and is supposed to multiply the first element of the first array with the first element of the second array and continue through the whole array. However, the lab instructions specify that I can't call printf in the main function. So, I need to print out the word "Products:\n" and then in each new line after that, print out the product. I don't know how to use printf within the loop, however. The instructor said that we should "call printf in the loop after calculating product" and also to "save and restore caller-save registers before calling printf," but I'm not sure what that means.
Here's what my code looks like so far:
.file "lab4.s"
.section .rodata
.LC0:
.string "Products: \n"
.LC1:
.string "%i \n"
.data
sizeIntArrays:
.long 5
sizeShortArrays:
.word 4
intArray1:
.long 10
.long 25
.long 33
.long 48
.long 52
intArray2:
.long 20
.long -37
.long 42
.long -61
.long -10
##### MAIN FUNCTION
.text
.globl main
.type main,#function
main:
pushq %rbp
movq %rsp, %rbp
#pass parameters and call other functions
movl sizeIntArrays, %edi #move size to registers for 1st parameter
leaq intArray1, %rsi #load effective address of intArray1 to register rsi
leaq intArray2, %rdx #load effective address of intArray2 to register rdx
call multInts #call multInts function
movq $0, %rax #return 0 to caller
movq %rbp, %rsp
popq %rbp
ret
.size main,.-main
##### MULTINTS
.globl multInts
.type multInts,#function
multInts:
pushq %rbp
movq %rsp, %rbp
#add code here for what the functions should do
movq $0, %r8 #initialize index for array access in caller save reg
movq $0, %rcx #initialize 8 byte caller save result reg
loop0:
cmpl %r8d, %edi #compare index to size
je exit0 #exit if equal
movslq (%rsi,%r8,4),%rax # Load a long into RAX
movslq (%rdx,%r8,4),%r11 # Load a long into R11
imulq %r11, %rax # RAX *= R11
addq %rax, %rcx # RCX += RAX
incq %r8 #increment index
jmp loop0
exit0:
movq $.LC0, %rdi
movq %rcx, %rsi
movq $0, %rax
call printf
movq %rbp, %rsp
popq %rbp
ret
.size multInts,.-multInts
What I've tried to do is just move the printf instruction to before the loop, but it gives me a segmentation fault when trying to run the loop because %rdi and %rsi contain the addresses of the arrays that need to be used in the loop. How do I get around that and which registers should I use? Also, how do I call printf within the loop?
The output should look something like this:
Products:
200
-925
1386
-2928
-520
Assume that printf clobbers all the call-clobbered registers (What registers are preserved through a linux x86-64 function call), and use different ones for anything that needs to survive from one iteration of the loop to the next.
Look at compiler output for an example: write a version of your loop in C and compile it with -Og.
Obviously you need to move the instructions that set up the args in registers
(like the format string) along with the call printf.
The easiest way to protect a register from being accessed by a subroutine is to push it. According to the ABI V calling convention printf may change any register except RBX, RBP, R12–R15. The registers you need to preserve are RAX, RDX, RSI, RDI, R8 and R11 (RCX is no longer needed), so push before the call to printf and pop them afterwards:
pushq %rax
pushq %rdx
pushq %rsi
pushq %rdi
pushq %r8
pushq %r11
movq $.LC1, %rdi
movq %rax, %rsi
movq $0, %rax
call printf
popq %r11
popq %r8
popq %rdi
popq %rsi
popq %rdx
popq %rax
Now, you can copy the block into the loop. For each printf, you have to think about what needs to be secured:
...
multInts:
pushq %rbp
movq %rsp, %rbp
#add code here for what the functions should do
pushq %rdx # Preserve registers
pushq %rdi
pushq %rsi
movq $.LC0, %rdi # Format string (no further values)
movq $0, %rax # No vector registers used
call printf # Call C function
popq %rsi # Restore registers
popq %rdi
popq %rdx
movq $0, %r8 #initialize index for array access in caller save reg
loop0:
cmpl %r8d, %edi #compare index to size
je exit0 #exit if equal
movslq (%rsi,%r8,4),%rax # Load a long into RAX
movslq (%rdx,%r8,4),%r11 # Load a long into R11
imulq %r11, %rax # RAX *= R11
pushq %rax # Preserve registers
pushq %rdx
pushq %rsi
pushq %rdi
pushq %r8
pushq %r11
movq $.LC1, %rdi # Format string
movq %rax, %rsi # Value
movq $0, %rax # No vector registers used
call printf # Call C function
popq %r11 # Restore registers
popq %r8
popq %rdi
popq %rsi
popq %rdx
popq %rax
incq %r8 #increment index
jmp loop0
exit0:
movq %rbp, %rsp
popq %rbp
ret
...
BTW: .string "%i \n" will force printf only to process the lower 32-bit of RDI. Use .string %lli \n instead.

Why is GCC exchanging rax and xmm0 registers? [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 7 years ago.
Improve this question
I was verifying some assembly generated by gcc version 5.2.1 20151010 (Ubuntu 5.2.1-22ubuntu2) and realized that the following instructions were being generated:
movq %xmm0, %rax
movq %rax, %xmm0
I'd like to know what is the purpose of these instructions considering that it seems irrelevant, is it some kind of optimization? Like when we do:
xor ax, ax
I'd like to let clear that this code appeared just when I used the option -mtune=native and my CPU is a Intel Core I5 4200U.
Following is my source code:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "print.h"
void multiply(const unsigned int* array1, const unsigned int* array2, unsigned int* array3, const unsigned int array_size)
{
unsigned int i = 0;
for (i = 0; i < array_size; i++)
{
array3[i] = array1[i] * array2[i];
}
}
int main()
{
const unsigned int array_size = 1024*1024;
unsigned int* array1 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array2 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array3 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
int i = 0;
srand(time(NULL));
for (i = 0; i < array_size; i++)
{
array1[i] = rand();
array2[i] = rand();
}
clock_t t0 = clock();
multiply(array1,array2,array3, array_size);
multiply(array1,array2,array3, array_size);
clock_t t1 = clock();
printf("\nTempo: %f\n", ((double)(t1 - t0)) / CLOCKS_PER_SEC);
}
This is the assembly generated by GCC using:gcc -S -mtune=native Main.c:
.file "Main.c"
.text
.globl multiply
.type multiply, #function
multiply:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -4(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
imull %ecx, %eax
movl %eax, (%rdx)
addl $1, -4(%rbp)
.L2:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, #function
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
movl $1048576, -60(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
movl $0, -64(%rbp)
movl $0, %edi
call time
movl %eax, %edi
call srand
movl $0, -64(%rbp)
jmp .L5
.L6:
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
addl $1, -64(%rbp)
.L5:
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
call clock
movq %rax, -32(%rbp)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
call clock
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq -32(%rbp), %rax
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",#progbits
And this with gcc -S Main.c:
.file "Main.c"
.text
.globl multiply
.type multiply, #function
multiply:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -4(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
imull %ecx, %eax
movl %eax, (%rdx)
addl $1, -4(%rbp)
.L2:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, #function
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
movl $1048576, -60(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
movl $0, -64(%rbp)
movl $0, %edi
call time
movl %eax, %edi
call srand
movl $0, -64(%rbp)
jmp .L5
.L6:
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
addl $1, -64(%rbp)
.L5:
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
call clock
movq %rax, -32(%rbp)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
call clock
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq -32(%rbp), %rax
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",#progbits
The differences can be found at the end of .L5 label.

Manual Assembly vs GCC

Disclaimer: I'm just starting out with x86 assembly. I did learn a bit of SPIM at university, but that's hardly worth mentioning.
I thought I start with what's probably the most simple function in libc, abs(). Pretty straightforward in C:
long myAbs(long j) {
return j < 0 ? -j : j;
}
My version in assembly:
.global myAbs
.type myAbs, #function
.text
myAbs:
test %rdi, %rdi
jns end
negq %rdi
end:
movq %rdi, %rax
ret
(This doesn't work for 32bit integers, probably because RAX is a 64bit register and the sign is probably at the wrong position - I have to investigate that).
Now here's what gcc does (gcc -O2 -S myAbs.c):
.file "myAbs.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl myAbs
.type myAbs, #function
myAbs:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $4144, %rsp
orq $0, (%rsp)
addq $4128, %rsp
movq %rdi, %rdx
sarq $63, %rdx
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movq %rdi, %rax
xorq %rdx, %rax
subq %rdx, %rax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
jne .L5
leave
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L5:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
.LFE0:
.size myAbs, .-myAbs
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.ident "GCC: (Gentoo Hardened 5.1.0 p1.2, pie-0.6.3) 5.1.0"
.section .note.GNU-stack,"",#progbits
Why this big difference? GCC produces substantially more instructions. I can't imagine that this won't be slower than my code.
Am I missing something? Or am I doing something seriousely wrong here?
For those who wonder what the generated code comes from, first note that when GCC compile myAbs with stack protection it transform it into this form
long myAbs(long j) {
uintptr_t canary = __stack_chk_guard;
register long result = j < 0 ? -j : j;
if ( (canary = canary ^ __stack_chk_guard) != 0 )
__stack_chk_fail();
}
The code to simply perform j < 0 ? -j : j; is
movq %rdi, %rdx ;RDX = j
movq %rdi, %rax ;RAX = j
sarq $63, %rdx ;RDX = 0 if j >=0, 0fff...ffh if j < 0
xorq %rdx, %rax ;Note: x xor 0ff...ffh = Not X, x xor 0 = x
;RAX = j if j >=0, ~j if j < 0
subq %rdx, %rax ;Note: 0fff...ffh = -1
;RAX = j+0 = j if j >= 0, ~j+1 = -j if j < 0
;~j+1 = -j in two complement
Analyzing the generated code we get
pushq %rbp
movq %rsp, %rbp ;Standard prologue
subq $4144, %rsp ;Allocate slight more than 4 KiB
orq $0, (%rsp) ;Perform a useless RW operation to test if there is enough stack space for __stack_chk_fail
addq $4128, %rsp ;This leave 16 byte allocated for local vars
movq %rdi, %rdx ;See above
sarq $63, %rdx ;See above
movq %fs:40, %rax ;Get the canary
movq %rax, -8(%rbp) ;Save it as a local var
xorl %eax, %eax ;Clear it
movq %rdi, %rax ;See above
xorq %rdx, %rax ;See above
subq %rdx, %rax ;See above
movq -8(%rbp), %rcx ;RCX = Canary
xorq %fs:40, %rcx ;Check if equal to the original value
jne .L5 ;If not fail
leave
ret
.L5:
call __stack_chk_fail#PLT ;__stack_chk_fail is noreturn
So all the extra instructions are for implementing the Stack Smashing Protector.
Thanks to FUZxxl for pointing out the use of the first instructions after the prologue.
Many of the beginning calls are to setup the stack and save the return address (something which you are not doing). Seems like theres are some stack protection going on. Perhaps you could tune your compiler settings to get rid of some overhead.
Perhaps adding flags to you compiler such as: -fno-stack-protector could minimise this difference.
Yes this probably is slower than your handwritten assembly, but offers much more protection and is probably worth the slight overhead.
As for why the stack protection still exists even though it is a leaf function see here.

In x86, why do I have the same instruction two times, with reversed operands?

I am doing several experiments with x86 asm trying to see how common language constructs map into assembly. In my current experiment, I am trying to see specifically how C language pointers map to register-indirect addressing. I have written a fairly hello-world like pointer program:
#include <stdio.h>
int
main (void)
{
int value = 5;
int *int_val = &value;
printf ("The value we have is %d\n", *int_val);
return 0;
}
and compiled it to the following asm using: gcc -o pointer.s -fno-asynchronous-unwind-tables pointer.c:[1][2]
.file "pointer.c"
.section .rodata
.LC0:
.string "The value we have is %d\n"
.text
.globl main
.type main, #function
main:
;------- function prologue
pushq %rbp
movq %rsp, %rbp
;---------------------------------
subq $32, %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
;----------------------------------
movl $5, -20(%rbp) ; This is where the value 5 is stored in `value` (automatic allocation)
;----------------------------------
leaq -20(%rbp), %rax ;; (GUESS) If I have understood correctly, this is where the address of `value` is
;; extracted, and stored into %rax
;----------------------------------
movq %rax, -16(%rbp) ;;
movq -16(%rbp), %rax ;; Why do I have two times the same instructions, with reversed operands???
;----------------------------------
movl (%rax), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
;----------------------------------
movl $0, %eax
movq -8(%rbp), %rdx
xorq %fs:40, %rdx
je .L3
call __stack_chk_fail
.L3:
leave
ret
.size main, .-main
.ident "GCC: (Ubuntu 4.9.1-16ubuntu6) 4.9.1"
.section .note.GNU-stack,"",#progbits
My issue is that I don't understand why it contains the instruction movq two times, with reversed operands. Could someone explain it to me?
[1]: I want to avoid having my asm code interspersed with cfi directives when I don't need them at all.
[2]: My environment is Ubuntu 14.10, gcc 4.9.1 (modified by ubuntu), and Gnu assembler (GNU Binutils for Ubuntu) 2.24.90.20141014, configured to target x86_64-linux-gnu
Maybe it will be clearer if you reorganize your blocks:
;----------------------------------
leaq -20(%rbp), %rax ; &value
movq %rax, -16(%rbp) ; int_val
;----------------------------------
movq -16(%rbp), %rax ; int_val
movl (%rax), %eax ; *int_val
movl %eax, %esi ; printf-argument
movl $.LC0, %edi ; printf-argument (format-string)
movl $0, %eax ; no floating-point numbers
call printf
;----------------------------------
The first block performs int *int_val = &value;, the second block performs printf .... Without optimization, the blocks are independent.
Since you're not doing any optimization, gcc creates very simple-minded code that does each statement in the program one at a time without looking at any other statement. So in your example, it stores a value into the variable int_val, and then the very next instruction reads that variable again as part of the next statement. In both cases, it is using %rax as the temporary to hold value, as that's the first register generally used for things.

Resources