Putting a local array changes assembly output for X86-64

Putting a local array changes assembly output for X86-64 - c

When we have just an int variable in main:
/* Minimal program: declares one int local that is never used and returns 0. */
int main() {
int d; /* never read or written */
return 0;
}
Following code is generated for x86-64 on Linux by gcc -S test.c.
# gcc -S output for main() with one unused int local (AT&T syntax, x86-64 ELF).
.file "test.c"
.text
.globl main
.type main, @function  # mark main as a function symbol; on x86 a '#' here would start a comment
main:
.LFB0:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movl $0, %eax  # return value 0; no stack space is reserved for d
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (GNU) 6.3.1 20170109"
.section .note.GNU-stack,"",@progbits
Putting an array as local variable
/* Same minimal program, but the unused local is now a two-element int array. */
int main() {
int d[2]; /* never read or written */
return 0;
}
generates a lot of extraneous code at the beginning which I am not able to comprehend.
# gcc -S output for main() with a local int d[2] (AT&T syntax, x86-64 ELF).
# Compared with the scalar version, the prologue touches the stack far below rsp
# and the function stores and later re-checks a guard value read from %fs:40.
.file "test.c"
.text
.globl main
.type main, @function  # '@' is required on x86: '#' would start a comment
main:
.LFB0:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
leaq -4144(%rsp), %rsp  # temporarily move rsp 4144 bytes down (lea does not touch flags)
orq $0, (%rsp)  # read-modify-write that location without changing its value
leaq 4128(%rsp), %rsp  # move rsp back up: the net effect is 16 bytes reserved
# NOTE(review): the three instructions above look like a stack probe (e.g. -fstack-check
# enabled by default in this GCC build) - confirm against the compiler configuration.
movq %fs:40, %rax  # load the guard value kept at %fs:40
movq %rax, -8(%rbp)  # keep a copy on the stack just below the saved rbp
xorl %eax, %eax  # do not leave the guard value in a register
movl $0, %eax  # return value 0
movq -8(%rbp), %rdx  # reload the stack copy of the guard
xorq %fs:40, %rdx  # result is zero iff the copy still matches the original
je .L3  # unchanged: return normally
call __stack_chk_fail@PLT  # copy was modified: call __stack_chk_fail through the PLT
.L3:
leave  # rsp = rbp, then pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (GNU) 6.3.1 20170109"
.section .note.GNU-stack,"",@progbits
Specifically, what are these instructions doing?
leaq -4144(%rsp), %rsp
orq $0, (%rsp)
leaq 4128(%rsp), %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, %eax

Related

Decrementing stack by 24 when only 8 bytes are needed?

I have the C code:
/* Doubly recursive Fibonacci-style function: returns 1 for n < 2, otherwise fib(n-1) + fib(n-2). */
long fib(long n) {
if (n < 2) return 1; /* base case */
return fib(n-1) + fib(n-2); /* two recursive calls; the first result must survive the second call */
}
/* Entry point: ignores its arguments, never calls fib, and returns 0. */
int main(int argc, char** argv) {
return 0;
}
which I compiled by running gcc -O0 -fno-optimize-sibling-calls -S file.c yielding assembly code that has not been optimized:
# gcc -O0 -S output for: long fib(long n)   (AT&T syntax, x86-64 ELF, System V ABI).
# In: rdi = n.  Out: rax = result.  rbx is saved and restored because it carries
# fib(n-1) across the second recursive call.
.file "long.c"
.text
.globl fib
.type fib, @function  # '@' is required on x86: '#' would start a comment
fib:
.LFB5:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
pushq %rbx  # save callee-saved rbx; it now occupies -8(%rbp)
subq $24, %rsp  # reserve 24 bytes of locals; only the slot at -24(%rbp) is used below
.cfi_offset 3, -24
movq %rdi, -24(%rbp)  # spill n to the stack
cmpq $1, -24(%rbp)
jg .L2  # n > 1 (i.e. not n < 2): take the recursive path
movl $1, %eax  # base case: return 1
jmp .L3
.L2:
movq -24(%rbp), %rax
subq $1, %rax
movq %rax, %rdi  # argument: n - 1
call fib
movq %rax, %rbx  # keep fib(n-1) in rbx across the next call
movq -24(%rbp), %rax
subq $2, %rax
movq %rax, %rdi  # argument: n - 2
call fib
addq %rbx, %rax  # rax = fib(n-2) + fib(n-1)
.L3:
addq $24, %rsp  # release the locals
popq %rbx  # restore callee-saved rbx
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE5:
.size fib, .-fib
# gcc -O0 -S output for: int main(int argc, char **argv)   (System V ABI).
# In: edi = argc, rsi = argv.  Out: eax = 0.
.globl main
.type main, @function  # '@' is required on x86: '#' would start a comment
main:
.LFB6:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)  # spill argc below rsp; rsp is not adjusted (leaf function, red zone)
movq %rsi, -16(%rbp)  # spill argv
movl $0, %eax  # return value 0
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size main, .-main
.ident "GCC: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0"
.section .note.GNU-stack,"",@progbits
My question is:
Why do we decrement the stack pointer by 24, subq $24, %rsp? As I see it, we store one element only, first argument n in %rdi, on the stack after the initial two pushes. So why don't we just decrement the stack pointer by 8 and then move n to -8(%rbp)? So
subq $8, %rsp
movq %rdi, -8(%rbp)

GCC does not fully optimize with -O0, not even its stack use. (This may aid in debugging by making some of its use of the stack more transparent to humans. For example, objects a, b, and c may share a single stack location with -O3 if their active lifetimes (defined by uses in the program, not by the model of lifetime in the C standard) do not overlap, but may have separately reserved places in the stack with -O0, and that makes it easier for a human to see where a, b, and c are used in the assembly code. The wasted 16 bytes may be a side effect of this, as those spaces may be reserved for some purpose that this small function did not happen to use, such as space to save certain registers if needed.)
Changing optimization to -O3 results in GCC subtracting only eight from the stack pointer.

Variable to return or return directly?

I'm learning to program and sometimes I find that using a variable to return makes my code more readable.
I was wondering if these functions perform the same operations and are equally efficient.
CASE 1:
/* Case 1, variable form: stores a constant expression in a local, then returns the local. */
int Foo1()
{
int x = 5 + 6 + 7; // Return variable
return x;
}
/* Case 1, direct form: returns the same constant expression without a local. The parameter y is unused. */
int Foo2(int y)
{
return 5 + 6 + 7;
}
In this case I think that the initialization and sum occur at compile time so there's no difference between them.
CASE 2:
/* Case 2, variable form: computes y + 6 + 7 into a local, then returns the local. */
int Foo1(int y)
{
int x = y + 6 + 7; // Return variable
return x;
}
/* Case 2, direct form: returns y + 6 + 7 without an intermediate local. */
int Foo2(int y)
{
return y + 6 + 7;
}
But what happens in this case? It seems that the initialization occurs at execution time, so the program has to perform it.
Is returning the value directly faster than initializing a variable and then returning it? Should I always try to return values directly instead of using a variable to return?

You can easily try this yourself.
You can get the assembly from your compiler
Without optimization:
(gcc -S -O0 -o src.S src.c)
# gcc -O0 -S output, case 1 with a local variable: the constant 5+6+7 is already folded to 18.
.file "so_temp.c"
.text
.globl case1Foo1
.type case1Foo1, @function  # '@' is required on x86: '#' would start a comment
case1Foo1:
.LFB0:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movl $18, -4(%rbp)  # x = 18, stored to the stack
movl -4(%rbp), %eax  # return x, reloaded from the stack
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size case1Foo1, .-case1Foo1
# gcc -O0 -S output, case 1 returning directly: no store/reload through the stack.
.globl case1Foo2
.type case1Foo2, @function  # '@' is required on x86: '#' would start a comment
case1Foo2:
.LFB1:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movl $18, %eax  # return the folded constant 18
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size case1Foo2, .-case1Foo2
# gcc -O0 -S output, case 2 with a local variable.  In: edi = y.  Out: eax = y + 13.
.globl case2Foo1
.type case2Foo1, @function  # '@' is required on x86: '#' would start a comment
case2Foo1:
.LFB2:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movl %edi, -20(%rbp)  # spill y to the stack
movl -20(%rbp), %eax
addl $13, %eax  # y + 13 (6 + 7 already folded)
movl %eax, -4(%rbp)  # x = y + 13, stored to the stack
movl -4(%rbp), %eax  # return x, reloaded from the stack
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size case2Foo1, .-case2Foo1
# gcc -O0 -S output, case 2 returning directly.  In: edi = y.  Out: eax = y + 13.
.globl case2Foo2
.type case2Foo2, @function  # '@' is required on x86: '#' would start a comment
case2Foo2:
.LFB3:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)  # spill y to the stack
movl -4(%rbp), %eax
addl $13, %eax  # result stays in eax; no second store/reload
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size case2Foo2, .-case2Foo2
.ident "GCC: (Ubuntu 8.3.0-6ubuntu1) 8.3.0"
.section .note.GNU-stack,"",@progbits
There you can see that the Foo2 versions have a few instructions fewer than the Foo1 versions of the functions.
With optimization turned to O3:
(gcc -S -O3 -o src.S src.c)
# gcc -O3 -S output, case 1 with a local variable: reduced to loading the constant.
.file "so_temp.c"
.text
.p2align 4,,15  # align the function start to 16 bytes (skip at most 15)
.globl case1Foo1
.type case1Foo1, @function  # '@' is required on x86: '#' would start a comment
case1Foo1:
.LFB0:
.cfi_startproc
movl $18, %eax  # return 18; no frame and no stack traffic
ret
.cfi_endproc
.LFE0:
.size case1Foo1, .-case1Foo1
# gcc -O3 -S output, case 1 returning directly: same instructions as case1Foo1.
.p2align 4,,15  # align the function start to 16 bytes (skip at most 15)
.globl case1Foo2
.type case1Foo2, @function  # '@' is required on x86: '#' would start a comment
case1Foo2:
.LFB5:
.cfi_startproc
movl $18, %eax  # return 18
ret
.cfi_endproc
.LFE5:
.size case1Foo2, .-case1Foo2
# gcc -O3 -S output, case 2 with a local variable.  In: edi = y.  Out: eax = y + 13.
.p2align 4,,15  # align the function start to 16 bytes (skip at most 15)
.globl case2Foo1
.type case2Foo1, @function  # '@' is required on x86: '#' would start a comment
case2Foo1:
.LFB2:
.cfi_startproc
leal 13(%rdi), %eax  # eax = y + 13 in one instruction; the local x is gone
ret
.cfi_endproc
.LFE2:
.size case2Foo1, .-case2Foo1
# gcc -O3 -S output, case 2 returning directly: same instructions as case2Foo1.
.p2align 4,,15  # align the function start to 16 bytes (skip at most 15)
.globl case2Foo2
.type case2Foo2, @function  # '@' is required on x86: '#' would start a comment
case2Foo2:
.LFB7:
.cfi_startproc
leal 13(%rdi), %eax  # eax = y + 13
ret
.cfi_endproc
.LFE7:
.size case2Foo2, .-case2Foo2
.ident "GCC: (Ubuntu 8.3.0-6ubuntu1) 8.3.0"
.section .note.GNU-stack,"",@progbits
both versions are exactly the same.
Still I don't think that this is something you should optimize yourself.
In this case readable code should be preferred, especially as code normally isn't compiled with optimizations turned off.

Case 2 is more efficient, but is often not needed as the compiler is extremely likely to optimize case 1 into case 2.
Go for readability if it doesn't hurt performance (as in this case).

Any compiler of at least modest quality will, at even low levels of optimization (such as GCC’s -O1), compile these to the same code. For the most part, any correct optimization you can easily see will be performed by a good compiler.
The C standard does not require compilers to mindlessly compile code into instructions that perform the exact steps in the C source code. It only requires compilers to produce code that has the same effects. Those effects are defined in terms of observable behavior, which includes the output of the program, interactions with the user, and access to volatile objects (special objects you will learn about later). Compilers will eliminate things like intermediate variables as long as they can do so without changing the observable behavior.

Understanding pointer assignment in x86-64 Assembly Code

I am trying to understand assembly code. I am stuck in the portion where the pointer is assigned and the code after leaq command
This is my C code:
#include <stdio.h>
#include<stdlib.h>
/* Stores 50 in a local int and takes its address into a pointer; the pointer is never dereferenced. */
int main(){
int x=50;
int *y=&x; /* y holds the address of x */
return 0;
}
This is my corresponding ASSEMBLY code:
# gcc -S output for main() with "int x=50; int *y=&x;" (AT&T syntax, x86-64).
# The .def/.seh_* directives indicate a COFF target with SEH unwind info rather than ELF.
.file "AssemlyCode.c"
.def __main; .scl 2; .type 32; .endef
.text
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
pushq %rbp  # save the caller's frame pointer
.seh_pushreg %rbp
movq %rsp, %rbp  # establish this function's frame pointer
.seh_setframe %rbp, 0
subq $48, %rsp  # reserve 48 bytes of stack for this frame
.seh_stackalloc 48
.seh_endprologue
call __main  # NOTE(review): presumably the runtime's startup hook for main - confirm
movl $50, -12(%rbp)  # x = 50; x lives at -12(%rbp)
leaq -12(%rbp), %rax  # rax = &x (address computed from the frame pointer, no memory access)
movq %rax, -8(%rbp)  # y = &x; the full 64-bit pointer is stored at -8(%rbp)
movl $0, %eax  # return value 0
addq $48, %rsp  # release the stack reserved above
popq %rbp  # restore the caller's frame pointer
ret
.seh_endproc
.ident "GCC: (GNU) 5.4.0"

leaq -8(%rbp), %rax
movl %eax, -4(%rbp)
movl $0, %eax
addq $48, %rsp
popq %rbp
ret
leaq computes the address of variable x and places it in register rax. Variable x is an automatic variable on the stack, hence its address is calculated as an offset from the register that holds the stack frame pointer (rbp).
movl %eax, -4(%rbp) then stores the low 32 bits of that address (eax) into the stack slot at -4(%rbp), i.e. into the variable that receives &x; it does not save argc, since main takes no arguments here.
The next step is to put the return value of main (return 0) in the eax register.
The next two instructions are the function epilogue: they release the stack space that was reserved and restore the previous frame pointer register.
The last instruction is a simple return.

Why does GCC not add .section into the assembly

If you look at the second line of this program it just says ".text". When I write assembly programs I thought that you had to put ".section .text". Why does GCC omit the ".section"? I also noticed that it does include it before declaring rodata below: ".section .rodata".
Also just wondering what ".type sum, @function" does? I wrote an assembly function this morning without it and it executed fine.
# gcc -O0 -S output for sum (AT&T syntax, x86-64 ELF, System V ABI).
# In: xmm0, xmm1 = the two float arguments.  Out: eax = their product truncated to int.
# Note that, despite the name, the generated code multiplies (mulss) rather than adds.
.file "test.c"
.text
.globl sum
.type sum, @function  # sets the ELF symbol type; '@' is required on x86 ('#' starts a comment)
sum:
.LFB0:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
movss %xmm0, -4(%rbp)  # spill the first float argument
movss %xmm1, -8(%rbp)  # spill the second float argument
movss -4(%rbp), %xmm0
mulss -8(%rbp), %xmm0  # xmm0 = first * second
cvttss2si %xmm0, %eax  # convert to int with truncation; result returned in eax
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size sum, .-sum
# Format string for printf, placed in read-only data.
.section .rodata
.LC2:
.string "%d\n"
# gcc -O0 -S output for main: calls sum with two float constants and prints the int result.
.text
.globl main
.type main, @function  # '@' is required on x86: '#' would start a comment
main:
.LFB1:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
subq $16, %rsp  # reserve 16 bytes of locals
movss .LC0(%rip), %xmm1  # second float argument, loaded RIP-relative from .rodata
movss .LC1(%rip), %xmm0  # first float argument
call sum
movl %eax, -4(%rbp)  # store sum's int result in a local
movl -4(%rbp), %eax
movl %eax, %esi  # 2nd printf argument: the result
movl $.LC2, %edi  # 1st printf argument: address of the "%d\n" format string
movl $0, %eax  # al = 0: no vector registers are used by this variadic call
call printf
movl $0, %eax  # return value 0
leave  # rsp = rbp, then pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
# Float constants loaded by main, stored as raw 32-bit bit patterns.
.section .rodata
.align 4
.LC0:
.long 1092930765  # 0x4124CCCD, the IEEE-754 single closest to 10.3
.align 4
.LC1:
.long 1092825907  # 0x41233333, the IEEE-754 single closest to 10.2
.ident "GCC: (Ubuntu 4.9.2-10ubuntu13) 4.9.2"
.section .note.GNU-stack,"",@progbits

Collecting up some comments into an answer:
Before arbitrary section names were possible, .text, .data, and .bss were assembler directives. Now, you can write .section .text instead. This should all be documented in the GNU as manual. (linked to latest version).
.type sum, @function
sets some ELF symbol-type stuff. IDK if this matters for dynamic linking, but it doesn't for static linkage. There's a lot of stuff the compiler emits but that you don't actually need for your code to run. This is not a bad thing.
For the other things in gcc asm output, have a look at my answer to GCC Assembly Optimizations - Why are these equivalent?

Incrementing a variable through embedded assembly language

I am trying to understand how to embed assembly language in C (using gcc on x86_64 architecture). I wrote this program to increment the value of a single variable, but I am getting a garbage value as output. Any ideas why?
#include <stdio.h>
/* Tries to increment x with one inline-asm instruction and prints the result. */
int main(void) {
int x;
x = 4;
/* x is listed twice: as output operand %0 ("=r") and as a separate input operand %1.
   The template only mentions %0, so the input is never used by the instruction. */
asm("incl %0": "=r"(x): "r0"(x));
printf("%d", x);
return 0;
}
Thanks
Update The program is giving expected result on gcc 4.8.3 but not on gcc 4.6.3. I am pasting the assembly output of the non-working code:
# gcc 4.6.3 -S output for the inline-asm increment program (AT&T syntax, x86-64 ELF).
# It shows why the result is garbage: the input is loaded into eax, but the
# "incl %0" template was expanded with edx, which is never initialized.
.file "abc.c"
.section .rodata
.LC0:
.string "%d"
.text
.globl main
.type main, @function  # '@' is required on x86: '#' would start a comment
main:
.LFB0:
.cfi_startproc
pushq %rbp  # save the caller's frame pointer
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp  # establish this function's frame pointer
.cfi_def_cfa_register 6
pushq %rbx  # save callee-saved rbx; ebx is used below
subq $24, %rsp  # reserve 24 bytes of locals
movl $4, -20(%rbp)  # x = 4
movl -20(%rbp), %eax  # input operand: x is loaded into eax
incl %edx  # the asm statement: increments edx, which was never loaded with x
movl %edx, %ebx  # output operand copied to ebx
.cfi_offset 3, -24
movl %ebx, -20(%rbp)  # x = that (garbage) value
movl $.LC0, %eax  # address of the "%d" format string
movl -20(%rbp), %edx
movl %edx, %esi  # 2nd printf argument: x
movq %rax, %rdi  # 1st printf argument: format string
movl $0, %eax  # al = 0: no vector registers are used by this variadic call
call printf
movl $0, %eax  # return value 0
addq $24, %rsp  # release the locals
popq %rbx  # restore callee-saved rbx
popq %rbp  # restore the caller's frame pointer
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
.section .note.GNU-stack,"",@progbits

You don't need to say x twice; once is sufficient:
asm("incl %0": "+r"(x));
The +r says that the value will be input and output.
Your way, with separate inputs and output registers, requires that you take the input from %1, add one, and write the output to %0, but you can't do that with incl.
The reason it works on some compilers is because GCC is free to allocate both %0 and %1 to the same register, and appears to have done so in those cases, but it does not have to. Incidentally, if you want to prevent GCC allocating an input and output to the same register (say, if you want to initialize the output before using the input to calculate a final output), you need to use the & modifier.
The documentation for the modifiers is here.