Is memcpy really a function with a symbol? - c

This simple c:
#include <stdio.h>
#include <string.h>
int *add(int a, int b){
int ar[1];
int result = a+b;
memcpy(ar, &result, sizeof(int));
return ar;
int main(){
int a = add(1,2)[0];
is compiled into this:
.globl add
.type add, #function
pushq %rbp #
movq %rsp, %rbp #,
movl %edi, -20(%rbp) # a, a
movl %esi, -24(%rbp) # b, b
# a.c:5: int result = a+b;
movl -20(%rbp), %edx # a, tmp91
movl -24(%rbp), %eax # b, tmp92
addl %edx, %eax # tmp91, _1
# a.c:5: int result = a+b;
movl %eax, -8(%rbp) # _1, result
# a.c:6: memcpy(ar, &result, sizeof(int)); ---I SEE NO CALL INSTRUCTION---
movl -8(%rbp), %eax # MEM[(char * {ref-all})&result], _6
movl %eax, -4(%rbp) # _6, MEM[(char * {ref-all})&ar]
# a.c:7: return ar;
# lea -4(%rbp), %rax #--ONLY THIS IS CORRECT, NOT `0`
# a.c:8: }
popq %rbp #
.size add, .-add
.section .rodata
.string "%i\n"
.globl main
.type main, #function
pushq %rbp #
movq %rsp, %rbp #,
subq $16, %rsp #,
# a.c:11: int a = add(1,2)[0];
movl $2, %esi #,
movl $1, %edi #,
call add #
# a.c:11: int a = add(1,2)[0];
movl (%rax), %eax # *_1, tmp90
movl %eax, -4(%rbp) # tmp90, a
# a.c:12: printf("%i\n",a);
movl -4(%rbp), %eax # a, tmp91
movl %eax, %esi # tmp91,
leaq .LC0(%rip), %rdi #,
movl $0, %eax #,
call printf#PLT #
movl $0, %eax #, _6
# a.c:13: }
.size main, .-main
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",#progbits
Every function from the standard library, like printf or puts, is called through the GOT (i.e. the %rip register holds the address of the GOT). But memcpy is not; it appears as inline assembly instructions instead of a regular call to an address. So is memcpy even a symbol? If so, why is it not the operand of a call instruction? Is memcpy in the GOT? If so, what is the offset from the GOT to that symbol?

So first off, you have a bug:
$ cc -O2 -S test.c
test.c: In function ‘add’:
test.c:7:12: warning: function returns address of local variable
Returning the address of a local variable has undefined behavior, if and only if the caller uses that value; this is why your compiler generated code that returned a null pointer, which will crash the program if used but be harmless otherwise. In fact, my copy of GCC generates only this for add:
xorl %eax, %eax
because that treatment of the return value makes the other operations in add be dead code.
(The "only if used" restriction is also why my compiler generates a warning, not a hard error.)
Now, if I modify your program to have well-defined behavior, e.g.
#include <stdio.h>
#include <string.h>
void add(int *sum, int a, int b)
int result = a+b;
memcpy(sum, &result, sizeof(int));
int main(void)
int a;
add(&a, 1, 2);
return 0;
then I do indeed see assembly code in which the memcpy call has been replaced by inline code:
addl %edx, %esi
movl %esi, (%rdi)
This is a feature of many modern C compilers: they know what some of the C library's functions do, and can inline them when that makes sense. (You can see that in this case the generated code is both smaller and faster than it would have been with an actual call to memcpy.)
GCC lets me turn this feature off with a command-line option:
$ gcc -O2 -ffreestanding test.c
$ sed -ne '/^add:/,/cfi_endproc/{; /^\.LF[BE]/d; /\.cfi_/d; p; }' test.s
subq $24, %rsp
addl %edx, %esi
movl $4, %edx
movl %esi, 12(%rsp)
leaq 12(%rsp), %rsi
call memcpy#PLT
addq $24, %rsp
In this mode, the call to memcpy in add is treated the same as the call to printf in main. Your compiler may have similar options.


How is struct organized in assembly?

I am trying to figure out how the compiler pads the space between struct members. In this example:
struct s{
int a,b,c;
struct s get(int a){
struct s foo = {.a=a,.b=a+1,.c=a+2};
return foo;
is compiled with cc -S a.c:
.file "a.c"
.globl get
.type get, #function
pushq %rbp
movq %rsp, %rbp
movl %edi, -36(%rbp)
movl -36(%rbp), %eax
movl %eax, -24(%rbp)
movl -36(%rbp), %eax
addl $1, %eax
movl %eax, -20(%rbp)
movl -36(%rbp), %eax
addl $2, %eax
movl %eax, -16(%rbp)
movq -24(%rbp), %rax
movq %rax, -12(%rbp)
movl -16(%rbp), %eax
movl %eax, -4(%rbp)
movq -12(%rbp), %rax
movl -4(%rbp), %ecx
movq %rcx, %rdx
popq %rbp
.size get, .-get
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",#progbits
No optimization is used. The question is why is there -36(%rbp) used as first member "reference", when they are arranged sequentially in
.a == -24(%rbp)
.b == -20(%rbp)
.c == -16(%rbp)
There seems to be no need to make room at -36(%rbp), which the compiler uses here. Is it intentional (is it extra room, or does the compiler use -36(%rbp) as a "reference" to the first member)?
Also, at the end,
movq -24(%rbp), %rax #take first member
movq %rax, -12(%rbp) #place it randomly
movl -16(%rbp), %eax #take third member
movl %eax, -4(%rbp) #place it randomly
This does not make sense to me: it is not sequential with the initial struct, and the first and third members of the struct are copied to seemingly random places within the space that the function get has allocated.
What is the convention for structs?
The code you observe is a jumble of three different things: the actual layout of a struct s, the ABI specification of how to return structs from functions, and the anti-optimizations inserted by many compilers in their default mode (equivalent to -O0) to ensure that unsophisticated debuggers can find and change the values of variables while stopped at any breakpoint (see the question "Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?" for more about this).
You can cut out the second of these factors by having get write into a struct s * argument, instead of returning a struct by value, and the third by compiling with gcc -O2 -S instead of just gcc -S. (Also try -Og and -O1; the complex optimizations applied at -O2 can be confusing, too.) For instance:
$ cat test.c
struct s {
int a,b,c;
void get(int a, struct s *s)
s->a = a;
s->b = a+1;
s->c = a+2;
$ gcc -O2 -S test.c
$ cat test.s
.file "test.c"
.p2align 4
.globl get
.type get, #function
leal 1(%rdi), %eax
movl %edi, (%rsi)
addl $2, %edi
movl %eax, 4(%rsi)
movl %edi, 8(%rsi)
.size get, .-get
.ident "GCC: (Debian 9.3.0-13) 9.3.0"
.section .note.GNU-stack,"",#progbits
From this assembly language it should be clearer that a is at offset 0 within struct s, b is at offset 4, and c at offset 8.
Struct layout is specified by the "psABI" (processor-specific application binary interface) for each CPU architecture. You can read the psABI specs for x86 (i386 and x86-64) at https://gitlab.com/x86-psABIs. These also explain how structs are returned from functions. It's also important to know that the layout of a stack frame is only partially specified by the psABI. Some of the "random" offsets in your assembly dump are, in fact, arbitrarily chosen by the compiler.

How to read from a file in assembly x86

I have a code in c language that needs to be translated to assembly x86.
Here is the c code:
int rb (FILE *f){
int s;
char c;
s = fr(&c, 1, 1, f);
if (s <= 0) return -1;
return (int)c;
So far I got to this assembly code that gives me Segmentation fault:
pushl %ebp
movl %esp,%ebp
pushl 8(%ebp)
pushl $1
pushl $1
leal 12(%ebp), %eax
pushl %eax
call fr
jz ng
jns ex
pushl $1
negl %eax
popl %ebp
Can anyone help me to solve this? :)
Both GCC and Clang can generate the assembly for you. It might not always be easy to read, but this is how to do it:
Make the snippet you want to inspect compilable with no errors. Note, I've changed your example to take a pointer to an integer as an argument because in your example you were declaring a char on the stack and then returning it, i.e. Undefined Behaviour.
Create a file called foo.c with this in it:
#include <stdio.h>
extern size_t fr(void *restrict ptr, size_t size, size_t nitems, FILE *restrict stream);
int rb (FILE *f, int *c){
int s;
s = fr(c, 1, 1, f);
if (s <= 0) return -1;
return *c;
Compile it using the S flag to gcc ie
gcc-5 -S -O0 -Wall -pedantic -std=c11 foo.c
Then open the generated file foo.s:
.globl _rb
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq -24(%rbp), %rdx
movq -32(%rbp), %rax
movq %rdx, %rcx
movl $1, %edx
movl $1, %esi
movq %rax, %rdi
call _fr
movl %eax, -4(%rbp)
cmpl $0, -4(%rbp)
jg L2
movl $-1, %eax
jmp L3
movq -32(%rbp), %rax
movl (%rax), %eax
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
.... snipped
Now you have the x86 assembly for the code you wanted. Note that you can play around with various options to change the output; in particular, the optimization level will drastically change the output.

In x86, why do I have the same instruction two times, with reversed operands?

I am doing several experiments with x86 asm trying to see how common language constructs map into assembly. In my current experiment, I am trying to see specifically how C language pointers map to register-indirect addressing. I have written a fairly hello-world like pointer program:
#include <stdio.h>
main (void)
int value = 5;
int *int_val = &value;
printf ("The value we have is %d\n", *int_val);
return 0;
and compiled it to the following asm using: gcc -o pointer.s -fno-asynchronous-unwind-tables pointer.c:[1][2]
.file "pointer.c"
.section .rodata
.string "The value we have is %d\n"
.globl main
.type main, #function
;------- function prologue
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $5, -20(%rbp) ; This is where the value 5 is stored in `value` (automatic allocation)
leaq -20(%rbp), %rax ;; (GUESS) If I have understood correctly, this is where the address of `value` is
;; extracted, and stored into %rax
movq %rax, -16(%rbp) ;;
movq -16(%rbp), %rax ;; Why do I have two times the same instructions, with reversed operands???
movl (%rax), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
movq -8(%rbp), %rdx
xorq %fs:40, %rdx
je .L3
call __stack_chk_fail
.size main, .-main
.ident "GCC: (Ubuntu 4.9.1-16ubuntu6) 4.9.1"
.section .note.GNU-stack,"",#progbits
My issue is that I don't understand why it contains the instruction movq two times, with reversed operands. Could someone explain it to me?
[1]: I want to avoid having my asm code interspersed with cfi directives when I don't need them at all.
[2]: My environment is Ubuntu 14.10, gcc 4.9.1 (modified by ubuntu), and Gnu assembler (GNU Binutils for Ubuntu), configured to target x86_64-linux-gnu
Maybe it will be clearer if you reorganize your blocks:
leaq -20(%rbp), %rax ; &value
movq %rax, -16(%rbp) ; int_val
movq -16(%rbp), %rax ; int_val
movl (%rax), %eax ; *int_val
movl %eax, %esi ; printf-argument
movl $.LC0, %edi ; printf-argument (format-string)
movl $0, %eax ; no floating-point numbers
call printf
The first block performs int *int_val = &value;, the second block performs printf .... Without optimization, the blocks are independent.
Since you're not doing any optimization, gcc creates very simple-minded code that does each statement in the program one at a time without looking at any other statement. So in your example, it stores a value into the variable int_val, and then the very next instruction reads that variable again as part of the next statement. In both cases, it is using %rax as the temporary to hold the value, as that's the first register generally used for things.

assembly code of the c function

I'm trying to understand the assembly code of this C function. I could not understand why andl $-16 is done in main. Is it for allocating space for the local variables? If so, why is subl $32 also done in main?
I also could not understand the disassembly of func1. As I have read, the stack grows from higher addresses to lower addresses on 8086 processors. So why are the parameters accessed at positive offsets from ebp, and not at negative offsets? The space needed inside func1 is 3 local variables + return address + saved registers, so it should be 20 bytes, but why is it 24? (subl $24, %esp)
int add(int a, int b){
int res = 0;
res = a + b;
return res;
int func1(int a){
int s1,s2,s3;
s1 = add(a,a);
s2 = add(s1,a);
s3 = add(s1,s2);
return s3;
int main(){
int a,b;
a = 1;b = 2;
b = func1(a);
printf("\n a : %d b : %d \n",a,b);
return 0;
assembly code :
.file "sample.c"
.globl add
.type add, #function
pushl %ebp
movl %esp, %ebp
subl $16, %esp
movl $0, -4(%ebp)
movl 12(%ebp), %eax
movl 8(%ebp), %edx
leal (%edx,%eax), %eax
movl %eax, -4(%ebp)
movl -4(%ebp), %eax
.size add, .-add
.globl func1
.type func1, #function
pushl %ebp
movl %esp, %ebp
subl $24, %esp
movl 8(%ebp), %eax
movl %eax, 4(%esp)
movl 8(%ebp), %eax
movl %eax, (%esp)
call add
movl %eax, -4(%ebp)
movl 8(%ebp), %eax
movl %eax, 4(%esp)
movl -4(%ebp), %eax
movl %eax, (%esp)
call add
movl %eax, -8(%ebp)
movl -8(%ebp), %eax
movl %eax, 4(%esp)
movl -4(%ebp), %eax
movl %eax, (%esp)
call add
movl %eax, -12(%ebp)
movl -12(%ebp), %eax
.size func1, .-func1
.section .rodata
.string "\n a : %d b : %d \n"
.globl main
.type main, #function
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
movl $1, 28(%esp)
movl $2, 24(%esp)
movl 28(%esp), %eax
movl %eax, (%esp)
call func1
movl %eax, 24(%esp)
movl $.LC0, %eax
movl 24(%esp), %edx
movl %edx, 8(%esp)
movl 28(%esp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call printf
movl $0, %eax
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5"
.section .note.GNU-stack,"",#progbits
The andl $-16, %esp aligns the stack pointer to a multiple of 16 bytes, by clearing the low four bits.
The only places where positive offsets are used with (%ebp) are parameter accesses.
You did not state what your target platform is or what switches you used to compile with. The assembly code shows some Ubuntu identifier has been inserted, but I am not familiar with the ABI it uses, beyond that it is probably similar to ABIs generally used with the Intel x86 architecture. So I am going to guess that the ABI requires 8-byte alignment at routine calls, and so the compiler makes the stack frame of func1 24 bytes instead of 20 so that 8-byte alignment is maintained.
I will further guess that the compiler aligned the stack to 16 bytes at the start of main as a sort of “preference” in the compiler, in case it uses SSE instructions that prefer 16-byte alignment, or other operations that prefer 16-byte alignment.
So, we have:
In main, the andl $-16, %esp aligns the stack to a multiple of 16 bytes as a compiler preference. Inside main, 28(%esp) and 24(%esp) refer to temporary values the compiler saves on the stack, while 8(%esp), 4(%esp), and (%esp) are used to pass parameters to func1 and printf. We see from the fact that the assembly code calls printf but it is commented out in your code that you have pasted C source code that is different from the C source code used to generate the assembly code: This is not the correct assembly code generated from the C source code.
In func1, 24 bytes are allocated on the stack instead of 20 to maintain 8-byte alignment. Inside func1, the parameter a is accessed through 8(%ebp) (4(%ebp) holds the return address). Locations from -12(%ebp) to -4(%ebp) are used to hold values of your variables. 4(%esp) and (%esp) are used to pass parameters to add.
Here is the stack frame of func1:
-4(%ebp) = 20(%esp): s1.
-8(%ebp) = 16(%esp): s2.
-12(%ebp) = 12(%esp): s3.
-16(%ebp) = 8(%esp): Unused padding.
-20(%ebp) = 4(%esp): Passes second parameter of add.
-24(%ebp) = 0(%esp): Passes first parameter of add.
I would suggest working through this with the output of objdump -S which will give you interlisting with the C source.

Variable swap with and without auxiliary variable - which is faster?

I guess you have all heard of the 'swap problem'; SO is full of questions about it.
The version of the swap without use of a third variable is often considered to be faster since, well, you have one variable less. I wanted to know what was going on behind the curtains and wrote the following two programs:
int main () {
int a = 9;
int b = 5;
int swap;
swap = a;
a = b;
b = swap;
return 0;
and the version without third variable:
int main () {
int a = 9;
int b = 5;
a ^= b;
b ^= a;
a ^= b;
return 0;
I generated the assembly code using clang and got this for the first version (that uses a third variable):
movq %rsp, %rbp
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -8(%rbp), %ecx
movl %ecx, -16(%rbp)
movl -12(%rbp), %ecx
movl %ecx, -8(%rbp)
movl -16(%rbp), %ecx
movl %ecx, -12(%rbp)
popq %rbp
and this for the second version (that does not use a third variable):
movq %rsp, %rbp
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
movl -8(%rbp), %ecx
movl -12(%rbp), %edx
xorl %ecx, %edx
movl %edx, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
popq %rbp
The second one is longer, but I don't know much about assembly code, so I have no idea whether that means it is slower. I'd like to hear the opinion of someone more knowledgeable about it.
Which of the above versions of a variable swap is faster and takes less memory?
Look at some optimised assembly. From
void swap_temp(int *restrict a, int *restrict b){
int temp = *a;
*a = *b;
*b = temp;
void swap_xor(int *restrict a, int *restrict b){
*a ^= *b;
*b ^= *a;
*a ^= *b;
gcc -O3 -std=c99 -S -o swapping.s swapping.c produced
.file "swapping.c"
.p2align 4,,15
.globl swap_temp
.type swap_temp, #function
movl (%rdi), %eax
movl (%rsi), %edx
movl %edx, (%rdi)
movl %eax, (%rsi)
.size swap_temp, .-swap_temp
.p2align 4,,15
.globl swap_xor
.type swap_xor, #function
movl (%rsi), %edx
movl (%rdi), %eax
xorl %edx, %eax
xorl %eax, %edx
xorl %edx, %eax
movl %edx, (%rsi)
movl %eax, (%rdi)
.size swap_xor, .-swap_xor
.ident "GCC: (SUSE Linux) 4.5.1 20101208 [gcc-4_5-branch revision 167585]"
.section .comment.SUSE.OPTs,"MS",#progbits,1
.string "Ospwg"
.section .note.GNU-stack,"",#progbits
To me, swap_temp looks as efficient as can be.
The problem with XOR swap trick is that it's strictly sequential. It may seem deceptively fast, but in reality, it is not. There's an instruction called XCHG that swaps two registers, but this can also be slower than simply using 3 MOVs, due to its atomic nature. The common technique with temp is an excellent choice ;)
To get an idea of the cost imagine that every command has a cost to be performed and also the indirect addressing has its own cost.
movl -12(%rbp), %ecx
This line will need something like a time unit for accessing the ecx register,
one time unit for accessing rbp, another one for applying the offset (-12) and more time
units (let's say arbitrarily 3) for moving the value from the memory location at
-12(%rbp) into the ecx register (in AT&T syntax the source operand comes first and the
destination second).
If you count all the operations in every line, over all the lines, the second method is for sure costlier than the first one.
