__builtin_return_address returns the wrong address - c

I'm trying to get the address of the end of a function, so I use __builtin_return_address, but it returns a value higher than the address of the next declared function, here main:
uintptr_t foo( void )
return (uintptr_t)__builtin_extract_return_addr(__builtin_return_address(0));
int main( int argc, char **argv )
printf("foo:\t%zu\nmain:\t%zu\n", foo(), (uintptr_t)main);
return (0);
foo: 94718524985719
main: 94718524985687
But how is that possible? How could main's start address be lower than foo's return address?

__builtin_extract_return_addr(__builtin_return_address(0)) gives the address at which execution continues when the current function returns. In your snippet, this will be somewhere in main since the call to foo is in main.
32 bytes after the start of main makes sense.
When I compiled your code[1], it produced the following assembly and a difference of 26:
push rbp
mov rbp, rsp
mov rax, QWORD PTR [rbp+8]
pop rbp
.string "foo:\t%lu\nmain:\t%lu\n"
push rbp --- <--- Address of this
mov rbp, rsp |
push rbx |
sub rsp, 24 | 26 bytes
mov DWORD PTR [rbp-20], edi |
mov QWORD PTR [rbp-32], rsi |
mov ebx, OFFSET FLAT:main |
call foo ---
mov rdx, rbx <--- Address of this
mov rsi, rax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
mov rbx, QWORD PTR [rbp-8]
I corrected the format pattern.
#include <inttypes.h>
printf("foo:\t%" PRIuPTR "\nmain:\t%" PRIuPTR "\n", foo(), (uintptr_t)main);


How to dereference a pointer multiple times in ASMx64?

It's all in the title: I have an int** as my function argument and I want to display the int using assembly, so what I've done is:
global bar
section .text:
mov rdx, [rdi]
mov rdi, [rdx]
mov rdx, [rdi]
mov rdi, [rdx]
call rsi
My C code:
#include <stdio.h>
void bar(int **i, void (*f)()); //assembly function prototype
void foo(int i)
printf("%d\n", i);
int main(void)
int i = 3;
int *ptr = &i;
bar(&ptr, &foo);
return (0);
It segfaults and lldb reports "invalid address" at foo, so I think I'm not dereferencing the right way. I'm stuck, because I need to do this for a larger function. Thanks for the help.
Let me trace the assembly code:
global bar
section .text:
mov rdx, [rdi] // rdi = &ptr, rdx = *&ptr = ptr
mov rdi, [rdx] // rdx = ptr, rdi = *ptr = i
mov rdx, [rdi] // rdi = i, rdx = *i = (invalid)
mov rdi, [rdx]
call rsi
This suggests actually you have to pass int****, not int**, as the first argument because you are doing dereference 4 times.
It will be like this:
#include <stdio.h>
void bar(int ****i, void (*f)()); //assembly function prototype
void foo(int i)
printf("%d\n", i);
int main(void)
int i = 3;
int *ptr = &i;
int **pptr = &ptr;
int ***ppptr = &pptr;
bar(&ppptr, &foo);
return (0);
Also stack pointer should be 16-byte aligned on function call in x86-64, so the assembly function bar should be (for example):
global bar
section .text:
mov rdx, [rdi]
mov rdi, [rdx]
mov rdx, [rdi]
mov rdi, [rdx]
sub rsp, 8 // adjust stack pointer
call rsi
add rsp, 8 // restore stack pointer
The alignment is established before the call, and the call to bar itself pushes 8 bytes (the return address), so another 8 bytes should be subtracted from the stack pointer to retain 16-byte alignment at the next call.
If you want to use int** as the first argument, do dereferences (memory accesses) only 2 times.
global bar
section .text:
mov rdx, [rdi]
mov rdi, [rdx]
sub rsp, 8
call rsi
add rsp, 8
Another thing you may want to do is
Create stack frame
Store the argument on the stack memory for later use
global bar
section .text:
push rbp // create stack frame
mov rbp, rsp
sub rsp, 16 // create region for local variables (note 16-byte alignment)
mov [rbp-8], rdi // save the argument to the memory
mov rdx, [rdi]
mov rdi, [rdx]
mov rdi, [rbp-8] // restore the argument from the memory
mov rdx, [rdi]
mov rdi, [rdx]
call rsi
leave // destruct stack frame
I think you might want this:
mov rax, [rdi]
mov edi, [rax]
call rsi
which would match the void bar(int **i, void (*f)()) prototype

Understanding pointers in assembler from machine's view

Here is a basic program I written on the godbolt compiler, and it's as simple as:
void main()
int a = 10;
int *p = &a;
printf("%d", *p);
The results after compilation I get:
.string "%d"
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-12], 10
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
Question: I understand pushing rbp, creating the stack frame by reserving a 16-byte block, how a value is moved from a register to a stack location and vice versa, and that the job of LEA is to compute an address. The part I don't get is this:
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
LEA -> gets the address of rbp-12 into rax,
then the value in rax (the address of rbp-12) is moved into rbp-8,
but the next line again says: move the value at rbp-8 into rax. This seems redundant. Then, again, the value of rax is moved to eax. I don't understand the amount of work here. Why couldn't I have done
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov eax, QWORD PTR [rbp-8]
and be done with it? In the original code, the address of rbp-12 is stored in rax, then rax is stored to rbp-8, then rbp-8 is loaded back into rax, and then rax is stored into eax. Couldn't we have just copied rbp-8 directly to eax? I guess not, but my question is: why?
I know pointers involve dereferencing, and I understand how LEA helps grab the address of rbp-12, but in the following parts I completely lost track of when it switched to grabbing values from addresses. After that, I didn't understand any of the asm lines.
You're seeing very un-optimized code. Here's my line-by-line interpretation:
.string "%d" ; Format string for printf
push rbp ; Save original base pointer
mov rbp, rsp ; Set base pointer to beginning of stack frame
sub rsp, 16 ; Allocate space for stack frame
mov DWORD PTR [rbp-12], 10 ; Initialize variable 'a'
lea rax, [rbp-12] ; Load effective address of 'a'
mov QWORD PTR [rbp-8], rax ; Store address of 'a' in 'p'
mov rax, QWORD PTR [rbp-8] ; Load 'p' into rax (even though it's already there - heh!)
mov eax, DWORD PTR [rax] ; Load 32-bit value of '*p' into eax
mov esi, eax ; Load value to print into esi
mov edi, OFFSET FLAT:.LC0 ; Load format string address into edi
mov eax, 0 ; Zero out eax (not sure why -- likely printf call protocol)
call printf ; Make the printf call
nop ; No-op (not sure why)
leave ; Remove the stack frame
ret ; Return
Compilers, when not optimizing, generate code like this as they parse the code you gave them. It's doing a lot of unnecessary stuff, but it is quicker to generate and makes using a debugger easier.
Compare this with the optimized code (-O2):
.string "%d" ; Format string for printf
mov esi, 10 ; Don't need those variables -- just a 10 to pass to printf!
mov edi, OFFSET FLAT:.LC0 ; Load format string address into edi
xor eax, eax ; It's a few cycles faster to xor a register with itself than to load an immediate 0
jmp printf ; Just jmp to printf -- it will handle the return
The optimizer found that the variables weren't necessary, so no stack frame is created. Nothing is left but the printf call! And that's done as a jmp since nothing else need be done here when the printf is complete.

How are allocated arrays declared in a loop?

I'm puzzled over this function.
int i;
for(i = 1; i<10; i++){
int arr[i];
return 0;
How can the space grow in a bounded (by ESP) stack memory?
Is there a sort of compilation trick?
EDIT for explanation:
Shouldn't the stack be something like that?
0 ---> val of i uninitialized
-4 ---> arr[0] uninitialized
and after the first loop
0 ---> val of i uninitialized
-4 ---> arr[1] uninitialized
-8 ---> arr[0] uninitialized
I'm tempted to say: is ESP moving below each iteration of the loop?
How can the space grow in a bounded size stack memory?
You are referring to the space of char arr - its space does not grow. It's a local variable inside the scope of the for loop, so every time the loop runs with a new i, it's a brand new char arr.
On every iteration, stack space is allocated for the array and then deallocated.
A bit different example
#include "stdio.h"
#include "string.h"
int h(int x)
for(int i = 1; i<x; i++){
char arr[i];
memset(arr, i, sizeof(arr));
return 0;
int main()
in the compiled code
.string "%d\n"
push rbp
mov rbp, rsp
push r13
push r12
mov r12d, edi
push rbx
mov ebx, 1
push rax
cmp r12d, ebx
jle .L6
lea rax, [rbx+15]
mov r13, rsp
mov ecx, ebx
mov rsi, rbx
and rax, -16
sub rsp, rax
mov eax, ebx
inc rbx
mov rdi, rsp
rep stosb
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
mov rsp, r13
jmp .L2
lea rsp, [rbp-24]
xor eax, eax
pop rbx
pop r12
pop r13
pop rbp
push rax
mov edi, 50
call h
xor eax, eax
pop rdx
lines 15,19 & 20 allocate the space
and then line 28 deallocates the space for the array
Is there a sort of compilation trick?
Yes, sort of. It uses VLAs (https://en.wikipedia.org/wiki/Variable-length_array)
Godbolt is very useful for inspecting things like this:
As you can see the -Wvla warning is indeed triggered for the line in question.

Why does this generated assembly code seem to contain nonsense? [duplicate]

This question already has an answer here:
Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?
(1 answer)
Closed 3 years ago.
I used https://godbolt.org/ with "x86-64 gcc 9.1" to compile the following C code to understand why passing a pointer to a local variable as a function argument works. Now I have difficulty understanding some of the steps.
I commented on the lines I have difficulties with.
void printStr(char* cpStr) {
printf("str: %s", cpStr);
int main(void) {
char str[] = "abc";
return 0;
.string "str: %s"
push rbp
mov rbp, rsp
sub rsp, 16 ; why allocate 16 bytes when using it just for the pointer to str[0] which is 4 bytes long?
mov QWORD PTR [rbp-8], rdi ; why copy rdi to the stack...
mov rax, QWORD PTR [rbp-8] ; ... just to copy it into rax again? Also rax seems to already contain the pointer to str[0] (see *)
mov rsi, rax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
push rbp
mov rbp, rsp
sub rsp, 16 ; why allocate 16 bytes when "abc" is just 4 bytes long?
mov DWORD PTR [rbp-4], 6513249
lea rax, [rbp-4] ; pointer to str[0] copied into rax (*)
mov rdi, rax ; why copy the pointer to str[0] to rdi?
call printStr
mov eax, 0
Thanks to the help of Jester I could solve my confusion. The following code is compiled with the "-O1" flag of GCC (for me the best optimization level to understand what's going on):
.string "str: %s"
sub rsp, 8
; now the call to printf gets prepared, rdi = first argument, rsi = second argument
mov rsi, rdi ; move str[0] to rsi
mov edi, OFFSET FLAT:.LC0 ; move address of static string literal "str: %s" to edi
mov eax, 0 ; set eax to the number of vector registers used, because printf is a varargs function
call printf
add rsp, 8
sub rsp, 24
mov DWORD PTR [rsp+12], 6513249 ; create string "abc" on the stack
lea rdi, [rsp+12] ; move address of str[0] (pointer to 'a') to rdi (first argument for printStr)
call printStr
mov eax, 0
add rsp, 24
As Jester said, the 16 bytes were allocated for alignment. There is a good post on Stack Overflow which explains this here.
There is a post on Stack Overflow which explains why al is zeroed before a call to a varargs function here.

incrementing struct members

Say I have a struct defined as follows
struct my_struct
int num;
Here I have a pointer to my_struct and I want to do an increment on num
void foo(struct my_struct* my_ptr)
// increment num
// method #1
// method #2
// method #3
Do these 3 ways of incrementing num do the same thing?
While we're at it, is it true that pre-increment is more efficient than post-increment?
First two will have the same effect (when on a line on their own like that), but the third method isn't valid C code (you can't put the ++ there).
As for efficiency, there is no difference. The difference you may have heard people talking about is when, in C++, you increment a non-pointer data type, such as an iterator. In some cases, pre-increment can be faster there.
You can see the generated code using GCC Explorer.
void foo(struct my_struct* my_ptr)
void bar(struct my_struct* my_ptr)
foo(my_struct*): # #foo(my_struct*)
incl (%rdi)
bar(my_struct*): # #bar(my_struct*)
incl (%rdi)
As you can see, there's no difference whatsoever.
The only possible difference between the first two is when you use them in expressions:
my_ptr->num = 0;
int x = my_ptr->num++; // x = 0
my_ptr->num = 0;
int y = ++my_ptr->num; // y = 1
If your only intention is to increment the value of num, then the 1st and 2nd methods will yield the same intended result for the caller.
However, if you change your code to the following, you can see the difference between the code generated by gcc (assembly level code):
struct my_struct
int num;
void foo(struct my_struct* my_ptr)
printf("\nPost Increment: %d", my_ptr->num++);
int main()
struct my_struct a;
a.num = 10;
Now compile it using: gcc -masm=intel -S structTest.c -o structTest.s
This asks gcc to generate the assembly code:
Open structTest.s in a text editor.
push rbp
mov rbp, rsp
sub rsp, 16
**mov QWORD PTR [rbp-8], rdi**
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov edx, eax
**lea ecx, [rax+1]**
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], ecx
mov eax, OFFSET FLAT:.LC0
mov esi, edx
mov rdi, rax
mov eax, 0
call printf
push rbp
mov rbp, rsp
sub rsp, 16
**mov DWORD PTR [rbp-16], 10
lea rax, [rbp-16]
mov rdi, rax
call foo**
And when you change the operation to pre-increment, the following code is generated:
push rbp
mov rbp, rsp
sub rsp, 16
**mov QWORD PTR [rbp-8], rdi**
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
**lea edx, [rax+1]**
mov rax, QWORD PTR [rbp-8]
**mov DWORD PTR [rax], edx**
mov rax, QWORD PTR [rbp-8]
**mov edx, DWORD PTR [rax]**
mov eax, OFFSET FLAT:.LC0
mov esi, edx
mov rdi, rax
mov eax, 0
call printf
So, you would see that in the second case, the compiler increments the num value and passes on this num value to printf().
In terms of performance, I would expect the post-increment to be more efficient in this unoptimized output, since the memory locations are touched fewer times.
The important lines have been marked between ** in the above code.
