This question already has answers here:
Why does the x86-64 / AMD64 System V ABI mandate a 16 byte stack alignment?
(1 answer)
What does it mean to align the stack?
(6 answers)
Closed 4 years ago.
In the book CSAPP, 3.7.5 Local Storage in Registers, there is a calling function:
/* Calls Q on y first, then on x, and returns the sum of the two results. */
long P(long x, long y)
{
long u = Q(y); /* u has to survive the second call to Q */
long v = Q(x);
return u + v;
}
and the Generated assembly code for the calling function is:
# P: x arrives in %rdi and y in %rsi; returns Q(y) + Q(x) in %rax.
P:
pushq %rbp # save %rbp; it is used below to hold x across the first call
pushq %rbx # save %rbx; it is used below to hold the result of Q(y)
subq $8, %rsp Align stack frame
movq %rdi, %rbp # keep x in %rbp
movq %rsi, %rdi # argument for the first call = y
call Q
movq %rax, %rbx # keep Q(y) in %rbx
movq %rbp, %rdi # argument for the second call = x
call Q
addq %rbx, %rax # return value = Q(x) + Q(y)
addq $8, %rsp # release the 8 bytes reserved by the subq above
popq %rbx # restore in reverse order of the pushes
popq %rbp
ret
I can't understand line 3, subq $8, %rsp. The book says it is used to align the stack frame. Why does the machine need to align the stack frame here?
Related
I'm trying to implement a thread library using assembly code to save the registers in a struct tcb, get the new thread from a queue, and load its tcb into the registers.
I'm using all 16 general-purpose registers and eflags. However, I'm facing some difficulties, because the assembly code is not working properly. It keeps failing with a segmentation fault.
That's what I've done in assembly to switch context:
// change_context: stores the current registers, flags and stack pointer in
// the block pointed to by `tcb`, calls get_next_thread, then reloads the same
// set from the block pointed to by `current_running` and returns on the
// reloaded stack.
change_context:
// Saving current context
pushfq // flags are pushed first
pushq %rax // park %rax on the stack so it can be reused as the base pointer
movq tcb, %rax // %rax = value stored at `tcb`
popq 0(%rax) // offset 0: the parked %rax value
movq %rcx, 8(%rax)
movq %rdx, 16(%rax)
movq %rbx, 24(%rax)
movq %rsi, 32(%rax)
movq %rdi, 40(%rax)
movq %rbp, 48(%rax)
movq %r8, 56(%rax)
movq %r9, 64(%rax)
movq %r10, 72(%rax)
movq %r11, 80(%rax)
movq %r12, 88(%rax)
movq %r13, 96(%rax)
movq %r14, 104(%rax)
movq %r15, 112(%rax)
popq 120(%rax) // offset 120: the flags pushed by pushfq
movq %rsp, 128(%rax) // offset 128: %rsp, back at its value on entry (both pushes have been popped)
// Find new context
call get_next_thread
// Restoring context
// NOTE(review): the save path reads `tcb` while the restore path reads
// `current_running`; presumably get_next_thread updates the latter - confirm.
movq current_running, %rax
movq 8(%rax), %rcx
movq 16(%rax), %rdx
movq 24(%rax), %rbx
movq 32(%rax), %rsi
movq 40(%rax), %rdi
movq 48(%rax), %rbp
movq 56(%rax), %r8
movq 64(%rax), %r9
movq 72(%rax), %r10
movq 80(%rax), %r11
movq 88(%rax), %r12
movq 96(%rax), %r13
movq 104(%rax), %r14
movq 112(%rax), %r15
pushq 120(%rax) // saved flags travel through the stack that is still active here
popfq
movq 128(%rax), %rsp // switch to the saved stack pointer
pushq 0(%rax) // %rax is reloaded last because it is the base pointer; this writes below the new %rsp
popq %rax
ret // pops the return address from the top of the reloaded stack
'tcb' is a pointer to current thread control block and it has the following face:
/* Thread control block. The field order fixes the byte offsets that
   change_context uses when saving and restoring a context. */
struct _tcb{
uint64_t registers[15]; /* offsets 0..119: %rax, %rcx, %rdx, %rbx, %rsi, %rdi, %rbp, %r8..%r15 */
uint64_t flags; /* offset 120: saved flags */
uint64_t stack_ptr; /* offset 128: saved %rsp */
uint64_t *stack; /* not touched by change_context */
};
When creating a thread, I initialize its tcb with zeros, except for the %rdi slot, which holds void *arg, and stack_ptr, which points to the last position of the stack, where I store the address of start_routine.
The thread creation function is similar to the default from "pthread.h"
int create_thr(pthread_t *thread,
void *(*start_routine)(void *),
void *arg);
The only difference is that I'm not using attributes.
I've debugged it with valgrind and gdb. I found that the first complaint comes from the line "popq 0(%rax)"; it says "invalid write of size 8". Later the program gets a segmentation fault in functions like printf and usleep. I'm using 64-bit Linux.
Any ideas?
I have the C code:
/* Doubly recursive Fibonacci: returns 1 for n < 2, otherwise fib(n-1) + fib(n-2). */
long fib(long n) {
if (n < 2) return 1;
return fib(n-1) + fib(n-2);
}
/* Does nothing and returns 0; fib is not called from here. */
int main(int argc, char** argv) {
return 0;
}
which I compiled by running gcc -O0 -fno-optimize-sibling-calls -S file.c yielding assembly code that has not been optimized:
.file "long.c"
.text
.globl fib
.type fib, @function
# long fib(long n): n arrives in %rdi, the result is returned in %rax.
fib:
.LFB5:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx # %rbx holds fib(n-1) across the second call
subq $24, %rsp # 8 (return address) + 8 + 8 + 24 = 48, so %rsp is a multiple of 16 at each call below
.cfi_offset 3, -24
movq %rdi, -24(%rbp) # spill n to its stack slot
cmpq $1, -24(%rbp)
jg .L2 # n > 1: take the recursive path
movl $1, %eax # n < 2: return 1
jmp .L3
.L2:
movq -24(%rbp), %rax
subq $1, %rax
movq %rax, %rdi # argument = n - 1
call fib
movq %rax, %rbx # keep fib(n-1)
movq -24(%rbp), %rax
subq $2, %rax
movq %rax, %rdi # argument = n - 2
call fib
addq %rbx, %rax # fib(n-2) + fib(n-1)
.L3:
addq $24, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE5:
.size fib, .-fib
.globl main
.type main, @function
# int main(int argc, char **argv): argc arrives in %edi, argv in %rsi; returns 0.
main:
.LFB6:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp) # argc is stored below %rsp; no space is reserved because main makes no calls
movq %rsi, -16(%rbp) # argv is stored the same way
movl $0, %eax # return value 0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size main, .-main
.ident "GCC: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0"
.section .note.GNU-stack,"",@progbits
My question is:
Why do we decrement the stack pointer by 24, subq $24, %rsp? As I see it, we store one element only, first argument n in %rdi, on the stack after the initial two pushes. So why don't we just decrement the stack pointer by 8 and then move n to -8(%rbp)? So
subq $8, %rsp
movq %rdi, -8(%rbp)
GCC does not fully optimize with -O0, not even its stack use. (This may aid in debugging by making some of its use of the stack more transparent to humans. For example, objects a, b, and c may share a single stack location with -O3 if their active lifetimes (defined by uses in the program, not by the model of lifetime in the C standard) do not overlap, but may have separately reserved places on the stack with -O0, and that makes it easier for a human to see where a, b, and c are used in the assembly code. The wasted 16 bytes may be a side effect of this, as those spaces may be reserved for some purpose that this small function did not happen to use, such as space to save certain registers if needed.)
Changing optimization to -O3 results in GCC subtracting only eight from the stack pointer.
C code:
/* Builds an array of n pointers, points p[0] at the local i and p[1..n-1] at q,
   then returns the long that p[idx] points to. */
long vframe(long n, long idx, long *q) {
long i;
long *p[n]; /* variable-length array: its size depends on the argument n */
p[0] = &i; /* i's address is taken, so i needs a memory location */
for (i = 1; i < n; i++)
p[i] = q;
return *p[idx];
}
Portions of generated assembly code:
long vframe(long n, long idx, long *q)
n in %rdi, idx in %rsi, q in %rdx
Only portions of code shown
vframe:
pushq %rbp Save old %rbp
movq %rsp, %rbp Set frame pointer
subq $16, %rsp Allocate space for i (%rsp = s1)
leaq 22(,%rdi,8), %rax
andq $-16, %rax
subq %rax, %rsp Allocate space for array p (%rsp = s2)
leaq 7(%rsp), %rax
shrq $3, %rax
leaq 0(,%rax,8), %r8 Set %r8 to &p[0]
movq %r8, %rcx Set %rcx to &p[0] (%rcx = p)
...
Code for initialization loop
i in %rax and on stack, n in %rdi, p in %rcx, q in %rdx
.L3: loop:
movq %rdx, (%rcx,%rax,8) Set p[i] to q
addq $1, %rax Increment i
movq %rax, -8(%rbp) Store on stack
.L2:
movq -8(%rbp), %rax Retrieve i from stack
cmpq %rdi, %rax Compare i:n
jl .L3 If <, goto loop
...
Code for function exit
leave Restore %rbp and %rsp
ret Return
In the book the author says:
The leaq instruction in
leaq 22(,%rdi,8), %rax
computes the value 8n + 22, which is then rounded down to the nearest multiple of 16 by the andq instruction in
andq $-16, %rax
The resulting value will be 8n + 8 when n is odd and 8n + 16 when n is even, and this value is subtracted from s1 to give s2.
What puzzles me is the 8n + 22. Why must it be 22, and not 16, 17, 18, 19, 20, 21, or 23?
I ran into the same question when I read this section.
I tried compiling the source code with clang using the command:
clang -Og -S source.c -o source.s
on both Ubuntu 18.04 and Windows 10,
and the assembly file shows:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movq %rsp, %r8
movq %rsp, %r9
leaq 15(,%rdi,8), %r10
andq $-16, %r10
movq %r9, %rax
...
clang chooses 15, which is not even in the range 16 to 23.
I read the x86-64 psABI, version 1.0;
section 3.2.2, The Stack Frame, says:
The end of the input argument area shall be aligned on a 16 (32 or 64, if
__m256 or __m512 is passed on stack) byte boundary. In other words, the value
(%rsp + 8) is always a multiple of 16 (32 or 64) when control is transferred to
the function entry point. The stack pointer, %rsp, always points to the end of the
latest allocated stack frame.
So I think whether the number is 22 or 15 is not the important thing;
the number only affects the sizes of e1 and e2.
clang chooses 15, so when n is even, e1 and e2 are smaller than in the gcc version.
The key point is to keep the end of the stack frame 16-byte aligned,
that is, to keep the value of s2 - s1 a multiple of 16 in a variable-size stack frame.
I know that OS X requires 16-byte stack alignment, but I don't really understand why it is causing an error here.
All I am doing here is passing an object size (which is 24) in %rdi and calling malloc. Does this error mean I have to ask for 32 bytes?
And the error message is:
libdyld.dylib`stack_not_16_byte_aligned_error:
-> 0x7fffc12da2fa <+0>: movdqa %xmm0, (%rsp)
0x7fffc12da2ff <+5>: int3
libdyld.dylib`_dyld_func_lookup:
0x7fffc12da300 <+0>: pushq %rbp
0x7fffc12da301 <+1>: movq %rsp, %rbp
Here is the code:
# Object_copy: takes an object address in %rdi, requests obj_size(%rdi) * 8
# bytes from _malloc, and then copies quadwords from the address in %rsi to
# the address returned in %rax.
Object_copy:
pushq %rbp
# NOTE(review): AT&T operand order is source, destination, so this loads %rsp
# from %rbp; a frame-pointer prologue would be `movq %rsp, %rbp` - confirm.
movq %rbp, %rsp
subq $8, %rsp
# NOTE(review): the subq reserved the 8 bytes at 0(%rsp); 8(%rsp) is the
# quadword above that reservation - confirm the intended slot.
movq %rdi, 8(%rsp) # save self address
movq obj_size(%rdi), %rax # get object size
imul $8, %rax # size in quadwords -> size in bytes
movq %rax, %rdi # byte count becomes the argument to _malloc
callq _malloc <------------------- error in this call
# NOTE(review): %rsi is never loaded in this routine, and %rdi/%rsi are
# caller-saved under the System V AMD64 ABI, so _malloc may change them; the
# self address saved above is not reloaded - confirm.
# rsi old object address
# rax new object address
# rdi object size, multiple of 8
# rcx temp reg
# copy object tag
movq 0(%rsi), %rcx
movq %rcx, 0(%rax)
# set rdx to counter, starting from 8
movq $8, %rdx
# add 8 to object size, since we are starting from 8
addq $8, %rdi
start_loop:
cmpq %rdx, %rdi
jle end_loop # leave the loop once %rdi <= %rdx (signed compare)
movq (%rdx, %rsi, 1), %rcx # load the quadword at %rsi + %rdx
movq %rcx, (%rdx, %rax, 1) # store it at %rax + %rdx
addq $8, %rdx # advance to the next quadword
jmp start_loop
end_loop:
leave
ret
Main_protoObj:
.quad 5 ; object tag
.quad 3 ; object size
.quad Main_dispatch_table ; dispatch table
_main:
leaq Main_protoObj(%rip), %rdi
callq Object_copy # copy main proto object
subq $8, %rsp # save the main object on the stack
movq %rax, 8(%rsp)
movq %rax, %rdi # set rdi point to SELF
callq Main_init
callq Main_main
addq $8, %rsp # restore stack
leaq _term_msg(%rip), %rax
callq _print_string
As you said, macOS requires 16-byte stack alignment: the ABI requires %rsp to be a multiple of 16 at the moment a call instruction executes, so that the callee can rely on that alignment (for example, for aligned SSE stores such as the movdqa in your error message).
When the stack is misaligned at a call, code that relies on the alignment performs misaligned accesses, and the program usually ends in a crash such as a segmentation fault or, as here, in stack_not_16_byte_aligned_error.
Before you call a routine in your code, you need to make sure that your stack is aligned correctly; in this case, meaning that the stack pointer register (%rsp) is divisible by 16.
subq $8, %rsp # stack is misaligned by 8 bytes
movq %rdi, 8(%rsp) #
movq obj_size(%rdi), %rax #
imul $8, %rax #
movq %rax, %rdi #
callq _malloc # stack is still misaligned when this is called
To fix this, you can subtract 16 from %rsp instead of 8.
subq $16, %rsp # stack is still aligned
movq %rdi, 16(%rsp) #
... #
callq _malloc # stack is still aligned when this is called, good
I have main function in C that runs code in assembly. I just want to make simple sum:
main.c
#include <stdio.h>
extern int addByAssembly(int first_number, int second_number);
/* Entry point: prints the value that addByAssembly returns for 5 and 4. */
int main (int argc, char **argv)
{
    const int total = addByAssembly(5,4);

    printf ("%d\n", total);
    return 0;
}
addByAssembly.s
.data
SYSREAD = 0
SYSWRITE = 1
SYSEXIT = 60
STDOUT = 1
STDIN = 0
EXIT_SUCCESS = 0
.text
#.global main
#main:
#call write
#movq $SYSEXIT, %rax
#movq $EXIT_SUCCESS, %rdi
#syscall
#********
.globl addByAssembly
# addByAssembly: sets up a frame, adds the quadwords at 16(%rsp) and 24(%rsp)
# into %rax, and tears the frame down again.
addByAssembly:
pushq %rbp
movq %rsp, %rbp
# NOTE(review): after the pushq, 16(%rsp) and 24(%rsp) are stack memory above
# the return address; under the System V AMD64 ABI the first two integer
# arguments arrive in %rdi and %rsi instead - confirm the target ABI.
movq 16(%rsp), %rax
addq 24(%rsp), %rax
movq %rbp, %rsp
popq %rbp
# NOTE(review): no ret follows in this listing.
But I get garbage in my sum. It looks like I am passing the arguments incorrectly, because if I do this:
movq $123, %rax
the return value is 123. I've tried many ways, but cannot find how to compute the sum properly.
Thanks to 'Jester' for so much effort and time spent explaining this to me!
To sum up: passing parameters from C to assembly (and likewise from assembly to C) follows the platform's ABI calling convention.
As you can see there, parameters are passed in this order:
1) rdi
2) rsi
3) rdx
... and so on...
If you have more parameters than the convention has registers for, the remaining ones are pushed on the stack.
So in my case:
.globl addByAssembly
# int addByAssembly(int first_number, int second_number)
# System V AMD64: first_number arrives in %rdi, second_number in %rsi, and
# the return value is always in %rax.
addByAssembly:
pushq %rbp
movq %rsp, %rbp
# Removed, this was wrong because the parameters are not on the stack:
#   movq 16(%rsp), %rax
#   addq 24(%rsp), %rax
lea (%rdi, %rsi), %rax # %rdi + %rsi -> %rax (lea adds without touching the flags)
movq %rbp, %rsp
popq %rbp
ret # return to the caller; without it execution runs past the end of the routine