inline assembly - useless intermediate copy instructions - c

I'm trying to write a scheduler to run what we call "fibers".
Unfortunately, I'm not really used to writing inline assembly.
/*
 * Saved execution state of one fiber plus its link in the ready list.
 * The struct carries the tag `fiber` so that `next` refers to this very
 * type; with an untagged struct, `struct fiber *` names a separate,
 * incomplete type and `next` could never be dereferenced.
 */
typedef struct fiber {
    // fiber's stack
    long rsp;               // saved stack pointer
    long rbp;               // saved frame pointer
    // next fiber in ready list
    struct fiber *next;
} fiber;

// currently executing fiber
fiber *fib;
So the very first task is - obviously - creating a fiber for the main function so it can be suspended.
// Entry point: allocates the fiber record for main itself and snapshots the
// current stack and frame pointers into it.
int main(int argc, char* argv[]){
//create fiber for main function
// NOTE(review): the malloc result is used below without a NULL check.
fib = malloc(sizeof(*fib));
// "=r" makes both outputs registers: the asm copies %rsp/%rbp into
// compiler-chosen registers, which the compiler then stores into *fib.
__asm__(
"movq %%rsp, %0;"
"movq %%rbp, %1;"
: "=r"(fib->rsp),"=r"(fib->rbp)
);
//jump to actual main and execute
__asm__(...);
}
This gets compiled to
movl $24, %edi #,
call malloc #
#APP
# 27 "scheduler.c" 1
movq %rsp, %rcx;movq %rbp, %rdx; # tmp92, tmp93
# 0 "" 2
#NO_APP
movq %rax, fib(%rip) # tmp91, fib
movq %rcx, (%rax) # tmp92, MEM[(struct fiber *)_3].rsp
movq %rdx, 8(%rax) # tmp93, MEM[(struct fiber *)_3].rbp
Why does this compile movs into temporary registers? Can I somehow get rid of them?
The first version of this question had asm output from gcc -O0, with even more instructions and temporaries.
Turning on optimisations does not get rid of them.

turning them on does not get rid of the temporaries
It did get rid of some extra loads and stores. The fib is of course still there in memory since you declared that as a global variable. The rax is the return value from the malloc that must be assigned to the fib in memory. The other two lines write into your fib members which are also required.
Since you specified register outputs the asm block can't write directly into memory. That's easy to fix with a memory constraint though:
// "=m" makes each output a memory operand, so every movq stores directly
// into the corresponding fiber field.
__asm__(
"movq %%rsp, %0;"
"movq %%rbp, %1;"
: "=m"(fib->rsp),"=m"(fib->rbp)
);
This will generate:
call malloc
movq %rax, fib(%rip)
movq %rsp, (%rax)
movq %rbp, 8(%rax)

Related

Mixed assembly and C: re-assigning RSP register causes segmentation fault

I was trying to re-assign the stack for a sub-function. I tried to implement that by mixing assembly and C on Linux, x86_64 architecture.
Here is how I planned to do it:
1. Get a section of memory by mmap syscall.
2. pass the mapped memory to asm func, then assign it to RSP register and jump to that function.
C function:
/*
 * Register set saved for one execution context. The assembly routines
 * address the members by the byte offsets listed beside them.
 */
typedef struct Context
{
    uint64_t rbx;          /* offset  0 */
    uint64_t rsp;          /* offset  8 */
    uint64_t rbp;          /* offset 16 */
    uint64_t r12;          /* offset 24 */
    uint64_t r13;          /* offset 32 */
    uint64_t r14;          /* offset 40 */
    uint64_t r15;          /* offset 48 */
    uint64_t rip;          /* offset 56: pushed as the return address */
    uint64_t break_point;  /* offset 64: address that is jumped to */
} Context_st;
// Sets up one sub-context, then switches to it from the main context.
main_func()
{
...
Context_st contexts[1];
Context_st mainContexts;
// Initialize the main stack and the function's stack. This should make the func return to the main function, which is this one.
_init_contexts(contexts);
// This should back up the main function's context and jump to another function with a customized stack
_run_contexts(&mainContexts, contexts);
printf("Successfully returned\n"); // This could not be executed.
...
}
/*
 * Prepares pContexts[0] for its first run: captures the current callee-saved
 * registers, records the two code addresses the switch routine uses, and
 * maps a fresh private anonymous region to serve as the context's stack.
 *
 * On success rsp holds the address one past the end of the mapping (the
 * stack grows downwards). If the mapping cannot be created, the error is
 * reported on stderr and rsp is left at 0, so a failed setup is recognisable
 * instead of silently producing a stack pointer derived from MAP_FAILED.
 */
static void _init_contexts(Context_st pContexts[1])
{
    const int mmapFlags = MAP_PRIVATE | MAP_ANONYMOUS;
    const size_t STACK_SIZE = 512;
    void *stackBase;

    asm_init_context(&pContexts[0]);
    pContexts[0].rip = (uint64_t)_some_func;
    pContexts[0].break_point = (uint64_t)_func_to_go;

    stackBase = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE, mmapFlags, -1, 0);
    if (stackBase == MAP_FAILED) {
        /* mmap signals failure with MAP_FAILED, not NULL */
        perror("mmap");
        pContexts[0].rsp = 0;
        return;
    }

    /* start at the high end of the region */
    pContexts[0].rsp = (uint64_t)stackBase + STACK_SIZE;
}
/* Saves the caller's registers into *pMainCtx and switches to *pSubCtx
   by delegating to the assembly routine. */
static void _run_contexts(Context_st *pMainCtx, Context_st *pSubCtx)
{
    asm_save_main_and_go(pMainCtx, pSubCtx);
}
And assembly functions:
#################################################
## asm_init_context: store the current values of rbx, rsp, rbp and r12-r15
## into the context record whose address is in %rdi (offsets 0..48).
## The slots at offsets 56 and 64 are left untouched.
.text
.align 4
.globl asm_init_context
## '#' starts a comment for the x86 assembler, so the symbol type is
## spelled with '@'.
.type asm_init_context, @function
asm_init_context:
    movq %rbx, (%rdi)
    movq %rsp, 8(%rdi)
    movq %rbp, 16(%rdi)
    movq %r12, 24(%rdi)
    movq %r13, 32(%rdi)
    movq %r14, 40(%rdi)
    movq %r15, 48(%rdi)
    retq
#################################################
## asm_save_main_and_go: save rbx, rsp, rbp and r12-r15 into the context at
## %rdi, load the same registers from the context at %rsi, push that
## context's slot at offset 56 as a return address and jump to the address
## stored in its slot at offset 64.
.text
.align 4
.globl asm_save_main_and_go
## '#' starts a comment for the x86 assembler, so the symbol type is
## spelled with '@'.
.type asm_save_main_and_go, @function
asm_save_main_and_go:
    movq %rbx, (%rdi)
    movq %rsp, 8(%rdi)
    movq %rbp, 16(%rdi)
    movq %r12, 24(%rdi)
    movq %r13, 32(%rdi)
    movq %r14, 40(%rdi)
    movq %r15, 48(%rdi)
    movl %esi, %eax         # low 32 bits of the new context pointer -> %eax
    movq (%rsi), %rbx
    movq 8(%rsi), %rsp # This is the code which causes SIGSEGV
    movq 16(%rsi), %rbp
    movq 24(%rsi), %r12
    movq 32(%rsi), %r13
    movq 40(%rsi), %r14
    movq 48(%rsi), %r15
    movq 56(%rsi), %rdx
    push %rdx               # slot 56 becomes the return address on the new stack
    jmp *64(%rsi)           # indirect jump through slot 64; AT&T syntax marks
                            # an indirect target with '*'
I have marked the asm code which causes SIGSEGV. If I comment it out, the program may correctly run and return to shell. But I do not know why.
Did I do anything wrong? Or is there something additional I need to do to make an mmapped memory section usable by the RSP register as a custom stack space?
Thank you in advance!
Updated on Nov 22:
It was very strange: when I switched to another virtual machine, _func_to_go() was successfully invoked. There is another segmentation fault after it returns, but that may be a known issue caused by the incorrectly set-up stack...
I will keep updated if I replicate the problem. But I hope this remains "WorksForMe".
Special thanks to @AjayBrahmakshatriya and @Ped7g. Your discussion pointed out some of my errors, and I have corrected or will correct them in my project here. :-)

x86-64 segmentation fault saving stack pointer

I am currently following along with this tutorial,
but I'm not a student of that school.
GDB gives me a segmentation fault in thread_start on the line:
movq %rsp, (%rdi) # save sp in old thread's tcb
Here's additional info when I backtrace:
#0 thread_start () at thread_start.s:16
#1 0x0000000180219e83 in _cygtls::remove(unsigned int)::__PRETTY_FUNCTION__
() from /usr/bin/cygwin1.dll
#2 0x00000000ffffcc6b in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)
Being a newbie, I can't for my life figure out why. Here is my main file:
// Size in bytes of each thread's stack. Parenthesized so the macro expands
// as a single value inside larger expressions (e.g. x / STACK_SIZE).
#define STACK_SIZE (1024*1024)
// Thread control block (TCB)
struct thread {
    // Saved stack pointer. It is the first member, so the assembly routines
    // reach it by dereferencing the TCB pointer directly.
    unsigned char *stack_pointer;
    // Function the thread runs, and the argument handed to it
    void (*initial_function)(void *);
    void *initial_argument;
};
// Thread that is running now; yield() exchanges it with inactive_thread.
struct thread * current_thread;
// The other thread, i.e. the one that is not running.
struct thread * inactive_thread;
// Both routines are implemented in assembly. Each saves the stack pointer in
// old_t and loads it from new_t; thread_start then enters thread_wrap, while
// thread_switch restores the saved registers and returns.
void thread_switch(struct thread * old_t, struct thread * new_t);
void thread_start(struct thread * old_t, struct thread * new_t);
// Hands the CPU to the inactive thread: exchanges the two global thread
// pointers, then switches stacks from the old thread to the new one.
void yield() {
    struct thread *outgoing = current_thread;
    struct thread *incoming = inactive_thread;

    current_thread = incoming;
    inactive_thread = outgoing;
    thread_switch(outgoing, incoming);
}
// First code executed on a new thread's stack: runs the thread's function
// with its argument, then yields to the other thread.
void thread_wrap() {
    void (*entry)(void *) = current_thread->initial_function;
    void *argument = current_thread->initial_argument;

    entry(argument);
    yield();
}
/*
 * Returns n! for n >= 0. Any n below 2 yields 1; this includes negative
 * input, for which the earlier recursive form never reached its base case.
 * The result overflows int once n exceeds 12 when int is 32 bits wide.
 */
int factorial(int n) {
    int product = 1;
    for (int i = 2; i <= n; i++) {
        product *= i;
    }
    return product;
}
// Thread body: arg points to an int n; prints n together with n!.
void fun_with_threads(void * arg) {
    const int *value = arg;
    int n = *value;
    int result = factorial(n);
    printf("%d! = %d\n", n, result);
}
// Builds the two thread control blocks, gives the new thread a stack and a
// job (printing 5!), and starts it. Returns 1 if any allocation fails.
int main() {
    // allocate memory for threads
    inactive_thread = malloc(sizeof *inactive_thread);
    current_thread = malloc(sizeof *current_thread);
    // argument for factorial
    int *p = malloc(sizeof *p);
    // stack for the new thread
    unsigned char *stack = malloc(STACK_SIZE);

    if (!inactive_thread || !current_thread || !p || !stack) {
        return 1;   // out of memory: nothing can be started
    }

    *p = 5;
    // initialise thread
    current_thread->initial_argument = p;
    current_thread->initial_function = fun_with_threads;
    // the stack grows downwards, so start at the high end of the block
    current_thread->stack_pointer = stack + STACK_SIZE;
    thread_start(inactive_thread, current_thread);
    return 0;
}
Here's my asm code for thread_start
# Starts a new thread: pushes rbx, rbp and r12-r15 onto the current stack,
# stores the stack pointer through %rdi, loads it through %rsi, and jumps to
# thread_wrap on the new stack.
# Reads old_t from %rdi and new_t from %rsi.
# NOTE(review): Windows x64 / Cygwin pass the first two arguments in %rcx and
# %rdx instead - confirm the calling convention of the target.
/* NOTE(review): thread_wrap is entered with jmp, so no return address is
   pushed on the new stack - confirm this is intended. */
# void thread_start(struct thread * old_t, struct thread * new_t);
.globl thread_start
thread_start:
pushq %rbx # callee-save
pushq %rbp # callee-save
pushq %r12 # callee-save
pushq %r13 # callee-save
pushq %r14 # callee-save
pushq %r15 # callee-save
movq %rsp, (%rdi) # save sp in old thread's tcb
movq (%rsi), %rsp # load sp from new thread
jmp thread_wrap
and thread_switch:
# Switches threads: pushes rbx, rbp and r12-r15 onto the current stack,
# stores the stack pointer through %rdi, loads it through %rsi, pops the
# same six registers from the new stack and returns on that stack.
# Reads old_t from %rdi and new_t from %rsi.
/* NOTE(review): Windows x64 / Cygwin pass the first two arguments in %rcx
   and %rdx instead - confirm the calling convention of the target. */
# void thread_switch(struct thread * old_t, struct thread * new_t);
.globl thread_switch
thread_switch:
pushq %rbx # callee-save
pushq %rbp # callee-save
pushq %r12 # callee-save
pushq %r13 # callee-save
pushq %r14 # callee-save
pushq %r15 # callee-save
movq %rsp, (%rdi) # save sp in old thread's tcb
movq (%rsi), %rsp # load sp from new thread
popq %r15 # callee-restore
popq %r14 # callee-restore
popq %r13 # callee-restore
popq %r12 # callee-restore
popq %rbp # callee-restore
popq %rbx # callee-restore
ret # return
You're on cygwin, right? It uses the Windows x64 calling convention by default, not the System V x86-64 psABI. So your args aren't in %rdi and %rsi.
The calling convention is Windows x64, but the ABI is slightly different: long is 64 bit, so it's LP64 not LLP64. See the cygwin docs.
You could override the default with __attribute__((sysv_abi)) on the prototype, but that only works for compilers that understand GNU C.
Agner Fog's calling convention guide has some suggestions on how to write source code that assembles to working functions on Windows vs. non-Windows. The most straightforward thing is to use an #ifdef to choose different function prologues.
This Intel intro to x64 assembly is somewhat Windows-centric, and details the Windows x64 __fastcall calling convention.
(It's followed by examples and stuff. It's a pretty big and good tutorial that starts from very basic stuff, including how to use tools like an assembler. I'd recommend it for learning x86-64 asm in a Windows dev environment, and maybe in general.)
Windows x64 __fastcall (like x64 __vectorcall but doesn't pass vectors in vector regs)
RCX, RDX, R8, R9 are used for integer and pointer arguments in that order left to right
XMM0, 1, 2, and 3 are used for floating point arguments.
Additional arguments are pushed on the stack left to right.
Parameters less than 64 bits long are not zero extended; the high bits contain garbage.
It is the caller's responsibility to allocate 32 bytes of "shadow space" (for storing RCX, RDX, R8, and R9 if needed) before calling the
function.
It is the caller's responsibility to clean the stack after the call.
Integer return values (similar to x86) are returned in RAX if 64 bits or less.
Floating point return values are returned in XMM0.
Larger return values (structs) have space allocated on the stack by the caller, and RCX then contains a pointer to the return space when
the callee is called. Register usage for integer parameters is then
pushed one to the right. RAX returns this address to the caller.
The stack is 16-byte aligned. The "call" instruction pushes an 8-byte return address, so all non-leaf functions must adjust the
stack by a value of the form 16n+8 when allocating stack space.
Registers RAX, RCX, RDX, R8, R9, R10, and R11 are considered volatile and must be considered destroyed on function calls. RBX, RBP,
RDI, RSI, R12, R13, R14, and R15 must be saved in any function using
them.
Note there is no calling convention for the floating point (and thus MMX) registers.
Further details (varargs, exception handling, stack unwinding) are at Microsoft's site.
Links to MS's calling-convention docs in the x86 tag wiki (along with System V ABI docs, and tons of other good stuff).
See also Why does Windows64 use a different calling convention from all other OSes on x86-64?

Setting up local stack according to x86-64 calling convention on linux

I am doing some extended assembly optimization on GNU C code running on 64-bit Linux. I wanted to print debugging messages from within the assembly code, and that's how I came across the following. I am hoping someone can explain what I am supposed to do in this situation.
Take a look at this sample function:
// Demonstrates a push/pop pair issued from inline asm inside a function
// that takes four int arguments.
void test(int a, int b, int c, int d){
__asm__ volatile (
"movq $0, %%rax\n\t"
"pushq %%rax\n\t" // stores 8 bytes just below the current %rsp
"popq %%rax\n\t"
:
:"m" (a) // a is supplied as a memory input; the template never references it
:"cc", "%rax"
);
}
Since the four arguments to the function are of class INTEGER, they will be passed through registers and then stored on the stack. The strange thing to me is how gcc actually does it:
test:
pushq %rbp
movq %rsp, %rbp
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
movl %edx, -12(%rbp)
movl %ecx, -16(%rbp)
movq $0, %rax
pushq %rax
popq %rax
popq %rbp
ret
The passed arguments are pushed onto the stack, but the stack pointer is not decremented. Thus, when I do pushq %rax, the values of a and b are overwritten.
What I am wondering: is there a way to ask gcc to properly set up the local stack? Am I simply not supposed to use push and pop in function calls?
x86-64 abi provides a 128 byte red zone under the stack pointer, and the compiler decided to use that. You can turn that off using -mno-red-zone option.

inline asm code organization

I have just written a few small inline asm routines to query the timestamp counter in x86 so that I can profile small portions of code. I would really like to put those routines in a header so that I can reuse them in many different source files so basically my question is whether I should just organize those in macros or make them inline functions, my doubt with inline is that it is not necessarily the case that the compiler will actually inline it and since it is a performance sensitive call I would rather skip the function call overhead, on the other hand with macros the whole type safety goes away and I would strictly need a 32 bit int for this, I assume I could just add the specification in comments but still I try to avoid macros because of the many caveats. Here is the code:
/*
 * Reads the CPU timestamp counter into *cycles.
 *
 * CPUID is executed first as a serializing instruction. It takes its leaf
 * number in EAX and overwrites EAX, EBX, ECX and EDX, so the leaf is passed
 * in explicitly (0) and EBX/ECX are declared as clobbered; RDTSC then
 * leaves the counter in EDX:EAX, which are bound directly to the outputs.
 *
 * `static inline` lets the definition live in a header: a plain `inline`
 * definition provides no external definition in C99 and later, so a call
 * that is not inlined would fail to link.
 */
static inline void rdtsc(uint64_t* cycles)
{
    uint32_t cycles_high, cycles_low;
    __asm__ __volatile__ (
        "cpuid\n\t"     // Serialize
        "rdtsc\n\t"     // Read the counter into EDX:EAX
        : "=a" (cycles_low), "=d" (cycles_high)
        : "0" (0u)      // CPUID leaf 0, passed in EAX
        : "ebx", "ecx");
    *cycles = ((uint64_t) cycles_high << 32) | cycles_low;
}
Any suggestions on this are welcome. I am just trying to figure out what the preferred style would be for this kind of situation.
Since you will be measuring performance of portions of code, not necessarily always entire functions, you should not try to inline your performance counter.
It doesn't matter if there's a call overhead or not. What matters is that the measurement is consistent, which means you either want the call overhead to be present ALWAYS, or NEVER.
The former is much easier to achieve than the latter.
Let every portion of your code have the same call overhead.
If you really need to serialize before reading the TSC, you could use the LFENCE instruction instead which doesn't alter registers.
If you decide to continue to use CPUID for serialization, you ought to set EAX first (probably to 0, since you're not really concerned about the output) and note that this instruction trashes the EAX, EBX, ECX and EDX registers, so your routine MUST account for this fact.
In all, I'd be inclined to write it like this:
#include <stdint.h>
#include <stdio.h>
/*
 * Returns the CPU timestamp counter. LFENCE is issued before RDTSC; the
 * two halves are then copied out of EAX/EDX into the outputs.
 *
 * Declared `static inline` because a plain `inline` definition provides no
 * external definition in C99 and later, so a call that is not inlined would
 * fail to link. `__asm__` is used because `asm` is not a keyword in strict
 * ISO modes.
 */
static inline uint64_t rdtsc(void) {
    uint32_t high, low;
    __asm__ __volatile__ (
        ".att_syntax\n\t"
        "LFENCE\n\t"
        "RDTSC\n\t"
        "movl %%eax, %0\n\t"
        "movl %%edx, %1\n\t"
        : "=rm" (low), "=rm" (high)
        :: "%edx", "%eax");
    return ((uint64_t) high << 32) | low;
}
// Samples the counter twice and prints both readings and their difference.
// uint64_t is not `unsigned long` on every platform, so each value is cast
// to unsigned long long and printed with %llu.
int main() {
    uint64_t x, y;
    x = rdtsc();
    printf("%llu\n", (unsigned long long) x);
    y = rdtsc();
    printf("%llu\n", (unsigned long long) y);
    printf("%llu\n", (unsigned long long) (y - x));
}
update:
It's been proposed by @Jester and by @DavidWohlferd that one can eliminate the extra register moves by binding high and low directly to the edx and eax registers.
That version would look like this:
/*
 * Returns the CPU timestamp counter. The outputs are bound straight to
 * EAX ("=a") and EDX ("=d"), where RDTSC leaves its result, so no extra
 * moves or clobbers are required.
 *
 * Declared `static inline` because a plain `inline` definition provides no
 * external definition in C99 and later, so a call that is not inlined would
 * fail to link. `__asm__` is used because `asm` is not a keyword in strict
 * ISO modes.
 */
static inline uint64_t rdtsc(void) {
    uint32_t high, low;
    __asm__ __volatile__ (
        ".att_syntax\n\t"
        "LFENCE\n\t"
        "RDTSC\n\t"
        : "=a" (low), "=d" (high));
    return ((uint64_t) high << 32) | low;
}
The resulting code (using gcc 4.8.3 on a 64-bit machine running Linux) using optimization -O2 and including up to the call to printf, is this:
#APP
# 20 "rdtsc.c" 1
.att_syntax
LFENCE
RDTSC
# 0 "" 2
#NO_APP
movq %rdx, %rbx
movl %eax, %eax
movl $.LC0, %edi
salq $32, %rbx
orq %rax, %rbx
xorl %eax, %eax
movq %rbx, %rsi
call printf
The version I originally posted results in this:
#APP
# 7 "rdtsc.c" 1
.att_syntax
LFENCE
RDTSC
movl %eax, %ecx
movl %edx, %ebx
# 0 "" 2
#NO_APP
movl %ecx, %ecx
salq $32, %rbx
movl $.LC0, %edi
orq %rcx, %rbx
xorl %eax, %eax
movq %rbx, %rsi
call printf
That version of the code is one instruction longer.

OS X asm C call with return value

I've been playing around with the asm macro in C to directly call some assembly instructions on OS X Mavericks to get a stack pointer address (from %rsp) and I've found really strange behaviour (at least to me) while trying to assign a return value from the assembler code into the %rax register (the one that should by convention hold the function return value). The C code is very simple:
#include <stdio.h>
// Attempts to hand back the stack pointer by writing %rax behind the
// compiler's back.
unsigned long long get_sp(void) {
// Basic asm: no outputs or clobbers are declared, so the compiler is not
// told that %rax is modified here.
asm ("mov %rsp, %rax");
return 0;
}
// Prints the value returned by get_sp() in hexadecimal.
int main(void) {
printf("0x%llx\n", get_sp());
}
If I compile and run the code, the value from %rax register gets printed(the actual stack pointer), which is strange as I would expect the %rax register to be overwritten by "return 0;"
However if I remove the return 0; a string "0x0" gets printed which is also strange as I would expect the return value from %rax register to be read and printed.
I've tried to run this code(with the only difference using %esp and %eax registers) also on the Ubuntu Linux also and it actually works as I would expect(using the gcc compiler).
Could this be a bug in the llvm-gcc compiler(Apple LLVM version 5.1)?
//EDIT
This is the version without the "return 0;"
otool -tV sp.out
sp.out:
(__TEXT,__text) section
_get_sp:
0000000100000f30 pushq %rbp
0000000100000f31 movq %rsp, %rbp
0000000100000f34 movq %rsp, %rax
0000000100000f37 movq -0x8(%rbp), %rax
0000000100000f3b popq %rbp
0000000100000f3c ret
0000000100000f3d nopl (%rax)
_main:
0000000100000f40 pushq %rbp
0000000100000f41 movq %rsp, %rbp
0000000100000f44 subq $0x10, %rsp
0000000100000f48 callq _get_sp
0000000100000f4d leaq 0x3a(%rip), %rdi ## literal pool for: "0x%llx\n"
0000000100000f54 movq %rax, %rsi
0000000100000f57 movb $0x0, %al
0000000100000f59 callq 0x100000f6e ## symbol stub for: _printf
0000000100000f5e movl $0x0, %ecx
0000000100000f63 movl %eax, -0x4(%rbp)
0000000100000f66 movl %ecx, %eax
0000000100000f68 addq $0x10, %rsp
0000000100000f6c popq %rbp
0000000100000f6d ret
This is not a bug. It's the result of incorrect use of inline assembly. In the case where the return statement is included, the compiler does not inspect the asm statement. If %rax has already been set to zero before the asm block, the instruction overwrites that value. The compiler is free to do this before the asm block, since you haven't informed it of any register outputs, clobbers, etc.
In the case where no return statement is included, you can't rely on the return value. Which is why clang (that's what llvm-gcc is with Xcode 5.1 - it's not the gcc front end) issues a warning. gcc-4.8.2 appears to work on OS X - but because the code is incorrect in both cases, it's just 'luck'. With optimization: -O2, it no longer works. gcc doesn't issue a warning by default, which is a good reason to at least use -Wall.
{
    unsigned long long ret;
    /* Extended asm (operands are present): a literal register name needs
       "%%", because a single "%" introduces an operand reference such
       as %0. */
    __asm__ ("movq %%rsp, %0" : "=r" (ret));
    return ret;
}
always works. volatile is not necessary, as the compiler is using an output, so it cannot discard the asm statement. Even changing the first line to unsigned long long ret = 0; - the compiler is obviously not free to reorder.
this works for me on Mavericks [edit: and without a single change on Ubuntu Saucy x86_64]:
#include <stdio.h>
// Returns the current value of the stack pointer register (%rsp).
unsigned long long get_sp(void) {
    long stack_top = 0L;

    // The register output operand receives a copy of %rsp.
    __asm__ __volatile__("mov %%rsp, %[value] \n\t" : [value] "=r" (stack_top));

    return stack_top;
}
// Prints the stack pointer reported by get_sp() in hexadecimal.
int main(void) {
    const unsigned long long sp = get_sp();
    printf("0x%llx\n", sp);
}

Resources