x64 argument and return value calling convention - c

I invoke Clang 12.0.0 with -Os -march=haswell to compile the following C program:
int bar(int);
int foo(int x) {
const int b = bar(x);
if (x || b) {
return 123;
}
return 456;
}
The following assembly is generated:
foo: # #foo
push rbx
mov ebx, edi
call bar
or eax, ebx
mov ecx, 456
mov eax, 123
cmove eax, ecx
pop rbx
ret
https://gcc.godbolt.org/z/WsGoM56Ez
As I understand it, the caller of foo sets up x in RAX/EAX. foo then calls bar, which doesn't require modifying RAX/EAX, since x is passed through as unmodified input.
The or eax, ebx instruction appears to be comparing the input x with the result of bar. How does that result end up in EBX? What purpose does mov ebx,edi serve?

I'm afraid you are mistaken:
the function argument is passed in rdi, as per the x86-64 System V calling convention.
register rbx is callee-saved: a function must hand it back unchanged, so the compiler (Clang here) saves it with push and restores it with pop, which lets it keep a copy of x there across the call to bar.
the function return value is in rax. (Actually eax; a 32-bit int only uses the low half)
You can verify the basics by compiling a function like int foo(int x){return x;} - you'll see just a mov eax, edi.
Here is a commented version:
foo: # #foo
push rbx # save callee-saved rbx so it can hold `x` across the call
mov ebx, edi # copy argument `x` (it arrives in edi) into ebx
call bar # b = bar(x); edi still holds x, the result comes back in eax
or eax, ebx # compute `x | b`, setting FLAGS (ZF is set only when both are zero)
mov ecx, 456 # prepare 456 for the conditional move; mov leaves FLAGS untouched
mov eax, 123 # eax = 123, the value returned when `x || b` is true
cmove eax, ecx # if `(x | b) == 0`, replace eax with 456
pop rbx # restore the caller's rbx
ret # return value is in eax
The compiler optimizes x || b as (x | b) != 0 which allows for branchless code generation.
Note that mov doesn't modify the FLAGS, unlike most integer ALU instructions.

Related

How can I store abc(x, y) which is a pointer function into an array as per the following code sample?

Function 1.
It is a pointer function.
char *abc(unsigned int a, unsigned int b)
{
//do something here ...
}
Function 2
Leveraged the function 1 into function 2.
I am trying to store the abc function into an array, however I am getting the error as : error: assignment to expression with array type.
fun2()
{
unsigned int x, y;
x= 5, y=6;
char *array1;
char array2;
for(i=0; i<3; i++)
{
array2[i] = abc(x, y);
}
}
You can't store the invocation of a function in C, because doing so would defeat many popular optimizations built around passing parameters in registers. Normally the argument values are put in place immediately before control is transferred to the called function. Compilers may use registers to hold those values, but those registers are volatile: if the actual call were delayed, they would be overwritten in the meantime - possibly by a call to some other function that also receives its arguments in registers. A solution - which I've personally implemented - is to have a helper function simulate the call for you: you store the argument values in flat memory, and at call time the helper loads them back into the proper registers and places any further arguments on the stack. This must be written in assembly and is specific to your target architecture. If your architecture does not use such optimizations it could be considerably easier, but hand-written assembly would still be required.
In any case this is not a feature the standard (or even pre standard as far as I know) C has implemented anytime.
For example, this is an implementation for x86-64 that I wrote some time ago (for the MSVC MASM assembler):
PUBLIC makeuniquecall

; Byte offset, inside the descriptor passed in r8, of the qword holding the
; size of the stack argument area (it follows the four per-slot type bytes).
DESC_STACK_SIZE EQU 4
; Number of register parameter slots filled by the dispatch loop.
REG_SLOTS EQU 4

.data
; One jump table per argument class, indexed by register slot (0..3).
makeuniquecall_jmp_table dq zero_zero, one_zero, two_zero, three_zero ; ordinary
makeuniquecall_jmp_table_one dq zero_one, one_one, two_one, three_one ; single precision
makeuniquecall_jmp_table_two dq zero_two, one_two, two_two, three_two ; double precision
.code
;-----------------------------------------------------------------------
; makeuniquecall - replay a stored call (Microsoft x64 calling convention)
; In:   rcx = function pointer
;       rdx = raw argument data, one qword per slot; slot i feeds register
;             parameter i, and the first <stack size> bytes are also copied
;             verbatim to the stack area handed to the callee
;       r8  = descriptor: bytes 0..3 give the class of each register slot
;             (0 = ordinary, 1 = single precision, anything else = double
;             precision); the qword at offset DESC_STACK_SIZE is the size
;             in bytes of the stack area
; Out:  whatever the called function leaves in rax / xmm0
; Saved: r12, r13, r14, rsi, rdi (all callee-saved in this ABI)
; Caller requirements: the stack size is copied in whole qwords, and it
;       should be a multiple of 16 so rsp stays 16-byte aligned at the call.
; NOTE(review): no unwind data (PROC FRAME) is emitted for this non-leaf
;       routine - confirm that is acceptable for your use.
;-----------------------------------------------------------------------
makeuniquecall PROC
    push r12
    push r13
    push r14
    push rsi                        ; rsi and rdi are callee-saved on Windows x64
    push rdi                        ; and are clobbered by rep movs below
    mov r12, rcx                    ; r12 = function pointer
    mov r13, rdx                    ; r13 = raw argument data
    mov r14, r8                     ; r14 = descriptor
    ; first store the stack vars
    mov rax, [r14 + DESC_STACK_SIZE] ; retrieve size of stack
    sub rsp, rax
    mov rdi, rsp                    ; destination: freshly reserved stack area
    mov rcx, rax
    shr rcx, 3                      ; byte count -> qword count
    mov rsi, r13                    ; source: raw argument data
    rep movs qword ptr [rdi], qword ptr [rsi]
    xor r10, r10                    ; r10 = current register slot
cycle:
    movzx eax, byte ptr [r14 + r10] ; class byte of this slot
    test eax, eax
    jnz jmp_one
    lea rax, makeuniquecall_jmp_table
    jmp qword ptr [rax + r10 * 8]
jmp_one:
    cmp eax, 1
    jnz jmp_two
    lea rax, makeuniquecall_jmp_table_one
    jmp qword ptr [rax + r10 * 8]
jmp_two:
    lea rax, makeuniquecall_jmp_table_two
    jmp qword ptr [rax + r10 * 8]
    ; --- ordinary arguments: rcx, rdx, r8, r9 ---
zero_zero::
    mov rcx, qword ptr [r13 + r10 * 8]
    jmp continue
one_zero::
    mov rdx, qword ptr [r13 + r10 * 8]
    jmp continue
two_zero::
    mov r8, qword ptr [r13 + r10 * 8]
    jmp continue
three_zero::
    mov r9, qword ptr [r13 + r10 * 8]
    jmp continue
    ; --- single precision arguments: xmm0..xmm3 ---
zero_one::
    movss xmm0, dword ptr [r13 + r10 * 8]
    jmp continue
one_one::
    movss xmm1, dword ptr [r13 + r10 * 8]
    jmp continue
two_one::
    movss xmm2, dword ptr [r13 + r10 * 8]
    jmp continue
three_one::
    movss xmm3, dword ptr [r13 + r10 * 8]
    jmp continue
    ; --- double precision arguments: xmm0..xmm3 ---
zero_two::
    movsd xmm0, qword ptr [r13 + r10 * 8]
    jmp continue
one_two::
    movsd xmm1, qword ptr [r13 + r10 * 8]
    jmp continue
two_two::
    movsd xmm2, qword ptr [r13 + r10 * 8]
    jmp continue
three_two::
    movsd xmm3, qword ptr [r13 + r10 * 8]
continue:
    inc r10
    cmp r10, REG_SLOTS
    jb cycle
    mov r14, [r14 + DESC_STACK_SIZE] ; keep the stack size in callee-saved r14 across the call
    call r12
    add rsp, r14                    ; release the stack argument area
    pop rdi
    pop rsi
    pop r14
    pop r13
    pop r12
    ret
makeuniquecall ENDP
END
And your code will look something like this:
#include <stdio.h>
#include <string.h>

/* Demo callee: prints both arguments and returns a pointer to a string literal. */
char* abc(unsigned int a, unsigned int b)
{
    printf("a - %u, b - %u\n", a, b); /* %u: both arguments are unsigned */
    return "return abc str\n";
}

/* Assembly routine; deliberately left without a prototype and cast at the call site. */
extern int makeuniquecall();

/* Stores three pending calls to abc(x, y) in an array, then replays them. */
int main(void)
{
    unsigned int x, y;
    x = 5, y = 6;
    /* pack(4) places szargs at byte offset 4 of invok, which is where the
       assembly routine reads the stack size from. push/pop keeps the change
       local to this struct. */
#pragma pack(push, 4)
    struct {
        struct { char maskargs[4]; unsigned long long szargs; } invok;
        char *(*pfunc)();
        unsigned long long args[2], shadow[2];
    } array2[3];
#pragma pack(pop)
    for (int i = 0; i < 3; i++)
    {
        memset(array2[i].invok.maskargs, 0, sizeof array2[i].invok.maskargs); // standard - no floats passed
        array2[i].invok.szargs = 8 * 4; //consider shadow space
        array2[i].pfunc = abc;
        array2[i].args[0] = x;
        array2[i].args[1] = y;
    }
    //now do the calls
    for (int i = 0; i < 3; i++)
        printf("%s\n", ((char *(*)())makeuniquecall)(array2[i].pfunc, array2[i].args, &array2[i].invok));
    return 0;
}
You probably won't need that for your specific case: you can get away with simply storing each argument and calling the function directly (this method also isn't x86-64 specific), i.e.:
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", array2[i].pfunc(array2[i].args[0], array2[i].args[1]));
But my implementation gives you the flexibility to store a different number of arguments for each call.
Note consider this guide for running above examples on msvc (since it requires to add asm file for the assembly code).
I love such noob questions since they make you think about why x-y feature doesn't actually exist in the language.

ASM x64 function pointer not returning the good value

I have a problem with function pointers in assembly: even when my function returns a negative number, rax always ends up holding a positive number. I made a minimal reproducible example with a function that compares two integers, and it does the same thing:
ASM Function code [EDIT]:
global foo
section .data
msg: db `superior\n`
msg_len: equ $-msg
section .text
foo:
push rbx
mov rbx, rdi
mov rdi, 2
mov rsi, 1
sub rsp, 8 ; align the stack frame
call rbx
add rsp, 8
test rax, rax ;[EDIT] correct: test eax, eax
js bar
mov rax, 1
mov rdi, 1
mov rsi, msg
mov rdx, msg_len
syscall
bar:
mov rdi, 1
mov rsi, 2
sub rsp, 8 ; same here
call rbx
add rsp, 8
test rax, rax ;[EDIT] correct: test eax, eax
js exit
mov rax, 1
mov rdi, 1
mov rsi, msg
mov rdx, msg_len
syscall
exit:
pop rbx ;restoring initial data of rbx
ret
main.c code:
#include <stdio.h>
int foo(int (*f)()); //my asm function prototype
int cmp(int i, int j)
{
printf("%d - %d\n", i, j);
return(i - j);
}
int main(void)
{
foo(&cmp);
return (0);
}
The output is:
2 - 1
superior
1 - 2
superior
But it should be just:
2 - 1
superior
Compilation:
nasm -f elf64 foo.s
gcc -c main.c -o main.o
gcc main.o foo.o
Thanks for the help
[EDIT] It didn't work because I checked rax instead of eax, now it works. Thanks for your help
An int is 32 bits, but rax is a 64-bit register. A function that returns int will place its return value in eax, which will typically zero out the high half of rax. So if cmp returns -1, which is the 32-bit number 0xffffffff, then rax will contain 0x00000000ffffffff. This is not a negative 64-bit number, so test rax, rax will not set the sign flag.
Try using test eax, eax as your test instead.
Your code seems too complicated.
Firstly, let's write the thing to do in language like C:
/* Pseudo-C outline of foo: call the supplied comparison function twice and
   print whenever it reports "greater" (PRINT stands for the write of msg). */
int foo(int (*f)()) {
    if (f(2, 1) > 0) {   /* call through the parameter, not cmp directly */
        PRINT;
    }
    if (f(1, 2) > 0) {
        PRINT;
    }
}
Then, let's write assembly code according to this:
global foo

SYS_WRITE equ 1                         ; Linux x86-64 syscall number for write
STDOUT_FD equ 1

section .data
msg: db `superior\n`
msg_len: equ $-msg
section .text
;-----------------------------------------------------------------------
; int foo(int (*f)())                   ABI: System V AMD64 (NASM syntax)
; In:    rdi = f, called as f(2, 1) and then f(1, 2)
; Does:  writes msg to stdout after each call whose int result is > 0
; Out:   eax is not set to a defined value (the C caller ignores it)
; Saved: rbx (callee-saved, holds f across the calls)
;-----------------------------------------------------------------------
; int foo(int (*f)()) {
foo:
        push    rbx                     ; preserve caller's rbx; also makes rsp 16-byte aligned for the calls
        mov     rbx, rdi                ; function pointer stored in rbx
; if (f(2, 1) > 0) {
        mov     edi, 2                  ; first integer
        mov     esi, 1                  ; second integer
        call    rbx                     ; call function pointer
        test    eax, eax                ; an int result is in eax only; do not test all of rax
        jle     bar                     ; skip the print if result <= 0 (signed)
; PRINT;
        mov     eax, SYS_WRITE
        mov     edi, STDOUT_FD
        lea     rsi, [rel msg]          ; RIP-relative, so it also links as PIE
        mov     edx, msg_len
        syscall                         ; write "superior\n"
; }
bar:
; if (f(1, 2) > 0) {
        mov     edi, 1
        mov     esi, 2
        call    rbx
        test    eax, eax
        jle     bar2
; PRINT;
        mov     eax, SYS_WRITE
        mov     edi, STDOUT_FD
        lea     rsi, [rel msg]
        mov     edx, msg_len
        syscall                         ; write "superior\n"
; }
bar2:
; }
        pop     rbx                     ; restore caller's rbx
        ret
To retain your code, points to fix are:
ja is for unsigned comparison. jg should be used for signed comparison instead.
There is code to print after bar2, but there is also code to print that runs when the jump ja bar2 is not taken. You should add ret before bar: to prevent this from being executed.

Pass values from C program to Assembly language

I would like to pass values from C program to Assembly using the linked assembly method instead of inline assembly method in C.
Below is the Assembly program (GCD) which I am working on.
;gcdasm.nasm
bits 64
section .text
global gcdasm
gcdasm:
push rbp
mov rbp, rsp
mov rax, [rbp+4] ;load rax with x
mov rbx, [rbp+8] ;load rbx with y
top:
cmp rax, rbx ;x(rax) has to be larger than y(rbx)
je exit ;if x=y then exit and return value y
jb xchange ;if x<y then swap x and y
modulo:
cqo ;RDX:RAX sign extend
div rbx ;div rdx:rax with rbx
cmp rdx, 0 ;check remider if its 0
je exit ;if reminder is 0 then exit return return y
mov rax, rdx ;reminder rdx as next dividend
jmp modulo ;loop
xchange:
xchg rax, rbx ;swap x and y
jmp modulo
exit:
mov rax, rbx ;Return c program with the divisor y
mov rsp, rbp
pop rbp
ret
And this is the C program from which I am trying to pass the values to the assembly program
//gcd.c
#include<stdio.h>
extern int gcdasm(int x, int y);
int main(void){
int x=0;
int y=0;
int result=0;
x = 46;
y = 90;
printf("%d and %d have a gcd of %d\n", x,y,gcdasm(x,y));
x = 55;
y = 66;
printf("%d and %d have a gcd of %d\n", x,y,gcdasm(x,y));
return 0;
}
When I compile using the below method and run it. I get either error Floating point exception or an empty prompt waiting for input
$ nasm -felf64 gcdasm.nasm -o gcdasm.o
$ gcc gcdasm.o gcd.c -o gcd
$ ./gcd
Floating point exception
$ ./gcd
I am unable to figure out the error. Kindly help me out.
Thank you.
Passing arguments to gcdasm()
The two int arguments are passed through registers, not the stack. The first and second arguments are passed in the lower-half of rdi and rsi (i.e.: edi and esi), respectively. So, by sign extending edi and esi into rax and rbx respectively, you load the passed arguments into those registers:
movsxd rax, edi ;load rax with x (NASM spells the 32-bit -> 64-bit sign extension movsxd)
movsxd rbx, esi ;load rbx with y
However, note that rbx is not a scratch register, therefore the callee needs to save it before modifying it and then restore it back before leaving the gcdasm function.
You can simply replace rbx by rcx (which isn't a callee-saved register) everywhere in your code. You don't need rbp at all, so you can remove all the instructions where rbp appears.
Other problems
There is also a problem with the logic of the program with:
mov rax, rdx ;reminder rdx as next dividend
Instead of this, the divisor (rcx) should become the dividend (rax) and the remainder (rdx) should become the divisor (rcx), that is:
mov rax, rcx
mov rcx, rdx
When dividing signed values, you have to use the idiv instruction, not div.
Improvement
There are also some reasons regarding performance and code size to use test rdx, rdx instead of cmp rdx, 0 for comparing rdx against zero.
With all that above in mind:
;gcdasm.nasm
bits 64
section .text
global gcdasm
;-----------------------------------------------------------------------
; int gcdasm(int x, int y)              ABI: System V AMD64 (NASM syntax)
; In:    edi = x, esi = y
; Out:   eax = greatest common divisor (Euclid's algorithm); gcd(x, 0) = x
; Clobb: rcx, rdx, flags
; Note:  uses signed division, so with negative inputs the result may come
;        back negative.
;-----------------------------------------------------------------------
gcdasm:
        movsxd  rax, edi                ;load rax with x (sign-extended)
        movsxd  rcx, esi                ;load rcx with y (sign-extended)
        test    rcx, rcx
        jz      .done                   ;y == 0: return x and avoid a divide fault
        ; No swap is needed when x < y: the first division then leaves x as
        ; the remainder, which exchanges the two values by itself.
.modulo:
        cqo                             ;sign extend rax into rdx:rax
        idiv    rcx                     ;rdx:rax / rcx (signed values)
        test    rdx, rdx                ;check whether remainder is zero
        jz      .exit                   ;remainder 0: the divisor is the gcd
        mov     rax, rcx                ;divisor becomes dividend
        mov     rcx, rdx                ;remainder becomes divisor
        jmp     .modulo                 ;loop
.exit:
        mov     rax, rcx                ;return the last non-zero divisor
.done:
        ret

C pointers and references

I would like to know what's really happening calling & and * in C.
Is that it costs a lot of resources? Should I call & each time I wanna get an adress of a same given variable or keep it in memory i.e in a cache variable. Same for * i.e when I wanna get a pointer value ?
Example
void bar(char *str)
{
check_one(*str)
check_two(*str)
//... Could be replaced by
char c = *str;
check_one(c);
check_two(c);
}
I would like to know what's really happening calling & and * in C.
There's no such thing as "calling" & or *. They are the address operator, or the dereference operator, and instruct the compiler to work with the address of an object, or with the object that a pointer points to, respectively.
And C is not C++, so there's no references; I think you just misused that word in your question's title.
In most cases, that's basically two ways to look at the same thing.
Usually, you'll use & when you actually want the address of an object. Since the compiler needs to handle objects in memory with their address anyway, there's no overhead.
For the specific implications of using the operators, you'll have to look at the assembler your compiler generates.
Example: consider this trivial code, disassembled via godbolt.org:
#include <stdio.h>
#include <stdlib.h>
void check_one(char c)
{
if(c == 'x')
exit(0);
}
void check_two(char c)
{
if(c == 'X')
exit(1);
}
void foo(char *str)
{
check_one(*str);
check_two(*str);
}
void bar(char *str)
{
char c = *str;
check_one(c);
check_two(c);
}
int main()
{
char msg[] = "something";
foo(msg);
bar(msg);
}
The compiler output can vary wildly depending on the vendor and optimization settings.
clang 3.8 using -O2
check_one(char): # #check_one(char)
movzx eax, dil
cmp eax, 120
je .LBB0_2
ret
.LBB0_2:
push rax
xor edi, edi
call exit
check_two(char): # #check_two(char)
movzx eax, dil
cmp eax, 88
je .LBB1_2
ret
.LBB1_2:
push rax
mov edi, 1
call exit
foo(char*): # #foo(char*)
push rax
movzx eax, byte ptr [rdi]
cmp eax, 88
je .LBB2_3
movzx eax, al
cmp eax, 120
je .LBB2_2
pop rax
ret
.LBB2_3:
mov edi, 1
call exit
.LBB2_2:
xor edi, edi
call exit
bar(char*): # #bar(char*)
push rax
movzx eax, byte ptr [rdi]
cmp eax, 88
je .LBB3_3
movzx eax, al
cmp eax, 120
je .LBB3_2
pop rax
ret
.LBB3_3:
mov edi, 1
call exit
.LBB3_2:
xor edi, edi
call exit
main: # #main
xor eax, eax
ret
Notice that foo and bar are identical. Do other compilers do something similar? Well...
gcc x64 5.4 using -O2
check_one(char):
cmp dil, 120
je .L6
rep ret
.L6:
push rax
xor edi, edi
call exit
check_two(char):
cmp dil, 88
je .L11
rep ret
.L11:
push rax
mov edi, 1
call exit
bar(char*):
sub rsp, 8
movzx eax, BYTE PTR [rdi]
cmp al, 120
je .L16
cmp al, 88
je .L17
add rsp, 8
ret
.L16:
xor edi, edi
call exit
.L17:
mov edi, 1
call exit
foo(char*):
jmp bar(char*)
main:
sub rsp, 24
movabs rax, 7956005065853857651
mov QWORD PTR [rsp], rax
mov rdi, rsp
mov eax, 103
mov WORD PTR [rsp+8], ax
call bar(char*)
mov rdi, rsp
call bar(char*)
xor eax, eax
add rsp, 24
ret
Well, if there were any doubt that foo and bar are equivalent, at least as far as the compiler is concerned, I think this:
foo(char*):
jmp bar(char*)
is a strong argument they indeed are.
In C, there's no runtime cost associated with either the unary & or * operators; both are evaluated at compile time. So there's no difference in runtime between
check_one(*str)
check_two(*str)
and
char c = *str;
check_one( c );
check_two( c );
ignoring the overhead of the assignment.
That's not necessarily true in C++, since you can overload those operators.
tldr;
If you are programming in C, then the & operator is used to obtain the address of a variable and * is used to get the value of that variable, given its address.
This is also the reason why in C, when you pass a string to a function, you must state the length of the string otherwise, if someone unfamiliar with your logic sees the function signature, they could not tell if the function is called as bar(&some_char) or bar(some_cstr).
To conclude, if you have a variable x of type someType, then &x will result in someType* addressOfX and *addressOfX will result in giving the value of x. Functions in C only take pointers as parameters, i.e. you cannot create a function where the parameter type is &x or &&x
Also your examples can be rewritten as:
check_one(str[0])
check_two(str[0])
AFAIK, in x86 and x64 your variables are stored in memory (if not stated with register keyword) and accessed by pointers.
const int foo = 5 equal to foo dd 5 and check_one(*foo) equal to push dword [foo]; call check_one.
If you create additional variable c, then it looks like:
c resd 1
...
mov eax, [foo]
mov dword [c], eax ; Variable foo just copied to c
push dword [c]
call check_one
And nothing changed, except additional copying and memory allocation.
I think that compiler's optimizer deals with it and makes both cases as fast as it is possible. So you can use more readable variant.

How do I best use the const keyword in C?

I am trying to get a sense of how I should use const in C code. At first I didn't really bother using it, but then I saw quite a few examples of const being used throughout. Should I make an effort and go back and religiously make suitable variables const? Or will I just be wasting my time?
I suppose it makes it easier to read which variables that are expected to change, especially in function calls, both for humans and the compiler. Am I missing any other important points?
const is typed, #define macros are not.
const is scoped by C block, #define applies to a file (or more strictly, a compilation unit).
const is most useful with parameter passing. If you see const used on a prototype with pointers, you know it is safe to pass your array or struct because the function will not alter it. No const and it can.
Look at the definition for such as strcpy() and you will see what I mean. Apply "const-ness" to function prototypes at the outset. Retro-fitting const is not so much difficult as "a lot of work" (but OK if you get paid by the hour).
Also consider:
const char *s = "Hello World";
char *s = "Hello World";
which is correct, and why?
How do I best use the const keyword in C?
Use const when you want to make it "read-only". It's that simple :)
Using const is not only a good practice but improves the readability and comprehensibility of the code as well as helps prevent some common errors. Definitely do use const where appropriate.
Apart from producing a compiler error when attempting to modify the constant and passing the constant as a non-const parameter, therefore acting as a compiler guard, it also enables the compiler to perform certain optimisations knowing that the value will not change and therefore it can cache the value and not have to read it fresh from memory, because it won't have changed, and it allows it to be immediately substituted in the code.
C const
const and register are basically the opposite of volatile and using volatile will override the const optimisations at file and block scope and the register optimisations at block-scope. const register and register will produce identical outputs because const does nothing on C at block-scope on gcc C -O0, and is redundant on -O1 and onwards, so only the register optimisations apply at -O0, and are redundant from -O1 onwards.
#include<stdio.h>
int main() {
const int i = 1;
printf("%d", i);
}
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 1
mov eax, DWORD PTR [rbp-4] //load from stack isn't eliminated for block-scope consts on gcc C unlike on gcc C++ and clang C, even though value will be the same
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
In this instance, with -O0, const, volatile and auto all produce the same code, with only register differing c.f.
#include<stdio.h>
int i = 1; /* file scope and deliberately NOT const: this is the baseline whose listing loads i from memory */
int main() {
printf("%d", i);
}
i:
.long 1
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
mov eax, DWORD PTR i[rip] //load from memory
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
pop rbp
ret
with const int i = 1; instead:
i:
.long 1
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
mov eax, 1 //saves load from memory, now immediate
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
pop rbp
ret
C++ const
#include <iostream>
int main() {
int i = 1;
std::cout << i;
}
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 1 //stores on stack
mov eax, DWORD PTR [rbp-4] //loads the value stored on the stack
mov esi, eax
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
leave
ret
#include <iostream>
int main() {
const int i = 1;
std::cout << i;
}
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 1 //stores it on the stack
mov esi, 1 //but saves a load from memory here, unlike on C
//'register' would skip this store on the stack altogether
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
leave
ret
#include <iostream>
int i = 1;
int main() {
std::cout << i;
}
i:
.long 1
main:
push rbp
mov rbp, rsp
mov eax, DWORD PTR i[rip] //load from memory
mov esi, eax
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
pop rbp
ret
#include <iostream>
const int i = 1;
int main() {
std::cout << i;
}
main:
push rbp
mov rbp, rsp
mov esi, 1 //eliminated load from memory, now immediate
mov edi, OFFSET FLAT:_ZSt4cout
call std::basic_ostream<char, std::char_traits<char> >::operator<<(int)
mov eax, 0
pop rbp
ret
C++ has the extra restriction of producing a compiler error if a const is not initialised (both at file-scope and block-scope). const also has internal linkage as a default on C++. volatile still overrides const and register but const register combines both optimisations on C++.
Even though all the above code is compiled using the default implicit -O0, when compiled with -Ofast, const surprisingly still isn't redundant on C or C++ on clang or gcc for file-scoped consts. The load from memory isn't optimised out unless const is used, even if the file-scope variable isn't modified in the code. https://godbolt.org/z/PhDdxk.

Resources