How Calling Functions with lots of args in Assembly Works - c

Here is an example set of functions, the first with 20 args the second with 2:
int a(int n1, int n2, int n3, int n4, int n5, int n6, int n7, int n8, int n9, int n10, int n11, int n12, int n13, int n14, int n15, int n16, int n17, int n18, int n19, int n20) {
return n1 * n2 * n3 * n4 * n5 * n6 * n7 * n8 * n9 * n10 * n11 * n12 * n13 * n14 * n15 * n16 * n17 * n18 * n19 * n20;
}
int b(int n1, int n2) {
return a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1)
+ a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1)
+ a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1);
}
It gets compiled to this assembly:
a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
mov DWORD PTR [rbp-8], esi
mov DWORD PTR [rbp-12], edx
mov DWORD PTR [rbp-16], ecx
mov DWORD PTR [rbp-20], r8d
mov DWORD PTR [rbp-24], r9d
mov eax, DWORD PTR [rbp-4]
imul eax, DWORD PTR [rbp-8]
imul eax, DWORD PTR [rbp-12]
imul eax, DWORD PTR [rbp-16]
imul eax, DWORD PTR [rbp-20]
imul eax, DWORD PTR [rbp-24]
imul eax, DWORD PTR [rbp+16]
imul eax, DWORD PTR [rbp+24]
imul eax, DWORD PTR [rbp+32]
imul eax, DWORD PTR [rbp+40]
imul eax, DWORD PTR [rbp+48]
imul eax, DWORD PTR [rbp+56]
imul eax, DWORD PTR [rbp+64]
imul eax, DWORD PTR [rbp+72]
imul eax, DWORD PTR [rbp+80]
imul eax, DWORD PTR [rbp+88]
imul eax, DWORD PTR [rbp+96]
imul eax, DWORD PTR [rbp+104]
imul eax, DWORD PTR [rbp+112]
imul eax, DWORD PTR [rbp+120]
pop rbp
ret
b(int, int):
push rbp
mov rbp, rsp
push rbx
sub rsp, 8
mov DWORD PTR [rbp-12], edi
mov DWORD PTR [rbp-16], esi
mov r9d, DWORD PTR [rbp-12]
mov r8d, DWORD PTR [rbp-12]
mov ecx, DWORD PTR [rbp-16]
mov edx, DWORD PTR [rbp-12]
mov esi, DWORD PTR [rbp-16]
mov eax, DWORD PTR [rbp-12]
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, eax
call a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
add rsp, 112
mov ebx, eax
mov r9d, DWORD PTR [rbp-12]
mov r8d, DWORD PTR [rbp-12]
mov ecx, DWORD PTR [rbp-16]
mov edx, DWORD PTR [rbp-12]
mov esi, DWORD PTR [rbp-16]
mov eax, DWORD PTR [rbp-12]
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, eax
call a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
add rsp, 112
add ebx, eax
mov r9d, DWORD PTR [rbp-12]
mov r8d, DWORD PTR [rbp-12]
mov ecx, DWORD PTR [rbp-16]
mov edx, DWORD PTR [rbp-12]
mov esi, DWORD PTR [rbp-16]
mov eax, DWORD PTR [rbp-12]
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, eax
call a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
add rsp, 112
add eax, ebx
mov rbx, QWORD PTR [rbp-8]
leave
ret
I have a few questions about this. First, I noticed that it sort of seems to switch how it handles the args as the number increases:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
...
imul eax, DWORD PTR [rbp-8]
...
Wondering what is going on there, why it is doing that. It seems to treat the first arg with push, then the next 8 or so with mov, then the remaining with imul only relative to the eax register. Wonder if there is a limit to how many args you can have.
The second thing I'm wondering is the following. Say instead of a(), the function b() called out to some "system function" or some other external library call. Wondering how does it know how to unpack the args. Well, I guess, nevermind, I assume the C compiler will compile into assembly/machine code all linked external libraries. So yeah nevermind there I guess.
The last thing is if the syscalls all have a set number of arguments, similar to how in x86 the max operands is 3. Or perhaps the syscalls can have any number of arguments. It seems they would want to limit it for performance, so they only use those earlier stage instructions like push and mov, instead of imul.
Thanks for the help, just looking for clarification on how arguments are handled when calling functions in assembly, especially when there are a large number of arguments.

You need to learn more about stack frame and application binary interface (ABI, or calling convention). ABI defines the way parameters are passed by the caller to the callee, which registers are volatile, as well as how the stack is cleaned up.
Many ABIs exist, because anyone can design their own ABI as long as the caller and the callee agree. However, only a few ABIs are widely used. On Windows most 32-bit programs use stdcall, cdecl, Microsoft's fastcall or Borland's fastcall, whereas 64-bit programs mostly use Microsoft x64 calling convention. On Unix 64-bit programs always use System V AMD64 ABI, which is also the one used by your compiler.
Let's look at your code, with comments:
push rbp ; save the old stack frame
mov rbp, rsp ; establish new stack frame
mov DWORD PTR [rbp-4], edi ; save the first six arguments
mov DWORD PTR [rbp-8], esi
mov DWORD PTR [rbp-12], edx
mov DWORD PTR [rbp-16], ecx
mov DWORD PTR [rbp-20], r8d
mov DWORD PTR [rbp-24], r9d
mov eax, DWORD PTR [rbp-4] ; load n1
imul eax, DWORD PTR [rbp-8] ; eax = eax * n2
imul eax, DWORD PTR [rbp-12] ; eax = eax * n3
imul eax, DWORD PTR [rbp-16]
imul eax, DWORD PTR [rbp-20]
imul eax, DWORD PTR [rbp-24]
imul eax, DWORD PTR [rbp+16] ; eax = eax * n7
imul eax, DWORD PTR [rbp+24] ; eax = eax * n8
imul eax, DWORD PTR [rbp+32]
imul eax, DWORD PTR [rbp+40]
imul eax, DWORD PTR [rbp+48]
imul eax, DWORD PTR [rbp+56]
imul eax, DWORD PTR [rbp+64]
imul eax, DWORD PTR [rbp+72]
imul eax, DWORD PTR [rbp+80]
imul eax, DWORD PTR [rbp+88]
imul eax, DWORD PTR [rbp+96]
imul eax, DWORD PTR [rbp+104]
imul eax, DWORD PTR [rbp+112]
imul eax, DWORD PTR [rbp+120]
pop rbp ; restore old stack frame
ret ; exit
Note: the first two lines have nothing to do with the arguments; they create a stack frame, so that you can easily access local variables and arguments at fixed offsets from rbp. Without the stack frame you can still access them with [rsp+*], but the offsets need to be adjusted for every PUSH and POP you execute.
Following that are instructions that store the register arguments into local variables. Registers are frequently overwritten, so arguments passed in registers need to be saved in case you need them later. In this case, however, that is not necessary, so the optimized code can be
; Body of a(n1..n20) without the redundant spills of the register arguments.
; n1..n6 arrive in edi, esi, edx, ecx, r8d, r9d; n7..n20 were pushed by the
; caller and sit at [rbp+16], [rbp+24], ... in 8-byte slots. Result in eax.
push rbp ; save the old stack frame
mov rbp, rsp ; establish new stack frame
mov eax, edi ; eax = n1
imul eax, esi ; eax = eax * n2
imul eax, edx ; eax = eax * n3
imul eax, ecx ; eax = eax * n4
imul eax, r8d ; eax = eax * n5
imul eax, r9d ; eax = eax * n6 (last register argument)
imul eax, DWORD PTR [rbp+16] ; eax = eax * n7 (first stack argument)
imul eax, DWORD PTR [rbp+24] ; eax = eax * n8
imul eax, DWORD PTR [rbp+32] ; eax = eax * n9
imul eax, DWORD PTR [rbp+40] ; eax = eax * n10
imul eax, DWORD PTR [rbp+48] ; eax = eax * n11
imul eax, DWORD PTR [rbp+56] ; eax = eax * n12
imul eax, DWORD PTR [rbp+64] ; eax = eax * n13
imul eax, DWORD PTR [rbp+72] ; eax = eax * n14
imul eax, DWORD PTR [rbp+80] ; eax = eax * n15
imul eax, DWORD PTR [rbp+88] ; eax = eax * n16
imul eax, DWORD PTR [rbp+96] ; eax = eax * n17
imul eax, DWORD PTR [rbp+104] ; eax = eax * n18
imul eax, DWORD PTR [rbp+112] ; eax = eax * n19
imul eax, DWORD PTR [rbp+120] ; eax = eax * n20
pop rbp ; restore old stack frame
ret ; exit, product is returned in eax
From the above example, you should be able to guess that the first argument is passed in edi (or rdi, di, dil, depending on the size), the second one is in esi, and then edx, ecx, r8d and r9d (integers and pointers only; floats are passed in vector registers). When you have more than 6 arguments, the others are pushed onto the stack and can be accessed using [rbp+16], [rbp+24], .... ([rbp] is the old rbp; [rbp+8] is the return address.)
For the caller
mov r9d, DWORD PTR [rbp-12] ; r9d = n6
mov r8d, DWORD PTR [rbp-12] ; r8d = n5
mov ecx, DWORD PTR [rbp-16] ; ecx = n4
mov edx, DWORD PTR [rbp-12] ; edx = n3
mov esi, DWORD PTR [rbp-16] ; esi = n2
mov eax, DWORD PTR [rbp-12] ; eax = n1 ; will assign to edi
mov edi, DWORD PTR [rbp-12] ; push n20
push rdi
mov edi, DWORD PTR [rbp-16] ; push n19
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16] ; push n7
push rdi
mov edi, eax ; edi = n1
call a() ; call the function
add rsp, 112 ; clean up the stack, 14 * 8 = 112 bytes
mov ebx, eax ; result is in eax
a more straightforward version is
; Simplified call of a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, ...).
; b's own n1 lives at [rbp-12] and its n2 at [rbp-16].
; Each push copies 8 bytes from memory; only the low 4 bytes are the int,
; and the callee reads each stack argument as a DWORD, so the upper half of
; every slot is ignored. The operand size is spelled out because a bare
; "push [mem]" is ambiguous.
mov r9d, DWORD PTR [rbp-12] ; r9d = n6
mov r8d, DWORD PTR [rbp-12] ; r8d = n5
mov ecx, DWORD PTR [rbp-16] ; ecx = n4
mov edx, DWORD PTR [rbp-12] ; edx = n3
mov esi, DWORD PTR [rbp-16] ; esi = n2
mov edi, DWORD PTR [rbp-12] ; edi = n1
push QWORD PTR [rbp-12] ; push n20
push QWORD PTR [rbp-16] ; push n19
push QWORD PTR [rbp-12] ; push n18
push QWORD PTR [rbp-16] ; push n17
push QWORD PTR [rbp-12] ; push n16
push QWORD PTR [rbp-12] ; push n15 (same value as n16, so no alternation here)
push QWORD PTR [rbp-16] ; push n14
push QWORD PTR [rbp-12] ; push n13
push QWORD PTR [rbp-16] ; push n12
push QWORD PTR [rbp-12] ; push n11
push QWORD PTR [rbp-12] ; push n10 (same value as n11)
push QWORD PTR [rbp-16] ; push n9
push QWORD PTR [rbp-12] ; push n8
push QWORD PTR [rbp-16] ; push n7
call a() ; call the function
add rsp, 112 ; clean up the stack, 14 * 8 = 112 bytes
mov ebx, eax ; result is in eax
Note that the arguments are pushed in reversed order.
Since you can push any number to the stack (before it overflows) before you call the function, there is no limitation on the number of arguments.

Related

Is there an O(1) method to choose between n branches in C, like with an array of function pointers or array of goto labels?

I'm working on a calculator, and at the part where operations are actually executed, I have a big long switch block that looks something like this (the cases go up linearly starting at 1):
switch(operator) {
case 1:
a = a + b;
break;
case 2:
a = a - b;
break;
...
case 20:
a = sin(a);
break;
I have quite a few operators and functions at this point and testing each case one at a time doesn't seem like it would be the fastest option.
Is there a way to use a table (such as an array of goto labels, or an array of function pointers) so that the "operator" variable would cause the program to jump to the appropriate operation without the program having to test for each of the cases? If so, how would I go about doing this, given the above code?
It cannot be faster than a switch, and I will show you why. Here is my version of your function with a switch:
void switch_fun(int operator) {
switch(operator) {
case 0:
a = a + b;
break;
case 1:
a = a - b;
break;
case 2:
a = a * b;
break;
}
}
Which translate to this assembler (with comments for non-assembly people)
## Unoptimized compiler output for switch_fun(int operator).
## The argument arrives in edi; a and b are globals addressed relative to rip.
switch_fun:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi ## copy 'operator' into a stack slot
cmp DWORD PTR [rbp-4], 2 ## compare 'operator' with 2
je .L2 ## if 'operator' is equal to 2, jump to .L2 (multiplication, a few lines below)
cmp DWORD PTR [rbp-4], 0 ## same comparison for 0
je .L4 ## 0 -> addition
cmp DWORD PTR [rbp-4], 1 ## and for 1
je .L5 ## 1 -> subtraction
jmp .L6 ## if nothing matches, jump to the end of the function
.L4:
mov edx, DWORD PTR a[rip]
mov eax, DWORD PTR b[rip]
add eax, edx ## addition, as in case 0:
mov DWORD PTR a[rip], eax ## store the result back into a
jmp .L3 ## jump to the end
.L5:
mov eax, DWORD PTR a[rip]
mov edx, DWORD PTR b[rip]
sub eax, edx ## subtraction, as in case 1:
mov DWORD PTR a[rip], eax
jmp .L3
.L2:
mov edx, DWORD PTR a[rip]
mov eax, DWORD PTR b[rip]
imul eax, edx ## multiplication, as in case 2:
mov DWORD PTR a[rip], eax
nop
.L3:
.L6:
nop
pop rbp
ret
It's pretty simple construction and probably impossible to optimize even more, but let's try. Here I have version of this function but with array of addresses to goto labels.
void array_fun(int operator) {
void* labels[] = {&&addition, &&subtraction, &&mul};
goto *labels[operator];
addition:
a = a + b;
goto outer;
subtraction:
a = a - b;
goto outer;
mul:
a = a * b;
outer:
return;
}
Ignoring the fact that this code is really unsafe (if operator is negative or greater than 2, the indexed jump reads outside the array and will most likely crash the program), let's see how this looks in assembly.
## Unoptimized compiler output for array_fun(int operator).
## The label table is rebuilt on the stack on every call, then indexed by 'operator'.
array_fun:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-36], edi ## copy 'operator' into a stack slot
mov QWORD PTR [rbp-32], OFFSET FLAT:.L8 ## labels[0] = address of 'addition'
mov QWORD PTR [rbp-24], OFFSET FLAT:.L9 ## labels[1] = address of 'subtraction'
mov QWORD PTR [rbp-16], OFFSET FLAT:.L10 ## labels[2] = address of 'mul'; the comparisons are gone here,
## but new stores and address calculations were added. Everything below is practically the same as in the first function, so this is not obviously faster than the switch and may well be slower.
mov eax, DWORD PTR [rbp-36] ## eax = operator
cdqe ## sign-extend eax into rax so it can be used as an index
mov rax, QWORD PTR [rbp-32+rax*8] ## rax = labels[operator] (no bounds check)
nop
jmp rax ## indirect jump to the selected label
.L8:
mov edx, DWORD PTR a[rip]
mov eax, DWORD PTR b[rip]
add eax, edx ## addition
mov DWORD PTR a[rip], eax
jmp .L12
.L9:
mov eax, DWORD PTR a[rip]
mov edx, DWORD PTR b[rip]
sub eax, edx ## subtraction
mov DWORD PTR a[rip], eax
jmp .L12
.L10:
mov edx, DWORD PTR a[rip]
mov eax, DWORD PTR b[rip]
imul eax, edx ## multiplication
mov DWORD PTR a[rip], eax
.L12:
nop
pop rbp
ret
TL;DR don't try to optimize switch on integers because it's impossible to do
EDIT: With array of pointers to functions it would be even worse because you will get overhead because of calling a function

float to double (IEEE754) conversion

I'm trying to convert a 32-bit float to a 64-bit double in asm on the x86 architecture. The conversion is done by a function written in asm, which I then want to call from C. I have no idea what I'm doing wrong, but the memory pointed to by dst seems to stay untouched, and after printf the program crashes. I want to do it without any floating-point instructions. Here's the code:
.686
.model flat
public _conv
.data
mantissa_mask dd 00000000011111111111111111111111b
exponent_mask dd 01111111100000000000000000000000b
.code
_conv PROC
pusha
mov ebp, esp
mov esi, dword ptr [ebp+8] ; src
mov edi, dword ptr [ebp+12]; dst
mov dword ptr [edi], 0
mov dword ptr [edi+4], 0
mov eax, dword ptr [esi]
and eax, dword ptr mantissa_mask
mov dword ptr [edi], eax
xor edx, edx ; zero edx
mov ecx, 1
shl ecx, 29 ;ecx == 2^29
mul ecx ;so it's like `shl edx:eax, 29`
mov dword ptr [edi], eax
mov dword ptr [edi+4], edx
mov eax, dword ptr [esi]
and eax, dword ptr exponent_mask
shr eax, 23 ;put exponent on lowest bits
sub eax, 127 ;exponent in float is coded enlarged by 127
add eax, 1023 ;in double it's enlarged by 1023
shl eax, 20 ;exponent in double starts on 20bit of 2nd byte
or dword ptr [edi], eax
;sign bit:
bt dword ptr [esi], 31
jc set_sign_bit
btr dword ptr [edi+4], 31
jmp endthis
set_sign_bit:
bts dword ptr [edi+4], 31
endthis:
popa
ret
_conv ENDP
END
And the C code:
void conv(float * src, double * dst);
int main()
{
float src = 4.5f;
double dst = 0.;
conv(&src, &dst);
printf("%f\n", dst);
return 0;
}
Your primary problem is accessing the arguments. Since you did pusha the arguments are not at [ebp+8] and [ebp+12], rather at [ebp+36] and [ebp+40]. A debugger would have shown you this right away. Even with those changes your code is still broken though.
Ok, finally it works. Very helpful was Jester's advice about args access. Stupid thing, but hard to notice. Here's final code:
.686
.model flat
public _conv
.data
mantissa_mask dd 00000000011111111111111111111111b
exponent_mask dd 01111111100000000000000000000000b
.code
;-----------------------------------------------------------------------
; void conv(float *src, double *dst)      (32-bit cdecl, MASM)
; Widens the IEEE 754 single at *src to a double at *dst using integer
; instructions only. Handles normal numbers, +/-0, denormals, Inf and NaN.
; All registers are preserved (pusha/popa); nothing is returned.
;-----------------------------------------------------------------------
_conv PROC
pusha
mov ebp, esp
;+36 and +40 since pusha (8 registers = 32 bytes, plus the return address)
mov esi, dword ptr [ebp+36]; src
mov edi, dword ptr [ebp+40]; dst
mov ebx, dword ptr [esi] ;ebx = raw bits of the float
;mantissa:
mov eax, ebx
and eax, dword ptr mantissa_mask ;eax = 23-bit fraction
;exponent:
mov ecx, ebx
and ecx, dword ptr exponent_mask
shr ecx, 23 ;ecx = biased float exponent, 0..255
cmp ecx, 255
je inf_or_nan ;all ones: Inf (fraction == 0) or NaN
test ecx, ecx
jnz rebias_exponent ;1..254: ordinary normal number
test eax, eax
jz pack_double ;+/-0: exponent and fraction both stay zero
;denormal: value = fraction * 2^-149, and there is no implicit leading 1.
;Every float denormal is a normal double, so shift the highest set bit up
;to bit 23 (where the implicit 1 would be) and lower the exponent to match.
bsr edx, eax ;edx = index of the highest set bit, 0..22
mov ecx, 23
sub ecx, edx ;ecx = shift count, 1..23
shl eax, cl
and eax, dword ptr mantissa_mask ;drop the leading 1, it is implicit now
neg ecx
add ecx, 1 ;effective biased float exponent = 1 - shift
rebias_exponent:
add ecx, 1023-127 ;float bias is 127, double bias is 1023
jmp pack_double
inf_or_nan:
mov ecx, 2047 ;all-ones double exponent; the NaN payload is kept in the fraction
pack_double:
;edx:eax = fraction << 29 (the 23 fraction bits become the top of the 52)
mov edx, eax
shr edx, 3 ;upper 20 fraction bits go to the high dword
shl eax, 29 ;lower 3 fraction bits go to the top of the low dword
shl ecx, 20 ;exponent occupies bits 20..30 of the high dword
or edx, ecx
;sign bit:
and ebx, 80000000h ;keep only the sign of the float
or edx, ebx ;the sign is bit 31 of the high dword
mov dword ptr [edi], eax
mov dword ptr [edi+4], edx
popa
ret
_conv ENDP
END

assembly output + questions about stacks

I was studying one of my courses when I ran into a specific exercise that I cannot seem to resolve... It is pretty basic because I am VERY new to assembly. So lets begin.
I have a C function
unsigned int func(int *ptr, unsigned int j) {
unsigned int res = j;
int i = ptr[j+1];
for(; i<8; ++i) {
res >>= 1;
}
return res;
}
I translated it with gcc to assembly
.file "func.c"
.intel_syntax noprefix
.text
.globl func
.type func, #function
func:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov eax, DWORD PTR [rbp-28]
mov DWORD PTR [rbp-8], eax
mov eax, DWORD PTR [rbp-28]
add eax, 1
mov eax, eax
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-4], eax
jmp .L2
.L3:
shr DWORD PTR [rbp-8]
add DWORD PTR [rbp-4], 1
.L2:
cmp DWORD PTR [rbp-4], 7
jle .L3
mov eax, DWORD PTR [rbp-8]
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size func, .-func
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4"
.section .note.GNU-stack,"",#progbits
The question is as follow. what is the command that place j (variable in the c function) on top of the stack?
I sincerely cannot find out please enlighten me XD.
The variable j is the second parameter of func; under the x86-64 System V calling convention it arrives in the register esi. The instruction mov DWORD PTR [rbp-28], esi stores j on the stack.
You can see it very clearly by writing a simple function that calls "func" and compiling it with -O0 (or with -O2 and marking it as noinline, or only providing a prototype so there's nothing for the compiler to inline).
unsigned int func(int *ptr, unsigned int j) {
unsigned int res = j;
int i = ptr[j+1];
for(; i<8; ++i) {
res >>= 1;
}
return res;
}
int main()
{
int a = 1;
int array[10];
func (array, a);
return 0;
}
Using the Godbolt compiler explorer, we can easily get gcc -O0 -fverbose-asm assembly output.
Please focus on the following instructions:
# in main:
...
mov DWORD PTR [rbp-4], 1
mov edx, DWORD PTR [rbp-4]
...
mov esi, edx
...
func(int*, unsigned int):
...
mov DWORD PTR [rbp-28], esi # j, j
...
j, j is a comment added by gcc -fverbose-asm; it tells you that the source and destination operands of that instruction are both the C variable j.
The full assembly instructions:
func(int*, unsigned int):
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov eax, DWORD PTR [rbp-28]
mov DWORD PTR [rbp-4], eax
mov eax, DWORD PTR [rbp-28]
add eax, 1
mov eax, eax
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-8], eax
jmp .L2
.L3:
shr DWORD PTR [rbp-4]
add DWORD PTR [rbp-8], 1
.L2:
cmp DWORD PTR [rbp-8], 7
jle .L3
mov eax, DWORD PTR [rbp-4]
pop rbp
ret
main:
push rbp
mov rbp, rsp
sub rsp, 48
mov DWORD PTR [rbp-4], 1
mov edx, DWORD PTR [rbp-4]
lea rax, [rbp-48]
mov esi, edx
mov rdi, rax
call func(int*, unsigned int)
mov eax, 0
leave
ret
Taking into account these instructions
mov eax, DWORD PTR [rbp-28]
add eax, 1
it seems that j is stored at address rbp-28, while ptr is stored at address rbp-24.
These are instructions where the values are stored in the stack
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
It seems the arguments are passed to the function using registers rdi and esi.
Passing these arguments in registers is not an optimization by the compiler: the calling convention (here the x86-64 System V ABI) passes small arguments such as integers and pointers in registers instead of on the stack. Within the function, unoptimized code then uses the stack to temporarily store the arguments that arrived in registers.
Just a suggestion for further explorations on your own. Use gcc -O0 -g2 f.c -Wa,-adhln. It will turn off optimizations and generate assembly code intermixed with the source. It might give you better ideas about what it does.
As an alternative you can use the objdump -Sd f.o on the output '.o' or executable. Just make sure that you add debugging info and turn off optimizations at compilation.

Is it faster in C to use a written jump table or switch statement?

So, I am trying to see if there is any difference between using a jump table of function pointers versus a switch statements for performing many, one command operations like these.
This is the code to assembly link i made
Here is my actual code as well
enum code {
ADD,
SUB,
MUL,
DIV,
REM
};
typedef struct {
int val;
} Value;
typedef struct {
enum code ins;
int operand;
} Op;
void run(Value* arg, Op* func)
{
switch(func->ins)
{
case ADD: arg->val += func->operand; break;
case SUB: arg->val -= func->operand; break;
case MUL: arg->val *= func->operand; break;
case DIV: arg->val /= func->operand; break;
case REM: arg->val %= func->operand; break;
}
}
My question is, based on the generated assembly in that link or the code, would there be any difference from making a bunch of small functions to complete the operations in the cases of the switch statement, and making an array of pointers to those functions and calling them with the same enum?
Using gcc x86_64 7.1
void add(Value* arg, Op* func)
{
arg->val += func->operand;
}
/* Table of operation handlers, intended to be indexed by enum code.
   Only ADD is filled in for this sketch. The array brackets belong to the
   declarator name: jmptable is an array of pointers to functions. */
static void (*jmptable[])(Value*, Op*) = {
&add
};
Assembly code paste:
run(Value*, Op*):
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], rdi
mov QWORD PTR [rbp-16], rsi
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax]
cmp eax, 4
ja .L9
mov eax, eax
mov rax, QWORD PTR .L4[0+rax*8]
jmp rax
.L4:
.quad .L3
.quad .L5
.quad .L6
.quad .L7
.quad .L8
.L3:
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax+4]
add edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L5:
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax+4]
sub edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L6:
mov rax, QWORD PTR [rbp-8]
mov edx, DWORD PTR [rax]
mov rax, QWORD PTR [rbp-16]
mov eax, DWORD PTR [rax+4]
imul edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L7:
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov rdx, QWORD PTR [rbp-16]
mov esi, DWORD PTR [rdx+4]
cdq
idiv esi
mov edx, eax
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
jmp .L2
.L8:
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov rdx, QWORD PTR [rbp-16]
mov ecx, DWORD PTR [rdx+4]
cdq
idiv ecx
mov rax, QWORD PTR [rbp-8]
mov DWORD PTR [rax], edx
nop
.L2:
.L9:
nop
pop rbp
ret
A catchall answer to all these questions: you should measure.
Practically though, I'm betting on the switch version. Function calls have overhead (and they can be hardly inlined in this context), which you could eliminate with labels as values, which is a common compiler extension*, but you should really try all your options and measure if the performance of this piece of code matters to you greatly.
Otherwise, use whatever's most convenient to you.
*a switch is likely to generate a jump table equivalent to what you could compose from labels as values but it could switch between different implementations depending on the particular case values and their number
Can you spot the difference? Trust the compiler (it will do such micro-optimisations much better than you) - and do not forget the break statements. Care about the algorithm, not about such small details.
https://godbolt.org/g/sPxse2
Looks like due to branch prediction and bounds checking, using the switch labels as jump points may be up to 20% faster on older systems - newer systems having better branch prediction. Basically, this relies on a compiler extension. You still have the switch, but the switch doesn't fall through to the dispatcher. Instead, each case has its own dispatcher that jumps directly into the case. A number of popular VMs do this.
See here for more info and examples:https://www.cipht.net/2017/10/03/are-jump-tables-always-fastest.html

Wrong Visual Studio assembly output?

I wrote this classic function : (in 32-bit mode)
void ex(size_t a, size_t b)
{
size_t c;
c = a;
a = b;
b = c;
}
I call it inside the main as follows :
size_t a = 4;
size_t b = 5;
ex(a,b);
What I was expecting from the assembly code generated when entering the function is something like this :
1-Push the values of b and a in the stack : (which was done)
mov eax,dword ptr [b]
push eax
mov ecx,dword ptr [a]
push ecx
2-Use the values of a and b in the stack :
push ebp
mov ebp, esp
sub esp, 4
c = a;
mov eax, dword ptr [ebp+8]
mov dword ptr [ebp-4], eax
and so on for the other variables.
However, this is what I find when debugging :
push ebp
mov ebp,esp
sub esp,0CCh // normal since it's in debug with ZI option
push ebx
push esi
push edi
lea edi,[ebp-0CCh]
mov ecx,33h
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]
size_t c;
c = a;
mov eax,dword ptr [a]
mov dword ptr [c],eax
Why is it using the variable a directly instead of calling the value stored in the stack? I don't understand...
The debugger simply doesn't show that the instruction uses ebp to access a; it substitutes the symbol name for the ebp-relative address. The same syntax is permitted when you write inline assembly, which is also why dword ptr still appears.
It is easy to get it your preferred way, right click > untick "Show Symbol Names".
Using the assembly output option (right click on file name, properties, ...), I get what you expect from debug assembly output. This could depend on which version of VS you use. For this example, I used VS2005. I have VS2015 on a different system, but didn't try it yet.
_c$ = -8 ; size = 4
_a$ = 8 ; size = 4
_b$ = 12 ; size = 4
_ex PROC ; COMDAT
push ebp
mov ebp, esp
sub esp, 204 ; 000000ccH
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-204]
mov ecx, 51 ; 00000033H
mov eax, -858993460 ; ccccccccH
rep stosd ;fill with 0cccccccch
mov eax, DWORD PTR _a$[ebp]
mov DWORD PTR _c$[ebp], eax
mov eax, DWORD PTR _b$[ebp]
mov DWORD PTR _a$[ebp], eax
mov eax, DWORD PTR _c$[ebp]
mov DWORD PTR _b$[ebp], eax
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
_ex ENDP
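Note that this function doesn't actually swap anything for the caller: a and b are passed by value, so only the function's local copies are exchanged. You need to use pointers for the swap to work.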
void ex(size_t *pa, size_t *pb)
{
size_t c;
c = *pa;
*pa = *pb;
*pb = c;
}
which gets translated into:
_c$ = -8 ; size = 4
_pa$ = 8 ; size = 4
_pb$ = 12 ; size = 4
_ex PROC ; COMDAT
push ebp
mov ebp, esp
sub esp, 204 ; 000000ccH
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-204]
mov ecx, 51 ; 00000033H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR _pa$[ebp]
mov ecx, DWORD PTR [eax]
mov DWORD PTR _c$[ebp], ecx
mov eax, DWORD PTR _pa$[ebp]
mov ecx, DWORD PTR _pb$[ebp]
mov edx, DWORD PTR [ecx]
mov DWORD PTR [eax], edx
mov eax, DWORD PTR _pb$[ebp]
mov ecx, DWORD PTR _c$[ebp]
mov DWORD PTR [eax], ecx
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
_ex ENDP

Resources