I want to have a register containing 4 bytes of address and 4 bytes of data. For that, I thought about building it either as an array of structures (with the address and the data as members) or as a matrix. Here is some sample code to test what I want to achieve:
#include <stdio.h>
#include <stdint.h>
void reg_init();
#define REG_SIZE 1
typedef struct Reg{
uint8_t addr[4];
uint8_t data[4];
} reg;
static reg reg_struct[REG_SIZE];
static uint8_t reg_matrix[REG_SIZE][8];
// Test driver: fills both register layouts via reg_init(), then searches each
// one for a fixed 4-byte address and prints the index of the matching entry.
int main()
{
int index=-1; // stays negative until a lookup matches
reg_init();
for(int i=0; i<REG_SIZE; i++)
{
uint8_t addr_to_check[4]={0x12,0x34,0x56,0x78};
// Struct layout: compare the address byte by byte, stop at the first mismatch
for(int j=0; j<4; j++)
{
if(addr_to_check[j]!=reg_struct[i].addr[j]) break;
if(j==3) index = i; // reached the last byte without a mismatch: entry i matches
}
// Matrix layout: same comparison; the address occupies columns 0..3 of row i
for(int j=0; j<4; j++)
{
if(addr_to_check[j]!=reg_matrix[i][j]) break;
if(j==3) index = i; // both lookups write the same variable
}
}
if (index<0) printf("Address not found\n");
else printf("Address at index: %i",index);
return 0;
}
// Fills entry 0 of both layouts with the same test record:
// address bytes 12 34 56 78 followed by data bytes 01 02 03 04.
void reg_init()
{
// Struct layout: address and data are separate 4-byte members
reg_struct[0].addr[0] = 0x12;
reg_struct[0].addr[1] = 0x34;
reg_struct[0].addr[2] = 0x56;
reg_struct[0].addr[3] = 0x78;
reg_struct[0].data[0] = 0x01;
reg_struct[0].data[1] = 0x02;
reg_struct[0].data[2] = 0x03;
reg_struct[0].data[3] = 0x04;
// Matrix layout: columns 0..3 hold the address, columns 4..7 the data
reg_matrix[0][0] = 0x12;
reg_matrix[0][1] = 0x34;
reg_matrix[0][2] = 0x56;
reg_matrix[0][3] = 0x78;
reg_matrix[0][4] = 0x01;
reg_matrix[0][5] = 0x02;
reg_matrix[0][6] = 0x03;
reg_matrix[0][7] = 0x04;
}
The example shows a register with just a single entry, but the real size will be much larger (up to 8 bytes). Overall, I'm interested in optimizing for performance. Does it really matter which one I use, or will the compiler generate the same machine code?
Below is the assembly for the above code, generated with Visual Studio 2019.
Look at the records marked ; Line 50 and ; Line 51: the compiler has generated the same form of assembly code for both the matrix and the structure.
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.20.27508.1
TITLE D:\main.c
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ___local_stdio_printf_options
PUBLIC __vfprintf_l
PUBLIC _printf
PUBLIC _reg_init
PUBLIC _main
EXTRN ___acrt_iob_func:PROC
EXTRN ___stdio_common_vfprintf:PROC
_DATA SEGMENT
COMM ?_OptionsStorage#?1??__local_stdio_printf_options##9#9:QWORD ; `__local_stdio_printf_options'::`2'::_OptionsStorage
_DATA ENDS
_BSS SEGMENT
_reg_struct DQ 01H DUP (?)
_reg_matrix DB 08H DUP (?)
_BSS ENDS
_DATA SEGMENT
$SG8132 DB 'Address not found', 0aH, 00H
ORG $+1
$SG8133 DB 'Address at index: %i', 00H
_DATA ENDS
; Function compile flags: /Odtp
_TEXT SEGMENT
; int main(void) -- unoptimized (/Od) MSVC x86 output.
; Locals live below ebp; the equates give their frame offsets.
; Both lookups use the identical addressing form base[j + i*8]; only the
; base symbol (_reg_struct vs. _reg_matrix) differs.
; The "; Line N" markers are the compiler's own source line numbers.
_index$ = -20 ; size = 4
_addr_to_check$1 = -16 ; size = 4
_j$2 = -12 ; size = 4
_j$3 = -8 ; size = 4
_i$4 = -4 ; size = 4
_main PROC
; File D:\main.c
; Line 16
push ebp
mov ebp, esp
sub esp, 20 ; 00000014H
; Line 17
mov DWORD PTR _index$[ebp], -1
; Line 18
call _reg_init
; Line 19
mov DWORD PTR _i$4[ebp], 0
jmp SHORT $LN4@main
$LN2@main:
; i++
mov eax, DWORD PTR _i$4[ebp]
add eax, 1
mov DWORD PTR _i$4[ebp], eax
$LN4@main:
; i < REG_SIZE (REG_SIZE folded to the constant 1)
cmp DWORD PTR _i$4[ebp], 1
jge $LN3@main
; Line 21
mov BYTE PTR _addr_to_check$1[ebp], 18 ; 00000012H
mov BYTE PTR _addr_to_check$1[ebp+1], 52 ; 00000034H
mov BYTE PTR _addr_to_check$1[ebp+2], 86 ; 00000056H
mov BYTE PTR _addr_to_check$1[ebp+3], 120 ; 00000078H
; Line 23
mov DWORD PTR _j$3[ebp], 0
jmp SHORT $LN7@main
$LN5@main:
; j++ (struct loop)
mov ecx, DWORD PTR _j$3[ebp]
add ecx, 1
mov DWORD PTR _j$3[ebp], ecx
$LN7@main:
cmp DWORD PTR _j$3[ebp], 4
jge SHORT $LN6@main
; Line 25
mov edx, DWORD PTR _j$3[ebp]
movzx eax, BYTE PTR _addr_to_check$1[ebp+edx]
mov ecx, DWORD PTR _j$3[ebp]
mov edx, DWORD PTR _i$4[ebp]
movzx ecx, BYTE PTR _reg_struct[ecx+edx*8] ; reg_struct[i].addr[j]: 8-byte element, addr at offset 0
cmp eax, ecx
je SHORT $LN11@main
jmp SHORT $LN6@main ; mismatch: break out of the struct loop
$LN11@main:
; Line 26
cmp DWORD PTR _j$3[ebp], 3
jne SHORT $LN12@main
mov edx, DWORD PTR _i$4[ebp]
mov DWORD PTR _index$[ebp], edx
$LN12@main:
; Line 27
jmp SHORT $LN5@main
$LN6@main:
; Line 30
mov DWORD PTR _j$2[ebp], 0
jmp SHORT $LN10@main
$LN8@main:
; j++ (matrix loop)
mov eax, DWORD PTR _j$2[ebp]
add eax, 1
mov DWORD PTR _j$2[ebp], eax
$LN10@main:
cmp DWORD PTR _j$2[ebp], 4
jge SHORT $LN9@main
; Line 32
mov ecx, DWORD PTR _j$2[ebp]
movzx edx, BYTE PTR _addr_to_check$1[ebp+ecx]
mov eax, DWORD PTR _j$2[ebp]
mov ecx, DWORD PTR _i$4[ebp]
movzx eax, BYTE PTR _reg_matrix[eax+ecx*8] ; reg_matrix[i][j]: 8-byte row
cmp edx, eax
je SHORT $LN13@main
jmp SHORT $LN9@main ; mismatch: break out of the matrix loop
$LN13@main:
; Line 33
cmp DWORD PTR _j$2[ebp], 3
jne SHORT $LN14@main
mov ecx, DWORD PTR _i$4[ebp]
mov DWORD PTR _index$[ebp], ecx
$LN14@main:
; Line 34
jmp SHORT $LN8@main
$LN9@main:
; Line 35
jmp $LN2@main
$LN3@main:
; Line 36
cmp DWORD PTR _index$[ebp], 0
jge SHORT $LN15@main
push OFFSET $SG8132
call _printf
add esp, 4
jmp SHORT $LN16@main
$LN15@main:
; Line 37
mov edx, DWORD PTR _index$[ebp]
push edx
push OFFSET $SG8133
call _printf
add esp, 8
$LN16@main:
; Line 38
xor eax, eax
; Line 39
mov esp, ebp
pop ebp
ret 0
_main ENDP
_TEXT ENDS
; Function compile flags: /Odtp
_TEXT SEGMENT
; void reg_init(void) -- unoptimized (/Od) MSVC x86 output.
; Every store follows the same pattern: compute (element size 8) * (index 0)
; in one register, compute the byte offset within the element in another
; (1*k, emitted as imul or shl), then store through base[reg+reg].
; The struct's data member adds a constant +4 displacement; the matrix
; instead folds 4..7 into the computed column offset.
_reg_init PROC
; File D:\main.c
; Line 41
push ebp
mov ebp, esp
; Line 43
mov eax, 8
imul ecx, eax, 0 ; ecx = 8 * 0: byte offset of element [0]
mov edx, 1
imul eax, edx, 0 ; eax = 1 * 0: byte offset of addr[0]
mov BYTE PTR _reg_struct[ecx+eax], 18 ; 00000012H
; Line 44
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
shl eax, 0 ; eax = 1 << 0 = 1: byte offset of addr[1]
mov BYTE PTR _reg_struct[edx+eax], 52 ; 00000034H
; Line 45
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
shl eax, 1 ; eax = 1 << 1 = 2: byte offset of addr[2]
mov BYTE PTR _reg_struct[edx+eax], 86 ; 00000056H
; Line 46
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 3 ; ecx = 1 * 3: byte offset of addr[3]
mov BYTE PTR _reg_struct[edx+ecx], 120 ; 00000078H
; Line 47
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 0
mov BYTE PTR _reg_struct[eax+edx+4], 1 ; +4 = offset of the data member
; Line 48
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 0
mov BYTE PTR _reg_struct[ecx+edx+4], 2
; Line 49
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 1
mov BYTE PTR _reg_struct[ecx+edx+4], 3
; Line 50
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 3
mov BYTE PTR _reg_struct[ecx+eax+4], 4
; Line 51
mov ecx, 8
imul edx, ecx, 0 ; edx = 8 * 0: byte offset of row [0]
mov eax, 1
imul ecx, eax, 0 ; ecx = 1 * 0: column 0
mov BYTE PTR _reg_matrix[edx+ecx], 18 ; 00000012H
; Line 52
mov edx, 8
imul eax, edx, 0
mov ecx, 1
shl ecx, 0
mov BYTE PTR _reg_matrix[eax+ecx], 52 ; 00000034H
; Line 53
mov edx, 8
imul eax, edx, 0
mov ecx, 1
shl ecx, 1
mov BYTE PTR _reg_matrix[eax+ecx], 86 ; 00000056H
; Line 54
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 3
mov BYTE PTR _reg_matrix[eax+edx], 120 ; 00000078H
; Line 55
mov eax, 8
imul ecx, eax, 0
mov edx, 1
shl edx, 2 ; edx = 1 << 2 = 4: column 4, no constant displacement needed
mov BYTE PTR _reg_matrix[ecx+edx], 1
; Line 56
mov eax, 8
imul ecx, eax, 0
mov edx, 1
imul eax, edx, 5
mov BYTE PTR _reg_matrix[ecx+eax], 2
; Line 57
mov ecx, 8
imul edx, ecx, 0
mov eax, 1
imul ecx, eax, 6
mov BYTE PTR _reg_matrix[edx+ecx], 3
; Line 58
mov edx, 8
imul eax, edx, 0
mov ecx, 1
imul edx, ecx, 7
mov BYTE PTR _reg_matrix[eax+edx], 4
; Line 59
pop ebp
ret 0
_reg_init ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT _printf
_TEXT SEGMENT
; Inline printf wrapper from the UCRT <stdio.h>, emitted into this object.
; Builds a pointer to the variadic arguments and forwards to __vfprintf_l.
__Result$ = -8 ; size = 4
__ArgList$ = -4 ; size = 4
__Format$ = 8 ; size = 4
_printf PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\stdio.h
; Line 954
push ebp
mov ebp, esp
sub esp, 8
; Line 957
lea eax, DWORD PTR __Format$[ebp+4] ; address of the stack slot just above _Format (first variadic argument)
mov DWORD PTR __ArgList$[ebp], eax
; Line 958
mov ecx, DWORD PTR __ArgList$[ebp]
push ecx ; _ArgList
push 0 ; _Locale
mov edx, DWORD PTR __Format$[ebp]
push edx ; _Format
push 1
call ___acrt_iob_func ; returns the stream for index 1 -- presumably stdout; confirm against the UCRT docs
add esp, 4
push eax ; _Stream
call __vfprintf_l
add esp, 16 ; 00000010H
mov DWORD PTR __Result$[ebp], eax
; Line 959
mov DWORD PTR __ArgList$[ebp], 0
; Line 960
mov eax, DWORD PTR __Result$[ebp]
; Line 961
mov esp, ebp
pop ebp
ret 0
_printf ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT __vfprintf_l
_TEXT SEGMENT
; Inline UCRT helper: forwards its four arguments, preceded by the 64-bit
; options value read through ___local_stdio_printf_options, to
; ___stdio_common_vfprintf.
__Stream$ = 8 ; size = 4
__Format$ = 12 ; size = 4
__Locale$ = 16 ; size = 4
__ArgList$ = 20 ; size = 4
__vfprintf_l PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\stdio.h
; Line 642
push ebp
mov ebp, esp
; Line 643
mov eax, DWORD PTR __ArgList$[ebp]
push eax
mov ecx, DWORD PTR __Locale$[ebp]
push ecx
mov edx, DWORD PTR __Format$[ebp]
push edx
mov eax, DWORD PTR __Stream$[ebp]
push eax
call ___local_stdio_printf_options ; eax = pointer to the 8-byte options storage
mov ecx, DWORD PTR [eax+4]
push ecx ; high dword of the options value
mov edx, DWORD PTR [eax]
push edx ; low dword of the options value
call ___stdio_common_vfprintf
add esp, 24 ; 00000018H
; Line 644
pop ebp
ret 0
__vfprintf_l ENDP
_TEXT ENDS
; Function compile flags: /Odtp
; COMDAT ___local_stdio_printf_options
_TEXT SEGMENT
; Returns in eax the address of the function-local static _OptionsStorage.
; NOTE(review): the '#' characters in the decorated symbol below look like
; garbled '@' characters (MSVC name decoration uses '@'); the same spelling
; appears in the COMM declaration in _DATA, so fix both together.
___local_stdio_printf_options PROC ; COMDAT
; File C:\Program Files (x86)\Windows Kits\10\include\10.0.17763.0\ucrt\corecrt_stdio_config.h
; Line 86
push ebp
mov ebp, esp
; Line 88
mov eax, OFFSET ?_OptionsStorage#?1??__local_stdio_printf_options##9#9 ; `__local_stdio_printf_options'::`2'::_OptionsStorage
; Line 89
pop ebp
ret 0
___local_stdio_printf_options ENDP
_TEXT ENDS
END
I have never learned the C language, so it confuses me. I would just like to know whether I did this correctly or where I need to improve. For this code I used 32-bit x86 assembly. Thanks
This is what I supposed to do:
Write a procedure with the signature
char *strchar(char *s1, char c1)
that returns a pointer to the first occurrence of the character c1 within the string s1 or, if not found, returns a null.
This is what I came out with:
; char *strchar(char *s1, char c1) -- asker's attempt.
; Takes s1 in edi and c1 in esi and keeps both in stack slots below ebp.
strchar (char*, char):
push ebp
mov ebp, esp
mov dword ptr [ebp-24], edi ; [ebp-24] = current position in s1; NOTE(review): no "sub esp, N" precedes this, so the slot lies below esp
mov EAX , esi
mov BYTE PTR [ebp-28], al ; [ebp-28] = c1
.L5:
mov EAX , dword ptr [ebp-24]
movzx EAX , byte ptr [ EAX ] ; eax = *s1
test AL, AL
je .L2 ; reached the terminating 0 byte: not found
mov EAX , dword PTR [ebp-24]
movzx EAX , BYTE PTR [ EAX ] ; reload *s1
cmp BYTE PTR [ebp-28], al
jne .L3
mov eax, dword PTR [ebp-24] ; match: return the current pointer
jmp .L6
.L3:
add dword PTR [ebp-24], 1 ; s1++
jmp .L5
.L2:
; NOTE(review): this path returns the address ebp-9 (a stack location),
; not 0, although the specification asks for a null pointer here.
LEA eax, [ebp-9]
MOV DWORD PTR [EBP-8], eax
MOV EAX, DWORD PTR [ebp-8]
.L6:
POP EBP
RET
The lines:
mov dword ptr [ebp-24], edi
mov EAX , esi
mov BYTE PTR [ebp-28], al
assume that a stack frame has been allocated for this function which doesn’t appear true; I think you should have something like:
sub esp, 32
after the
mov ebp,esp
Also, the three lines after L2 seem confused. The only way to get to L2 is if the nil (0) byte is discovered in the string, at which point, the code should return a NULL pointer.
The exit path in the code (L6) leaves eax alone, so all that should be needed is:
L2:
mov eax, 0
It might make debugging easier if you kept the alias up to date; so:
L2:
mov eax, 0
mov [ebp-24], eax
Also, the calling convention used here is a bit odd: the string is passed in edi and the character in esi. Normally, in x86-32, these would both be passed on the stack. This looks like it might have been x86-64 code, converted to x86-32....
A final note; this assembly code looks like the output of a compiler with optimisations disabled. Often, generating the assembly with the optimisations enabled generates easier to understand code. This code, for example, could be much more concisely written as below, without even devolving into weird intel ops:
; char *strchar(char *s1, char c1)
; In:    edi = s1, esi = c1 (same register convention as the code above)
; Out:   eax = pointer to the first occurrence of c1, or 0 if the
;        terminating 0 byte is reached first
; Clobb: edx, flags
strchar:
        mov     eax, edi                ; eax walks the string
        mov     edx, esi                ; dl = c1
        jmp     scan
advance:
        inc     eax
scan:
        mov     dh, [eax]               ; dh = current character
        test    dh, dh
        jz      miss                    ; terminator is checked before the match
        cmp     dh, dl
        jne     advance
        ret                             ; eax already points at the match
miss:
        xor     eax, eax                ; return NULL
        ret
Here is one with stack overhead
;-----------------------------------------------------------------------
; char *strchar(char *s1, char c1)      -- cdecl, 32-bit, with frame pointer
; In:    [ebp+8] = s1, [ebp+12] = c1
; Out:   eax = pointer to the first occurrence of c1, or 0 if the
;        terminating 0 byte is reached first
; Clobb: ecx, dl, flags
;-----------------------------------------------------------------------
[global strchar]
strchar:
        push    ebp
        mov     ebp, esp
        mov     ecx, dword [ebp + 8]    ; ecx = s1
        mov     dl, byte [ebp + 12]     ; dl  = c1
.scan:
        movzx   eax, byte [ecx]         ; eax = *s1 zero-extended, so eax == 0 at the terminator
        test    al, al
        jz      .done                   ; not found: eax is already NULL
        cmp     al, dl
        je      .hit
        inc     ecx
        jmp     .scan
.hit:
        mov     eax, ecx                ; return the position of the match
.done:
        pop     ebp                     ; esp still equals ebp here
        ret
Here is one without stack overhead
;-----------------------------------------------------------------------
; char *strchar(char *s1, char c1)      -- cdecl, 32-bit, frameless
; In:    [esp+4] = s1, [esp+8] = c1
; Out:   eax = pointer to the first occurrence of c1, or 0 if the
;        terminating 0 byte is reached first
; Clobb: ecx, dl, flags
;-----------------------------------------------------------------------
[global strchar]
strchar:
        mov     ecx, dword [esp + 4]    ; ecx = s1
        mov     dl, byte [esp + 8]      ; dl  = c1
.scan:
        movzx   eax, byte [ecx]         ; eax = *s1 zero-extended, so eax == 0 at the terminator
        test    al, al
        jz      .done                   ; not found: eax is already NULL
        cmp     al, dl
        je      .hit
        inc     ecx
        jmp     .scan
.hit:
        mov     eax, ecx                ; return the position of the match
.done:
        ret
These are using the 'cdecl' calling convention. For 'stdcall' change the last 'ret' to 'ret 8'.
Here is an example set of functions, the first with 20 args the second with 2:
// Returns the product of its 20 int arguments.
int a(int n1, int n2, int n3, int n4, int n5, int n6, int n7, int n8, int n9, int n10, int n11, int n12, int n13, int n14, int n15, int n16, int n17, int n18, int n19, int n20) {
return n1 * n2 * n3 * n4 * n5 * n6 * n7 * n8 * n9 * n10 * n11 * n12 * n13 * n14 * n15 * n16 * n17 * n18 * n19 * n20;
}
// Calls a() three times with the same 20-argument list built from n1 and n2
// (pattern n1, n2, n1, n2, n1 repeated four times) and returns the sum.
int b(int n1, int n2) {
return a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1)
+ a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1)
+ a(n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1, n1, n2, n1, n2, n1);
}
It gets compiled to this assembly:
; Unoptimized compiler output for a(): the six register arguments are spilled
; to slots below rbp, then all 20 values are multiplied into eax.
a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi ; spill n1
mov DWORD PTR [rbp-8], esi ; spill n2
mov DWORD PTR [rbp-12], edx ; spill n3
mov DWORD PTR [rbp-16], ecx ; spill n4
mov DWORD PTR [rbp-20], r8d ; spill n5
mov DWORD PTR [rbp-24], r9d ; spill n6
mov eax, DWORD PTR [rbp-4] ; eax = n1
imul eax, DWORD PTR [rbp-8]
imul eax, DWORD PTR [rbp-12]
imul eax, DWORD PTR [rbp-16]
imul eax, DWORD PTR [rbp-20]
imul eax, DWORD PTR [rbp-24]
imul eax, DWORD PTR [rbp+16] ; first stack-passed argument (n7); the rest follow in 8-byte steps
imul eax, DWORD PTR [rbp+24]
imul eax, DWORD PTR [rbp+32]
imul eax, DWORD PTR [rbp+40]
imul eax, DWORD PTR [rbp+48]
imul eax, DWORD PTR [rbp+56]
imul eax, DWORD PTR [rbp+64]
imul eax, DWORD PTR [rbp+72]
imul eax, DWORD PTR [rbp+80]
imul eax, DWORD PTR [rbp+88]
imul eax, DWORD PTR [rbp+96]
imul eax, DWORD PTR [rbp+104]
imul eax, DWORD PTR [rbp+112]
imul eax, DWORD PTR [rbp+120] ; n20
pop rbp
ret
; Unoptimized compiler output for b(): n1 is kept at [rbp-12] and n2 at
; [rbp-16]. Each of the three calls to a() loads six arguments into
; registers, pushes the other fourteen (last argument first) and removes
; them afterwards with "add rsp, 112". ebx accumulates the sum.
b(int, int):
push rbp
mov rbp, rsp
push rbx ; ebx is used as the running sum, so the caller's rbx is saved
sub rsp, 8
mov DWORD PTR [rbp-12], edi ; spill n1
mov DWORD PTR [rbp-16], esi ; spill n2
; ---- first call ----
mov r9d, DWORD PTR [rbp-12]
mov r8d, DWORD PTR [rbp-12]
mov ecx, DWORD PTR [rbp-16]
mov edx, DWORD PTR [rbp-12]
mov esi, DWORD PTR [rbp-16]
mov eax, DWORD PTR [rbp-12] ; first argument parked in eax: edi is the scratch register for the pushes
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, eax ; first argument finally placed in edi
call a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
add rsp, 112 ; drop the 14 pushed slots (14 * 8)
mov ebx, eax ; sum = first result
; ---- second call ----
mov r9d, DWORD PTR [rbp-12]
mov r8d, DWORD PTR [rbp-12]
mov ecx, DWORD PTR [rbp-16]
mov edx, DWORD PTR [rbp-12]
mov esi, DWORD PTR [rbp-16]
mov eax, DWORD PTR [rbp-12]
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, eax
call a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
add rsp, 112
add ebx, eax ; sum += second result
; ---- third call ----
mov r9d, DWORD PTR [rbp-12]
mov r8d, DWORD PTR [rbp-12]
mov ecx, DWORD PTR [rbp-16]
mov edx, DWORD PTR [rbp-12]
mov esi, DWORD PTR [rbp-16]
mov eax, DWORD PTR [rbp-12]
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, DWORD PTR [rbp-12]
push rdi
mov edi, DWORD PTR [rbp-16]
push rdi
mov edi, eax
call a(int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int)
add rsp, 112
add eax, ebx ; return value = third result + sum
mov rbx, QWORD PTR [rbp-8] ; restore the caller's rbx
leave
ret
I have a few questions about this. First, I noticed that it sort of seems to switch how it handles the args as the number increases:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], edi
...
imul eax, DWORD PTR [rbp-8]
...
Wondering what is going on there, why it is doing that. It seems to treat the first arg with push, then the next 8 or so with mov, then the remaining with imul only relative to the eax register. Wonder if there is a limit to how many args you can have.
The second thing I'm wondering is the following. Say instead of a(), the function b() called out to some "system function" or some other external library call. Wondering how does it know how to unpack the args. Well, I guess, nevermind, I assume the C compiler will compile into assembly/machine code all linked external libraries. So yeah nevermind there I guess.
The last thing is if the syscalls all have a set number of arguments, similar to how in x86 the max operands is 3. Or perhaps the syscalls can have any number of arguments. It seems they would want to limit it for performance, so they only use those earlier stage instructions like push and mov, instead of imul.
Thanks for the help, just looking for clarification on how arguments are handled when calling functions in assembly, especially when there are a large number of arguments.
You need to learn more about stack frame and application binary interface (ABI, or calling convention). ABI defines the way parameters are passed by the caller to the callee, which registers are volatile, as well as how the stack is cleaned up.
Many ABIs exist, because anyone can design their own ABI as long as the caller and the callee agree. However, only a few ABIs are widely used. On Windows most 32-bit programs use stdcall, cdecl, Microsoft's fastcall or Borland's fastcall, whereas 64-bit programs mostly use Microsoft x64 calling convention. On Unix 64-bit programs always use System V AMD64 ABI, which is also the one used by your compiler.
Let's look at your code, with comments:
push rbp ; save the old stack frame
mov rbp, rsp ; establish new stack frame
mov DWORD PTR [rbp-4], edi ; save the first six arguments
mov DWORD PTR [rbp-8], esi ; n2
mov DWORD PTR [rbp-12], edx ; n3
mov DWORD PTR [rbp-16], ecx ; n4
mov DWORD PTR [rbp-20], r8d ; n5
mov DWORD PTR [rbp-24], r9d ; n6
mov eax, DWORD PTR [rbp-4] ; load n1
imul eax, DWORD PTR [rbp-8] ; eax = eax * n2
imul eax, DWORD PTR [rbp-12] ; eax = eax * n3
imul eax, DWORD PTR [rbp-16] ; eax = eax * n4
imul eax, DWORD PTR [rbp-20] ; eax = eax * n5
imul eax, DWORD PTR [rbp-24] ; eax = eax * n6
imul eax, DWORD PTR [rbp+16] ; eax = eax * n7
imul eax, DWORD PTR [rbp+24] ; eax = eax * n8
imul eax, DWORD PTR [rbp+32] ; eax = eax * n9
imul eax, DWORD PTR [rbp+40] ; eax = eax * n10
imul eax, DWORD PTR [rbp+48] ; eax = eax * n11
imul eax, DWORD PTR [rbp+56] ; eax = eax * n12
imul eax, DWORD PTR [rbp+64] ; eax = eax * n13
imul eax, DWORD PTR [rbp+72] ; eax = eax * n14
imul eax, DWORD PTR [rbp+80] ; eax = eax * n15
imul eax, DWORD PTR [rbp+88] ; eax = eax * n16
imul eax, DWORD PTR [rbp+96] ; eax = eax * n17
imul eax, DWORD PTR [rbp+104] ; eax = eax * n18
imul eax, DWORD PTR [rbp+112] ; eax = eax * n19
imul eax, DWORD PTR [rbp+120] ; eax = eax * n20
pop rbp ; restore old stack frame
ret ; exit
Note: the first two lines have nothing to do with the arguments; they create a stack frame, so that you can easily access local variables and arguments. Without the stack frame you can still access them with [rsp+*], but the offsets need to be adjusted according to any PUSH and POP you have executed.
Following that are instructions to store the arguments to local variables. Registers are frequently changed and arguments passed in registers need to be stored, in case you need to use them later. In this case, however, is not necessary. So the optimized code can be
; Body of a() without the register spills.
; System V AMD64: n1..n6 arrive in edi, esi, edx, ecx, r8d, r9d;
; n7..n20 are on the stack in 8-byte slots starting at [rbp+16].
push rbp ; save the old stack frame
mov rbp, rsp ; establish new stack frame
mov eax, edi ; eax = n1
imul eax, esi ; eax = eax * n2
imul eax, edx ; eax = eax * n3
imul eax, ecx ; eax = eax * n4
imul eax, r8d ; eax = eax * n5
imul eax, r9d ; eax = eax * n6
imul eax, DWORD PTR [rbp+16] ; eax = eax * n7
imul eax, DWORD PTR [rbp+24] ; eax = eax * n8
imul eax, DWORD PTR [rbp+32] ; eax = eax * n9
imul eax, DWORD PTR [rbp+40] ; eax = eax * n10
imul eax, DWORD PTR [rbp+48] ; eax = eax * n11
imul eax, DWORD PTR [rbp+56] ; eax = eax * n12
imul eax, DWORD PTR [rbp+64] ; eax = eax * n13
imul eax, DWORD PTR [rbp+72] ; eax = eax * n14
imul eax, DWORD PTR [rbp+80] ; eax = eax * n15
imul eax, DWORD PTR [rbp+88] ; eax = eax * n16
imul eax, DWORD PTR [rbp+96] ; eax = eax * n17
imul eax, DWORD PTR [rbp+104] ; eax = eax * n18
imul eax, DWORD PTR [rbp+112] ; eax = eax * n19
imul eax, DWORD PTR [rbp+120] ; eax = eax * n20
pop rbp ; restore old stack frame
ret ; exit
From the above example, you should be able to guess that the first argument is passed in edi (or rdi, di, dil, depending on the size), the second one in esi, and then edx, ecx, r8d and r9d (integers only; floats are passed in vector registers). When you have more than 6 arguments, the others are pushed onto the stack and can be accessed using [rbp+16], [rbp+24], .... ([rbp] is the old rbp; [rbp+8] is the return address.)
For the caller
; In b(), [rbp-12] holds b's n1 and [rbp-16] holds b's n2.
mov r9d, DWORD PTR [rbp-12] ; r9d = n6
mov r8d, DWORD PTR [rbp-12] ; r8d = n5
mov ecx, DWORD PTR [rbp-16] ; ecx = n4
mov edx, DWORD PTR [rbp-12] ; edx = n3
mov esi, DWORD PTR [rbp-16] ; esi = n2
mov eax, DWORD PTR [rbp-12] ; eax = n1 ; will assign to edi
mov edi, DWORD PTR [rbp-12] ; push n20
push rdi
mov edi, DWORD PTR [rbp-16] ; push n19
push rdi
mov edi, DWORD PTR [rbp-12] ; push n18
push rdi
mov edi, DWORD PTR [rbp-16] ; push n17
push rdi
mov edi, DWORD PTR [rbp-12] ; push n16
push rdi
mov edi, DWORD PTR [rbp-12] ; push n15
push rdi
mov edi, DWORD PTR [rbp-16] ; push n14
push rdi
mov edi, DWORD PTR [rbp-12] ; push n13
push rdi
mov edi, DWORD PTR [rbp-16] ; push n12
push rdi
mov edi, DWORD PTR [rbp-12] ; push n11
push rdi
mov edi, DWORD PTR [rbp-12] ; push n10
push rdi
mov edi, DWORD PTR [rbp-16] ; push n9
push rdi
mov edi, DWORD PTR [rbp-12] ; push n8
push rdi
mov edi, DWORD PTR [rbp-16] ; push n7
push rdi
mov edi, eax ; edi = n1
call a() ; call the function
add rsp, 112 ; clean up the stack, 14 * 8 = 112 bytes
mov ebx, eax ; result is in eax
a more straightforward version is
; Call a(n1, n2, n1, n2, n1,  n1, n2, n1, n2, n1,  n1, n2, n1, n2, n1,  n1, n2, n1, n2, n1)
; from b(), where [rbp-12] holds b's n1 and [rbp-16] holds b's n2.
mov r9d, DWORD PTR [rbp-12] ; r9d = n6
mov r8d, DWORD PTR [rbp-12] ; r8d = n5
mov ecx, DWORD PTR [rbp-16] ; ecx = n4
mov edx, DWORD PTR [rbp-12] ; edx = n3
mov esi, DWORD PTR [rbp-16] ; esi = n2
mov edi, DWORD PTR [rbp-12] ; edi = n1
; Stack arguments go in reverse order, one 8-byte slot each. A push from
; memory needs an explicit operand size; the callee reads only the low dword
; of every slot, so the upper 4 bytes picked up by the qword load do not matter.
push QWORD PTR [rbp-12] ; push n20
push QWORD PTR [rbp-16] ; push n19
push QWORD PTR [rbp-12] ; push n18
push QWORD PTR [rbp-16] ; push n17
push QWORD PTR [rbp-12] ; push n16
push QWORD PTR [rbp-12] ; push n15
push QWORD PTR [rbp-16] ; push n14
push QWORD PTR [rbp-12] ; push n13
push QWORD PTR [rbp-16] ; push n12
push QWORD PTR [rbp-12] ; push n11
push QWORD PTR [rbp-12] ; push n10
push QWORD PTR [rbp-16] ; push n9
push QWORD PTR [rbp-12] ; push n8
push QWORD PTR [rbp-16] ; push n7
call a() ; call the function
add rsp, 112 ; clean up the stack, 14 * 8 = 112 bytes
mov ebx, eax ; result is in eax
Note that the arguments are pushed in reversed order.
Since you can push any number to the stack (before it overflows) before you call the function, there is no limitation on the number of arguments.
I'm trying to convert a 32-bit float to a 64-bit double in asm on the x86 architecture. The conversion is done by a function written in asm, which I then want to call from C. I have no idea what I'm doing wrong, but the memory pointed to by dst seems to stay untouched, and after the printf the program crashes. I want to do it without any floating-point instructions. Here's the code:
.686
.model flat
public _conv
.data
mantissa_mask dd 00000000011111111111111111111111b
exponent_mask dd 01111111100000000000000000000000b
.code
; void conv(float *src, double *dst) -- asker's first attempt (does not work).
_conv PROC
pusha
mov ebp, esp
; NOTE(review): pusha has already pushed eight dwords (32 bytes), so at this
; point [ebp+8] and [ebp+12] are saved registers, not the two arguments.
mov esi, dword ptr [ebp+8] ; intended: src
mov edi, dword ptr [ebp+12]; intended: dst
mov dword ptr [edi], 0
mov dword ptr [edi+4], 0
mov eax, dword ptr [esi]
and eax, dword ptr mantissa_mask
mov dword ptr [edi], eax
xor edx, edx ; zero edx
mov ecx, 1
shl ecx, 29 ;ecx == 2^29
mul ecx ;edx:eax = eax * 2^29, i.e. the mantissa shifted left by 29 bits
mov dword ptr [edi], eax
mov dword ptr [edi+4], edx
mov eax, dword ptr [esi]
and eax, dword ptr exponent_mask
shr eax, 23 ;put exponent on lowest bits
sub eax, 127 ;float exponent is stored with a bias of 127
add eax, 1023 ;double exponent is stored with a bias of 1023
shl eax, 20 ;move the exponent to bit 20 and up
; NOTE(review): this ORs the exponent into the LOW dword of the result,
; while the mantissa's upper part and the sign bit go to [edi+4].
or dword ptr [edi], eax
;sign bit:
bt dword ptr [esi], 31
jc set_sign_bit
btr dword ptr [edi+4], 31
jmp endthis
set_sign_bit:
bts dword ptr [edi+4], 31
endthis:
popa
ret
_conv ENDP
END
And the C code:
void conv(float * src, double * dst);
int main()
{
float src = 4.5f;
double dst = 0.;
conv(&src, &dst);
printf("%f\n", dst);
return 0;
}
Your primary problem is accessing the arguments. Since you did pusha the arguments are not at [ebp+8] and [ebp+12], rather at [ebp+36] and [ebp+40]. A debugger would have shown you this right away. Even with those changes your code is still broken though.
Ok, finally it works. Very helpful was Jester's advice about args access. Stupid thing, but hard to notice. Here's final code:
.686
.model flat
public _conv

; IEEE 754 field layout. binary32: 1 sign / 8 exponent / 23 fraction bits.
; binary64: 1 sign / 11 exponent / 52 fraction bits (high dword holds the
; sign, the exponent and the top 20 fraction bits).
SIGN_MASK           EQU 80000000h   ; bit 31 of the float and of the double's high dword
FRACTION_MASK       EQU 007FFFFFh   ; 23-bit float fraction
FLOAT_EXP_SPECIAL   EQU 0FFh        ; all-ones float exponent: infinity / NaN
DOUBLE_EXP_SPECIAL  EQU 7FFh        ; all-ones double exponent
EXP_REBIAS          EQU 1023 - 127  ; difference between the two exponent biases
DENORMAL_EXP_BASE   EQU 1023 - 126  ; biased double exponent of 2^-126

.code
;-----------------------------------------------------------------------
; void conv(float *src, double *dst)
; ABI:   cdecl, 32-bit (arguments on the stack, caller cleans up)
; In:    [esp+4] = src, [esp+8] = dst (on entry)
; Out:   *dst = the value of *src widened to double, using integer
;        instructions only. Handles normal numbers, +/-0, denormals,
;        infinities and NaNs (the NaN payload is kept in the top bits).
; Clobb: eax, ecx, edx, flags (ebx is saved and restored)
;-----------------------------------------------------------------------
_conv PROC
        push    ebx                         ; callee-saved; arguments are now at [esp+8] / [esp+12]
        mov     eax, dword ptr [esp+8]      ; eax = src
        mov     eax, dword ptr [eax]        ; eax = raw float bits

        mov     ebx, eax
        and     ebx, SIGN_MASK              ; ebx = sign, already in its final position

        mov     ecx, eax
        and     eax, FRACTION_MASK          ; eax = 23-bit fraction
        shr     ecx, 23
        and     ecx, FLOAT_EXP_SPECIAL      ; ecx = biased float exponent, 0..255
        jz      zero_or_denormal
        cmp     ecx, FLOAT_EXP_SPECIAL
        je      inf_or_nan
        add     ecx, EXP_REBIAS             ; normal number: only the bias changes

pack:
        ; Here: eax = 23-bit fraction, ecx = biased double exponent, ebx = sign.
        mov     edx, eax
        shr     edx, 3                      ; top 20 fraction bits -> bits 19..0 of the high dword
        shl     eax, 29                     ; low 3 fraction bits  -> bits 31..29 of the low dword
        shl     ecx, 20                     ; exponent             -> bits 30..20 of the high dword
        or      edx, ecx
        or      edx, ebx
        mov     ecx, dword ptr [esp+12]     ; ecx = dst
        mov     dword ptr [ecx], eax
        mov     dword ptr [ecx+4], edx
        pop     ebx
        ret

inf_or_nan:
        mov     ecx, DOUBLE_EXP_SPECIAL     ; fraction stays as is: 0 = infinity, non-zero = NaN
        jmp     pack

zero_or_denormal:
        test    eax, eax
        jz      pack                        ; +/-0: exponent and fraction are both zero
        ; Denormal: value = fraction * 2^-149 with no implicit leading 1.
        ; Every float denormal is a normal double, so normalise it.
        bsr     edx, eax                    ; edx = index of the highest set bit (0..22)
        mov     ecx, 23
        sub     ecx, edx                    ; ecx = shift that moves the leading 1 to bit 23
        shl     eax, cl
        and     eax, FRACTION_MASK          ; drop the now-implicit leading 1
        neg     ecx
        add     ecx, DENORMAL_EXP_BASE      ; exponent = (1023 - 126) - shift
        jmp     pack
_conv ENDP
END