Calling NASM 32bit SSE function in C - c

I'm trying to interface C and NASM to pass an array of float, make a few steps (subtract 2.0 to each element of the array) and then return the sum of the array result to C. The incorrect result printed is x = -34975412102996426752.000000
I don't understand why..
Thanks for help
This is the code of file (test.c and myfunc.nasm):
test.c:
extern float myfunc(float* a);
int main(int argc, char** argv) {
float a[] = { 7.0, 24.0, 4.0, 6.1 };
float x = myfunc(a);
printf("x: %f\n",x);
return 0;
}
myfunc.nasm:
%include "sseutils.nasm"
section .data
align 16
y: dd 2.0 , 2.0, 2.0, 2.0
section .bss
alignb 16
A: resd 4
section .text
global myfunc
a equ 8
myfunc:
push ebp
mov ebp, esp
push ebx
movaps xmm0 , [ebp+a]
movaps xmm1,[y]
subps xmm0, xmm1
haddpd xmm0,xmm0
haddpd xmm0,xmm0
movaps [A], xmm0
fld dword[A]
pop ebx
mov esp, ebp
pop ebp
ret

Related

How can I store abc(x, y) which is a pointer function into an array as per the following code sample?

Function 1.
It is a pointer function.
char *abc(unsigned int a, unsigned int b)
{
//do something here ...
}
Function 2
Leveraged the function 1 into function 2.
I am trying to store the abc function into an array, however I am getting the error as : error: assignment to expression with array type.
fun2()
{
unsigned int x, y;
x= 5, y=6;
char *array1;
char array2;
for(i=0; i<3; i++)
{
array2[i] = abc(x, y);
}
}
You can't store the invocation of a function in C since it would defeat many existing popular optimizations involving register parameters passing - see because normally parameters are assigned their argument values immediately before the execution flow is transferred to the calling site - compilers may choose to use the registers to store those values but as it stands those registers are volatile and so if we were to delay the actual call they would be overwritten at said later time - possibly even by another call to some function which also have its arguments passed as registers. A solution - which I've personally implemented - is to have a function simulate the call for you by re-assigning to the proper registers and any further arguments - to the stack. In this case you store the argument values in a flat memory. But this must be done in assembly exclusively for this purpose and specific to your target architecture. On the other hand if your architecture is not using any such optimizations - it could be quite easier but still hand written assembly would be required.
In any case this is not a feature the standard (or even pre standard as far as I know) C has implemented anytime.
For example this is an implementation for x86-64 I've wrote some time ago (for MSVC masm assembler):
PUBLIC makeuniquecall
.data
makeuniquecall_jmp_table dq zero_zero, one_zero, two_zero, three_zero ; ordinary
makeuniquecall_jmp_table_one dq zero_one, one_one, two_one, three_one ; single precision
makeuniquecall_jmp_table_two dq zero_two, one_two, two_two, three_two ; double precision
.code
makeuniquecall PROC
;rcx - function pointer
;rdx - raw argument data
;r8 - a byte array specifying each register parameter if it's float and the last qword is the size of the rest
push r12
push r13
push r14
mov r12, rcx
mov r13, rdx
mov r14, r8
; first store the stack vars
mov rax, [r14 + 4] ; retrieve size of stack
sub rsp, rax
mov rdi, rsp
xor rdx, rdx
mov r8, 8
div r8
mov rcx, rax
mov rsi, r13
;add rsi, 32
rep movs qword ptr [rdi], qword ptr [rsi]
xor r10,r10
cycle:
mov rax, r14
add rax, r10
movzx rax, byte ptr [rax]
test rax, rax
jnz jmp_one
lea rax, makeuniquecall_jmp_table
jmp qword ptr[rax + r10 * 8]
jmp_one:
cmp rax, 1
jnz jmp_two
lea rax, makeuniquecall_jmp_table_one
jmp qword ptr[rax + r10 * 8]
jmp_two:
lea rax, makeuniquecall_jmp_table_two
jmp qword ptr[rax + r10 * 8]
zero_zero::
mov rcx, qword ptr[r13+r10*8]
jmp continue
one_zero::
mov rdx, qword ptr[r13+r10*8]
jmp continue
two_zero::
mov r8, qword ptr[r13+r10*8]
jmp continue
three_zero::
mov r9, qword ptr[r13+r10*8]
jmp continue
zero_one::
movss xmm0, dword ptr[r13+r10*8]
jmp continue
one_one::
movss xmm1, dword ptr[r13+r10*8]
jmp continue
two_one::
movss xmm2, dword ptr[r13+r10*8]
jmp continue
three_one::
movss xmm3, dword ptr[r13+r10*8]
jmp continue
zero_two::
movsd xmm0, qword ptr[r13+r10*8]
jmp continue
one_two::
movsd xmm1, qword ptr[r13+r10*8]
jmp continue
two_two::
movsd xmm2, qword ptr[r13+r10*8]
jmp continue
three_two::
movsd xmm3, qword ptr[r13+r10*8]
continue:
inc r10
cmp r10, 4
jb cycle
mov r14, [r14 + 4] ; retrieve size of stack
call r12
add rsp, r14
pop r14
pop r13
pop r12
ret
makeuniquecall ENDP
END
And your code will look something like this:
#include <stdio.h>
char* abc(unsigned int a, unsigned int b)
{
printf("a - %d, b - %d\n", a, b);
return "return abc str\n";
}
extern makeuniquecall();
main()
{
unsigned int x, y;
x = 5, y = 6;
#pragma pack(4)
struct {
struct { char maskargs[4]; unsigned long long szargs; } invok;
char *(*pfunc)();
unsigned long long args[2], shadow[2];
} array2[3];
#pragma pack(pop)
for (int i = 0; i < 3; i++)
{
memset(array2[i].invok.maskargs, 0, sizeof array2[i].invok.maskargs); // standard - no floats passed
array2[i].invok.szargs = 8 * 4; //consider shadow space
array2[i].pfunc = abc;
array2[i].args[0] = x;
array2[i].args[1] = y;
}
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", ((char *(*)())makeuniquecall)(array2[i].pfunc, array2[i].args, &array2[i].invok));
}
You'll probably not need that for your specific case you will get away with simply storing each argument and calling the function directly - i.e. (plus this method won't be x86-64 specific):
//now do the calls
for (int i = 0; i < 3; i++)
printf("%s\n", array2[i].pfunc(array2[i].args[0], array2[i].args[1]));
But mine implementation gives you the flexibility to store different amount of arguments for each call.
Note consider this guide for running above examples on msvc (since it requires to add asm file for the assembly code).
I love such noob questions since they make you think about why x-y feature doesn't actually exist in the language.

ASM, declaring float variables

I'm trying to store four 32-bit float variables into a 128-bit float variable, here is the asm code:
; function signature is:
; int findMin()
section .data
fa: dd 10.0
fb: dd 20.0
fc: dd 12.0
fd: dd 9.0
section .bss
fl_var: reso 1
section .text
global findMin
findMin:
; -------------------------------------------
; Entrace sequence
; -------------------------------------------
push ebp ; save base pointer
mov ebp, esp ; point to current stack frame
push ebx ; save general registers
push ecx
push edx
push esi
push edi
fld dword [fa]
fstp dword [fl_var]
fld dword [fb]
fstp dword [fl_var + 4]
fld dword [fc]
fstp dword [fl_var + 8]
fld dword [fd]
fstp dword [fl_var + 12]
mov eax, 0
; ------------------------------------------
; Exit sequence
; ------------------------------------------
pop edi
pop esi
pop edx
pop ecx
pop ebx
mov esp, ebp
pop ebp
ret
For each fn float variable, I use the st0 register to move it into the fl_var, usign a proper offset.
The code doesn't work, and the problem is in the declaration of the fn float variables. If I try to print them using the gdb I get this:
f/fh &fa //print a 32-bit float
0x804a020 <fa>: 0
The same happen for the others fn variables.
If write:
f/fw &fa //print a 64-bit float
0x804a020 <fa>: 10
It print the right value! The same happen for the others fn variables.
So, even if I declare fa to be a dd (4 bytes), it creates a dq (8 bytes). There is a way to declare a 32-bit float in asm?
I've found the type real4, but the compiler complains.
EDIT
Another question regarding the gdb and the way it prints variables. Look at this code:
;int findMin(float a, float b, float c, float d)
section .bss
int_var: reso 2
section .text
global findMin
findMin:
fld qword [ebp + 8]
fstp qword [int_var]
fld qword [ebp + 16]
fstp qword [int_var + 8]
fld qword [ebp + 24]
fstp qword [int_var + 16]
fld qword [ebp + 32]
fstp qword [int_var + 24]
I call it from a C piece of code, because of the C calling conventions, float are passed as 64-bit values. So I take each argument moving 8 bytes each time from the stack.
When I enter the gdb and print the value of int_var, I get the correct result using:
x/4fw &int_var
If I write
x/4fg &int_var
I don't get the numbers passed to the function, but inconsistent values.
Why it is printing correctly using the w letter (32-bit)?

C float in NASM x86 assembly

In my university project i have to use binary representation of float number in x86 assembly for arithmetic operations. Using FPU is forbidden so i try to read float number and return it as DWORD but whatever i try to do i get "-nan". Any advices?
Edit:
I use gcc and it's 32 bit code
Declaration in C (i can't change that)
extern "C" float func(float num);
*.asm file
section .text
global func
func:
;prolog
push ebp
mov ebp, esp
; zapamiętanie rejestrów zachowywanych
push ebx
push esi
push edi
mov eax, DWORD [ebp+8]
;mov eax, 0xffffffff i checked that but i still get the same result
; odtworzenie rejestrów, które były zapamiętane
pop edi
pop esi
pop ebx
;epilog
pop ebp
ret
Example result (for 256)
01000011100000000000000000000000
11111111110000000000000000000000
num1: 256.000000
num2: -nan
Edit:
C code without checking bits part
#include <stdio.h>
extern "C" float func(float num);
int main()
{
float num1;
float num2;
scanf("%f", &num1);
num2=func(num1);
printf("num1: %f\nnum2: %f\n", num1, num2);
return 0;
}
If you declare the return type func as float the result will be returned in the FPU (ST0). For returning a value in EAX you have to declare it as an integer type. For printf you have to fake a float. Example:
caller.c:
#include <stdio.h>
#include <stdint.h>
extern float asmfunc1(float);
extern uint32_t asmfunc2(float);
int main (void)
{
printf ("asmfunc1: %f\n", asmfunc1(456.78));
uint32_t ifl = asmfunc2(123.45);
float* pfl = (float*) &ifl; // Faking a float
printf ("asmfunc2: %f\n", *pfl);
return 0;
}
callee.asm:
section .text
global asmfunc1, asmfunc2
asmfunc1:
fld dword [esp+4]
ret
asmfunc2:
push ebp
mov ebp, esp
mov eax, [ebp+8]
leave
ret
Build & run:
nasm -felf callee.asm
gcc -m32 callee.o caller.c
./a.out
In the 32 bit Linux ABI, float values are actually returned as long double at the top of the 8087 FP stack. You cannot return a float without using the FPU.
What you are probably restricted from doing is FP operations for addition, subtraction... But you still need to load the result in the FP stack to return it. In 64 bits mode, you would return float values as double in the xmm0 register.
Try changing the code to this:
section .text
global func
func:
push ebp
mov ebp, esp
flds 8(%ebp)
pop ebp
ret

How can I move a int from xmm register to eax register?

I have to call an assembly method to calculate an addition of two integers, using sse, from C.
The simple C code is:
#include <stdio.h>
#include <stdlib.h>
extern int add(int a, int b);
int main(){
int a=5;
int b=2;
int n = add(a,b);
printf("%d\n",n);
return 0;
}
While the nasm code is:
section .bss
RISULTATO resd 1
section .text
global add
add:
;-----------------------------------------------
; start point of the function
;-----------------------------------------------
push ebp ; salva il Base Pointer
mov ebp, esp ; il Base Pointer punta al Record di Attivazione corrente
push ebx ; salva i registri da preservare
push esi
push edi
;-----------------------------------------------
; add implementation
;-----------------------------------------------
movss xmm0, [ebp+8]
movss xmm1, [ebp+12]
addss xmm1, xmm0
movss [RISULTATO], xmm1
mov eax, [RISULTATO]
;-----------------------------------------------
; exit point of the function
;-----------------------------------------------
pop edi ; ripristina i registri da preservare
pop esi
pop ebx
mov esp,ebp ; ripristina lo Stack Pointer
pop ebp ; ripristina il Base Pointer
ret ; ritorna alla funzione chiamante
But in this way I need to pass from RISULTATO stored in memory.
There is any way to avoid this and move directly the result stored in xmm0 to eax?
Naturally I need to move the result of the addition in eax to pass it to my C method, because the return value must to be in eax register.
Thanks for the help. :)

Nasm, not printing the correct value

I've made a nasm procedure that calculates the eucledian distance between two vectors of a certain size.
This nasm function is called from a C file which get the result of the function. I've tested, and it works, the value returned is correct, I can print it withoud any problem.
My issue is when I try to print the result inside the nasm file.
This is the nasm code:
extern printf
%macro print_dist2 1
section .data
.str db %1,0 ; %1 is macro call first actual parameter
section .text
push dword [dist]
push dword .str
push dword fmt2
call printf
add esp, 12 ; pop stack 3 * 4 bytes
%endmacro
section .data
dist: dd 258
fmt2: db "%s, dist: %g",10,0 ; format string for printf
section .bss
section .text
global distanza
x equ 8
y equ 12
dim equ 16
ris equ 20
distanza:
; ------------------------------------------------------------
; Function entrance sequence
; ------------------------------------------------------------
push ebp
mov ebp, esp
push ebx
push esi
push edi
; ------------------------------------------------------------
; read the parameters from the current stack frame
; ------------------------------------------------------------
mov eax, [ebp+x] ; address of x
mov ebx, [ebp+y] ; address of y
mov ecx, [ebp+dim] ; size of the vectors (it's the same)
.
.
. omitted calculations.
.
sqrtss xmm1, xmm1
movss [dist], xmm1 ; move the result to dist
fld dword [dist] ; mov in st0, that will be used by the C file
print_dist2 "distanza"
; ------------------------------------------------------------
; Function exit sequence
; ------------------------------------------------------------
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret
This is the C file:
float a[] = {6.1f, 9.5f, 12.6f};
float b[] = {25.1f, 34.1f, 9.6f};
dim = 3;
float dist = distanza(a, b, dim);
printf("d(x,y) = %f\n", dist);
This is the output:
distanza, dist: 5.46877e-315
d(x,y) = 31.227551
The C, code print the correct value, the nasm doesn't.
If I change the format string of the nasm code:
fmt2: db "%s, dist: %f",10,0
I've used %f instead of %g, the nasm output is this one:
distanza, dist: 0.000000
I have no idea what is wrong with my code, why it is not printing the correct value?
Your dist is a single 32-bit float, but printf needs a double 64-bit float. You have to transform it:
%macro print_dist2 1
section .data
.str db %1,0 ; %1 is macro call first actual parameter
section .text
sub esp, 8 ; Space for a 64-bit double floating point number
fld dword [dist] ; Load a 32-bit single floating point number
fstp qword [esp] ; Store it as 64-bit floating point number
push dword .str
push dword fmt2
call printf
add esp, 16 ; pop stack 4 * 4 bytes
%endmacro

Resources