I am trying to make assembly function that uses SSE and FPU for parallel calculations. Unfortunately I am receiving segmentation fault(core dumped) error(while debugging it doesn't show in assembly function). I also cannot step out from assembly function. Gdb shows:
Warning:
Cannot insert breakpoint 0.
Cannot access memory at address 0x2bffff
after ret statement.
I'm out of any ideas what may cause that type of behaviour. Maybe some of you see something I don't? Cheers.
Integrals.s
#float intgr_vert(float x)
#{
# return pow(x, 2) - 4*x + 6;
#}
s_precision = 0x007f
.bss
.lcomm holder, 4
.lcomm rect_size_vec, 16
.lcomm x_vec, 16
.lcomm result, 16
.data
four:
.float 4.0, 4.0, 4.0, 4.0
six:
.float 6.0, 6.0, 6.0, 6.0
.globl four_intgr_strips
.type four_intgr_strips, #function
four_intgr_strips:
pushl %eax
pushl %ecx
pushl %edx
pushl %ebp
movl %esp, %ebp
subl $2, %esp
movl $0, %edi
movl 20(%ebp), %eax #x position
movl 24(%ebp), %ebx #rectangle size
movw $s_precision, -2(%ebp)
finit
fldcw -2(%ebp)
pool:
movl %eax, x_vec(, %edi, 4)
movl %ebx, rect_size_vec(, %edi, 4)
movl %eax, holder
flds holder
movl %ebx, holder
flds holder #adding size of rectangle to calculate different x
fadd %st(1), %st(0)
fstps holder
movl holder, %eax
inc %edi
cmp $4, %edi
je pool_dne
jmp pool
pool_dne:
ret ###########################can't go further
.type sumAreas, #function
sumAreas:
movl $0, %edi
flds result(, %edi, 4)
inc %edi
loop:
flds result(, %edi, 4)
fadd %st(1), %st(0)
inc %edi
cmp $4, %edi
je end_loop
jmp loop
end_loop:
ret
.type calcAreas, #function
calcAreas:
movaps rect_size_vec, %xmm1
mulps %xmm1, %xmm0
movaps %xmm0, result
ret
.type calcVertical, #function
calcVertical:
movaps x_vec, %xmm0
mulps %xmm0, %xmm0
movaps x_vec, %xmm1
movups four, %xmm2
mulps %xmm1, %xmm2
subps %xmm2, %xmm0
movups six, %xmm1
addps %xmm1, %xmm0
ret
main.c
#include <stdio.h>
#include <math.h>
// x^2 - 4x + 6 integral
float four_intgr_strips(float, float);
float calc_intgr_in_as(int a, int n, float rect_size)
{
float sum = 0;
float four_rect_area;
float last_rect_l_corner = a;
for(int i = 0; i != n/4; i++)
{
four_rect_area = four_intgr_strips(last_rect_l_corner, rect_size);
sum = sum + four_rect_area;
last_rect_l_corner = last_rect_l_corner + 4*rect_size;
}
return sum;
}
int main()
{
int a, b, n;
float rect_size;
float sum;
printf("\nType integral lower bound:");
scanf("%d", &a);
printf("\nType integral upper bound:");
scanf("%d", &b);
do
{
printf("\nType rectangles number(must be multiple of 4):");
scanf("%d", &n);
}
while(n % 4 != 0);
rect_size = (float)(b - a)/n;
sum = calc_intgr_in_as(a, n, rect_size);
printf("\nArea under function is: %f with SSE", sum);
}
You have forgotten to cleanup the stack.
In the prologue you have:
pushl %eax
pushl %ecx
pushl %edx
pushl %ebp
movl %esp, %ebp
You obviously need to undo that before you ret, such as:
movl %ebp, %esp
popl %ebp
popl %edx
popl %ecx
popl %eax
ret
PS: I have already told you that unaligning the stack is a bad idea, sooner or later that will bite you. Also, next time you ask a question, mention what input you used and what output you expect.
Related
I am looking for a fast modulo 10 algorithm because I need to speed up my program which does many modulo operations in cycles.
I have checked out this page which compares some alternatives.
As far as I understand it correctly, T3 was the fastest of all.
My question is, how would x % y look like using T3 technique?
I copied T3 technique here for simplicity in case the link gets down.
for (int x = 0; x < max; x++)
{
if (y > (threshold - 1))
{
y = 0; //reset
total += x;
}
y += 1;
}
Regarding to comments, if this is not really faster then regular mod, I am looking for at least 2 times faster modulo than using %.
I have seen many examples with use power of two, but since 10 is not, how can I get it to work?
Edit:
For my program, let's say I have 2 for cycles where n=1 000 000 and m=1000.
Looks like this:
for (i = 1; i <= n; i++) {
D[(i%10)*m] = i;
for (j = 1; j <= m; j++) {
...
}
}
Here's the fastest modulo-10 function you can write:
unsigned mod10(unsigned x)
{
return x % 10;
}
And here's what it looks like once compiled:
movsxd rax, edi
imul rcx, rax, 1717986919
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
sub eax, ecx
ret
Note the lack of division/modulus instructions, the mysterious constants, the use of an instruction which was originally intended for complex array indexing, etc. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.
You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]
010341C8 cdq
010341C9 mov ecx,0Ah
010341CE idiv eax,ecx
010341D0 mov dword ptr [foo],edx
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h
00BD1013 imul esi
00BD1015 sar edx,2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh
00BD101D add ecx,edx
00BD101F lea eax,[ecx+ecx*4]
00BD1022 add eax,eax
00BD1024 sub esi,eax
The code isn’t a direct substitute for modulo, it substitutes modulo in that situation. You can write your own mod by analogy (for a, b > 0):
int mod(int a, int b) {
while (a >= b) a -= b;
return a;
}
… but whether that’s faster than % is highly questionable.
This will work for (multiword) values larger than the machineword (but assuming a binary computer ...):
#include <stdio.h>
unsigned long mod10(unsigned long val)
{
unsigned res=0;
res =val &0xf;
while (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += 6 * (val&0xf);
while (res >= 10) { res -= 10; }
}
return res;
}
int main (int argc, char **argv)
{
unsigned long val;
unsigned res;
sscanf(argv[1], "%lu", &val);
res = mod10(val);
printf("%lu -->%u\n", val,res);
return 0;
}
UPDATE:
With some extra effort, you could get the algoritm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
static unsigned long mod10_1(unsigned long val)
{
unsigned char res=0; //just to show that we don't need a big accumulator
res =val &0xf; // res can never be > 15
if (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += (val&0xf)<<2 | (val&0xf) <<1;
res= mod10_1(res); // the recursive call
}
return res;
}
And the result for mod10_1 appears to be mul/div free and almost without branches:
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax
leal -10(%rax), %edx
cmpb $10, %al
cmovnb %edx, %eax
movq %rdi, %rdx
shrq $4, %rdx
testq %rdx, %rdx
je .L12
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:
movl %edx, %ecx
andl $15, %ecx
leal (%rcx,%rcx,2), %ecx
leal (%rax,%rcx,2), %eax
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx
testq %rdx, %rdx
jne .L4
popq %rbx
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:
movzbl %al, %eax
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function
I am writing a problem to calculate smallest common multiple in assembly. GDB shows sigsegv exception at the very first line pushl %ebp. Can you tell me how to fix this?
test.c
#include <stdio.h>
extern int nww(int a, int b); // Funkcja z asm
int main()
{
int a = 10, b = 3;
int nw= nww(a,b);
printf("NWW %d i %d = %d ",a,b,nw);
return 0;
}
nww.s
.text
.global nww
.type nww, #function
nww:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
movl 12(%ebp), %ebx
nwd:
movl $0, %edx # miejsce na reszte
divl %ebx # podziel a przez b, reszta w d
movl %ebx, %eax
movl %edx, %ebx # przygotowanie nastepnego dzielenia
cmp $0, %edx
jnz nww
movl %eax, %edx # NWD do dx
movl 8(%ebp), %eax # nowe A i B
movl 12(%ebp), %ebx
imull %ebx, %eax
divl %edx
exit:
movl %ebp, %esp
popl %ebp
ret
I am stuck with converting C code into assembly. Here is the code that I need to convert:
#include <stdio.h>
define N 50 int x[N], y[N], z[2 * N];
void convolve(int[], int[], int[], int);
int main(void)
{
int i, n;
printf("Enter vector size (<=%d): ", N);
scanf("%d", &n);
printf("Enter first vector (%d elements):\n", n);
for (i = 0; i < n; i++)
**scanf("%d", &x[i]);
**printf("Enter second vector (%d elements):\n", n);
for (i = 0; i < n; i++)
scanf("%d", &y[i]);
convolve(x, y, z, n);
printf("Convolution:\n");
for (i = 0; i < ((n + n) - 1); i++)
printf("%d ", z[i]);
printf("\n");
return 0;
}
void convolve(int x[], int y[], int z[], int n)
{
int i, j;
for (i = 0; i < ((n + n) - 1); i++)
z[i] = 0;
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
z[i + j] += x[i] * y[j];
return;
}
I am stuck at this line:
scanf("%d", &x[i]);
How do I insert into array?
Here is what I have so far:
.data
.align 4
state: .long 0
.bss
N = 50
int x[N], y[N], z[2*N]
.data
.equ N, 50
.comm i,4,4 #int b
.comm n,4,4 #int n
.comm j,4,4 #int j
.comm x,N*4,4 #int x[N] where N is 50
.comm y,N*4,4 #int x[N] where N is 50
.comm z,N*8,4 #int x[N] where N is 100
.section .rodata #to format strings
fmt0: .string "Enter vector size (<=%d): "
fmt1: .string "%d"
fmt2: .string "Enter first element (%d elements):\n"
fmt3: .string "Enter second element (%d elements):\n"
fmt4: .string "Convolution:\n"
fmt5: .string "\n"
fmt6: .string .%d .
.text
.globl main
main:
pushl %ebp #prolog
movl %esp, %ebp
pushl %esi #save callee-save registers %esi, %edi, and %ebx onto stack
pushl %edi # where %esi at -4(%ebp),%edi at -8(%ebp), and %ebx at -12(%ebp)
pushl %ebx
pushl %eax #for array where %eax at -16(%ebp)------------------------------
/* Allocate space for i and n on the stack */
subl $8, %esp
/* i is at address -20(%ebp) */
/* n is at address -24(%ebp) */
pushl $fmt0 #push fmt0
call printf #printf("Enter vector size (<=%d): ")
addl $4, %esp #deallocate parm to printf
leal -24(%ebp), %ebx #%ebx = address of n
pushl %ebx #push address of n
pushl $fmt1 #push fmt1 "%d"
call scanf #scanf ("%d", &n)
addl $8, %esp #dealoccate parms for scanf
pushl $fmt2 #push fmt2
call printf #printf("Enter first element (%d elements):\n")
addl $4, %esp #deallocate parm to printf
movl $0, -20(%ebp) #i=0
movl -20(%ebp), %edi #%edi=i
movl -24(%ebp), %esi #esi=n
cmpl %esi, %edi #compare i:n
jg for_done #jump to for_done if i>n
for_loop:
pushl %edi #push i
pushl %esi #push n
pushl %eax #push array
pushl $fmt1 #push fmt1 ("%d")
call scanf #scanf("%d", n)
addl $8, %esp #dealocate parms to scanf
movl (address of x,%edi,4), %eax------------------------------------------------------
incl %edi #%edi++ (i++)
movl %edi,-20(%ebp) #i=%edi
compl %esi, %edi #compare i:n
jle for_loop #jump to for_loop if i<n
for_done:
addl $8, %esp #deallocate local vars from stack
popl %ebx #restore %ebx
popl %edi #restore %edi
popl %esi #restore %esi
/*next loop for second vector*/
pushl %esi #save callee-save registers %esi, %edi, and %ebx onto stack
pushl %edi # where %esi at -4(%ebp),%edi at -8(%ebp), and %ebx at -12(%ebp)
pushl %ebx
pushl $fmt3 #push fmt3
call printf #printf("Enter second element (%d elements):\n")
addl $4, %esp #deallocate parm to printf
movl $0, -20(%ebp) #i=0
movl -20(%ebp), %edi #%edi=i
movl -24(%ebp), %esi #esi=n
cmpl %esi, %edi #compare i:n
jg for_done #jump to for_done if i>n
for_loop:
pushl %edi #push i
pushl %esi #push n
pushl %eax #push array
pushl $fmt1 #push fmt1 ("%d")
call scanf #scanf("%d", n)
addl $8, %esp #dealocate parms to scanf
movl (address of y,%edi,4), %eax------------------------------------------------------
incl %edi #%edi++ (i++)
movl %edi,-20(%ebp) #i=%edi
compl %esi, %edi #compare i:n
jle for_loop #jump to for_loop if i<n
for_done:
addl $8, %esp #deallocate local vars from stack
popl %ebx #restore %ebx
popl %edi #restore %edi
popl %esi #restore %esi
leave #epilog
ret
convolve:
pushl %ebp #prolog
movl %esp, %ebp
pushl %esi #save callee-save registers %esi, %edi, and %ebx onto stack
pushl %edi # where %esi at -4(%ebp),%edi at -8(%ebp), and %ebx at -12(%ebp)
pushl %ebx
/* Allocate space for x, y, z, n, i, and j on the stack */
subl $24, %esp
/* x is at address 4(%ebp) */
/* y is at address 8(%ebp) */
/* z is at address 12(%ebp) */
/* n is at address 16(%ebp) */
/* i is at address -16(%ebp) */
/* n is at address -20(%ebp) */
movl $0, -16(%ebp) #i=0
movl -16(%ebp), %edi #%edi=i
movl -20(%ebp), %esi #esi=n
addl %esi, %esi #2 times n
subl $1, %esi #2n - 1
cmpl %esi, %edi #compare i:n
jg for_done #jump to for_done if i>n
I can't help but notice you've got two different for_loop and for_done defined; you might want to differentiate them somehow.
as per some of the other suggestions, maybe something like this, for example, could work:
for_loop1:
pushl %edi #push i
pushl %esi #push n
leal x(,%edi,4), %eax
pushl %eax #push array
...
for_loop2:
pushl %edi #push i
pushl %esi #push n
leal y(,%edi,4), %eax
pushl %eax #push array
So I am writing some C with Assembly in AT&T. I've got small problem right now, because when I call first function that is f_float with float parameter, parameter is loaded from stack and returned value is proper. But with the second call f_double with double parameter, the value isn't loading from stack. I'm on linux Mint 17.1, gcc version 4.9.2(Ubuntu4.9.2-0ubuntu1~14.04). Any advices?
main.cpp
#include <stdio.h>
float f_float(float);
double f_double(double);
int main()
{
float a, f_result;
double b, d_result;
printf("\nInsert float number: ");
scanf("%f", &a);
printf("\nInsert double number: ");
scanf("%lf", &b);
f_result = f_float(a);
d_result = f_double(b);
printf("\nResult of float with f function: %f", f_result);
printf("\nResult of double with f function: %lf", d_result);
return 0;
}
functions.s
s_precision = 0x007f
d_precision = 0x027f
#(x^2)/(sqrt(x^2 +1) +1)
.globl f_float
.type f_float, #function
f_float:
pushl %ebp
movl %esp, %ebp
subl $2, %esp
finit
movl $s_precision, -2(%ebp)
fldcw -2(%ebp)
flds 8(%ebp)
fmul %st(0)
fld1
fadd %st(1), %st(0)
fsqrt
fld1
fsubr %st(1), %st(0)
movl %ebp, %esp
pop %ebp
ret
.globl f_double
.type f_double, #function
f_double:
pushl %ebp
movl %esp, %ebp
subl $2, %esp
finit
movl $d_precision, -2(%ebp)
fldcw -2(%ebp)
fldl 8(%ebp)
fmul %st(0)
fld1
fadd %st(1), %st(0)
fsqrt
fld1
fsubr %st(1), %st(0)
movl %ebp, %esp
pop %ebp
ret
Changing movl $*_precision to movw $*_precision resolved problem. That was mistake of course, but thought that it will overwrite the data left on stack later on. Anyway, problem soved. Thanks to everyone for help. – Robs
The C version functions:
float foo1 (float a, float b)
{
return sin(a) + b;
}
double sin(double x);
double cos(double x);
cos is for a later function.
The task is to translate foo1 to assembly, but as you can see a is a float and sin expects a double parameter. Have been translating C to assembly codes for weeks now, but honestly, have no idea of how to do in this point here.
gcc is your friend - compiling with gcc -S suggests that you might want to use cvtss2sd for float to double conversion and cvtsd2ss for converting the double result back to float:
_foo1:
pushl %ebp
movl %esp, %ebp
subl $24, %esp
movss 8(%ebp), %xmm0
cvtss2sd %xmm0, %xmm0
movsd %xmm0, (%esp)
calll _sin
fstpl -16(%ebp)
movss 12(%ebp), %xmm0
cvtss2sd %xmm0, %xmm0
addsd -16(%ebp), %xmm0
cvtsd2ss %xmm0, %xmm0
movss %xmm0, -4(%ebp)
flds -4(%ebp)
addl $24, %esp
popl %ebp
ret
EDIT: this appears to be some kind of homework assignment with arbitrary constraints. If you need to avoid SSE instructions then just add -mno-sse to the command line (gcc -mno-sse ...) and you get:
_foo1:
pushl %ebp
movl %esp, %ebp
subl $24, %esp
flds 8(%ebp)
fstpl (%esp)
calll _sin
fadds 12(%ebp)
fstps -4(%ebp)
flds -4(%ebp)
addl $24, %esp
popl %ebp
ret
EDIT 2: for more compact code you can omit the stack frame (gcc -fomit-frame-pointer ...) - note however that you will lose some functionality when it comes to debugging or profiling:
_foo1:
subl $12, %esp
flds 16(%esp)
fstpl (%esp)
calll _sin
fadds 20(%esp)
fstps 8(%esp)
flds 8(%esp)
addl $12, %esp
ret