I'm trying to convert the following optimised assembly code to C code but keeping as close to the assembly code as possible. I have no idea how to change the movsbl call to C code. My understanding is that it moves a byte with a zero extension into a 32bit register. I have included comments as to what I believe to be happening in the assembly code.
file "my_sieve.c"
.text
.p2align 4,,15
.globl my_sieve
.type my_sieve, #function
# void my_sieve(char *composite, int max) -- compiler output, 32-bit x86, AT&T syntax.
# Arguments are read from the stack: after the four pushes below plus the
# return address, composite is at 20(%esp) and max is at 24(%esp).
# Register roles: esi = i (current candidate), eax = i*i and then the multiple
# of i being marked, edi = max, ebp = constant 1 used to build bit masks,
# ebx/ecx/edx = scratch.
my_sieve:
pushl %ebp #save ebp (restored at .L1)
movl $4, %eax #eax = 4, i.e. i*i for the first candidate i = 2
pushl %edi #save edi
movl $1, %ebp #ebp = 1, shifted later to form a single-bit mask
pushl %esi #save esi
movl $2, %esi #esi = i = 2
pushl %ebx #save ebx
movl 24(%esp), %edi #edi = max
cmpl $3, %edi #compare max with 3
jg .L9 #max > 3 (signed): enter the sieve loop
jmp .L1 #otherwise there is nothing to mark
.p2align 4,,7
.p2align 3
.L6:
addl $1, %esi #i++
movl %esi, %eax #eax = i
imull %esi, %eax #eax = i*i
cmpl %edi, %eax #compare i*i with max
jg .L1 #i*i > max: finished
.L9:
movl 20(%esp), %ebx #ebx = composite
movl %esi, %edx #edx = i
sarl $3, %edx #edx = i >> 3, the byte index
movsbl (%ebx,%edx), %ecx #ecx = (int)(signed char)composite[i >> 3]; movsbl SIGN-extends the byte
movl %esi, %edx #edx = i
andl $7, %edx #edx = i & 7, the bit index inside that byte
btl %edx, %ecx #bit test: CF = bit (i & 7) of ecx; ecx is not modified
jc .L6 #bit set: i is already marked composite, try the next i
.p2align 4,,7
.p2align 3
.L7:
movl %eax, %ecx #ecx = j, the current multiple of i
movl %ebp, %ebx #ebx = 1
andl $7, %ecx #ecx = j & 7
movl %eax, %edx #edx = j
sall %cl, %ebx #ebx = 1 << (j & 7), the mask for bit j
addl %esi, %eax #eax = j + i, the next multiple
movl %ebx, %ecx #ecx = mask (cl is its low byte)
movl 20(%esp), %ebx #ebx = composite
sarl $3, %edx #edx = j >> 3, the byte index (j before the add above)
orb %cl, (%ebx,%edx) #composite[j >> 3] |= mask
cmpl %eax, %edi #compare max with the next multiple
jge .L7 #max >= next multiple: keep marking
jmp .L6 #otherwise advance to the next i
.p2align 4,,7
.p2align 3
.L1:
popl %ebx #restore saved registers in reverse push order
popl %esi
popl %edi
popl %ebp
ret #return
.size my_sieve, .-my_sieve
.ident "GCC: (Ubuntu 4.8.2-19ubuntu1) 4.8.2"
.section .note.GNU-stack,"",#progbits
And my incomplete attempt at creating C code.
I need help to fill in the blanks or someone to tell me what I'm attempting is completely wrong.
/*
 * Bit-packed sieve, written as a near line-by-line translation of the
 * assembly listing: one C variable per register, one label per asm label.
 *
 * composite: bit array; bit n lives in composite[n >> 3] at position n & 7.
 *            The caller must provide at least (max >> 3) + 1 bytes.
 * max:       largest number to consider; nothing is written when max <= 3.
 *
 * On return, bit n is set for every n in [4, max] that is a multiple j of
 * some unmarked i with i*i <= j. Existing bits are only OR-ed into.
 */
void my_sieve(char *composite, int max){
    int eax, ebx, ecx, edx, esi, edi, ebp;

    eax = 4;                    /* i*i for the first candidate i = 2 */
    ebp = 1;                    /* constant 1, shifted to build masks */
    esi = 2;                    /* i */
    edi = max;
    if (edi > 3)
        goto L9;
    goto L1;

L6:
    esi += 1;                   /* i++ */
    eax = esi;
    eax = eax * esi;            /* i*i */
    if (eax > edi)
        goto L1;
L9:
    edx = esi;
    edx = edx >> 3;             /* byte index of bit i */
    /* movsbl: load one byte and sign-extend it to 32 bits */
    ecx = (int)(signed char)composite[edx];
    edx = esi;
    edx = edx & 7;              /* bit index inside the byte */
    /* btl + jc: btl copies the selected bit into the carry flag */
    if (ecx & (1 << edx))
        goto L6;                /* already marked composite */
L7:
    ecx = eax;
    ebx = ebp;
    ecx = ecx & 7;
    edx = eax;
    ebx = ebx << ecx;           /* sall %cl, %ebx: mask = 1 << (j & 7) */
    eax += esi;                 /* next multiple of i */
    ecx = ebx;
    edx = edx >> 3;             /* byte index of the multiple just masked */
    /* orb %cl, (%ebx,%edx): 8-bit OR into memory */
    composite[edx] = (char)(composite[edx] | ecx);
    if (edi >= eax)
        goto L7;
    goto L6;

L1:
    return;
}
movsbl sign-extends a byte into a 32-bit register (the zero-extending form is movzbl), so I think (int)((int8_t)value) would do it. Casting between unsigned and signed types of the same size has no effect on the bit pattern. Casting a smaller signed type to a larger one causes a sign extension.
Related
I am looking for a fast modulo 10 algorithm because I need to speed up my program which does many modulo operations in cycles.
I have checked out this page which compares some alternatives.
As far as I understand it correctly, T3 was the fastest of all.
My question is, how would x % y look like using T3 technique?
I copied T3 technique here for simplicity in case the link gets down.
for (int x = 0; x < max; x++)
{
if (y > (threshold - 1))
{
y = 0; //reset
total += x;
}
y += 1;
}
Regarding the comments: if this is not really faster than a regular mod, I am looking for a modulo that is at least 2 times faster than using %.
I have seen many examples with use power of two, but since 10 is not, how can I get it to work?
Edit:
For my program, let's say I have 2 for cycles where n=1 000 000 and m=1000.
Looks like this:
for (i = 1; i <= n; i++) {
D[(i%10)*m] = i;
for (j = 1; j <= m; j++) {
...
}
}
Here's the fastest modulo-10 function you can write:
/* Remainder of x divided by ten, left to the compiler to optimise. */
unsigned mod10(unsigned x)
{
    const unsigned divisor = 10;

    return x % divisor;
}
And here's what it looks like once compiled:
movsxd rax, edi
imul rcx, rax, 1717986919
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
sub eax, ecx
ret
Note the lack of division/modulus instructions, the mysterious constants, the use of an instruction which was originally intended for complex array indexing, etc. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.
You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]
010341C8 cdq
010341C9 mov ecx,0Ah
010341CE idiv eax,ecx
010341D0 mov dword ptr [foo],edx
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h
00BD1013 imul esi
00BD1015 sar edx,2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh
00BD101D add ecx,edx
00BD101F lea eax,[ecx+ecx*4]
00BD1022 add eax,eax
00BD1024 sub esi,eax
The code isn’t a direct substitute for modulo, it substitutes modulo in that situation. You can write your own mod by analogy (for a, b > 0):
/* Remainder of a / b by repeated subtraction; intended for a, b > 0 only. */
int mod(int a, int b) {
    for (; b <= a; a -= b) {
        /* every pass removes one whole b from a */
    }
    return a;
}
… but whether that’s faster than % is highly questionable.
This will work for (multiword) values larger than the machineword (but assuming a binary computer ...):
#include <stdio.h>
/*
 * val modulo 10, computed one hex digit at a time.
 * Every power of 16 above the zeroth is congruent to 6 modulo 10, so each
 * higher nibble contributes 6 * nibble to the running remainder.
 */
unsigned long mod10(unsigned long val)
{
    unsigned digit = val & 0xf;

    for (;;) {
        /* bring the running remainder back into 0..9 */
        while (digit >= 10)
            digit -= 10;

        val >>= 4;
        if (!val)
            break;

        digit += 6 * (val & 0xf);
    }
    return digit;
}
/*
 * Command-line driver: parses argv[1] as an unsigned long and prints it
 * together with its remainder modulo 10.
 * Returns 0 on success, 1 when the argument is missing or not a number.
 */
int main (int argc, char **argv)
{
    unsigned long val;
    unsigned res;

    /* argv[1] only exists when an argument was actually passed */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <number>\n", argc > 0 ? argv[0] : "mod10");
        return 1;
    }
    /* sscanf reports how many conversions succeeded; val is unset otherwise */
    if (sscanf(argv[1], "%lu", &val) != 1) {
        fprintf(stderr, "not an unsigned integer: %s\n", argv[1]);
        return 1;
    }

    res = mod10(val);
    printf("%lu -->%u\n", val,res);
    return 0;
}
UPDATE:
With some extra effort, you could get the algorithm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
/*
 * val modulo 10 without multiply or divide instructions.
 * Each hex digit above the lowest contributes 6 * digit (16^k is congruent
 * to 6 modulo 10 for k >= 1); 6 * digit is formed as 4*digit + 2*digit.
 * The partial sum never exceeds 9 + 6*15 = 99, so it fits in an unsigned
 * char, and the recursive call reduces it back to 0..9.
 */
static unsigned long mod10_1(unsigned long val)
{
    unsigned char res=0; //just to show that we don't need a big accumulator
    res =val &0xf; // res can never be > 15
    if (res>=10) { res -= 10; }
    for(val >>= 4; val; val >>= 4){
        unsigned nibble = val & 0xf;
        // The two shifted terms must be ADDED: they can share bits
        // (nibble 3 gives 12 and 6), so OR-ing them does not yield 6*nibble.
        res += (nibble << 2) + (nibble << 1);
        res= mod10_1(res); // the recursive call
    }
    return res;
}
And the result for mod10_1 appears to be mul/div free and almost without branches:
# Compiler output for mod10_1 (x86-64, AT&T syntax). The argument is read
# from %rdi/%edi and the result is returned zero-extended from %al.
# %rdx holds the not-yet-consumed upper part of val, %eax the running remainder.
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax # eax = val & 0xf
leal -10(%rax), %edx # edx = eax - 10
cmpb $10, %al
cmovnb %edx, %eax # if (al >= 10) eax -= 10
movq %rdi, %rdx
shrq $4, %rdx # rdx = val >> 4
testq %rdx, %rdx
je .L12 # no higher nibbles: return immediately
pushq %r12 # r12, rbp and rbx are used as scratch below, so save them
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:
movl %edx, %ecx
andl $15, %ecx # ecx = next nibble
leal (%rcx,%rcx,2), %ecx # ecx = 3 * nibble
leal (%rax,%rcx,2), %eax # eax = res + 6 * nibble
# NOTE(review): from here to the shrq of %rdx there are no calls or
# branches; this looks like the recursive reduction inlined and unrolled
# into cmov/setcc sequences -- not traced instruction by instruction.
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx # drop the nibble just consumed
testq %rdx, %rdx
jne .L4 # loop while any nibbles remain
popq %rbx # restore the saved registers in reverse order
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax # result = (unsigned char) res, zero-extended
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:
movzbl %al, %eax # single-nibble case: result is the reduced low nibble
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function
I'm trying to translate the following program to x86 assembly ( AT&T ).
#include <stdio.h>
/* Prints the decimal digits of 123 in reverse order (321). */
int main()
{
    int reverse = 0;
    int n;

    /* peel the lowest digit off n and append it to reverse */
    for (n = 123; n != 0; n /= 10)
        reverse = reverse * 10 + n % 10;

    printf("%d\n", reverse);
    return 0;
}
It's supposed to print 321.
However, with the code below, I'm getting a 0 instead.
Can anybody give me a clue about what I'm doing wrong here?
( I pasted just the relevant section below. I'm sure that initialization and printing are working fine. You can see the whole thing here)
# Digit-reversal loop. Register roles: esi = n, edi = reverse, ebx = 10.
# idivl divides edx:eax by its operand; quotient -> eax, remainder -> edx.
movl $123, %esi # int n
movl $0, %edi # int reverse
movl $10, %ebx # divisor
L1: # while n != 0
cmpl $0, %esi
je L2
# reverse = reverse * 10
imul $10, %edi
# reverse = reverse + n % 10
movl $0, %edx # high half of the dividend = 0 (only valid for non-negative values)
movl %edi, %eax # BUG: loads reverse, not n, so the remainder is reverse % 10; should be %esi
idivl %ebx
addl %edx, %edi # reverse += remainder
# n = n / 10
movl %esi, %eax
movl $0, %edx
idivl %ebx
movl %eax, %esi # n = quotient
jmp L1
L2: # end while
movl %edi, %eax # eax = reverse
Maybe I'm not yet perfectly understanding what the idivl command is supposed to do. I understand that it divides %edx:%eax by %ebx and stores the quotient in %eax and the remainder in %edx.
# reverse = reverse + n % 10
movl $0, %edx
movl %edi, %eax ; <--- here
%edi is not n, according to the comments above:
movl $123, %esi # int n
So, it should be using %esi, i.e. movl %esi, %eax.
sometimes it is good to see what the compiler generates
/* Returns x with its decimal digits in reverse order; the sign follows x. */
int reverse(int x)
{
    int r;

    /* move the lowest digit of x onto the end of r until x is used up */
    for (r = 0; x != 0; x /= 10)
        r = r * 10 + x % 10;

    return r;
}
and shortest version:
; int reverse(int x) -- size-optimised compiler output, Intel syntax.
; x is read from edi and the result r is built in eax.
reverse:
xor eax, eax ; r = 0
mov esi, 10 ; divisor
.L2:
test edi, edi
je .L5 ; while (x != 0)
imul ecx, eax, 10 ; ecx = r * 10
mov eax, edi ; eax = x
cdq ; sign-extend eax into edx:eax for the signed divide
idiv esi ; eax = x / 10, edx = x % 10
mov edi, eax ; x = x / 10
lea eax, [rdx+rcx] ; r = x % 10 + r * 10
jmp .L2
.L5:
ret ; return r in eax
or the fastest:
; int reverse(int x) -- speed-optimised compiler output, Intel syntax.
; Division by 10 is replaced by a multiply with 1717986919 (0x66666667,
; i.e. 2^34 / 10 rounded up) followed by shifts. x is in edi, r in eax.
reverse:
xor eax, eax ; r = 0
test edi, edi
je .L4 ; x == 0: return 0
mov esi, 1717986919 ; reciprocal constant for /10
.L3:
lea ecx, [rax+rax*4] ; ecx = r * 5
mov eax, edi
imul esi ; edx:eax = x * 1717986919 (signed 64-bit product)
mov eax, edi
sar eax, 31 ; eax = -1 if x < 0, else 0
sar edx, 2 ; edx = product >> 34
sub edx, eax ; edx = x / 10, rounded toward zero
lea eax, [rdx+rdx*4] ; eax = 5 * (x / 10)
add eax, eax ; eax = 10 * (x / 10)
sub edi, eax ; edi = x % 10
test edx, edx ; flags from the quotient; lea and mov below leave them intact
lea eax, [rdi+rcx*2] ; r = x % 10 + r * 10
mov edi, edx ; x = x / 10
jne .L3 ; loop while the quotient is non-zero
rep ret
.L4:
rep ret
As you can see, compilers are as good as or better than 99.99% of coders.
Hi I have a function in C that returns the max of a set of numbers in an array. I need to convert it into assembly and make it callable from C. nums is the array in which all the numbers are stored. len is the length of the array that was passed. The other variables that I made are local variables
.global max
.text
.equ word_size, 4                       # size of one stack slot / pointer
.equ short_size, 2                      # sizeof(short): element size of nums

#-----------------------------------------------------------------------
# short max(short *nums, int len)
# ABI:   32-bit x86, C calling convention, AT&T syntax
# In:    nums at 8(%ebp), len at 12(%ebp); nums[0] is always read
# Out:   %eax = largest element, sign-extended from 16 bits
# Clobb: %ecx, %edx, flags (no callee-saved register is touched)
# Locals (cur_max, index) are kept in registers, so no stack space is used.
#-----------------------------------------------------------------------
max:
    #prologue
    push    %ebp
    movl    %esp, %ebp
    .equ nums, 2*word_size
    .equ len, 3*word_size

    #short cur_max = nums[0]
    movl    nums(%ebp), %edx            # edx = nums
    movswl  (%edx), %eax                # eax = cur_max (load the VALUE, not its address)

    #for(index = 1; index < len; index++)
    movl    $1, %ecx                    # ecx = index
for_loop_begin:
    cmpl    len(%ebp), %ecx             # flags = index - len
    jge     for_loop_end                # index >= len: done

    #if(nums[index] > cur_max)
    cmpw    %ax, (%edx,%ecx,short_size) # flags = nums[index] - cur_max, signed 16-bit
    jle     in_if_loop_end
in_if_loop:
    movswl  (%edx,%ecx,short_size), %eax # cur_max = nums[index]
in_if_loop_end:
    incl    %ecx
    jmp     for_loop_begin

for_loop_end:
    #epilogue: the frame must be torn down on the return path, exactly once
    movl    %ebp, %esp
    pop     %ebp
    ret
This is the C code
/* Largest value among nums[0..len-1]; nums[0] is read unconditionally. */
short max(short* nums, int len){
    short best = nums[0];
    int i = 1;

    while (i < len) {
        if (best < nums[i])
            best = nums[i];
        i++;
    }
    return best;
}
I'm having so much difficulty with a question I was assigned for homework. I have the following C code and the subsequent assembly:
/* Sums column j of A over its X(n) rows. X and Y are macros defined elsewhere. */
int foo(int n, int A[X(n)][Y(n)], int j){
    int total = 0;
    int row = 0;

    while (row < X(n)) {
        total += A[row][j];
        row++;
    }
    return total;
}
movl 8(%ebp), %eax
leal (%eax,%eax), %edx
leal (%edx,%eax), %ecx
movl %edx, %ebx
leal 1(%edx), %eax
movl $0, %edx
testl %eax, %eax
jle .L3
leal 0(,%ecx,4), %esi
movl 16(%ebp), %edx
movl 12(%ebp), %ecx
leal (%ecx,%edx,4), %eax
movl $0, %edx
movl $1, %ecx
addl $2, %ebx
.L4:
addl (%eax), %edx
addl $1, %ecx
addl %esi, %eax
cmpl %ebx, %ecx
jne .L4
.L3:
movl %edx, %eax
I need to find out the definitions of X and Y. I believe that n is initially stored in eax, and then 2n is stored in edx and 3n in ecx. So I think esi would equal 3n * 4. Also, because result is initially stored as movl $0, %edx and the following lines are incremented by one I'm thinking that X would be equal to #define X(n + 1). Also, I believe addl %esi, %eax would be Y. So since esi = %ecx * 4 does Y = 4n? However, this is where I begin to get severely confused. Thanks, all.
Cute exercise.
The declaration seems to define A as a C99 variable-length-array. Incidentally these have exceedingly poor compiler support and are optional in C11.
The inner Y(n) dimension may then be inferred from the array stride across loop iterations, where EAX is the pointer and ESI the pitch, and appears to be defined as n*3. As for X(n) we may infer it from the loop entry condition when i = 0, and it appears to expand as N*2+1.
#define X(n) ((n)*2+1)
#define Y(n) ((n)*3)
Annotated assembly:
; int foo(int n, int A[X(n)][Y(n)], int j) -- annotated listing, Intel syntax.
; Stack arguments: n at [ebp+8], A at [ebp+12], j at [ebp+16].
; NOTE(review): ebx and esi are modified but the (assumed) prologue/epilogue
; shown here does not save them -- the original listing omitted that part.
_foo:
;Prologue (assumed)
push ebp
mov ebp,esp
;Pre-scale N
mov eax,[ebp+8] ;EAX = N
lea edx,[eax+eax] ;EDX = N*2
lea ecx,[edx+eax] ;ECX = N*3
mov ebx,edx ;EBX = N*2
;Bail out early if X(n) <= 0
lea eax,[edx+1] ;EAX = N*2+1
mov edx,0 ;Result is 0 if the loop is skipped
test eax,eax ;(OF=0)
jle ##end ;Proceed if N*2+1 > 0
;Prepare loop counters
lea esi,[ecx*4] ;ESI = N*3*sizeof int, array stride
mov edx,[ebp+16] ;EDX = j
mov ecx,[ebp+12] ;ECX = A
lea eax,[ecx+edx*4] ;EAX = &A[0][j]
mov edx,0 ;EDX = 0, accumulator
mov ecx,1 ;ECX = 1, loop counter
add ebx,2 ;EBX = N*2+2
;Step through the loop
##loop:
add edx,[eax] ;EDX += A[i][j]
add ecx,1 ;Increment loop counter
add eax,esi ;Advance one row: EAX = &A[i+1][j]
cmp ecx,ebx
jne ##loop ;[1..N*2+2) <=> [0..N*2+1)
##end:
;Epilogue
mov eax,edx ;Return the sum
pop ebp
ret
I am trying to translate the following:
Action:
pushl %ebp
movl %esp, %eax
subl $0x32, %esp
movl $0x0, -0x8(%eax)
movl $0x0, -0x4(%eax)
movl -0x4(%eax), %eax
cmpl $0x32(%eax), %ebp
movl -0x4(%ebp), %eax
sall $0x2, %ebp
addl 0x8(%ebp), %ebp
movl (%ebp), %ebp
addl %ebp, -0x8(%eax)
addl $0x1, -0x4(%eax)
What is the best way to translate this code?
For the original question:
mov -0x4(%ebp), %eax # eax = z;
mov 0xc(%ebp), %edx # edx = y;
mov (%edx, %eax, 4), %eax # eax = *(edx + eax * 4), i.e. y[z]
add $0x3, %eax # eax += 3
movb $0x41, (%eax) # *eax = 'A'
Dry-running the statements gives:
y[z][3] = 'A';