understanding testl in assembly language - c

Trying to understand some assembly language, but I am not sure if I am understanding it correctly
movl 8(%ebp),%eax // assign %eax to a variable, say var
testl %eax,%eax // test if var is > 0 or not. if var is > 0, jump to .L3
jge .L3
addl $15,%eax // add 15 to var
.L3:
sarl $4,%eax // shift var 4 to the right , which is the same as multiplying var by 16
given by above understanding, I wrote the following code
int function(int x){
int var = x;
if(var>0) {
ret = ret * 16;
}
ret = ret + 15;
return ret;
}
however, my assembly code looks like the following
movl 8(%ebp), %ebp
movl %eax. %edx
sall $4, %edx
test1 %eax, %eax
cmovg %edx, %eax
addl $15, %eax
am I misunderstanding the original assembly code somewhere?
Edit: is there perhaps a loop involved?

Notice that the code continues with the shift even after the addition, and that jge also includes the equal case. Thus the code could look more like this:
int function(int x) {
int ret = x;
if (ret >= 0) goto skip_add;
ret = ret + 15;
skip_add:
ret = ret / 16;
return ret;
}
Or, to avoid the goto, reverse the condition:
int function(int x) {
int ret = x;
if(ret < 0) {
ret = ret + 15;
}
ret = ret / 16;
return ret;
}
PS: shifting right is division, shifting left would be multiplication.

Related

Why does C program run Slower when a loop contains a condition

I am rephrasing this question based on the comments received.
I have a loop that runs 30 Billion times and assigns values to a chunk of memory assigned using malloc();
When the loop contains a condition it runs much slower than when the condition is not present. Review the scenarios below:
Scenario A: Condition is present and program is slow (43 sec)
Scenario B: Condition is not present and program is much faster (4 sec)
// gcc -O3 -c block.c && gcc -o block block.o
#include <stdio.h>
#include <stdlib.h>
#define LEN 3000000000
int main (int argc, char** argv){
long i,j;
unsigned char *n = NULL;
unsigned char *m = NULL;
m = (unsigned char *) malloc (sizeof(char) * LEN);
n = m;
srand ((unsigned) time(NULL));
int t = (unsigned) time(NULL);
for (j = 0; j < 10; j++){
n = m;
for (i = 0; i < LEN; i++){
//////////// A: THIS IS SLOW
/*
if (i % 2){
*n = 1;
} else {
*n = 0;
}
*/
/////////// END OF A
/////////// B: THIS IS FAST
*n = 0;
i % 2;
*n = 1;
/////////// END OF B
n += 1;
}
}
printf("Done. %d sec \n", ((unsigned) time(NULL)) - t );
free(m);
return 0;
}
Regards,
KD
You can use gcc -S -O3 to have a look at the resulting assembler.
Here is an example on an Intel box:
Fast version:
movl %eax, %r12d
.p2align 4,,10
.p2align 3
.L2:
movl $3000000000, %edx
movl $1, %esi
movq %rbp, %rdi
call memset
subq $1, %rbx
jne .L2
Slow version:
movl $10, %edi
movl %eax, %ebp
movl $3000000000, %esi
.p2align 4,,10
.p2align 3
.L2:
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L5:
movq %rdx, %rcx
andl $1, %ecx
movb %cl, (%rbx,%rdx)
addq $1, %rdx
cmpq %rsi, %rdx
jne .L5
subq $1, %rdi
jne .L2
Conclusion: the compiler is smarter than you think. It is able to optimize the inner loop as a memset (which is faster because it uses SSE/AVX or REP instructions on Intel). However, this optimization cannot kick in if the condition is kept - because the result is different.

How to convert recursion to tail recursion in this example?

I have this recursive function to add the cubes of n even numbers and I want't to turn it to a tail recursion.
int sum_even_cubes_rec(int n) {
if (n < 2)
return 0;
if ((n % 2) == 0) {
return (n*n*n + sum_even_cubes_rec(n - 1));
} else {
return (0 + sum_even_cubes_rec(n - 1));
}
}
This is what I wrote but it is wrong and I don't know how to fix it.
Can you please help me.
int sum_even_cubes_rec2(int n, int acc) {
if ((n % 2) == 0) {
return sum_even_cubes_rec2 (n-1, acc + n*n*n);
} return acc;
}
int sum_even_cubes_helperFunktion(int n) {
return sum_even_cubes_rec2(n, 0);
}
Your approach is correct. You have already added acc argument, so that's what you need to return for the base case.
The rest of your code is almost right - you need to adjust what you add to acc for the next invocation:
int sum_even_cubes_rec2(int n, int acc) {
if (n < 2) {
return acc;
}
int nextAcc = (n % 2) == 0 ? acc + n*n*n : acc;
return sum_even_cubes_rec2 (n-1, nextAcc);
}
Simply it can be written as this
int sum_even_cubes_rec2(int n) {
static int ans = 0;
if(n<2){
int tmp =ans;
ans =0;
return tmp;
}
ans += ( (n%2==0)? n*n*n : 0 );
return sum_even_cubes_rec2(n-1);
}
int sum_even_cubes(int n) {
int ret =0;
if (n < 2) return 0;
ret = (n % 2) ? 0: n*n*n;
return ret + sum_even_cubes(n-1);
}
Gcc -O2 -S will compile this into (function argument is %edi; return value is in %eax; target for recursion-loop is .L4) :
sum_even_cubes:
.LFB0:
.cfi_startproc
xorl %eax, %eax
cmpl $1, %edi
jle .L5
.p2align 4,,10
.p2align 3
.L4:
xorl %edx, %edx
testb $1, %dil
jne .L3
movl %edi, %edx
imull %edi, %edx
imull %edi, %edx
.L3:
subl $1, %edi
addl %edx, %eax
cmpl $1, %edi
jne .L4
rep ret
.L5:
rep ret
.cfi_endproc
.LFE0:

Assembly Code to C

I was practicing some assembly code to C and need some help with two questions. Based on the GCC objdump it seems okay but I want to make sure I can do this WITHOUT a computer (still kind of new to assembly code)
Question 1 :
q1:
pushl %ebp
movl %esp, %ebp
subl $4, %esp
cmpl $0, 8(%ebp)\\ compare variable1 to zero
jle .L2 \\jump if less than or equal to zero
movl $1, -4(%ebp)\\ ?? variable2 = 1??
jmp .L4\\else
.L2:
movl $0, -4(%ebp)\\ variable2 = 0
.L4:
movl -4(%ebp), %eax\\ variable2 = variable1
leave
ret
what I got was
int main(int x, int z)
{
if (x < 0)
z = 0;
else
z = x;
}
But I was not sure what the purpose of movl $1, -4(%ebp) was.
Question 2 :
fn:
pushl %ebp
movl $1, %eax
movl %esp, %ebp
movl 8(%ebp), %edx
cmpl $1, %edx\\ compare variable1 to 1
jle .L4\\ less than or equal jump.
.L5:
imull %edx, %eax\\ multiply variable1 by variable 2
subl $1, %edx\\ variable1 -1
cmpl $1, %edx\\ compare variable1 with 1
jne .L5 Loop if not equal
.L4:
popl %ebp\\ return value
ret
How I interpreted the information
int main(int x)
{
int result;
if (x <= 1){
for (result=1; x != 1; x = x-1)
result *= x;}
else{return result;}
}
Not sure if my logic is correct on either of those.
Q1 you have one argument 8(%ebp) and one local variable at -4(%ebp). Return value will be in %eax. Knowing this, the function looks more like:
int foo(int arg)
{
int local;
if (arg <= 0) {
local = 0;
} else {
local = 1;
}
return local;
}
Q2 popl %ebp // return value that's not the return value, that's restoring the saved %ebp of the caller (that was pushed in the beginning). Also, the condition in the loop should use > not !=. You are missing an if (x > 1) conditional around the for loop. (Thanks to Mooing Duck for pointing this out.) Also, technically it's a do-while loop. Otherwise you got this function right.
int factorial(int x)
{
int result = 1;
if (x > 1) {
do {
result *= x;
x -= 1;
} while(x != 1);
}
return result;
}

Leal instruction in for loop

I'm reading a book Computer Systems: A Programmer's Perspective (2nd Edition)
and Practice Problem 3.23 are little confused me:
A function fun_b has the following overall structure:
int fun_b(unsigned x) {
int val = 0;
int i;
for ( ____;_____;_____) {
}
return val;
}
The gcc C compiler generates the following assembly code:
x at %ebp+8
1 movl 8(%ebp), %ebx
2 movl $0, %eax
3 movl $0, %ecx
.L13:
5 leal (%eax,%eax), %edx
6 movl %ebx, %eax
7 andl $1, %eax
8 orl %edx, %eax
9 shrl %ebx Shift right by 1
10 addl $1, %ecx
11 cmpl $32, %ecx
12 jne .L13
Reverse engineer the operation of this code and then do the following:
A. Use the assembly-code version to fill in the missing parts of the C code.
My solution.
int fun_b(unsigned x) {
int val = 0;
int i;
for ( i = 0 ;i < 32;i++) {
val += val; //because leal (%eax,%eax), edx --> %edx = %eax + %eax
val = val | x & 0x1;
x >>= 1;
}
return val;
}
Book's solution.
int fun_b(unsigned x) {
int val = 0;
int i;
for (i = 0; i < 32; i++) {
val = (val << 1) | (x & 0x1);
x >>= 1;
}
return val;
}
Please, explain to me why leal function has non typical behavior in this function.
And I dont understand how this assembly code is yielding this statement val = (val << 1) | (x & 0x1)
In your code:
val += val;
val = val | x & 0x1;
Here, val += val which is equivalent to (val*2) which is effectively equal to val left shifted by 1.
But I think your solution is correct only if the assembly code was something like:
x at %ebp+8
1 movl 8(%ebp), %ebx
2 movl $0, %eax
3 movl $0, %ecx
.L13:
5 addl %eax, %eax
6 movl %ebx, %edx
7 andl $1, %edx
8 orl %edx, %eax
9 shrl %ebx # shift right by 1
10 addl $1, %ecx
11 cmpl $32, %ecx
12 jne .L13
Because if val + val was a separate statement, compiler usually places it in eax register rather than in edx (i'm not sure this is the case always). So, for the code you have given, the possible solutions are:
val = (val << 1) | (x & 0x1);
or
val = (val + val) | (x & 0x1);
or
val = (val * 2) | (x & 0x1);
x >>= 1; means multiplying x by 2 which in binary is shifting to the left or adding 0 at the right side
x >>= 1; == x * 2; == x +=x;

Efficiency using bitwise operators

The requirement is like following:
/* length must be >= 18 */
int calcActualLength(int length) {
int remainder = (length - 18) % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
using bit-wise operator, I could refactor the 1st line
int remainder = (length - 2) & 7;
Can it be further optimized?
((length+5)&~7)+2
int calcActualLength(int length) {
int remainder = (length - 18) % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
==>
int HELPER_calcActualLength(int length) {
int remainder = length % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
int calcActualLength(int length) {
return 18 + HELPER_calcActualLength(length - 18);
}
And HELPER_calcActualLength() equals to ROUNDUP_8() in the semantics when the argument >= 0
And more simpler ROUNDUP_8() can be:
#define ROUNDUP_8(x) (((x)+7)&~7)
int calcActualLength(int length) {
return 18 + ROUNDUP_8(length - 18);
}
==> 2 + ROUNDUP_8(length - 18 + 16);
==> 2 + ROUNDUP_8(length - 2);
==> 2 + (((length - 2)+7)&~7)
==> ((length+5)&~7)+2
Original code produces the following 64-bit assembly when compiling with gcc -O3:
movl %edi, %eax
leal -18(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $29, %edx
addl %edx, %ecx
andl $7, %ecx
subl %edx, %ecx
je .L2
addl $8, %eax
subl %ecx, %eax
.L2:
rep
As suggested in the comments to your question, changing the argument to unsigned int allows for greater optimisations and results in the following assembly:
leal -18(%rdi), %edx
movl %edi, %eax
andl $7, %edx
je .L3
leal 8(%rdi), %eax
subl %edx, %eax
.L3:
rep
Rounding up to a multiple of 8 can be performed by adding 7 and masking with ~7. It works like this: if the last three bits are not all zero, then adding 7 carries into the 4-th bit, otherwise no carry occurs. So your function could be simplified to:
return (((length - 18) + 7) & ~7) + 18;
or simpler:
return ((length - 11) & ~7) + 18;
GCC compiles the last line to simply:
leal -11(%rdi), %eax
andl $-8, %eax
addl $18, %eax
Note that the lea (Load Effective Address) instruciton is often "abused" for its ability to compute simple linear combinations like reg1 + size*reg2 + offset

Resources