Assembly to C matrix assistance - c

I'm really confused by my homework assignment. We are given C code and then assembly which are listed below. This is x86 assembly. Any assistance would be greatly appreciated. I've made an attempt to solve it based on what I'm understanding.
C code:
void transpose(Marray_t A) {
int i, j;
for (i = 0; i < M; i++)
for (j = 0; j < i; j++) {
int t = A[i][j];
A[i][j] = A[j][i];
A[j][i] = t;
}
}
Assembly code for ONLY inner loop:
1 .L3:
2 movl (%ebx), %eax //is this getting the mem location of %ebx and setting to %eax?
3 movl (%esi,%ecx,4), %edx //ecx * 4 + esi into edx
4 movl %eax, (%esi,%ecx,4) //
5 addl $1, %ecx //add 1 to ecx
6 movl %edx, (%ebx) //move edx to mem location of ebx???
7 addl $52, %ebx //I think this is M but I could be wrong
8 cmpl %edi, %ecx //compare edi & ecx
9 jl .L3
here is what I have to answer:
A. What is the value of M? ...I think this is 52...?
B. What registers hold program values i and j? ... I think edx and eax?
C. Write a C code version of transpose that makes use of the optimizations
that occur in this loop. Use the parameter M in your code rather than numeric
constants.
Attempt at (C):
void tranpose(Marray_t A) {
int i, j;
for(i = 0; i < M; i++) {
for(j = 0; j < i; j++) {
int *row = &A[i][0];
int *col = &A[0][j];
int value = (*row * 4) + *col;
}
}
}

1 .L3:
2 movl (%ebx), %eax // eax := read memory word at ebx
3 movl (%esi,%ecx,4), %edx // edx := read memory word at esi + 4*ecx
4 movl %eax, (%esi,%ecx,4) // store eax into that location
5 addl $1, %ecx // add 1 to ecx
6 movl %edx, (%ebx) // store edx into memory at ebx
7 addl $52, %ebx // add 52 to ebx
8 cmpl %edi, %ecx // compare edi & ecx
9 jl .L3
So in this code, %ebx is the address of A[j][i], %esi is the address of A[i], and %ecx is j. 52 is sizeof(A[j]), so M is probably 13 (as the array element size is 4, and 52 / 4 = 13).

Related

Why does C program run Slower when a loop contains a condition

I am rephrasing this question based on the comments received.
I have a loop that runs 30 Billion times and assigns values to a chunk of memory assigned using malloc();
When the loop contains a condition it runs much slower than when the condition is not present. Review the scenarios below:
Scenario A: Condition is present and program is slow (43 sec)
Scenario B: Condition is not present and program is much faster (4 sec)
// gcc -O3 -c block.c && gcc -o block block.o
#include <stdio.h>
#include <stdlib.h>
#define LEN 3000000000
int main (int argc, char** argv){
long i,j;
unsigned char *n = NULL;
unsigned char *m = NULL;
m = (unsigned char *) malloc (sizeof(char) * LEN);
n = m;
srand ((unsigned) time(NULL));
int t = (unsigned) time(NULL);
for (j = 0; j < 10; j++){
n = m;
for (i = 0; i < LEN; i++){
//////////// A: THIS IS SLOW
/*
if (i % 2){
*n = 1;
} else {
*n = 0;
}
*/
/////////// END OF A
/////////// B: THIS IS FAST
*n = 0;
i % 2;
*n = 1;
/////////// END OF B
n += 1;
}
}
printf("Done. %d sec \n", ((unsigned) time(NULL)) - t );
free(m);
return 0;
}
Regards,
KD
You can use gcc -S -O3 to have a look at the resulting assembler.
Here is an example on an Intel box:
Fast version:
movl %eax, %r12d
.p2align 4,,10
.p2align 3
.L2:
movl $3000000000, %edx
movl $1, %esi
movq %rbp, %rdi
call memset
subq $1, %rbx
jne .L2
Slow version:
movl $10, %edi
movl %eax, %ebp
movl $3000000000, %esi
.p2align 4,,10
.p2align 3
.L2:
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L5:
movq %rdx, %rcx
andl $1, %ecx
movb %cl, (%rbx,%rdx)
addq $1, %rdx
cmpq %rsi, %rdx
jne .L5
subq $1, %rdi
jne .L2
Conclusion: the compiler is smarter than you think. It is able to optimize the inner loop as a memset (which is faster because it uses SSE/AVX or REP instructions on Intel). However, this optimization cannot kick in if the condition is kept - because the result is different.

x86 assembly code confusion

We've just begun the topic on assembly and I've been stuck on this problem for the longest time. I have to convert assembly to C code given the following:
C Code:
int foo(int *a, int n, int val) {
int i;
for (i = _________; ____________________________ ; i =___________) {
;
}
return i;
}
Assembly:
// what I've gathered so far
foo()
:
foo:
pushl %ebp
movl %esp,%ebp
movl 8(%ebp),%ecx // ecx: a
movl 16(%ebp),%edx // edx: val
movl 12(%ebp),%eax // eax: n
decl %eax // n = n--
js .L3 // if n < 0 goto done
.L7: // loop
cmpl %edx,(%ecx,%eax,4) // I don't understand how you would compute the
// address for (%ecx,%eax,4) I know it would be %ecx + %eax*4 = %ecx + eax << 2
jne .L3 // if (%ecx, %eax, 4) != val goto done (?)
decl %eax // n = n--
jns .L7 // if (n >= 0) jump to loop
.L3: // done
movl %ebp,%esp
popl %ebp
ret
I don't know how to figure out what i is being initialized to and what the body of the loop is. I'm assuming i = n since n serves as the update. It seems as if there are two conditions one being n > 0 and the other being the cmpl line. Please correct me if my understanding of the code is incorrect, and any clues to this problem is much appreciated.
I could have done some off-by 1 errors, but basically it is this:
int foo(int *a, int n, int val) {
int i;
for (i = n - 1; i >= 0 && a[i] == val; i = i - 1) {
;
}
return i;
}
The i is the %eax register; it loops from n - 1 to 0. The cmpl indexed access (%ecx,%eax,4) is addressed in bytes - this is equivalent to a[i], as size of int on ia32 is 4 bytes. The 4 bytes addressed thus is compared against val.
The %eax is implicitly returned.
Notice also, that js means < 0, and jns >= 0.
Another way to write it:
i = n;
i --; // decl %eax
if (i < 0) {
goto L3; // js .L3
}
L7:
if (a[i] != val) // cmpl %edx,(%ecx,%eax,4)
goto L3; // jne .L3
i --; // decl %eax
if (i >= 0)
goto L7; // jns .L7
L3:
return i;
An alternative using the preprocessor:
/*
 * Tongue-in-cheek variant: define the blanks of the exercise template as
 * macros so that the template itself compiles unchanged.
 * The update expression must step i downwards (the assembly uses
 * decl %eax); stepping upwards would index past the end of a[].
 */
#define _________ n - 1
#define ____________________________ i >= 0 && a[i] == val
#define ___________ i - 1
/*
 * Scans a[] backwards, starting at index n - 1, for as long as the
 * elements equal val.
 * Returns the index at which the scan stopped: the highest index i < n
 * with a[i] != val, or -1 when all n elements equal val. When n <= 0 no
 * element is read and n - 1 is returned.
 */
int foo(int *a, int n, int val) {
int i;
for (i = _________; ____________________________ ; i =___________) {
;
}
return i;
}
Of course you can only use this for fun or to tease new programmers ;-)

understanding testl in assembly language

Trying to understand some assembly language, but I am not sure if I am understanding it correctly
movl 8(%ebp),%eax // assign %eax to a variable, say var
testl %eax,%eax // test if var is > 0 or not. if var is > 0, jump to .L3
jge .L3
addl $15,%eax // add 15 to var
.L3:
sarl $4,%eax // shift var 4 to the right , which is the same as multiplying var by 16
given by above understanding, I wrote the following code
int function(int x){
int var = x;
if(var>0) {
ret = ret * 16;
}
ret = ret + 15;
return ret;
}
however, my assembly code looks like the following
movl 8(%ebp), %ebp
movl %eax. %edx
sall $4, %edx
test1 %eax, %eax
cmovg %edx, %eax
addl $15, %eax
am I misunderstanding the original assembly code somewhere?
Edit: is there perhaps a loop involved?
Notice that the code continues with the shift even after the addition, and that jge also includes the equal case. Thus the code could look more like this:
int function(int x) {
int ret = x;
if (ret >= 0) goto skip_add;
ret = ret + 15;
skip_add:
ret = ret / 16;
return ret;
}
Or, to avoid the goto, reverse the condition:
int function(int x) {
int ret = x;
if(ret < 0) {
ret = ret + 15;
}
ret = ret / 16;
return ret;
}
PS: shifting right is division, shifting left would be multiplication.

Leal instruction in for loop

I'm reading a book Computer Systems: A Programmer's Perspective (2nd Edition)
and Practice Problem 3.23 confused me a little:
A function fun_b has the following overall structure:
int fun_b(unsigned x) {
int val = 0;
int i;
for ( ____;_____;_____) {
}
return val;
}
The gcc C compiler generates the following assembly code:
x at %ebp+8
1 movl 8(%ebp), %ebx
2 movl $0, %eax
3 movl $0, %ecx
.L13:
5 leal (%eax,%eax), %edx
6 movl %ebx, %eax
7 andl $1, %eax
8 orl %edx, %eax
9 shrl %ebx Shift right by 1
10 addl $1, %ecx
11 cmpl $32, %ecx
12 jne .L13
Reverse engineer the operation of this code and then do the following:
A. Use the assembly-code version to fill in the missing parts of the C code.
My solution.
int fun_b(unsigned x) {
int val = 0;
int i;
for ( i = 0 ;i < 32;i++) {
val += val; //because leal (%eax,%eax), edx --> %edx = %eax + %eax
val = val | x & 0x1;
x >>= 1;
}
return val;
}
Book's solution.
int fun_b(unsigned x) {
int val = 0;
int i;
for (i = 0; i < 32; i++) {
val = (val << 1) | (x & 0x1);
x >>= 1;
}
return val;
}
Please, explain to me why leal function has non typical behavior in this function.
And I dont understand how this assembly code is yielding this statement val = (val << 1) | (x & 0x1)
In your code:
val += val;
val = val | x & 0x1;
Here, val += val which is equivalent to (val*2) which is effectively equal to val left shifted by 1.
But I think your solution is correct only if the assembly code was something like:
x at %ebp+8
1 movl 8(%ebp), %ebx
2 movl $0, %eax
3 movl $0, %ecx
.L13:
5 addl %eax, %eax
6 movl %ebx, %edx
7 andl $1, %edx
8 orl %edx, %eax
9 shrl %ebx # shift right by 1
10 addl $1, %ecx
11 cmpl $32, %ecx
12 jne .L13
Because if val + val was a separate statement, compiler usually places it in eax register rather than in edx (i'm not sure this is the case always). So, for the code you have given, the possible solutions are:
val = (val << 1) | (x & 0x1);
or
val = (val + val) | (x & 0x1);
or
val = (val * 2) | (x & 0x1);
val << 1 means multiplying val by 2, which in binary is shifting to the left, i.e. adding a 0 at the right side:
val << 1 == val * 2 == val + val (whereas x >>= 1; shifts x right by one bit, i.e. divides the unsigned x by 2)

Efficiency using bitwise operators

The requirement is like following:
/* length must be >= 18 */
int calcActualLength(int length) {
int remainder = (length - 18) % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
using bit-wise operator, I could refactor the 1st line
int remainder = (length - 2) & 7;
Can it be further optimized?
((length+5)&~7)+2
int calcActualLength(int length) {
int remainder = (length - 18) % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
==>
int HELPER_calcActualLength(int length) {
int remainder = length % 8;
if (remainder == 0)
return length;
return length + 8 - remainder;
}
int calcActualLength(int length) {
return 18 + HELPER_calcActualLength(length - 18);
}
And HELPER_calcActualLength() is semantically equivalent to ROUNDUP_8() when the argument is >= 0.
A simpler ROUNDUP_8() can be:
#define ROUNDUP_8(x) (((x)+7)&~7)
int calcActualLength(int length) {
return 18 + ROUNDUP_8(length - 18);
}
==> 2 + ROUNDUP_8(length - 18 + 16);
==> 2 + ROUNDUP_8(length - 2);
==> 2 + (((length - 2)+7)&~7)
==> ((length+5)&~7)+2
Original code produces the following 64-bit assembly when compiling with gcc -O3:
movl %edi, %eax
leal -18(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $29, %edx
addl %edx, %ecx
andl $7, %ecx
subl %edx, %ecx
je .L2
addl $8, %eax
subl %ecx, %eax
.L2:
rep
As suggested in the comments to your question, changing the argument to unsigned int allows for greater optimisations and results in the following assembly:
leal -18(%rdi), %edx
movl %edi, %eax
andl $7, %edx
je .L3
leal 8(%rdi), %eax
subl %edx, %eax
.L3:
rep
Rounding up to a multiple of 8 can be performed by adding 7 and masking with ~7. It works like this: if the last three bits are not all zero, then adding 7 carries into the 4-th bit, otherwise no carry occurs. So your function could be simplified to:
return (((length - 18) + 7) & ~7) + 18;
or simpler:
return ((length - 11) & ~7) + 18;
GCC compiles the last line to simply:
leal -11(%rdi), %eax
andl $-8, %eax
addl $18, %eax
Note that the lea (Load Effective Address) instruction is often "abused" for its ability to compute simple linear combinations like reg1 + size*reg2 + offset.

Resources