Why does C program run Slower when a loop contains a condition

Why does C program run Slower when a loop contains a condition - c

I am rephrasing this question based on the comments received.
I have a loop that runs 30 Billion times and assigns values to a chunk of memory assigned using malloc();
When the loop contains a condition it runs much slower than when the condition is not present. Review the scenarios below:
Scenario A: Condition is present and program is slow (43 sec)
Scenario B: Condition is not present and program is much faster (4 sec)
// gcc -O3 -c block.c && gcc -o block block.o
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define LEN 3000000000

/*
 * Benchmark: write to every byte of a LEN-byte heap buffer, ten times over,
 * and print the elapsed wall-clock time in whole seconds.
 *
 * The inner loop body is selected by hand: scenario B is active, scenario A
 * is kept in a comment so that the two can be swapped and timed.
 *
 * Returns 0 on success, EXIT_FAILURE when the buffer cannot be allocated.
 * NOTE(review): a 3 GB allocation presumably needs a 64-bit target -- confirm.
 */
int main (int argc, char** argv){
    (void) argc;   /* command-line arguments are not used */
    (void) argv;

    unsigned char *m = malloc (sizeof(char) * LEN);
    if (m == NULL){
        /* Without this check the first store below would go through NULL. */
        fprintf(stderr, "malloc of %lld bytes failed\n", (long long) LEN);
        return EXIT_FAILURE;
    }

    time_t start = time(NULL);
    for (int j = 0; j < 10; j++){
        unsigned char *n = m;   /* restart at the beginning of the buffer */
        /* long long: LEN does not fit in a 32-bit long */
        for (long long i = 0; i < LEN; i++){
            //////////// A: THIS IS SLOW
            /*
            if (i % 2){
                *n = 1;
            } else {
                *n = 0;
            }
            */
            /////////// END OF A
            /////////// B: THIS IS FAST
            *n = 0;
            i % 2;   /* deliberately has no effect; part of the scenario as posted */
            *n = 1;
            /////////// END OF B
            n += 1;
        }
    }
    /* difftime yields a double; the cast makes the argument match %d. */
    printf("Done. %d sec \n", (int) difftime(time(NULL), start));
    free(m);
    return 0;
}
Regards,
KD

You can use gcc -S -O3 to have a look at the resulting assembler.
Here is an example on an Intel box:
Fast version:
movl %eax, %r12d
.p2align 4,,10
.p2align 3
.L2:
movl $3000000000, %edx
movl $1, %esi
movq %rbp, %rdi
call memset
subq $1, %rbx
jne .L2
Slow version:
movl $10, %edi
movl %eax, %ebp
movl $3000000000, %esi
.p2align 4,,10
.p2align 3
.L2:
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L5:
movq %rdx, %rcx
andl $1, %ecx
movb %cl, (%rbx,%rdx)
addq $1, %rdx
cmpq %rsi, %rdx
jne .L5
subq $1, %rdi
jne .L2
Conclusion: the compiler is smarter than you think. It is able to optimize the inner loop as a memset (which is faster because it uses SSE/AVX or REP instructions on Intel). However, this optimization cannot kick in if the condition is kept - because the result is different.

Related

x86-64 Assembly Loop

This question was proposed to me by a friend and I have no idea how to solve it.
# loop: presumably a arrives in %edi and b in %esi, result in %eax
# (System V x86-64 convention) -- TODO confirm against the caller.
loop:
leal (%rdi, %rdi, 4), %eax      # eax = rdi + 4*rdi
leal (%rsi, %rax, 2), %eax      # eax = rsi + 2*rax
leal 0(, %rax, 4), %edx         # edx = 4*rax
cmpl %edx, %esi                 # compare esi against edx
jge .L1                         # esi >= edx (signed): skip the loop entirely
leal (%rdi, %rdi, 2), %edx      # edx = rdi + 2*rdi
.L3:
addl %edx, %eax                 # eax += edx
cmpl $-2, %esi                  # compare esi against -2
jl .L3                          # repeat while esi < -2; nothing in the loop writes %esi
.L1:
rep ret                         # return; value is whatever is in %eax
And is supposed to map to this loop in C,
int loop(int a, int b){
int x, y;
y = ____;
for (____; ____; ____){
____;
}
return ____;
}
My attempt at converting the assembly to C,
y = 5a;
y = b + 2y;
x = 4y;
if (x < b){
x = 3a;
do{
y += x;
} while (b <= -2);
}
return y;
I assumed %eax = y, since 'y' in the code to fill is the first variable being assigned.
'x' follows as %edx since it's another assignment, and so should be at least part of the "Initialisation" of the for loop.
However, this doesn't seem to fit into the blanks provided, so I am really stuck.

I think I've got a really close, if not perfect solution:
/* rdi = a, rsi = b */
/* rax = y, rdx = x */
/*
loop:
leal (%rdi, %rdi, 4), %eax
leal (%rsi, %rax, 2), %eax
leal 0(, %rax, 4), %edx
cmpl %edx, %esi
jge .L1
leal (%rdi, %rdi, 2), %edx
.L3:
addl %edx, %eax
cmpl $-2, %esi
jl .L3
.L1:
rep ret
*/
/*
 * C rendering of the `loop` assembly routine, shaped to fit the
 * for-loop template from the exercise.
 *
 * y starts as b + 10*a.  When 4*y is greater than b, 3*a is added to y,
 * and that addition repeats for as long as b < -2.  Nothing in the body
 * changes b, so for b < -2 the inner loop never terminates once entered.
 */
int loop(int a, int b){
    int x, y;

    y = b + (a * 5) * 2;
    for (x = 4 * y; b < x;){
        x = 3 * a;          /* loop-invariant step */
        do {
            y += x;
        } while (b < -2);
        break;              /* the outer `for` acts only as a guard */
    }
    return y;
}
Not sure if break; is an issue but I can't find a better way.

how to auto vectorization array comparison function

This is a follow on to this post. Disclaimer: I have done zero profiling and don't even have an application, this is purely for me to learn more about vectorization.
My code is below. I am compiling with gcc 4.9.4 on a machine with an i3 m370. The first loop vectorizes as I expect. However, the second loop, which checks each element of temp, is not vectorized AFAICT, given all the "andb" instructions. I expected it to be vectorized with something like _mm_test_all_ones. How can that loop also be vectorized? Second question: I really want this as part of a larger loop. If I uncomment what's commented out below, nothing gets vectorized. How can I also get that vectorized?
#define ARR_LENGTH 4096
#define block_size 4
typedef float afloat __attribute__ ((__aligned__(16)));
/*
 * Compare the first block_size elements of a and b.
 * Returns 1 when every pair compares equal with ==, otherwise 0.
 * The outer loop over ARR_LENGTH (and its early exit) is commented out,
 * so only a single block is examined.
 */
char all_equal_2(afloat *a, afloat *b){
/* i is only referenced by the commented-out outer loop */
unsigned int i, j;
char r = 1;
/* one 0/1 flag per compared pair */
unsigned int temp[block_size] __attribute__((aligned(16)));
//for (i=0; i<ARR_LENGTH; i+=block_size){
/* pass 1: element-wise comparison, results stored in temp */
for (j = 0; j < block_size; ++j) {
temp[j] = (*a) == (*b);
a++;
b++;
}
/* pass 2: AND the flags together; r stays 1 only if all flags are 1 */
for (j=0; j<block_size; j++){
r &= temp[j];
}
/*if (r == 0){
break;
}
}*/
return r;
}
And the key section of resulting assembly:
.cfi_startproc
movaps (%rdi), %xmm0
cmpeqps (%rsi), %xmm0
movdqa .LC0(%rip), %xmm1
pand %xmm0, %xmm1
movaps %xmm1, -24(%rsp)
movl -24(%rsp), %eax
andl $1, %eax
andb -20(%rsp), %al
andb -16(%rsp), %al
andb -12(%rsp), %al
ret
.cfi_endproc
Update:
This post is similar to my first question. In that question, the vector was a raw pointer so segfaults are possible, but here that isn't a concern. Therefore AFAIK reordering the comparison operations is safe here, but not there. The conclusion is probably the same though.

Autovectorization really likes reduction operations, so the trick was to turn this into a reduction.
#define ARR_LENGTH 4096
typedef float afloat __attribute__ ((__aligned__(16)));

/*
 * Return 1 when the first ARR_LENGTH elements of a and b compare equal
 * pairwise (using ==), 0 otherwise.
 *
 * The arrays are processed in blocks.  Inside a block the comparison
 * results are summed, which is a plain reduction, and the early-exit
 * test runs only once per block.  The block size starts at 4 and doubles
 * after every fully matching block; the last block is clamped so that
 * exactly ARR_LENGTH elements are read.
 */
int foo(afloat *a, afloat *b){
    unsigned int done = 0;        /* elements compared so far */
    unsigned int blocksize = 4;

    while (done < ARR_LENGTH){
        unsigned int remaining = ARR_LENGTH - done;
        unsigned int count = blocksize < remaining ? blocksize : remaining;
        unsigned int matches = 0;
        unsigned int j;

        for (j = 0; j < count; j++){
            matches += (*a) == (*b);
            a++;
            b++;
        }
        if (matches != count){
            return 0;             /* at least one pair in this block differs */
        }
        done += count;            /* track elements consumed, not the block size */
        blocksize *= 2;
    }
    return 1;
}
Compiles into a nice loop:
.L3:
movaps (%rdi,%rax), %xmm1
addl $1, %ecx
cmpeqps (%rsi,%rax), %xmm1
addq $16, %rax
cmpl %r8d, %ecx
psubd %xmm1, %xmm0
jb .L3

So your loop is quite small and it is recursive (it carries a dependency): the result of iteration N is used as an input in iteration N+1.
If you change your second loop to allow 2 operations per iteration:
char r2 = r;
/* Two independent accumulators, stepping by 2 over the WHOLE block.
   A bound of block_size/2 together with j+=2 would visit only temp[0]
   and temp[1] and leave the second half of the block unchecked.
   Assumes block_size is even. */
for (j=0; j<block_size; j+=2){
r &= temp[j];
r2 &=temp[j+1];
}
r &= r2;
you will see output is optimized
.cfi_def_cfa_register %rbp
vmovss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
vmovss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
vucomiss (%rsi), %xmm0
sete %al
vucomiss 4(%rsi), %xmm1
sete %cl
andb %al, %cl
movzbl %cl, %eax
popq %rbp
retq
.cfi_endproc
for the last point, with the code optimized and the outer loop enabled I see some optimizations. Did you change compilation options?

Assembly, negative values treatment on sum

The assembly function with commented c version:
/*
struct X
{
int c; // 4 bytes
struct X *next; // 4 bytes
};
int add2 (struct X *x)
{
if (x == NULL) return 0;
else return x->c + add2(x->next);
}
*/
/* add2: 32-bit routine taking one stack argument (read from 8(%ebp)) and
   leaving its result in %eax.  The C comment preceding this listing gives
   the intended behaviour. */
.text
.globl add2
add2:
/********************************** prologue *************************************/
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %esi
/********************************************************************************/
/* ebx = the argument; a zero argument returns 0 straight away */
movl 8(%ebp), %ebx
cmpl $0, %ebx
jne out
movl $0, %eax
jmp end
out:
/***************************** calculates in x->next *******************************/
pushl %ecx
pushl %edx
pushl %eax
/* esi = the 4 bytes at offset 4 of the node, pushed as the argument */
movl 4(%ebx), %esi
pushl %esi
call add2
addl $4, %esp
/* NOTE(review): this pop reloads the %eax value pushed before the call,
   so whatever the recursive call left in %eax is overwritten here. */
popl %eax
popl %edx
popl %ecx
/********************************************************************************/
cmpl $0, (%ebx) /* > negative values */
js neg /* treatment < */
addl (%ebx), %eax /* return x->c + add2(x->next); */
/* NOTE(review): there is no jump after the addl above, so the two
   instructions at neg: also execute on the non-negative path.
   negl writes to memory at (%ebx), i.e. it changes the node itself. */
neg:negl (%ebx) /* c = |c| */
subl (%ebx), %eax /* return x->(-)c + add2(x->next); */
end:
/****************************************end *************************************/
popl %esi
popl %ebx
movl %ebp, %esp
popl %ebp
ret
/*********************************************************************************/
The main c code:
#include <stdio.h>
#include <stdlib.h>
/* Node of a singly linked list of ints. */
struct X
{
    int c;              /* value stored in this node */
    struct X * next;    /* following node, or NULL for the last one */
};
typedef struct X Xlist;

/* Return an empty list, which is represented by NULL. */
Xlist * lst_create (void)
{
    return NULL;
}

/*
 * Prepend a new node holding c to list l and return the new head.
 * l may be NULL (empty list).  The node is heap-allocated; the caller
 * owns it and is responsible for freeing it.
 * If the allocation fails, an error is reported on stderr and the
 * program exits with EXIT_FAILURE instead of writing through NULL.
 */
Xlist * lst_insert (Xlist * l, int c)
{
    Xlist * node = malloc(sizeof *node);
    if (node == NULL) {
        perror("lst_insert: malloc");
        exit(EXIT_FAILURE);
    }
    node->c = c;
    node->next = l;
    return node;
}
/* Defined in the assembly file; intended to sum the c fields of a list. */
int add2 (struct X * x);

/* Build a one-element list holding -1 and print what add2 returns for it. */
int main (void)
{
    Xlist * list = lst_insert(lst_create(), -1);

    printf("%d\n", add2(list));
    return 0;
}
The intention is to print the sum of the elements of a linked list.
I'm getting memory garbage when using negative values. I believe the error is somehow here:
neg:negl (%ebx) /* c = |c| */
subl (%ebx), %eax /* return x->(-)c + add2(x->next); */
But why?
Already used the same algorithm in other add function and it was ok.

It seems to me that a big problem is that your recursive call to add2() ignores the return value:
pushl %eax
movl 4(%ebx), %esi
pushl %esi
call add2
addl $4, %esp
popl %eax ; <-- overwrites what the add2 call returned
Also, your C equivalent code doesn't really seem to be equivalent. The assembly version modifies the negative values in the list to be positive; that isn't reflected in your C code version.

Assembly Code to C

I was practicing some assembly code to C and need some help with two questions. Based on the GCC objdump it seems okay but I want to make sure I can do this WITHOUT a computer (still kind of new to assembly code)
Question 1 :
q1:
pushl %ebp
movl %esp, %ebp
subl $4, %esp
cmpl $0, 8(%ebp)\\ compare variable1 to zero
jle .L2 \\jump if less than or equal to zero
movl $1, -4(%ebp)\\ ?? variable2 = 1??
jmp .L4\\else
.L2:
movl $0, -4(%ebp)\\ variable2 = 0
.L4:
movl -4(%ebp), %eax\\ variable2 = variable1
leave
ret
what I got was
int main(int x, int z)
{
if (x < 0)
z = 0;
else
z = x;
}
But I was not sure what the purpose of movl $1, -4(%ebp) was.
Question 2 :
fn:
pushl %ebp
movl $1, %eax
movl %esp, %ebp
movl 8(%ebp), %edx
cmpl $1, %edx\\ compare variable1 to 1
jle .L4\\ less than or equal jump.
.L5:
imull %edx, %eax\\ multiply variable1 by variable 2
subl $1, %edx\\ variable1 -1
cmpl $1, %edx\\ compare variable1 with 1
jne .L5 Loop if not equal
.L4:
popl %ebp\\ return value
ret
How I interpreted the information
int main(int x)
{
int result;
if (x <= 1){
for (result=1; x != 1; x = x-1)
result *= x;}
else{return result;}
}
Not sure if my logic is correct on either of those.

Q1: you have one argument at 8(%ebp) and one local variable at -4(%ebp). The return value will be in %eax. Knowing this, the function looks more like:
/*
 * Return 1 when arg is greater than zero, otherwise 0.
 * `local` stands for the stack slot at -4(%ebp) in the listing.
 */
int foo(int arg)
{
    int local = 1;

    if (arg <= 0)
        local = 0;
    return local;
}
Q2: the annotation "return value" on popl %ebp is wrong. That's not the return value; that's restoring the saved %ebp of the caller (which was pushed at the beginning). Also, the condition in the loop should use > not !=. You are missing an if (x > 1) conditional around the for loop. (Thanks to Mooing Duck for pointing this out.) Also, technically it's a do-while loop. Otherwise you got this function right.
/*
 * Multiply together x, x-1, ..., 2 and return the product.
 * Returns 1 for any x <= 1.
 */
int factorial(int x)
{
    int result = 1;

    if (x <= 1)
        return result;      /* corresponds to the jle .L4 guard */

    do {
        result *= x;
        --x;
    } while (x != 1);       /* same test as the jne .L5 at the loop bottom */

    return result;
}

understanding testl in assembly language

Trying to understand some assembly language, but I am not sure if I am understanding it correctly
movl 8(%ebp),%eax // assign %eax to a variable, say var
testl %eax,%eax // test if var is > 0 or not. if var is > 0, jump to .L3
jge .L3
addl $15,%eax // add 15 to var
.L3:
sarl $4,%eax // shift var 4 to the right , which is the same as multiplying var by 16
given by above understanding, I wrote the following code
int function(int x){
int var = x;
if(var>0) {
ret = ret * 16;
}
ret = ret + 15;
return ret;
}
however, my assembly code looks like the following
movl 8(%ebp), %ebp
movl %eax. %edx
sall $4, %edx
test1 %eax, %eax
cmovg %edx, %eax
addl $15, %eax
am I misunderstanding the original assembly code somewhere?
Edit: is there perhaps a loop involved?

Notice that the code continues with the shift even after the addition, and that jge also includes the equal case. Thus the code could look more like this:
/*
 * Line-by-line rendering of the assembly, keeping the jump structure.
 *
 * Adding 15 to negative values before an arithmetic right shift by 4
 * gives x / 16 rounded toward zero.  The final step must therefore be
 * the shift itself: dividing by 16 after the bias would round toward
 * zero a second time (e.g. -16 would yield 0 instead of -1).
 *
 * Right-shifting a negative int is implementation-defined in C; GCC and
 * Clang shift arithmetically, which is what sarl does.
 */
int function(int x) {
    int ret = x;
    if (ret >= 0) goto skip_add;    /* jge .L3 */
    ret = ret + 15;                 /* addl $15, %eax */
skip_add:
    ret = ret >> 4;                 /* sarl $4, %eax */
    return ret;
}
Or, to avoid the goto, reverse the condition:
/*
 * Same computation as the assembly, without goto.
 *
 * Negative inputs are biased by 15 and the value is then shifted right
 * arithmetically by 4; together that equals x / 16 rounded toward zero.
 * The last step has to be the shift: dividing the biased value by 16
 * would round toward zero twice (e.g. -16 would yield 0 instead of -1).
 *
 * Right-shifting a negative int is implementation-defined in C; GCC and
 * Clang shift arithmetically, which is what sarl does.
 */
int function(int x) {
    int ret = x;
    if (ret < 0) {
        ret = ret + 15;     /* addl $15, %eax, skipped by jge when x >= 0 */
    }
    ret = ret >> 4;         /* sarl $4, %eax */
    return ret;
}
PS: shifting right is division, shifting left would be multiplication.