This question was proposed to me by a friend and I have no idea how to solve it.
loop:
leal (%rdi, %rdi, 4), %eax
leal (%rsi, %rax, 2), %eax
leal 0(, %rax, 4), %edx
cmpl %edx, %esi
jge .L1
leal (%rdi, %rdi, 2), %edx
.L3:
addl %edx, %eax
cmpl $-2, %esi
jl .L3
.L1:
rep ret
And is supposed to map to this loop in C,
int loop(int a, int b){
int x, y;
y = ____;
for (____; ____; ____){
____;
}
return ____;
}
My attempt at converting the assembly to C,
y = 5a;
y = b + 2y;
x = 4y;
if (x < b){
x = 3a;
do{
y += x;
} while (b <= -2);
}
return y;
I assumed %eax = y, since 'y' in the code to fill is the first variable being assigned.
'x' follows as %edx since it's another assignment, and so should be at least part of the "Initialisation" of the for loop.
However this doesn't seem to fix into the blanks provided, so I am really stuck.
I think I've got a really close, if not perfect solution:
/* rdi = a, rsi = b */
/* rax = y, rdx = x */
/*
loop:
leal (%rdi, %rdi, 4), %eax
leal (%rsi, %rax, 2), %eax
leal 0(, %rax, 4), %edx
cmpl %edx, %esi
jge .L1
leal (%rdi, %rdi, 2), %edx
.L3:
addl %edx, %eax
cmpl $-2, %esi
jl .L3
.L1:
rep ret
*/
int loop(int a, int b){
int x, y;
y = b + (a * 5) * 2;
for (x = y * 4; x > b;){
do y += (x = a * 3); while(b < -2);
break;
}
return y;
}
Not sure if break; is an issue but I can't find a better way.
Thanks for all the comments so far. I am sorry that I have used a bad example in my original question, that almost everyone would say: "Oh, you should use memcopy!" But that is not what my question is about.
My question is more generic about how manual loop unrolling should be done. Consider this example this time, by summing all elements in an array:
#include <stdlib.h>
double sum (size_t n, double *x) {
size_t nr = n & 1;
double *end = x + (n - nr);
double sum_x = 0.0;
for (; x < end; x++) sum_x += *x;
if (nr) sum_x += *x;
return sum_x;
}
The compiler generated assembly admits a similar behaviour (to what is shown by the array-copying example in my original question)
sum:
movq %rdi, %rcx
andl $1, %ecx
subq %rcx, %rdi
leaq (%rsi,%rdi,8), %rdx
cmpq %rdx, %rsi
jnb .L5
movq %rsi, %rax
pxor %xmm0, %xmm0
.L3:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
ja .L3
movq %rsi, %rax
notq %rax
addq %rax, %rdx
shrq $3, %rdx
leaq 8(%rsi,%rdx,8), %rsi
.L2:
testq %rcx, %rcx
je .L1
addsd (%rsi), %xmm0
.L1:
ret
.L5:
pxor %xmm0, %xmm0
jmp .L2
However, if I now schedule the "fractional" part ahead of the main loop (as I later dig out in an answer I posted), the compiler does much better job.
#include <stdlib.h>
double sum (size_t n, double *x) {
size_t nr = n & 1;
double *end = x + n;
double sum_x = 0.0;
if (nr) sum_x += *x;
for (x += nr; x < end; x++) sum_x += *x;
return sum_x;
}
sum:
leaq (%rsi,%rdi,8), %rdx
pxor %xmm0, %xmm0
andl $1, %edi
je .L2
addsd (%rsi), %xmm0
.L2:
leaq (%rsi,%rdi,8), %rax
cmpq %rax, %rdx
jbe .L1
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
ja .L4
.L1:
ret
I have only used a compiler flag -O2. So as Peter said, the compiler generated assembly should be close to C source code. Then the question is, why does a compiler do better in the latter case?
This is not really a performance-related question. It is just something I unconsciously found (and can't explain) when checking compiler's assembly output for C code from a C project I have been writing. Thanks again. Thank Peter for proposing a better title for the question.
Original question:
The following small C function copies a, a vector of n entries to b. A manual loop unrolling of depth 2 is applied.
#include <stddef.h>
void foo (ptrdiff_t n, double *a, double *b) {
ptrdiff_t i = 0;
ptrdiff_t nr = n & 1;
n -= nr; // `n` is an even integer
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
} // `i = n` when the loop ends
if (nr) b[i] = a[i];
}
It gives the x64 assembly under gcc -O2 (any gcc version 5.4+). However, I find the part of the output as commented weird. Why does the compiler ever generate them?
foo:
movq %rdi, %rcx
xorl %eax, %eax
andl $1, %ecx
subq %rcx, %rdi
testq %rdi, %rdi
jle .L11
.L12:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rdi // `i` in %rax, `n` in %rdi
jg .L12 // the loop ends, with `i = n`, BELOW IS WEIRD
subq $1, %rdi // n = n - 1;
shrq %rdi // n = n / 2;
leaq 2(%rdi,%rdi), %rax // i = 2 * n + 2; (this is just `i = n`, isn't it?)
.L11:
testq %rcx, %rcx
je .L10
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L10:
ret
A similar version using size_t instead of ptrdiff_t gives something similar:
#include <stdlib.h>
void bar (size_t n, double *a, double *b) {
size_t i = 0;
size_t nr = n & 1;
n -= nr; // `n` is an even integer
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
} // `i = n` when the loop ends
if (nr) b[i] = a[i];
}
bar:
movq %rdi, %rcx
andl $1, %ecx
subq %rcx, %rdi
je .L20
xorl %eax, %eax
.L21:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rdi // `i` in %rax, `n` in %rdi
ja .L21 // the loop ends, with `i = n`, BUT BELOW IS WEIRD
subq $1, %rdi // n = n - 1;
andq $-2, %rdi // n = n & (-2);
addq $2, %rdi // n = n + 2; (this is just `i = n`, isn't it?)
.L20:
testq %rcx, %rcx
je .L19
movsd (%rsi,%rdi,8), %xmm0
movsd %xmm0, (%rdx,%rdi,8)
.L19:
ret
And here is another equivalence,
#include <stdlib.h>
void baz (size_t n, double *a, double *b) {
size_t nr = n & 1;
n -= nr;
double *b_end = b + n;
while (b < b_end) {
b[0] = a[0];
b[1] = a[1];
a += 2;
b += 2;
} // `b = b_end` when the loop ends
if (nr) b[0] = a[0];
}
but the following assembly looks more odd (though produced under -O2). Now n, a and b are all copied, and when the loop ends, we take 5 lines of code just to end up with b_copy = 0?!
baz: // initially, `n` in %rdi, `a` in %rsi, `b` in %rdx
movq %rdi, %r8 // n_copy = n;
andl $1, %r8d // nr = n_copy & 1;
subq %r8, %rdi // n_copy -= nr;
leaq (%rdx,%rdi,8), %rdi // b_end = b + n;
cmpq %rdi, %rdx // if (b >= b_end) jump to .L31
jnb .L31
movq %rdx, %rax // b_copy = b;
movq %rsi, %rcx // a_copy = a;
.L32:
movsd (%rcx), %xmm0
addq $16, %rax
addq $16, %rcx
movsd %xmm0, -16(%rax)
movsd -8(%rcx), %xmm0
movsd %xmm0, -8(%rax)
cmpq %rax, %rdi // `b_copy` in %rax, `b_end` in %rdi
ja .L32 // the loop ends, with `b_copy = b_end`
movq %rdx, %rax // b_copy = b;
notq %rax // b_copy = ~b_copy;
addq %rax, %rdi // b_end = b_end + b_copy;
andq $-16, %rdi // b_end = b_end & (-16);
leaq 16(%rdi), %rax // b_copy = b_end + 16;
addq %rax, %rsi // a += b_copy; (isn't `b_copy` just 0?)
addq %rax, %rdx // b += b_copy;
.L31:
testq %r8, %r8 // if (nr == 0) jump to .L30
je .L30
movsd (%rsi), %xmm0 // xmm0 = a[0];
movsd %xmm0, (%rdx) // b[0] = xmm0;
.L30:
ret
Can anyone explain what the compiler has in mind in all three cases?
Looks like if I unroll the loop in the following manner, a compiler can generate neater code.
#include <stdlib.h>
#include <stddef.h>
void foo (ptrdiff_t n, double *a, double *b) {
ptrdiff_t i = n & 1;
if (i) b[0] = a[0];
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
}
}
void bar (size_t n, double *a, double *b) {
size_t i = n & 1;
if (i) b[0] = a[0];
while (i < n) {
b[i] = a[i];
b[i + 1] = a[i + 1];
i += 2;
}
}
void baz (size_t n, double *a, double *b) {
size_t nr = n & 1;
double *b_end = b + n;
if (nr) b[0] = a[0];
b += nr;
while (b < b_end) {
b[0] = a[0];
b[1] = a[1];
a += 2;
b += 2;
}
}
foo:
movq %rdi, %rax
andl $1, %eax
je .L9
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
cmpq %rax, %rdi
jle .L11
.L4:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
.L9:
cmpq %rax, %rdi
jg .L4
.L11:
ret
bar:
movq %rdi, %rax
andl $1, %eax
je .L20
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
cmpq %rax, %rdi
jbe .L21
.L15:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
.L20:
cmpq %rax, %rdi
ja .L15
.L21:
ret
baz:
leaq (%rdx,%rdi,8), %rcx
andl $1, %edi
je .L23
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L23:
leaq (%rdx,%rdi,8), %rax
cmpq %rax, %rcx
jbe .L22
.L25:
movsd (%rsi), %xmm0
addq $16, %rax
addq $16, %rsi
movsd %xmm0, -16(%rax)
movsd -8(%rsi), %xmm0
movsd %xmm0, -8(%rax)
cmpq %rax, %rcx
ja .L25
.L22:
ret
If you're asking why the assembly is relatively large, its because the compiler can't assume what you may know.
For example, if you know the source array will not be modified during the copy, tell the compiler so by adding a const qualifier to the pointed at source data.
void foo (ptrdiff_t n, double *a, double const *b)
Further, if you know the two memory ranges will never overlap, add a restrict qualifier to each of the two pointers.
void foo (ptrdiff_t n, double *restrict a, double const *restrict b)
Ultimately, if you want the most optimized copy (compiler vendors spend a LOT of time on this), use memcpy for non-overlapping ranges, and memmove for overlapping ranges.
This is a follow on to this post. Disclaimer: I have done zero profiling and don't even have an application, this is purely for me to learn more about vectorization.
My code is below. I am compiling with gcc 4.9.4 on a machine with an i3 m370. The first loop vectorizes as I expect. However the second loop checking each element of temp is not vectorized AFAICT, with all the "andb" instructions. I expected it to be vectorized with something like _mm_test_all_ones. How can that loop also be vectorized? Second question, I really want this as part of a larger loop. If I uncomment whats below, nothing gets vectorized. How can I also get that vectorized?
#define ARR_LENGTH 4096
#define block_size 4
typedef float afloat __attribute__ ((__aligned__(16)));
char all_equal_2(afloat *a, afloat *b){
unsigned int i, j;
char r = 1;
unsigned int temp[block_size] __attribute__((aligned(16)));
//for (i=0; i<ARR_LENGTH; i+=block_size){
for (j = 0; j < block_size; ++j) {
temp[j] = (*a) == (*b);
a++;
b++;
}
for (j=0; j<block_size; j++){
r &= temp[j];
}
/*if (r == 0){
break;
}
}*/
return r;
}
And the key section of resulting assembly:
.cfi_startproc
movaps (%rdi), %xmm0
cmpeqps (%rsi), %xmm0
movdqa .LC0(%rip), %xmm1
pand %xmm0, %xmm1
movaps %xmm1, -24(%rsp)
movl -24(%rsp), %eax
andl $1, %eax
andb -20(%rsp), %al
andb -16(%rsp), %al
andb -12(%rsp), %al
ret
.cfi_endproc
Update:
This post is similar to my first question. In that question, the vector was a raw pointer so segfaults are possible, but here that isn't a concern. Therefore AFAIK reordering the comparison operations is safe here, but not there. The conclusion is probably the same though.
Autovectorization really likes reductions operations, so the trick was to turn this into a reduction.
#define ARR_LENGTH 4096
typedef float afloat __attribute__ ((__aligned__(16)));
int foo(afloat *a, afloat *b){
unsigned int i, j;
unsigned int result;
unsigned int blocksize = 4;
for (i=0; i<ARR_LENGTH; i+=blocksize){
result = 0;
for (j=0; j<blocksize; j++){
result += (*a) == (*b);
a++;
b++;
}
if (result == blocksize){
blocksize *= 2;
} else {
break;
}
}
blocksize = ARR_LENGTH - i;
for (i=0; i<blocksize; i++){
result += (*a) == (*b);
a++;
b++;
}
return result == i;
}
Compiles into a nice loop:
.L3:
movaps (%rdi,%rax), %xmm1
addl $1, %ecx
cmpeqps (%rsi,%rax), %xmm1
addq $16, %rax
cmpl %r8d, %ecx
psubd %xmm1, %xmm0
jb .L3
So your loop is quiet small and it is recursive: the result of iteration N is used as an input in iteration N+1.
If you change your second loop to allow 2 operations per ieration:
char r2 = r;
for (j=0; j<block_size/2; j+=2){
r &= temp[j];
r2 &=temp[j+1];
}
r &= r2;
you will see output is optimized
.cfi_def_cfa_register %rbp
vmovss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
vmovss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
vucomiss (%rsi), %xmm0
sete %al
vucomiss 4(%rsi), %xmm1
sete %cl
andb %al, %cl
movzbl %cl, %eax
popq %rbp
retq
.cfi_endproc
for the last point, with the code optimized and the outer loop enabled I see some optimizations. Did you change compilation options?
I am rephrasing this question based on the comments received.
I have a loop that runs 30 Billion times and assigns values to a chunk of memory assigned using malloc();
When the loop contains a condition it runs much slower than when the condition is not present. Review the scenarios below:
Scenario A: Condition is present and program is slow (43 sec)
Scenario B: Condition is not present and program is much faster (4 sec)
// gcc -O3 -c block.c && gcc -o block block.o
#include <stdio.h>
#include <stdlib.h>
#define LEN 3000000000
int main (int argc, char** argv){
long i,j;
unsigned char *n = NULL;
unsigned char *m = NULL;
m = (unsigned char *) malloc (sizeof(char) * LEN);
n = m;
srand ((unsigned) time(NULL));
int t = (unsigned) time(NULL);
for (j = 0; j < 10; j++){
n = m;
for (i = 0; i < LEN; i++){
//////////// A: THIS IS SLOW
/*
if (i % 2){
*n = 1;
} else {
*n = 0;
}
*/
/////////// END OF A
/////////// B: THIS IS FAST
*n = 0;
i % 2;
*n = 1;
/////////// END OF B
n += 1;
}
}
printf("Done. %d sec \n", ((unsigned) time(NULL)) - t );
free(m);
return 0;
}
Regards,
KD
You can use gcc -S -O3 to have a look at the resulting assembler.
Here is an example on an Intel box:
Fast version:
movl %eax, %r12d
.p2align 4,,10
.p2align 3
.L2:
movl $3000000000, %edx
movl $1, %esi
movq %rbp, %rdi
call memset
subq $1, %rbx
jne .L2
Slow version:
movl $10, %edi
movl %eax, %ebp
movl $3000000000, %esi
.p2align 4,,10
.p2align 3
.L2:
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L5:
movq %rdx, %rcx
andl $1, %ecx
movb %cl, (%rbx,%rdx)
addq $1, %rdx
cmpq %rsi, %rdx
jne .L5
subq $1, %rdi
jne .L2
Conclusion: the compiler is smarter than you think. It is able to optimize the inner loop as a memset (which is faster because it uses SSE/AVX or REP instructions on Intel). However, this optimization cannot kick in if the condition is kept - because the result is different.
The assembly functions with commented c version:
/*
int f (int x)
{
return x+2;
}
void map2 (int* um, int * outro, int n)
{
int i;
for (i=0; i<n; i++)
*(outro+i) = f(*(um+i));
}
*/
.text
.globl f
f:
/********************************** prologue *************************************/
pushl %ebp
movl %esp, %ebp
pushl %ebx
/********************************************************************************/
movl 8(%ebp), %eax /* eax receives x value */
addl $2, %eax /* return x+2; */
/************************************** end *************************************/
popl %edi
popl %ebx
movl %ebp, %esp
popl %ebp
ret
/*********************************************************************************/
.globl map2
map2:
/********************************** prologue *************************************/
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %esi
pushl %edi
/********************************************************************************/
movl $0, %ebx /* i = 0; INIT */
L1: cmpl 16(%ebp), %ebx /* if (!(i<n)) */
jge out
movl 12(%ebp), %esi /* esi receives 'outro' (another) address */
**movl %ebx, %ecx /* moves ebx value for bytes multiplication */
imul $4, %ecx /* 4(int) * i bytes to course */**
addl %ecx, %esi /* esi points to outro+i */
movl 8(%ebp), %edi /* edi receives 'um' (one) address */
**movl %ebx, %edx /* moves ebx value for bytes multiplication */
imul $4, %edx /* 4(int) * i bytes to course */**
addl %edx, %edi /* edi points to um+i */
/************************ calls f and return it's value *************************/
pushl %ecx
pushl %edx
pushl %eax
pushl (%edi) /* push *(um+i) for 'f' usage */
call f
movl %eax, (%esi) /* *(outro+i) = f(*(um+i)); */
addl $4, %esp /* clears *(um+i) from stack */
popl %eax
popl %edx
popl %ecx
/********************************************************************************/
incl %ebx /* i++; */
jmp L1 /* end loop */
out:
/************************************** end *************************************/
popl %edi
popl %esi
popl %ebx
movl %ebp, %esp
popl %ebp
ret
/********************************************************************************/
The main C Code:
#include <stdio.h>
#define N 10
int f (int x);
void map2 (int* um, int * outro, int n);
int main (void) {
int i;
int a[N], b[N];
for (i=0;i<N;i++)
{
a[i] = i;
printf("b[%d] = %d\n", i, f(i)); // added for debug purposes
}
map2(a,b,N);
printf("\n"); // added for clear sight
for (i=0;i<N;i++)
printf("b[%d] = %d\n", i, b[i]);
return 1;
}
output:
b[0] = 2
b[1] = 3
b[2] = 4
b[3] = 5
b[4] = 6
b[5] = 7
b[6] = 8
b[7] = 9
b[8] = 10
b[9] = 11
b[0] = 33686018
b[1] = 33686019
b[2] = 516
b[3] = -253
b[4] = -145333866
b[5] = -143814668
b[6] = -145333723
b[7] = -143596928
b[8] = 0
b[9] = 134513961
<seg fault>
Pretty clear that the 'f' function is ok, however 'map2' has some problem. From b[4] to b[7] getting memory junk. What's wrong here?
My guess is something about popl %edi popl %esi at the end of the code, since they're holding the two arrays addresses. Even so, changes there didn't fix the problem.