How to auto-vectorize an array comparison function in C

This is a follow-on to this post. Disclaimer: I have done zero profiling and don't even have an application; this is purely for me to learn more about vectorization.
My code is below. I am compiling with GCC 4.9.4 on a machine with an i3 M370. The first loop vectorizes as I expect. However, the second loop, which checks each element of temp, is not vectorized AFAICT: it compiles to a chain of "andb" instructions. I expected it to be vectorized with something like _mm_test_all_ones. How can that loop also be vectorized? Second question: I really want this as part of a larger loop. If I uncomment what's below, nothing gets vectorized. How can I get that vectorized as well?
#define ARR_LENGTH 4096
#define block_size 4
typedef float afloat __attribute__ ((__aligned__(16)));
char all_equal_2(afloat *a, afloat *b){
    unsigned int i, j;
    char r = 1;
    unsigned int temp[block_size] __attribute__((aligned(16)));
    //for (i = 0; i < ARR_LENGTH; i += block_size){
    for (j = 0; j < block_size; ++j) {
        temp[j] = (*a) == (*b);
        a++;
        b++;
    }
    for (j = 0; j < block_size; j++){
        r &= temp[j];
    }
    /*if (r == 0){
        break;
    }
    }*/
    return r;
}
And the key section of the resulting assembly:
.cfi_startproc
movaps (%rdi), %xmm0
cmpeqps (%rsi), %xmm0
movdqa .LC0(%rip), %xmm1
pand %xmm0, %xmm1
movaps %xmm1, -24(%rsp)
movl -24(%rsp), %eax
andl $1, %eax
andb -20(%rsp), %al
andb -16(%rsp), %al
andb -12(%rsp), %al
ret
.cfi_endproc
Update:
This post is similar to my first question. In that question, the vector was a raw pointer so segfaults are possible, but here that isn't a concern. Therefore AFAIK reordering the comparison operations is safe here, but not there. The conclusion is probably the same though.

Autovectorization really likes reduction operations, so the trick was to turn this into a reduction.
#define ARR_LENGTH 4096
typedef float afloat __attribute__ ((__aligned__(16)));
int foo(afloat *a, afloat *b){
    unsigned int i, j;
    unsigned int result;
    unsigned int blocksize = 4;
    for (i = 0; i < ARR_LENGTH; i += blocksize){
        result = 0;
        for (j = 0; j < blocksize; j++){
            result += (*a) == (*b);
            a++;
            b++;
        }
        if (result == blocksize){
            blocksize *= 2;
        } else {
            break;
        }
    }
    blocksize = ARR_LENGTH - i;
    for (i = 0; i < blocksize; i++){
        result += (*a) == (*b);
        a++;
        b++;
    }
    return result == i;
}
Compiles into a nice loop:
.L3:
movaps (%rdi,%rax), %xmm1
addl $1, %ecx
cmpeqps (%rsi,%rax), %xmm1
addq $16, %rax
cmpl %r8d, %ecx
psubd %xmm1, %xmm0
jb .L3
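For comparison, here is roughly what I had in mind for the second loop of the original version, written by hand with SSE intrinsics instead of relying on the autovectorizer (my own sketch, untested and unprofiled like everything else here; it assumes SSE support and the 16-byte alignment from the typedef above):
#include <xmmintrin.h>
/* Compare one block of 4 floats and report whether every lane matched,
   using a movemask instead of the chain of andb instructions.
   Both pointers are assumed 16-byte aligned. */
static int block_all_equal(const float *a, const float *b){
    __m128 va = _mm_load_ps(a);           /* aligned load of 4 floats from a */
    __m128 vb = _mm_load_ps(b);           /* aligned load of 4 floats from b */
    __m128 eq = _mm_cmpeq_ps(va, vb);     /* per-lane compare, all-ones where equal */
    return _mm_movemask_ps(eq) == 0xF;    /* 1 iff all four lanes compared equal */
}
Whether this actually beats the compiler's reduction is exactly the kind of thing that would need the profiling I haven't done.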

So your loop is quite small, and it is recursive: the result of iteration N is used as an input to iteration N+1.
If you change your second loop to allow 2 operations per iteration:
char r2 = r;
for (j = 0; j < block_size/2; j += 2){
    r &= temp[j];
    r2 &= temp[j+1];
}
r &= r2;
you will see the output is optimized:
.cfi_def_cfa_register %rbp
vmovss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero
vmovss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
vucomiss (%rsi), %xmm0
sete %al
vucomiss 4(%rsi), %xmm1
sete %cl
andb %al, %cl
movzbl %cl, %eax
popq %rbp
retq
.cfi_endproc
For the last point: with the code optimized this way and the outer loop enabled, I do see some optimization. Did you change the compilation options?
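For reference, the same two-accumulator idea can be written so that the loop walks the whole block rather than only its first pair (my own variant, assuming block_size stays a small even constant; the listing above corresponds to the code exactly as posted):
char r = 1, r2 = 1;
for (j = 0; j + 1 < block_size; j += 2){
    r  &= temp[j];          /* even elements feed one accumulator           */
    r2 &= temp[j + 1];      /* odd elements feed the other, independently   */
}
r &= r2;                    /* combine the two dependency chains at the end */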

Related

x86-64 Assembly Loop

This question was posed to me by a friend and I have no idea how to solve it.
loop:
leal (%rdi, %rdi, 4), %eax
leal (%rsi, %rax, 2), %eax
leal 0(, %rax, 4), %edx
cmpl %edx, %esi
jge .L1
leal (%rdi, %rdi, 2), %edx
.L3:
addl %edx, %eax
cmpl $-2, %esi
jl .L3
.L1:
rep ret
And it is supposed to map to this loop in C:
int loop(int a, int b){
    int x, y;
    y = ____;
    for (____; ____; ____){
        ____;
    }
    return ____;
}
My attempt at converting the assembly to C:
y = 5 * a;
y = b + 2 * y;
x = 4 * y;
if (x < b){
    x = 3 * a;
    do {
        y += x;
    } while (b <= -2);
}
return y;
I assumed %eax = y, since 'y' in the code to fill in is the first variable being assigned.
'x' follows as %edx since it's another assignment, and so should be at least part of the initialisation of the for loop.
However, this doesn't seem to fit into the blanks provided, so I am really stuck.
I think I've got a really close, if not perfect, solution:
/* rdi = a, rsi = b */
/* rax = y, rdx = x */
/*
loop:
leal (%rdi, %rdi, 4), %eax
leal (%rsi, %rax, 2), %eax
leal 0(, %rax, 4), %edx
cmpl %edx, %esi
jge .L1
leal (%rdi, %rdi, 2), %edx
.L3:
addl %edx, %eax
cmpl $-2, %esi
jl .L3
.L1:
rep ret
*/
int loop(int a, int b){
    int x, y;
    y = b + (a * 5) * 2;
    for (x = y * 4; x > b;){
        do y += (x = a * 3); while (b < -2);
        break;
    }
    return y;
}
Not sure if break; is an issue but I can't find a better way.

Memmove in Assembly

I'm trying to translate this memmove C code into assembly and I don't get the expected result.
I'm using x86-64 assembly on Xubuntu, and after debugging for 2 hours I don't see where I'm wrong.
C memmove code:
#include <stdio.h>
extern void * memmove(void *dest, void *src, size_t n);
int main () {
    char str1[] = "Geeks"; // Array of size 6
    char str2[] = "Quiz";  // Array of size 5
    puts("str1 before memmove ");
    puts(str1);
    /* Copies contents of str2 to str1 */
    memmove(str1, str2, sizeof(str2));
    puts("\nstr1 after memmove ");
    puts(str1);
    return 0;
}
/*
void * memmove(void *dest, void *src, size_t n) {
    char *d = (char *) dest;
    char *s = (char *) src;
    if (s == d)
        return dest;
    if (s < d) {
        // copy from back
        s = s + n - 1;
        d = d + n - 1;
        while (n--) {
            *d-- = *s--;
        }
    }
    else {
        // copy from front
        while (n--)
            *d++ = *s++;
    }
    return dest;
}
*/
Assembly code:
.globl memmove
# RDI = dest
# RSI = src
# RDX = n
# R8 = d
# R9 = s
memmove:
mov %rdi, %r8 # d = dest
mov %rsi, %r9 # s = src
jmp if_equal
if_equal:
cmp %r8, %r9 # s == d
jz retDest
ja else # s > d
jb if_s_minor # s < d
if_s_minor:
add %rdx, %r9 # s = s + n
sub $1, %r9 # s = s - 1
add %rdx, %r8 # d = d + n
sub $1, %r8 # d = d - 1
jmp while1
while1:
cmp $0, %rdx # n > 0 ?
jna retDest # if n <= 0 go to retDest
sub $1, %rdx # n--
movb (%rsi), %cl # *dst-- = *src--
movb %cl, (%rdi)
leaq -1(%r8), %r8 # *d--
leaq -1(%r9), %r9 # *s--
jmp while1
else:
jmp while2
while2:
cmp $0, %rdx # n > 0 ?
jna retDest
sub $1, %rdx # n--
movb (%rsi), %cl # *dst = *src
movb %cl, (%rdi)
leaq 1(%r8), %r8 # *d++
leaq 1(%r9), %r9 # *s++
jmp while2
retDest:
mov %rdi, %rax
ret
.end
It was supposed to show "Quiz" on the second print but it shows this:
str1 before memmove
Geeks
str1 after memmove
Qeeks
movb (%rsi), %cl # *dst = *src
movb %cl, (%rdi)
leaq 1(%r8), %r8 # *d++
leaq 1(%r9), %r9 # *s++
jmp while2
The problem here is that your code advances the %r8 and %r9 registers, but the %rdi and %rsi registers that are used in the actual move never change, so the same byte gets copied again and again! Either do the load and store through %r9 and %r8 (movb (%r9), %cl and movb %cl, (%r8)), or drop the extra copies and increment %rdi and %rsi directly.
The same problem exists in the while1 code, of course.

Why does gcc's code-gen for my unrolled loop epilogue look over-complicated?

Thanks for all the comments so far. I am sorry that I used a bad example in my original question, one where almost everyone would say: "Oh, you should use memcpy!" But that is not what my question is about.
My question is more generic, about how manual loop unrolling should be done. Consider this example instead, which sums all elements in an array:
#include <stdlib.h>
double sum (size_t n, double *x) {
    size_t nr = n & 1;
    double *end = x + (n - nr);
    double sum_x = 0.0;
    for (; x < end; x++) sum_x += *x;
    if (nr) sum_x += *x;
    return sum_x;
}
The compiler-generated assembly shows behaviour similar to the array-copying example in my original question:
sum:
movq %rdi, %rcx
andl $1, %ecx
subq %rcx, %rdi
leaq (%rsi,%rdi,8), %rdx
cmpq %rdx, %rsi
jnb .L5
movq %rsi, %rax
pxor %xmm0, %xmm0
.L3:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
ja .L3
movq %rsi, %rax
notq %rax
addq %rax, %rdx
shrq $3, %rdx
leaq 8(%rsi,%rdx,8), %rsi
.L2:
testq %rcx, %rcx
je .L1
addsd (%rsi), %xmm0
.L1:
ret
.L5:
pxor %xmm0, %xmm0
jmp .L2
However, if I now schedule the "fractional" part ahead of the main loop (as I later dug out in an answer I posted), the compiler does a much better job.
#include <stdlib.h>
double sum (size_t n, double *x) {
    size_t nr = n & 1;
    double *end = x + n;
    double sum_x = 0.0;
    if (nr) sum_x += *x;
    for (x += nr; x < end; x++) sum_x += *x;
    return sum_x;
}
sum:
leaq (%rsi,%rdi,8), %rdx
pxor %xmm0, %xmm0
andl $1, %edi
je .L2
addsd (%rsi), %xmm0
.L2:
leaq (%rsi,%rdi,8), %rax
cmpq %rax, %rdx
jbe .L1
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
ja .L4
.L1:
ret
I have only used the compiler flag -O2. So, as Peter said, the compiler-generated assembly should be close to the C source code. The question, then, is why the compiler does better in the latter case.
This is not really a performance-related question. It is just something I unconsciously found (and can't explain) when checking the compiler's assembly output for C code from a C project I have been writing. Thanks again. Thanks to Peter for proposing a better title for the question.
Original question:
The following small C function copies a, a vector of n entries, to b. Manual loop unrolling of depth 2 is applied.
#include <stddef.h>
void foo (ptrdiff_t n, double *a, double *b) {
    ptrdiff_t i = 0;
    ptrdiff_t nr = n & 1;
    n -= nr;  // `n` is an even integer
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }  // `i = n` when the loop ends
    if (nr) b[i] = a[i];
}
It gives the following x64 assembly under gcc -O2 (any gcc version 5.4+). However, I find the commented part of the output weird. Why does the compiler generate it at all?
foo:
movq %rdi, %rcx
xorl %eax, %eax
andl $1, %ecx
subq %rcx, %rdi
testq %rdi, %rdi
jle .L11
.L12:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rdi // `i` in %rax, `n` in %rdi
jg .L12 // the loop ends, with `i = n`, BELOW IS WEIRD
subq $1, %rdi // n = n - 1;
shrq %rdi // n = n / 2;
leaq 2(%rdi,%rdi), %rax // i = 2 * n + 2; (this is just `i = n`, isn't it?)
.L11:
testq %rcx, %rcx
je .L10
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L10:
ret
A version using size_t instead of ptrdiff_t gives something similar:
#include <stdlib.h>
void bar (size_t n, double *a, double *b) {
    size_t i = 0;
    size_t nr = n & 1;
    n -= nr;  // `n` is an even integer
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }  // `i = n` when the loop ends
    if (nr) b[i] = a[i];
}
bar:
movq %rdi, %rcx
andl $1, %ecx
subq %rcx, %rdi
je .L20
xorl %eax, %eax
.L21:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rdi // `i` in %rax, `n` in %rdi
ja .L21 // the loop ends, with `i = n`, BUT BELOW IS WEIRD
subq $1, %rdi // n = n - 1;
andq $-2, %rdi // n = n & (-2);
addq $2, %rdi // n = n + 2; (this is just `i = n`, isn't it?)
.L20:
testq %rcx, %rcx
je .L19
movsd (%rsi,%rdi,8), %xmm0
movsd %xmm0, (%rdx,%rdi,8)
.L19:
ret
And here is another equivalent version,
#include <stdlib.h>
void baz (size_t n, double *a, double *b) {
    size_t nr = n & 1;
    n -= nr;
    double *b_end = b + n;
    while (b < b_end) {
        b[0] = a[0];
        b[1] = a[1];
        a += 2;
        b += 2;
    }  // `b = b_end` when the loop ends
    if (nr) b[0] = a[0];
}
but the following assembly looks even odder (though still produced under -O2). Now n, a and b are all copied, and when the loop ends we take 5 lines of code just to end up with b_copy = 0?!
baz: // initially, `n` in %rdi, `a` in %rsi, `b` in %rdx
movq %rdi, %r8 // n_copy = n;
andl $1, %r8d // nr = n_copy & 1;
subq %r8, %rdi // n_copy -= nr;
leaq (%rdx,%rdi,8), %rdi // b_end = b + n;
cmpq %rdi, %rdx // if (b >= b_end) jump to .L31
jnb .L31
movq %rdx, %rax // b_copy = b;
movq %rsi, %rcx // a_copy = a;
.L32:
movsd (%rcx), %xmm0
addq $16, %rax
addq $16, %rcx
movsd %xmm0, -16(%rax)
movsd -8(%rcx), %xmm0
movsd %xmm0, -8(%rax)
cmpq %rax, %rdi // `b_copy` in %rax, `b_end` in %rdi
ja .L32 // the loop ends, with `b_copy = b_end`
movq %rdx, %rax // b_copy = b;
notq %rax // b_copy = ~b_copy;
addq %rax, %rdi // b_end = b_end + b_copy;
andq $-16, %rdi // b_end = b_end & (-16);
leaq 16(%rdi), %rax // b_copy = b_end + 16;
addq %rax, %rsi // a += b_copy; (isn't `b_copy` just 0?)
addq %rax, %rdx // b += b_copy;
.L31:
testq %r8, %r8 // if (nr == 0) jump to .L30
je .L30
movsd (%rsi), %xmm0 // xmm0 = a[0];
movsd %xmm0, (%rdx) // b[0] = xmm0;
.L30:
ret
Can anyone explain what the compiler has in mind in all three cases?
It looks like if I unroll the loop in the following manner, the compiler can generate neater code.
#include <stdlib.h>
#include <stddef.h>
void foo (ptrdiff_t n, double *a, double *b) {
    ptrdiff_t i = n & 1;
    if (i) b[0] = a[0];
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }
}
void bar (size_t n, double *a, double *b) {
    size_t i = n & 1;
    if (i) b[0] = a[0];
    while (i < n) {
        b[i] = a[i];
        b[i + 1] = a[i + 1];
        i += 2;
    }
}
void baz (size_t n, double *a, double *b) {
    size_t nr = n & 1;
    double *b_end = b + n;
    if (nr) b[0] = a[0];
    b += nr;
    while (b < b_end) {
        b[0] = a[0];
        b[1] = a[1];
        a += 2;
        b += 2;
    }
}
foo:
movq %rdi, %rax
andl $1, %eax
je .L9
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
cmpq %rax, %rdi
jle .L11
.L4:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
.L9:
cmpq %rax, %rdi
jg .L4
.L11:
ret
bar:
movq %rdi, %rax
andl $1, %eax
je .L20
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
cmpq %rax, %rdi
jbe .L21
.L15:
movsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
movsd 8(%rsi,%rax,8), %xmm0
movsd %xmm0, 8(%rdx,%rax,8)
addq $2, %rax
.L20:
cmpq %rax, %rdi
ja .L15
.L21:
ret
baz:
leaq (%rdx,%rdi,8), %rcx
andl $1, %edi
je .L23
movsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L23:
leaq (%rdx,%rdi,8), %rax
cmpq %rax, %rcx
jbe .L22
.L25:
movsd (%rsi), %xmm0
addq $16, %rax
addq $16, %rsi
movsd %xmm0, -16(%rax)
movsd -8(%rsi), %xmm0
movsd %xmm0, -8(%rax)
cmpq %rax, %rcx
ja .L25
.L22:
ret
If you're asking why the assembly is relatively large, it's because the compiler can't assume what you may know.
For example, if you know the source array will not be modified during the copy, tell the compiler so by adding a const qualifier to the pointed-at source data (a is the source here):
void foo (ptrdiff_t n, double const *a, double *b)
Further, if you know the two memory ranges will never overlap, add a restrict qualifier to each of the two pointers:
void foo (ptrdiff_t n, double const *restrict a, double *restrict b)
Ultimately, if you want the most optimized copy (compiler vendors spend a LOT of time on this), use memcpy for non-overlapping ranges, and memmove for overlapping ranges.
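To illustrate that last suggestion, a memcpy-based version of foo might look like the sketch below (my own example, assuming the two arrays never overlap; foo_memcpy is just a name made up for it):
#include <stddef.h>
#include <string.h>
/* Non-overlapping copy of n doubles from a to b, delegated to memcpy. */
void foo_memcpy (ptrdiff_t n, const double *a, double *b) {
    if (n > 0)
        memcpy(b, a, (size_t) n * sizeof *a);
}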

Why does a C program run slower when a loop contains a condition?

I am rephrasing this question based on the comments received.
I have a loop that runs 30 billion times and assigns values to a chunk of memory allocated using malloc().
When the loop contains a condition, it runs much slower than when the condition is not present. Review the two scenarios below:
Scenario A: the condition is present and the program is slow (43 sec)
Scenario B: the condition is not present and the program is much faster (4 sec)
// gcc -O3 -c block.c && gcc -o block block.o
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define LEN 3000000000
int main (int argc, char** argv){
    long i, j;
    unsigned char *n = NULL;
    unsigned char *m = NULL;
    m = (unsigned char *) malloc (sizeof(char) * LEN);
    n = m;
    srand ((unsigned) time(NULL));
    int t = (unsigned) time(NULL);
    for (j = 0; j < 10; j++){
        n = m;
        for (i = 0; i < LEN; i++){
            //////////// A: THIS IS SLOW
            /*
            if (i % 2){
                *n = 1;
            } else {
                *n = 0;
            }
            */
            /////////// END OF A
            /////////// B: THIS IS FAST
            *n = 0;
            i % 2;
            *n = 1;
            /////////// END OF B
            n += 1;
        }
    }
    printf("Done. %d sec \n", ((unsigned) time(NULL)) - t );
    free(m);
    return 0;
}
Regards,
KD
You can use gcc -S -O3 to have a look at the resulting assembler.
Here is an example on an Intel box:
Fast version:
movl %eax, %r12d
.p2align 4,,10
.p2align 3
.L2:
movl $3000000000, %edx
movl $1, %esi
movq %rbp, %rdi
call memset
subq $1, %rbx
jne .L2
Slow version:
movl $10, %edi
movl %eax, %ebp
movl $3000000000, %esi
.p2align 4,,10
.p2align 3
.L2:
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L5:
movq %rdx, %rcx
andl $1, %ecx
movb %cl, (%rbx,%rdx)
addq $1, %rdx
cmpq %rsi, %rdx
jne .L5
subq $1, %rdi
jne .L2
Conclusion: the compiler is smarter than you think. It is able to optimize the inner loop as a memset (which is faster because it uses SSE/AVX or REP instructions on Intel). However, this optimization cannot kick in if the condition is kept - because the result is different.
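In other words, since the unconditional pair *n = 0; *n = 1; always leaves a 1 in the byte and the stray i % 2; has no effect, the fast version's two loops collapse into something like the sketch below (my illustration of what the call memset in the listing above corresponds to, not the literal generated code):
#include <string.h>
#define LEN 3000000000
/* Equivalent of the "fast" version's loops: each outer pass just fills the
   whole buffer with 1, which gcc emits as a single call to memset. */
static void fast_equivalent(unsigned char *m) {
    long j;
    for (j = 0; j < 10; j++)
        memset(m, 1, LEN);
}
The conditional version writes alternating 0 and 1 bytes, which has no such single-call equivalent, so the compiler has to keep the per-byte loop.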

understanding testl in assembly language

I am trying to understand some assembly language, but I am not sure if I am understanding it correctly.
movl 8(%ebp),%eax // assign %eax to a variable, say var
testl %eax,%eax // test if var is > 0 or not. if var is > 0, jump to .L3
jge .L3
addl $15,%eax // add 15 to var
.L3:
sarl $4,%eax // shift var 4 to the right , which is the same as multiplying var by 16
Given the above understanding, I wrote the following code:
int function(int x){
    int ret = x;
    if (ret > 0) {
        ret = ret * 16;
    }
    ret = ret + 15;
    return ret;
}
However, the assembly generated for my code looks like the following:
movl 8(%ebp), %eax
movl %eax, %edx
sall $4, %edx
testl %eax, %eax
cmovg %edx, %eax
addl $15, %eax
Am I misunderstanding the original assembly code somewhere?
Edit: is there perhaps a loop involved?
Notice that the code continues with the shift even after the addition, and that jge also includes the equal case. Thus the code could look more like this:
int function(int x) {
    int ret = x;
    if (ret >= 0) goto skip_add;
    ret = ret + 15;
skip_add:
    ret = ret / 16;
    return ret;
}
Or, to avoid the goto, reverse the condition:
int function(int x) {
    int ret = x;
    if (ret < 0) {
        ret = ret + 15;
    }
    ret = ret / 16;
    return ret;
}
PS: shifting right is division, shifting left would be multiplication.
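To see why the +15 bias is there at all, here is a small self-contained check (my own sketch; it assumes >> on a negative int is an arithmetic shift, which C leaves implementation-defined but which is what sarl does on this target):
#include <stdio.h>
/* Mirrors the assembly: add 15 only for negative inputs, then shift right by 4. */
static int div16_like_the_asm(int x){
    if (x < 0)
        x += 15;       /* the addl $15,%eax path; skipped when jge is taken */
    return x >> 4;     /* sarl $4,%eax */
}
int main(void){
    printf("%d\n", -5 / 16);                /* 0  : C division rounds toward zero  */
    printf("%d\n", -5 >> 4);                /* -1 : a plain shift rounds downward  */
    printf("%d\n", div16_like_the_asm(-5)); /* 0  : the bias fixes up the rounding */
    return 0;
}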

Resources