Why is divsd not much slower than mulsd in this benchmark? - c

Someone told me the factor of speed differences is about 40x. Curious, I wrote a benchmark. It required some help from greybeards who were more knowledgeable about what might be optimizing out, but after several revisions we just cannot find a meaningful difference between divsd and mulsd.
Here's results:
➜ linked gcc main.c && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 2080434
mul: 1925889
div / mul: 1.080246
0.000000
And with O3:
➜ linked gcc main.c -O3 && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 1948388
mul: 1804587
div / mul: 1.079686
0.000000
The code:
#include <time.h>
#include <stdio.h>
#define TRIALS 10000000
int main() {
double op = 0.0;
double x = 1.0;
double const y = 1.21462343468798723984729;
time_t div_start = clock();
for (size_t i = 0; i < TRIALS; i++) {
x /= y;
op += x;
x /= y;
op += x;
x /= y;
op += x;
x /= y;
op += x;
x /= y;
op += x;
}
time_t div_end = clock();
printf("%.16f %.16f\n", x, op);
time_t div_seconds = div_end - div_start;
double op2 = 0.0;
x = 1.0;
double const z = 1 / y;
time_t mul_start = clock();
for (size_t i = 0; i < TRIALS; i++) {
x *= z;
op2 += x;
x *= z;
op2 += x;
x *= z;
op2 += x;
x *= z;
op2 += x;
x *= z;
op2 += x;
}
time_t mul_end = clock();
time_t mul_seconds = mul_end - mul_start;
printf("%.16f %.16f\n", x, op2);
// print results as seconds
printf("div: %ld\nmul: %ld\n", div_seconds, mul_seconds);
printf("div / mul: %f\n", (double)div_seconds / (double)mul_seconds);
printf("%f\n", op - op2);
return 0;
}
The assembly with O3:
.file "main.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string "%.16f %.16f\n"
.LC5:
.string "div: %ld\nmul: %ld\n"
.LC6:
.string "div / mul: %f\n"
.LC7:
.string "%f\n"
.section .text.startup,"ax",#progbits
.p2align 4
.globl main
.type main, #function
main:
.LFB11:
.cfi_startproc
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
pushq %r12
.cfi_def_cfa_offset 24
.cfi_offset 12, -24
pushq %rbp
.cfi_def_cfa_offset 32
.cfi_offset 6, -32
pushq %rbx
.cfi_def_cfa_offset 40
.cfi_offset 3, -40
subq $40, %rsp
.cfi_def_cfa_offset 80
call clock#PLT
movq .LC0(%rip), %rcx
pxor %xmm2, %xmm2
movsd .LC2(%rip), %xmm1
movq %rax, %rbp
movl $10000000, %eax
movq %rcx, %xmm0
.p2align 4,,10
.p2align 3
.L2:
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
subq $1, %rax
jne .L2
movsd %xmm2, 8(%rsp)
leaq .LC3(%rip), %r13
movsd %xmm0, 16(%rsp)
call clock#PLT
movsd 8(%rsp), %xmm2
movsd 16(%rsp), %xmm0
movq %r13, %rdi
movq %rax, %rbx
movl $2, %eax
movapd %xmm2, %xmm1
subq %rbp, %rbx
call printf#PLT
call clock#PLT
movq .LC0(%rip), %rdx
movsd 8(%rsp), %xmm2
pxor %xmm1, %xmm1
movsd .LC4(%rip), %xmm3
movq %rax, %r12
movl $10000000, %eax
movq %rdx, %xmm0
.p2align 4,,10
.p2align 3
.L3:
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
subq $1, %rax
jne .L3
movsd %xmm2, 24(%rsp)
movsd %xmm1, 8(%rsp)
movsd %xmm0, 16(%rsp)
call clock#PLT
movsd 8(%rsp), %xmm1
movsd 16(%rsp), %xmm0
movq %r13, %rdi
subq %r12, %rax
movq %rax, %rbp
movl $2, %eax
call printf#PLT
movq %rbp, %rdx
movq %rbx, %rsi
xorl %eax, %eax
leaq .LC5(%rip), %rdi
call printf#PLT
pxor %xmm0, %xmm0
pxor %xmm3, %xmm3
leaq .LC6(%rip), %rdi
cvtsi2sdq %rbp, %xmm3
movl $1, %eax
cvtsi2sdq %rbx, %xmm0
divsd %xmm3, %xmm0
call printf#PLT
movsd 24(%rsp), %xmm2
movsd 8(%rsp), %xmm1
leaq .LC7(%rip), %rdi
movl $1, %eax
subsd %xmm1, %xmm2
movapd %xmm2, %xmm0
call printf#PLT
addq $40, %rsp
.cfi_def_cfa_offset 40
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 32
popq %rbp
.cfi_def_cfa_offset 24
popq %r12
.cfi_def_cfa_offset 16
popq %r13
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE11:
.size main, .-main
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1072693248
.align 8
.LC2:
.long -74511709
.long 1072918296
.align 8
.LC4:
.long 644960408
.long 1072322682
.ident "GCC: (GNU) 12.2.0"
.section .note.GNU-stack,"",#progbits
It's clear divsd is not being optimized out? What am I doing wrong?

Related

GCC optimizing _mm256_setzero_si256 away?

Consider the following C program.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
static void do_stuff(void)
{
const int n = 256;
int *ar = malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
ar[i] = random();
}
int main(void)
{
do_stuff();
__m256i sm = _mm256_setzero_si256();
int sum = 0;
int *vcadd = (int*)&sm;
for (size_t l = 0; l < 8; l++)
sum += vcadd[l];
printf("sum = %d\n", sum);
return 0;
}
I expected this program to print sum = 0, but when I compile it with gcc -mavx2 src.c -O2, it sometimes prints sum = 0, sometimes sum = 18.
When compiled with -O1 or -O0, the programs works as expected. It also seems to work fine with -O2 and the do_stuff(); call commented out.
Assembly generated for main with -O1 (+ comments from me of what I think the instructions do):
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
movl $1024, %edi
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.L2:
call random#PLT
movl %eax, (%rbx)
addq $4, %rbx
cmpq %r12, %rbx
jne .L2
vpxor %xmm0, %xmm0, %xmm0 ; zero out %ymm0
vmovdqa %ymm0, (%rsp) ; store these zeros at %rsp
movq %rsp, %rax ; add up the 8 ints stored at %rsp,..., %rsp + 32 (upper bound exclusive)
leaq 32(%rsp), %rcx ; ^
movl $0, %edx ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
leaq .LC0(%rip), %rsi
movl $1, %edi
movl $0, %eax
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L8
movl $0, %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L8:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
and with -O2:
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $1024, %edi
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.p2align 4,,10
.p2align 3
.L2:
call random#PLT
addq $4, %rbx
movl %eax, -4(%rbx)
cmpq %r12, %rbx
jne .L2
movq %rsp, %rax ; just add up %rsp,..., %rsp + 32 without setting that memory to zero
leaq 32(%rsp), %rcx ; ^
xorl %edx, %edx ; ^
.p2align 4,,10 ; ^
.p2align 3 ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
xorl %eax, %eax
leaq .LC0(%rip), %rsi
movl $1, %edi
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L9
leaq -16(%rbp), %rsp
xorl %eax, %eax
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L9:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
So my question is: Why can the compiler do this optimization? Shouldn't the output always be sum = 0?
I'm using
gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0
Solution based on comments
(all below compiled with -O2)
Using memcpy as
__m256i sm = _mm256_setzero_si256();
int ar[8];
memcpy(ar, &sm, 32);
copies the data, although in a somewhat convoluted way (?)
vpxor %xmm0, %xmm0, %xmm0
leaq 48(%rsp), %rax
leaq 80(%rsp), %rcx
xorl %edx, %edx
vmovdqa %ymm0, (%rsp)
vmovdqa 16(%rsp), %xmm2
vmovdqa %xmm0, 48(%rsp)
vmovdqa %xmm2, 64(%rsp)
A union
union conv
{
__m256i val;
int ar[8];
};
union conv c;
c.val = _mm256_setzero_si256();
// access c.ar
works too by producing
vpxor %xmm0, %xmm0, %xmm0
leaq 4(%rsp), %rax
leaq 32(%rsp), %rsi
xorl %ecx, %ecx
vmovdqa %ymm0, (%rsp)
Another option is to compile with -fno-strict-aliasing. In that case, the original code works as I expected.
If you have 8 integers in __m256i variable, and you want horizontal sum, best way is probably intrinsics.
Here’s an example, untested:
// Horizontal sum of all 8 lanes in int32 SIMD vector
inline int hadd_epi32( __m256i vec )
{
// Add 8 lanes into 4
__m128i r = _mm256_extracti128_si256( vec, 1 );
r = _mm_add_epi32( r, _mm256_castsi256_si128( vec ) );
// Add 4 lanes into 2
r = _mm_add_epi32( r, _mm_unpackhi_epi64( r, r ) );
// Extract 2 lowest lanes from the vector into scalar registers, return their sum
const int i1 = _mm_extract_epi32( r, 1 );
const int i0 = _mm_cvtsi128_si32( r );
return i1 + i0;
}

Why GCC -Ofast makes the program wrong but only when it prints the result twice?

Recompiling an old program made it output the wrong result. I'd like to know why.
I know that -Ofast may "disregard strict standards compliance" but I'm curious about what happens under the hood.
I reduced the program to this minimal example foo1.c:
#include <stdio.h>
double my_pow(double x, unsigned n)
{ /* returns x^n */
double y = 1;
while(n--) y *= x;
return y;
}
void foo(double small)
{ /* prints small^19 */
double x = my_pow(small,19);
printf("%E\n",x);
printf("%E\n",x);
}
int main(void)
{
foo(1-0.8-0.2);
return 0;
}
When compiled with -Ofast it gives a different output than with any other optimization level.
gcc -Ofast foo1.c && ./a.out:
-0.000000E+00
-0.000000E+00
gcc foo1.c && ./a.out:
-1.390671E-309
-1.390671E-309
A strange fact is that when one of the printf is commented out (file foo2.c) this behavior doesn't replicate making it a sort of heisenbug.
gcc -Ofast foo2.c && ./a.out:
-1.390671E-309
gcc foo2.c && ./a.out:
-1.390671E-309
Informations that might be useful:
gcc -v:
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-39) (GCC)
gcc -Ofast foo1.c -S -o -:
.file "foo1.c"
.text
.p2align 4,,15
.globl my_pow
.type my_pow, #function
my_pow:
.LFB11:
.cfi_startproc
testl %edi, %edi
leal -1(%rdi), %edx
je .L10
movl %edi, %ecx
shrl %ecx
movl %ecx, %esi
addl %esi, %esi
je .L11
cmpl $9, %edi
jbe .L11
movapd %xmm0, %xmm1
movapd .LC0(%rip), %xmm2
xorl %eax, %eax
unpcklpd %xmm1, %xmm1
.L9:
addl $1, %eax
mulpd %xmm1, %xmm2
cmpl %eax, %ecx
ja .L9
movapd %xmm2, -24(%rsp)
subl %esi, %edx
cmpl %esi, %edi
movsd -16(%rsp), %xmm1
mulsd %xmm2, %xmm1
je .L2
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
.L35:
cmpl $1, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $2, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $3, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $4, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $5, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $6, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $7, %edx
mulsd %xmm0, %xmm1
je .L2
mulsd %xmm0, %xmm1
.p2align 4,,10
.p2align 3
.L2:
movapd %xmm1, %xmm0
ret
.p2align 4,,10
.p2align 3
.L11:
movsd .LC1(%rip), %xmm1
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
jmp .L35
.p2align 4,,10
.p2align 3
.L10:
movsd .LC1(%rip), %xmm1
jmp .L2
.cfi_endproc
.LFE11:
.size my_pow, .-my_pow
.section .rodata.str1.1,"aMS",#progbits,1
.LC2:
.string "%E\n"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB12:
.cfi_startproc
movapd %xmm0, %xmm2
subq $24, %rsp
.cfi_def_cfa_offset 32
movl $.LC2, %edi
movl $1, %eax
unpcklpd %xmm2, %xmm2
movapd %xmm2, %xmm1
mulpd %xmm2, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm2, %xmm1
movapd %xmm1, %xmm2
unpckhpd %xmm1, %xmm1
mulsd %xmm1, %xmm2
mulsd %xmm0, %xmm2
movapd %xmm2, %xmm0
movsd %xmm2, 8(%rsp)
call printf
movsd 8(%rsp), %xmm2
movl $.LC2, %edi
movl $1, %eax
addq $24, %rsp
.cfi_def_cfa_offset 8
movapd %xmm2, %xmm0
jmp printf
.cfi_endproc
.LFE12:
.size foo, .-foo
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB13:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movsd .LC3(%rip), %xmm0
call foo
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE13:
.size main, .-main
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC0:
.long 0
.long 1072693248
.long 0
.long 1072693248
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC1:
.long 0
.long 1072693248
.align 8
.LC3:
.long 0
.long -1131413504
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc foo1.c -S -o -:
.file "foo1.c"
.text
.globl my_pow
.type my_pow, #function
my_pow:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movsd %xmm0, -24(%rbp)
movl %edi, -28(%rbp)
movabsq $4607182418800017408, %rax
movq %rax, -8(%rbp)
jmp .L2
.L3:
movsd -8(%rbp), %xmm0
mulsd -24(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
.L2:
movl -28(%rbp), %eax
leal -1(%rax), %edx
movl %edx, -28(%rbp)
testl %eax, %eax
jne .L3
movq -8(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size my_pow, .-my_pow
.section .rodata
.LC1:
.string "%E\n"
.text
.globl foo
.type foo, #function
foo:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movsd %xmm0, -24(%rbp)
movq -24(%rbp), %rax
movl $19, %edi
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
call my_pow
movsd %xmm0, -32(%rbp)
movq -32(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size foo, .-foo
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movabsq $-4859383997932765184, %rax
movq %rax, -8(%rbp)
movsd -8(%rbp), %xmm0
call foo
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc -Ofast foo2.c -S -o -:
.file "foo2.c"
.text
.p2align 4,,15
.globl my_pow
.type my_pow, #function
my_pow:
.LFB11:
.cfi_startproc
testl %edi, %edi
leal -1(%rdi), %edx
je .L10
movl %edi, %ecx
shrl %ecx
movl %ecx, %esi
addl %esi, %esi
je .L11
cmpl $9, %edi
jbe .L11
movapd %xmm0, %xmm1
movapd .LC0(%rip), %xmm2
xorl %eax, %eax
unpcklpd %xmm1, %xmm1
.L9:
addl $1, %eax
mulpd %xmm1, %xmm2
cmpl %eax, %ecx
ja .L9
movapd %xmm2, -24(%rsp)
subl %esi, %edx
cmpl %esi, %edi
movsd -16(%rsp), %xmm1
mulsd %xmm2, %xmm1
je .L2
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
.L35:
cmpl $1, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $2, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $3, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $4, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $5, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $6, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $7, %edx
mulsd %xmm0, %xmm1
je .L2
mulsd %xmm0, %xmm1
.p2align 4,,10
.p2align 3
.L2:
movapd %xmm1, %xmm0
ret
.p2align 4,,10
.p2align 3
.L11:
movsd .LC1(%rip), %xmm1
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
jmp .L35
.p2align 4,,10
.p2align 3
.L10:
movsd .LC1(%rip), %xmm1
jmp .L2
.cfi_endproc
.LFE11:
.size my_pow, .-my_pow
.section .rodata.str1.1,"aMS",#progbits,1
.LC2:
.string "%E\n"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB12:
.cfi_startproc
movapd %xmm0, %xmm2
movl $.LC2, %edi
movl $1, %eax
unpcklpd %xmm2, %xmm2
movapd %xmm2, %xmm1
mulpd %xmm2, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm2, %xmm1
movapd %xmm1, %xmm2
unpckhpd %xmm1, %xmm1
mulsd %xmm1, %xmm2
mulsd %xmm0, %xmm2
movapd %xmm2, %xmm0
jmp printf
.cfi_endproc
.LFE12:
.size foo, .-foo
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB13:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $.LC2, %edi
movl $1, %eax
movsd .LC3(%rip), %xmm0
call printf
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE13:
.size main, .-main
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC0:
.long 0
.long 1072693248
.long 0
.long 1072693248
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC1:
.long 0
.long 1072693248
.align 8
.LC3:
.long 0
.long -2147418112
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc foo2.c -S -o -:
.file "foo2.c"
.text
.globl my_pow
.type my_pow, #function
my_pow:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movsd %xmm0, -24(%rbp)
movl %edi, -28(%rbp)
movabsq $4607182418800017408, %rax
movq %rax, -8(%rbp)
jmp .L2
.L3:
movsd -8(%rbp), %xmm0
mulsd -24(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
.L2:
movl -28(%rbp), %eax
leal -1(%rax), %edx
movl %edx, -28(%rbp)
testl %eax, %eax
jne .L3
movq -8(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size my_pow, .-my_pow
.section .rodata
.LC1:
.string "%E\n"
.text
.globl foo
.type foo, #function
foo:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movsd %xmm0, -24(%rbp)
movq -24(%rbp), %rax
movl $19, %edi
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
call my_pow
movsd %xmm0, -32(%rbp)
movq -32(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size foo, .-foo
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movabsq $-4859383997932765184, %rax
movq %rax, -8(%rbp)
movsd -8(%rbp), %xmm0
call foo
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
Under -ffast-math (and it's siblings like -Ofast) gcc links your app with a special startup code in crtfastmath.c which sets flush-to-zero flag:
static void __attribute__((constructor))
set_fast_math (void)
{
#ifndef __x86_64__
...
#else
unsigned int mxcsr = __builtin_ia32_stmxcsr ();
mxcsr |= MXCSR_DAZ | MXCSR_FTZ;
__builtin_ia32_ldmxcsr (mxcsr);
#endif
}
(from here).

Why does summing floats into an int temporary run so much slower than when everything is int?

I compiled this program in GCC 4.8 -O2 on Skylake 6700HQ.
When I use float data type, total execution time is 0.000176 sec. When I change the float to int, the total time is 0.000026 (~7x faster). I don't know the reason for this difference. Related question: assembly output O3
I use this command in Geany IDE build command gcc -Wall -march=native -O2 -o "%e" "%f". I also tried -O3 and -Ofast, but those do not fix the problem.
I also read this question but there is too much differences between this float and int implementation. Since this float implementation is 7 times slower than the corresponding int implementation, this is not a duplicate question
#include <stdio.h>
#include <time.h>
float a[32][32]
, t[32][32]
, c_result[32][32]
, c_tra[32][32] ;
int main()
{
int w = 10000;
int i, j, k, temp;
struct timespec tStart, tEnd;
double tTotal , tBest=10000;
do{
clock_gettime(CLOCK_MONOTONIC,&tStart);
for( i = 0; i < 32; i++){
for( j =0 ; j < 32; j++){
temp=0;
for( k = 0 ;k < 32; k++) {
temp += a[i][k] * c_tra[j][k];
}
c_result[i][j]= temp;
}
}
clock_gettime(CLOCK_MONOTONIC,&tEnd);
tTotal = (tEnd.tv_sec - tStart.tv_sec);
tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
if(tTotal<tBest)
tBest=tTotal;
}while(w--);
printf(" The best time: %lf sec\n",tBest);
return 0;
}
It is assembly out put for int data type:
.file "floatMULm.c"
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
subq $40, %rsp
.cfi_def_cfa_offset 48
movl $1, %edi
movq %rsp, %rsi
call clock_gettime
xorl %esi, %esi
.L2:
xorl %r8d, %r8d
.p2align 4,,10
.p2align 3
.L7:
movq %r8, %rdi
xorl %eax, %eax
xorl %ecx, %ecx
salq $5, %rdi
.p2align 4,,10
.p2align 3
.L5:
movl a(%rsi,%rax), %edx
imull c_tra(%rdi,%rax), %edx
addq $4, %rax
addl %edx, %ecx
cmpq $128, %rax
jne .L5
movl %ecx, c_result(%rsi,%r8)
addq $4, %r8
cmpq $128, %r8
jne .L7
subq $-128, %rsi
cmpq $4096, %rsi
jne .L2
leaq 16(%rsp), %rsi
movl $1, %edi
call clock_gettime
movq 24(%rsp), %rax
subq 8(%rsp), %rax
movl $32, %r8d
movl $32, %ecx
movl $10000, %edx
movl $.LC2, %esi
movl $1, %edi
vcvtsi2sdq %rax, %xmm1, %xmm1
movq 16(%rsp), %rax
subq (%rsp), %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
movl $1, %eax
vdivsd .LC1(%rip), %xmm1, %xmm1
vaddsd %xmm0, %xmm1, %xmm0
vminsd .LC0(%rip), %xmm0, %xmm0
call __printf_chk
xorl %eax, %eax
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size main, .-main
.comm c_tra,4096,32
.comm c_result,4096,32
.comm t,4096,32
.comm a,4096,32
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1086556160
.align 8
.LC1:
.long 0
.long 1104006501
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",#progbits
And this is for float :
.file "floatMULm.c"
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
subq $40, %rsp
.cfi_def_cfa_offset 48
movl $1, %edi
movq %rsp, %rsi
call clock_gettime
xorl %ecx, %ecx
.L2:
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L7:
movq %rdi, %rsi
xorl %eax, %eax
xorl %edx, %edx
salq $5, %rsi
.p2align 4,,10
.p2align 3
.L5:
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss a(%rcx,%rax), %xmm2
vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
addq $4, %rax
vcvttss2si %xmm0, %edx
cmpq $128, %rax
jne .L5
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss %xmm0, c_result(%rcx,%rdi)
addq $4, %rdi
cmpq $128, %rdi
jne .L7
subq $-128, %rcx
cmpq $4096, %rcx
jne .L2
leaq 16(%rsp), %rsi
movl $1, %edi
call clock_gettime
movq 24(%rsp), %rax
subq 8(%rsp), %rax
movl $32, %r8d
movl $32, %ecx
movl $10000, %edx
movl $.LC2, %esi
movl $1, %edi
vcvtsi2sdq %rax, %xmm1, %xmm1
movq 16(%rsp), %rax
subq (%rsp), %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
movl $1, %eax
vdivsd .LC1(%rip), %xmm1, %xmm1
vaddsd %xmm0, %xmm1, %xmm0
vminsd .LC0(%rip), %xmm0, %xmm0
call __printf_chk
xorl %eax, %eax
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size main, .-main
.comm c_tra,4096,32
.comm c_result,4096,32
.comm t,4096,32
.comm a,4096,32
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1086556160
.align 8
.LC1:
.long 0
.long 1104006501
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",#progbits
The problem is the inner loop of the floating-point version:
.L5:
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss a(%rcx,%rax), %xmm2
vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
addq $4, %rax
vcvttss2si %xmm0, %edx
cmpq $128, %rax
jne .L5
Because temp in main() is of type int (corresponding to %edx in the assembly), the value has to be converted back and forth between float and int in the loop. According to http://www.agner.org/optimize/instruction_tables.pdf, CVTSI2SS and CVT(T)SS2SI each have 6 cycles latency on Skylake. Furthermore, the conversions are in the dependency-chain, so out-of-order and superscalar execution do not help much in this case.
Changing main()s int temp to float temp removes these conversions.

Understanding simple assember program [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 8 years ago.
Improve this question
I have practiced Assembler long time ago and I would like to understand a simple program (I generate assembler code from a C code) which adds 2 vectors (actually 2 arrays) and store the result in another vector (an output array). My goal is after to study vectorization. For this, I use gcc-4.9 under Debian Wheezy on i7-core processor.
Here the C code snippet (not vectorized version) :
#include <stdio.h>
#define SIZE 10000
void test(double *a, double *b, double *c)
{
int i;
for (i = 0; i < SIZE; i++)
{
c[i] = a[i] + b[i];
}
}
int main()
{
int i;
double tab1[SIZE];
double tab2[SIZE];
double tab3[SIZE];
for (i = 0; i < SIZE; i++)
{
tab1[i] = i;
tab2[i] = i;
tab3[i] = 0;
}
test(tab1, tab2, tab3);
for (i = 0; i < SIZE; i++)
printf(" tab3[%d] = %f\n", i, tab3[i]);
return 0;
}
I generate Assembler code with AT&T syntax :
gcc -std=c99 -c main_no_vectorized.c -O3 -S -o main_no_vectorized.s
Here is the assembly code :
.file "main_no_vectorized.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl test
.type test, #function
test:
.LFB3:
.cfi_startproc
leaq 16(%rdx), %rax
leaq 16(%rsi), %rcx
cmpq %rax, %rsi
setae %r8b
cmpq %rcx, %rdx
setae %cl
orb %cl, %r8b
je .L7
cmpq %rax, %rdi
leaq 16(%rdi), %rax
setae %cl
cmpq %rax, %rdx
setae %al
orb %al, %cl
je .L7
testb $8, %dil
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
je .L8
movsd (%rdi), %xmm0
movl $9998, %ebp
movl $4999, %r9d
movl $9999, %r12d
movl $1, %r8d
movl $1, %ebx
addsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L3:
salq $3, %r8
xorl %eax, %eax
xorl %ecx, %ecx
leaq (%rdi,%r8), %r11
leaq (%rsi,%r8), %r10
addq %rdx, %r8
.p2align 4,,10
.p2align 3
.L4:
movupd (%r10,%rax), %xmm0
addl $1, %ecx
addpd (%r11,%rax), %xmm0
movups %xmm0, (%r8,%rax)
addq $16, %rax
cmpl %r9d, %ecx
jb .L4
cmpl %ebp, %r12d
leal (%rbx,%rbp), %eax
je .L1
cltq
movsd (%rdi,%rax,8), %xmm0
addsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L1:
popq %rbx
.cfi_remember_state
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L8:
.cfi_restore_state
movl $10000, %ebp
movl $5000, %r9d
movl $10000, %r12d
xorl %r8d, %r8d
xorl %ebx, %ebx
jmp .L3
.L7:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L2:
movsd (%rdi,%rax), %xmm0
addsd (%rsi,%rax), %xmm0
movsd %xmm0, (%rdx,%rax)
addq $8, %rax
cmpq $80000, %rax
jne .L2
rep ret
.cfi_endproc
.LFE3:
.size test, .-test
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string " tab3[%d] = %f\n"
.section .text.unlikely
.LCOLDB4:
.section .text.startup,"ax",#progbits
.LHOTB4:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB4:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xorl %eax, %eax
subq $240016, %rsp
.cfi_def_cfa_offset 240032
movdqa .LC2(%rip), %xmm3
leaq 32(%rsp), %rcx
leaq 80032(%rsp), %rdx
movdqa .LC1(%rip), %xmm1
.p2align 4,,10
.p2align 3
.L21:
pshufd $238, %xmm1, %xmm0
cvtdq2pd %xmm1, %xmm2
paddd %xmm3, %xmm1
movaps %xmm2, 16(%rsp,%rax)
cvtdq2pd %xmm0, %xmm0
movaps %xmm2, 80016(%rsp,%rax)
movaps %xmm0, (%rcx,%rax)
movaps %xmm0, (%rdx,%rax)
addq $32, %rax
cmpq $80000, %rax
jne .L21
leaq 160016(%rsp), %rdi
movl $80000, %edx
xorl %esi, %esi
call memset
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L22:
movapd 16(%rsp,%rax), %xmm0
addpd 80016(%rsp,%rax), %xmm0
movaps %xmm0, 160016(%rsp,%rax)
addq $16, %rax
cmpq $80000, %rax
jne .L22
xorl %ebx, %ebx
.p2align 4,,10
.p2align 3
.L23:
movsd 160016(%rsp,%rbx,8), %xmm4
movl %ebx, %esi
movl $.LC3, %edi
movl $1, %eax
addq $1, %rbx
movapd %xmm4, %xmm0
movsd %xmm4, 8(%rsp)
call printf
cmpq $10000, %rbx
jne .L23
addq $240016, %rsp
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE4:
.size main, .-main
.section .text.unlikely
.LCOLDE4:
.section .text.startup
.LHOTE4:
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
.LC2:
.long 4
.long 4
.long 4
.long 4
.ident "GCC: (Debian 4.9.1-16) 4.9.1"
.section .note.GNU-stack,"",#progbits
Could you explain to me the main steps of this above assembly code in relation with the C code, in particulary the "test" function, the loop of initialization in main function and the parameters passing (i.e where's the push and pop instructions for the stack) and the effective addition of "a" and "b" arrays ?
What corresponds to .L2, .L3, ... segments ? is there a relation with L2 cache, L3 cache ?
Sorry for these basics questions but I begin with Intel x86_64 assembler.
Thanks for your precious help
The generated assembly code is quite complicated. It first checks to see if the arrays a, b, and c overlap in a way that will cause an optimized loop to fail. For example, if you did this:
test(tab1, tab2, &tab1[1]);
then the overlap would be detected and cause the code to jump to L7 (the straightforward implementation). By the way, L stands for Label, and the label numbers are just generated by the compiler with no particular meaning. So L1, L2, L3, etc are just labels that are used for the code to branch to various places. The overlap checks start at .LFB3 and end at the last je .L7.
If no overlap is detected, then an optimized loop will be used. This optimized loop will try to add two doubles at a time instead of just one. The first thing the optimized loop does is to find out if array a is aligned to a 16 byte boundary (the testb $8, %dil instruction). If it is, it will jump to L8 to load a set of constants (e.g. r9 = 5000). If the array is not aligned, if will fall through and load a different set of constants (e.g. r9 = 4999), and also handle the first element. This is because the unaligned case will need to do 4999 iterations two at a time and handle the first and last unaligned elements separately outside the loop. The aligned case will just do 5000 iterations.
Either way, the code reaches L3 next. The code at L3 and L4 is the optimized loop that does the adds two at a time using the addpd instruction (the nonoptimized loop at L7 used addsd to do one add at a time). After the L4 loop finishes, it checks to see if it needs to handle the last element (for the unaligned case). Then it returns with the ret instruction.
By the way, it helps to know that when test is called, a is in rdi, b is in rsi, and c is in rdx. That is the calling convention for 64-bit. Therefore, there are no arguments pushed on the stack. If you don't understand x86 assembly too well, concentrate on the code starting at L7. That is the non-optimized version and you should be able to figure that part out given that I said your three arguments were in rdi, rsi, and rdx.
The .L2 and such are labels, they are used to refer to the next instruction. They are pretty much exactly like labels in C, if you've used goto. The primary use of a label is with a jump or branch, to specify where the jump goes to.
For example, the .L2 label is start of the body of your for (i = 0; i < SIZE; i++) loop in test(), it is counting by 8 bytes (the size of a double) up to 8*10000. The last instruction in the loop is jne .L2, which jumps to .L2 if the previous comparison was not equal.
You may find this reference (PDF) on x64 helpful.

Performance difference with casting to double and increment

I'm incrementing a counter, which I will need to use after the loop in double arithmetic. So, which would you expect to be faster? (Or too close to call?)
Code 1:
double dubs = 3.14159265;
double d;
for(d=0; d<BIGNUM; d++) { /* do stuff not depending on d */ }
dubs /= d;
Code 2:
double dubs = 3.14159265;
int i;
for(i=0; i<BIGNUM; i++) { /* do stuff not depending on i */ }
dubs /= (double) i;
And does it depend on the size of BIGNUM? I know it would be a minuscule difference, but just found myself wondering in theory.
Bonus question: if it were C++, any change in your answer for using static_cast?
--Edit--
Ok, here's a sample code and assembler:
#define BIGNUM 1000000000
#define NUMLOOPS 1000
double test1()
{
double dubs = 3.14159265;
double d;
int k = 1;
for(d=0; d<BIGNUM; d++) { k*= 2; }
dubs /= d;
return dubs;
}
double test2()
{
double dubs = 3.14159265;
int i;
int k = 1;
for(i=0; i<BIGNUM; i++) { k*= 2; }
dubs /= (double)i;
return dubs;
}
int main()
{
double d1=0;
double d2=0;
int i;
for(i=0; i<NUMLOOPS; i++)
{
d1 += test1();
d2 += test2();
}
}
_test1:
LFB2:
pushq %rbp
LCFI0:
movq %rsp, %rbp
LCFI1:
subq $48, %rsp
LCFI2:
call mcount
movabsq $4614256656543962353, %rax
movq %rax, -16(%rbp)
movl $1, -4(%rbp)
movl $0, %eax
movq %rax, -24(%rbp)
jmp L2
L3:
sall -4(%rbp)
movsd -24(%rbp), %xmm0
movsd LC2(%rip), %xmm1
addsd %xmm1, %xmm0
movsd %xmm0, -24(%rbp)
L2:
movsd -24(%rbp), %xmm1
movsd LC3(%rip), %xmm0
ucomisd %xmm1, %xmm0
ja L3
movsd -16(%rbp), %xmm0
divsd -24(%rbp), %xmm0
movsd %xmm0, -16(%rbp)
movq -16(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
leave
ret
_test2:
LFB3:
pushq %rbp
LCFI3:
movq %rsp, %rbp
LCFI4:
subq $32, %rsp
LCFI5:
call mcount
movabsq $4614256656543962353, %rax
movq %rax, -16(%rbp)
movl $1, -8(%rbp)
movl $0, -4(%rbp)
jmp L7
L8:
sall -8(%rbp)
incl -4(%rbp)
L7:
cmpl $99999, -4(%rbp)
jle L8
cvtsi2sd -4(%rbp), %xmm1
movsd -16(%rbp), %xmm0
divsd %xmm1, %xmm0
movsd %xmm0, -16(%rbp)
movq -16(%rbp), %rax
movq %rax, -24(%rbp)
movsd -24(%rbp), %xmm0
leave
ret
Test is currently running....
As a double it probably doesn't matter, but if you'd used float, the first code fragment might not even work. Due to limited precision, after a while, incrementing a float will not change its value. Of course with (signed) integer types, you get UB on overflow, which is arguably worse.
Personally I would recommend always using integer types for a variable that contains something like a count/index that is naturally an integer. Using floating point types for this just feels wrong. But please remove the useless cast in the last line of the second fragment.

Resources