Understanding a simple assembler program [closed]

Understanding a simple assembler program [closed] - c

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 8 years ago.
Improve this question
I practiced assembler a long time ago and I would like to understand a simple program (I generate the assembly code from C code) which adds 2 vectors (actually 2 arrays) and stores the result in another vector (an output array). My goal is to study vectorization afterwards. For this, I use gcc-4.9 under Debian Wheezy on a Core i7 processor.
Here the C code snippet (not vectorized version) :
#include <stdio.h>
#define SIZE 10000
/*
 * Element-wise sum of two arrays: c[i] = a[i] + b[i] for i in [0, SIZE).
 * a, b: input arrays, each read for SIZE elements.
 * c:    output array, written for SIZE elements.
 * None of the pointers is restrict-qualified, so nothing in the signature
 * rules out c overlapping a or b.
 */
void test(double *a, double *b, double *c)
{
int i;
/* One scalar addition per element; SIZE is the compile-time count (10000). */
for (i = 0; i < SIZE; i++)
{
c[i] = a[i] + b[i];
}
}
/*
 * Driver: fills tab1 and tab2 with 0..SIZE-1, clears tab3, calls test() to
 * store tab1 + tab2 into tab3, then prints every element of tab3.
 * Always returns 0.
 */
int main()
{
int i;
/* Three local arrays of SIZE doubles each (3 * 10000 * 8 = 240000 bytes). */
double tab1[SIZE];
double tab2[SIZE];
double tab3[SIZE];
/* Initialise the inputs to their own index and the output to zero. */
for (i = 0; i < SIZE; i++)
{
tab1[i] = i;
tab2[i] = i;
tab3[i] = 0;
}
/* tab3[i] = tab1[i] + tab2[i] */
test(tab1, tab2, tab3);
/* Print one line per element. */
for (i = 0; i < SIZE; i++)
printf(" tab3[%d] = %f\n", i, tab3[i]);
return 0;
}
I generate Assembler code with AT&T syntax :
gcc -std=c99 -c main_no_vectorized.c -O3 -S -o main_no_vectorized.s
Here is the assembly code :
.file "main_no_vectorized.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl test
.type test, #function
# void test(double *a, double *b, double *c), gcc -O3 output, AT&T syntax.
# On entry: %rdi = a, %rsi = b, %rdx = c (no arguments on the stack).
# Structure:
#   1. overlap checks; on possible overlap jump to the scalar loop at .L7
#   2. alignment test on a; optional scalar peel of element 0
#   3. .L3/.L4: loop adding two doubles per iteration (addpd)
#   4. optional scalar tail element, then the common exit at .L1
test:
.LFB3:
.cfi_startproc
# --- Overlap check between b and c (16 bytes = two doubles) ---
# rax = c + 16, rcx = b + 16
leaq 16(%rdx), %rax
leaq 16(%rsi), %rcx
cmpq %rax, %rsi
# r8b = (b >= c + 16), unsigned
setae %r8b
cmpq %rcx, %rdx
# cl = (c >= b + 16), unsigned
setae %cl
orb %cl, %r8b
# Neither condition holds: b and c start within 16 bytes of each other.
je .L7
# --- Same check between a and c ---
# The leaq between cmpq and setae does not modify the flags.
cmpq %rax, %rdi
leaq 16(%rdi), %rax
# cl = (a >= c + 16)
setae %cl
cmpq %rax, %rdx
# al = (c >= a + 16)
setae %al
orb %al, %cl
je .L7
# --- Alignment test: bit 3 of the address of a ---
# Bit clear means a is a multiple of 16, assuming a is at least 8-byte aligned
# (TODO confirm; the code itself only inspects this one bit).
testb $8, %dil
# r12, rbp and rbx are loaded with loop constants below, so their old values
# are saved first. The pushes leave the testb flags intact for the je below.
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
# Bit 3 clear: take the constants for the aligned case.
je .L8
# --- Bit 3 set: peel element 0 with scalar instructions ---
# xmm0 = a[0]
movsd (%rdi), %xmm0
# ebp  = elements covered by the two-at-a-time loop (2 * 4999)
movl $9998, %ebp
# r9d  = iterations of the two-at-a-time loop
movl $4999, %r9d
# r12d = elements left after the peel
movl $9999, %r12d
# r8d and ebx = number of peeled elements
movl $1, %r8d
movl $1, %ebx
# c[0] = a[0] + b[0]
addsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L3:
# r8 = peeled elements * 8 = starting byte offset (0 or 8)
salq $3, %r8
# rax = byte offset inside the loop, ecx = iteration counter
xorl %eax, %eax
xorl %ecx, %ecx
# r11 = a + offset, r10 = b + offset, r8 = c + offset
leaq (%rdi,%r8), %r11
leaq (%rsi,%r8), %r10
addq %rdx, %r8
.p2align 4,,10
.p2align 3
.L4:
# Load two doubles from b with an unaligned load.
movupd (%r10,%rax), %xmm0
addl $1, %ecx
# Add two doubles from a, read directly as a memory operand
# (presumably the reason a was brought to a 16-byte boundary first).
addpd (%r11,%rax), %xmm0
# Store both sums to c with an unaligned store.
movups %xmm0, (%r8,%rax)
addq $16, %rax
# Loop while ecx < r9d (unsigned compare).
cmpl %r9d, %ecx
jb .L4
# --- Tail: is there one element left after the vector loop? ---
cmpl %ebp, %r12d
# eax = peeled + vectorised elements = index of the next element
# (leal does not disturb the flags set by cmpl).
leal (%rbx,%rbp), %eax
# r12d == ebp: everything was handled, skip the tail.
je .L1
# Sign-extend the index to 64 bits and handle the last element as a scalar.
cltq
movsd (%rdi,%rax,8), %xmm0
addsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L1:
# Restore the saved registers in reverse push order and return.
popq %rbx
.cfi_remember_state
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L8:
.cfi_restore_state
# --- Bit 3 of a clear: no peel, 5000 two-at-a-time iterations, no tail ---
movl $10000, %ebp
movl $5000, %r9d
movl $10000, %r12d
xorl %r8d, %r8d
xorl %ebx, %ebx
jmp .L3
.L7:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
# --- Scalar fallback used when the arrays may overlap ---
# Reached before any push, so there is nothing to pop here.
# rax = byte offset into all three arrays.
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L2:
# c[i] = a[i] + b[i], one double at a time
movsd (%rdi,%rax), %xmm0
addsd (%rsi,%rax), %xmm0
movsd %xmm0, (%rdx,%rax)
addq $8, %rax
# 80000 = 10000 elements * 8 bytes
cmpq $80000, %rax
jne .L2
# `rep ret` returns like a plain `ret`.
rep ret
.cfi_endproc
.LFE3:
.size test, .-test
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string " tab3[%d] = %f\n"
.section .text.unlikely
.LCOLDB4:
.section .text.startup,"ax",#progbits
.LHOTB4:
.p2align 4,,15
.globl main
.type main, #function
# int main(), gcc -O3 output, AT&T syntax.
# There is no `call test` in this function: the addition appears inline as
# the .L22 loop. All three arrays live in one 240016-byte stack frame.
#   rsp+16     and rsp+80016 : the two input arrays (both receive the same
#                              values, so which is tab1 and which is tab2
#                              cannot be told from this listing)
#   rsp+160016               : the output array (tab3)
main:
.LFB4:
.cfi_startproc
# rbx is used as the print-loop index below, so its old value is saved.
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
# rax = byte offset for the initialisation loop
xorl %eax, %eax
subq $240016, %rsp
.cfi_def_cfa_offset 240032
# xmm3 = {4,4,4,4} (.LC2): per-iteration index increment
movdqa .LC2(%rip), %xmm3
# rcx / rdx = 16 bytes past the start of each input array
leaq 32(%rsp), %rcx
leaq 80032(%rsp), %rdx
# xmm1 = {0,1,2,3} (.LC1): the four current indices as 32-bit integers
movdqa .LC1(%rip), %xmm1
.p2align 4,,10
.p2align 3
.L21:
# --- Initialisation loop: four elements of each input array per pass ---
# xmm0 low half = indices i+2, i+3 (upper two lanes of xmm1)
pshufd $238, %xmm1, %xmm0
# xmm2 = (double)i, (double)(i+1)
cvtdq2pd %xmm1, %xmm2
# advance all four indices by 4
paddd %xmm3, %xmm1
# store elements i, i+1 into the first input array
movaps %xmm2, 16(%rsp,%rax)
# xmm0 = (double)(i+2), (double)(i+3)
cvtdq2pd %xmm0, %xmm0
# store elements i, i+1 into the second input array
movaps %xmm2, 80016(%rsp,%rax)
# store elements i+2, i+3 into both input arrays
movaps %xmm0, (%rcx,%rax)
movaps %xmm0, (%rdx,%rax)
# 32 bytes = 4 doubles per pass; 80000 bytes = 10000 doubles in total
addq $32, %rax
cmpq $80000, %rax
jne .L21
# --- tab3[i] = 0 became memset(rsp+160016, 0, 80000) ---
leaq 160016(%rsp), %rdi
movl $80000, %edx
xorl %esi, %esi
call memset
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L22:
# --- Inlined body of test(): two doubles per pass, aligned accesses ---
movapd 16(%rsp,%rax), %xmm0
addpd 80016(%rsp,%rax), %xmm0
movaps %xmm0, 160016(%rsp,%rax)
addq $16, %rax
cmpq $80000, %rax
jne .L22
# rbx = i for the print loop
xorl %ebx, %ebx
.p2align 4,,10
.p2align 3
.L23:
# --- printf(" tab3[%d] = %f\n", i, tab3[i]) ---
# xmm4 = tab3[i]
movsd 160016(%rsp,%rbx,8), %xmm4
# second argument: i
movl %ebx, %esi
# first argument: address of the format string
movl $.LC3, %edi
# variadic call: eax = number of vector registers carrying arguments (1)
movl $1, %eax
addq $1, %rbx
# the double argument travels in xmm0
movapd %xmm4, %xmm0
# The value is also written to 8(%rsp); nothing in this listing reads it back.
movsd %xmm4, 8(%rsp)
call printf
cmpq $10000, %rbx
jne .L23
# Release the frame, return 0, restore rbx.
addq $240016, %rsp
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE4:
.size main, .-main
.section .text.unlikely
.LCOLDE4:
.section .text.startup
.LHOTE4:
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
.LC2:
.long 4
.long 4
.long 4
.long 4
.ident "GCC: (Debian 4.9.1-16) 4.9.1"
.section .note.GNU-stack,"",#progbits
Could you explain to me the main steps of the assembly code above in relation to the C code, in particular the "test" function, the initialization loop in the main function, the parameter passing (i.e. where are the push and pop instructions for the stack?) and the actual addition of the "a" and "b" arrays?
What do the .L2, .L3, ... labels correspond to? Is there any relation to the L2 cache and L3 cache?
Sorry for these basic questions, but I am a beginner with Intel x86_64 assembler.
Thanks for your precious help

The generated assembly code is quite complicated. It first checks to see if the arrays a, b, and c overlap in a way that will cause an optimized loop to fail. For example, if you did this:
test(tab1, tab2, &tab1[1]);
then the overlap would be detected and cause the code to jump to L7 (the straightforward implementation). By the way, L stands for Label, and the label numbers are just generated by the compiler with no particular meaning. So L1, L2, L3, etc are just labels that are used for the code to branch to various places. The overlap checks start at .LFB3 and end at the last je .L7.
If no overlap is detected, then an optimized loop will be used. This optimized loop will try to add two doubles at a time instead of just one. The first thing the optimized loop does is to find out if array a is aligned to a 16 byte boundary (the testb $8, %dil instruction). If it is, it will jump to L8 to load a set of constants (e.g. r9 = 5000). If the array is not aligned, it will fall through and load a different set of constants (e.g. r9 = 4999), and also handle the first element. This is because the unaligned case will need to do 4999 iterations two at a time and handle the first and last unaligned elements separately outside the loop. The aligned case will just do 5000 iterations.
Either way, the code reaches L3 next. The code at L3 and L4 is the optimized loop that does the adds two at a time using the addpd instruction (the nonoptimized loop at L7 used addsd to do one add at a time). After the L4 loop finishes, it checks to see if it needs to handle the last element (for the unaligned case). Then it returns with the ret instruction.
By the way, it helps to know that when test is called, a is in rdi, b is in rsi, and c is in rdx. That is the calling convention for 64-bit. Therefore, there are no arguments pushed on the stack. If you don't understand x86 assembly too well, concentrate on the code starting at L7. That is the non-optimized version and you should be able to figure that part out given that I said your three arguments were in rdi, rsi, and rdx.

The .L2 and such are labels, they are used to refer to the next instruction. They are pretty much exactly like labels in C, if you've used goto. The primary use of a label is with a jump or branch, to specify where the jump goes to.
For example, the .L2 label is start of the body of your for (i = 0; i < SIZE; i++) loop in test(), it is counting by 8 bytes (the size of a double) up to 8*10000. The last instruction in the loop is jne .L2, which jumps to .L2 if the previous comparison was not equal.
You may find this reference (PDF) on x64 helpful.

Related

GCC optimizing _mm256_setzero_si256 away?

Consider the following C program.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Allocates a buffer of n ints and fills it with random() values.
 * The buffer is never read afterwards and never freed.
 */
static void do_stuff(void)
{
const int n = 256;
/* The malloc result is used without a NULL check. */
int *ar = malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
ar[i] = random();
}
/*
 * Zeroes a 256-bit vector, then sums its eight 32-bit lanes by reading the
 * vector object through an int pointer, and prints the sum.
 */
int main(void)
{
do_stuff();
__m256i sm = _mm256_setzero_si256();
int sum = 0;
/* NOTE(review): sm is an __m256i object read below through an int lvalue.
   This looks like a strict-aliasing violation; the post reports the code
   behaving as expected when built with -fno-strict-aliasing. */
int *vcadd = (int*)&sm;
/* 8 lanes * 4 bytes = the 32 bytes of sm */
for (size_t l = 0; l < 8; l++)
sum += vcadd[l];
printf("sum = %d\n", sum);
return 0;
}
I expected this program to print sum = 0, but when I compile it with gcc -mavx2 src.c -O2, it sometimes prints sum = 0, sometimes sum = 18.
When compiled with -O1 or -O0, the program works as expected. It also seems to work fine with -O2 and the do_stuff(); call commented out.
Assembly generated for main with -O1 (+ comments from me of what I think the instructions do):
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
movl $1024, %edi
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.L2:
call random#PLT
movl %eax, (%rbx)
addq $4, %rbx
cmpq %r12, %rbx
jne .L2
vpxor %xmm0, %xmm0, %xmm0 ; zero out %ymm0
vmovdqa %ymm0, (%rsp) ; store these zeros at %rsp
movq %rsp, %rax ; add up the 8 ints stored at %rsp,..., %rsp + 32 (upper bound exclusive)
leaq 32(%rsp), %rcx ; ^
movl $0, %edx ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
leaq .LC0(%rip), %rsi
movl $1, %edi
movl $0, %eax
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L8
movl $0, %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L8:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
and with -O2:
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $1024, %edi
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.p2align 4,,10
.p2align 3
.L2:
call random#PLT
addq $4, %rbx
movl %eax, -4(%rbx)
cmpq %r12, %rbx
jne .L2
movq %rsp, %rax ; just add up %rsp,..., %rsp + 32 without setting that memory to zero
leaq 32(%rsp), %rcx ; ^
xorl %edx, %edx ; ^
.p2align 4,,10 ; ^
.p2align 3 ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
xorl %eax, %eax
leaq .LC0(%rip), %rsi
movl $1, %edi
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L9
leaq -16(%rbp), %rsp
xorl %eax, %eax
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L9:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
So my question is: Why can the compiler do this optimization? Shouldn't the output always be sum = 0?
I'm using
gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0
Solution based on comments
(all below compiled with -O2)
Using memcpy as
__m256i sm = _mm256_setzero_si256();
int ar[8];
memcpy(ar, &sm, 32);
copies the data, although in a somewhat convoluted way (?)
vpxor %xmm0, %xmm0, %xmm0
leaq 48(%rsp), %rax
leaq 80(%rsp), %rcx
xorl %edx, %edx
vmovdqa %ymm0, (%rsp)
vmovdqa 16(%rsp), %xmm2
vmovdqa %xmm0, 48(%rsp)
vmovdqa %xmm2, 64(%rsp)
A union
union conv
{
__m256i val;
int ar[8];
};
union conv c;
c.val = _mm256_setzero_si256();
// access c.ar
works too by producing
vpxor %xmm0, %xmm0, %xmm0
leaq 4(%rsp), %rax
leaq 32(%rsp), %rsi
xorl %ecx, %ecx
vmovdqa %ymm0, (%rsp)
Another option is to compile with -fno-strict-aliasing. In that case, the original code works as I expected.

If you have 8 integers in an __m256i variable and you want their horizontal sum, the best way is probably to use intrinsics.
Here’s an example, untested:
// Horizontal sum of all 8 lanes in int32 SIMD vector
inline int hadd_epi32( __m256i vec )
{
// Add 8 lanes into 4
__m128i r = _mm256_extracti128_si256( vec, 1 );
r = _mm_add_epi32( r, _mm256_castsi256_si128( vec ) );
// Add 4 lanes into 2
r = _mm_add_epi32( r, _mm_unpackhi_epi64( r, r ) );
// Extract 2 lowest lanes from the vector into scalar registers, return their sum
const int i1 = _mm_extract_epi32( r, 1 );
const int i0 = _mm_cvtsi128_si32( r );
return i1 + i0;
}

Why GCC -Ofast makes the program wrong but only when it prints the result twice?

Recompiling an old program made it output the wrong result. I'd like to know why.
I know that -Ofast may "disregard strict standards compliance" but I'm curious about what happens under the hood.
I reduced the program to this minimal example foo1.c:
#include <stdio.h>
/*
 * Raises x to the non-negative integer power n by n successive
 * multiplications, starting from 1 (so n == 0 yields 1).
 */
double my_pow(double x, unsigned n)
{ /* returns x^n */
double y = 1;
/* Runs exactly n times; n wraps after the final test, which is harmless here. */
while(n--) y *= x;
return y;
}
/*
 * Computes small^19 once and prints the same value twice in %E format.
 */
void foo(double small)
{ /* prints small^19 */
double x = my_pow(small,19);
/* Both calls print the same variable; x is not modified in between. */
printf("%E\n",x);
printf("%E\n",x);
}
/* Calls foo() with the constant expression 1-0.8-0.2 and returns 0. */
int main(void)
{
/* In double arithmetic 1-0.8-0.2 is a tiny negative value, not exactly
   zero: the non-optimised run in the post prints its 19th power as
   -1.390671E-309. */
foo(1-0.8-0.2);
return 0;
}
When compiled with -Ofast it gives a different output than with any other optimization level.
gcc -Ofast foo1.c && ./a.out:
-0.000000E+00
-0.000000E+00
gcc foo1.c && ./a.out:
-1.390671E-309
-1.390671E-309
A strange fact is that when one of the printf is commented out (file foo2.c) this behavior doesn't replicate making it a sort of heisenbug.
gcc -Ofast foo2.c && ./a.out:
-1.390671E-309
gcc foo2.c && ./a.out:
-1.390671E-309
Information that might be useful:
gcc -v:
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-39) (GCC)
gcc -Ofast foo1.c -S -o -:
.file "foo1.c"
.text
.p2align 4,,15
.globl my_pow
.type my_pow, #function
my_pow:
.LFB11:
.cfi_startproc
testl %edi, %edi
leal -1(%rdi), %edx
je .L10
movl %edi, %ecx
shrl %ecx
movl %ecx, %esi
addl %esi, %esi
je .L11
cmpl $9, %edi
jbe .L11
movapd %xmm0, %xmm1
movapd .LC0(%rip), %xmm2
xorl %eax, %eax
unpcklpd %xmm1, %xmm1
.L9:
addl $1, %eax
mulpd %xmm1, %xmm2
cmpl %eax, %ecx
ja .L9
movapd %xmm2, -24(%rsp)
subl %esi, %edx
cmpl %esi, %edi
movsd -16(%rsp), %xmm1
mulsd %xmm2, %xmm1
je .L2
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
.L35:
cmpl $1, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $2, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $3, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $4, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $5, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $6, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $7, %edx
mulsd %xmm0, %xmm1
je .L2
mulsd %xmm0, %xmm1
.p2align 4,,10
.p2align 3
.L2:
movapd %xmm1, %xmm0
ret
.p2align 4,,10
.p2align 3
.L11:
movsd .LC1(%rip), %xmm1
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
jmp .L35
.p2align 4,,10
.p2align 3
.L10:
movsd .LC1(%rip), %xmm1
jmp .L2
.cfi_endproc
.LFE11:
.size my_pow, .-my_pow
.section .rodata.str1.1,"aMS",#progbits,1
.LC2:
.string "%E\n"
.text
.p2align 4,,15
.globl foo
.type foo, #function
# void foo(double small), gcc -Ofast output for foo1.c, AT&T syntax.
# On entry: %xmm0 = small.
# There is no call to my_pow: small^19 is computed inline with a fixed
# multiply sequence, then printed twice (second printf is a tail call).
foo:
.LFB12:
.cfi_startproc
# xmm2 = small
movapd %xmm0, %xmm2
# 24 bytes of stack: a spill slot at 8(%rsp)
subq $24, %rsp
.cfi_def_cfa_offset 32
# printf arguments set up early: edi = format string, eax = 1 vector register
movl $.LC2, %edi
movl $1, %eax
# xmm2 = {small, small}
unpcklpd %xmm2, %xmm2
movapd %xmm2, %xmm1
# xmm1 = small^2 in both lanes
mulpd %xmm2, %xmm1
# xmm1 = small^4
mulpd %xmm1, %xmm1
# xmm1 = small^8
mulpd %xmm1, %xmm1
# xmm1 = small^9
mulpd %xmm2, %xmm1
# xmm2 = {small^9, small^9}
movapd %xmm1, %xmm2
# xmm1 low lane = upper lane of xmm1 (also small^9)
unpckhpd %xmm1, %xmm1
# xmm2 low lane = small^9 * small^9 = small^18
mulsd %xmm1, %xmm2
# xmm2 low lane = small^18 * small = small^19
mulsd %xmm0, %xmm2
# First printf: argument in xmm0; keep a copy on the stack because xmm
# registers are not preserved across the call.
movapd %xmm2, %xmm0
movsd %xmm2, 8(%rsp)
call printf
# Reload the saved value for the second printf.
movsd 8(%rsp), %xmm2
movl $.LC2, %edi
movl $1, %eax
# Release the frame before the tail call.
addq $24, %rsp
.cfi_def_cfa_offset 8
movapd %xmm2, %xmm0
# Second printf as a tail call: printf returns directly to foo's caller.
jmp printf
.cfi_endproc
.LFE12:
.size foo, .-foo
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB13:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movsd .LC3(%rip), %xmm0
call foo
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE13:
.size main, .-main
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC0:
.long 0
.long 1072693248
.long 0
.long 1072693248
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC1:
.long 0
.long 1072693248
.align 8
.LC3:
.long 0
.long -1131413504
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc foo1.c -S -o -:
.file "foo1.c"
.text
.globl my_pow
.type my_pow, #function
my_pow:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movsd %xmm0, -24(%rbp)
movl %edi, -28(%rbp)
movabsq $4607182418800017408, %rax
movq %rax, -8(%rbp)
jmp .L2
.L3:
movsd -8(%rbp), %xmm0
mulsd -24(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
.L2:
movl -28(%rbp), %eax
leal -1(%rax), %edx
movl %edx, -28(%rbp)
testl %eax, %eax
jne .L3
movq -8(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size my_pow, .-my_pow
.section .rodata
.LC1:
.string "%E\n"
.text
.globl foo
.type foo, #function
foo:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movsd %xmm0, -24(%rbp)
movq -24(%rbp), %rax
movl $19, %edi
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
call my_pow
movsd %xmm0, -32(%rbp)
movq -32(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size foo, .-foo
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movabsq $-4859383997932765184, %rax
movq %rax, -8(%rbp)
movsd -8(%rbp), %xmm0
call foo
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc -Ofast foo2.c -S -o -:
.file "foo2.c"
.text
.p2align 4,,15
.globl my_pow
.type my_pow, #function
my_pow:
.LFB11:
.cfi_startproc
testl %edi, %edi
leal -1(%rdi), %edx
je .L10
movl %edi, %ecx
shrl %ecx
movl %ecx, %esi
addl %esi, %esi
je .L11
cmpl $9, %edi
jbe .L11
movapd %xmm0, %xmm1
movapd .LC0(%rip), %xmm2
xorl %eax, %eax
unpcklpd %xmm1, %xmm1
.L9:
addl $1, %eax
mulpd %xmm1, %xmm2
cmpl %eax, %ecx
ja .L9
movapd %xmm2, -24(%rsp)
subl %esi, %edx
cmpl %esi, %edi
movsd -16(%rsp), %xmm1
mulsd %xmm2, %xmm1
je .L2
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
.L35:
cmpl $1, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $2, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $3, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $4, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $5, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $6, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $7, %edx
mulsd %xmm0, %xmm1
je .L2
mulsd %xmm0, %xmm1
.p2align 4,,10
.p2align 3
.L2:
movapd %xmm1, %xmm0
ret
.p2align 4,,10
.p2align 3
.L11:
movsd .LC1(%rip), %xmm1
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
jmp .L35
.p2align 4,,10
.p2align 3
.L10:
movsd .LC1(%rip), %xmm1
jmp .L2
.cfi_endproc
.LFE11:
.size my_pow, .-my_pow
.section .rodata.str1.1,"aMS",#progbits,1
.LC2:
.string "%E\n"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB12:
.cfi_startproc
movapd %xmm0, %xmm2
movl $.LC2, %edi
movl $1, %eax
unpcklpd %xmm2, %xmm2
movapd %xmm2, %xmm1
mulpd %xmm2, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm2, %xmm1
movapd %xmm1, %xmm2
unpckhpd %xmm1, %xmm1
mulsd %xmm1, %xmm2
mulsd %xmm0, %xmm2
movapd %xmm2, %xmm0
jmp printf
.cfi_endproc
.LFE12:
.size foo, .-foo
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB13:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $.LC2, %edi
movl $1, %eax
movsd .LC3(%rip), %xmm0
call printf
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE13:
.size main, .-main
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC0:
.long 0
.long 1072693248
.long 0
.long 1072693248
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC1:
.long 0
.long 1072693248
.align 8
.LC3:
.long 0
.long -2147418112
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc foo2.c -S -o -:
.file "foo2.c"
.text
.globl my_pow
.type my_pow, #function
my_pow:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movsd %xmm0, -24(%rbp)
movl %edi, -28(%rbp)
movabsq $4607182418800017408, %rax
movq %rax, -8(%rbp)
jmp .L2
.L3:
movsd -8(%rbp), %xmm0
mulsd -24(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
.L2:
movl -28(%rbp), %eax
leal -1(%rax), %edx
movl %edx, -28(%rbp)
testl %eax, %eax
jne .L3
movq -8(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size my_pow, .-my_pow
.section .rodata
.LC1:
.string "%E\n"
.text
.globl foo
.type foo, #function
foo:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movsd %xmm0, -24(%rbp)
movq -24(%rbp), %rax
movl $19, %edi
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
call my_pow
movsd %xmm0, -32(%rbp)
movq -32(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size foo, .-foo
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movabsq $-4859383997932765184, %rax
movq %rax, -8(%rbp)
movsd -8(%rbp), %xmm0
call foo
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits

Under -ffast-math (and its siblings like -Ofast), gcc links your app with special startup code in crtfastmath.c which sets the flush-to-zero flag:
/* Constructor (runs automatically before main): reads the MXCSR register,
   sets the MXCSR_DAZ and MXCSR_FTZ bits in it, and writes it back.
   The non-x86-64 branch is elided in this excerpt. */
static void __attribute__((constructor))
set_fast_math (void)
{
#ifndef __x86_64__
...
#else
/* Read the current MXCSR value. */
unsigned int mxcsr = __builtin_ia32_stmxcsr ();
/* Turn on both bits, leaving all other bits unchanged. */
mxcsr |= MXCSR_DAZ | MXCSR_FTZ;
/* Write the modified value back. */
__builtin_ia32_ldmxcsr (mxcsr);
#endif
}
(from here).

How can gcc -O3 option make the run so fast?

[Question]
I ran the code below with the O3 option, and found that its performance with O3 is nine times higher than its performance without O3.
Edit :
I want to know which key optimization technique is responsible, not just the reason in general. That is my question. I have no experience with x86 assembly, so the assembly code is too hard for me to understand, which is why I posted this question. Alternatively, could you explain the code generated with the O3 option for me?
................................................................................
[C code]
The code just executes addition.
/* Divides x by (255.0 * OFFSET) in double precision and returns the result
   converted to float. OFFSET is not defined in this excerpt. */
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
/* For each of TSIZE output slots, sums OFFSET (8) consecutive bytes of
   ibuffer, scales the sum with minmax_scale() and stores it in H.
   INPUT_FEATURE, TSIZE and OFFSET are not defined in this excerpt. */
int main(int argc, char** argv) {
/* NOTE(review): ibuffer is declared without an initialiser and nothing
   writes to it before the loop below reads it. */
char ibuffer[INPUT_FEATURE];
/* H is written by the loop but never read, and nothing is printed. */
double H[TSIZE];
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
return 0;
}
[Assembly with O3]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
# float minmax_scale(unsigned int x), gcc -O3 output, AT&T syntax.
# On entry: %edi = x. Result is returned in %xmm0 as a float.
minmax_scale:
.LFB0:
.cfi_startproc
# Clear xmm0 before the conversion writes its low lane.
pxor %xmm0, %xmm0
# Writing edi zero-extends it into rdi, so the unsigned 32-bit x becomes a
# non-negative 64-bit integer.
movl %edi, %edi
# xmm0 = (double)x, using the 64-bit signed conversion on the widened value
cvtsi2sdq %rdi, %xmm0
# xmm0 /= .LC0 (the constant 255.0 * OFFSET = 2040.0)
divsd .LC0(%rip), %xmm0
# narrow the double result to float for the return value
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .text.unlikely
.LCOLDB2:
.section .text.startup,"ax",#progbits
.LHOTB2:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
xorl %eax, %eax
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .text.unlikely
.LCOLDE2:
.section .text.startup
.LHOTE2:
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
[Assembly without O3]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2096, %rsp
movl %edi, -2084(%rbp)
movq %rsi, -2096(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L6
.L7:
movl -2068(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2068(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2072(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L6:
cmpl $127, -2072(%rbp)
jle .L7
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L9
call __stack_chk_fail
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits

Your code has no observable side-effects, so the optimizer is simply discarding most of your code.
Using -O3 turns your main function into:
main:
xorl %eax, %eax
ret
Which is equivalent to:
int main()
{
return 0;
}
This shows that micro-benchmarking code can be difficult to do correctly.
Edit:
As pointed out in a comment below, the posted code doesn't initialize ibuffer[INPUT_FEATURE]. Reading an uninitialized variable is undefined behavior which makes the whole program malformed. This is a real problem and the code isn't required to produce reasonable results. Thanks @chqrlie

Following your reply, I modified the code as shown below and repeated the experiment. The result is the same as before: the build with the O3 option is faster than the build with no option.
#define OFFSET (8)
#define INPUT_FEATURE (1024)
#define TSIZE (INPUT_FEATURE/OFFSET)
#include<stdio.h>
/* Divides x by (255.0 * OFFSET) = 2040.0 in double precision and returns
   the result converted to float. */
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
/* Revised driver: zeroes H, fills each H[k] with the scaled sum of 8
   consecutive ibuffer bytes, then prints all TSIZE (128) values. */
int main(int argc, char** argv) {
/* NOTE(review): ibuffer still has no initialiser and is never written
   before the summation loop reads it; only H is initialised below. */
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
/* Zero H; i is advanced but not used in this loop. */
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = 0.0;
}
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
/* Print every H[k]; the format has no separator, so the values run
   together. i is again advanced but unused. */
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
printf("%lf",H[k]);
}
return 0;
}
[code with O3 option]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB23:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE23:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .rodata.str1.1,"aMS",#progbits,1
.LC5:
.string "%lf"
.section .text.unlikely
.LCOLDB6:
.section .text.startup,"ax",#progbits
.LHOTB6:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $128, %ecx
pxor %xmm12, %xmm12
[code no option]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .rodata
.LC2:
.string "%lf"
.text
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2128, %rsp
movl %edi, -2100(%rbp)
movq %rsi, -2112(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2088(%rbp)
movl $0, -2084(%rbp)
jmp .L6
.L7:
movl -2088(%rbp), %eax
cltq
pxor %xmm0, %xmm0
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2084(%rbp)
addl $1, -2088(%rbp)
.L6:
cmpl $127, -2088(%rbp)
jle .L7
movl $0, -2080(%rbp)
movl $0, -2076(%rbp)
jmp .L8
.L9:
movl -2076(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2076(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2080(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2076(%rbp)
addl $1, -2080(%rbp)
.L8:
cmpl $127, -2080(%rbp)
jle .L9
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L10
.L11:
movl -2072(%rbp), %eax
cltq
movq -2064(%rbp,%rax,8), %rax
movq %rax, -2120(%rbp)
movsd -2120(%rbp), %xmm0
movl $.LC2, %edi
movl $1, %eax
call printf
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L10:
cmpl $127, -2072(%rbp)
jle .L11
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L13
call __stack_chk_fail
.L13:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits

Why does summing floats into an int temporary run so much slower than when everything is int?

I compiled this program in GCC 4.8 -O2 on Skylake 6700HQ.
When I use the float data type, the total execution time is 0.000176 sec. When I change float to int, the total time is 0.000026 sec (~7x faster). I don't know the reason for this difference. Related question: assembly output O3
I use this build command in the Geany IDE: gcc -Wall -march=native -O2 -o "%e" "%f". I also tried -O3 and -Ofast, but those do not fix the problem.
I also read this question, but there are too many differences between the float and int implementations there. Since this float implementation is 7 times slower than the corresponding int implementation, this is not a duplicate question.
#include <stdio.h>
#include <time.h>
/* Four 32x32 float matrices at file scope (no initializers).
   `t` is declared but never referenced in this program. */
float a[32][32]
, t[32][32]
, c_result[32][32]
, c_tra[32][32] ;
/* Micro-benchmark: repeatedly times a 32x32 product in which c_result[i][j] is the
   dot product of row i of `a` with row j of `c_tra` (presumably c_tra holds the
   already-transposed second operand -- confirm), and prints the smallest time seen. */
int main()
{
/* w counts repetitions; because the test is `while(w--)` after the body,
   the timed region executes w+1 = 10001 times. */
int w = 10000;
/* NOTE: temp is an int although every matrix element is a float. */
int i, j, k, temp;
struct timespec tStart, tEnd;
/* tBest starts at 10000 seconds so that the first measurement replaces it. */
double tTotal , tBest=10000;
do{
clock_gettime(CLOCK_MONOTONIC,&tStart);
for( i = 0; i < 32; i++){
for( j =0 ; j < 32; j++){
temp=0;
for( k = 0 ;k < 32; k++) {
/* The product is a float; adding it to the int `temp` means temp is converted
   to float, summed, and the sum converted back to int on every iteration. */
temp += a[i][k] * c_tra[j][k];
}
/* int -> float conversion on the store. */
c_result[i][j]= temp;
}
}
clock_gettime(CLOCK_MONOTONIC,&tEnd);
/* Elapsed time in seconds: whole seconds plus the nanosecond difference scaled by 1e9. */
tTotal = (tEnd.tv_sec - tStart.tv_sec);
tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
if(tTotal<tBest)
tBest=tTotal;
}while(w--);
printf(" The best time: %lf sec\n",tBest);
return 0;
}
This is the assembly output for the int data type:
# GCC 4.8.4 output (AT&T syntax, x86-64 System V) for the int variant of the benchmark.
# The section/symbol type arguments are spelled @progbits / @function: in x86 GAS a
# '#' starts a comment, so "#progbits" / "#function" would be silently dropped.
# NOTE(review): this listing contains a single timed pass (no repetition back-edge)
# and a 4-argument format string, so it was generated from a slightly different
# revision of the C source than the one quoted above.
        .file "floatMULm.c"
        .section .rodata.str1.8,"aMS",@progbits,1
        .align 8
.LC2:
        .string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
        .section .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl main
        .type main, @function
main:
.LFB24:
        .cfi_startproc
        subq $40, %rsp                  # locals: tStart at 0(%rsp), tEnd at 16(%rsp)
        .cfi_def_cfa_offset 48
        movl $1, %edi                   # arg1 = 1 (CLOCK_MONOTONIC in the C source)
        movq %rsp, %rsi                 # arg2 = &tStart
        call clock_gettime
        xorl %esi, %esi                 # rsi = byte offset of row i (i * 128)
.L2:                                    # ---- for i ----
        xorl %r8d, %r8d                 # r8 = byte offset of column j (j * 4)
        .p2align 4,,10
        .p2align 3
.L7:                                    # ---- for j ----
        movq %r8, %rdi
        xorl %eax, %eax                 # rax = byte offset of k (k * 4)
        xorl %ecx, %ecx                 # ecx = temp = 0
        salq $5, %rdi                   # rdi = (j * 4) * 32 = byte offset of row j of c_tra
        .p2align 4,,10
        .p2align 3
.L5:                                    # ---- for k: pure integer multiply-accumulate ----
        movl a(%rsi,%rax), %edx         # edx = a[i][k]
        imull c_tra(%rdi,%rax), %edx    # edx *= c_tra[j][k]
        addq $4, %rax
        addl %edx, %ecx                 # temp += product
        cmpq $128, %rax                 # 32 elements * 4 bytes
        jne .L5
        movl %ecx, c_result(%rsi,%r8)   # c_result[i][j] = temp
        addq $4, %r8
        cmpq $128, %r8
        jne .L7
        subq $-128, %rsi                # rsi += 128: next row (-128 fits an 8-bit immediate)
        cmpq $4096, %rsi                # 32 rows * 128 bytes
        jne .L2
        leaq 16(%rsp), %rsi             # arg2 = &tEnd
        movl $1, %edi
        call clock_gettime
        movq 24(%rsp), %rax             # tEnd.tv_nsec
        subq 8(%rsp), %rax              #   - tStart.tv_nsec
        movl $32, %r8d                  # printf args: matrix dimensions
        movl $32, %ecx
        movl $10000, %edx               # printf arg: repetition count
        movl $.LC2, %esi                # format string
        movl $1, %edi                   # first argument of __printf_chk
        vcvtsi2sdq %rax, %xmm1, %xmm1   # xmm1 = (double) nanosecond difference
        movq 16(%rsp), %rax             # tEnd.tv_sec
        subq (%rsp), %rax               #   - tStart.tv_sec
        vcvtsi2sdq %rax, %xmm0, %xmm0   # xmm0 = (double) second difference
        movl $1, %eax                   # variadic call: one vector register holds an argument
        vdivsd .LC1(%rip), %xmm1, %xmm1 # nanoseconds / 1e9
        vaddsd %xmm0, %xmm1, %xmm0      # tTotal
        vminsd .LC0(%rip), %xmm0, %xmm0 # min(tTotal, 10000.0) -> best time
        call __printf_chk
        xorl %eax, %eax                 # return 0
        addq $40, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE24:
        .size main, .-main
        .comm c_tra,4096,32             # 32*32*4 bytes each, 32-byte aligned
        .comm c_result,4096,32
        .comm t,4096,32
        .comm a,4096,32
        .section .rodata.cst8,"aM",@progbits,8
        .align 8
.LC0:                                   # double 10000.0 (0x40C3880000000000)
        .long 0
        .long 1086556160
        .align 8
.LC1:                                   # double 1e9 (0x41CDCD6500000000)
        .long 0
        .long 1104006501
        .ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
        .section .note.GNU-stack,"",@progbits
And this is for float :
# GCC 4.8.4 output (AT&T syntax, x86-64 System V) for the float variant of the benchmark.
# The section/symbol type arguments are spelled @progbits / @function: in x86 GAS a
# '#' starts a comment, so "#progbits" / "#function" would be silently dropped.
# The only difference from the int listing is the inner loop .L5, where the int
# accumulator is converted to float and back on every iteration.
        .file "floatMULm.c"
        .section .rodata.str1.8,"aMS",@progbits,1
        .align 8
.LC2:
        .string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
        .section .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl main
        .type main, @function
main:
.LFB24:
        .cfi_startproc
        subq $40, %rsp                  # locals: tStart at 0(%rsp), tEnd at 16(%rsp)
        .cfi_def_cfa_offset 48
        movl $1, %edi                   # arg1 = 1 (CLOCK_MONOTONIC in the C source)
        movq %rsp, %rsi                 # arg2 = &tStart
        call clock_gettime
        xorl %ecx, %ecx                 # rcx = byte offset of row i (i * 128)
.L2:                                    # ---- for i ----
        xorl %edi, %edi                 # rdi = byte offset of column j (j * 4)
        .p2align 4,,10
        .p2align 3
.L7:                                    # ---- for j ----
        movq %rdi, %rsi
        xorl %eax, %eax                 # rax = byte offset of k (k * 4)
        xorl %edx, %edx                 # edx = temp = 0 (an int)
        salq $5, %rsi                   # rsi = (j * 4) * 32 = byte offset of row j of c_tra
        .p2align 4,,10
        .p2align 3
.L5:                                    # ---- for k: every instruction below is on one dependency chain ----
        vcvtsi2ss %edx, %xmm0, %xmm0    # xmm0 = (float) temp
        vmovss a(%rcx,%rax), %xmm2      # xmm2 = a[i][k]
        vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0 # xmm0 += a[i][k] * c_tra[j][k]
        addq $4, %rax
        vcvttss2si %xmm0, %edx          # temp = (int) xmm0, truncating
        cmpq $128, %rax                 # 32 elements * 4 bytes
        jne .L5
        vcvtsi2ss %edx, %xmm0, %xmm0    # (float) temp for the store
        vmovss %xmm0, c_result(%rcx,%rdi) # c_result[i][j] = temp
        addq $4, %rdi
        cmpq $128, %rdi
        jne .L7
        subq $-128, %rcx                # rcx += 128: next row (-128 fits an 8-bit immediate)
        cmpq $4096, %rcx                # 32 rows * 128 bytes
        jne .L2
        leaq 16(%rsp), %rsi             # arg2 = &tEnd
        movl $1, %edi
        call clock_gettime
        movq 24(%rsp), %rax             # tEnd.tv_nsec
        subq 8(%rsp), %rax              #   - tStart.tv_nsec
        movl $32, %r8d                  # printf args: matrix dimensions
        movl $32, %ecx
        movl $10000, %edx               # printf arg: repetition count
        movl $.LC2, %esi                # format string
        movl $1, %edi                   # first argument of __printf_chk
        vcvtsi2sdq %rax, %xmm1, %xmm1   # xmm1 = (double) nanosecond difference
        movq 16(%rsp), %rax             # tEnd.tv_sec
        subq (%rsp), %rax               #   - tStart.tv_sec
        vcvtsi2sdq %rax, %xmm0, %xmm0   # xmm0 = (double) second difference
        movl $1, %eax                   # variadic call: one vector register holds an argument
        vdivsd .LC1(%rip), %xmm1, %xmm1 # nanoseconds / 1e9
        vaddsd %xmm0, %xmm1, %xmm0      # tTotal
        vminsd .LC0(%rip), %xmm0, %xmm0 # min(tTotal, 10000.0) -> best time
        call __printf_chk
        xorl %eax, %eax                 # return 0
        addq $40, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE24:
        .size main, .-main
        .comm c_tra,4096,32             # 32*32*4 bytes each, 32-byte aligned
        .comm c_result,4096,32
        .comm t,4096,32
        .comm a,4096,32
        .section .rodata.cst8,"aM",@progbits,8
        .align 8
.LC0:                                   # double 10000.0 (0x40C3880000000000)
        .long 0
        .long 1086556160
        .align 8
.LC1:                                   # double 1e9 (0x41CDCD6500000000)
        .long 0
        .long 1104006501
        .ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
        .section .note.GNU-stack,"",@progbits

The problem is the inner loop of the floating-point version:
.L5:
vcvtsi2ss %edx, %xmm0, %xmm0 # xmm0 = (float) temp  (int -> float)
vmovss a(%rcx,%rax), %xmm2 # xmm2 = a[i][k]
vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0 # xmm0 += xmm2 * c_tra[j][k]
addq $4, %rax # advance k by one 4-byte element
vcvttss2si %xmm0, %edx # temp = (int) xmm0  (float -> int, truncating)
cmpq $128, %rax # 32 elements * 4 bytes
jne .L5
Because temp in main() is of type int (corresponding to %edx in the assembly), the value has to be converted back and forth between float and int in the loop. According to http://www.agner.org/optimize/instruction_tables.pdf, CVTSI2SS and CVT(T)SS2SI each have 6 cycles latency on Skylake. Furthermore, the conversions are in the dependency-chain, so out-of-order and superscalar execution do not help much in this case.
Changing main()'s int temp to float temp removes these conversions.

va_alist ( using variable list in 64 bit machine )

I am trying to implement a print function in the kernel module for my learning purposes. I am emulating it on QEMU.
#define va_alist __builtin_va_alist
#define va_dcl __builtin_va_list_t __builtin_va_list; ...
#define va_start(ap) __builtin_varargs_start(ap)
#define va_arg(ap, type) __builtin_va_arg((ap), type)
#define va_end(ap) __builtin_va_end(ap)
But I am getting the error that __builtin_va_alist is undeclared. Should I try to find the definition of __builtin_va_alist as well and put it in my include file, or am I missing something here? Also, if I change __builtin_va_alist to __builtin_va_list (note: the "a" is not there), I get an "implicit declaration of __builtin_varargs_start" error. Kindly help.
Thanks
Chidambaram

How varargs works on x86-64 is actually fairly complicated.
If we take this as an example:
#include <stdio.h>
/* Minimal variadic-call example: after the format string, printf receives a mix of
   integer/pointer arguments (17, "hello", 42) and double arguments (f, 0.8). */
int main()
{
double f=0.7;
/* Argument order after the format: int, double, pointer, int, double. */
printf("%d %f %p %d %f", 17, f, "hello", 42, 0.8);
return 0;
}
The code it generates is:
# GCC 4.6.3 output (AT&T syntax, x86-64 System V) for the printf example above.
# The section/symbol type arguments are spelled @progbits / @function: in x86 GAS a
# '#' starts a comment, so "#progbits" / "#function" would be silently dropped.
        .file "printf.c"
        .section .rodata.str1.1,"aMS",@progbits,1
.LC1:
        .string "hello"
.LC3:
        .string "%d %f %p %d %f"
        .section .text.startup,"ax",@progbits
        .p2align 4,,15
        .globl main
        .type main, @function
main:
.LFB11:
        .cfi_startproc
        subq $8, %rsp                   # keep %rsp 16-byte aligned at the call below
        .cfi_def_cfa_offset 16
        movl $42, %ecx                  # 4th integer-class argument: 42
        movl $.LC1, %edx                # 3rd integer-class argument: address of "hello"
        movsd .LC0(%rip), %xmm1         # 2nd floating-point argument: 0.8
        movl $17, %esi                  # 2nd integer-class argument: 17
        movsd .LC2(%rip), %xmm0         # 1st floating-point argument: f = 0.7
        movl $.LC3, %edi                # 1st integer-class argument: format string
        movl $2, %eax                   # variadic call: two vector registers hold arguments
        call printf
        xorl %eax, %eax                 # return 0
        addq $8, %rsp
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE11:
        .size main, .-main
        .section .rodata.cst8,"aM",@progbits,8
        .align 8
.LC0:                                   # double 0.8 (0x3FE999999999999A)
        .long 2576980378
        .long 1072273817
        .align 8
.LC2:                                   # double 0.7 (0x3FE6666666666666)
        .long 1717986918
        .long 1072064102
        .ident "GCC: (GNU) 4.6.3 20120306 (Red Hat 4.6.3-2)"
        .section .note.GNU-stack,"",@progbits
As you can see, the floating point values are held in %xmm0 and %xmm1, and the printf function (like any other varargs function) is "told" how many arguments are passed in SSE registers by the value in %eax (2 in this case). Regular arguments are passed in registers, so %edi, %esi, %edx, %ecx contain the format string, the first integer argument, the address of "hello" and the second integer argument. This follows the standard argument ordering of x86_64.
The compiler normally generates code to then push all the argument registers on the stack, and "fish out" the registers in the va* functions.
So if we take the above source code and replace the printf with a myprintf, which looks like this:
/*
 * Demonstration stand-in for printf that shows how va_arg retrieves arguments.
 *
 * The contents of fmt are ignored. Exactly five variadic arguments are read in a
 * fixed pattern: arguments 1 and 4 (zero-based) are read as double, the other
 * three as long. Each value is printed with a "double"/"long" tag, followed by a
 * final newline.
 *
 * NOTE(review): callers must really pass five arguments of matching kinds; the
 * example call above passes int and pointer values where long is read, which
 * relies on how the x86-64 ABI passes those values in 64-bit registers.
 */
void myprintf(const char *fmt, ...)
{
    va_list va;
    int i;

    va_start(va, fmt);
    for (i = 0; i < 5; i++)
    {
        switch (i)
        {
        case 1:
        case 4:
            {
                /* Floating-point arguments are fetched from the SSE side. */
                double d = va_arg(va, double);
                printf("double %f:", d);
            }
            break;
        default:
            {
                /* Integer and pointer arguments are fetched from the integer side. */
                long l = va_arg(va, long);
                printf("long %ld:", l);
            }
        }
    }
    /* Every va_start must be paired with va_end before the function returns. */
    va_end(va);
    printf("\n");
}
at the beginning of myprintf it does:
...
# Excerpt from the start of myprintf: spill the argument registers to the stack so
# that va_arg can read them back later. The instructions before this point
# (including whatever sets the flags tested by `je`) are not shown.
# Integer registers: five consecutive 8-byte slots at 40..72(%rsp). %rdi, which
# carries the named parameter fmt, is not stored in this excerpt.
movq %rsi, 40(%rsp)
movq %rdx, 48(%rsp)
movq %rcx, 56(%rsp)
movq %r8, 64(%rsp)
movq %r9, 72(%rsp)
# Presumably taken when the caller passed 0 in %al (no vector-register arguments),
# skipping the SSE spills -- the test itself is outside this excerpt.
je .L2
# Vector registers: eight consecutive 16-byte slots at 80..192(%rsp).
# movaps requires each of these addresses to be 16-byte aligned.
movaps %xmm0, 80(%rsp)
movaps %xmm1, 96(%rsp)
movaps %xmm2, 112(%rsp)
movaps %xmm3, 128(%rsp)
movaps %xmm4, 144(%rsp)
movaps %xmm5, 160(%rsp)
movaps %xmm6, 176(%rsp)
movaps %xmm7, 192(%rsp)
.L2:
...
The code to then fish things out of the stack is quite complicated. This is the floating point side:
# Excerpt: the va_arg(va, double) path of myprintf.
# NOTE(review): the field names below assume the va_list object starts at 8(%rsp),
# which matches the System V x86-64 layout (gp_offset, fp_offset,
# overflow_arg_area, reg_save_area) but is not shown in this excerpt -- confirm.
.L4:
.cfi_restore_state
movl 12(%rsp), %edx # edx = fp_offset
cmpl $176, %edx # 176 = 6*8 + 8*16: size of the register save area
jae .L5 # all saved vector-register slots consumed: read from the stack instead
movl %edx, %eax
addq 24(%rsp), %rax # rax = reg_save_area + fp_offset
addl $16, %edx # each saved vector register occupies 16 bytes
movl %edx, 12(%rsp) # store the advanced fp_offset
.L6:
movsd (%rax), %xmm0 # load the double that va_arg selected
movl $.LC0, %edi # format string (presumably "double %f:")
movl $1, %eax # variadic call: one vector register holds an argument
call printf
jmp .L7
.p2align 4,,10
.p2align 3
# Stack path that rejoins at .L9 (not shown); presumably the integer-side
# counterpart of .L5: take the current stack-argument pointer, advance it by 8.
.L8:
movq 16(%rsp), %rax
leaq 8(%rax), %rdx
movq %rdx, 16(%rsp)
jmp .L9
.p2align 4,,10
.p2align 3
# Stack path for double: rax = overflow_arg_area, then advance the pointer by 8 bytes
# and rejoin the common load at .L6.
.L5:
movq 16(%rsp), %rax
leaq 8(%rax), %rdx
movq %rdx, 16(%rsp)
jmp .L6
Now, I don't know what compiler flags you are using, because my compiler generates this code with gcc -O2 -nostdlib -fno-builtin -ffreestanding without any problem.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight