How can gcc -O3 option make the run so fast? - c

[Question]
I run below code with O3 option. And then, I found that the perforamce of the code with O3, is nine times higher than performance of the code without O3.
Edit :
I want to know the key of optimization technique, not reason. This is my question. I have never experienced x86 assembly. So it is too hard to understand x86 assembly code. That is the reason I posted this question. Or, could you explain the code with O3 option for me?
................................................................................
[C code]
The code just executes addition.
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
return 0;
}
[Assembly with O3]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .text.unlikely
.LCOLDB2:
.section .text.startup,"ax",#progbits
.LHOTB2:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
xorl %eax, %eax
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .text.unlikely
.LCOLDE2:
.section .text.startup
.LHOTE2:
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
[Assembly without O3]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2096, %rsp
movl %edi, -2084(%rbp)
movq %rsi, -2096(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L6
.L7:
movl -2068(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2068(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2072(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L6:
cmpl $127, -2072(%rbp)
jle .L7
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L9
call __stack_chk_fail
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits

You code has no observable side-effects so the optimizer is simply discarding most of your code.
Using -O3 turns your main function into:
main:
xorl %eax, %eax
ret
Which is equivalent to:
int main()
{
return 0;
}
This shows that micro-benchmarking code can be difficult to do correctly.
Edit:
As pointed out in a comment below, the posted code doesn't initialize ibuffer[INPUT_FEATURE]. Reading an uninitialized variable is undefined behavior which makes the whole program malformed. This is a real problem and the code isn't required to produce reasonable results. Thanks #chqrlie

I modified the code and experimented, reflecting your reply, with it as follows. result is the same as before. O3 option is better than no option.
#define OFFSET (8)
#define INPUT_FEATURE (1024)
#define TSIZE (INPUT_FEATURE/OFFSET)
#include<stdio.h>
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = 0.0;
}
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
printf("%lf",H[k]);
}
return 0;
}
[code with O3 option]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB23:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE23:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .rodata.str1.1,"aMS",#progbits,1
.LC5:
.string "%lf"
.section .text.unlikely
.LCOLDB6:
.section .text.startup,"ax",#progbits
.LHOTB6:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $128, %ecx
pxor %xmm12, %xmm12
[code no option]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .rodata
.LC2:
.string "%lf"
.text
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2128, %rsp
movl %edi, -2100(%rbp)
movq %rsi, -2112(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2088(%rbp)
movl $0, -2084(%rbp)
jmp .L6
.L7:
movl -2088(%rbp), %eax
cltq
pxor %xmm0, %xmm0
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2084(%rbp)
addl $1, -2088(%rbp)
.L6:
cmpl $127, -2088(%rbp)
jle .L7
movl $0, -2080(%rbp)
movl $0, -2076(%rbp)
jmp .L8
.L9:
movl -2076(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2076(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2080(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2076(%rbp)
addl $1, -2080(%rbp)
.L8:
cmpl $127, -2080(%rbp)
jle .L9
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L10
.L11:
movl -2072(%rbp), %eax
cltq
movq -2064(%rbp,%rax,8), %rax
movq %rax, -2120(%rbp)
movsd -2120(%rbp), %xmm0
movl $.LC2, %edi
movl $1, %eax
call printf
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L10:
cmpl $127, -2072(%rbp)
jle .L11
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L13
call __stack_chk_fail
.L13:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits

Related

Why accessing 2d array takes less than 1d array?

I am working with the matrix-matrix multiplication and I noticed that if I use 2D arrays, A[M][N], insted of 1D arrays to store a matrix, the access to that takes less time against the locality principle. Here there is the code where the time using linearised matrix is greater than the second one. Why ?
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
void matrixMul(double* m1, double* m2, double* m3, int N) {
int i, j, k;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
m3[i * N + j] += m1[i*N+k] * m2[k*N+j];
}
}
}
}
void matrixlin(double** m1, double** m2, double** m3, int N) {
int i, j, k;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
m3[i][j] += m1[i][k] * m2[k][j];
}
}
}
}
int main(int argc, char* argv[]) {
int N = 1024;
int i, j;
double *m1 = (double *)malloc(N*N*sizeof(double));
double *m2 = (double *)malloc(N*N*sizeof(double));
double *m3 = (double *)malloc(N*N*sizeof(double));
memset(m3, 0, N * N * sizeof(double));
double **mm1 = (double **)malloc(N*sizeof(double*));
double **mm2 = (double **)malloc(N*sizeof(double*));
double **mm3 = (double **)malloc(N*sizeof(double*));
for(i=0; i<N; i++){
mm1[i]=(double *)malloc(N*sizeof(double));
mm2[i]=(double *)malloc(N*sizeof(double));
mm3[i]=(double *)malloc(N*sizeof(double));
memset(mm3[i], 0, N * sizeof(double));
}
for(i=0; i<N; i++){
for(j=0; j<N; j++){
m1[i * N + j] = 1.1; m2[i * N + j]=2.1;
mm1[i][j] = 1.1; mm2[i][j] = 2.1;
}
}
clock_t t1 = clock();
matrixMul(m1, m2, m3, N);
t1 = clock() - t1;
printf("Elapsed time linearized: %.5f seconds\n", ((double)t1)/CLOCKS_PER_SEC);
clock_t t2 = clock();
matrixlin(mm1, mm2, mm3, N);
t2 = clock() - t2;
printf("Elapsed time 2D array: %.5f seconds\n", ((double)t2)/CLOCKS_PER_SEC);
free(m1);
free(m2);
free(m3);
for(i=0; i<N; i++){
free(mm1[i]);
free(mm2[i]);
free(mm3[i]);
}
free(mm1);
free(mm2);
free(mm3);
return 0;
}
Output:
$ gcc test.c
$ ./a.out
Elapsed time linearized: 22.40697 seconds
Elapsed time 2D array: 7.61103 seconds
Assembly code (gcc -S):
.file "test.c"
.text
.globl matrixMul
.type matrixMul, #function
matrixMul:
.LFB6:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -12(%rbp)
jmp .L2
.L7:
movl $0, -8(%rbp)
jmp .L3
.L6:
movl $0, -4(%rbp)
jmp .L4
.L5:
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm1
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -4(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -24(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm2
movl -4(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -32(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm0
mulsd %xmm2, %xmm0
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addl $1, -4(%rbp)
.L4:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L5
addl $1, -8(%rbp)
.L3:
movl -8(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L6
addl $1, -12(%rbp)
.L2:
movl -12(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L7
nop
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size matrixMul, .-matrixMul
.globl matrixlin
.type matrixlin, #function
matrixlin:
.LFB7:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -12(%rbp)
jmp .L9
.L14:
movl $0, -8(%rbp)
jmp .L10
.L13:
movl $0, -4(%rbp)
jmp .L11
.L12:
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm1
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -24(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -4(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm2
movl -4(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -32(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm0
mulsd %xmm2, %xmm0
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addl $1, -4(%rbp)
.L11:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L12
addl $1, -8(%rbp)
.L10:
movl -8(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L13
addl $1, -12(%rbp)
.L9:
movl -12(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L14
nop
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE7:
.size matrixlin, .-matrixlin
.section .rodata
.align 8
.LC3:
.string "Elapsed time linearized: %.5f seconds\n"
.align 8
.LC4:
.string "Elapsed time 2D array: %.5f seconds\n"
.text
.globl main
.type main, #function
main:
.LFB8:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $104, %rsp
.cfi_offset 3, -24
movl %edi, -100(%rbp)
movq %rsi, -112(%rbp)
movl $1024, -84(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -80(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -72(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -64(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -64(%rbp), %rax
movl $0, %esi
movq %rax, %rdi
call memset#PLT
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -56(%rbp)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -48(%rbp)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -40(%rbp)
movl $0, -92(%rbp)
jmp .L16
.L17:
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -56(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc#PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -48(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc#PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -40(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc#PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rcx
movq -40(%rbp), %rax
addq %rcx, %rax
movq (%rax), %rax
movl $0, %esi
movq %rax, %rdi
call memset#PLT
addl $1, -92(%rbp)
.L16:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L17
movl $0, -92(%rbp)
jmp .L18
.L21:
movl $0, -88(%rbp)
jmp .L19
.L20:
movl -92(%rbp), %eax
imull -84(%rbp), %eax
movl %eax, %edx
movl -88(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
movsd .LC0(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
imull -84(%rbp), %eax
movl %eax, %edx
movl -88(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
movsd .LC1(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -56(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -88(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd .LC0(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -88(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd .LC1(%rip), %xmm0
movsd %xmm0, (%rax)
addl $1, -88(%rbp)
.L19:
movl -88(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L20
addl $1, -92(%rbp)
.L18:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L21
call clock#PLT
movq %rax, -32(%rbp)
movl -84(%rbp), %ecx
movq -64(%rbp), %rdx
movq -72(%rbp), %rsi
movq -80(%rbp), %rax
movq %rax, %rdi
call matrixMul
call clock#PLT
subq -32(%rbp), %rax
movq %rax, -32(%rbp)
pxor %xmm0, %xmm0
cvtsi2sdq -32(%rbp), %xmm0
movsd .LC2(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
leaq .LC3(%rip), %rdi
movl $1, %eax
call printf#PLT
call clock#PLT
movq %rax, -24(%rbp)
movl -84(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call matrixlin
call clock#PLT
subq -24(%rbp), %rax
movq %rax, -24(%rbp)
pxor %xmm0, %xmm0
cvtsi2sdq -24(%rbp), %xmm0
movsd .LC2(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
leaq .LC4(%rip), %rdi
movl $1, %eax
call printf#PLT
movq -80(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -72(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -64(%rbp), %rax
movq %rax, %rdi
call free#PLT
movl $0, -92(%rbp)
jmp .L22
.L23:
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -56(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free#PLT
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free#PLT
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free#PLT
addl $1, -92(%rbp)
.L22:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L23
movq -56(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -48(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -40(%rbp), %rax
movq %rax, %rdi
call free#PLT
movl $0, %eax
movq -8(%rbp), %rbx
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE8:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long -1717986918
.long 1072798105
.align 8
.LC1:
.long -858993459
.long 1073794252
.align 8
.LC2:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 10.3.0-1ubuntu1) 10.3.0"
.section .note.GNU-stack,"",#progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4:

Why GCC -Ofast makes the program wrong but only when it prints the result twice?

Recompiling an old program made it output the wrong result. I'd like to know why.
I know that -Ofast may "disregard strict standards compliance" but I'm curious about what happens under the hood.
I reduced the program to this minimal example foo1.c:
#include <stdio.h>
double my_pow(double x, unsigned n)
{ /* returns x^n */
double y = 1;
while(n--) y *= x;
return y;
}
void foo(double small)
{ /* prints small^19 */
double x = my_pow(small,19);
printf("%E\n",x);
printf("%E\n",x);
}
int main(void)
{
foo(1-0.8-0.2);
return 0;
}
When compiled with -Ofast it gives a different output than with any other optimization level.
gcc -Ofast foo1.c && ./a.out:
-0.000000E+00
-0.000000E+00
gcc foo1.c && ./a.out:
-1.390671E-309
-1.390671E-309
A strange fact is that when one of the printf is commented out (file foo2.c) this behavior doesn't replicate making it a sort of heisenbug.
gcc -Ofast foo2.c && ./a.out:
-1.390671E-309
gcc foo2.c && ./a.out:
-1.390671E-309
Informations that might be useful:
gcc -v:
Using built-in specs.
COLLECT_GCC=gcc
COLLECT_LTO_WRAPPER=/usr/libexec/gcc/x86_64-redhat-linux/4.8.5/lto-wrapper
Target: x86_64-redhat-linux
Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --with-bugurl=http://bugzilla.redhat.com/bugzilla --enable-bootstrap --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-gnu-unique-object --enable-linker-build-id --with-linker-hash-style=gnu --enable-languages=c,c++,objc,obj-c++,java,fortran,ada,go,lto --enable-plugin --enable-initfini-array --disable-libgcj --with-isl=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/isl-install --with-cloog=/builddir/build/BUILD/gcc-4.8.5-20150702/obj-x86_64-redhat-linux/cloog-install --enable-gnu-indirect-function --with-tune=generic --with-arch_32=x86-64 --build=x86_64-redhat-linux
Thread model: posix
gcc version 4.8.5 20150623 (Red Hat 4.8.5-39) (GCC)
gcc -Ofast foo1.c -S -o -:
.file "foo1.c"
.text
.p2align 4,,15
.globl my_pow
.type my_pow, #function
my_pow:
.LFB11:
.cfi_startproc
testl %edi, %edi
leal -1(%rdi), %edx
je .L10
movl %edi, %ecx
shrl %ecx
movl %ecx, %esi
addl %esi, %esi
je .L11
cmpl $9, %edi
jbe .L11
movapd %xmm0, %xmm1
movapd .LC0(%rip), %xmm2
xorl %eax, %eax
unpcklpd %xmm1, %xmm1
.L9:
addl $1, %eax
mulpd %xmm1, %xmm2
cmpl %eax, %ecx
ja .L9
movapd %xmm2, -24(%rsp)
subl %esi, %edx
cmpl %esi, %edi
movsd -16(%rsp), %xmm1
mulsd %xmm2, %xmm1
je .L2
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
.L35:
cmpl $1, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $2, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $3, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $4, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $5, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $6, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $7, %edx
mulsd %xmm0, %xmm1
je .L2
mulsd %xmm0, %xmm1
.p2align 4,,10
.p2align 3
.L2:
movapd %xmm1, %xmm0
ret
.p2align 4,,10
.p2align 3
.L11:
movsd .LC1(%rip), %xmm1
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
jmp .L35
.p2align 4,,10
.p2align 3
.L10:
movsd .LC1(%rip), %xmm1
jmp .L2
.cfi_endproc
.LFE11:
.size my_pow, .-my_pow
.section .rodata.str1.1,"aMS",#progbits,1
.LC2:
.string "%E\n"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB12:
.cfi_startproc
movapd %xmm0, %xmm2
subq $24, %rsp
.cfi_def_cfa_offset 32
movl $.LC2, %edi
movl $1, %eax
unpcklpd %xmm2, %xmm2
movapd %xmm2, %xmm1
mulpd %xmm2, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm2, %xmm1
movapd %xmm1, %xmm2
unpckhpd %xmm1, %xmm1
mulsd %xmm1, %xmm2
mulsd %xmm0, %xmm2
movapd %xmm2, %xmm0
movsd %xmm2, 8(%rsp)
call printf
movsd 8(%rsp), %xmm2
movl $.LC2, %edi
movl $1, %eax
addq $24, %rsp
.cfi_def_cfa_offset 8
movapd %xmm2, %xmm0
jmp printf
.cfi_endproc
.LFE12:
.size foo, .-foo
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB13:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movsd .LC3(%rip), %xmm0
call foo
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE13:
.size main, .-main
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC0:
.long 0
.long 1072693248
.long 0
.long 1072693248
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC1:
.long 0
.long 1072693248
.align 8
.LC3:
.long 0
.long -1131413504
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc foo1.c -S -o -:
.file "foo1.c"
.text
.globl my_pow
.type my_pow, #function
my_pow:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movsd %xmm0, -24(%rbp)
movl %edi, -28(%rbp)
movabsq $4607182418800017408, %rax
movq %rax, -8(%rbp)
jmp .L2
.L3:
movsd -8(%rbp), %xmm0
mulsd -24(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
.L2:
movl -28(%rbp), %eax
leal -1(%rax), %edx
movl %edx, -28(%rbp)
testl %eax, %eax
jne .L3
movq -8(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size my_pow, .-my_pow
.section .rodata
.LC1:
.string "%E\n"
.text
.globl foo
.type foo, #function
foo:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movsd %xmm0, -24(%rbp)
movq -24(%rbp), %rax
movl $19, %edi
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
call my_pow
movsd %xmm0, -32(%rbp)
movq -32(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size foo, .-foo
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movabsq $-4859383997932765184, %rax
movq %rax, -8(%rbp)
movsd -8(%rbp), %xmm0
call foo
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc -Ofast foo2.c -S -o -:
.file "foo2.c"
.text
.p2align 4,,15
.globl my_pow
.type my_pow, #function
my_pow:
.LFB11:
.cfi_startproc
testl %edi, %edi
leal -1(%rdi), %edx
je .L10
movl %edi, %ecx
shrl %ecx
movl %ecx, %esi
addl %esi, %esi
je .L11
cmpl $9, %edi
jbe .L11
movapd %xmm0, %xmm1
movapd .LC0(%rip), %xmm2
xorl %eax, %eax
unpcklpd %xmm1, %xmm1
.L9:
addl $1, %eax
mulpd %xmm1, %xmm2
cmpl %eax, %ecx
ja .L9
movapd %xmm2, -24(%rsp)
subl %esi, %edx
cmpl %esi, %edi
movsd -16(%rsp), %xmm1
mulsd %xmm2, %xmm1
je .L2
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
.L35:
cmpl $1, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $2, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $3, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $4, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $5, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $6, %edx
mulsd %xmm0, %xmm1
je .L2
cmpl $7, %edx
mulsd %xmm0, %xmm1
je .L2
mulsd %xmm0, %xmm1
.p2align 4,,10
.p2align 3
.L2:
movapd %xmm1, %xmm0
ret
.p2align 4,,10
.p2align 3
.L11:
movsd .LC1(%rip), %xmm1
testl %edx, %edx
mulsd %xmm0, %xmm1
je .L2
jmp .L35
.p2align 4,,10
.p2align 3
.L10:
movsd .LC1(%rip), %xmm1
jmp .L2
.cfi_endproc
.LFE11:
.size my_pow, .-my_pow
.section .rodata.str1.1,"aMS",#progbits,1
.LC2:
.string "%E\n"
.text
.p2align 4,,15
.globl foo
.type foo, #function
foo:
.LFB12:
.cfi_startproc
movapd %xmm0, %xmm2
movl $.LC2, %edi
movl $1, %eax
unpcklpd %xmm2, %xmm2
movapd %xmm2, %xmm1
mulpd %xmm2, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm1, %xmm1
mulpd %xmm2, %xmm1
movapd %xmm1, %xmm2
unpckhpd %xmm1, %xmm1
mulsd %xmm1, %xmm2
mulsd %xmm0, %xmm2
movapd %xmm2, %xmm0
jmp printf
.cfi_endproc
.LFE12:
.size foo, .-foo
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB13:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $.LC2, %edi
movl $1, %eax
movsd .LC3(%rip), %xmm0
call printf
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE13:
.size main, .-main
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC0:
.long 0
.long 1072693248
.long 0
.long 1072693248
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC1:
.long 0
.long 1072693248
.align 8
.LC3:
.long 0
.long -2147418112
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
gcc foo2.c -S -o -:
.file "foo2.c"
.text
.globl my_pow
.type my_pow, #function
my_pow:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movsd %xmm0, -24(%rbp)
movl %edi, -28(%rbp)
movabsq $4607182418800017408, %rax
movq %rax, -8(%rbp)
jmp .L2
.L3:
movsd -8(%rbp), %xmm0
mulsd -24(%rbp), %xmm0
movsd %xmm0, -8(%rbp)
.L2:
movl -28(%rbp), %eax
leal -1(%rax), %edx
movl %edx, -28(%rbp)
testl %eax, %eax
jne .L3
movq -8(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size my_pow, .-my_pow
.section .rodata
.LC1:
.string "%E\n"
.text
.globl foo
.type foo, #function
foo:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movsd %xmm0, -24(%rbp)
movq -24(%rbp), %rax
movl $19, %edi
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
call my_pow
movsd %xmm0, -32(%rbp)
movq -32(%rbp), %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, -32(%rbp)
movsd -32(%rbp), %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size foo, .-foo
.globl main
.type main, #function
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movabsq $-4859383997932765184, %rax
movq %rax, -8(%rbp)
movsd -8(%rbp), %xmm0
call foo
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size main, .-main
.ident "GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
.section .note.GNU-stack,"",#progbits
Under -ffast-math (and it's siblings like -Ofast) gcc links your app with a special startup code in crtfastmath.c which sets flush-to-zero flag:
static void __attribute__((constructor))
set_fast_math (void)
{
#ifndef __x86_64__
...
#else
unsigned int mxcsr = __builtin_ia32_stmxcsr ();
mxcsr |= MXCSR_DAZ | MXCSR_FTZ;
__builtin_ia32_ldmxcsr (mxcsr);
#endif
}
(from here).

Why does summing floats into an int temporary run so much slower than when everything is int?

I compiled this program in GCC 4.8 -O2 on Skylake 6700HQ.
When I use float data type, total execution time is 0.000176 sec. When I change the float to int, the total time is 0.000026 (~7x faster). I don't know the reason for this difference. Related question: assembly output O3
I use this command in Geany IDE build command gcc -Wall -march=native -O2 -o "%e" "%f". I also tried -O3 and -Ofast, but those do not fix the problem.
I also read this question but there is too much differences between this float and int implementation. Since this float implementation is 7 times slower than the corresponding int implementation, this is not a duplicate question
#include <stdio.h>
#include <time.h>
float a[32][32]
, t[32][32]
, c_result[32][32]
, c_tra[32][32] ;
int main()
{
int w = 10000;
int i, j, k, temp;
struct timespec tStart, tEnd;
double tTotal , tBest=10000;
do{
clock_gettime(CLOCK_MONOTONIC,&tStart);
for( i = 0; i < 32; i++){
for( j =0 ; j < 32; j++){
temp=0;
for( k = 0 ;k < 32; k++) {
temp += a[i][k] * c_tra[j][k];
}
c_result[i][j]= temp;
}
}
clock_gettime(CLOCK_MONOTONIC,&tEnd);
tTotal = (tEnd.tv_sec - tStart.tv_sec);
tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
if(tTotal<tBest)
tBest=tTotal;
}while(w--);
printf(" The best time: %lf sec\n",tBest);
return 0;
}
It is assembly out put for int data type:
.file "floatMULm.c"
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
subq $40, %rsp
.cfi_def_cfa_offset 48
movl $1, %edi
movq %rsp, %rsi
call clock_gettime
xorl %esi, %esi
.L2:
xorl %r8d, %r8d
.p2align 4,,10
.p2align 3
.L7:
movq %r8, %rdi
xorl %eax, %eax
xorl %ecx, %ecx
salq $5, %rdi
.p2align 4,,10
.p2align 3
.L5:
movl a(%rsi,%rax), %edx
imull c_tra(%rdi,%rax), %edx
addq $4, %rax
addl %edx, %ecx
cmpq $128, %rax
jne .L5
movl %ecx, c_result(%rsi,%r8)
addq $4, %r8
cmpq $128, %r8
jne .L7
subq $-128, %rsi
cmpq $4096, %rsi
jne .L2
leaq 16(%rsp), %rsi
movl $1, %edi
call clock_gettime
movq 24(%rsp), %rax
subq 8(%rsp), %rax
movl $32, %r8d
movl $32, %ecx
movl $10000, %edx
movl $.LC2, %esi
movl $1, %edi
vcvtsi2sdq %rax, %xmm1, %xmm1
movq 16(%rsp), %rax
subq (%rsp), %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
movl $1, %eax
vdivsd .LC1(%rip), %xmm1, %xmm1
vaddsd %xmm0, %xmm1, %xmm0
vminsd .LC0(%rip), %xmm0, %xmm0
call __printf_chk
xorl %eax, %eax
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size main, .-main
.comm c_tra,4096,32
.comm c_result,4096,32
.comm t,4096,32
.comm a,4096,32
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1086556160
.align 8
.LC1:
.long 0
.long 1104006501
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",#progbits
And this is for float :
.file "floatMULm.c"
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
subq $40, %rsp
.cfi_def_cfa_offset 48
movl $1, %edi
movq %rsp, %rsi
call clock_gettime
xorl %ecx, %ecx
.L2:
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L7:
movq %rdi, %rsi
xorl %eax, %eax
xorl %edx, %edx
salq $5, %rsi
.p2align 4,,10
.p2align 3
.L5:
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss a(%rcx,%rax), %xmm2
vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
addq $4, %rax
vcvttss2si %xmm0, %edx
cmpq $128, %rax
jne .L5
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss %xmm0, c_result(%rcx,%rdi)
addq $4, %rdi
cmpq $128, %rdi
jne .L7
subq $-128, %rcx
cmpq $4096, %rcx
jne .L2
leaq 16(%rsp), %rsi
movl $1, %edi
call clock_gettime
movq 24(%rsp), %rax
subq 8(%rsp), %rax
movl $32, %r8d
movl $32, %ecx
movl $10000, %edx
movl $.LC2, %esi
movl $1, %edi
vcvtsi2sdq %rax, %xmm1, %xmm1
movq 16(%rsp), %rax
subq (%rsp), %rax
vcvtsi2sdq %rax, %xmm0, %xmm0
movl $1, %eax
vdivsd .LC1(%rip), %xmm1, %xmm1
vaddsd %xmm0, %xmm1, %xmm0
vminsd .LC0(%rip), %xmm0, %xmm0
call __printf_chk
xorl %eax, %eax
addq $40, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE24:
.size main, .-main
.comm c_tra,4096,32
.comm c_result,4096,32
.comm t,4096,32
.comm a,4096,32
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1086556160
.align 8
.LC1:
.long 0
.long 1104006501
.ident "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
.section .note.GNU-stack,"",#progbits
The problem is the inner loop of the floating-point version:
.L5:
vcvtsi2ss %edx, %xmm0, %xmm0
vmovss a(%rcx,%rax), %xmm2
vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
addq $4, %rax
vcvttss2si %xmm0, %edx
cmpq $128, %rax
jne .L5
Because temp in main() is of type int (corresponding to %edx in the assembly), the value has to be converted back and forth between float and int in the loop. According to http://www.agner.org/optimize/instruction_tables.pdf, CVTSI2SS and CVT(T)SS2SI each have 6 cycles latency on Skylake. Furthermore, the conversions are in the dependency-chain, so out-of-order and superscalar execution do not help much in this case.
Changing main()s int temp to float temp removes these conversions.

Why is GCC exchanging rax and xmm0 registers? [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 7 years ago.
Improve this question
I was verifying some assembly generated by gcc version 5.2.1 20151010 (Ubuntu 5.2.1-22ubuntu2) and realized that the following instructions were being generated:
movq %xmm0, %rax
movq %rax, %xmm0
I'd like to know what is the purpose of these instructions considering that it seems irrelevant, is it some kind of optimization? Like when we do:
xor ax, ax
I'd like to let clear that this code appeared just when I used the option -mtune=native and my CPU is a Intel Core I5 4200U.
Following is my source code:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "print.h"
void multiply(const unsigned int* array1, const unsigned int* array2, unsigned int* array3, const unsigned int array_size)
{
unsigned int i = 0;
for (i = 0; i < array_size; i++)
{
array3[i] = array1[i] * array2[i];
}
}
int main()
{
const unsigned int array_size = 1024*1024;
unsigned int* array1 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array2 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array3 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
int i = 0;
srand(time(NULL));
for (i = 0; i < array_size; i++)
{
array1[i] = rand();
array2[i] = rand();
}
clock_t t0 = clock();
multiply(array1,array2,array3, array_size);
multiply(array1,array2,array3, array_size);
clock_t t1 = clock();
printf("\nTempo: %f\n", ((double)(t1 - t0)) / CLOCKS_PER_SEC);
}
This is the assembly generated by GCC using:gcc -S -mtune=native Main.c:
.file "Main.c"
.text
.globl multiply
.type multiply, #function
multiply:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -4(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
imull %ecx, %eax
movl %eax, (%rdx)
addl $1, -4(%rbp)
.L2:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, #function
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
movl $1048576, -60(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
movl $0, -64(%rbp)
movl $0, %edi
call time
movl %eax, %edi
call srand
movl $0, -64(%rbp)
jmp .L5
.L6:
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
addl $1, -64(%rbp)
.L5:
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
call clock
movq %rax, -32(%rbp)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
call clock
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq -32(%rbp), %rax
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",#progbits
And this with gcc -S Main.c:
.file "Main.c"
.text
.globl multiply
.type multiply, #function
multiply:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -4(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
imull %ecx, %eax
movl %eax, (%rdx)
addl $1, -4(%rbp)
.L2:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, #function
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
movl $1048576, -60(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
movl $0, -64(%rbp)
movl $0, %edi
call time
movl %eax, %edi
call srand
movl $0, -64(%rbp)
jmp .L5
.L6:
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
addl $1, -64(%rbp)
.L5:
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
call clock
movq %rax, -32(%rbp)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
call clock
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq -32(%rbp), %rax
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",#progbits
The differences can be found at the end of .L5 label.

Understanding exactly how the increased efficiency is achieved in Assembly language

I have generated two assembly files - one that is optimized, and one that is not. The assembly-language code generated with optimization on should be more efficient than the other assembly-language code. I am more interested in how the efficiency is achieved. To my understanding, in the non-optimized version there will always have to be an offset call to the register %rbp to find the address. In the optimized version, the addresses are being stored in the registers, so you don't have to rely and call on %rbp to find them.
Am I correct? And if so, would there ever be a time when the optimized version will not be advantageous? Thank you for your time.
Here is a function that converts from 42 GIF to CYMK.
void rgb2cmyk(int r, int g, int b, int ret[]) {
int c = 255 - r;
int m = 255 - g;
int y = 255 - b;
int k = (c < m) ? (c < y ? c : y) : (m < y ? m : y);
c -= k; m -= k; y -= k;
ret[0] = c; ret[1] = m; ret[2] = y; ret[3] = k;
}
Here is the assembly-language code that has not been optimized. Note I have made notes using ;; in the code.
No Opt:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## #rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
;;initializing variable c, m, y
movl $255, %eax
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
movl %edx, -12(%rbp)
movq %rcx, -24(%rbp)
movl %eax, %edx
subl -4(%rbp), %edx
movl %edx, -28(%rbp)
movl %eax, %edx
subl -8(%rbp), %edx
movl %edx, -32(%rbp)
subl -12(%rbp), %eax
movl %eax, -36(%rbp)
movl -28(%rbp), %eax
;;compare
cmpl -32(%rbp), %eax
jge LBB0_5
## BB#1:
movl -28(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_3
## BB#2:
movl -28(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
jmp LBB0_4
LBB0_3:
movl -36(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
LBB0_4:
movl -44(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
jmp LBB0_9
LBB0_5:
movl -32(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_7
## BB#6:
movl -32(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
jmp LBB0_8
LBB0_7:
movl -36(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
LBB0_8:
movl -52(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
LBB0_9:
movl -48(%rbp), %eax ## 4-byte Reload
movl %eax, -40(%rbp)
movl -40(%rbp), %eax
movl -28(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -28(%rbp)
movl -40(%rbp), %eax
movl -32(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -32(%rbp)
movl -40(%rbp), %eax
movl -36(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -36(%rbp)
movl -28(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, (%rdx)
movl -32(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 4(%rdx)
movl -36(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 8(%rdx)
movl -40(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 12(%rdx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Optimization:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## #rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
movl $255, %r8d
movl $255, %eax
subl %edi, %eax
movl $255, %edi
subl %esi, %edi
subl %edx, %r8d
cmpl %edi, %eax ##;; compare m and c
jge LBB0_2
## BB#1: ;; c < m
cmpl %r8d, %eax ## compare y and c
movl %r8d, %edx
cmovlel %eax, %edx
jmp LBB0_3
LBB0_2: ##;; c >= m
cmpl %r8d, %edi ## compare y and m
movl %r8d, %edx
cmovlel %edi, %edx
LBB0_3:
subl %edx, %eax
subl %edx, %edi
subl %edx, %r8d
movl %eax, (%rcx)
movl %edi, 4(%rcx)
movl %r8d, 8(%rcx)
movl %edx, 12(%rcx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Yes. The optimized version performs many fewer memory read operations by storing intermediate values in registers and not reloading them over and over.
You are using call wrong. It is a technical term that means to push a return address on the stack and branch to a new location for instructions. The term you mean is simply to use the register.
Can you think of a reason that longer, slower code is "better"?

Resources