GCC optimizing _mm256_setzero_si256 away? - c

Consider the following C program.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
static void do_stuff(void)
{
const int n = 256;
int *ar = malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
ar[i] = random();
}
int main(void)
{
do_stuff();
__m256i sm = _mm256_setzero_si256();
int sum = 0;
int *vcadd = (int*)&sm;
for (size_t l = 0; l < 8; l++)
sum += vcadd[l];
printf("sum = %d\n", sum);
return 0;
}
I expected this program to print sum = 0, but when I compile it with gcc -mavx2 src.c -O2, it sometimes prints sum = 0, sometimes sum = 18.
When compiled with -O1 or -O0, the programs works as expected. It also seems to work fine with -O2 and the do_stuff(); call commented out.
Assembly generated for main with -O1 (+ comments from me of what I think the instructions do):
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
movl $1024, %edi
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.L2:
call random#PLT
movl %eax, (%rbx)
addq $4, %rbx
cmpq %r12, %rbx
jne .L2
vpxor %xmm0, %xmm0, %xmm0 ; zero out %ymm0
vmovdqa %ymm0, (%rsp) ; store these zeros at %rsp
movq %rsp, %rax ; add up the 8 ints stored at %rsp,..., %rsp + 32 (upper bound exclusive)
leaq 32(%rsp), %rcx ; ^
movl $0, %edx ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
leaq .LC0(%rip), %rsi
movl $1, %edi
movl $0, %eax
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L8
movl $0, %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L8:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
and with -O2:
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $1024, %edi
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.p2align 4,,10
.p2align 3
.L2:
call random#PLT
addq $4, %rbx
movl %eax, -4(%rbx)
cmpq %r12, %rbx
jne .L2
movq %rsp, %rax ; just add up %rsp,..., %rsp + 32 without setting that memory to zero
leaq 32(%rsp), %rcx ; ^
xorl %edx, %edx ; ^
.p2align 4,,10 ; ^
.p2align 3 ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
xorl %eax, %eax
leaq .LC0(%rip), %rsi
movl $1, %edi
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L9
leaq -16(%rbp), %rsp
xorl %eax, %eax
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L9:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
So my question is: Why can the compiler do this optimization? Shouldn't the output always be sum = 0?
I'm using
gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0
Solution based on comments
(all below compiled with -O2)
Using memcpy as
__m256i sm = _mm256_setzero_si256();
int ar[8];
memcpy(ar, &sm, 32);
copies the data, although in a somewhat convoluted way (?)
vpxor %xmm0, %xmm0, %xmm0
leaq 48(%rsp), %rax
leaq 80(%rsp), %rcx
xorl %edx, %edx
vmovdqa %ymm0, (%rsp)
vmovdqa 16(%rsp), %xmm2
vmovdqa %xmm0, 48(%rsp)
vmovdqa %xmm2, 64(%rsp)
A union
union conv
{
__m256i val;
int ar[8];
};
union conv c;
c.val = _mm256_setzero_si256();
// access c.ar
works too by producing
vpxor %xmm0, %xmm0, %xmm0
leaq 4(%rsp), %rax
leaq 32(%rsp), %rsi
xorl %ecx, %ecx
vmovdqa %ymm0, (%rsp)
Another option is to compile with -fno-strict-aliasing. In that case, the original code works as I expected.

If you have 8 integers in __m256i variable, and you want horizontal sum, best way is probably intrinsics.
Here’s an example, untested:
// Horizontal sum of all 8 lanes in int32 SIMD vector
inline int hadd_epi32( __m256i vec )
{
// Add 8 lanes into 4
__m128i r = _mm256_extracti128_si256( vec, 1 );
r = _mm_add_epi32( r, _mm256_castsi256_si128( vec ) );
// Add 4 lanes into 2
r = _mm_add_epi32( r, _mm_unpackhi_epi64( r, r ) );
// Extract 2 lowest lanes from the vector into scalar registers, return their sum
const int i1 = _mm_extract_epi32( r, 1 );
const int i0 = _mm_cvtsi128_si32( r );
return i1 + i0;
}

Related

Why accessing 2d array takes less than 1d array?

I am working with the matrix-matrix multiplication and I noticed that if I use 2D arrays, A[M][N], insted of 1D arrays to store a matrix, the access to that takes less time against the locality principle. Here there is the code where the time using linearised matrix is greater than the second one. Why ?
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
void matrixMul(double* m1, double* m2, double* m3, int N) {
int i, j, k;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
m3[i * N + j] += m1[i*N+k] * m2[k*N+j];
}
}
}
}
void matrixlin(double** m1, double** m2, double** m3, int N) {
int i, j, k;
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
m3[i][j] += m1[i][k] * m2[k][j];
}
}
}
}
int main(int argc, char* argv[]) {
int N = 1024;
int i, j;
double *m1 = (double *)malloc(N*N*sizeof(double));
double *m2 = (double *)malloc(N*N*sizeof(double));
double *m3 = (double *)malloc(N*N*sizeof(double));
memset(m3, 0, N * N * sizeof(double));
double **mm1 = (double **)malloc(N*sizeof(double*));
double **mm2 = (double **)malloc(N*sizeof(double*));
double **mm3 = (double **)malloc(N*sizeof(double*));
for(i=0; i<N; i++){
mm1[i]=(double *)malloc(N*sizeof(double));
mm2[i]=(double *)malloc(N*sizeof(double));
mm3[i]=(double *)malloc(N*sizeof(double));
memset(mm3[i], 0, N * sizeof(double));
}
for(i=0; i<N; i++){
for(j=0; j<N; j++){
m1[i * N + j] = 1.1; m2[i * N + j]=2.1;
mm1[i][j] = 1.1; mm2[i][j] = 2.1;
}
}
clock_t t1 = clock();
matrixMul(m1, m2, m3, N);
t1 = clock() - t1;
printf("Elapsed time linearized: %.5f seconds\n", ((double)t1)/CLOCKS_PER_SEC);
clock_t t2 = clock();
matrixlin(mm1, mm2, mm3, N);
t2 = clock() - t2;
printf("Elapsed time 2D array: %.5f seconds\n", ((double)t2)/CLOCKS_PER_SEC);
free(m1);
free(m2);
free(m3);
for(i=0; i<N; i++){
free(mm1[i]);
free(mm2[i]);
free(mm3[i]);
}
free(mm1);
free(mm2);
free(mm3);
return 0;
}
Output:
$ gcc test.c
$ ./a.out
Elapsed time linearized: 22.40697 seconds
Elapsed time 2D array: 7.61103 seconds
Assembly code (gcc -S):
.file "test.c"
.text
.globl matrixMul
.type matrixMul, #function
matrixMul:
.LFB6:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -12(%rbp)
jmp .L2
.L7:
movl $0, -8(%rbp)
jmp .L3
.L6:
movl $0, -4(%rbp)
jmp .L4
.L5:
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm1
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -4(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -24(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm2
movl -4(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -32(%rbp), %rax
addq %rdx, %rax
movsd (%rax), %xmm0
mulsd %xmm2, %xmm0
movl -12(%rbp), %eax
imull -44(%rbp), %eax
movl %eax, %edx
movl -8(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addl $1, -4(%rbp)
.L4:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L5
addl $1, -8(%rbp)
.L3:
movl -8(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L6
addl $1, -12(%rbp)
.L2:
movl -12(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L7
nop
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size matrixMul, .-matrixMul
.globl matrixlin
.type matrixlin, #function
matrixlin:
.LFB7:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -12(%rbp)
jmp .L9
.L14:
movl $0, -8(%rbp)
jmp .L10
.L13:
movl $0, -4(%rbp)
jmp .L11
.L12:
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm1
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -24(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -4(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm2
movl -4(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -32(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd (%rax), %xmm0
mulsd %xmm2, %xmm0
movl -12(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -8(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addl $1, -4(%rbp)
.L11:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L12
addl $1, -8(%rbp)
.L10:
movl -8(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L13
addl $1, -12(%rbp)
.L9:
movl -12(%rbp), %eax
cmpl -44(%rbp), %eax
jl .L14
nop
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE7:
.size matrixlin, .-matrixlin
.section .rodata
.align 8
.LC3:
.string "Elapsed time linearized: %.5f seconds\n"
.align 8
.LC4:
.string "Elapsed time 2D array: %.5f seconds\n"
.text
.globl main
.type main, #function
main:
.LFB8:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $104, %rsp
.cfi_offset 3, -24
movl %edi, -100(%rbp)
movq %rsi, -112(%rbp)
movl $1024, -84(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -80(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -72(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -64(%rbp)
movl -84(%rbp), %eax
imull %eax, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -64(%rbp), %rax
movl $0, %esi
movq %rax, %rdi
call memset#PLT
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -56(%rbp)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -48(%rbp)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movq %rax, %rdi
call malloc#PLT
movq %rax, -40(%rbp)
movl $0, -92(%rbp)
jmp .L16
.L17:
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -56(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc#PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -48(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc#PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
salq $3, %rax
movl -92(%rbp), %edx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rcx
movq -40(%rbp), %rdx
leaq (%rcx,%rdx), %rbx
movq %rax, %rdi
call malloc#PLT
movq %rax, (%rbx)
movl -84(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rcx
movq -40(%rbp), %rax
addq %rcx, %rax
movq (%rax), %rax
movl $0, %esi
movq %rax, %rdi
call memset#PLT
addl $1, -92(%rbp)
.L16:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L17
movl $0, -92(%rbp)
jmp .L18
.L21:
movl $0, -88(%rbp)
jmp .L19
.L20:
movl -92(%rbp), %eax
imull -84(%rbp), %eax
movl %eax, %edx
movl -88(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
movsd .LC0(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
imull -84(%rbp), %eax
movl %eax, %edx
movl -88(%rbp), %eax
addl %edx, %eax
cltq
leaq 0(,%rax,8), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
movsd .LC1(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -56(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -88(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd .LC0(%rip), %xmm0
movsd %xmm0, (%rax)
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rdx
movl -88(%rbp), %eax
cltq
salq $3, %rax
addq %rdx, %rax
movsd .LC1(%rip), %xmm0
movsd %xmm0, (%rax)
addl $1, -88(%rbp)
.L19:
movl -88(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L20
addl $1, -92(%rbp)
.L18:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L21
call clock#PLT
movq %rax, -32(%rbp)
movl -84(%rbp), %ecx
movq -64(%rbp), %rdx
movq -72(%rbp), %rsi
movq -80(%rbp), %rax
movq %rax, %rdi
call matrixMul
call clock#PLT
subq -32(%rbp), %rax
movq %rax, -32(%rbp)
pxor %xmm0, %xmm0
cvtsi2sdq -32(%rbp), %xmm0
movsd .LC2(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
leaq .LC3(%rip), %rdi
movl $1, %eax
call printf#PLT
call clock#PLT
movq %rax, -24(%rbp)
movl -84(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call matrixlin
call clock#PLT
subq -24(%rbp), %rax
movq %rax, -24(%rbp)
pxor %xmm0, %xmm0
cvtsi2sdq -24(%rbp), %xmm0
movsd .LC2(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
leaq .LC4(%rip), %rdi
movl $1, %eax
call printf#PLT
movq -80(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -72(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -64(%rbp), %rax
movq %rax, %rdi
call free#PLT
movl $0, -92(%rbp)
jmp .L22
.L23:
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -56(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free#PLT
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free#PLT
movl -92(%rbp), %eax
cltq
leaq 0(,%rax,8), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
movq (%rax), %rax
movq %rax, %rdi
call free#PLT
addl $1, -92(%rbp)
.L22:
movl -92(%rbp), %eax
cmpl -84(%rbp), %eax
jl .L23
movq -56(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -48(%rbp), %rax
movq %rax, %rdi
call free#PLT
movq -40(%rbp), %rax
movq %rax, %rdi
call free#PLT
movl $0, %eax
movq -8(%rbp), %rbx
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE8:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long -1717986918
.long 1072798105
.align 8
.LC1:
.long -858993459
.long 1073794252
.align 8
.LC2:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 10.3.0-1ubuntu1) 10.3.0"
.section .note.GNU-stack,"",#progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4:

How can gcc -O3 option make the run so fast?

[Question]
I run below code with O3 option. And then, I found that the perforamce of the code with O3, is nine times higher than performance of the code without O3.
Edit :
I want to know the key of optimization technique, not reason. This is my question. I have never experienced x86 assembly. So it is too hard to understand x86 assembly code. That is the reason I posted this question. Or, could you explain the code with O3 option for me?
................................................................................
[C code]
The code just executes addition.
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
return 0;
}
[Assembly with O3]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .text.unlikely
.LCOLDB2:
.section .text.startup,"ax",#progbits
.LHOTB2:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
xorl %eax, %eax
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .text.unlikely
.LCOLDE2:
.section .text.startup
.LHOTE2:
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
[Assembly without O3]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2096, %rsp
movl %edi, -2084(%rbp)
movq %rsi, -2096(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L6
.L7:
movl -2068(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2068(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2068(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2072(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L6:
cmpl $127, -2072(%rbp)
jle .L7
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L9
call __stack_chk_fail
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
You code has no observable side-effects so the optimizer is simply discarding most of your code.
Using -O3 turns your main function into:
main:
xorl %eax, %eax
ret
Which is equivalent to:
int main()
{
return 0;
}
This shows that micro-benchmarking code can be difficult to do correctly.
Edit:
As pointed out in a comment below, the posted code doesn't initialize ibuffer[INPUT_FEATURE]. Reading an uninitialized variable is undefined behavior which makes the whole program malformed. This is a real problem and the code isn't required to produce reasonable results. Thanks #chqrlie
I modified the code and experimented, reflecting your reply, with it as follows. result is the same as before. O3 option is better than no option.
#define OFFSET (8)
#define INPUT_FEATURE (1024)
#define TSIZE (INPUT_FEATURE/OFFSET)
#include<stdio.h>
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = 0.0;
}
// feature summation and scale
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
printf("%lf",H[k]);
}
return 0;
}
[code with O3 option]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB23:
.cfi_startproc
pxor %xmm0, %xmm0
movl %edi, %edi
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE23:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .rodata.str1.1,"aMS",#progbits,1
.LC5:
.string "%lf"
.section .text.unlikely
.LCOLDB6:
.section .text.startup,"ax",#progbits
.LHOTB6:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $128, %ecx
pxor %xmm12, %xmm12
[code no option]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .rodata
.LC2:
.string "%lf"
.text
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2128, %rsp
movl %edi, -2100(%rbp)
movq %rsi, -2112(%rbp)
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
movl $0, -2088(%rbp)
movl $0, -2084(%rbp)
jmp .L6
.L7:
movl -2088(%rbp), %eax
cltq
pxor %xmm0, %xmm0
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2084(%rbp)
addl $1, -2088(%rbp)
.L6:
cmpl $127, -2088(%rbp)
jle .L7
movl $0, -2080(%rbp)
movl $0, -2076(%rbp)
jmp .L8
.L9:
movl -2076(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
movl -2076(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
movl -2076(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
movl %eax, %edi
call minmax_scale
cvtss2sd %xmm0, %xmm0
movl -2080(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
addl $8, -2076(%rbp)
addl $1, -2080(%rbp)
.L8:
cmpl $127, -2080(%rbp)
jle .L9
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L10
.L11:
movl -2072(%rbp), %eax
cltq
movq -2064(%rbp,%rax,8), %rax
movq %rax, -2120(%rbp)
movsd -2120(%rbp), %xmm0
movl $.LC2, %edi
movl $1, %eax
call printf
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L10:
cmpl $127, -2072(%rbp)
jle .L11
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L13
call __stack_chk_fail
.L13:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits

Fast modulo 10 in c

I am looking for a fast modulo 10 algorithm because I need to speed up my program which does many modulo operations in cycles.
I have checked out this page which compares some alternatives.
As far as I understand it correctly, T3 was the fastest of all.
My question is, how would x % y look like using T3 technique?
I copied T3 technique here for simplicity in case the link gets down.
for (int x = 0; x < max; x++)
{
if (y > (threshold - 1))
{
y = 0; //reset
total += x;
}
y += 1;
}
Regarding to comments, if this is not really faster then regular mod, I am looking for at least 2 times faster modulo than using %.
I have seen many examples with use power of two, but since 10 is not, how can I get it to work?
Edit:
For my program, let's say I have 2 for cycles where n=1 000 000 and m=1000.
Looks like this:
for (i = 1; i <= n; i++) {
D[(i%10)*m] = i;
for (j = 1; j <= m; j++) {
...
}
}
Here's the fastest modulo-10 function you can write:
unsigned mod10(unsigned x)
{
return x % 10;
}
And here's what it looks like once compiled:
movsxd rax, edi
imul rcx, rax, 1717986919
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
sub eax, ecx
ret
Note the lack of division/modulus instructions, the mysterious constants, the use of an instruction which was originally intended for complex array indexing, etc. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.
You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]
010341C8 cdq
010341C9 mov ecx,0Ah
010341CE idiv eax,ecx
010341D0 mov dword ptr [foo],edx
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h
00BD1013 imul esi
00BD1015 sar edx,2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh
00BD101D add ecx,edx
00BD101F lea eax,[ecx+ecx*4]
00BD1022 add eax,eax
00BD1024 sub esi,eax
The code isn’t a direct substitute for modulo, it substitutes modulo in that situation. You can write your own mod by analogy (for a, b > 0):
int mod(int a, int b) {
while (a >= b) a -= b;
return a;
}
… but whether that’s faster than % is highly questionable.
This will work for (multiword) values larger than the machineword (but assuming a binary computer ...):
#include <stdio.h>
unsigned long mod10(unsigned long val)
{
unsigned res=0;
res =val &0xf;
while (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += 6 * (val&0xf);
while (res >= 10) { res -= 10; }
}
return res;
}
int main (int argc, char **argv)
{
unsigned long val;
unsigned res;
sscanf(argv[1], "%lu", &val);
res = mod10(val);
printf("%lu -->%u\n", val,res);
return 0;
}
UPDATE:
With some extra effort, you could get the algoritm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
static unsigned long mod10_1(unsigned long val)
{
unsigned char res=0; //just to show that we don't need a big accumulator
res =val &0xf; // res can never be > 15
if (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += (val&0xf)<<2 | (val&0xf) <<1;
res= mod10_1(res); // the recursive call
}
return res;
}
And the result for mod10_1 appears to be mul/div free and almost without branches:
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax
leal -10(%rax), %edx
cmpb $10, %al
cmovnb %edx, %eax
movq %rdi, %rdx
shrq $4, %rdx
testq %rdx, %rdx
je .L12
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:
movl %edx, %ecx
andl $15, %ecx
leal (%rcx,%rcx,2), %ecx
leal (%rax,%rcx,2), %eax
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx
testq %rdx, %rdx
jne .L4
popq %rbx
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:
movzbl %al, %eax
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function

Why is GCC exchanging rax and xmm0 registers? [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 7 years ago.
Improve this question
I was verifying some assembly generated by gcc version 5.2.1 20151010 (Ubuntu 5.2.1-22ubuntu2) and realized that the following instructions were being generated:
movq %xmm0, %rax
movq %rax, %xmm0
I'd like to know what is the purpose of these instructions considering that it seems irrelevant, is it some kind of optimization? Like when we do:
xor ax, ax
I'd like to let clear that this code appeared just when I used the option -mtune=native and my CPU is a Intel Core I5 4200U.
Following is my source code:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include "print.h"
void multiply(const unsigned int* array1, const unsigned int* array2, unsigned int* array3, const unsigned int array_size)
{
unsigned int i = 0;
for (i = 0; i < array_size; i++)
{
array3[i] = array1[i] * array2[i];
}
}
int main()
{
const unsigned int array_size = 1024*1024;
unsigned int* array1 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array2 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
unsigned int* array3 = (unsigned int*)malloc(sizeof(unsigned int) * array_size);
int i = 0;
srand(time(NULL));
for (i = 0; i < array_size; i++)
{
array1[i] = rand();
array2[i] = rand();
}
clock_t t0 = clock();
multiply(array1,array2,array3, array_size);
multiply(array1,array2,array3, array_size);
clock_t t1 = clock();
printf("\nTempo: %f\n", ((double)(t1 - t0)) / CLOCKS_PER_SEC);
}
This is the assembly generated by GCC using:gcc -S -mtune=native Main.c:
.file "Main.c"
.text
.globl multiply
.type multiply, #function
multiply:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -4(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
imull %ecx, %eax
movl %eax, (%rdx)
addl $1, -4(%rbp)
.L2:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, #function
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
movl $1048576, -60(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
movl $0, -64(%rbp)
movl $0, %edi
call time
movl %eax, %edi
call srand
movl $0, -64(%rbp)
jmp .L5
.L6:
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
addl $1, -64(%rbp)
.L5:
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
call clock
movq %rax, -32(%rbp)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
call clock
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq -32(%rbp), %rax
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
movq %xmm0, %rax
movq %rax, %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",#progbits
And this with gcc -S Main.c:
.file "Main.c"
.text
.globl multiply
.type multiply, #function
multiply:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq %rdx, -40(%rbp)
movl %ecx, -44(%rbp)
movl $0, -4(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rax, %rdx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rcx
movq -24(%rbp), %rax
addq %rcx, %rax
movl (%rax), %ecx
movl -4(%rbp), %eax
leaq 0(,%rax,4), %rsi
movq -32(%rbp), %rax
addq %rsi, %rax
movl (%rax), %eax
imull %ecx, %eax
movl %eax, (%rdx)
addl $1, -4(%rbp)
.L2:
movl -4(%rbp), %eax
cmpl -44(%rbp), %eax
jb .L3
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2:
.size multiply, .-multiply
.section .rodata
.LC1:
.string "\nTempo: %f\n"
.text
.globl main
.type main, #function
main:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx
subq $56, %rsp
.cfi_offset 3, -24
movl $1048576, -60(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -56(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -48(%rbp)
movl -60(%rbp), %eax
salq $2, %rax
movq %rax, %rdi
call malloc
movq %rax, -40(%rbp)
movl $0, -64(%rbp)
movl $0, %edi
call time
movl %eax, %edi
call srand
movl $0, -64(%rbp)
jmp .L5
.L6:
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -56(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
movl -64(%rbp), %eax
cltq
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
leaq (%rdx,%rax), %rbx
call rand
movl %eax, (%rbx)
addl $1, -64(%rbp)
.L5:
movl -64(%rbp), %eax
cmpl -60(%rbp), %eax
jb .L6
call clock
movq %rax, -32(%rbp)
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
movl -60(%rbp), %ecx
movq -40(%rbp), %rdx
movq -48(%rbp), %rsi
movq -56(%rbp), %rax
movq %rax, %rdi
call multiply
call clock
movq %rax, -24(%rbp)
movq -24(%rbp), %rax
subq -32(%rbp), %rax
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
movl $.LC1, %edi
movl $1, %eax
call printf
movl $0, %eax
addq $56, %rsp
popq %rbx
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size main, .-main
.section .rodata
.align 8
.LC0:
.long 0
.long 1093567616
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",#progbits
The differences can be found at the end of .L5 label.

Understanding simple assember program [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 8 years ago.
Improve this question
I have practiced Assembler long time ago and I would like to understand a simple program (I generate assembler code from a C code) which adds 2 vectors (actually 2 arrays) and store the result in another vector (an output array). My goal is after to study vectorization. For this, I use gcc-4.9 under Debian Wheezy on i7-core processor.
Here the C code snippet (not vectorized version) :
#include <stdio.h>
#define SIZE 10000
void test(double *a, double *b, double *c)
{
int i;
for (i = 0; i < SIZE; i++)
{
c[i] = a[i] + b[i];
}
}
int main()
{
int i;
double tab1[SIZE];
double tab2[SIZE];
double tab3[SIZE];
for (i = 0; i < SIZE; i++)
{
tab1[i] = i;
tab2[i] = i;
tab3[i] = 0;
}
test(tab1, tab2, tab3);
for (i = 0; i < SIZE; i++)
printf(" tab3[%d] = %f\n", i, tab3[i]);
return 0;
}
I generate Assembler code with AT&T syntax :
gcc -std=c99 -c main_no_vectorized.c -O3 -S -o main_no_vectorized.s
Here is the assembly code :
.file "main_no_vectorized.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB0:
.text
.LHOTB0:
.p2align 4,,15
.globl test
.type test, #function
test:
.LFB3:
.cfi_startproc
leaq 16(%rdx), %rax
leaq 16(%rsi), %rcx
cmpq %rax, %rsi
setae %r8b
cmpq %rcx, %rdx
setae %cl
orb %cl, %r8b
je .L7
cmpq %rax, %rdi
leaq 16(%rdi), %rax
setae %cl
cmpq %rax, %rdx
setae %al
orb %al, %cl
je .L7
testb $8, %dil
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
je .L8
movsd (%rdi), %xmm0
movl $9998, %ebp
movl $4999, %r9d
movl $9999, %r12d
movl $1, %r8d
movl $1, %ebx
addsd (%rsi), %xmm0
movsd %xmm0, (%rdx)
.L3:
salq $3, %r8
xorl %eax, %eax
xorl %ecx, %ecx
leaq (%rdi,%r8), %r11
leaq (%rsi,%r8), %r10
addq %rdx, %r8
.p2align 4,,10
.p2align 3
.L4:
movupd (%r10,%rax), %xmm0
addl $1, %ecx
addpd (%r11,%rax), %xmm0
movups %xmm0, (%r8,%rax)
addq $16, %rax
cmpl %r9d, %ecx
jb .L4
cmpl %ebp, %r12d
leal (%rbx,%rbp), %eax
je .L1
cltq
movsd (%rdi,%rax,8), %xmm0
addsd (%rsi,%rax,8), %xmm0
movsd %xmm0, (%rdx,%rax,8)
.L1:
popq %rbx
.cfi_remember_state
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L8:
.cfi_restore_state
movl $10000, %ebp
movl $5000, %r9d
movl $10000, %r12d
xorl %r8d, %r8d
xorl %ebx, %ebx
jmp .L3
.L7:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L2:
movsd (%rdi,%rax), %xmm0
addsd (%rsi,%rax), %xmm0
movsd %xmm0, (%rdx,%rax)
addq $8, %rax
cmpq $80000, %rax
jne .L2
rep ret
.cfi_endproc
.LFE3:
.size test, .-test
.section .text.unlikely
.LCOLDE0:
.text
.LHOTE0:
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string " tab3[%d] = %f\n"
.section .text.unlikely
.LCOLDB4:
.section .text.startup,"ax",#progbits
.LHOTB4:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB4:
.cfi_startproc
pushq %rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xorl %eax, %eax
subq $240016, %rsp
.cfi_def_cfa_offset 240032
movdqa .LC2(%rip), %xmm3
leaq 32(%rsp), %rcx
leaq 80032(%rsp), %rdx
movdqa .LC1(%rip), %xmm1
.p2align 4,,10
.p2align 3
.L21:
pshufd $238, %xmm1, %xmm0
cvtdq2pd %xmm1, %xmm2
paddd %xmm3, %xmm1
movaps %xmm2, 16(%rsp,%rax)
cvtdq2pd %xmm0, %xmm0
movaps %xmm2, 80016(%rsp,%rax)
movaps %xmm0, (%rcx,%rax)
movaps %xmm0, (%rdx,%rax)
addq $32, %rax
cmpq $80000, %rax
jne .L21
leaq 160016(%rsp), %rdi
movl $80000, %edx
xorl %esi, %esi
call memset
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L22:
movapd 16(%rsp,%rax), %xmm0
addpd 80016(%rsp,%rax), %xmm0
movaps %xmm0, 160016(%rsp,%rax)
addq $16, %rax
cmpq $80000, %rax
jne .L22
xorl %ebx, %ebx
.p2align 4,,10
.p2align 3
.L23:
movsd 160016(%rsp,%rbx,8), %xmm4
movl %ebx, %esi
movl $.LC3, %edi
movl $1, %eax
addq $1, %rbx
movapd %xmm4, %xmm0
movsd %xmm4, 8(%rsp)
call printf
cmpq $10000, %rbx
jne .L23
addq $240016, %rsp
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE4:
.size main, .-main
.section .text.unlikely
.LCOLDE4:
.section .text.startup
.LHOTE4:
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LC1:
.long 0
.long 1
.long 2
.long 3
.align 16
.LC2:
.long 4
.long 4
.long 4
.long 4
.ident "GCC: (Debian 4.9.1-16) 4.9.1"
.section .note.GNU-stack,"",#progbits
Could you explain to me the main steps of this above assembly code in relation with the C code, in particulary the "test" function, the loop of initialization in main function and the parameters passing (i.e where's the push and pop instructions for the stack) and the effective addition of "a" and "b" arrays ?
What corresponds to .L2, .L3, ... segments ? is there a relation with L2 cache, L3 cache ?
Sorry for these basics questions but I begin with Intel x86_64 assembler.
Thanks for your precious help
The generated assembly code is quite complicated. It first checks to see if the arrays a, b, and c overlap in a way that will cause an optimized loop to fail. For example, if you did this:
test(tab1, tab2, &tab1[1]);
then the overlap would be detected and cause the code to jump to L7 (the straightforward implementation). By the way, L stands for Label, and the label numbers are just generated by the compiler with no particular meaning. So L1, L2, L3, etc are just labels that are used for the code to branch to various places. The overlap checks start at .LFB3 and end at the last je .L7.
If no overlap is detected, then an optimized loop will be used. This optimized loop will try to add two doubles at a time instead of just one. The first thing the optimized loop does is to find out if array a is aligned to a 16 byte boundary (the testb $8, %dil instruction). If it is, it will jump to L8 to load a set of constants (e.g. r9 = 5000). If the array is not aligned, if will fall through and load a different set of constants (e.g. r9 = 4999), and also handle the first element. This is because the unaligned case will need to do 4999 iterations two at a time and handle the first and last unaligned elements separately outside the loop. The aligned case will just do 5000 iterations.
Either way, the code reaches L3 next. The code at L3 and L4 is the optimized loop that does the adds two at a time using the addpd instruction (the nonoptimized loop at L7 used addsd to do one add at a time). After the L4 loop finishes, it checks to see if it needs to handle the last element (for the unaligned case). Then it returns with the ret instruction.
By the way, it helps to know that when test is called, a is in rdi, b is in rsi, and c is in rdx. That is the calling convention for 64-bit. Therefore, there are no arguments pushed on the stack. If you don't understand x86 assembly too well, concentrate on the code starting at L7. That is the non-optimized version and you should be able to figure that part out given that I said your three arguments were in rdi, rsi, and rdx.
The .L2 and such are labels, they are used to refer to the next instruction. They are pretty much exactly like labels in C, if you've used goto. The primary use of a label is with a jump or branch, to specify where the jump goes to.
For example, the .L2 label is start of the body of your for (i = 0; i < SIZE; i++) loop in test(), it is counting by 8 bytes (the size of a double) up to 8*10000. The last instruction in the loop is jne .L2, which jumps to .L2 if the previous comparison was not equal.
You may find this reference (PDF) on x64 helpful.

Resources