Performance difference with casting to double and increment

Performance difference with casting to double and increment - c

I'm incrementing a counter, which I will need to use after the loop in double arithmetic. So, which would you expect to be faster? (Or too close to call?)
Code 1:
double dubs = 3.14159265;
double d;
for(d=0; d<BIGNUM; d++) { /* do stuff not depending on d */ }
dubs /= d;
Code 2:
double dubs = 3.14159265;
int i;
for(i=0; i<BIGNUM; i++) { /* do stuff not depending on i */ }
dubs /= (double) i;
And does it depend on the size of BIGNUM? I know it would be a minuscule difference, but just found myself wondering in theory.
Bonus question: if it were C++, any change in your answer for using static_cast?
--Edit--
Ok, here's a sample code and assembler:
#define BIGNUM 1000000000
#define NUMLOOPS 1000
double test1()
{
double dubs = 3.14159265;
double d;
int k = 1;
for(d=0; d<BIGNUM; d++) { k*= 2; }
dubs /= d;
return dubs;
}
double test2()
{
double dubs = 3.14159265;
int i;
int k = 1;
for(i=0; i<BIGNUM; i++) { k*= 2; }
dubs /= (double)i;
return dubs;
}
int main()
{
double d1=0;
double d2=0;
int i;
for(i=0; i<NUMLOOPS; i++)
{
d1 += test1();
d2 += test2();
}
}
_test1:
LFB2:
pushq %rbp
LCFI0:
movq %rsp, %rbp
LCFI1:
subq $48, %rsp
LCFI2:
call mcount
movabsq $4614256656543962353, %rax
movq %rax, -16(%rbp)
movl $1, -4(%rbp)
movl $0, %eax
movq %rax, -24(%rbp)
jmp L2
L3:
sall -4(%rbp)
movsd -24(%rbp), %xmm0
movsd LC2(%rip), %xmm1
addsd %xmm1, %xmm0
movsd %xmm0, -24(%rbp)
L2:
movsd -24(%rbp), %xmm1
movsd LC3(%rip), %xmm0
ucomisd %xmm1, %xmm0
ja L3
movsd -16(%rbp), %xmm0
divsd -24(%rbp), %xmm0
movsd %xmm0, -16(%rbp)
movq -16(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
leave
ret
_test2:
LFB3:
pushq %rbp
LCFI3:
movq %rsp, %rbp
LCFI4:
subq $32, %rsp
LCFI5:
call mcount
movabsq $4614256656543962353, %rax
movq %rax, -16(%rbp)
movl $1, -8(%rbp)
movl $0, -4(%rbp)
jmp L7
L8:
sall -8(%rbp)
incl -4(%rbp)
L7:
cmpl $99999, -4(%rbp)
jle L8
cvtsi2sd -4(%rbp), %xmm1
movsd -16(%rbp), %xmm0
divsd %xmm1, %xmm0
movsd %xmm0, -16(%rbp)
movq -16(%rbp), %rax
movq %rax, -24(%rbp)
movsd -24(%rbp), %xmm0
leave
ret
Test is currently running....

As a double it probably doesn't matter, but if you'd used float, the first code fragment might not even work. Due to limited precision, after a while, incrementing a float will not change its value. Of course with (signed) integer types, you get UB on overflow, which is arguably worse.
Personally I would recommend always using integer types for a variable that contains something like a count/index that is naturally an integer. Using floating point types for this just feels wrong. But please remove the useless cast in the last line of the second fragment.

Related

Why is divsd not much slower than mulsd in this benchmark?

Someone told me the factor of speed differences is about 40x. Curious, I wrote a benchmark. It required some help from greybeards who were more knowledgeable about what might be optimizing out, but after several revisions we just cannot find a meaningful difference between divsd and mulsd.
Here's results:
➜ linked gcc main.c && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 2080434
mul: 1925889
div / mul: 1.080246
0.000000
And with O3:
➜ linked gcc main.c -O3 && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 1948388
mul: 1804587
div / mul: 1.079686
0.000000
The code:
#include <time.h>
#include <stdio.h>
#define TRIALS 10000000
int main() {
double op = 0.0;
double x = 1.0;
double const y = 1.21462343468798723984729;
time_t div_start = clock();
for (size_t i = 0; i < TRIALS; i++) {
x /= y;
op += x;
x /= y;
op += x;
x /= y;
op += x;
x /= y;
op += x;
x /= y;
op += x;
}
time_t div_end = clock();
printf("%.16f %.16f\n", x, op);
time_t div_seconds = div_end - div_start;
double op2 = 0.0;
x = 1.0;
double const z = 1 / y;
time_t mul_start = clock();
for (size_t i = 0; i < TRIALS; i++) {
x *= z;
op2 += x;
x *= z;
op2 += x;
x *= z;
op2 += x;
x *= z;
op2 += x;
x *= z;
op2 += x;
}
time_t mul_end = clock();
time_t mul_seconds = mul_end - mul_start;
printf("%.16f %.16f\n", x, op2);
// print results as seconds
printf("div: %ld\nmul: %ld\n", div_seconds, mul_seconds);
printf("div / mul: %f\n", (double)div_seconds / (double)mul_seconds);
printf("%f\n", op - op2);
return 0;
}
The assembly with O3:
.file "main.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC3:
.string "%.16f %.16f\n"
.LC5:
.string "div: %ld\nmul: %ld\n"
.LC6:
.string "div / mul: %f\n"
.LC7:
.string "%f\n"
.section .text.startup,"ax",#progbits
.p2align 4
.globl main
.type main, #function
main:
.LFB11:
.cfi_startproc
pushq %r13
.cfi_def_cfa_offset 16
.cfi_offset 13, -16
pushq %r12
.cfi_def_cfa_offset 24
.cfi_offset 12, -24
pushq %rbp
.cfi_def_cfa_offset 32
.cfi_offset 6, -32
pushq %rbx
.cfi_def_cfa_offset 40
.cfi_offset 3, -40
subq $40, %rsp
.cfi_def_cfa_offset 80
call clock#PLT
movq .LC0(%rip), %rcx
pxor %xmm2, %xmm2
movsd .LC2(%rip), %xmm1
movq %rax, %rbp
movl $10000000, %eax
movq %rcx, %xmm0
.p2align 4,,10
.p2align 3
.L2:
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
divsd %xmm1, %xmm0
addsd %xmm0, %xmm2
subq $1, %rax
jne .L2
movsd %xmm2, 8(%rsp)
leaq .LC3(%rip), %r13
movsd %xmm0, 16(%rsp)
call clock#PLT
movsd 8(%rsp), %xmm2
movsd 16(%rsp), %xmm0
movq %r13, %rdi
movq %rax, %rbx
movl $2, %eax
movapd %xmm2, %xmm1
subq %rbp, %rbx
call printf#PLT
call clock#PLT
movq .LC0(%rip), %rdx
movsd 8(%rsp), %xmm2
pxor %xmm1, %xmm1
movsd .LC4(%rip), %xmm3
movq %rax, %r12
movl $10000000, %eax
movq %rdx, %xmm0
.p2align 4,,10
.p2align 3
.L3:
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
mulsd %xmm3, %xmm0
addsd %xmm0, %xmm1
subq $1, %rax
jne .L3
movsd %xmm2, 24(%rsp)
movsd %xmm1, 8(%rsp)
movsd %xmm0, 16(%rsp)
call clock#PLT
movsd 8(%rsp), %xmm1
movsd 16(%rsp), %xmm0
movq %r13, %rdi
subq %r12, %rax
movq %rax, %rbp
movl $2, %eax
call printf#PLT
movq %rbp, %rdx
movq %rbx, %rsi
xorl %eax, %eax
leaq .LC5(%rip), %rdi
call printf#PLT
pxor %xmm0, %xmm0
pxor %xmm3, %xmm3
leaq .LC6(%rip), %rdi
cvtsi2sdq %rbp, %xmm3
movl $1, %eax
cvtsi2sdq %rbx, %xmm0
divsd %xmm3, %xmm0
call printf#PLT
movsd 24(%rsp), %xmm2
movsd 8(%rsp), %xmm1
leaq .LC7(%rip), %rdi
movl $1, %eax
subsd %xmm1, %xmm2
movapd %xmm2, %xmm0
call printf#PLT
addq $40, %rsp
.cfi_def_cfa_offset 40
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 32
popq %rbp
.cfi_def_cfa_offset 24
popq %r12
.cfi_def_cfa_offset 16
popq %r13
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE11:
.size main, .-main
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC0:
.long 0
.long 1072693248
.align 8
.LC2:
.long -74511709
.long 1072918296
.align 8
.LC4:
.long 644960408
.long 1072322682
.ident "GCC: (GNU) 12.2.0"
.section .note.GNU-stack,"",#progbits
It's clear divsd is not being optimized out? What am I doing wrong?

GCC optimizing _mm256_setzero_si256 away?

Consider the following C program.
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
static void do_stuff(void)
{
const int n = 256;
int *ar = malloc(n * sizeof(int));
for (int i = 0; i < n; i++)
ar[i] = random();
}
int main(void)
{
do_stuff();
__m256i sm = _mm256_setzero_si256();
int sum = 0;
int *vcadd = (int*)&sm;
for (size_t l = 0; l < 8; l++)
sum += vcadd[l];
printf("sum = %d\n", sum);
return 0;
}
I expected this program to print sum = 0, but when I compile it with gcc -mavx2 src.c -O2, it sometimes prints sum = 0, sometimes sum = 18.
When compiled with -O1 or -O0, the programs works as expected. It also seems to work fine with -O2 and the do_stuff(); call commented out.
Assembly generated for main with -O1 (+ comments from me of what I think the instructions do):
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
movl $1024, %edi
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.L2:
call random#PLT
movl %eax, (%rbx)
addq $4, %rbx
cmpq %r12, %rbx
jne .L2
vpxor %xmm0, %xmm0, %xmm0 ; zero out %ymm0
vmovdqa %ymm0, (%rsp) ; store these zeros at %rsp
movq %rsp, %rax ; add up the 8 ints stored at %rsp,..., %rsp + 32 (upper bound exclusive)
leaq 32(%rsp), %rcx ; ^
movl $0, %edx ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
leaq .LC0(%rip), %rsi
movl $1, %edi
movl $0, %eax
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L8
movl $0, %eax
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L8:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
and with -O2:
main:
.LFB5513:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $1024, %edi
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %fs:40, %rax
movq %rax, 56(%rsp)
xorl %eax, %eax
call malloc#PLT
movq %rax, %rbx
leaq 1024(%rax), %r12
.p2align 4,,10
.p2align 3
.L2:
call random#PLT
addq $4, %rbx
movl %eax, -4(%rbx)
cmpq %r12, %rbx
jne .L2
movq %rsp, %rax ; just add up %rsp,..., %rsp + 32 without setting that memory to zero
leaq 32(%rsp), %rcx ; ^
xorl %edx, %edx ; ^
.p2align 4,,10 ; ^
.p2align 3 ; ^
.L3: ; ^
addl (%rax), %edx ; ^
addq $4, %rax ; ^
cmpq %rcx, %rax ; ^
jne .L3 ; ^
xorl %eax, %eax
leaq .LC0(%rip), %rsi
movl $1, %edi
call __printf_chk#PLT
movq 56(%rsp), %rax
subq %fs:40, %rax
jne .L9
leaq -16(%rbp), %rsp
xorl %eax, %eax
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L9:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
So my question is: Why can the compiler do this optimization? Shouldn't the output always be sum = 0?
I'm using
gcc (Ubuntu 11.2.0-7ubuntu2) 11.2.0
Solution based on comments
(all below compiled with -O2)
Using memcpy as
__m256i sm = _mm256_setzero_si256();
int ar[8];
memcpy(ar, &sm, 32);
copies the data, although in a somewhat convoluted way (?)
vpxor %xmm0, %xmm0, %xmm0
leaq 48(%rsp), %rax
leaq 80(%rsp), %rcx
xorl %edx, %edx
vmovdqa %ymm0, (%rsp)
vmovdqa 16(%rsp), %xmm2
vmovdqa %xmm0, 48(%rsp)
vmovdqa %xmm2, 64(%rsp)
A union
union conv
{
__m256i val;
int ar[8];
};
union conv c;
c.val = _mm256_setzero_si256();
// access c.ar
works too by producing
vpxor %xmm0, %xmm0, %xmm0
leaq 4(%rsp), %rax
leaq 32(%rsp), %rsi
xorl %ecx, %ecx
vmovdqa %ymm0, (%rsp)
Another option is to compile with -fno-strict-aliasing. In that case, the original code works as I expected.

If you have 8 integers in __m256i variable, and you want horizontal sum, best way is probably intrinsics.
Here’s an example, untested:
// Horizontal sum of all 8 lanes in int32 SIMD vector
inline int hadd_epi32( __m256i vec )
{
// Add 8 lanes into 4
__m128i r = _mm256_extracti128_si256( vec, 1 );
r = _mm_add_epi32( r, _mm256_castsi256_si128( vec ) );
// Add 4 lanes into 2
r = _mm_add_epi32( r, _mm_unpackhi_epi64( r, r ) );
// Extract 2 lowest lanes from the vector into scalar registers, return their sum
const int i1 = _mm_extract_epi32( r, 1 );
const int i0 = _mm_cvtsi128_si32( r );
return i1 + i0;
}

Fast modulo 10 in c

I am looking for a fast modulo 10 algorithm because I need to speed up my program which does many modulo operations in cycles.
I have checked out this page which compares some alternatives.
As far as I understand it correctly, T3 was the fastest of all.
My question is, how would x % y look like using T3 technique?
I copied T3 technique here for simplicity in case the link gets down.
for (int x = 0; x < max; x++)
{
if (y > (threshold - 1))
{
y = 0; //reset
total += x;
}
y += 1;
}
Regarding to comments, if this is not really faster then regular mod, I am looking for at least 2 times faster modulo than using %.
I have seen many examples with use power of two, but since 10 is not, how can I get it to work?
Edit:
For my program, let's say I have 2 for cycles where n=1 000 000 and m=1000.
Looks like this:
for (i = 1; i <= n; i++) {
D[(i%10)*m] = i;
for (j = 1; j <= m; j++) {
...
}
}

Here's the fastest modulo-10 function you can write:
unsigned mod10(unsigned x)
{
return x % 10;
}
And here's what it looks like once compiled:
movsxd rax, edi
imul rcx, rax, 1717986919
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
sub eax, ecx
ret
Note the lack of division/modulus instructions, the mysterious constants, the use of an instruction which was originally intended for complex array indexing, etc. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.

You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]
010341C8 cdq
010341C9 mov ecx,0Ah
010341CE idiv eax,ecx
010341D0 mov dword ptr [foo],edx
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h
00BD1013 imul esi
00BD1015 sar edx,2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh
00BD101D add ecx,edx
00BD101F lea eax,[ecx+ecx*4]
00BD1022 add eax,eax
00BD1024 sub esi,eax

The code isn’t a direct substitute for modulo, it substitutes modulo in that situation. You can write your own mod by analogy (for a, b > 0):
int mod(int a, int b) {
while (a >= b) a -= b;
return a;
}
… but whether that’s faster than % is highly questionable.

This will work for (multiword) values larger than the machineword (but assuming a binary computer ...):
#include <stdio.h>
unsigned long mod10(unsigned long val)
{
unsigned res=0;
res =val &0xf;
while (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += 6 * (val&0xf);
while (res >= 10) { res -= 10; }
}
return res;
}
int main (int argc, char **argv)
{
unsigned long val;
unsigned res;
sscanf(argv[1], "%lu", &val);
res = mod10(val);
printf("%lu -->%u\n", val,res);
return 0;
}
UPDATE:
With some extra effort, you could get the algoritm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
static unsigned long mod10_1(unsigned long val)
{
unsigned char res=0; //just to show that we don't need a big accumulator
res =val &0xf; // res can never be > 15
if (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += (val&0xf)<<2 | (val&0xf) <<1;
res= mod10_1(res); // the recursive call
}
return res;
}
And the result for mod10_1 appears to be mul/div free and almost without branches:
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax
leal -10(%rax), %edx
cmpb $10, %al
cmovnb %edx, %eax
movq %rdi, %rdx
shrq $4, %rdx
testq %rdx, %rdx
je .L12
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:
movl %edx, %ecx
andl $15, %ecx
leal (%rcx,%rcx,2), %ecx
leal (%rax,%rcx,2), %eax
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx
testq %rdx, %rdx
jne .L4
popq %rbx
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:
movzbl %al, %eax
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function

Are this inlining results common?

Due to university work, I have to investigate a simple optimization, the inlining.
Here is the basic code:
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define ITER 1000
#define N 3000000
int i, j;
float x[N], y[N], z[N];
void add(float x, float y, float *z){
*z = x + y;
}
void initialVersion(){
struct timeval inicio, final;
double time;
gettimeofday(&inicio, 0);
for(j = 0; j < ITER; j++){
for(i = 0; i < N; i++){
add(x[i], y[i], &z[i]);
}
}
gettimeofday(&final, 0);
time = (final.tv_sec - inicio.tv_sec + (final.tv_usec - inicio.tv_usec)/1.e6);
printf("Time: %f\n", time);
}
And here is the code with inlining:
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define ITER 1000
#define N 3000000
int i, j;
float x[N], y[N], z[N];
void inliningVersion(){
struct timeval inicio, final;
double time;
gettimeofday(&inicio, 0);
for(j = 0; j < ITER; j++){
for(i = 0; i < N; i++){
z[i] = x[i] + y[i];
}
}
gettimeofday(&final, 0);
time = (final.tv_sec - inicio.tv_sec + (final.tv_usec - inicio.tv_usec)/1.e6);
printf("Time: %f\n", time);
}
Compiling using the option -O0 with gcc, the results are 14.27 seconds for the basic version and 4.45 seconds for the version with the inlining. Is that common? I executed the programm 10 times and the results are always similar. What do you think?
Then, compiling with the option -O1 the results are similar for both versions, 1.5 seconds approximately so I suppose that gcc does the inlining for me with O1.
By the way, I know that gettimeofday counts the overall time and not only the time used by the programm itself, but I am required to use that function specifically.
Thanks in advance!

Let's us analyze the assembly output generated by GCC 7.2 (with O0) for both versions of the code.
Without inlining
First, let's check how much work has to be done by the computer to achieve the task with a separate function:
void add(float x, float y, float *z){
*z = x + y;
}
int main ()
{
float x[100], y[100], z[100];
for(int i = 0; i < 100; i++){
add(x[i], y[i], &z[i]);
}
}
For the above code, GCC produces an assembly as given below:
add(float, float, float*):
pushq %rbp
movq %rsp, %rbp
movss %xmm0, -4(%rbp)
movss %xmm1, -8(%rbp)
movq %rdi, -16(%rbp)
movss -4(%rbp), %xmm0
addss -8(%rbp), %xmm0
movq -16(%rbp), %rax
movss %xmm0, (%rax)
nop
popq %rbp
ret
main:
pushq %rbp
movq %rsp, %rbp
subq $1224, %rsp
movl $0, -4(%rbp)
.L4:
cmpl $99, -4(%rbp)
jg .L3
leaq -1216(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
salq $2, %rdx
addq %rax, %rdx
movl -4(%rbp), %eax
cltq
movss -816(%rbp,%rax,4), %xmm0
movl -4(%rbp), %eax
cltq
movl -416(%rbp,%rax,4), %eax
movq %rdx, %rdi
movaps %xmm0, %xmm1
movl %eax, -1220(%rbp)
movss -1220(%rbp), %xmm0
call add(float, float, float*)
addl $1, -4(%rbp)
jmp .L4
.L3:
movl $0, %eax
leave
ret
The processing part of the code takes approximately 32 instructions (instructions between L4 and L3 and that of add function).
A large majority of the instructions are used for making the function call.
A simplified way to understand how function calls work is:
arguments are pushed on the call stack
return address is pushed on to the call stack
the function is called
make a copy of the frame pointer
make room for locals on the stack
actual function code is executed
restorel the state as it was before the function call
return to the caller
The above steps (except 6th) take additional instructions to do the required processing. This is called the function call overhead.
With inlining
Now let's check how much work the computer has to do if the function was inlined.
int main ()
{
float x[100], y[100], z[100];
for(int i = 0; i < 100; i++){
z[i] = x[i] + y[i];
}
}
For the above code, GCC produces an assembly output as given below:
main:
pushq %rbp
movq %rsp, %rbp
subq $1096, %rsp
movl $0, -4(%rbp)
.L3:
cmpl $99, -4(%rbp)
jg .L2
movl -4(%rbp), %eax
cltq
movss -416(%rbp,%rax,4), %xmm1
movl -4(%rbp), %eax
cltq
movss -816(%rbp,%rax,4), %xmm0
addss %xmm1, %xmm0
movl -4(%rbp), %eax
cltq
movss %xmm0, -1216(%rbp,%rax,4)
addl $1, -4(%rbp)
jmp .L3
.L2:
movl $0, %eax
leave
ret
The processing code (instructions between label L3 and L2) has around 14 instructions. In this assembly output, all the instructions which are responsible for making the function call aren't present which saves considerable amount of CPU cycles.
In general, the overhead of a function call is not relevant when your function's running time is more than several times of the overhead of a function call. In your code, the running time of your function is quite small and hence the function call overhead gains significance.
If you use the O1 flag, the compiler indeed does the inlining for you. You can find out by checking the assembly generated with the O1 or you can directly check the GCC manual for the list of optimizations which are tried with O1.
You can generate assembly output using the -S flag or you can do it online with GodBolt (the assembly outputs were taken from here for this post).

Why is `switch` so slow?

In a bytecode interpreting loop, after several tests, I'm surprised to see that using switch is the worst choice to make. Making calls to a function pointer array, or using gcc's computed goto extension is always 10~20% faster, the computed goto version being the fastest. I've tested with my real toy VM with 97 instructions and with the mini fake VM pasted below.
Why is using switch the slowest?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>
enum {
ADD1 = 1,
ADD2,
SUB3,
SUB4,
MUL5,
MUL6,
};
static unsigned int number;
static void add1(void) {
number += 1;
}
static void add2(void) {
number += 2;
}
static void sub3(void) {
number -= 3;
}
static void sub4(void) {
number -= 4;
}
static void mul5(void) {
number *= 5;
}
static void mul6(void) {
number *= 6;
}
static void interpret_bytecodes_switch(uint8_t *bcs) {
while (true) {
switch (*bcs++) {
case 0:
return;
case ADD1:
add1();
break;
case ADD2:
add2();
break;
case SUB3:
sub3();
break;
case SUB4:
sub4();
break;
case MUL5:
mul5();
break;
case MUL6:
mul6();
break;
}
}
}
static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
void (*fs[])(void) = {
NULL,
add1,
add2,
sub3,
sub4,
mul5,
mul6,
};
while (*bcs) {
fs[*bcs++]();
}
}
static void interpret_bytecodes_goto(uint8_t *bcs) {
void *labels[] = {
&&l_exit,
&&l_add1,
&&l_add2,
&&l_sub3,
&&l_sub4,
&&l_mul5,
&&l_mul6,
};
#define JUMP goto *labels[*bcs++]
JUMP;
l_exit:
return;
l_add1:
add1();
JUMP;
l_add2:
add2();
JUMP;
l_sub3:
sub3();
JUMP;
l_sub4:
sub4();
JUMP;
l_mul5:
mul5();
JUMP;
l_mul6:
mul6();
JUMP;
#undef JUMP
}
struct timer {
clock_t start, end;
};
static void timer_start(struct timer *self) {
self->start = clock();
}
static void timer_end(struct timer *self) {
self->end = clock();
}
static double timer_measure(struct timer *self) {
return (double)(self->end - self->start) / CLOCKS_PER_SEC;
}
static void test(void (*f)(uint8_t *), uint8_t *bcs) {
number = 0;
struct timer timer;
timer_start(&timer);
f(bcs);
timer_end(&timer);
printf("%u %.3fs\n", number, timer_measure(&timer));
}
int main(void) {
const int N = 300000000;
srand((unsigned)time(NULL));
uint8_t *bcs = malloc(N + 1);
for (int i = 0; i < N; ++i) {
bcs[i] = rand() % 5 + 1;
}
bcs[N] = 0;
for (int i = 0; i < 10; ++i) {
printf("%d ", bcs[i]);
}
printf("\nswitch\n");
test(interpret_bytecodes_switch, bcs);
printf("function pointer\n");
test(interpret_bytecodes_function_pointer, bcs);
printf("goto\n");
test(interpret_bytecodes_goto, bcs);
return 0;
}
result
~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s
Below is the relevant disassembly of the code posted here after -O3 optimization.
interpret_bytecodes_switch:
.L8:
addq $1, %rdi
cmpb $6, -1(%rdi)
ja .L8
movzbl -1(%rdi), %edx
jmp *.L11(,%rdx,8)
.L11:
.quad .L10
.quad .L12
.quad .L13
.quad .L14
.quad .L15
.quad .L16
.quad .L17
.L16:
leal (%rax,%rax,4), %eax
jmp .L8
.L15:
subl $4, %eax
jmp .L8
.L14:
subl $3, %eax
jmp .L8
.L13:
addl $2, %eax
jmp .L8
.L12:
addl $1, %eax
jmp .L8
.L10:
movl %eax, number(%rip)
ret
.L17:
leal (%rax,%rax,2), %eax
addl %eax, %eax
jmp .L8
interpret_bytecodes_function_pointer:
pushq %rbx
movq %rdi, %rbx
subq $64, %rsp
movzbl (%rdi), %eax
movq $0, (%rsp)
movq $add1, 8(%rsp)
movq $add2, 16(%rsp)
movq $sub3, 24(%rsp)
movq $sub4, 32(%rsp)
movq $mul5, 40(%rsp)
testb %al, %al
movq $mul6, 48(%rsp)
je .L19
.L23:
addq $1, %rbx
call *(%rsp,%rax,8)
movzbl (%rbx), %eax
testb %al, %al
jne .L23
.L19:
addq $64, %rsp
popq %rbx
ret
interpret_bytecodes_goto:
movzbl (%rdi), %eax
movq $.L27, -72(%rsp)
addq $2, %rdi
movq $.L28, -64(%rsp)
movq $.L29, -56(%rsp)
movq $.L30, -48(%rsp)
movq $.L31, -40(%rsp)
movq $.L32, -32(%rsp)
movq $.L33, -24(%rsp)
movq -72(%rsp,%rax,8), %rax
jmp *%rax
.L33:
movl number(%rip), %eax
leal (%rax,%rax,2), %eax
addl %eax, %eax
movl %eax, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
.L35:
addq $1, %rdi
jmp *%rax
.L28:
addl $1, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L30:
subl $3, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L31:
subl $4, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L32:
movl number(%rip), %eax
leal (%rax,%rax,4), %eax
movl %eax, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L29:
addl $2, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L27:
rep ret

switch is slowest because it has to manage default cases and this may add an extra bounds test you didn't implemented.
switch also handles a more general case where case values are not arranged in a so simple sequence, extra computation may be needed for that.

I was in the middle of writing a long answer when you posted the assembly code...
Basically, the goto version uses more "code" to prevent a few (or a single) instructions in each iteration. It's similar to a size vs. speed optimization.
Since your "real work" is negligible, it makes enough of a difference in the benchmark, but in a real world scenario that instruction will become negligible.

You are performing a micro benchmark. Micro benchmarks on modern CPUs can be affected by all kinds of random or unpredicatable effects. There is actually very little difference in execution time. However, in order to make the code comparable, you combined switch and function calls, which in real life wouldn't happen for time critical code.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Performance difference with casting to double and increment - c

Related

Why is divsd not much slower than mulsd in this benchmark?

GCC optimizing _mm256_setzero_si256 away?

Fast modulo 10 in c

Are this inlining results common?

Why is `switch` so slow?

Categories

Resources