Due to university work, I have to investigate a simple optimization, the inlining.
Here is the basic code:
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define ITER 1000
#define N 3000000
int i, j;
float x[N], y[N], z[N];
/* Store the sum of the two operands in the float that z points to. */
void add(float x, float y, float *z){
    const float sum = x + y;
    *z = sum;
}
/*
 * Time the out-of-line variant: ITER passes over the N-element arrays,
 * calling add() once per element, then print the elapsed wall-clock time
 * in seconds as measured with gettimeofday().
 * The loop counters are the file-scope globals i and j.
 */
void initialVersion(){
    struct timeval start, stop;
    double elapsed;

    gettimeofday(&start, 0);
    for(j = 0; j < ITER; j++){
        for(i = 0; i < N; i++){
            add(x[i], y[i], &z[i]);
        }
    }
    gettimeofday(&stop, 0);

    /* whole seconds plus the microsecond remainder scaled to seconds */
    elapsed = stop.tv_sec - start.tv_sec;
    elapsed += (stop.tv_usec - start.tv_usec)/1.e6;
    printf("Time: %f\n", elapsed);
}
And here is the code with inlining:
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define ITER 1000
#define N 3000000
int i, j;
float x[N], y[N], z[N];
/*
 * Time the manually inlined variant: ITER passes over the N-element
 * arrays with the addition written directly in the loop body, then print
 * the elapsed wall-clock time in seconds as measured with gettimeofday().
 * The loop counters are the file-scope globals i and j.
 */
void inliningVersion(){
    struct timeval start, stop;
    double elapsed;

    gettimeofday(&start, 0);
    for(j = 0; j < ITER; j++){
        for(i = 0; i < N; i++){
            z[i] = x[i] + y[i];
        }
    }
    gettimeofday(&stop, 0);

    /* whole seconds plus the microsecond remainder scaled to seconds */
    elapsed = stop.tv_sec - start.tv_sec;
    elapsed += (stop.tv_usec - start.tv_usec)/1.e6;
    printf("Time: %f\n", elapsed);
}
Compiling using the option -O0 with gcc, the results are 14.27 seconds for the basic version and 4.45 seconds for the version with the inlining. Is that common? I executed the program 10 times and the results are always similar. What do you think?
Then, compiling with the option -O1 the results are similar for both versions, 1.5 seconds approximately so I suppose that gcc does the inlining for me with O1.
By the way, I know that gettimeofday counts the overall time and not only the time used by the program itself, but I am required to use that function specifically.
Thanks in advance!
Let us analyze the assembly output generated by GCC 7.2 (with -O0) for both versions of the code.
Without inlining
First, let's check how much work has to be done by the computer to achieve the task with a separate function:
/* Out-of-line helper for the demo: writes x + y through the pointer z. */
void add(float x, float y, float *z){
*z = x + y;
}
/*
 * Minimal driver for the "separate function" case: calls add() once for
 * each of the 100 array elements.
 * NOTE(review): x and y are read without ever being initialised; this looks
 * deliberate (only the generated code is of interest here) - confirm before
 * reusing the snippet for anything that inspects the values.
 */
int main ()
{
float x[100], y[100], z[100];
for(int i = 0; i < 100; i++){
add(x[i], y[i], &z[i]);
}
}
For the above code, GCC produces an assembly as given below:
add(float, float, float*):
pushq %rbp
movq %rsp, %rbp
movss %xmm0, -4(%rbp)
movss %xmm1, -8(%rbp)
movq %rdi, -16(%rbp)
movss -4(%rbp), %xmm0
addss -8(%rbp), %xmm0
movq -16(%rbp), %rax
movss %xmm0, (%rax)
nop
popq %rbp
ret
main:
pushq %rbp
movq %rsp, %rbp
subq $1224, %rsp
movl $0, -4(%rbp)
.L4:
cmpl $99, -4(%rbp)
jg .L3
leaq -1216(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
salq $2, %rdx
addq %rax, %rdx
movl -4(%rbp), %eax
cltq
movss -816(%rbp,%rax,4), %xmm0
movl -4(%rbp), %eax
cltq
movl -416(%rbp,%rax,4), %eax
movq %rdx, %rdi
movaps %xmm0, %xmm1
movl %eax, -1220(%rbp)
movss -1220(%rbp), %xmm0
call add(float, float, float*)
addl $1, -4(%rbp)
jmp .L4
.L3:
movl $0, %eax
leave
ret
The processing part of the code takes approximately 32 instructions (instructions between L4 and L3 and that of add function).
A large majority of the instructions are used for making the function call.
A simplified way to understand how function calls work is:
arguments are pushed on the call stack
return address is pushed on to the call stack
the function is called
make a copy of the frame pointer
make room for locals on the stack
actual function code is executed
restore the state as it was before the function call
return to the caller
The above steps (except the 6th, executing the actual function code) take additional instructions to do the required processing. This is called the function call overhead.
With inlining
Now let's check how much work the computer has to do if the function was inlined.
/*
 * Minimal driver for the "inlined" case: the addition is written directly
 * in the loop body instead of going through a helper function.
 * NOTE(review): x and y are read without ever being initialised; this looks
 * deliberate (only the generated code is of interest here) - confirm before
 * reusing the snippet for anything that inspects the values.
 */
int main ()
{
float x[100], y[100], z[100];
for(int i = 0; i < 100; i++){
z[i] = x[i] + y[i];
}
}
For the above code, GCC produces an assembly output as given below:
main:
pushq %rbp
movq %rsp, %rbp
subq $1096, %rsp
movl $0, -4(%rbp)
.L3:
cmpl $99, -4(%rbp)
jg .L2
movl -4(%rbp), %eax
cltq
movss -416(%rbp,%rax,4), %xmm1
movl -4(%rbp), %eax
cltq
movss -816(%rbp,%rax,4), %xmm0
addss %xmm1, %xmm0
movl -4(%rbp), %eax
cltq
movss %xmm0, -1216(%rbp,%rax,4)
addl $1, -4(%rbp)
jmp .L3
.L2:
movl $0, %eax
leave
ret
The processing code (instructions between label L3 and L2) has around 14 instructions. In this assembly output, all the instructions which are responsible for making the function call aren't present which saves considerable amount of CPU cycles.
In general, the overhead of a function call is not relevant when your function's running time is more than several times of the overhead of a function call. In your code, the running time of your function is quite small and hence the function call overhead gains significance.
If you use the O1 flag, the compiler indeed does the inlining for you. You can find out by checking the assembly generated with the O1 or you can directly check the GCC manual for the list of optimizations which are tried with O1.
You can generate assembly output using the -S flag or you can do it online with Godbolt (the assembly outputs were taken from here for this post).
Related
I've been trying to translate this function to assembly:
/*
 * Walks the first n elements of a, keeping a running sum s.
 * Whenever an element equals 0 it is overwritten with the sum accumulated
 * so far, and the sum restarts from 0.
 * Example: {65, 23, 11, 0, 34} becomes {65, 23, 11, 99, 34}.
 */
void foo (int a[], int n) {
int i;
int s = 0;
for (i=0; i<n; i++) {
/* adding a zero element leaves s unchanged, so s can be updated first */
s += a[i];
if (a[i] == 0) {
a[i] = s;
s = 0;
}
}
}
But something is going wrong.
That's what I've done so far:
/*
 * Hand-written x86-64 (AT&T syntax) attempt at foo(int a[], int n).
 * The code reads the array pointer from %rdi and the count from %esi.
 * Stack slots: s lives at -16(%rbp), i at -8(%rbp).
 */
.section .text
.globl foo
foo:
.L1:
/* standard frame setup plus 16 bytes for the two locals */
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $0, -16(%rbp) /* s = 0 */
movl $0, -8(%rbp) /* i = 0 */
jmp .L2
.L2:
/*
 * Loop test. cmpl computes %esi - i, so jle is taken when n <= i.
 * NOTE(review): this branches into the loop body only once i >= n and
 * otherwise falls through to leave/ret, which looks inverted relative to
 * the C condition i < n; with n > 0 the function returns on the first
 * test without touching the array.
 */
cmpl -8(%rbp), %esi
jle .L4
leave
ret
.L3:
/* i++ and back to the loop test */
addl $1, -8(%rbp)
jmp .L2
.L4:
/* rax = a + 4*i, i.e. the address of a[i] */
movl -8(%rbp), %eax
imull $4, %eax
movslq %eax, %rax
addq %rdi, %rax
/*
 * eax = a[i].
 * NOTE(review): this load overwrites %rax, so the element address computed
 * above is no longer held in any register after this point.
 */
movl (%rax), %eax
/* s += a[i] */
addl %eax, -16(%rbp)
/* skip the store when a[i] != 0 */
cmpl $0, %eax
jne .L3
/* if (a[i] == 0): intended a[i] = s; s = 0 */
/*
 * NOTE(review): %rax holds the loaded value (zero on this path), not
 * &a[i], so %rdx does not point at the array element and the store below
 * does not write a[i].
 */
leaq (%rax), %rdx
movl -16(%rbp), %eax
movl %eax, (%rdx)
movl $0, -16(%rbp)
jmp .L3
I am compiling the .s module with a .c module, for example, with an int nums [5] = {65, 23, 11, 0, 34} and I'm getting back the same array instead of {65, 23, 11 , 99, 34}.
Could someone help me?
Presumably you have a compiler that can generate AT&T syntax. It might be more instructive to look at what assembly output the compiler generates. Here's my re-formulation of your demo:
#include <stdio.h>
/*
 * Reformulated foo: scans the first n elements of a with a running sum s.
 * Non-zero elements are added to s; a zero element is replaced by the
 * current sum and the sum is reset to 0.
 */
void foo (int a[], int n)
{
for (int s = 0, i = 0; i < n; i++)
{
if (a[i] != 0)
s += a[i];
else
/* comma operator: store the sum first, then restart it */
a[i] = s, s = 0;
}
}
/*
 * Demo driver: runs foo() over a small sample array and prints the
 * resulting elements on one line, separated by ", ".
 */
int main (void)
{
    int nums[] = {65, 23, 11, 0, 34};
    int size = sizeof(nums) / sizeof(int);
    int i = 0;

    foo(nums, size);

    while (i < size)
    {
        /* every element but the last is followed by a separator */
        if (i < (size - 1))
            fprintf(stdout, "%d, ", nums[i]);
        else
            fprintf(stdout, "%d\n", nums[i]);
        i++;
    }
    return (0);
}
Compiling without optimizations enabled is typically harder to work through than optimized code, since it loads from and spills results to memory. You won't learn much from it if you're investing time in learning how to write efficient assembly.
Compiling with the Godbolt compiler explorer with -O2 optimizations yields much more efficient code; it's also useful for cutting out unnecessary directives, labels, etc., that would be visual noise in this case.
In my experience, -O2 optimizations are clever enough to make you rethink your use of registers, refactoring, etc. -O3 can sometimes optimize too aggressively - unrolling loops, vectorizing, etc. - to easily follow.
Finally, for the case you have presented, there's a perfect compromise: -Os, which enables many of the optimizations of -O2, but not at the expense of increased code size. I'll paste the assembly here just for comparative purposes:
foo:
xorl %eax, %eax
xorl %ecx, %ecx
.L2:
cmpl %eax, %esi
jle .L7
movl (%rdi,%rax,4), %edx
testl %edx, %edx
je .L3
addl %ecx, %edx
jmp .L4
.L3:
movl %ecx, (%rdi,%rax,4)
.L4:
incq %rax
movl %edx, %ecx
jmp .L2
.L7:
ret
Remember that the calling convention passes the pointer to (a) in %rdi, and the 'count' (n) in %esi (the low 32 bits of %rsi, since n is an int). These are the calling conventions being used. Notice that your code does not 'dereference' or 'index' any elements through %rdi. It's definitely worth stepping through the code - even with pen and paper if it helps - to understand the branch conditions and how reading and writing is performed on element a[i].
Curiously, using the inner loop of your code:
s += a[i];
if (a[i] == 0)
a[i] = s, s = 0;
Appears to generate more efficient code with -Os than the inner loop I used:
foo:
xorl %eax, %eax
xorl %edx, %edx
.L2:
cmpl %eax, %esi
jle .L6
movl (%rdi,%rax,4), %ecx
addl %ecx, %edx
testl %ecx, %ecx
jne .L3
movl %edx, (%rdi,%rax,4)
xorl %edx, %edx
.L3:
incq %rax
jmp .L2
.L6:
ret
A reminder for me to keep things simple!
In a bytecode interpreting loop, after several tests, I'm surprised to see that using switch is the worst choice to make. Making calls to a function pointer array, or using gcc's computed goto extension is always 10~20% faster, the computed goto version being the fastest. I've tested with my real toy VM with 97 instructions and with the mini fake VM pasted below.
Why is using switch the slowest?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>
/*
 * Bytecode opcodes of the toy VM. Numbering starts at 1, which leaves the
 * value 0 free; the interpreters below treat a 0 byte as end of program.
 */
enum {
ADD1 = 1,
ADD2,
SUB3,
SUB4,
MUL5,
MUL6,
};
/* Accumulator that every bytecode handler updates in place. */
static unsigned int number;

/* ADD1 handler: number += 1. */
static void add1(void) {
    number = number + 1u;
}

/* ADD2 handler: number += 2. */
static void add2(void) {
    number = number + 2u;
}

/* SUB3 handler: number -= 3 (unsigned arithmetic, wraps below zero). */
static void sub3(void) {
    number = number - 3u;
}

/* SUB4 handler: number -= 4 (unsigned arithmetic, wraps below zero). */
static void sub4(void) {
    number = number - 4u;
}

/* MUL5 handler: number *= 5. */
static void mul5(void) {
    number = number * 5u;
}

/* MUL6 handler: number *= 6. */
static void mul6(void) {
    number = number * 6u;
}
/*
 * Interpreter variant 1: dispatch with a switch statement.
 * Reads one opcode per iteration from bcs, advancing the pointer, and calls
 * the matching handler. A 0 byte ends interpretation.
 * There is no default label, so a byte that matches no case is skipped and
 * the loop simply moves on to the next byte.
 */
static void interpret_bytecodes_switch(uint8_t *bcs) {
while (true) {
switch (*bcs++) {
case 0:
return;
case ADD1:
add1();
break;
case ADD2:
add2();
break;
case SUB3:
sub3();
break;
case SUB4:
sub4();
break;
case MUL5:
mul5();
break;
case MUL6:
mul6();
break;
}
}
}
/*
 * Interpreter variant 2: dispatch through a table of function pointers
 * indexed directly by opcode. Slot 0 is NULL and is never called because
 * the loop stops at a 0 byte.
 * NOTE(review): the table has 7 entries and the opcode is not range
 * checked, so a byte greater than 6 would index past the end of fs.
 */
static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
void (*fs[])(void) = {
NULL,
add1,
add2,
sub3,
sub4,
mul5,
mul6,
};
while (*bcs) {
/* fetch the opcode, advance, and call its handler */
fs[*bcs++]();
}
}
/*
 * Interpreter variant 3: dispatch with the GCC "labels as values"
 * extension (computed goto). labels[] maps each opcode to the address of
 * its handler label; opcode 0 maps to l_exit, which returns.
 * Every handler ends with its own JUMP, so the fetch-and-dispatch step is
 * repeated after each handler instead of going through one shared loop head.
 * NOTE(review): the table has 7 entries and the opcode is not range
 * checked, so a byte greater than 6 would index past the end of labels.
 */
static void interpret_bytecodes_goto(uint8_t *bcs) {
void *labels[] = {
&&l_exit,
&&l_add1,
&&l_add2,
&&l_sub3,
&&l_sub4,
&&l_mul5,
&&l_mul6,
};
/* fetch the next opcode, advance bcs, and jump to that opcode's label */
#define JUMP goto *labels[*bcs++]
JUMP;
l_exit:
return;
l_add1:
add1();
JUMP;
l_add2:
add2();
JUMP;
l_sub3:
sub3();
JUMP;
l_sub4:
sub4();
JUMP;
l_mul5:
mul5();
JUMP;
l_mul6:
mul6();
JUMP;
#undef JUMP
}
/* Pair of clock() samples bracketing a measured region. */
struct timer {
    clock_t start, end;
};

/* Record the current clock() value as the start of the region. */
static void timer_start(struct timer *self) {
    clock_t now = clock();
    self->start = now;
}

/* Record the current clock() value as the end of the region. */
static void timer_end(struct timer *self) {
    clock_t now = clock();
    self->end = now;
}

/* Return the time between the two samples, in seconds. */
static double timer_measure(struct timer *self) {
    clock_t ticks = self->end - self->start;
    return (double)ticks / CLOCKS_PER_SEC;
}
static void test(void (*f)(uint8_t *), uint8_t *bcs) {
number = 0;
struct timer timer;
timer_start(&timer);
f(bcs);
timer_end(&timer);
printf("%u %.3fs\n", number, timer_measure(&timer));
}
/*
 * Benchmark driver: builds a random program of N opcodes followed by a
 * terminating 0 byte, prints its first ten opcodes, then runs the three
 * interpreter variants over the same program.
 * Returns EXIT_FAILURE if the bytecode buffer cannot be allocated.
 */
int main(void) {
    const int N = 300000000;
    srand((unsigned)time(NULL));

    /* roughly 300 MB: the allocation can fail, so check before writing */
    uint8_t *bcs = malloc((size_t)N + 1);
    if (bcs == NULL) {
        fprintf(stderr, "failed to allocate %d bytes of bytecode\n", N + 1);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; ++i) {
        /* yields opcodes 1..5 only, so MUL6 (6) is never generated */
        bcs[i] = (uint8_t)(rand() % 5 + 1);
    }
    bcs[N] = 0;  /* end-of-program marker */

    for (int i = 0; i < 10; ++i) {
        printf("%d ", bcs[i]);
    }
    printf("\nswitch\n");
    test(interpret_bytecodes_switch, bcs);
    printf("function pointer\n");
    test(interpret_bytecodes_function_pointer, bcs);
    printf("goto\n");
    test(interpret_bytecodes_goto, bcs);

    free(bcs);
    return 0;
}
result
~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s
Below is the relevant disassembly of the code posted here after -O3 optimization.
interpret_bytecodes_switch:
.L8:
addq $1, %rdi
cmpb $6, -1(%rdi)
ja .L8
movzbl -1(%rdi), %edx
jmp *.L11(,%rdx,8)
.L11:
.quad .L10
.quad .L12
.quad .L13
.quad .L14
.quad .L15
.quad .L16
.quad .L17
.L16:
leal (%rax,%rax,4), %eax
jmp .L8
.L15:
subl $4, %eax
jmp .L8
.L14:
subl $3, %eax
jmp .L8
.L13:
addl $2, %eax
jmp .L8
.L12:
addl $1, %eax
jmp .L8
.L10:
movl %eax, number(%rip)
ret
.L17:
leal (%rax,%rax,2), %eax
addl %eax, %eax
jmp .L8
interpret_bytecodes_function_pointer:
pushq %rbx
movq %rdi, %rbx
subq $64, %rsp
movzbl (%rdi), %eax
movq $0, (%rsp)
movq $add1, 8(%rsp)
movq $add2, 16(%rsp)
movq $sub3, 24(%rsp)
movq $sub4, 32(%rsp)
movq $mul5, 40(%rsp)
testb %al, %al
movq $mul6, 48(%rsp)
je .L19
.L23:
addq $1, %rbx
call *(%rsp,%rax,8)
movzbl (%rbx), %eax
testb %al, %al
jne .L23
.L19:
addq $64, %rsp
popq %rbx
ret
interpret_bytecodes_goto:
movzbl (%rdi), %eax
movq $.L27, -72(%rsp)
addq $2, %rdi
movq $.L28, -64(%rsp)
movq $.L29, -56(%rsp)
movq $.L30, -48(%rsp)
movq $.L31, -40(%rsp)
movq $.L32, -32(%rsp)
movq $.L33, -24(%rsp)
movq -72(%rsp,%rax,8), %rax
jmp *%rax
.L33:
movl number(%rip), %eax
leal (%rax,%rax,2), %eax
addl %eax, %eax
movl %eax, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
.L35:
addq $1, %rdi
jmp *%rax
.L28:
addl $1, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L30:
subl $3, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L31:
subl $4, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L32:
movl number(%rip), %eax
leal (%rax,%rax,4), %eax
movl %eax, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L29:
addl $2, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L27:
rep ret
switch is slowest because it has to manage default cases and this may add an extra bounds test you didn't implement.
switch also handles a more general case where case values are not arranged in such a simple sequence; extra computation may be needed for that.
I was in the middle of writing a long answer when you posted the assembly code...
Basically, the goto version uses more "code" to prevent a few (or a single) instructions in each iteration. It's similar to a size vs. speed optimization.
Since your "real work" is negligible, it makes enough of a difference in the benchmark, but in a real world scenario that instruction will become negligible.
You are performing a micro benchmark. Micro benchmarks on modern CPUs can be affected by all kinds of random or unpredictable effects. There is actually very little difference in execution time. However, in order to make the code comparable, you combined switch and function calls, which in real life wouldn't happen for time critical code.
Below is a C function to evaluate a polynomial:
/* Calculate a0 + a1*x + a2*x^2 + ... + an*x^n */
/* from CSAPP Ex.5.5, modified to integer version */
/*
 * a      - coefficients a[0] .. a[degree]
 * x      - point at which the polynomial is evaluated
 * degree - highest power; for degree <= 0 only a[0] is returned
 *
 * The sum and the running power of x are kept in unsigned int so that
 * large intermediate values wrap modulo 2^N instead of triggering signed
 * overflow, which is undefined behaviour. The final conversion back to int
 * is implementation-defined when the value does not fit (GCC and Clang
 * document it as reduction modulo 2^N).
 */
int poly(int a[], int x, int degree) {
    unsigned int result = (unsigned int)a[0];
    unsigned int xpwr = (unsigned int)x;
    for (long int i = 1; i <= degree; ++i) {
        result += (unsigned int)a[i] * xpwr;
        xpwr *= (unsigned int)x;
    }
    return (int)result;
}
And a main function:
#define TIMES 100000ll
/*
 * Benchmark driver: evaluates the same polynomial TIMES times and
 * accumulates the results so the calls cannot be discarded outright.
 */
int main(void) {
    unsigned long long int result = 0;
    for (long long int i = 0; i < TIMES; ++i) {
        /* g_a is an int[10000] global variable with all elements equals to 1 */
        /* x = 2, i.e. evaluate 1 + 2 + 2^2 + ... + 2^9999 */
        result += poly(g_a, 2, 9999);
    }
    /* result is unsigned long long, so the matching conversion is %llu */
    printf("%llu\n", result);
    return 0;
}
When I compile the program with GCC and options -O1 and -O2 separately, I found that -O1 is a lot FASTER than -O2.
Platform details:
i5-4600
Arch Linux x86_64 with kernel 3.18
GCC 4.9.2
gcc -O1 -o /tmp/a.out test.c
gcc -O2 -o /tmp/a.out test.c
Result:
When TIMES = 100000ll, -O1 prints the result instantly, while -O2 needs 0.36s
When TIMES = 1000000000ll, -O1 prints the result in 0.28s, -O2 takes so long that I didn't finish the test
It seems that -O1 is approximately 10000 times faster than -O2.
When I test it on Mac (clang-600.0.56), the result is even more weird: -O1 takes no more than 0.02s even when TIMES = 1000000000000000000ll
I have tested the following changes:
makes g_a random (elements are from 1 to 10)
x = 19234 (or some other number)
use int instead of long long int
And the results are the same.
I tried to look at the assembly code; it seems that -O1 is calling the poly function while -O2 does inline optimization. But inlining should make the performance better, shouldn't it?
What makes these huge differences? Why can -O1 on clang make the program so fast? Is -O1 doing something wrong? (I cannot check the result as it is too slow without optimization)
Here is the assembly code of main for -O1: (you may get it by adding -S option to gcc)
main:
.LFB12:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $9999, %edx
movl $2, %esi
movl $g_a, %edi
call poly
movslq %eax, %rdx
movl $100000, %eax
.L6:
subq $1, %rax
jne .L6
imulq $100000, %rdx, %rsi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
And for -O2:
main:
.LFB12:
.cfi_startproc
movl g_a(%rip), %r9d
movl $100000, %r8d
xorl %esi, %esi
.p2align 4,,10
.p2align 3
.L8:
movl $g_a+4, %eax
movl %r9d, %ecx
movl $2, %edx
.p2align 4,,10
.p2align 3
.L7:
movl (%rax), %edi
addq $4, %rax
imull %edx, %edi
addl %edx, %edx
addl %edi, %ecx
cmpq $g_a+40000, %rax
jne .L7
movslq %ecx, %rcx
addq %rcx, %rsi
subq $1, %r8
jne .L8
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $.LC1, %edi
xorl %eax, %eax
call printf
xorl %eax, %eax
addq $8, %rsp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
Although I don't know much about assembly, it is obvious that -O1 is just calling poly once, and multiply the result by 100000 (imulq $100000, %rdx, %rsi). This is the reason that it is so fast.
It seems that gcc can detect that poly is a pure function with no side effect. (It will be interesting if we have another thread modifying g_a while poly is running...)
On the other hand, -O2 has inlined the poly function, so it has no chance to check poly as a pure function.
I have further done some research:
I cannot find the actual flag used by -O1 which does the pure function checking.
I have tried all the flags listed by gcc -Q -O1 --help=optimizers individually, but none of them have the effect.
Maybe it needs a combination of the flags together to get the effect, but it is very hard to try all the combinations.
But I have found the flag used by -O2 which makes the effect disappear, which is the -finline-small-functions flag. The name of the flag explains itself.
One thing that jumps out at me is that you're overflowing signed integers. The behaviour of this is undefined in C. Specifically, int result won't be able to hold pow(2,9999). I don't see what the point is of benchmarking code with undefined behaviour?
The compare function is a function that takes two arguments a and b and returns an integer describing their order. If a is smaller than b, the result is some negative integer. If a is bigger than b, the result is some positive integer. Otherwise, a and b are equal, and the result is zero.
This function is often used to parameterize sorting and searching algorithms from standard libraries.
Implementing the compare function for characters is quite easy; you simply subtract the arguments:
/*
 * Three-way comparison of two characters: negative when a < b, zero when
 * equal, positive when a > b. The result is the difference of the two
 * values after promotion to int.
 */
int compare_char(char a, char b)
{
    int lhs = a;
    int rhs = b;
    return lhs - rhs;
}
This works because the difference between two characters is generally assumed to fit into an integer. (Note that this assumption does not hold for systems where sizeof(char) == sizeof(int).)
This trick cannot work to compare integers, because the difference between two integers generally does not fit into an integer. For example, INT_MAX - (-1) = INT_MIN suggests that INT_MAX is smaller than -1 (technically, the overflow leads to undefined behavior, but let's assume modulo arithmetic).
So how can we implement the compare function efficiently for integers? Here is my first attempt:
/*
 * Three-way integer comparison written with GCC inline assembly (x86, AT&T
 * syntax). Returns 1 when a > b, -1 when a < b and 0 when they are equal.
 *
 * Operands: %0 = temp, %1 = result, %2 = a, %3 = b.
 * The single cmp sets the flags from a compared with b; result is preloaded
 * with 0, then temp is loaded with 1 and conditionally copied (cmovg), and
 * loaded with -1 and conditionally copied (cmovl). mov leaves the flags
 * untouched, so both conditional moves see the outcome of the one cmp.
 */
int compare_int(int a, int b)
{
int temp;
int result;
__asm__ __volatile__ (
"cmp %3, %2 \n\t"
"mov $0, %1 \n\t"
"mov $1, %0 \n\t"
"cmovg %0, %1 \n\t"
"mov $-1, %0 \n\t"
"cmovl %0, %1 \n\t"
: "=r"(temp), "=r"(result)
: "r"(a), "r"(b)
: "cc");
return result;
}
Can it be done in less than 6 instructions? Is there a less straightforward way that is more efficient?
This one has no branches, and doesn't suffer from overflow or underflow:
return (a > b) - (a < b);
With gcc -O2 -S, this compiles down to the following six instructions:
xorl %eax, %eax
cmpl %esi, %edi
setl %dl
setg %al
movzbl %dl, %edx
subl %edx, %eax
Here's some code to benchmark various compare implementations:
#include <stdio.h>
#include <stdlib.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE compare2
#define USE_RAND 1
int arr[COUNT];
/* Branching three-way compare: -1 if a < b, 1 if a > b, otherwise 0. */
int compare1 (int a, int b)
{
if (a < b) return -1;
if (a > b) return 1;
return 0;
}
/*
 * Three-way compare as the difference of two relational results (each 0 or
 * 1): yields 1 if a > b, -1 if a < b, 0 if equal, and cannot overflow.
 */
int compare2 (int a, int b)
{
return (a > b) - (a < b);
}
/*
 * Three-way compare with a conditional expression: -1 if a < b, otherwise
 * the value of (a > b), i.e. 1 if greater and 0 if equal.
 */
int compare3 (int a, int b)
{
return (a < b) ? -1 : (a > b);
}
/*
 * Three-way compare in inline assembly (x86, AT&T syntax); the sign of the
 * return value carries the ordering, its magnitude is not normalised.
 *
 * sub replaces a with a - b. When the subtraction does not overflow (jno
 * taken) that difference is returned as is. On overflow the carry flag is
 * complemented (cmc) and rotated into the top bit as the value is rotated
 * right by one through carry (rcr).
 */
int compare4 (int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
/*
 * Benchmark driver: fills arr with random data, then applies COMPARE to
 * every ordered pair (arr[i], arr[j]), LOOPS times over, summing the
 * results so the calls are not optimised away.
 * With USE_RAND set, each element is a single rand() value; otherwise each
 * byte of each element is filled separately with rand().
 * The sum is printed in the form "<sum>=0"; presumably the total is expected
 * to be 0 because each pair is also visited in the opposite order.
 */
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
int sum = 0;
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j++) {
sum += COMPARE(arr[i], arr[j]);
}
}
}
printf("%d=0\n", sum);
return 0;
}
The results on my 64-bit system, compiled with gcc -std=c99 -O2, for positive integers (USE_RAND=1):
compare1: 0m1.118s
compare2: 0m0.756s
compare3: 0m1.101s
compare4: 0m0.561s
Out of C-only solutions, the one I suggested was the fastest. user315052's solution was slower despite compiling to only 5 instructions. The slowdown is likely because, despite having one less instruction, there is a conditional instruction (cmovge).
Overall, FredOverflow's 4-instruction assembly implementation was the fastest when used with positive integers. However, this code only benchmarked the integer range RAND_MAX, so the 4-instruction test is biased, because it handles overflows separately, and these don't occur in the test; the speed may be due to successful branch prediction.
With a full range of integers (USE_RAND=0), the 4-instruction solution is in fact very slow (others are the same):
compare4: 0m1.897s
The following has always proven to be fairly efficient for me:
return (a < b) ? -1 : (a > b);
With gcc -O2 -S, this compiles down to the following five instructions:
xorl %edx, %edx
cmpl %esi, %edi
movl $-1, %eax
setg %dl
cmovge %edx, %eax
As a follow-up to Ambroz Bizjak's excellent companion answer, I was not convinced that his program tested the same assembly code that was posted above. And, when I was studying the compiler output more closely, I noticed that the compiler was not generating the same instructions as was posted in either of our answers. So, I took his test program, hand modified the assembly output to match what we posted, and compared the resulting times. It seems the two versions compare roughly identically.
./opt_cmp_branchless: 0m1.070s
./opt_cmp_branch: 0m1.037s
I am posting the assembly of each program in full so that others may attempt the same experiment, and confirm or contradict my observation.
The following is the version with the cmovge instruction ((a < b) ? -1 : (a > b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
orl $-1, %edi
.L12:
xorl %ebp, %ebp
.p2align 4,,10
.p2align 3
.L18:
movl arr.2789(%rbp), %ecx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L15:
movl arr.2789(%rax), %edx
xorl %ebx, %ebx
cmpl %ecx, %edx
movl $-1, %edx
setg %bl
cmovge %ebx, %edx
addq $4, %rax
addl %edx, %esi
cmpq $4096, %rax
jne .L15
addq $4, %rbp
cmpq $4096, %rbp
jne .L18
addl $1, %r8d
cmpl $500, %r8d
jne .L12
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",#progbits
The version below uses the branchless method ((a > b) - (a < b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
.L19:
movl %ebp, %ebx
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L24:
movl %ebp, %ecx
xorl %eax, %eax
jmp .L22
.p2align 4,,10
.p2align 3
.L20:
movl arr.2789(%rax), %ecx
.L22:
xorl %edx, %edx
cmpl %ebx, %ecx
setg %cl
setl %dl
movzbl %cl, %ecx
subl %ecx, %edx
addl %edx, %esi
addq $4, %rax
cmpq $4096, %rax
jne .L20
addq $4, %rdi
cmpq $4096, %rdi
je .L21
movl arr.2789(%rdi), %ebx
jmp .L24
.L21:
addl $1, %r8d
cmpl $500, %r8d
jne .L19
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",#progbits
Okay, I managed to get it down to four instructions :) The basic idea is as follows:
Half the time, the difference is small enough to fit into an integer. In that case, just return the difference. Otherwise, shift the number one to the right. The crucial question is what bit to shift into the MSB then.
Let's look at two extreme examples, using 8 bits instead of 32 bits for the sake of simplicity:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
00000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
11111111 shifted
Shifting the carry bit in would yield 0 for the first case (although INT_MIN is not equal to INT_MAX) and some negative number for the second case (although INT_MAX is not smaller than INT_MIN).
But if we flip the carry bit before doing the shift, we get sensible numbers:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
100000001 carry flipped
10000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
011111111 carry flipped
01111111 shifted
I'm sure there's a deep mathematical reason why it makes sense to flip the carry bit, but I don't see it yet.
/*
 * Four-instruction three-way compare (x86 inline assembly, AT&T syntax);
 * only the sign of the result is meaningful.
 *
 * sub replaces a with a - b. If the subtraction did not overflow (jno),
 * the difference itself is returned. Otherwise the carry flag is flipped
 * (cmc) and shifted into the most significant bit while the value is
 * rotated right by one through carry (rcr).
 */
int compare_int(int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
I have tested the code with one million random inputs plus every combination of INT_MIN, -INT_MAX, INT_MIN/2, -1, 0, 1, INT_MAX/2, INT_MAX/2+1, INT_MAX. All tests passed. Can you prove me wrong?
For what it's worth I put together an SSE2 implementation. vec_compare1 uses the same approach as compare2 but requires just three SSE2 arithmetic instructions:
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE vec_compare1
#define USE_RAND 1
int arr[COUNT] __attribute__ ((aligned(16)));
typedef __m128i vSInt32;
/*
 * Lane-wise three-way compare of four 32-bit signed integers.
 * _mm_cmpgt_epi32 produces all-ones (-1) in each lane where its first
 * operand is greater and 0 elsewhere, so vcmp2 - vcmp1 is 1 where va > vb,
 * -1 where va < vb and 0 where the lanes are equal.
 */
vSInt32 vec_compare1 (vSInt32 va, vSInt32 vb)
{
vSInt32 vcmp1 = _mm_cmpgt_epi32(va, vb);
vSInt32 vcmp2 = _mm_cmpgt_epi32(vb, va);
return _mm_sub_epi32(vcmp2, vcmp1);
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
vSInt32 vsum = _mm_set1_epi32(0);
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j+=4) {
vSInt32 v1 = _mm_loadu_si128(&arr[i]);
vSInt32 v2 = _mm_load_si128(&arr[j]);
vSInt32 v = COMPARE(v1, v2);
vsum = _mm_add_epi32(vsum, v);
}
}
}
printf("vsum = %vd\n", vsum);
return 0;
}
Time for this is 0.137s.
Time for compare2 with the same CPU and compiler is 0.674s.
So the SSE2 implementation is around 4x faster, as might be expected (since it's 4-wide SIMD).
This code has no branches and uses 5 instructions. It may outperform other branch-less alternatives on recent Intel processors, where cmov* instructions are quite expensive. Disadvantage is non-symmetrical return value (INT_MIN+1, 0, 1).
/*
 * Branch-free three-way compare (x86 inline assembly, AT&T syntax).
 * Returns INT_MIN+1 when a < b, 0 when equal and 1 when a > b.
 *
 * res is cleared, cmpl sets the flags from a compared with b, and setl
 * writes (a < b) into the low byte. rorl $1 moves that bit into the sign
 * position. The rotate leaves the zero flag from cmpl intact, so the final
 * setnz writes (a != b) into the low byte.
 * The "q" constraint requests a register whose low byte is addressable as
 * %b0.
 */
int compare_int (int a, int b)
{
int res;
__asm__ __volatile__ (
"xor %0, %0 \n\t"
"cmpl %2, %1 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "=q"(res)
: "r"(a)
, "r"(b)
: "cc"
);
return res;
}
This variant does not need initialization, so it uses only 4 instructions:
/*
 * Four-instruction branch-free three-way compare (x86 inline assembly,
 * AT&T syntax). Only the sign of the result, or its being zero, is
 * meaningful; the remaining bits are leftovers of the difference.
 *
 * subl replaces a with a - b and sets the flags. setl writes (a < b) into
 * the low byte and rorl $1 rotates that bit into the sign position. The
 * rotate leaves the zero flag from subl intact, so setnz then writes
 * (a != b) into the low byte, keeping the result non-zero for a > b.
 */
int compare_int (int a, int b)
{
__asm__ __volatile__ (
"subl %1, %0 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "+q"(a)
: "r"(b)
: "cc"
);
return a;
}
Maybe you can use the following idea (in pseudo-code; I didn't write asm code because I am not comfortable with the syntax):
Subtract the numbers (result = a - b)
If no overflow, done (jo instruction and branch prediction should work very well here)
If there was overflow, use any robust method (return (a < b) ? -1 : (a > b))
Edit: for additional simplicity: if there was overflow, flip the sign of the result, instead of step 3.
You could consider promoting the integers to 64bit values.
I'm incrementing a counter, which I will need to use after the loop in double arithmetic. So, which would you expect to be faster? (Or too close to call?)
Code 1:
double dubs = 3.14159265;
double d;
for(d=0; d<BIGNUM; d++) { /* do stuff not depending on d */ }
dubs /= d;
Code 2:
double dubs = 3.14159265;
int i;
for(i=0; i<BIGNUM; i++) { /* do stuff not depending on i */ }
dubs /= (double) i;
And does it depend on the size of BIGNUM? I know it would be a minuscule difference, but just found myself wondering in theory.
Bonus question: if it were C++, any change in your answer for using static_cast?
--Edit--
Ok, here's a sample code and assembler:
#define BIGNUM 1000000000
#define NUMLOOPS 1000
/*
 * Variant 1: counts BIGNUM iterations in a double and divides by that
 * counter directly. Returns 3.14159265 divided by the final counter value.
 * k is only loop ballast; it is unsigned so that the repeated doubling
 * wraps (well defined) instead of overflowing a signed int, which is
 * undefined behaviour.
 */
double test1()
{
    double dubs = 3.14159265;
    double d;
    unsigned int k = 1;
    for(d=0; d<BIGNUM; d++) { k*= 2; }
    dubs /= d;
    return dubs;
}
/*
 * Variant 2: counts BIGNUM iterations in an int and converts the counter
 * to double once, for the final division. Returns 3.14159265 divided by
 * the final counter value.
 * k is only loop ballast; it is unsigned so that the repeated doubling
 * wraps (well defined) instead of overflowing a signed int, which is
 * undefined behaviour.
 */
double test2()
{
    double dubs = 3.14159265;
    int i;
    unsigned int k = 1;
    for(i=0; i<BIGNUM; i++) { k*= 2; }
    dubs /= (double)i;
    return dubs;
}
/*
 * Benchmark driver: calls both variants NUMLOOPS times, accumulating each
 * variant's results in its own total. The totals are not printed.
 */
int main()
{
    double d1 = 0, d2 = 0;

    for (int round = 0; round < NUMLOOPS; round++)
    {
        d1 = d1 + test1();
        d2 = d2 + test2();
    }
}
_test1:
LFB2:
pushq %rbp
LCFI0:
movq %rsp, %rbp
LCFI1:
subq $48, %rsp
LCFI2:
call mcount
movabsq $4614256656543962353, %rax
movq %rax, -16(%rbp)
movl $1, -4(%rbp)
movl $0, %eax
movq %rax, -24(%rbp)
jmp L2
L3:
sall -4(%rbp)
movsd -24(%rbp), %xmm0
movsd LC2(%rip), %xmm1
addsd %xmm1, %xmm0
movsd %xmm0, -24(%rbp)
L2:
movsd -24(%rbp), %xmm1
movsd LC3(%rip), %xmm0
ucomisd %xmm1, %xmm0
ja L3
movsd -16(%rbp), %xmm0
divsd -24(%rbp), %xmm0
movsd %xmm0, -16(%rbp)
movq -16(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
leave
ret
_test2:
LFB3:
pushq %rbp
LCFI3:
movq %rsp, %rbp
LCFI4:
subq $32, %rsp
LCFI5:
call mcount
movabsq $4614256656543962353, %rax
movq %rax, -16(%rbp)
movl $1, -8(%rbp)
movl $0, -4(%rbp)
jmp L7
L8:
sall -8(%rbp)
incl -4(%rbp)
L7:
cmpl $99999, -4(%rbp)
jle L8
cvtsi2sd -4(%rbp), %xmm1
movsd -16(%rbp), %xmm0
divsd %xmm1, %xmm0
movsd %xmm0, -16(%rbp)
movq -16(%rbp), %rax
movq %rax, -24(%rbp)
movsd -24(%rbp), %xmm0
leave
ret
Test is currently running....
As a double it probably doesn't matter, but if you'd used float, the first code fragment might not even work. Due to limited precision, after a while, incrementing a float will not change its value. Of course with (signed) integer types, you get UB on overflow, which is arguably worse.
Personally I would recommend always using integer types for a variable that contains something like a count/index that is naturally an integer. Using floating point types for this just feels wrong. But please remove the useless cast in the last line of the second fragment.