The compare function is a function that takes two arguments a and b and returns an integer describing their order. If a is smaller than b, the result is some negative integer. If a is bigger than b, the result is some positive integer. Otherwise, a and b are equal, and the result is zero.
This function is often used to parameterize sorting and searching algorithms from standard libraries.
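For example, C's qsort expects exactly this kind of comparator. Here is a minimal sketch; qsort hands the comparator const void * pointers to the elements, so it unpacks them first:
#include <stdlib.h>
/* comparator adapted to qsort's const void * interface */
static int compare_int_for_qsort(const void *pa, const void *pb)
{
    int a = *(const int *)pa;
    int b = *(const int *)pb;
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
}
int main(void)
{
    int values[] = { 3, -7, 42, 0, -7 };
    qsort(values, sizeof values / sizeof values[0], sizeof values[0],
          compare_int_for_qsort);
    return 0;
}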
Implementing the compare function for characters is quite easy; you simply subtract the arguments:
int compare_char(char a, char b)
{
return a - b;
}
This works because the difference between two characters fits into an int whenever int is wider than char. (Note that this assumption does not hold on systems where sizeof(char) == sizeof(int).)
This trick cannot be used to compare integers, because the difference between two integers generally does not fit into an integer. For example, INT_MAX - (-1) = INT_MIN suggests that INT_MAX is smaller than -1 (technically, the overflow leads to undefined behavior, but let's assume modulo arithmetic).
So how can we implement the compare function efficiently for integers? Here is my first attempt:
int compare_int(int a, int b)
{
int temp;
int result;
__asm__ __volatile__ (
"cmp %3, %2 \n\t"
"mov $0, %1 \n\t"
"mov $1, %0 \n\t"
"cmovg %0, %1 \n\t"
"mov $-1, %0 \n\t"
"cmovl %0, %1 \n\t"
: "=r"(temp), "=r"(result)
: "r"(a), "r"(b)
: "cc");
return result;
}
Can it be done in less than 6 instructions? Is there a less straightforward way that is more efficient?
This one has no branches, and doesn't suffer from overflow or underflow:
return (a > b) - (a < b);
With gcc -O2 -S, this compiles down to the following six instructions:
xorl %eax, %eax
cmpl %esi, %edi
setl %dl
setg %al
movzbl %dl, %edx
subl %edx, %eax
Here's some code to benchmark various compare implementations:
#include <stdio.h>
#include <stdlib.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE compare2
#define USE_RAND 1
int arr[COUNT];
int compare1 (int a, int b)
{
if (a < b) return -1;
if (a > b) return 1;
return 0;
}
int compare2 (int a, int b)
{
return (a > b) - (a < b);
}
int compare3 (int a, int b)
{
return (a < b) ? -1 : (a > b);
}
int compare4 (int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
int sum = 0;
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j++) {
sum += COMPARE(arr[i], arr[j]);
}
}
}
printf("%d=0\n", sum);
return 0;
}
The results on my 64-bit system, compiled with gcc -std=c99 -O2, for positive integers (USE_RAND=1):
compare1: 0m1.118s
compare2: 0m0.756s
compare3: 0m1.101s
compare4: 0m0.561s
Out of the C-only solutions, the one I suggested was the fastest. user315052's solution was slower despite compiling to only 5 instructions; the likely reason is that, although it has one instruction less, it relies on a conditional move (cmovge).
Overall, FredOverflow's 4-instruction assembly implementation was the fastest when used with positive integers. However, this benchmark only exercised integers in the range [0, RAND_MAX], so the 4-instruction test is biased: it handles overflow separately, the overflow case never occurs in the test, and the speed may therefore be due to successful branch prediction.
With a full range of integers (USE_RAND=0), the 4-instruction solution is in fact very slow (others are the same):
compare4: 0m1.897s
The following has always proven to be fairly efficient for me:
return (a < b) ? -1 : (a > b);
With gcc -O2 -S, this compiles down to the following five instructions:
xorl %edx, %edx
cmpl %esi, %edi
movl $-1, %eax
setg %dl
cmovge %edx, %eax
As a follow-up to Ambroz Bizjak's excellent companion answer, I was not convinced that his program tested the same assembly code that was posted above. And when I studied the compiler output more closely, I noticed that the compiler was not generating the same instructions as were posted in either of our answers. So I took his test program, hand-modified the assembly output to match what we posted, and compared the resulting times. It seems the two versions perform roughly identically.
./opt_cmp_branchless: 0m1.070s
./opt_cmp_branch: 0m1.037s
I am posting the assembly of each program in full so that others may attempt the same experiment, and confirm or contradict my observation.
The following is the version with the cmovge instruction ((a < b) ? -1 : (a > b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
orl $-1, %edi
.L12:
xorl %ebp, %ebp
.p2align 4,,10
.p2align 3
.L18:
movl arr.2789(%rbp), %ecx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L15:
movl arr.2789(%rax), %edx
xorl %ebx, %ebx
cmpl %ecx, %edx
movl $-1, %edx
setg %bl
cmovge %ebx, %edx
addq $4, %rax
addl %edx, %esi
cmpq $4096, %rax
jne .L15
addq $4, %rbp
cmpq $4096, %rbp
jne .L18
addl $1, %r8d
cmpl $500, %r8d
jne .L12
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",#progbits
The version below uses the branchless method ((a > b) - (a < b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
.L19:
movl %ebp, %ebx
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L24:
movl %ebp, %ecx
xorl %eax, %eax
jmp .L22
.p2align 4,,10
.p2align 3
.L20:
movl arr.2789(%rax), %ecx
.L22:
xorl %edx, %edx
cmpl %ebx, %ecx
setg %cl
setl %dl
movzbl %cl, %ecx
subl %ecx, %edx
addl %edx, %esi
addq $4, %rax
cmpq $4096, %rax
jne .L20
addq $4, %rdi
cmpq $4096, %rdi
je .L21
movl arr.2789(%rdi), %ebx
jmp .L24
.L21:
addl $1, %r8d
cmpl $500, %r8d
jne .L19
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",#progbits
Okay, I managed to get it down to four instructions :) The basic idea is as follows:
Half the time, the difference is small enough to fit into an integer. In that case, just return the difference. Otherwise, shift the difference one bit to the right. The crucial question is then which bit to shift into the MSB.
Let's look at two extreme examples, using 8 bits instead of 32 bits for the sake of simplicity:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
00000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
11111111 shifted
Shifting the carry bit in would yield 0 for the first case (although INT_MIN is not equal to INT_MAX) and some negative number for the second case (although INT_MAX is not smaller than INT_MIN).
But if we flip the carry bit before doing the shift, we get sensible numbers:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
100000001 carry flipped
10000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
011111111 carry flipped
01111111 shifted
I'm sure there's a deep mathematical reason why it makes sense to flip the carry bit, but I don't see it yet.
int compare_int(int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
I have tested the code with one million random inputs plus every combination of INT_MIN, -INT_MAX, INT_MIN/2, -1, 0, 1, INT_MAX/2, INT_MAX/2+1, INT_MAX. All tests passed. Can you prove me wrong?
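For reference, a test along those lines might look like the following sketch (assuming the compare_int above is compiled into the same file; sgn() collapses any comparator result to -1/0/1 before checking it against a reference):
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
static int ref_cmp(int a, int b) { return (a < b) ? -1 : (a > b); }
static int sgn(int x)            { return (x > 0) - (x < 0); }
int main(void)
{
    const int corner[] = { INT_MIN, -INT_MAX, INT_MIN / 2, -1, 0, 1,
                           INT_MAX / 2, INT_MAX / 2 + 1, INT_MAX };
    const int n = sizeof corner / sizeof corner[0];
    /* every combination of the corner cases */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if (sgn(compare_int(corner[i], corner[j])) != ref_cmp(corner[i], corner[j])) {
                printf("FAIL: %d vs %d\n", corner[i], corner[j]);
                return 1;
            }
    /* one million random pairs (rand() only produces non-negative values here) */
    for (int k = 0; k < 1000000; k++) {
        int a = rand(), b = rand();
        if (sgn(compare_int(a, b)) != ref_cmp(a, b)) {
            printf("FAIL: %d vs %d\n", a, b);
            return 1;
        }
    }
    puts("all tests passed");
    return 0;
}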
For what it's worth I put together an SSE2 implementation. vec_compare1 uses the same approach as compare2 but requires just three SSE2 arithmetic instructions:
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE vec_compare1
#define USE_RAND 1
int arr[COUNT] __attribute__ ((aligned(16)));
typedef __m128i vSInt32;
vSInt32 vec_compare1 (vSInt32 va, vSInt32 vb)
{
vSInt32 vcmp1 = _mm_cmpgt_epi32(va, vb);
vSInt32 vcmp2 = _mm_cmpgt_epi32(vb, va);
return _mm_sub_epi32(vcmp2, vcmp1);
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
vSInt32 vsum = _mm_set1_epi32(0);
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j+=4) {
vSInt32 v1 = _mm_loadu_si128((const __m128i *)&arr[i]);
vSInt32 v2 = _mm_load_si128((const __m128i *)&arr[j]);
vSInt32 v = COMPARE(v1, v2);
vsum = _mm_add_epi32(vsum, v);
}
}
}
printf("vsum = %vd\n", vsum);
return 0;
}
Time for this is 0.137s.
Time for compare2 with the same CPU and compiler is 0.674s.
So the SSE2 implementation is around 4x faster, as might be expected (since it's 4-wide SIMD).
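One caveat: the "%vd" format used in the printf above appears to be a compiler-specific extension (Apple GCC) for printing vectors. On other compilers, a portable alternative is to store the vector to memory first; a sketch that could replace that printf:
int tmp[4] __attribute__ ((aligned(16)));
_mm_store_si128((__m128i *)tmp, vsum);
printf("vsum = %d %d %d %d\n", tmp[0], tmp[1], tmp[2], tmp[3]);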
This code has no branches and uses 5 instructions. It may outperform other branchless alternatives on recent Intel processors, where cmov* instructions are quite expensive. The disadvantage is the non-symmetrical return value (INT_MIN+1, 0, 1).
int compare_int (int a, int b)
{
int res;
__asm__ __volatile__ (
"xor %0, %0 \n\t"
"cmpl %2, %1 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "=q"(res)
: "r"(a)
, "r"(b)
: "cc"
);
return res;
}
This variant does not need initialization, so it uses only 4 instructions:
int compare_int (int a, int b)
{
__asm__ __volatile__ (
"subl %1, %0 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "+q"(a)
: "r"(b)
: "cc"
);
return a;
}
Maybe you can use the following idea (in pseudo-code; I didn't write the asm because I am not comfortable with the syntax):
Subtract the numbers (result = a - b)
If there is no overflow, you are done (the jo instruction and branch prediction should work very well here)
If there was overflow, use any robust method (return (a < b) ? -1 : (a > b)); see the sketch below
Edit: for additional simplicity: if there was overflow, flip the sign of the result, instead of step 3.
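A sketch of that three-step idea in plain C (not the sign-flip shortcut from the edit), assuming a compiler that provides __builtin_sub_overflow, such as GCC 5+ or Clang:
int compare_sub_with_fallback(int a, int b)
{
    int d;
    if (!__builtin_sub_overflow(a, b, &d))
        return d;                     /* common case: the difference is representable */
    return (a < b) ? -1 : (a > b);    /* rare case: overflow, fall back to the robust form */
}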
You could consider promoting the integers to 64bit values.
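A sketch of what that promotion might look like (assuming long long is wider than int, as on the 64-bit system used for the benchmarks above):
int compare_int_wide(int a, int b)
{
    long long d = (long long)a - b;   /* exact: a - b always fits in 64 bits */
    return (d > 0) - (d < 0);
}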
Related
I've been trying to translate this function to assembly:
void foo (int a[], int n) {
int i;
int s = 0;
for (i=0; i<n; i++) {
s += a[i];
if (a[i] == 0) {
a[i] = s;
s = 0;
}
}
}
But something is going wrong.
That's what I've done so far:
.section .text
.globl foo
foo:
.L1:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $0, -16(%rbp) /*s*/
movl $0, -8(%rbp) /*i*/
jmp .L2
.L2:
cmpl -8(%rbp), %esi
jle .L4
leave
ret
.L3:
addl $1, -8(%rbp)
jmp .L2
.L4:
movl -8(%rbp), %eax
imull $4, %eax
movslq %eax, %rax
addq %rdi, %rax
movl (%rax), %eax
addl %eax, -16(%rbp)
cmpl $0, %eax
jne .L3
/* if */
leaq (%rax), %rdx
movl -16(%rbp), %eax
movl %eax, (%rdx)
movl $0, -16(%rbp)
jmp .L3
I am compiling the .s module together with a .c module that contains, for example, int nums[5] = {65, 23, 11, 0, 34}, and I'm getting back the same array instead of {65, 23, 11, 99, 34}.
Could someone help me?
Presumably you have a compiler that can generate AT&T syntax. It might be more instructive to look at what assembly output the compiler generates. Here's my re-formulation of your demo:
#include <stdio.h>
void foo (int a[], int n)
{
for (int s = 0, i = 0; i < n; i++)
{
if (a[i] != 0)
s += a[i];
else
a[i] = s, s = 0;
}
}
int main (void)
{
int nums[] = {65, 23, 11, 0, 34};
int size = sizeof(nums) / sizeof(int);
foo(nums, size);
for (int i = 0; i < size; i++)
fprintf(stdout, i < (size - 1) ? "%d, " : "%d\n", nums[i]);
return (0);
}
Compiling without optimizations enabled is typically harder to work through than optimized code, since it loads from and spills results to memory. You won't learn much from it if you're investing time in learning how to write efficient assembly.
Compiling with the Godbolt compiler explorer with -O2 optimizations yields much more efficient code; it's also useful for cutting out unnecessary directives, labels, etc., that would be visual noise in this case.
In my experience, the -O2 optimizations are clever enough to make you rethink your use of registers, refactoring, etc. -O3 can sometimes optimize too aggressively - unrolling loops, vectorizing, etc. - to follow easily.
Finally, for the case you have presented, there's a perfect compromise: -Os, which enables many of the optimizations of -O2, but not at the expense of increased code size. I'll paste the assembly here just for comparative purposes:
foo:
xorl %eax, %eax
xorl %ecx, %ecx
.L2:
cmpl %eax, %esi
jle .L7
movl (%rdi,%rax,4), %edx
testl %edx, %edx
je .L3
addl %ecx, %edx
jmp .L4
.L3:
movl %ecx, (%rdi,%rax,4)
.L4:
incq %rax
movl %edx, %ecx
jmp .L2
.L7:
ret
Remember that the calling convention (x86-64 System V) passes the pointer (a) in %rdi and the count (n) in %rsi. Notice that your code never dereferences or indexes any element through %rdi. It's definitely worth stepping through the code - even with pen and paper if it helps - to understand the branch conditions and how element a[i] is read and written.
Curiously, using the inner loop of your code:
s += a[i];
if (a[i] == 0)
a[i] = s, s = 0;
appears to generate more efficient code with -Os than the inner loop I used:
foo:
xorl %eax, %eax
xorl %edx, %edx
.L2:
cmpl %eax, %esi
jle .L6
movl (%rdi,%rax,4), %ecx
addl %ecx, %edx
testl %ecx, %ecx
jne .L3
movl %edx, (%rdi,%rax,4)
xorl %edx, %edx
.L3:
incq %rax
jmp .L2
.L6:
ret
A reminder for me to keep things simple!
This is the assembly code I am supposed to translate:
f1:
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
Here's the C code I wrote that I think is equivalent.
int f1(int y){
int x = y-97;
int i = 0;
if(x<=25){
x = i;
}
return x;
}
And here's what I get from compiling the C code.
_f1: ## #f1
.cfi_startproc
%bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
## kill: def %edi killed %edi def %rdi
leal -97(%rdi), %ecx
xorl %eax, %eax
cmpl $123, %edi
cmovgel %ecx, %eax
popq %rbp
retq
.cfi_endproc
I was wondering whether this is correct and what should be different. I would also appreciate an explanation of how jmps work, as I am trying to translate this second piece of assembly as well and have gotten stuck:
f2:
cmpl $1, %edi
jle .L6
movl $2, %edx
movl $1, %eax
jmp .L5
.L8:
movl %ecx, %edx
.L5:
imull %edx, %eax
leal 1(%rdx), %ecx
cmpl %eax, %edi
jg .L8
.L4:
cmpl %edi, %eax
sete %al
movzbl %al, %eax
ret
.L6:
movl $1, %eax
jmp .L4
gcc8.3 -O3 emits exactly the asm in the question for this way of writing the range check using the unsigned-compare trick.
int is_ascii_lowercase_v2(int y){
unsigned char x = y-'a';
return x <= (unsigned)('z'-'a');
}
Narrowing to 8-bit after the int subtract matches the asm more exactly, but it's not necessary for correctness or even to convince compilers to use a 32-bit sub. For unsigned char y, the upper bytes of RDI are allowed to hold arbitrary garbage (x86-64 System V calling convention), but carry only propagates from low to high with sub and add.
The low 8 bits of the result (which is all the cmp reads) would be the same with sub $'a', %dil or sub $'a', %edi.
Writing it as a normal range-check also gets gcc to emit identical code, because compilers know how to optimize range-checks. (And gcc chooses to use 32-bit operand-size for the sub, unlike clang which uses 8-bit.)
int is_ascii_lowercase_v3(char y){
return (y>='a' && y<='z');
}
On the Godbolt compiler explorer, this and _v2 compile as follows:
## gcc8.3 -O3
is_ascii_lowercase_v3: # and _v2 is identical
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
Returning a compare result as an integer, instead of using an if, much more naturally matches the asm.
But even writing it "branchlessly" in C won't match the asm unless you enable optimization. The default code-gen from gcc/clang is -O0: anti-optimize for consistent debugging, storing/reloading everything to memory between statements. (And function args on function entry.) You need optimization, because -O0 code-gen is (intentionally) mostly braindead, and nasty looking. See How to remove "noise" from GCC/clang assembly output?
## gcc8.3 -O0
is_ascii_lowercase_v2:
pushq %rbp
movq %rsp, %rbp
movl %edi, -20(%rbp)
movl -20(%rbp), %eax
subl $97, %eax
movb %al, -1(%rbp)
cmpb $25, -1(%rbp)
setbe %al
movzbl %al, %eax
popq %rbp
ret
gcc and clang with optimization enabled will do if-conversion to branchless code when it's efficient. e.g.
int is_ascii_lowercase_branchy(char y){
unsigned char x = y-'a';
if (x < 25U) {
return 1;
}
return 0;
}
still compiles to the same asm with GCC8.3 -O3
is_ascii_lowercase_branchy:
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
We can tell that the optimization level was at least gcc -O2. At -O1, gcc uses the less efficient setbe / movzx instead of xor-zeroing EAX ahead of setbe
is_ascii_lowercase_v2:
subl $97, %edi
cmpb $25, %dil
setbe %al
movzbl %al, %eax
ret
I could never get clang to reproduce exactly the same sequence of instructions. It likes to use add $-97, %edi, and cmp with $26 / setb.
Or it will do really interesting (but sub-optimal) things like this:
# clang7.0 -O3
is_ascii_lowercase_v2:
addl $159, %edi # 256-97 = 8-bit version of -97
andl $254, %edi # 0xFE; I haven't figured out why it's clearing the low bit as well as the high bits
xorl %eax, %eax
cmpl $26, %edi
setb %al
retq
So this is something involving -(x-97), maybe using the 2's complement identity in there somewhere (-x = ~x + 1).
Here is an annotated version of the assembly:
# %edi is the first argument, we denote x
subl $97, %edi
# x -= 97
# %eax is the return value, we denote y
xorl %eax, %eax
# y = 0
# %dil is the least significant byte (lsb) of x
cmpb $25, %dil
# %al is lsb(y) which is already zeroed
setbe %al
# if lsb(x) <= 25 then lsb(y) = 1
# setbe is unsigned version, setle would be signed
ret
# return y
So a verbose C equivalent is:
int f(int x) {
int y = 0;
x -= 97;
x &= 0xFF; // x = lsb(x) using 0xFF as a bitmask
y = (unsigned)x <= 25; // Section 6.5.8 of C standard: comparisons yield 0 or 1
return y;
}
We can shorten it by realizing y is unnecessary:
int f(int x) {
x -= 97;
x &= 0xFF;
return (unsigned)x <= 25;
}
The assembly of this is an exact match on Godbolt Compiler Explorer (x86-64 gcc8.2 -O2): https://godbolt.org/z/fQ0LVR
I am looking for a fast modulo 10 algorithm because I need to speed up my program, which does many modulo operations in loops.
I have checked out this page which compares some alternatives.
As far as I understand it, T3 was the fastest of them all.
My question is: what would x % y look like using the T3 technique?
I copied T3 technique here for simplicity in case the link gets down.
for (int x = 0; x < max; x++)
{
if (y > (threshold - 1))
{
y = 0; //reset
total += x;
}
y += 1;
}
Regarding the comments: if this is not really faster than a regular mod, then I am looking for a modulo that is at least 2 times faster than using %.
I have seen many examples that use powers of two, but since 10 is not a power of two, how can I get this to work?
Edit:
For my program, let's say I have two nested for loops, where n = 1 000 000 and m = 1000.
Looks like this:
for (i = 1; i <= n; i++) {
D[(i%10)*m] = i;
for (j = 1; j <= m; j++) {
...
}
}
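If the goal is just to get rid of the % in that outer loop, the same counter idea as in T3 applies: carry i % 10 along and reset it instead of dividing. A sketch (assuming i really does advance by 1 each iteration):
int r = 1;                        /* r tracks i % 10; i starts at 1 */
for (i = 1; i <= n; i++) {
    D[r * m] = i;
    for (j = 1; j <= m; j++) {
        ...
    }
    if (++r == 10)                /* next value of i % 10 without a division */
        r = 0;
}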
Here's the fastest modulo-10 function you can write:
unsigned mod10(unsigned x)
{
return x % 10;
}
And here's what it looks like once compiled:
movsxd rax, edi
imul rcx, rax, 1717986919
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
sub eax, ecx
ret
Note the lack of division/modulus instructions, the mysterious constants, and the use of an instruction (lea) that was originally intended for array indexing. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.
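For the curious, the mysterious constant is a fixed-point reciprocal of 10, and the same trick can be written out by hand. A sketch for unsigned 32-bit values (not something you normally need to do, since the compiler already does it for you):
#include <stdint.h>
unsigned mod10_by_reciprocal(uint32_t x)
{
    /* 0xCCCCCCCD is ceil(2^35 / 10); the multiply-and-shift computes x / 10 */
    uint32_t q = (uint32_t)(((uint64_t)x * 0xCCCCCCCDu) >> 35);
    return x - q * 10;            /* remainder = x - (x / 10) * 10 */
}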
You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]
010341C8 cdq
010341C9 mov ecx,0Ah
010341CE idiv eax,ecx
010341D0 mov dword ptr [foo],edx
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h
00BD1013 imul esi
00BD1015 sar edx,2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh
00BD101D add ecx,edx
00BD101F lea eax,[ecx+ecx*4]
00BD1022 add eax,eax
00BD1024 sub esi,eax
The code isn't a direct substitute for modulo; it replaces modulo in that particular situation. You can write your own mod by analogy (for a, b > 0):
int mod(int a, int b) {
while (a >= b) a -= b;
return a;
}
… but whether that’s faster than % is highly questionable.
This will work for (multiword) values larger than the machine word, assuming a binary computer. It processes the value one hex digit at a time, relying on the fact that every power of 16 is congruent to 6 modulo 10:
#include <stdio.h>
unsigned long mod10(unsigned long val)
{
unsigned res=0;
res =val &0xf;
while (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += 6 * (val&0xf);
while (res >= 10) { res -= 10; }
}
return res;
}
int main (int argc, char **argv)
{
unsigned long val;
unsigned res;
sscanf(argv[1], "%lu", &val);
res = mod10(val);
printf("%lu -->%u\n", val,res);
return 0;
}
UPDATE:
With some extra effort, you could get the algorithm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
static unsigned long mod10_1(unsigned long val)
{
unsigned char res=0; //just to show that we don't need a big accumulator
res =val &0xf; // res can never be > 15
if (res>=10) { res -= 10; }
for(val >>= 4; val; val >>= 4){
res += ((val&0xf)<<2) + ((val&0xf)<<1); /* 6 * (val&0xf), without a multiply */
res= mod10_1(res); // the recursive call
}
return res;
}
And the result for mod10_1 appears to be mul/div free and almost without branches:
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax
leal -10(%rax), %edx
cmpb $10, %al
cmovnb %edx, %eax
movq %rdi, %rdx
shrq $4, %rdx
testq %rdx, %rdx
je .L12
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:
movl %edx, %ecx
andl $15, %ecx
leal (%rcx,%rcx,2), %ecx
leal (%rax,%rcx,2), %eax
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx
testq %rdx, %rdx
jne .L4
popq %rbx
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:
movzbl %al, %eax
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function
For university work, I have to investigate a simple optimization: inlining.
Here is the basic code:
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define ITER 1000
#define N 3000000
int i, j;
float x[N], y[N], z[N];
void add(float x, float y, float *z){
*z = x + y;
}
void initialVersion(){
struct timeval inicio, final;
double time;
gettimeofday(&inicio, 0);
for(j = 0; j < ITER; j++){
for(i = 0; i < N; i++){
add(x[i], y[i], &z[i]);
}
}
gettimeofday(&final, 0);
time = (final.tv_sec - inicio.tv_sec + (final.tv_usec - inicio.tv_usec)/1.e6);
printf("Time: %f\n", time);
}
And here is the code with inlining:
#include <stdio.h>
#include <sys/time.h>
#include <stdlib.h>
#define ITER 1000
#define N 3000000
int i, j;
float x[N], y[N], z[N];
void inliningVersion(){
struct timeval inicio, final;
double time;
gettimeofday(&inicio, 0);
for(j = 0; j < ITER; j++){
for(i = 0; i < N; i++){
z[i] = x[i] + y[i];
}
}
gettimeofday(&final, 0);
time = (final.tv_sec - inicio.tv_sec + (final.tv_usec - inicio.tv_usec)/1.e6);
printf("Time: %f\n", time);
}
Compiling with gcc using the -O0 option, the results are 14.27 seconds for the basic version and 4.45 seconds for the version with inlining. Is that common? I executed the program 10 times and the results are always similar. What do you think?
Then, compiling with the -O1 option, the results are similar for both versions, approximately 1.5 seconds, so I suppose that gcc does the inlining for me at -O1.
By the way, I know that gettimeofday measures wall-clock time and not only the time used by the program itself, but I am required to use that function specifically.
Thanks in advance!
Let us analyze the assembly output generated by GCC 7.2 (with -O0) for both versions of the code.
Without inlining
First, let's check how much work has to be done by the computer to achieve the task with a separate function:
void add(float x, float y, float *z){
*z = x + y;
}
int main ()
{
float x[100], y[100], z[100];
for(int i = 0; i < 100; i++){
add(x[i], y[i], &z[i]);
}
}
For the above code, GCC produces an assembly as given below:
add(float, float, float*):
pushq %rbp
movq %rsp, %rbp
movss %xmm0, -4(%rbp)
movss %xmm1, -8(%rbp)
movq %rdi, -16(%rbp)
movss -4(%rbp), %xmm0
addss -8(%rbp), %xmm0
movq -16(%rbp), %rax
movss %xmm0, (%rax)
nop
popq %rbp
ret
main:
pushq %rbp
movq %rsp, %rbp
subq $1224, %rsp
movl $0, -4(%rbp)
.L4:
cmpl $99, -4(%rbp)
jg .L3
leaq -1216(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
salq $2, %rdx
addq %rax, %rdx
movl -4(%rbp), %eax
cltq
movss -816(%rbp,%rax,4), %xmm0
movl -4(%rbp), %eax
cltq
movl -416(%rbp,%rax,4), %eax
movq %rdx, %rdi
movaps %xmm0, %xmm1
movl %eax, -1220(%rbp)
movss -1220(%rbp), %xmm0
call add(float, float, float*)
addl $1, -4(%rbp)
jmp .L4
.L3:
movl $0, %eax
leave
ret
The processing part of the code takes approximately 32 instructions (the instructions between the labels .L4 and .L3, plus those of the add function).
A large majority of the instructions are used for making the function call.
A simplified way to understand how function calls work is:
arguments are pushed on the call stack
return address is pushed on to the call stack
the function is called
make a copy of the frame pointer
make room for locals on the stack
actual function code is executed
restore the state as it was before the function call
return to the caller
The above steps (except 6th) take additional instructions to do the required processing. This is called the function call overhead.
With inlining
Now let's check how much work the computer has to do if the function was inlined.
int main ()
{
float x[100], y[100], z[100];
for(int i = 0; i < 100; i++){
z[i] = x[i] + y[i];
}
}
For the above code, GCC produces an assembly output as given below:
main:
pushq %rbp
movq %rsp, %rbp
subq $1096, %rsp
movl $0, -4(%rbp)
.L3:
cmpl $99, -4(%rbp)
jg .L2
movl -4(%rbp), %eax
cltq
movss -416(%rbp,%rax,4), %xmm1
movl -4(%rbp), %eax
cltq
movss -816(%rbp,%rax,4), %xmm0
addss %xmm1, %xmm0
movl -4(%rbp), %eax
cltq
movss %xmm0, -1216(%rbp,%rax,4)
addl $1, -4(%rbp)
jmp .L3
.L2:
movl $0, %eax
leave
ret
The processing code (the instructions between the labels .L3 and .L2) has around 14 instructions. In this assembly output, none of the instructions responsible for making a function call are present, which saves a considerable number of CPU cycles.
In general, the overhead of a function call is not relevant when the function's running time is several times larger than the overhead of the call. In your code, the running time of the function body is quite small, so the function call overhead becomes significant.
If you use the -O1 flag, the compiler indeed does the inlining for you. You can verify this by checking the assembly generated with -O1, or you can check the GCC manual for the list of optimizations enabled at -O1.
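If you want the call inlined even without raising the optimization level, one option is to keep the helper function but mark it for inlining; a sketch (always_inline is a GCC/Clang extension):
/* inlined by GCC even at -O0; the attribute is a GCC/Clang extension */
static inline __attribute__((always_inline))
void add(float x, float y, float *z)
{
    *z = x + y;
}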
You can generate assembly output using the -S flag, or you can do it online with the Godbolt compiler explorer (the assembly outputs for this post were taken from there).
I am trying to write an assembly function that uses SSE and the FPU for parallel calculations. Unfortunately I am getting a segmentation fault (core dumped); while debugging, it doesn't show up inside the assembly function. I also cannot step out of the assembly function. Gdb shows:
Warning:
Cannot insert breakpoint 0.
Cannot access memory at address 0x2bffff
after the ret instruction.
I'm out of ideas about what may be causing this behaviour. Maybe some of you see something I don't? Cheers.
Integrals.s
#float intgr_vert(float x)
#{
# return pow(x, 2) - 4*x + 6;
#}
s_precision = 0x007f
.bss
.lcomm holder, 4
.lcomm rect_size_vec, 16
.lcomm x_vec, 16
.lcomm result, 16
.data
four:
.float 4.0, 4.0, 4.0, 4.0
six:
.float 6.0, 6.0, 6.0, 6.0
.globl four_intgr_strips
.type four_intgr_strips, #function
four_intgr_strips:
pushl %eax
pushl %ecx
pushl %edx
pushl %ebp
movl %esp, %ebp
subl $2, %esp
movl $0, %edi
movl 20(%ebp), %eax #x position
movl 24(%ebp), %ebx #rectangle size
movw $s_precision, -2(%ebp)
finit
fldcw -2(%ebp)
pool:
movl %eax, x_vec(, %edi, 4)
movl %ebx, rect_size_vec(, %edi, 4)
movl %eax, holder
flds holder
movl %ebx, holder
flds holder #adding size of rectangle to calculate different x
fadd %st(1), %st(0)
fstps holder
movl holder, %eax
inc %edi
cmp $4, %edi
je pool_dne
jmp pool
pool_dne:
ret ###########################can't go further
.type sumAreas, #function
sumAreas:
movl $0, %edi
flds result(, %edi, 4)
inc %edi
loop:
flds result(, %edi, 4)
fadd %st(1), %st(0)
inc %edi
cmp $4, %edi
je end_loop
jmp loop
end_loop:
ret
.type calcAreas, #function
calcAreas:
movaps rect_size_vec, %xmm1
mulps %xmm1, %xmm0
movaps %xmm0, result
ret
.type calcVertical, #function
calcVertical:
movaps x_vec, %xmm0
mulps %xmm0, %xmm0
movaps x_vec, %xmm1
movups four, %xmm2
mulps %xmm1, %xmm2
subps %xmm2, %xmm0
movups six, %xmm1
addps %xmm1, %xmm0
ret
main.c
#include <stdio.h>
#include <math.h>
// x^2 - 4x + 6 integral
float four_intgr_strips(float, float);
float calc_intgr_in_as(int a, int n, float rect_size)
{
float sum = 0;
float four_rect_area;
float last_rect_l_corner = a;
for(int i = 0; i != n/4; i++)
{
four_rect_area = four_intgr_strips(last_rect_l_corner, rect_size);
sum = sum + four_rect_area;
last_rect_l_corner = last_rect_l_corner + 4*rect_size;
}
return sum;
}
int main()
{
int a, b, n;
float rect_size;
float sum;
printf("\nType integral lower bound:");
scanf("%d", &a);
printf("\nType integral upper bound:");
scanf("%d", &b);
do
{
printf("\nType rectangles number(must be multiple of 4):");
scanf("%d", &n);
}
while(n % 4 != 0);
rect_size = (float)(b - a)/n;
sum = calc_intgr_in_as(a, n, rect_size);
printf("\nArea under function is: %f with SSE", sum);
}
You have forgotten to clean up the stack.
In the prologue you have:
pushl %eax
pushl %ecx
pushl %edx
pushl %ebp
movl %esp, %ebp
You obviously need to undo that before you ret, such as:
movl %ebp, %esp
popl %ebp
popl %edx
popl %ecx
popl %eax
ret
PS: I have already told you that unaligning the stack is a bad idea; sooner or later it will bite you. Also, next time you ask a question, mention what input you used and what output you expected.