Fast modulo 10 in C

I am looking for a fast modulo 10 algorithm because I need to speed up my program, which does many modulo operations in loops.
I have checked out this page, which compares some alternatives.
If I understand it correctly, T3 was the fastest of all.
My question is, what would x % y look like using the T3 technique?
I copied the T3 technique here for simplicity, in case the link goes down.
for (int x = 0; x < max; x++)
{
    if (y > (threshold - 1))
    {
        y = 0; //reset
        total += x;
    }
    y += 1;
}
Regarding the comments: if this is not actually faster than a regular mod, I am looking for a modulo that is at least 2 times faster than using %.
I have seen many examples that use powers of two, but since 10 is not a power of two, how can I get that to work?
Edit:
For my program, let's say I have 2 nested for loops, where n = 1,000,000 and m = 1000.
It looks like this:
for (i = 1; i <= n; i++) {
    D[(i%10)*m] = i;
    for (j = 1; j <= m; j++) {
        ...
    }
}

Here's the fastest modulo-10 function you can write:
unsigned mod10(unsigned x)
{
    return x % 10;
}
And here's what it looks like once compiled:
movsxd rax, edi
imul rcx, rax, 1717986919
mov rdx, rcx
shr rdx, 63
sar rcx, 34
add ecx, edx
add ecx, ecx
lea ecx, [rcx + 4*rcx]
sub eax, ecx
ret
Note the lack of division/modulus instructions, the mysterious constants, the use of an instruction which was originally intended for complex array indexing, etc. Needless to say, the compiler knows a lot of tricks to make your program as fast as possible. You'll rarely beat it on tasks like this.
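For reference, the same magic-multiply trick can be spelled out in C. Below is a sketch using the well-known reciprocal constant for unsigned division by 10 (0xCCCCCCCD with a total shift of 35, not the signed constant the compiler picked above); since the compiler already does this transformation for you, don't expect it to beat a plain x % 10:
#include <stdint.h>

/* Sketch: x % 10 via a multiply by the unsigned "magic" reciprocal of 10.
 * This mirrors what the compiler emits anyway; shown only to illustrate
 * the trick behind the constants. */
static inline uint32_t mod10_magic(uint32_t x)
{
    uint32_t q = (uint32_t)(((uint64_t)x * 0xCCCCCCCDu) >> 35); /* q = x / 10 */
    return x - q * 10u;                                         /* remainder  */
}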

You likely can't beat the compiler.
Debug build
// int foo = x % 10;
010341C5 mov eax,dword ptr [x]
010341C8 cdq
010341C9 mov ecx,0Ah
010341CE idiv eax,ecx
010341D0 mov dword ptr [foo],edx
Retail build (doing some ninja math there...)
// int foo = x % 10;
00BD100E mov eax,66666667h
00BD1013 imul esi
00BD1015 sar edx,2
00BD1018 mov ecx,edx
00BD101A shr ecx,1Fh
00BD101D add ecx,edx
00BD101F lea eax,[ecx+ecx*4]
00BD1022 add eax,eax
00BD1024 sub esi,eax
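Read back into C, that retail sequence does roughly the following. This is a hand-written reconstruction for illustration only (it assumes the usual arithmetic right shift of negative values), not something to prefer over a plain x % 10:
#include <stdint.h>

/* Rough C equivalent of the retail-build sequence above (illustration only). */
static int32_t mod10_signed(int32_t x)
{
    int64_t t = (int64_t)x * 0x66666667LL;  /* imul by the magic constant         */
    int32_t q = (int32_t)(t >> 34);         /* sar edx,2: high half shifted by 2  */
    q += (uint32_t)q >> 31;                 /* shr/add: round toward zero for x<0 */
    return x - q * 10;                      /* lea/add/sub: x - 10*quotient       */
}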

The code isn't a direct substitute for modulo; it replaces the modulo in that particular situation. You can write your own mod by analogy (for a, b > 0):
int mod(int a, int b) {
    while (a >= b) a -= b;
    return a;
}
… but whether that’s faster than % is highly questionable.
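What can pay off is not computing the modulo at all. In the loop from the edit, i only ever grows by 1, so i % 10 can be carried along in a counter that wraps, which is exactly what the T3 snippet does. A sketch (the function name fill is made up; D, n and m are assumed to be set up as in the question):
/* Sketch: the question's loop with i % 10 tracked by a wrapping counter. */
void fill(int *D, int n, int m)
{
    int r = 1;                      /* holds i % 10; i starts at 1 */
    for (int i = 1; i <= n; i++) {
        D[r * m] = i;
        for (int j = 1; j <= m; j++) {
            /* ... inner work from the question ... */
        }
        if (++r == 10)              /* wrap instead of computing i % 10 */
            r = 0;
    }
}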

This will work for (multiword) values larger than the machine word (but assuming a binary computer). It relies on the fact that 16 mod 10 is 6, and 6 * 6 mod 10 is again 6, so every hex digit above the lowest contributes 6 times its value, modulo 10:
#include <stdio.h>
unsigned long mod10(unsigned long val)
{
    unsigned res = val & 0xf;       /* the lowest hex digit counts as itself */
    while (res >= 10) { res -= 10; }
    for (val >>= 4; val; val >>= 4) {
        res += 6 * (val & 0xf);     /* every higher hex digit counts 6x      */
        while (res >= 10) { res -= 10; }
    }
    return res;
}
int main (int argc, char **argv)
{
    unsigned long val;
    unsigned res;

    sscanf(argv[1], "%lu", &val);
    res = mod10(val);
    printf("%lu -->%u\n", val, res);
    return 0;
}
UPDATE:
With some extra effort, you could make the algorithm free of multiplications, and with the proper amount of optimisation we can even get the recursive call inlined:
static unsigned long mod10_1(unsigned long val)
{
    unsigned char res = 0;  //just to show that we don't need a big accumulator
    res = val & 0xf;        // res can never be > 15
    if (res >= 10) { res -= 10; }
    for (val >>= 4; val; val >>= 4) {
        res += ((val & 0xf) << 2) + ((val & 0xf) << 1); // 4*n + 2*n = 6*n, still no multiply
        res = mod10_1(res); // the recursive call
    }
    return res;
}
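A quick throwaway way to convince yourself it agrees with % (not part of the answer; it assumes a file containing just mod10_1 from above plus this main):
#include <stdio.h>

int main(void)
{
    unsigned long tests[] = { 0UL, 7UL, 10UL, 99UL, 123456789UL, 0xFFFFFFFFUL };
    for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++) {
        unsigned long v = tests[i];
        printf("%lu: %% gives %lu, mod10_1 gives %lu\n", v, v % 10UL, mod10_1(v));
    }
    return 0;
}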
And the result for mod10_1 appears to be mul/div free and almost without branches:
mod10_1:
.LFB25:
.cfi_startproc
movl %edi, %eax
andl $15, %eax
leal -10(%rax), %edx
cmpb $10, %al
cmovnb %edx, %eax
movq %rdi, %rdx
shrq $4, %rdx
testq %rdx, %rdx
je .L12
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
.L4:
movl %edx, %ecx
andl $15, %ecx
leal (%rcx,%rcx,2), %ecx
leal (%rax,%rcx,2), %eax
movl %eax, %ecx
movzbl %al, %esi
andl $15, %ecx
leal -10(%rcx), %r9d
cmpb $9, %cl
cmovbe %ecx, %r9d
shrq $4, %rsi
leal (%rsi,%rsi,2), %ecx
leal (%r9,%rcx,2), %ecx
movl %ecx, %edi
movzbl %cl, %ecx
andl $15, %edi
testq %rsi, %rsi
setne %r10b
cmpb $9, %dil
leal -10(%rdi), %eax
seta %sil
testb %r10b, %sil
cmove %edi, %eax
shrq $4, %rcx
andl $1, %r10d
leal (%rcx,%rcx,2), %r8d
movl %r10d, %r11d
leal (%rax,%r8,2), %r8d
movl %r8d, %edi
andl $15, %edi
testq %rcx, %rcx
setne %sil
leal -10(%rdi), %ecx
andl %esi, %r11d
cmpb $9, %dil
seta %bl
testb %r11b, %bl
cmovne %ecx, %edi
andl $1, %r11d
andl $240, %r8d
leal 6(%rdi), %ebx
setne %cl
movl %r11d, %r8d
andl %ecx, %r8d
leal -4(%rdi), %ebp
cmpb $9, %bl
seta %r12b
testb %r8b, %r12b
cmovne %ebp, %ebx
andl $1, %r8d
cmovne %ebx, %edi
xorl $1, %ecx
andl %r11d, %ecx
orb %r8b, %cl
cmovne %edi, %eax
xorl $1, %esi
andl %r10d, %esi
orb %sil, %cl
cmove %r9d, %eax
shrq $4, %rdx
testq %rdx, %rdx
jne .L4
popq %rbx
.cfi_restore 3
.cfi_def_cfa_offset 24
popq %rbp
.cfi_restore 6
.cfi_def_cfa_offset 16
movzbl %al, %eax
popq %r12
.cfi_restore 12
.cfi_def_cfa_offset 8
ret
.L12:
movzbl %al, %eax
ret
.cfi_endproc
.LFE25:
.size mod10_1, .-mod10_1
.p2align 4,,15
.globl mod10
.type mod10, #function

Related

Trying to translate a C function to x86_64 AT&T assembly

I've been trying to translate this function to assembly:
void foo (int a[], int n) {
    int i;
    int s = 0;
    for (i = 0; i < n; i++) {
        s += a[i];
        if (a[i] == 0) {
            a[i] = s;
            s = 0;
        }
    }
}
But something is going wrong. Here's what I've done so far:
.section .text
.globl foo
foo:
.L1:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $0, -16(%rbp) /*s*/
movl $0, -8(%rbp) /*i*/
jmp .L2
.L2:
cmpl -8(%rbp), %esi
jle .L4
leave
ret
.L3:
addl $1, -8(%rbp)
jmp .L2
.L4:
movl -8(%rbp), %eax
imull $4, %eax
movslq %eax, %rax
addq %rdi, %rax
movl (%rax), %eax
addl %eax, -16(%rbp)
cmpl $0, %eax
jne .L3
/* if */
leaq (%rax), %rdx
movl -16(%rbp), %eax
movl %eax, (%rdx)
movl $0, -16(%rbp)
jmp .L3
I am compiling the .s module together with a .c module that has, for example, int nums[5] = {65, 23, 11, 0, 34}, and I'm getting back the same array instead of {65, 23, 11, 99, 34}.
Could someone help me?
Presumably you have a compiler that can generate AT&T syntax. It might be more instructive to look at what assembly output the compiler generates. Here's my re-formulation of your demo:
#include <stdio.h>

void foo (int a[], int n)
{
    for (int s = 0, i = 0; i < n; i++)
    {
        if (a[i] != 0)
            s += a[i];
        else
            a[i] = s, s = 0;
    }
}

int main (void)
{
    int nums[] = {65, 23, 11, 0, 34};
    int size = sizeof(nums) / sizeof(int);

    foo(nums, size);

    for (int i = 0; i < size; i++)
        fprintf(stdout, i < (size - 1) ? "%d, " : "%d\n", nums[i]);

    return (0);
}
Code compiled without optimizations enabled is typically harder to work through than optimized code, since everything is loaded from and spilled back to memory between statements. You won't learn much from it if you're investing time in learning how to write efficient assembly.
Compiling with the Godbolt compiler explorer with -O2 optimizations yields much more efficient code; it's also useful for cutting out unnecessary directives, labels, etc., that would be visual noise in this case.
In my experience, the -O2 optimizations are clever enough to make you rethink your use of registers, refactoring, etc. -O3 can sometimes optimize too aggressively - unrolling loops, vectorizing, etc. - to be easy to follow.
Finally, for the case you have presented, there's a perfect compromise: -Os, which enables many of the optimizations of -O2, but not at the expense of increased code size. I'll paste the assembly here just for comparative purposes:
foo:
xorl %eax, %eax
xorl %ecx, %ecx
.L2:
cmpl %eax, %esi
jle .L7
movl (%rdi,%rax,4), %edx
testl %edx, %edx
je .L3
addl %ecx, %edx
jmp .L4
.L3:
movl %ecx, (%rdi,%rax,4)
.L4:
incq %rax
movl %edx, %ecx
jmp .L2
.L7:
ret
Remember that the calling convention passes the pointer a in %rdi and the count n in %esi. Notice that by the time your code stores s back, it no longer has the element's address: movl (%rax), %eax overwrites the address in %rax with the loaded value, so the following leaq (%rax), %rdx / movl %eax, (%rdx) does not write into a[i]. It's definitely worth stepping through the code - even with pen and paper if it helps - to understand the branch conditions and how reading and writing of element a[i] is performed.
Curiously, using the inner loop of your code:
s += a[i];
if (a[i] == 0)
    a[i] = s, s = 0;
Appears to generate more efficient code with -Os than the inner loop I used:
foo:
xorl %eax, %eax
xorl %edx, %edx
.L2:
cmpl %eax, %esi
jle .L6
movl (%rdi,%rax,4), %ecx
addl %ecx, %edx
testl %ecx, %ecx
jne .L3
movl %edx, (%rdi,%rax,4)
xorl %edx, %edx
.L3:
incq %rax
jmp .L2
.L6:
ret
A reminder for me to keep things simple!

Reverse-engineering asm using sub / cmp / setbe back to C? My attempt is compiling to branches

This is the assembly code I am supposed to translate:
f1:
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
Here's the C code I wrote that I think is equivalent:
int f1(int y){
    int x = y - 97;
    int i = 0;
    if (x <= 25) {
        x = i;
    }
    return x;
}
And here's what I get from compiling the C code:
_f1: ## #f1
.cfi_startproc
%bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
## kill: def %edi killed %edi def %rdi
leal -97(%rdi), %ecx
xorl %eax, %eax
cmpl $123, %edi
cmovgel %ecx, %eax
popq %rbp
retq
.cfi_endproc
I was wondering if this is correct, and what should be different. Could anyone also help explain how jmps work? I am also trying to translate this assembly code and have gotten stuck:
f2:
cmpl $1, %edi
jle .L6
movl $2, %edx
movl $1, %eax
jmp .L5
.L8:
movl %ecx, %edx
.L5:
imull %edx, %eax
leal 1(%rdx), %ecx
cmpl %eax, %edi
jg .L8
.L4:
cmpl %edi, %eax
sete %al
movzbl %al, %eax
ret
.L6:
movl $1, %eax
jmp .L4
gcc8.3 -O3 emits exactly the asm in the question for this way of writing the range check using the unsigned-compare trick.
int is_ascii_lowercase_v2(int y){
    unsigned char x = y - 'a';
    return x <= (unsigned)('z' - 'a');
}
Narrowing to 8-bit after the int subtract matches the asm more exactly, but it's not necessary for correctness or even to convince compilers to use a 32-bit sub. For unsigned char y, the upper bytes of RDI are allowed to hold arbitrary garbage (x86-64 System V calling convention), but carry only propagates from low to high with sub and add.
The low 8 bits of the result (which is all the cmp reads) would be the same with sub $'a', %dil or sub $'a', %edi.
Writing it as a normal range-check also gets gcc to emit identical code, because compilers know how to optimize range-checks. (And gcc chooses to use 32-bit operand-size for the sub, unlike clang which uses 8-bit.)
int is_ascii_lowercase_v3(char y){
    return (y >= 'a' && y <= 'z');
}
On the Godbolt compiler explorer, this and _v2 compile as follows:
## gcc8.3 -O3
is_ascii_lowercase_v3: # and _v2 is identical
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
Returning a compare result as an integer, instead of using an if, much more naturally matches the asm.
But even writing it "branchlessly" in C won't match the asm unless you enable optimization. The default code-gen from gcc/clang is -O0: anti-optimize for consistent debugging, storing/reloading everything to memory between statements. (And function args on function entry.) You need optimization, because -O0 code-gen is (intentionally) mostly braindead, and nasty looking. See How to remove "noise" from GCC/clang assembly output?
## gcc8.3 -O0
is_ascii_lowercase_v2:
pushq %rbp
movq %rsp, %rbp
movl %edi, -20(%rbp)
movl -20(%rbp), %eax
subl $97, %eax
movb %al, -1(%rbp)
cmpb $25, -1(%rbp)
setbe %al
movzbl %al, %eax
popq %rbp
ret
gcc and clang with optimization enabled will do if-conversion to branchless code when it's efficient. e.g.
int is_ascii_lowercase_branchy(char y){
    unsigned char x = y - 'a';
    if (x <= 25U) {
        return 1;
    }
    return 0;
}
still compiles to the same asm with GCC8.3 -O3
is_ascii_lowercase_branchy:
subl $97, %edi
xorl %eax, %eax
cmpb $25, %dil
setbe %al
ret
We can tell that the optimization level was at least gcc -O2. At -O1, gcc uses the less efficient setbe / movzx instead of xor-zeroing EAX ahead of setbe
is_ascii_lowercase_v2:
subl $97, %edi
cmpb $25, %dil
setbe %al
movzbl %al, %eax
ret
I could never get clang to reproduce exactly the same sequence of instructions. It likes to use add $-97, %edi, and cmp with $26 / setb.
Or it will do really interesting (but sub-optimal) things like this:
# clang7.0 -O3
is_ascii_lowercase_v2:
addl $159, %edi # 256-97 = 8-bit version of -97
andl $254, %edi # 0xFE; I haven't figured out why it's clearing the low bit as well as the high bits
xorl %eax, %eax
cmpl $26, %edi
setb %al
retq
So the addl $159 is really x - 97 carried out in 8 bits: 159 is -97 reduced mod 256, i.e. the 2's complement identity -x = ~x + 1 applied to 97.
Here is an annotated version of the assembly:
# %edi is the first argument, we denote x
subl $97, %edi
# x -= 97
# %eax is the return value, we denote y
xorl %eax, %eax
# y = 0
# %dil is the least significant byte (lsb) of x
cmpb $25, %dil
# %al is lsb(y) which is already zeroed
setbe %al
# if lsb(x) <= 25 then lsb(y) = 1
# setbe is unsigned version, setle would be signed
ret
# return y
So a verbose C equivalent is:
int f(int x) {
    int y = 0;
    x -= 97;
    x &= 0xFF;              // x = lsb(x) using 0xFF as a bitmask
    y = (unsigned)x <= 25;  // Section 6.5.8 of C standard: comparisons yield 0 or 1
    return y;
}
We can shorten it by realizing y is unnecessary:
int f(int x) {
    x -= 97;
    x &= 0xFF;
    return (unsigned)x <= 25;
}
The assembly of this is an exact match on Godbolt Compiler Explorer (x86-64 gcc8.2 -O2): https://godbolt.org/z/fQ0LVR

Why is `switch` so slow?

In a bytecode interpreting loop, after several tests, I'm surprised to see that using switch is the worst choice to make. Making calls to a function pointer array, or using gcc's computed goto extension is always 10~20% faster, the computed goto version being the fastest. I've tested with my real toy VM with 97 instructions and with the mini fake VM pasted below.
Why is using switch the slowest?
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <inttypes.h>
#include <time.h>

enum {
    ADD1 = 1,
    ADD2,
    SUB3,
    SUB4,
    MUL5,
    MUL6,
};

static unsigned int number;

static void add1(void) {
    number += 1;
}

static void add2(void) {
    number += 2;
}

static void sub3(void) {
    number -= 3;
}

static void sub4(void) {
    number -= 4;
}

static void mul5(void) {
    number *= 5;
}

static void mul6(void) {
    number *= 6;
}

static void interpret_bytecodes_switch(uint8_t *bcs) {
    while (true) {
        switch (*bcs++) {
        case 0:
            return;
        case ADD1:
            add1();
            break;
        case ADD2:
            add2();
            break;
        case SUB3:
            sub3();
            break;
        case SUB4:
            sub4();
            break;
        case MUL5:
            mul5();
            break;
        case MUL6:
            mul6();
            break;
        }
    }
}

static void interpret_bytecodes_function_pointer(uint8_t *bcs) {
    void (*fs[])(void) = {
        NULL,
        add1,
        add2,
        sub3,
        sub4,
        mul5,
        mul6,
    };
    while (*bcs) {
        fs[*bcs++]();
    }
}

static void interpret_bytecodes_goto(uint8_t *bcs) {
    void *labels[] = {
        &&l_exit,
        &&l_add1,
        &&l_add2,
        &&l_sub3,
        &&l_sub4,
        &&l_mul5,
        &&l_mul6,
    };
#define JUMP goto *labels[*bcs++]
    JUMP;
l_exit:
    return;
l_add1:
    add1();
    JUMP;
l_add2:
    add2();
    JUMP;
l_sub3:
    sub3();
    JUMP;
l_sub4:
    sub4();
    JUMP;
l_mul5:
    mul5();
    JUMP;
l_mul6:
    mul6();
    JUMP;
#undef JUMP
}

struct timer {
    clock_t start, end;
};

static void timer_start(struct timer *self) {
    self->start = clock();
}

static void timer_end(struct timer *self) {
    self->end = clock();
}

static double timer_measure(struct timer *self) {
    return (double)(self->end - self->start) / CLOCKS_PER_SEC;
}

static void test(void (*f)(uint8_t *), uint8_t *bcs) {
    number = 0;
    struct timer timer;
    timer_start(&timer);
    f(bcs);
    timer_end(&timer);
    printf("%u %.3fs\n", number, timer_measure(&timer));
}

int main(void) {
    const int N = 300000000;
    srand((unsigned)time(NULL));
    uint8_t *bcs = malloc(N + 1);
    for (int i = 0; i < N; ++i) {
        bcs[i] = rand() % 5 + 1;
    }
    bcs[N] = 0;
    for (int i = 0; i < 10; ++i) {
        printf("%d ", bcs[i]);
    }
    printf("\nswitch\n");
    test(interpret_bytecodes_switch, bcs);
    printf("function pointer\n");
    test(interpret_bytecodes_function_pointer, bcs);
    printf("goto\n");
    test(interpret_bytecodes_goto, bcs);
    return 0;
}
result
~$ gcc vm.c -ovm -std=gnu11 -O3
~$ ./vm
3 4 5 3 3 5 3 3 1 2
switch
3050839589 2.866s
function pointer
3050839589 2.573s
goto
3050839589 2.433s
~$ ./vm
3 1 1 3 5 5 2 4 5 1
switch
3898179583 2.871s
function pointer
3898179583 2.573s
goto
3898179583 2.431s
~$ ./vm
5 5 1 2 3 3 1 2 2 4
switch
954521520 2.869s
function pointer
954521520 2.574s
goto
954521520 2.432s
Below is the relevant disassembly of the code posted here after -O3 optimization.
interpret_bytecodes_switch:
.L8:
addq $1, %rdi
cmpb $6, -1(%rdi)
ja .L8
movzbl -1(%rdi), %edx
jmp *.L11(,%rdx,8)
.L11:
.quad .L10
.quad .L12
.quad .L13
.quad .L14
.quad .L15
.quad .L16
.quad .L17
.L16:
leal (%rax,%rax,4), %eax
jmp .L8
.L15:
subl $4, %eax
jmp .L8
.L14:
subl $3, %eax
jmp .L8
.L13:
addl $2, %eax
jmp .L8
.L12:
addl $1, %eax
jmp .L8
.L10:
movl %eax, number(%rip)
ret
.L17:
leal (%rax,%rax,2), %eax
addl %eax, %eax
jmp .L8
interpret_bytecodes_function_pointer:
pushq %rbx
movq %rdi, %rbx
subq $64, %rsp
movzbl (%rdi), %eax
movq $0, (%rsp)
movq $add1, 8(%rsp)
movq $add2, 16(%rsp)
movq $sub3, 24(%rsp)
movq $sub4, 32(%rsp)
movq $mul5, 40(%rsp)
testb %al, %al
movq $mul6, 48(%rsp)
je .L19
.L23:
addq $1, %rbx
call *(%rsp,%rax,8)
movzbl (%rbx), %eax
testb %al, %al
jne .L23
.L19:
addq $64, %rsp
popq %rbx
ret
interpret_bytecodes_goto:
movzbl (%rdi), %eax
movq $.L27, -72(%rsp)
addq $2, %rdi
movq $.L28, -64(%rsp)
movq $.L29, -56(%rsp)
movq $.L30, -48(%rsp)
movq $.L31, -40(%rsp)
movq $.L32, -32(%rsp)
movq $.L33, -24(%rsp)
movq -72(%rsp,%rax,8), %rax
jmp *%rax
.L33:
movl number(%rip), %eax
leal (%rax,%rax,2), %eax
addl %eax, %eax
movl %eax, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
.L35:
addq $1, %rdi
jmp *%rax
.L28:
addl $1, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L30:
subl $3, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L31:
subl $4, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L32:
movl number(%rip), %eax
leal (%rax,%rax,4), %eax
movl %eax, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L29:
addl $2, number(%rip)
movzbl -1(%rdi), %eax
movq -72(%rsp,%rax,8), %rax
jmp .L35
.L27:
rep ret
switch is slowest because it has to manage default cases, and this may add an extra bounds test you didn't implement.
switch also handles a more general case where the case values are not arranged in such a simple sequence; extra computation may be needed for that.
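If the interpreter guarantees that every opcode it feeds the switch is valid, you can tell the compiler so and let it drop that extra range check. A sketch using GCC's __builtin_unreachable (the function name is made up; it reuses the enum and helpers from the question):
/* Sketch (GCC/clang): promise that only opcodes 0..6 ever occur, so the
 * compiler may drop the bounds check the switch otherwise needs. */
static void interpret_bytecodes_switch_unchecked(uint8_t *bcs) {
    while (true) {
        switch (*bcs++) {
        case 0:    return;
        case ADD1: add1(); break;
        case ADD2: add2(); break;
        case SUB3: sub3(); break;
        case SUB4: sub4(); break;
        case MUL5: mul5(); break;
        case MUL6: mul6(); break;
        default:   __builtin_unreachable(); /* undefined if the promise is broken */
        }
    }
}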
I was in the middle of writing a long answer when you posted the assembly code...
Basically, the goto version uses more "code" to avoid a few instructions (or a single one) in each iteration. It's similar to a size vs. speed optimization.
Since your "real work" is negligible, that makes enough of a difference in the benchmark, but in a real-world scenario those saved instructions become negligible.
You are performing a micro benchmark. Micro benchmarks on modern CPUs can be affected by all kinds of random or unpredictable effects. There is actually very little difference in execution time. However, in order to make the code comparable, you combined switch and function calls, which in real life wouldn't happen for time-critical code.

I'm trying to interpret this IA32 assembly language code

I have this IA32 assembly language code I'm trying to convert into regular C code.
.globl fn
.type fn, #function
fn:
pushl %ebp #setup
movl $1, %eax #setup 1 is in A
movl %esp, %ebp #setup
movl 8(%ebp), %edx # pointer X is in D
cmpl $1, %edx # (*x > 1)
jle .L4
.L5:
imull %edx, %eax
subl $1, %edx
cmpl $1, %edx
jne .L5
.L4:
popl %ebp
ret
The trouble I'm having is deciding what type of comparison is going on. I don't get how the program gets to the .L5 label. .L5 seems to be a loop, since there's a comparison within it. I'm also unsure of what is being returned, because it seems like most of the work is done in the %edx register but never goes back to %eax for returning.
What I have so far:
int fn(int x)
{
}
It looks to me like it's computing a factorial. Ignoring the stack frame manipulation and such, we're left with:
movl $1, %eax #setup 1 is in A
Puts 1 into eax.
movl 8(%ebp), %edx # pointer X is in D
Retrieves a parameter into edx
imull %edx, %eax
Multiplies eax by edx, putting the result into eax.
subl $1, %edx
cmpl $1, %edx
jne .L5
Decrements edx and repeats if edx != 1.
In other words, this is roughly equivalent to:
unsigned fact(unsigned input) {
    unsigned retval = 1;
    for ( ; input != 1; --input)
        retval *= input;
    return retval;
}
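If you want something that mirrors the asm's control flow more closely, including the jle .L4 guard that skips the loop entirely for values <= 1, a sketch (keeping the signed int parameter from the question's skeleton):
int fn(int x)
{
    int result = 1;          /* movl $1, %eax           */
    if (x > 1) {             /* cmpl $1, %edx / jle .L4 */
        do {
            result *= x;     /* imull %edx, %eax        */
            x -= 1;          /* subl $1, %edx           */
        } while (x != 1);    /* cmpl $1, %edx / jne .L5 */
    }
    return result;
}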

Converting a recursive function to tail recursive

I have the following recursive function to count all the nodes having value 20 in a circular doubly linked list. I need to convert this to a tail-recursive function to avoid stack-safety issues. Please help me with the same. Thanks.
int count(node *start)
{
    return count_helper(start, start);
}

int count_helper(node *current, node *start)
{
    int c;
    c = 0;
    if (current == NULL)
        return 0;
    if ((current->roll_no) == 20)
        c = 1;
    if (current->next == start)
        return c;
    return (c + count_helper(current->next, start));
}
In order to take advantage of tail recursion, the recursive call simply has to be the last thing performed. Currently, the only thing standing in the way of this goal is an addition. So, to transform the function, that addition has to be moved around. A common way to accomplish this is by passing the variable c as a parameter to the recursive helper function, like so:
int count(node *start)
{
    return count_helper(start, start, 0);
}

int count_helper(node *current, node *start, int c)
{
    if (current == NULL)
        return c;
    if ((current->roll_no) == 20)
        c += 1;
    if (current->next == start)
        return c;
    return count_helper(current->next, start, c);
}
This unrolls as follows (using gcc 4.6.1, as produced by gcc -S -O2):
count_helper:
.LFB23:
.cfi_startproc
pushl %ebx
.cfi_def_cfa_offset 8
.cfi_offset 3, -8
movl 8(%esp), %edx
movl 12(%esp), %ebx
movl 16(%esp), %eax
testl %edx, %edx
jne .L15
jmp .L10
.p2align 4,,7
.p2align 3
.L14:
testl %edx, %edx
je .L10
.L15:
xorl %ecx, %ecx
cmpl $20, 4(%edx)
movl (%edx), %edx
sete %cl
addl %ecx, %eax
cmpl %ebx, %edx
jne .L14 # <-- this is the key line right here
.L10:
popl %ebx
.cfi_def_cfa_offset 4
.cfi_restore 3
ret
.cfi_endproc
Compare this to your original (done without -O2, as apparently the compiler finds a way to make your original tail recursive as well, although in the process it mucks it up so much that I can barely read it):
count_helper:
.LFB1:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $40, %esp
movl $0, -12(%ebp)
cmpl $0, 8(%ebp)
jne .L3
movl $0, %eax
jmp .L4
.L3:
movl 8(%ebp), %eax
movl 4(%eax), %eax
cmpl $20, %eax
jne .L5
movl $1, -12(%ebp)
.L5:
movl 8(%ebp), %eax
movl (%eax), %eax
cmpl 12(%ebp), %eax
jne .L6
movl -12(%ebp), %eax
jmp .L4
.L6:
movl 8(%ebp), %eax
movl (%eax), %eax
movl 12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call count_helper # <-- this is the key line right here
addl -12(%ebp), %eax
.L4:
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
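For intuition, a tail call like this is exactly what a loop does, which is why the compiler can turn it into the backward jump you see above. A hand-written iterative version (the name count_helper_iterative is made up; it only shows the loop the compiler effectively produces):
int count_helper_iterative(node *current, node *start, int c)
{
    while (current != NULL) {
        if (current->roll_no == 20)
            c += 1;
        if (current->next == start)
            return c;
        current = current->next;
    }
    return c;
}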
