Edit: I understand the confusion. I am not trying to optimize here; as @sergio said, I just could not come up with a better word.
--
I have been writing code in JavaScript and PHP for so long that I sometimes find it hard to optimize my code in C.
What I mean by optimizing is writing a program with less code. Here is an example:
int i;
srand(time(NULL));
for(i = 0; i < 10; i++){
printf(" %d ", rand() % 300);
if(i < 10 - 1){
printf("|");
}
}
In JavaScript I would have written it this way:
// Builds the same " N | N | ... | N " text that the C loop above prints:
// ten random integers in [0, 300), with a '|' after every value except the last.
var html = ''
for(var i = 0; i < 10; i++){
    // i < 9 mirrors the C test `i < 10 - 1`: no trailing separator after the final value.
    html += ' '+Math.floor(Math.random() * 300)+' '+( i < 9 ? '|' : '' )
}
The difference in C is that I had to put the if on another line and could not act inline on the string. I hope you get my point.
So how would you write my code?
Thank you.
"Number of lines" is traditionally a poor measure of code quality; if you cram too much into one line, it becomes unreadable.
for(i = 0; i < 10; i++)
printf("%s %d ", i ? "|" : "", rand() % 300);
Optimizing and writing minimum code are different.
In C, you could use the ternary operator instead of your if statement if you want to just condense the code.
The assembly code generated and its efficiency however probably aren't changing as long as you have the same 1 conditional within 1 loop running N times, no matter how cool it looks, so focus on the algorithm, and not how concise the code is.
This answer is in response to Murilo Vasconcelos:
Use http://gcc.godbolt.org/ to follow along.
#include <stdio.h>
#include <ctime>
#include <cstdlib>
int main() {
int i;
srand(time(NULL));
for(i=0; i< 10; i++){
printf(" %d ", rand() % 300);
if(i < 10 - 1){
printf("|");
}
}
}
Generates the following assembly using g++-4.8:
.LC0:
.string " %d "
main:
pushq %rbp
xorl %edi, %edi
movl $458129845, %ebp
pushq %rbx
xorl %ebx, %ebx
subq $8, %rsp
call time
movl %eax, %edi
call srand
call rand
movl $458129845, %edx
movl $.LC0, %edi
movl %eax, %esi
imull %edx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
.L2:
movl $124, %edi
addl $1, %ebx
call putchar
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebp
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
cmpl $9, %ebx
jne .L2
addq $8, %rsp
xorl %eax, %eax
popq %rbx
popq %rbp
ret
On the other hand this code:
#include <stdio.h>
#include <ctime>
#include <cstdlib>
int main() {
int i;
srand(time(NULL));
for (i = 0; i < 9; i++) {
printf(" %d |", rand() % 300);
}
printf(" %d ", rand() % 300);
}
Generates this assembly:
.LC0:
.string " %d |"
.LC1:
.string " %d "
main:
pushq %rbx
xorl %edi, %edi
movl $458129845, %ebx
call time
movl %eax, %edi
call srand
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $5, %edx
sarl $31, %eax
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC0, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
call rand
movl $.LC1, %edi
movl %eax, %esi
imull %ebx
movl %esi, %eax
sarl $31, %eax
sarl $5, %edx
subl %eax, %edx
xorl %eax, %eax
imull $300, %edx, %edx
subl %edx, %esi
call printf
xorl %eax, %eax
popq %rbx
ret
In other words, by changing the loop it allowed the compiler to unroll the loop, which should be a rather large performance increase that you wouldn't get without your change. So don't let people poo poo you. Check for yourself what the assembly becomes, not all hand optimization is a waste of time. And, of course, test test test.
But, of course, you shouldn't optimize prematurely. You should follow your profiler and let it tell you what your hot spots are, and where you need to optimize.
Better Algorithms first.
Then better code.
Then better assembly.
EDIT: the OP edited the question.
If you just want to write it inline as a single statement:
int i = 0;
for (srand(time(NULL)); i < 10; printf("%d %s ", rand() % 300, (i++ < 9 ? "|" : "")));
Related
So I'm pretty much a noob in IA32 assembly language. I tried compiling this C function into IA32 assembly (-mpreferred-stack-boundary=2):
/*
 * Adds every odd element among a[0..n-1] to `sum` and returns the total.
 * Each call inspects a[n-1] and recurses with n-1; the recursion stops when n == 0.
 * NOTE(review): only n == 0 ends the recursion, so a negative n never reaches the base case.
 */
__attribute__((cdecl))
int odd_sum(int a[], int n, int sum) {
if (n == 0) return sum; /* base case: nothing left to inspect */
else if ((a[n-1] % 2) == 0)
return odd_sum(a, n-1, sum); /* even element: skip it, sum unchanged */
else return odd_sum(a, n-1, sum + a[n-1]); /* odd element: add it to the running sum */
}
and the GCC outputs this:
# GCC output (AT&T syntax, 32-bit) for odd_sum(a, n, sum).
# Stack arguments as used below: 8(%ebp) = a, 12(%ebp) = n, 16(%ebp) = sum.
# The result is left in %eax before the common exit at L3.
.file "test.c"
.text
.globl _odd_sum
.def _odd_sum; .scl 2; .type 32; .endef
_odd_sum:
LFB0:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $12, %esp # reserve 12 bytes: the three outgoing argument slots (%esp), 4(%esp), 8(%esp)
cmpl $0, 12(%ebp) # n == 0 ?
jne L2
movl 16(%ebp), %eax # yes: return sum
jmp L3
L2:
movl 12(%ebp), %eax # eax = n
addl $1073741823, %eax # eax = n + 0x3fffffff
leal 0(,%eax,4), %edx # edx = eax*4; 0x3fffffff*4 = 0xfffffffc = -4 (mod 2^32), so edx = 4*n - 4 = byte offset of a[n-1]
movl 8(%ebp), %eax # eax = a
addl %edx, %eax # eax = &a[n-1]
movl (%eax), %eax # eax = a[n-1]
andl $1, %eax # keep only the lowest bit: 0 if even, 1 if odd
testl %eax, %eax
jne L4 # odd element -> L4
movl 12(%ebp), %eax # even element: call odd_sum(a, n-1, sum)
leal -1(%eax), %edx # edx = n - 1
movl 16(%ebp), %eax
movl %eax, 8(%esp) # third argument: sum
movl %edx, 4(%esp) # second argument: n - 1
movl 8(%ebp), %eax
movl %eax, (%esp) # first argument: a
call _odd_sum
jmp L3 # the callee's result is already in %eax
L4:
movl 12(%ebp), %eax # odd element: same address computation as at L2
addl $1073741823, %eax
leal 0(,%eax,4), %edx # edx = 4*(n-1)
movl 8(%ebp), %eax
addl %edx, %eax # eax = &a[n-1]
movl (%eax), %edx # edx = a[n-1]
movl 16(%ebp), %eax
addl %eax, %edx # edx = sum + a[n-1]
movl 12(%ebp), %eax
subl $1, %eax # eax = n - 1
movl %edx, 8(%esp) # third argument: sum + a[n-1]
movl %eax, 4(%esp) # second argument: n - 1
movl 8(%ebp), %eax
movl %eax, (%esp) # first argument: a
call _odd_sum
L3:
leave # common exit: restore %esp and %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE0:
.ident "GCC: (MinGW.org GCC-8.2.0-3) 8.2.0"
What I am not able to comprehend are these 2 lines:
addl $1073741823, %eax
leal 0(,%eax,4), %edx
I understand those 2 lines should have something to do with the a[n-1], but I can't seem to be able to understand what exactly they do in the process. Can someone help me with this problem please?
It is just a fancy way of computing the offset into the array a[n-1].
1073741823 is 0x3fffffff. If n is 3, for example, it will add them and get 0x40000002. Then it multiplies by 4 with the second instruction, which results in 0x00000008, discarding the top bits.
So we are left with an offset of 8 bytes, which is exactly the offset (in bytes) that you need for a[n-1], i.e. a[2] (when the size of an int is 4 bytes).
To get a more understandable output with the -S flag:
create assembler code:
c++ -S -fverbose-asm -g -O2 (other optimization flags) test.cc -o test.s
create asm interlaced with source lines:
as -alhnd test.s > test.lst
Given the following code
#include <stdio.h>
int main(int argc, char **argv)
{
int k = 0;
for( k = 0; k < 20; ++k )
{
printf( "%d\n", k ) ;
}
}
Using GCC 5.1 or later with
-x c -std=c99 -O3 -funroll-all-loops --param max-completely-peeled-insns=1000 --param max-completely-peel-times=10000
only partially unrolls the loop: it unrolls the body ten times and then does a conditional jump.
.LC0:
.string "%d\n"
main:
pushq %rbx
xorl %ebx, %ebx
.L2:
movl %ebx, %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 1(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 2(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 3(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 4(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 5(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 6(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 7(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 8(%rbx), %esi
movl $.LC0, %edi
xorl %eax, %eax
call printf
leal 9(%rbx), %esi
xorl %eax, %eax
movl $.LC0, %edi
addl $10, %ebx
call printf
cmpl $20, %ebx
jne .L2
xorl %eax, %eax
popq %rbx
ret
But using older versions of GCC, such as 4.9.2, creates the desired assembly
.LC0:
.string "%d\n"
main:
subq $8, %rsp
xorl %edx, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $1, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $2, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $3, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $4, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $5, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $6, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $7, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $8, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $9, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $10, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $11, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $12, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $13, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $14, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $15, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $16, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $17, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $18, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
movl $19, %edx
movl $.LC0, %esi
movl $1, %edi
xorl %eax, %eax
call __printf_chk
xorl %eax, %eax
addq $8, %rsp
ret
Is there a way to force the later versions of GCC to produce the same output?
Using https://godbolt.org/g/D1AR6i to produce the assembly
EDIT: This is not a duplicate question, since the problem of completely unrolling loops with later versions of GCC has not yet been solved. Passing --param max-completely-peeled-insns=1000 --param max-completely-peel-times=10000 has no effect on the generated assembly using GCC >= 5.1
The flags and parameters you are using do not guarantee that the loops will be completely unrolled. The GCC documentation states the following regarding the -funroll-all-loops flag you are using:
turns on complete loop peeling (i.e. complete removal of loops with a
small constant number of iterations)
If the compiler decides that the number of iterations for a given piece of code is not "a small constant" (i.e. the number is too high), it may only do partial peeling or unrolling, as it has done here. Furthermore, the param options you are using are only maximum values; they do not force complete unrolling for loops smaller than the set value. In other words, if a loop has more iterations than the maximum you have set, then the loop will not be completely unrolled; but the converse is not true.
Many factors are taken into account when doing optimisations. Here the bottleneck in your code is the call to the printf function, and the compiler will probably take this into account when doing its cost calculations, or judge that the code-size overhead of unrolling is too great. As you are nevertheless telling it to unroll loops, it seems to determine that the best solution is to transform the initial loop into 10 unrolled iterations and a jump.
If you replace printf with something else, the compiler may optimise differently. For instance try replacing it with the following:
volatile int temp = k;
The loop with this new code snippet will be fully unrolled on the newer versions of GCC (and the older ones as well). Note that the volatile keyword is just a trick used so the compiler does not optimise out the loop completely.
To sum up, to the best of my knowledge there is no way to force later versions of GCC to produce the same output.
As a side note, from optimisation level -O2 onwards and without any additional compiler flags, recent versions of Clang fully unroll your loop.
So I am trying to convert a bubble sort program from assembly to Y86. I started with this C code and then converted it to assembly:
#include <stdio.h>
void bubble(int *, int);
int main(){
int count = 5;
int data[5]= {3, 2, 6, 1, 9,};
bubble(data, count);
return 0;
}
void bubble(int *data, int count){
int i, last;
for(last = count-1; last > 0; last--){
for (i=0; i < last; i++)
if (data[i+1] < data[i]){
/* Swap adjacent elements */
int t = data[i+1];
data[i+1] = data[i];
data[i] = t;
}
}
}
and this is my Y86 code:
# Bubble sort in Y86 (32-bit), translated from the C program above.
# init sets up the stack and calls main; main builds count and data[5] in its
# frame and calls bubble(data, count) with both arguments passed on the stack.
# Only %eax, %ecx and %edx are used as scratch registers.
#
# Corrections relative to the first translation:
#   * main passes the ADDRESS of data[0] (%esp + 24); it used to pass
#     count + data[0], i.e. the value 8, as the pointer.
#   * Element offsets are 4*i (two doublings) because the ints are stored
#     4 bytes apart (24, 28, 32, ... in main); three doublings gave 8*i.
#   * The array base is added to the register that is actually dereferenced.
#   * i++ and last-- are written back to their stack slots; without the
#     store both loops kept re-reading the old value.
#   * The stack is placed well above the end of the code so that the frames
#     of main and bubble cannot grow down into the instructions.
        .pos 0
init:   irmovl  Stack, %esp         # Set up stack pointer
        irmovl  Stack, %ebp         # Set up base pointer
        call    main                # call main program
        halt                        # Terminate program

# int main(void): count at 44(%esp), data[0..4] at 24..40(%esp),
# outgoing arguments at (%esp) and 4(%esp). Returns 0 in %eax.
main:
        pushl   %ebp
        rrmovl  %esp, %ebp
        irmovl  -16, %ecx
        andl    %ecx, %esp          # round %esp down to a multiple of 16
        irmovl  48, %ecx
        subl    %ecx, %esp          # 48-byte frame
        irmovl  5, %ecx
        rmmovl  %ecx, 44(%esp)      # count = 5
        irmovl  3, %ecx
        rmmovl  %ecx, 24(%esp)      # data[0] = 3
        irmovl  2, %ecx
        rmmovl  %ecx, 28(%esp)      # data[1] = 2
        irmovl  6, %ecx
        rmmovl  %ecx, 32(%esp)      # data[2] = 6
        irmovl  1, %ecx
        rmmovl  %ecx, 36(%esp)      # data[3] = 1
        irmovl  9, %ecx
        rmmovl  %ecx, 40(%esp)      # data[4] = 9
        mrmovl  44(%esp), %eax
        rmmovl  %eax, 4(%esp)       # argument 2: count
        irmovl  24, %eax
        addl    %esp, %eax          # eax = &data[0] (Y86 has no leal)
        rmmovl  %eax, (%esp)        # argument 1: data
        call    bubble
        irmovl  0, %eax             # return 0
        rrmovl  %ebp, %esp
        popl    %ebp
        ret

# void bubble(int *data, int count)
# Arguments: 8(%ebp) = data, 12(%ebp) = count.
# Locals:    -4(%ebp) = t, -8(%ebp) = last, -12(%ebp) = i.
bubble:
        pushl   %ebp
        rrmovl  %esp, %ebp
        irmovl  16, %ecx
        subl    %ecx, %esp          # room for the locals
        mrmovl  12(%ebp), %eax
        irmovl  1, %ecx
        subl    %ecx, %eax
        rmmovl  %eax, -8(%ebp)      # last = count - 1
        jmp     L4
L8:                                 # start of one pass
        irmovl  0, %eax
        rmmovl  %eax, -12(%ebp)     # i = 0
        jmp     L5
L7:                                 # inner-loop body
        mrmovl  -12(%ebp), %eax     # eax = i
        addl    %eax, %eax
        addl    %eax, %eax          # eax = 4*i
        mrmovl  8(%ebp), %ecx       # ecx = data
        addl    %ecx, %eax          # eax = &data[i]
        mrmovl  4(%eax), %edx       # edx = data[i+1]
        mrmovl  (%eax), %ecx        # ecx = data[i]
        rmmovl  %edx, -4(%ebp)      # t = data[i+1] (the subl below overwrites %edx)
        subl    %ecx, %edx          # edx = data[i+1] - data[i]; sets the condition codes
        jge     L6                  # data[i+1] >= data[i]: already ordered
        rmmovl  %ecx, 4(%eax)       # data[i+1] = data[i]
        mrmovl  -4(%ebp), %edx
        rmmovl  %edx, (%eax)        # data[i] = t
L6:
        mrmovl  -12(%ebp), %eax
        irmovl  1, %ecx
        addl    %ecx, %eax
        rmmovl  %eax, -12(%ebp)     # i++ (stored back)
L5:
        mrmovl  -12(%ebp), %eax
        mrmovl  -8(%ebp), %ecx
        subl    %ecx, %eax          # i - last
        jl      L7                  # continue while i < last
        mrmovl  -8(%ebp), %eax
        irmovl  1, %ecx
        subl    %ecx, %eax
        rmmovl  %eax, -8(%ebp)      # last-- (stored back)
L4:
        mrmovl  -8(%ebp), %eax
        andl    %eax, %eax          # set the condition codes from last
        jg      L8                  # another pass while last > 0
        rrmovl  %ebp, %esp
        popl    %ebp
        ret

# Stack starts here and grows toward lower addresses.
        .pos 0x400
Stack:  .long 0
It seems as though the sign flag is never being raised, and I am not quite sure why this is. When I run the simulator, the elements of the array are never sorted, they just remain in the same position. Any tips on how to fix this would be much appreciated!
I have generated two assembly files - one that is optimized, and one that is not. The assembly-language code generated with optimization on should be more efficient than the other assembly-language code. I am more interested in how the efficiency is achieved. To my understanding, the non-optimized version always has to address its variables through an offset from the register %rbp. In the optimized version, the values are kept in registers, so you do not have to go through %rbp to find them.
Am I correct? And if so, would there ever be a time when the optimized version will not be advantageous? Thank you for your time.
Here is a function that converts from RGB to CMYK.
/*
 * Converts an RGB colour to CMYK and writes the result into ret[0..3]
 * in the order c, m, y, k.
 *
 * r, g, b: input components; each is subtracted from 255, so they are
 *          presumably in the range 0..255 -- confirm with callers.
 * ret:     output array; four elements are written.
 */
void rgb2cmyk(int r, int g, int b, int ret[]) {
int c = 255 - r;
int m = 255 - g;
int y = 255 - b;
int k = (c < m) ? (c < y ? c : y) : (m < y ? m : y); /* k = smallest of c, m, y */
c -= k; m -= k; y -= k; /* take the shared k amount out of each channel */
ret[0] = c; ret[1] = m; ret[2] = y; ret[3] = k;
}
Here is the assembly-language code that has not been optimized. Note I have made notes using ;; in the code.
No Opt:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## #rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
;;initializing variable c, m, y
movl $255, %eax
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
movl %edx, -12(%rbp)
movq %rcx, -24(%rbp)
movl %eax, %edx
subl -4(%rbp), %edx
movl %edx, -28(%rbp)
movl %eax, %edx
subl -8(%rbp), %edx
movl %edx, -32(%rbp)
subl -12(%rbp), %eax
movl %eax, -36(%rbp)
movl -28(%rbp), %eax
;;compare
cmpl -32(%rbp), %eax
jge LBB0_5
## BB#1:
movl -28(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_3
## BB#2:
movl -28(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
jmp LBB0_4
LBB0_3:
movl -36(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
LBB0_4:
movl -44(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
jmp LBB0_9
LBB0_5:
movl -32(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_7
## BB#6:
movl -32(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
jmp LBB0_8
LBB0_7:
movl -36(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
LBB0_8:
movl -52(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
LBB0_9:
movl -48(%rbp), %eax ## 4-byte Reload
movl %eax, -40(%rbp)
movl -40(%rbp), %eax
movl -28(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -28(%rbp)
movl -40(%rbp), %eax
movl -32(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -32(%rbp)
movl -40(%rbp), %eax
movl -36(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -36(%rbp)
movl -28(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, (%rdx)
movl -32(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 4(%rdx)
movl -36(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 8(%rdx)
movl -40(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 12(%rdx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Optimization:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## #rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
movl $255, %r8d
movl $255, %eax
subl %edi, %eax
movl $255, %edi
subl %esi, %edi
subl %edx, %r8d
cmpl %edi, %eax ##;; compare m and c
jge LBB0_2
## BB#1: ;; c < m
cmpl %r8d, %eax ## compare y and c
movl %r8d, %edx
cmovlel %eax, %edx
jmp LBB0_3
LBB0_2: ##;; c >= m
cmpl %r8d, %edi ## compare y and m
movl %r8d, %edx
cmovlel %edi, %edx
LBB0_3:
subl %edx, %eax
subl %edx, %edi
subl %edx, %r8d
movl %eax, (%rcx)
movl %edi, 4(%rcx)
movl %r8d, 8(%rcx)
movl %edx, 12(%rcx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Yes. The optimized version performs many fewer memory read operations by storing intermediate values in registers and not reloading them over and over.
You are using call wrong. It is a technical term that means to push a return address on the stack and branch to a new location for instructions. The term you mean is simply to use the register.
Can you think of a reason that longer, slower code is "better"?
I'm trying to understand the assembly code during a recursive function call.
#include<stdio.h>
/*
 * For no > 1: decrements no, recurses on the new value, and prints that value
 * after the inner call has returned, so the numbers appear in ascending order
 * as the recursion unwinds. Returns 1 when no == 1.
 * NOTE(review): no return statement is executed on the no > 1 path or when
 * no < 1, although the function is declared to return int; main ignores the
 * result.
 */
int recursive(int no){
if(no > 1){
no--;
recursive(no); /* result is discarded */
printf("\n %d \n",no); /* runs only after the inner call returns */
}
else if(no == 1){
return 1;
}
}
int main(){
int a = 10;
recursive(a);
return 0;
}
disassembly :
.file "sample2.c"
.section .rodata
.LC0:
.string "\n %d \n"
.text
.globl recursive
.type recursive, #function
# GCC output (AT&T syntax, 32-bit) for recursive(no). The argument no lives at 8(%ebp).
recursive:
pushl %ebp
movl %esp, %ebp
subl $24, %esp # reserve 24 bytes; only (%esp) and 4(%esp) are written below, as outgoing argument slots
# NOTE(review): the rest is presumably alignment padding (return address 4 + saved %ebp 4 + 24 = 32) -- confirm against GCC's stack-boundary setting
cmpl $1, 8(%ebp) # no > 1 ?
jle .L2
subl $1, 8(%ebp) # no-- directly in the argument slot
movl 8(%ebp), %eax
movl %eax, (%esp) # argument for the recursive call: no
call recursive
movl $.LC0, %eax # eax = address of the format string
movl 8(%ebp), %edx
movl %edx, 4(%esp) # printf argument 2: no
movl %eax, (%esp) # printf argument 1: format string
call printf
jmp .L5
.L2:
cmpl $1, 8(%ebp) # no == 1 ?
jne .L5
movl $1, %eax # return value 1
movl %eax, %edx # redundant round trip through %edx; %eax is still 1 afterwards
movl %edx, %eax # NOTE(review): looks like an artifact of unoptimized output -- confirm the flags used
jmp .L4
.L5:
.L4:
leave # restore %esp from %ebp and pop the saved %ebp
ret
.size recursive, .-recursive
.globl main
.type main, #function
main:
pushl %ebp
movl %esp, %ebp
andl $-16, %esp
subl $32, %esp
movl $10, 28(%esp)
movl 28(%esp), %eax
movl %eax, (%esp)
call recursive
movl $0, %eax
leave
ret
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5"
.section .note.GNU-stack,"",#progbits
I understand that .LC0 holds the string literal, but I don't know what the label really means. I would like to understand the code that is generated for the recursive function call.
I could not understand what this piece of assembly code does,
subl $24, %esp
cmpl $1, 8(%ebp)
jle .L2
subl $1, 8(%ebp)
movl 8(%ebp), %eax
movl %eax, (%esp)
call recursive
movl $.LC0, %eax
movl 8(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call printf
jmp .L5
.L2:
cmpl $1, 8(%ebp)
jne .L5
movl $1, %eax
movl %eax, %edx
movl %edx, %eax
jmp .L4
Q1:
The recursive function takes one parameter, so after alignment padding I would expect 8 bytes to be reserved. Why is it 24?
Also in .L2 ,
movl $1, %eax
movl %eax, %edx
movl %edx, %eax
jmp .L4
Q2:
We have already moved '1' into the accumulator (%eax); why do we move it to the data register (%edx) and then back to the accumulator?
Q3:
Are we popping anything off the stack? If leave is what pops the stack, are we not also popping the rest of the 8 stack frames?
To answer the only thing in your post that matches your title:
Why are we not popping out from the stack and only push instruction in the assembly.
Because leave is equivalent to:
movl %ebp, %esp
popl %ebp