So I'm pretty much a noob in IA32 assembly language. I tried compiling this C function into IA32 assembly (-mpreferred-stack-boundary=2):
/*
 * Recursively adds the odd elements among a[0..n-1] to `sum` and returns
 * the total. Elements are visited from a[n-1] down to a[0]; the recursion
 * stops only when n is exactly 0.
 */
__attribute__((cdecl))
int odd_sum(int a[], int n, int sum) {
if (n == 0) return sum; // base case: no elements left to inspect
else if ((a[n-1] % 2) == 0)
return odd_sum(a, n-1, sum); // even element: skip it
else return odd_sum(a, n-1, sum + a[n-1]); // odd element: add it to the running sum
}
and the GCC outputs this:
.file "test.c"
.text
.globl _odd_sum
.def _odd_sum; .scl 2; .type 32; .endef
# int odd_sum(int a[], int n, int sum) -- unoptimized 32-bit cdecl code.
# After the prologue the stack arguments are:
#   8(%ebp) = a, 12(%ebp) = n, 16(%ebp) = sum
# The result is returned in %eax.
_odd_sum:
LFB0:
.cfi_startproc
# Prologue: set up the frame pointer.
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
# Reserve three 4-byte slots for the outgoing arguments of the recursive call.
subl $12, %esp
# if (n != 0) goto L2
cmpl $0, 12(%ebp)
jne L2
# n == 0: return sum
movl 16(%ebp), %eax
jmp L3
L2:
# Compute &a[n-1]. Adding 0x3fffffff and then scaling by 4 yields
# 4*(n-1) modulo 2^32, i.e. the byte offset of element n-1.
movl 12(%ebp), %eax
addl $1073741823, %eax
leal 0(,%eax,4), %edx
movl 8(%ebp), %eax
addl %edx, %eax
# Load a[n-1] and keep only its lowest bit; a non-zero result means odd.
movl (%eax), %eax
andl $1, %eax
testl %eax, %eax
jne L4
# Even element: call odd_sum(a, n-1, sum).
movl 12(%ebp), %eax
leal -1(%eax), %edx
movl 16(%ebp), %eax
# Third argument: sum
movl %eax, 8(%esp)
# Second argument: n-1
movl %edx, 4(%esp)
# First argument: a
movl 8(%ebp), %eax
movl %eax, (%esp)
call _odd_sum
# The callee's result is already in %eax; fall into the common epilogue.
jmp L3
L4:
# Odd element: recompute &a[n-1] exactly as above.
movl 12(%ebp), %eax
addl $1073741823, %eax
leal 0(,%eax,4), %edx
movl 8(%ebp), %eax
addl %edx, %eax
# %edx = a[n-1] + sum
movl (%eax), %edx
movl 16(%ebp), %eax
addl %eax, %edx
# %eax = n-1
movl 12(%ebp), %eax
subl $1, %eax
# Call odd_sum(a, n-1, sum + a[n-1]).
movl %edx, 8(%esp)
movl %eax, 4(%esp)
movl 8(%ebp), %eax
movl %eax, (%esp)
call _odd_sum
L3:
# Epilogue: leave restores %esp and %ebp; %eax holds the return value.
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE0:
.ident "GCC: (MinGW.org GCC-8.2.0-3) 8.2.0"
What I am not able to comprehend are these 2 lines:
addl $1073741823, %eax
leal 0(,%eax,4), %edx
I understand those 2 lines should have something to do with the a[n-1], but I can't seem to be able to understand what exactly they do in the process. Can someone help me with this problem please?
It is just a fancy way of computing the offset into the array a[n-1].
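1073741823 is 0x3fffffff. If n is 3, for example, the first instruction adds them and gets 0x40000002. The second instruction then multiplies by 4; the result is truncated to 32 bits, so the top bits are discarded and 0x00000008 remains. In general, adding 0x3fffffff and then multiplying by 4 is the same, modulo 2^32, as multiplying n by 4 and subtracting 4, i.e. it computes 4*(n-1).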
So we are left with an offset of 8 bytes, which is exactly the offset (in bytes) that you need for a[n-1], i.e. a[2] (when the size of an int is 4 bytes).
To get a more understandable output with the -S flag:
create assembler code:
c++ -S -fverbose-asm -g -O2 (other optimization flags) test.cc -o test.s
create asm interlaced with source lines:
as -alhnd test.s > test.lst
[Question]
I ran the code below with the -O3 option and found that its performance is nine times higher than that of the same code compiled without -O3.
Edit :
I want to know the key optimization technique, not the reason. That is my question. I have no experience with x86 assembly, so the x86 assembly code is too hard for me to understand. That is why I posted this question. Alternatively, could you explain the code generated with the -O3 option for me?
................................................................................
[C code]
The code just executes addition.
// Scales x by dividing it by (255.0 * OFFSET). The division is carried out
// in double precision and the result is narrowed to float on return.
// OFFSET is defined outside this snippet.
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
// Sums each group of 8 consecutive bytes of ibuffer, scales the sum with
// minmax_scale() and stores the result in H.
// NOTE: ibuffer is read without ever being written, and H is written but
// never read, so nothing computed here is observable outside main.
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
// feature summation and scale
// k indexes H; i indexes ibuffer and advances by OFFSET per iteration.
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
return 0;
}
[Assembly with O3]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
# float minmax_scale(unsigned int x), optimized build.
# x arrives in %edi; the float result is returned in %xmm0.
minmax_scale:
.LFB0:
.cfi_startproc
# Clear %xmm0 before the conversion writes its low lane.
pxor %xmm0, %xmm0
# Writing %edi zero-extends x into the full %rdi.
movl %edi, %edi
# Convert the zero-extended value to double.
cvtsi2sdq %rdi, %xmm0
# Divide by the double constant stored at .LC0.
divsd .LC0(%rip), %xmm0
# Narrow the double quotient to float for the return value.
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .text.unlikely
.LCOLDB2:
.section .text.startup,"ax",#progbits
.LHOTB2:
.p2align 4,,15
.globl main
.type main, #function
# main, optimized build: no loop, no buffer and no call to minmax_scale
# remain; the function only returns 0.
main:
.LFB1:
.cfi_startproc
# %eax = 0 (return value)
xorl %eax, %eax
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .text.unlikely
.LCOLDE2:
.section .text.startup
.LHOTE2:
.section .rodata.cst8,"aM",#progbits,8
.align 8
# Divisor used by minmax_scale: the low word followed by the high word of the
# 64-bit pattern 0x409FE00000000000, which is the double value 2040.0.
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
[Assembly without O3]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
# float minmax_scale(unsigned int x), unoptimized build.
# x arrives in %edi; the float result is returned in %xmm0.
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Spill x to the stack slot at -4(%rbp), then reload it; the 32-bit load
# zero-extends the value into %rax.
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
# Unsigned-to-double conversion: take the direct path when the 64-bit value
# has its sign bit clear, otherwise use the halving path at .L2.
# Because %rax was just zero-extended from 32 bits, the sign bit is clear here.
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
# Halve the value while folding the dropped low bit back in, convert the
# result, then double it again.
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
# Divide by the double constant at .LC0 and narrow the quotient to float.
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.globl main
.type main, #function
# main, unoptimized build. Frame layout as used below:
#   -2084(%rbp) = argc, -2096(%rbp) = argv
#   -2072(%rbp) = k,    -2068(%rbp) = i
#   -2064(%rbp) = start of H (8-byte elements)
#   -1040(%rbp) = start of ibuffer (1-byte elements)
#   -8(%rbp)    = copy of the stack-protector value read from %fs:40
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2096, %rsp
# Spill argc and argv.
movl %edi, -2084(%rbp)
movq %rsi, -2096(%rbp)
# Save the stack-protector value; it is compared again before returning.
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
# k = 0, i = 0, then jump to the loop condition.
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L6
.L7:
# Loop body. %edx = (int)ibuffer[i]: the byte is loaded and sign-extended.
movl -2068(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
# %edx += ibuffer[i+1]
movl -2068(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+2]
movl -2068(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+3]
movl -2068(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+4]
movl -2068(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+5]
movl -2068(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+6]
movl -2068(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %eax = %edx + ibuffer[i+7]: the complete 8-byte sum
movl -2068(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
# Pass the sum as the argument and call minmax_scale.
movl %eax, %edi
call minmax_scale
# Widen the float result to double and store it in H[k].
cvtss2sd %xmm0, %xmm0
movl -2072(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
# i += 8, k += 1
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L6:
# Loop condition: continue while k <= 127.
cmpl $127, -2072(%rbp)
jle .L7
# Return value 0.
movl $0, %eax
# Compare the saved stack-protector value with %fs:40; on mismatch call
# __stack_chk_fail.
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L9
call __stack_chk_fail
.L9:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
# Divisor used by minmax_scale: low word then high word of the 64-bit
# pattern 0x409FE00000000000, which is the double value 2040.0.
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
Your code has no observable side effects, so the optimizer simply discards most of it.
Using -O3 turns your main function into:
# Entire optimized main: set the return value to 0 and return.
main:
xorl %eax, %eax
ret
Which is equivalent to:
// C equivalent of the optimized assembly: only the return value is left.
int main()
{
return 0;
}
This shows that micro-benchmarking code can be difficult to do correctly.
Edit:
As pointed out in a comment below, the posted code doesn't initialize ibuffer[INPUT_FEATURE]. Reading an uninitialized variable is undefined behavior, which makes the whole program malformed. This is a real problem, and the code isn't required to produce reasonable results. Thanks @chqrlie
Reflecting your reply, I modified the code as follows and repeated the experiment. The result is the same as before: the code compiled with -O3 performs better than the code compiled without any optimization option.
#define OFFSET (8)
#define INPUT_FEATURE (1024)
#define TSIZE (INPUT_FEATURE/OFFSET)
#include<stdio.h>
// Scales x by dividing it by (255.0 * OFFSET), i.e. by 2040.0 with
// OFFSET == 8. The division is carried out in double precision and the
// result is narrowed to float on return.
float minmax_scale(unsigned int x) {
// x_min = 0.0, x_max = 2040.0, new_min = 0.0, new_max = 1.0
return (x/(255.0 * OFFSET));
}
// Zeroes H, fills it with the scaled sum of each group of OFFSET bytes of
// ibuffer, then prints every element of H.
// NOTE: ibuffer is still read without ever being written.
int main(int argc, char** argv) {
char ibuffer[INPUT_FEATURE];
double H[TSIZE];
// Clear H (i is advanced but not used in this loop).
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = 0.0;
}
// feature summation and scale
// k indexes H; i indexes ibuffer and advances by OFFSET per iteration.
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
H[k] = minmax_scale(
(unsigned int)ibuffer[i]
+ ibuffer[i+1]
+ ibuffer[i+2]
+ ibuffer[i+3]
+ ibuffer[i+4]
+ ibuffer[i+5]
+ ibuffer[i+6]
+ ibuffer[i+7]
);
}
// Print all results; the format string has no separator between values
// (i is advanced but not used in this loop).
for (int k = 0, i = 0; k < TSIZE; i+=OFFSET, k++) {
printf("%lf",H[k]);
}
return 0;
}
[code with O3 option]
.file "measure_fs_simple.c"
.section .text.unlikely,"ax",#progbits
.LCOLDB1:
.text
.LHOTB1:
.p2align 4,,15
.globl minmax_scale
.type minmax_scale, #function
# float minmax_scale(unsigned int x), optimized build.
# x arrives in %edi; the float result is returned in %xmm0.
minmax_scale:
.LFB23:
.cfi_startproc
# Clear %xmm0 before the conversion writes its low lane.
pxor %xmm0, %xmm0
# Writing %edi zero-extends x into the full %rdi.
movl %edi, %edi
# Convert to double, divide by the constant at .LC0, narrow to float.
cvtsi2sdq %rdi, %xmm0
divsd .LC0(%rip), %xmm0
cvtsd2ss %xmm0, %xmm0
ret
.cfi_endproc
.LFE23:
.size minmax_scale, .-minmax_scale
.section .text.unlikely
.LCOLDE1:
.text
.LHOTE1:
.section .rodata.str1.1,"aMS",#progbits,1
.LC5:
.string "%lf"
.section .text.unlikely
.LCOLDB6:
.section .text.startup,"ax",#progbits
.LHOTB6:
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB24:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $128, %ecx
pxor %xmm12, %xmm12
[code no option]
.file "measure_fs_simple.c"
.text
.globl minmax_scale
.type minmax_scale, #function
# float minmax_scale(unsigned int x), unoptimized build.
# x arrives in %edi; the float result is returned in %xmm0.
minmax_scale:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
# Spill x to -4(%rbp) and reload it; the 32-bit load zero-extends into %rax.
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
# Unsigned-to-double conversion: direct path when the 64-bit sign bit is
# clear, halving path at .L2 otherwise. %rax was just zero-extended from
# 32 bits, so the sign bit is clear here.
testq %rax, %rax
js .L2
pxor %xmm0, %xmm0
cvtsi2sdq %rax, %xmm0
jmp .L3
.L2:
# Halve the value while folding the dropped low bit back in, convert the
# result, then double it again.
movq %rax, %rdx
shrq %rdx
andl $1, %eax
orq %rax, %rdx
pxor %xmm0, %xmm0
cvtsi2sdq %rdx, %xmm0
addsd %xmm0, %xmm0
.L3:
# Divide by the double constant at .LC0 and narrow the quotient to float.
movsd .LC0(%rip), %xmm1
divsd %xmm1, %xmm0
cvtsd2ss %xmm0, %xmm0
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size minmax_scale, .-minmax_scale
.section .rodata
# Format string passed to printf.
.LC2:
.string "%lf"
.text
.globl main
.type main, #function
# main, unoptimized build of the modified program. Frame layout as used below:
#   -2100(%rbp) = argc, -2112(%rbp) = argv
#   -2088 / -2084(%rbp) = k / i of the first loop  (clear H)
#   -2080 / -2076(%rbp) = k / i of the second loop (sum and scale)
#   -2072 / -2068(%rbp) = k / i of the third loop  (print)
#   -2064(%rbp) = start of H (8-byte elements)
#   -1040(%rbp) = start of ibuffer (1-byte elements)
#   -8(%rbp)    = copy of the stack-protector value read from %fs:40
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $2128, %rsp
# Spill argc and argv.
movl %edi, -2100(%rbp)
movq %rsi, -2112(%rbp)
# Save the stack-protector value; it is compared again before returning.
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
# First loop: k = 0, i = 0.
movl $0, -2088(%rbp)
movl $0, -2084(%rbp)
jmp .L6
.L7:
# H[k] = 0.0
movl -2088(%rbp), %eax
cltq
pxor %xmm0, %xmm0
movsd %xmm0, -2064(%rbp,%rax,8)
# i += 8, k += 1
addl $8, -2084(%rbp)
addl $1, -2088(%rbp)
.L6:
# Continue while k <= 127.
cmpl $127, -2088(%rbp)
jle .L7
# Second loop: k = 0, i = 0.
movl $0, -2080(%rbp)
movl $0, -2076(%rbp)
jmp .L8
.L9:
# %edx = (int)ibuffer[i]: the byte is loaded and sign-extended.
movl -2076(%rbp), %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %edx
# %edx += ibuffer[i+1]
movl -2076(%rbp), %eax
addl $1, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+2]
movl -2076(%rbp), %eax
addl $2, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+3]
movl -2076(%rbp), %eax
addl $3, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+4]
movl -2076(%rbp), %eax
addl $4, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+5]
movl -2076(%rbp), %eax
addl $5, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %edx += ibuffer[i+6]
movl -2076(%rbp), %eax
addl $6, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %eax, %edx
# %eax = %edx + ibuffer[i+7]: the complete 8-byte sum
movl -2076(%rbp), %eax
addl $7, %eax
cltq
movzbl -1040(%rbp,%rax), %eax
movsbl %al, %eax
addl %edx, %eax
# Pass the sum as the argument and call minmax_scale.
movl %eax, %edi
call minmax_scale
# Widen the float result to double and store it in H[k].
cvtss2sd %xmm0, %xmm0
movl -2080(%rbp), %eax
cltq
movsd %xmm0, -2064(%rbp,%rax,8)
# i += 8, k += 1
addl $8, -2076(%rbp)
addl $1, -2080(%rbp)
.L8:
# Continue while k <= 127.
cmpl $127, -2080(%rbp)
jle .L9
# Third loop: k = 0, i = 0.
movl $0, -2072(%rbp)
movl $0, -2068(%rbp)
jmp .L10
.L11:
# Load H[k] and move it into %xmm0 through a temporary stack slot.
movl -2072(%rbp), %eax
cltq
movq -2064(%rbp,%rax,8), %rax
movq %rax, -2120(%rbp)
movsd -2120(%rbp), %xmm0
# printf(.LC2, H[k]); %eax = 1 tells the variadic callee that one vector
# register carries an argument.
movl $.LC2, %edi
movl $1, %eax
call printf
# i += 8, k += 1
addl $8, -2068(%rbp)
addl $1, -2072(%rbp)
.L10:
# Continue while k <= 127.
cmpl $127, -2072(%rbp)
jle .L11
# Return value 0.
movl $0, %eax
# Compare the saved stack-protector value with %fs:40; on mismatch call
# __stack_chk_fail.
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L13
call __stack_chk_fail
.L13:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.section .rodata
.align 8
# Divisor used by minmax_scale: low word then high word of the 64-bit
# pattern 0x409FE00000000000, which is the double value 2040.0.
.LC0:
.long 0
.long 1084219392
.ident "GCC: (Ubuntu 5.4.0-6ubuntu1~16.04.11) 5.4.0 20160609"
.section .note.GNU-stack,"",#progbits
I have a C program which has a function decod and the function has the following statements.
My decode.c script:
// Computes ty = (y - z) and py = (all-ones or zero, taken from the low bit
// of ty) XOR (ty * x).
// NOTE: the function is declared to return int but contains no return
// statement, so neither ty nor py is ever delivered to the caller.
int decod(int x, int y, int z) {
int ty = y;
ty = ty - z; // ty = y - z
int py = ty;
py = py << 31; // move the lowest bit of ty into the sign position
py = py >> 31; // shift it back down across the whole word
ty = ty * x; // ty = (y - z) * x
py = py ^ ty;
}
The assembly code of this program (generated by gcc -S decod.c) shows the following code.
# Unoptimized body of decod. Stack slots used below:
#   -20(%rbp) = x, -24(%rbp) = y, -28(%rbp) = z, -8(%rbp) = ty, -4(%rbp) = py
# Spill the three register arguments.
movl %edi, -20(%rbp)
movl %esi, -24(%rbp)
movl %edx, -28(%rbp)
# ty = y
movl -24(%rbp), %eax
movl %eax, -8(%rbp)
# ty -= z
movl -28(%rbp), %eax
subl %eax, -8(%rbp)
# py = ty
movl -8(%rbp), %eax
movl %eax, -4(%rbp)
# py <<= 31, then py >>= 31 (arithmetic shift)
sall $31, -4(%rbp)
sarl $31, -4(%rbp)
# ty *= x
movl -8(%rbp), %eax
imull -20(%rbp), %eax
movl %eax, -8(%rbp)
# py ^= ty
movl -8(%rbp), %eax
xorl %eax, -4(%rbp)
# Epilogue; no instruction loads py into %eax before returning.
popq %rbp
.cfi_def_cfa 7, 8
ret
But, I want the program generate an assembly file with only the following lines of code.
# Register-only version the asker wants (%edi = x, %esi = y, %edx = z).
# %esi = y - z (ty)
subl %edx, %esi
# %eax = ty (becomes py)
movl %esi, %eax
# py <<= 31, then py >>= 31 (arithmetic shift)
sall $31, %eax
sarl $31, %eax
# %esi = ty * x
imull %edi, %esi
# %eax = py ^ ty, left in %eax as the return value
xorl %esi, %eax
ret
I know I am pretty close to writing a program that will generate the above-mentioned code, but I am clueless as to why my source generates different assembly code. Any direction would be helpful.
If you compile your function as is at optimization level 3 (-O3), the entire function body is optimized out. This is because there is no return value, and py and ty are discarded when the function returns anyway.
For reference the code is below
.globl decod
.def decod; .scl 2; .type 32; .endef
.seh_proc decod
# Optimized decod without a return statement: the body is a bare ret.
decod:
.seh_endprologue
ret
.seh_endproc
If however, you add a return py; at the end the code generated is as follows.
.globl decod
.def decod; .scl 2; .type 32; .endef
.seh_proc decod
# Optimized decod after adding "return py;".
# Registers as used below: %ecx = x, %edx = y, %r8d = z; result in %eax.
decod:
.seh_endprologue
# %edx = y - z (ty)
subl %r8d, %edx
# %eax = ty (becomes py)
movl %edx, %eax
# %ecx = x * ty
imull %edx, %ecx
# py <<= 31, then py >>= 31 (arithmetic shift)
sall $31, %eax
sarl $31, %eax
# %eax = py ^ (x * ty), the return value
xorl %ecx, %eax
ret
.seh_endproc
This is functionally identical to what you are expecting.
I'm trying to understand the assembly code during a recursive function call.
#include<stdio.h>
// For no > 1: decrements no, recurses with the new value, then prints it.
// For no == 1: returns 1.
// NOTE: control reaches the end of the function without a return statement
// when no > 1 (after the printf) and when no < 1.
int recursive(int no){
if(no > 1){
no--;
recursive(no); // the result of the recursive call is ignored
printf("\n %d \n",no);
}
else if(no == 1){
return 1;
}
}
// Calls recursive(10), ignores its result and returns 0.
int main(){
int a = 10;
recursive(a);
return 0;
}
disassembly :
.file "sample2.c"
.section .rodata
# Format string passed to printf.
.LC0:
.string "\n %d \n"
.text
.globl recursive
.type recursive, #function
# int recursive(int no), unoptimized 32-bit code.
# After the prologue the argument no is at 8(%ebp).
recursive:
pushl %ebp
movl %esp, %ebp
# Reserve 24 bytes. Only (%esp) and 4(%esp) are written below, as outgoing
# argument slots; the remainder is presumably alignment padding (TODO confirm).
subl $24, %esp
# if (no <= 1) goto .L2
cmpl $1, 8(%ebp)
jle .L2
# no-- performed directly on the argument slot
subl $1, 8(%ebp)
# recursive(no); the result is ignored
movl 8(%ebp), %eax
movl %eax, (%esp)
call recursive
# printf(.LC0, no)
movl $.LC0, %eax
movl 8(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call printf
jmp .L5
.L2:
# if (no != 1) goto .L5
cmpl $1, 8(%ebp)
jne .L5
# return 1: the value goes %eax -> %edx -> %eax, a redundant round trip
# that still leaves 1 in %eax.
movl $1, %eax
movl %eax, %edx
movl %edx, %eax
jmp .L4
.L5:
.L4:
# Common epilogue: leave restores %esp and %ebp, then return.
leave
ret
.size recursive, .-recursive
.globl main
.type main, #function
# main, unoptimized 32-bit code: calls recursive(10) and returns 0.
main:
pushl %ebp
movl %esp, %ebp
# Round %esp down to a multiple of 16, then reserve 32 bytes.
andl $-16, %esp
subl $32, %esp
# a = 10, kept at 28(%esp)
movl $10, 28(%esp)
# recursive(a)
movl 28(%esp), %eax
movl %eax, (%esp)
call recursive
# return 0
movl $0, %eax
leave
ret
.size main, .-main
.ident "GCC: (Ubuntu/Linaro 4.4.4-14ubuntu5) 4.4.5"
.section .note.GNU-stack,"",#progbits
I understand that .LC0 holds the string literal, but I don't know what it really means. I would like to understand the code executed while the recursive function call is made.
I could not understand what this piece of assembly code does,
# Excerpt from the body of recursive; the argument no is at 8(%ebp).
# Reserve 24 bytes of stack; (%esp) and 4(%esp) serve as outgoing argument slots.
subl $24, %esp
# if (no <= 1) goto .L2
cmpl $1, 8(%ebp)
jle .L2
# no--
subl $1, 8(%ebp)
# recursive(no)
movl 8(%ebp), %eax
movl %eax, (%esp)
call recursive
# printf(.LC0, no)
movl $.LC0, %eax
movl 8(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call printf
jmp .L5
.L2:
# if (no != 1) goto .L5
cmpl $1, 8(%ebp)
jne .L5
# return 1 (with a redundant copy through %edx)
movl $1, %eax
movl %eax, %edx
movl %edx, %eax
jmp .L4
Q1:
The recursive function has one parameter, so after padding for alignment the allocation should be 8. Why is it 24?
Also in .L2 ,
# %eax = 1, copied to %edx and straight back: the value in %eax is unchanged.
movl $1, %eax
movl %eax, %edx
movl %edx, %eax
jmp .L4
Q2:
We have already moved 1 into the accumulator (%eax). Why are we moving it to the data register (%edx) and then back into the accumulator?
Q3:
Are we popping anything off the stack? If leave is used for popping the stack, are we not popping the remaining 8 stack frames?
To answer the only thing in your post that matches your title:
Why are we not popping out from the stack and only push instruction in the assembly.
Because leave is equivalent to:
# Discard the current frame by resetting %esp to the frame pointer ...
movl %ebp, %esp
# ... then restore the caller's frame pointer.
popl %ebp