I have a C program with a function decod that contains the following statements.
My decode.c file:
int decod(int x, int y, int z) {
    int ty = y;
    ty = ty - z;
    int py = ty;
    py = py << 31;
    py = py >> 31;
    ty = ty * x;
    py = py ^ ty;
}
The assembly code of this program (generated by gcc -S decod.c) shows the following code.
movl %edi, -20(%rbp)
movl %esi, -24(%rbp)
movl %edx, -28(%rbp)
movl -24(%rbp), %eax
movl %eax, -8(%rbp)
movl -28(%rbp), %eax
subl %eax, -8(%rbp)
movl -8(%rbp), %eax
movl %eax, -4(%rbp)
sall $31, -4(%rbp)
sarl $31, -4(%rbp)
movl -8(%rbp), %eax
imull -20(%rbp), %eax
movl %eax, -8(%rbp)
movl -8(%rbp), %eax
xorl %eax, -4(%rbp)
popq %rbp
.cfi_def_cfa 7, 8
ret
But I want the program to generate an assembly file with only the following lines of code.
subl %edx, %esi
movl %esi, %eax
sall $31, %eax
sarl $31, %eax
imull %edi, %esi
xorl %esi, %eax
ret
I know I am pretty close to writing a program that will generate the above-mentioned code, but I am clueless as to why my code generates different assembly. Any direction will be helpful.
If you compile your function as is at optimization level 3 (-O3), the entire function is optimized out. This is because there is no return statement, and py and ty are discarded anyway once the function returns.
For reference, the generated code is below:
.globl decod
.def decod; .scl 2; .type 32; .endef
.seh_proc decod
decod:
.seh_endprologue
ret
.seh_endproc
If, however, you add return py; at the end, the generated code is as follows:
.globl decod
.def decod; .scl 2; .type 32; .endef
.seh_proc decod
decod:
.seh_endprologue
subl %r8d, %edx
movl %edx, %eax
imull %edx, %ecx
sall $31, %eax
sarl $31, %eax
xorl %ecx, %eax
ret
.seh_endproc
This is functionally identical to what you are expecting. (The register names differ from the listing you want because this was compiled with the Windows x64 calling convention, as the .seh_* directives show; a Linux build passes the arguments in %edi, %esi and %edx instead.)
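For reference, the modified C source is simply your original function with return py; appended; a minimal sketch (it assumes py is the value you intend the function to return):
int decod(int x, int y, int z) {
    int ty = y;
    ty = ty - z;
    int py = ty;
    py = py << 31;
    py = py >> 31;
    ty = ty * x;
    py = py ^ ty;
    return py;   /* without this return, the whole body is dead code and -O3 removes it */
}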
I'm trying to compile my program to run on Linux, but it doesn't work because of this error.
I compile with gcc -ansi -pedantic -Wall -m64 -o formula formula.c nCr.s and the error I get is:
/tmp/ccnNz7Jr.o: In function `main':
formula.c:(.text+0xc1): undefined reference to `nCr'
collect2: ld returned 1 exit status
make: *** [build] Error 1
Here's the code from the file formula.c
#include <stdio.h>
#include <stdlib.h>
#include "nCr.h"
#include <time.h>
#include <sys/time.h>
int main(int argc, const char * argv[])
{
    int x = 0;
    int y;
    int z;
    struct timeval start, end;
    if (argv[1][0] == '-' && argv[1][1] == 'h') {
        printf("Usage: formula <positive integer>");
    } else {
        y = atoi(argv[1]);
        gettimeofday(&start, NULL);
        printf("(1 + x)^%i = 1+", y);
        if (y == 0)
            printf("0");
        if (y > 12) {
            printf("%s\n","Please enter a number 12 or below. Anything higher results in overflow, the answer you want is not the answer you will get.");
        }
        for (; x <= y; x++) {
            z = nCr(y, x);
            if (z == -1) {
                printf("Multiplication overflow. \n");
                return 1;
            } else {
                if (x != 0)
                    printf("%i x^%i ",z , x);
                if (x != y && x != 0)
                    printf("+ ");
            }
        }
        gettimeofday(&end, NULL);
    }
    printf("\n%ld microseconds\n", ((end.tv_sec * 1000000 + end.tv_usec)
           - (start.tv_sec * 1000000 + start.tv_usec)));
    return 0;
}
nCr.h
#ifndef _NCR_H_
#define _NCR_H_
extern int Factorial(int n);
extern int nCr(int n, int r);
#endif /* _NCR_H_ */
and there's an assembly file called nCr.s
.globl _factorial
_factorial:
Leh_func_begin2:
pushq %rbp
Ltmp3:
movq %rsp, %rbp
Ltmp4:
movl %edi, -4(%rbp)
movl $1, -16(%rbp)
movl $1, -20(%rbp)
jmp LBB2_2
LBB2_1:
movl -16(%rbp), %eax
movl -20(%rbp), %ecx
imull %ecx, %eax
movl %eax, -16(%rbp)
movl -20(%rbp), %eax
addl $1, %eax
movl %eax, -20(%rbp)
LBB2_2:
movl -20(%rbp), %eax
movl -4(%rbp), %ecx
cmpl %ecx, %eax
jle LBB2_1
movl -16(%rbp), %eax
movl %eax, -12(%rbp)
movl -12(%rbp), %eax
movl %eax, -8(%rbp)
movl -8(%rbp), %eax
popq %rbp
ret
.globl _nCr
_nCr:
Leh_func_begin1:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
subq $32, %rsp
Ltmp2:
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
movl -4(%rbp), %eax
xorb %cl, %cl
movl %eax, %edi
movb %cl, %al
callq _factorial
movl %eax, %ecx
movl %ecx, -20(%rbp)
movl -4(%rbp), %ecx
movl -8(%rbp), %edx
subl %edx, %ecx
movl %ecx, -24(%rbp)
movl -8(%rbp), %ecx
xorb %dl, %dl
movl %ecx, %edi
movb %dl, %al
callq _factorial
movl %eax, %ecx
movl -24(%rbp), %edx
xorb %sil, %sil
movl %edx, %edi
movb %sil, %al
movl %ecx, -32(%rbp)
callq _factorial
movl %eax, %ecx
movl -32(%rbp), %esi
imull %ecx, %esi
movl %esi, -28(%rbp)
movl -20(%rbp), %ecx
movl -28(%rbp), %esi
movl %ecx, %eax
cltd
idivl %esi
movl %eax, %ecx
movl %ecx, -16(%rbp)
movl %ecx, -12(%rbp)
movl -12(%rbp), %eax
addq $32, %rsp
popq %rbp
ret
Any help is greatly appreciated; I can't seem to figure out why it's not linking. It compiles just fine on a Mac, but I need to get it to run on Linux.
GCC is not able to find definitions for extern int Factorial(int n); and extern int nCr(int n, int r); because the symbols in your asm file are named _factorial and _nCr (the leading underscore is the Mac OS X symbol-naming convention, which is why the same files link fine on a Mac; Linux does not add it). Change those labels to match the declarations, i.e. _nCr: becomes nCr: and _factorial: becomes Factorial: (and likewise the .globl directives and the callq _factorial sites), and your code will link.
I noticed that you have only one call to nCr in the main function, so it may be simpler to change the declarations in your header instead. Your nCr.h would then be:
#ifndef _NCR_H_
#define _NCR_H_
extern int _factorial(int n);
extern int _nCr(int n, int r);
#endif /* _NCR_H_ */
And then you would call _nCr(...) in main, i.e. the loop above would use z = _nCr(y, x);.
I have generated two assembly files - one that is optimized, and one that is not. The assembly-language code generated with optimization on should be more efficient than the other assembly-language code. I am more interested in how the efficiency is achieved. To my understanding, in the non-optimized version there will always have to be an offset call to the register %rbp to find the address. In the optimized version, the addresses are being stored in the registers, so you don't have to rely and call on %rbp to find them.
Am I correct? And if so, would there ever be a time when the optimized version will not be advantageous? Thank you for your time.
Here is a function that converts from RGB to CMYK.
void rgb2cmyk(int r, int g, int b, int ret[]) {
    int c = 255 - r;
    int m = 255 - g;
    int y = 255 - b;
    int k = (c < m) ? (c < y ? c : y) : (m < y ? m : y);
    c -= k; m -= k; y -= k;
    ret[0] = c; ret[1] = m; ret[2] = y; ret[3] = k;
}
Here is the assembly-language code that has not been optimized. Note that I have added comments using ;; in the code.
No Opt:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## #rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
;;initializing variable c, m, y
movl $255, %eax
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
movl %edx, -12(%rbp)
movq %rcx, -24(%rbp)
movl %eax, %edx
subl -4(%rbp), %edx
movl %edx, -28(%rbp)
movl %eax, %edx
subl -8(%rbp), %edx
movl %edx, -32(%rbp)
subl -12(%rbp), %eax
movl %eax, -36(%rbp)
movl -28(%rbp), %eax
;;compare
cmpl -32(%rbp), %eax
jge LBB0_5
## BB#1:
movl -28(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_3
## BB#2:
movl -28(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
jmp LBB0_4
LBB0_3:
movl -36(%rbp), %eax
movl %eax, -44(%rbp) ## 4-byte Spill
LBB0_4:
movl -44(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
jmp LBB0_9
LBB0_5:
movl -32(%rbp), %eax
cmpl -36(%rbp), %eax
jge LBB0_7
## BB#6:
movl -32(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
jmp LBB0_8
LBB0_7:
movl -36(%rbp), %eax
movl %eax, -52(%rbp) ## 4-byte Spill
LBB0_8:
movl -52(%rbp), %eax ## 4-byte Reload
movl %eax, -48(%rbp) ## 4-byte Spill
LBB0_9:
movl -48(%rbp), %eax ## 4-byte Reload
movl %eax, -40(%rbp)
movl -40(%rbp), %eax
movl -28(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -28(%rbp)
movl -40(%rbp), %eax
movl -32(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -32(%rbp)
movl -40(%rbp), %eax
movl -36(%rbp), %ecx
subl %eax, %ecx
movl %ecx, -36(%rbp)
movl -28(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, (%rdx)
movl -32(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 4(%rdx)
movl -36(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 8(%rdx)
movl -40(%rbp), %eax
movq -24(%rbp), %rdx
movl %eax, 12(%rdx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Optimization:
.section __TEXT,__text,regular,pure_instructions
.globl _rgb2cmyk
.align 4, 0x90
_rgb2cmyk: ## #rgb2cmyk
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp4:
.cfi_def_cfa_register %rbp
movl $255, %r8d
movl $255, %eax
subl %edi, %eax
movl $255, %edi
subl %esi, %edi
subl %edx, %r8d
cmpl %edi, %eax ##;; compare m and c
jge LBB0_2
## BB#1: ;; c < m
cmpl %r8d, %eax ## compare y and c
movl %r8d, %edx
cmovlel %eax, %edx
jmp LBB0_3
LBB0_2: ##;; c >= m
cmpl %r8d, %edi ## compare y and m
movl %r8d, %edx
cmovlel %edi, %edx
LBB0_3:
subl %edx, %eax
subl %edx, %edi
subl %edx, %r8d
movl %eax, (%rcx)
movl %edi, 4(%rcx)
movl %r8d, 8(%rcx)
movl %edx, 12(%rcx)
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
Yes. The optimized version performs many fewer memory read operations by storing intermediate values in registers and not reloading them over and over.
You are using the word "call" incorrectly. It is a technical term that means pushing a return address on the stack and branching to a new location for instructions. What you mean is simply using (or referencing) the register.
Can you think of a reason that longer, slower code is "better"?
I guess you have all heard of the 'swap problem'; SO is full of questions about it.
The version of the swap that does not use a third variable is often considered faster since, well, it has one variable fewer. I wanted to know what was going on behind the scenes and wrote the following two programs:
int main () {
    int a = 9;
    int b = 5;
    int swap;
    swap = a;
    a = b;
    b = swap;
    return 0;
}
and the version without third variable:
int main () {
    int a = 9;
    int b = 5;
    a ^= b;
    b ^= a;
    a ^= b;
    return 0;
}
I generated the assembly code using clang and got this for the first version (that uses a third variable):
...
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -8(%rbp), %ecx
movl %ecx, -16(%rbp)
movl -12(%rbp), %ecx
movl %ecx, -8(%rbp)
movl -16(%rbp), %ecx
movl %ecx, -12(%rbp)
popq %rbp
ret
Leh_func_end0:
...
and this for the second version (that does not use a third variable):
...
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
movl -8(%rbp), %ecx
movl -12(%rbp), %edx
xorl %ecx, %edx
movl %edx, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
popq %rbp
ret
Leh_func_end0:
...
The second one is longer, but I don't know much about assembly code, so I have no idea whether that means it is slower. I'd like to hear the opinion of someone more knowledgeable about it.
Which of the above versions of a variable swap is faster and takes less memory?
Look at some optimised assembly. From
void swap_temp(int *restrict a, int *restrict b){
    int temp = *a;
    *a = *b;
    *b = temp;
}

void swap_xor(int *restrict a, int *restrict b){
    *a ^= *b;
    *b ^= *a;
    *a ^= *b;
}
gcc -O3 -std=c99 -S -o swapping.s swapping.c produced
.file "swapping.c"
.text
.p2align 4,,15
.globl swap_temp
.type swap_temp, #function
swap_temp:
.LFB0:
.cfi_startproc
movl (%rdi), %eax
movl (%rsi), %edx
movl %edx, (%rdi)
movl %eax, (%rsi)
ret
.cfi_endproc
.LFE0:
.size swap_temp, .-swap_temp
.p2align 4,,15
.globl swap_xor
.type swap_xor, #function
swap_xor:
.LFB1:
.cfi_startproc
movl (%rsi), %edx
movl (%rdi), %eax
xorl %edx, %eax
xorl %eax, %edx
xorl %edx, %eax
movl %edx, (%rsi)
movl %eax, (%rdi)
ret
.cfi_endproc
.LFE1:
.size swap_xor, .-swap_xor
.ident "GCC: (SUSE Linux) 4.5.1 20101208 [gcc-4_5-branch revision 167585]"
.section .comment.SUSE.OPTs,"MS",#progbits,1
.string "Ospwg"
.section .note.GNU-stack,"",#progbits
To me, swap_temp looks as efficient as can be.
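One side note on the restrict qualifiers in the two functions above: they promise the compiler that a and b never alias. The XOR version in particular only gives a correct swap when the two objects are distinct; below is a small sketch of the pitfall when they are not (swap_xor_unchecked is a hypothetical variant with the qualifier dropped):
#include <stdio.h>

/* Hypothetical variant of swap_xor without restrict, so a == b is allowed. */
static void swap_xor_unchecked(int *a, int *b)
{
    *a ^= *b;   /* if a == b, this zeroes the shared object... */
    *b ^= *a;   /* ...and the remaining XORs just keep it at zero */
    *a ^= *b;
}

int main(void)
{
    int x = 9;
    swap_xor_unchecked(&x, &x);   /* "swap" a variable with itself */
    printf("%d\n", x);            /* prints 0, not 9 */
    return 0;
}
The temp version has no such failure mode: swapping an object with itself simply writes the same value back.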
The problem with the XOR swap trick is that it is strictly sequential: each XOR depends on the result of the previous one. It may seem deceptively fast, but in reality it is not. There's an instruction called XCHG that swaps two registers, but it can also be slower than simply using 3 MOVs (and with a memory operand it is atomic, which makes it slower still). The common technique with a temp variable is an excellent choice ;)
To get an idea of the cost, imagine that every instruction has a cost to execute and that indirect addressing has a cost of its own.
movl -12(%rbp), %ecx
This line will need something like one time unit for accessing the %ecx register, one time unit for accessing %rbp, another one for applying the offset (-12), and a few more time units (let's say, arbitrarily, 3) for moving the value at address -12(%rbp) into %ecx.
If you count the operations on every line across both listings, the second method is for sure costlier than the first. Counting just the swap portion of the two unoptimized listings above: the temp version needs 6 movs (3 loads and 3 stores), while the XOR version needs 9 movs plus 3 xors, and each extra load and store pays the memory-access cost described here.
I have the following recursive function to count all the nodes having the value 20 in a circular doubly linked list. I need to convert this to a tail-recursive function to avoid stack-safety issues. Please help me with this. Thanks
int count(node *start)
{
    return count_helper(start, start);
}

int count_helper(node *current, node *start)
{
    int c;
    c = 0;
    if(current == NULL)
        return 0;
    if((current->roll_no) == 20)
        c = 1;
    if(current->next == start)
        return c;
    return (c + count_helper(current->next, start));
}
In order to take advantage of tail recursion, the recursive call simply has to be the last thing the function performs. Currently, the only thing standing in the way of that goal is an addition, so to transform the function that addition has to be moved out of the way. A common way to accomplish this is to pass the running count c as an extra parameter (an accumulator) to the recursive helper function, like so:
int count(node *start)
{
    return count_helper(start, start, 0);
}

int count_helper(node *current, node *start, int c)
{
    if(current == NULL)
        return c;
    if((current->roll_no) == 20)
        c += 1;
    if(current->next == start)
        return c;
    return count_helper(current->next, start, c);
}
With the accumulator in place, the tail call gets turned into a plain loop (using gcc 4.6.1, as produced by gcc -S -O2):
count_helper:
.LFB23:
.cfi_startproc
pushl %ebx
.cfi_def_cfa_offset 8
.cfi_offset 3, -8
movl 8(%esp), %edx
movl 12(%esp), %ebx
movl 16(%esp), %eax
testl %edx, %edx
jne .L15
jmp .L10
.p2align 4,,7
.p2align 3
.L14:
testl %edx, %edx
je .L10
.L15:
xorl %ecx, %ecx
cmpl $20, 4(%edx)
movl (%edx), %edx
sete %cl
addl %ecx, %eax
cmpl %ebx, %edx
jne .L14 # <-- this is the key line right here
.L10:
popl %ebx
.cfi_def_cfa_offset 4
.cfi_restore 3
ret
.cfi_endproc
Compare this to your original (compiled without -O2, since with optimization the compiler apparently finds a way to make your original tail-recursive as well, although in the process it mucks it up so much that I can barely read it):
count_helper:
.LFB1:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $40, %esp
movl $0, -12(%ebp)
cmpl $0, 8(%ebp)
jne .L3
movl $0, %eax
jmp .L4
.L3:
movl 8(%ebp), %eax
movl 4(%eax), %eax
cmpl $20, %eax
jne .L5
movl $1, -12(%ebp)
.L5:
movl 8(%ebp), %eax
movl (%eax), %eax
cmpl 12(%ebp), %eax
jne .L6
movl -12(%ebp), %eax
jmp .L4
.L6:
movl 8(%ebp), %eax
movl (%eax), %eax
movl 12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call count_helper # <-- this is the key line right here
addl -12(%ebp), %eax
.L4:
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
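For completeness, here is a small sketch of how count could be exercised. The node type below is an assumption (the original post never shows it); only the next and roll_no fields that count_helper reads actually matter, and the sketch assumes the count/count_helper definitions above are compiled into the same program:
#include <stdio.h>

/* Assumed node layout; count_helper only touches next and roll_no. */
typedef struct node {
    struct node *next;
    int roll_no;
    struct node *prev;
} node;

int count(node *start);   /* defined above, together with count_helper */

int main(void)
{
    /* Build a three-node circular doubly linked list: 20 <-> 7 <-> 20 */
    node a, b, c;
    a.roll_no = 20; b.roll_no = 7; c.roll_no = 20;
    a.next = &b; b.next = &c; c.next = &a;
    a.prev = &c; b.prev = &a; c.prev = &b;

    printf("%d\n", count(&a));   /* prints 2: two nodes carry the value 20 */
    return 0;
}
Because the accumulator version is a true tail call, gcc at -O2 compiles the traversal into the loop shown earlier, so even a long list will not grow the stack.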