I have some code in C that needs to be translated to x86 assembly.
Here is the c code:
/* Read a single byte from f through fr() and return it as an int; return -1 when fr() yields a value <= 0. */
int rb (FILE *f){
int s; /* return value of fr() */
char c; /* one-byte buffer handed to fr() */
s = fr(&c, 1, 1, f); /* fr is not shown here; presumably an fread-style reader — confirm */
if (s <= 0) return -1;
/* NOTE(review): if plain char is signed, a byte >= 0x80 converts to a negative int, and 0xFF becomes -1, the same value as the failure return. */
return (int)c;
}
So far I got to this assembly code that gives me Segmentation fault:
# rb: attempted 32-bit (stack-argument, AT&T syntax) translation of the C rb(), kept exactly as posted.
# NOTE(review): this version crashes; the notes below mark the reasons.
rb:
pushl %ebp
movl %esp,%ebp # frame: 0(%ebp)=saved ebp, 4(%ebp)=return address, 8(%ebp)=f
pushl 8(%ebp) # 4th argument of fr: f
pushl $1 # 3rd argument: 1
pushl $1 # 2nd argument: 1
# NOTE(review): 12(%ebp) lies above the only argument (f), i.e. in the caller's frame; no stack space was ever reserved for the local c.
leal 12(%ebp), %eax
pushl %eax # 1st argument: the address computed above
call fr
# NOTE(review): no test/cmp of %eax follows the call, so jz/jns branch on whatever flags fr happened to leave.
jz ng
jns ex
ng:
# NOTE(review): this pushes the constant 1 on the stack and negates fr's return value; it does not load -1 into %eax.
pushl $1
negl %eax
ex:
# NOTE(review): the words pushed above are still on the stack here, so popl restores %ebp from one of them and ret jumps to the next one instead of the real return address — the likely cause of the segmentation fault.
# NOTE(review): the byte stored by fr is never loaded into %eax on the success path.
popl %ebp
ret
Can anyone help me to solve this? :)
Both GCC and Clang can generate the assembler for you. It might not always be easy to read, but this is how to do it:
Make the snippet you want to inspect compilable with no errors. Note, I've changed your example to take a pointer to an integer as an argument because in your example you were declaring a char on the stack and then returning it, i.e. Undefined Behaviour.
Create a file called foo.c with this in it:
#include <stdio.h>
/* fr is defined elsewhere; its prototype has the same shape as fread's. */
extern size_t fr(void *restrict ptr, size_t size, size_t nitems, FILE *restrict stream);
/* Call fr() for one 1-byte item into *c; return -1 if the result is <= 0, otherwise return *c. */
int rb (FILE *f, int *c){
int s; /* fr's size_t result converted to int */
s = fr(c, 1, 1, f);
if (s <= 0) return -1;
/* NOTE(review): only one byte was requested, yet a whole int is read back here; presumably the remaining bytes keep whatever the caller stored — confirm. */
return *c;
}
Compile it using the S flag to gcc ie
gcc-5 -S -O0 -Wall -pedantic -std=c11 foo.c
Then open the resulting file foo.s:
# Unoptimized compiler output for rb(FILE *f, int *c), x86-64 AT&T syntax; f arrives in %rdi and c in %rsi.
.text
.globl _rb
_rb:
LFB1:
pushq %rbp
LCFI0:
movq %rsp, %rbp
LCFI1:
subq $32, %rsp # reserve 32 bytes for locals
movq %rdi, -24(%rbp) # spill f
movq %rsi, -32(%rbp) # spill c
movq -24(%rbp), %rdx
movq -32(%rbp), %rax
movq %rdx, %rcx # 4th argument of fr: f
movl $1, %edx # 3rd argument: 1
movl $1, %esi # 2nd argument: 1
movq %rax, %rdi # 1st argument: c
call _fr
movl %eax, -4(%rbp) # s = result of fr
cmpl $0, -4(%rbp)
jg L2 # s > 0: go return *c
movl $-1, %eax # otherwise return -1
jmp L3
L2:
movq -32(%rbp), %rax
movl (%rax), %eax # return value = *c
L3:
leave
LCFI2:
ret
LFE1:
.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
....
.... snipped
....
Now you have the assembler on x86 for the code you wanted. Note, you can play around with various options to change the output in particular the optimization levels will drastically change the output.
Related
I'm trying to make a write function in x64 that I can call in a C file.
I have the following files
write.s
// write(fd, buf, len): issue system call number 1 (write) directly.
// NOTE(review): as written the call receives corrupted arguments — see the notes below.
.text
.globl write
write:
// stack thing
pushq %rbp
movq %rsp, %rbp
// function arguments as done by the C convention
// NOTE(review): each value is stored with a 32-bit movl into slots only 4 bytes apart; buf is a pointer, so movl keeps just its low 32 bits.
movl %edi, -4(%rbp) // fd
movl %esi, -8(%rbp) // buf
movl %edx, -12(%rbp) // length
// write
movq $1, %rax // syscall 1 for write
// NOTE(review): the reloads below are 64-bit movq, so each one also picks up the 4 bytes stored right above its slot (part of the saved %rbp, fd, and buf respectively), leaving wrong values in %rdi, %rsi and %rdx.
movq -4(%rbp), %rdi // fd to rdi
movq -8(%rbp), %rsi // buf to rsi
movq -12(%rbp), %rdx // len to rdx
// NOTE(review): fd, buf and len already arrive in %rdi, %rsi and %rdx, the same registers the syscall reads, so the spill/reload is not needed at all.
syscall
// return
movq %rbp, %rsp
popq %rbp
ret
write.h
void write(int fd, char *buf, int len);
main.c
#include "write.h"
int main() {
int fd = 1;
char *buf = "hi";
int len = 2;
write(fd, buf, len);
return 0;
}
The problem is that when I compile this with gcc -no-pie -o main write.s main.c
and run ./main it doesn't output anything.
I'm sorry if this is some obvious mistake, as I am not that familiar with x64 assembly.
Taking the following C code
#include <stdio.h>
/* Print the first size bytes of buffer, one character per printf call. */
void test(unsigned char buffer[], int size) {
for (int i = 0; i < size; i++) {
unsigned char data = buffer[i];
printf("%c", data);
}
}
void main() {
unsigned char buffer[5] = "Hello";
test(buffer, 5);
return;
}
and compiling it with the flags -fno-stack-protector -fno-asynchronous-unwind-tables -fno-unroll-loops for clarity produces the following assembly for the test() function:
# void test(unsigned char buffer[], int size) — x86-64 System V, AT&T syntax.
# In: %rdi = buffer, %esi = size. %rbx walks the buffer, %rbp holds the end pointer.
test:
testl %esi, %esi
jle .L6 # size <= 0: nothing to print
pushq %rbp
leal -1(%rsi), %eax # eax = size - 1 (zero-extended into rax)
pushq %rbx
leaq 1(%rdi,%rax), %rbp # rbp = buffer + size (one past the last byte)
movq %rdi, %rbx # rbx = current position
subq $8, %rsp # keeps the stack 16-byte aligned at the call
.p2align 4,,10
.p2align 3
.L3: # loop top: entered by fall-through the first time, by jne afterwards
movzbl (%rbx), %edi # argument = *rbx, zero-extended
addq $1, %rbx
call putchar@PLT # printf("%c", data) was replaced by putchar(data)
cmpq %rbp, %rbx
jne .L3 # repeat until rbx reaches the end pointer
addq $8, %rsp
popq %rbx
popq %rbp
ret
.p2align 4,,10
.p2align 3
.L6:
ret
.size test, .-test
.section .text.startup,"ax",@progbits
.p2align 4
It seems to me like the L3 label here is completely useless since it is never jumped to or entered. (Except by jne .L3, but that instruction is inside of the L3 label already).
Can anyone explain how and why this assembly still produces the expected effect?
If you read the assembler code from the top you will see that it reaches .L3, plus it also jumps to it with jne .L3, which is your for loop in C.
This simple c:
#include <stdio.h>
#include <string.h>
/* Adds a and b, copies the sum into a local one-element array, and returns that array's address. */
int *add(int a, int b){
int ar[1]; /* automatic storage local to add() */
int result = a+b;
memcpy(ar, &result, sizeof(int));
/* NOTE(review): the returned pointer refers to ar, whose lifetime ends at this return. */
return ar;
}
/* Prints element 0 of whatever add(1,2) returns. */
int main(){
int a = add(1,2)[0]; /* dereferences the pointer returned by add() */
printf("%i\n",a);
}
is compiled into this:
# int *add(int a, int b) — compiler output, x86-64 AT&T syntax.
# In: %edi = a, %esi = b. Out: %eax = 0 (a null pointer is returned instead of the local's address).
# The 4-byte memcpy appears as the load/store pair through %eax below; no call is emitted.
.text
.globl add
.type add, @function
add:
pushq %rbp #
movq %rsp, %rbp #,
movl %edi, -20(%rbp) # a, a
movl %esi, -24(%rbp) # b, b
# a.c:5: int result = a+b;
movl -20(%rbp), %edx # a, tmp91
movl -24(%rbp), %eax # b, tmp92
addl %edx, %eax # tmp91, _1
# a.c:5: int result = a+b;
movl %eax, -8(%rbp) # _1, result
# a.c:6: memcpy(ar, &result, sizeof(int)); ---I SEE NO CALL INSTRUCTION---
movl -8(%rbp), %eax # MEM[(char * {ref-all})&result], _6
movl %eax, -4(%rbp) # _6, MEM[(char * {ref-all})&ar]
# a.c:7: return ar;
movl $0, %eax #--THE FUNCTION SHOULD RETURN ADDRESS OF ARRAY, NOT 0. OTHERWISE command terminated
# lea -4(%rbp), %rax #--ONLY THIS IS CORRECT, NOT `0`
# a.c:8: }
popq %rbp #
ret
.size add, .-add
# int main() — compiler output, x86-64 AT&T syntax.
# Calls add(1, 2), loads an int through the returned pointer and prints it with printf("%i\n", a).
.section .rodata
.LC0:
.string "%i\n"
.text
.globl main
.type main, @function
main:
pushq %rbp #
movq %rsp, %rbp #,
subq $16, %rsp #,
# a.c:11: int a = add(1,2)[0];
movl $2, %esi #,
movl $1, %edi #,
call add #
# a.c:11: int a = add(1,2)[0];
# the pointer returned in %rax is dereferenced here
movl (%rax), %eax # *_1, tmp90
movl %eax, -4(%rbp) # tmp90, a
# a.c:12: printf("%i\n",a);
movl -4(%rbp), %eax # a, tmp91
movl %eax, %esi # tmp91,
leaq .LC0(%rip), %rdi #,
# %al = 0: no vector registers are used by this variadic call
movl $0, %eax #,
call printf@PLT #
movl $0, %eax #, _6
# a.c:13: }
leave
ret
.size main, .-main
.ident "GCC: (Debian 8.3.0-6) 8.3.0"
.section .note.GNU-stack,"",@progbits
Every function from stdlib like printf or puts is called from GOT (i.e. %rip register holds the address of GOT). But not memcpy, it is like "assembly inline instructions" instead of regular call address. So is memcpy even a symbol? If so, why is it not an argument to call? Is memcpy in the GOT table? If so, what is the offset from GOT to that symbol?
So first off, you have a bug:
$ cc -O2 -S test.c
test.c: In function ‘add’:
test.c:7:12: warning: function returns address of local variable
Returning the address of a local variable has undefined behavior, if and only if the caller uses that value; this is why your compiler generated code that returned a null pointer, which will crash the program if used but be harmless otherwise. In fact, my copy of GCC generates only this for add:
# add reduced to returning zero: %eax is cleared and nothing else remains.
add:
xorl %eax, %eax
ret
because that treatment of the return value makes the other operations in add be dead code.
(The "only if used" restriction is also why my compiler generates a warning, not a hard error.)
Now, if I modify your program to have well-defined behavior, e.g.
#include <stdio.h>
#include <string.h>
/* Stores a + b into *sum by copying sizeof(int) bytes from a local with memcpy. */
void add(int *sum, int a, int b)
{
int result = a+b;
memcpy(sum, &result, sizeof(int));
}
/* Computes 1 + 2 into a through add() and prints it. */
int main(void)
{
int a; /* written by add() before it is read */
add(&a, 1, 2);
printf("%i\n",a);
return 0;
}
then I do indeed see assembly code in which the memcpy call has been replaced by inline code:
# void add(int *sum, int a, int b): sum in %rdi, a in %esi, b in %edx.
add:
addl %edx, %esi # esi = a + b
movl %esi, (%rdi) # *sum = a + b (the memcpy, done as one 4-byte store)
ret
This is a feature of many modern C compilers: they know what some of the C library's functions do, and can inline them when that makes sense. (You can see that in this case the generated code is both smaller and faster than it would have been with an actual call to memcpy.)
GCC lets me turn this feature off with a command-line option:
$ gcc -O2 -ffreestanding test.c
$ sed -ne '/^add:/,/cfi_endproc/{; /^\.LF[BE]/d; /\.cfi_/d; p; }' test.s
# void add(int *sum, int a, int b) with the memcpy kept as a real call.
# In: %rdi = sum (passed through unchanged as memcpy's 1st argument), %esi = a, %edx = b.
add:
subq $24, %rsp # room for the local result; keeps the stack aligned at the call
addl %edx, %esi # esi = a + b
movl $4, %edx # 3rd argument: sizeof(int)
movl %esi, 12(%rsp) # result = a + b
leaq 12(%rsp), %rsi # 2nd argument: &result
call memcpy@PLT
addq $24, %rsp
ret
In this mode, the call to memcpy in add is treated the same as the call to printf in main. Your compiler may have similar options.
I have code working with linked lists. I use tail calls. Unfortunately, GCC does not optimise the calls.
Here is C code of the function that recursively calculates length of the linked list:
/* Length of the list: starts the recursive count at 0. ll_length_rec is defined below, so it presumably has a prototype in a header (not shown). */
size_t ll_length(const ll_t* list) {
return ll_length_rec(list, 0);
}
/* Returns size_so_far plus the number of nodes reachable from list through ->next; the recursive call is in tail position. */
size_t ll_length_rec(const ll_t* list, size_t size_so_far)
{
if (list) {
return ll_length_rec(list->next, size_so_far + 1);
} else {
return size_so_far; /* end of list: the accumulator is the answer */
}
}
and here is the assembler code:
.globl _ll_length_rec
_ll_length_rec:
LFB8:
.loc 1 47 0
pushq %rbp
LCFI6:
movq %rsp, %rbp
LCFI7:
subq $32, %rsp
LCFI8:
movq %rdi, -8(%rbp)
movq %rsi, -16(%rbp)
.loc 1 48 0
cmpq $0, -8(%rbp)
je L8
.loc 1 49 0
movq -16(%rbp), %rsi
incq %rsi
movq -8(%rbp), %rax
movq 8(%rax), %rdi
call _ll_length_rec # < THIS SHOUD BE OPTIMIZED
movq %rax, -24(%rbp)
jmp L10
If GCC would optimize it, there would be no call in the asm. I compile it with:
gcc -S -fnested-functions -foptimize-sibling-calls \
-03 -g -Wall -o llist llist.c
and GCC version is:
i686-apple-darwin10-gcc-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5666) (dot 3)
If I add -O3 to your compilation line, it does not seem to generate the offending call, while without it, I get the unoptimised call. I don't know all gcc options in my head, but is -03 a typo for -O3 or intentional?
Ltmp2:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
jmp LBB1_1
.align 4, 0x90
LBB1_3:
addq $2, %rsi
Ltmp3:
movq (%rax), %rdi
Ltmp4:
LBB1_1:
Ltmp5:
testq %rdi, %rdi
je LBB1_5
Ltmp6:
movq (%rdi), %rax
testq %rax, %rax
jne LBB1_3
incq %rsi
LBB1_5:
movq %rsi, %rax
Ltmp7:
Ltmp8:
popq %rbp
ret
Most likely because neither of your functions are declared as static, which means that the symbols must be visible to the linker in case any other compilation units need them at link time. Try to compile with the -fwhole-program flag and see what happens.
Probably depends on the version of GCC and specific build. This is what I get from GCC 3.4.4 on Windows starting from -O2 and up
# size_t ll_length_rec(const ll_t* list, size_t size_so_far), 32-bit, arguments on the stack.
# The recursive call has become a loop: %edx = list, %eax = size_so_far.
.globl _ll_length_rec
.def _ll_length_rec; .scl 2; .type 32; .endef
_ll_length_rec:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %edx # edx = list
movl 12(%ebp), %eax # eax = size_so_far
jmp L3 # test the pointer before the first iteration
.p2align 4,,7
L6:
movl (%edx), %edx # list = pointer loaded from offset 0 of the node (presumably list->next)
incl %eax # size_so_far + 1
L3:
testl %edx, %edx
jne L6 # list != NULL: keep walking
popl %ebp
ret # result is in %eax
I guess you all heard of the 'swap problem'; SO is full of questions about it.
The version of the swap without use of a third variable is often considered to be faster since, well, you have one variable less. I wanted to know what was going on behind the curtains and wrote the following two programs:
/* Swaps a and b through a temporary; the values are never used afterwards. */
int main () {
int a = 9;
int b = 5;
int swap; /* temporary holding the old a */
swap = a;
a = b;
b = swap;
return 0;
}
and the version without third variable:
/* Swaps a and b with three XORs instead of a temporary; the values are never used afterwards. */
int main () {
int a = 9;
int b = 5;
a ^= b; /* a = a0 ^ b0 */
b ^= a; /* b = a0 */
a ^= b; /* a = b0 */
return 0;
}
I generated the assembly code using clang and got this for the first version (that uses a third variable):
...
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -8(%rbp), %ecx
movl %ecx, -16(%rbp)
movl -12(%rbp), %ecx
movl %ecx, -8(%rbp)
movl -16(%rbp), %ecx
movl %ecx, -12(%rbp)
popq %rbp
ret
Leh_func_end0:
...
and this for the second version (that does not use a third variable):
...
Ltmp0:
movq %rsp, %rbp
Ltmp1:
movl $0, %eax
movl $0, -4(%rbp)
movl $9, -8(%rbp)
movl $5, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
movl -8(%rbp), %ecx
movl -12(%rbp), %edx
xorl %ecx, %edx
movl %edx, -12(%rbp)
movl -12(%rbp), %ecx
movl -8(%rbp), %edx
xorl %ecx, %edx
movl %edx, -8(%rbp)
popq %rbp
ret
Leh_func_end0:
...
The second one is longer, but I don't know much about assembly code, so I have no idea if that means that it is slower; I'd like to hear the opinion of someone more knowledgeable about it.
Which of the above versions of a variable swap is faster and takes less memory?
Look at some optimised assembly. From
/* Exchanges *a and *b through a temporary. restrict declares that a and b do not alias. */
void swap_temp(int *restrict a, int *restrict b){
int temp = *a;
*a = *b;
*b = temp;
}
/* Exchanges *a and *b with three XORs. If a and b referred to the same int the first XOR would zero it; restrict declares that they do not alias. */
void swap_xor(int *restrict a, int *restrict b){
*a ^= *b;
*b ^= *a;
*a ^= *b;
}
gcc -O3 -std=c99 -S -o swapping.s swapping.c produced
# void swap_temp(int *restrict a, int *restrict b) — x86-64 AT&T syntax.
# In: %rdi = a, %rsi = b. Two loads followed by two stores; %eax and %edx are the only scratch registers.
.file "swapping.c"
.text
.p2align 4,,15
.globl swap_temp
.type swap_temp, @function
swap_temp:
.LFB0:
.cfi_startproc
movl (%rdi), %eax # eax = *a
movl (%rsi), %edx # edx = *b
movl %edx, (%rdi) # *a = old *b
movl %eax, (%rsi) # *b = old *a
ret
.cfi_endproc
.LFE0:
.size swap_temp, .-swap_temp
# void swap_xor(int *restrict a, int *restrict b) — x86-64 AT&T syntax.
# In: %rdi = a, %rsi = b. Both values are loaded, swapped in registers by three dependent XORs, then stored back.
.p2align 4,,15
.globl swap_xor
.type swap_xor, @function
swap_xor:
.LFB1:
.cfi_startproc
movl (%rsi), %edx # edx = *b
movl (%rdi), %eax # eax = *a
xorl %edx, %eax # eax = a ^ b
xorl %eax, %edx # edx = old a
xorl %edx, %eax # eax = old b
movl %edx, (%rsi) # *b = old a
movl %eax, (%rdi) # *a = old b
ret
.cfi_endproc
.LFE1:
.size swap_xor, .-swap_xor
.ident "GCC: (SUSE Linux) 4.5.1 20101208 [gcc-4_5-branch revision 167585]"
.section .comment.SUSE.OPTs,"MS",@progbits,1
.string "Ospwg"
.section .note.GNU-stack,"",@progbits
To me, swap_temp looks as efficient as can be.
The problem with the XOR swap trick is that it's strictly sequential: each XOR depends on the result of the previous one. It may seem deceptively fast, but in reality, it is not. There's an instruction called XCHG that swaps two registers, but this can also be slower than simply using 3 MOVs; with a memory operand it is implicitly locked (atomic), which makes it slower still. The common technique with temp is an excellent choice ;)
To get an idea of the cost imagine that every command has a cost to be performed and also the indirect addressing has its own cost.
movl -12(%rbp), %ecx
This line (a load: in AT&T syntax the source comes first and the destination last) will need something like a time unit for accessing the ecx register,
one time unit for accessing rbp, another one for applying the offset (-12) and more time
units (let's say arbitrarily 3) for moving the value from the memory address -12(%rbp)
into the ecx register.
If you count all the operations in every line and all line, the second method is for sure costlier than the first one.