Confusion with system call - c

I am trying to understand how a system call is made in x86. I am reading Smashing the stack for fun and profit. There is a function given on page 7:
#include <stdio.h>
void main() {
char *name[2];
name[0] = "/bin/sh";
name[1] = NULL;
execve(name[0], name, NULL);
}
and below the function is given its assembly dump:
Dump of assembler code for function main:
0x8000130 : pushl %ebp
0x8000131 : movl %esp,%ebp
0x8000133 : subl $0x8,%esp
0x8000136 : movl $0x80027b8,0xfffffff8(%ebp)
0x800013d : movl $0x0,0xfffffffc(%ebp)
0x8000144 : pushl $0x0
0x8000146 : leal 0xfffffff8(%ebp),%eax
0x8000149 : pushl %eax
0x800014a : movl 0xfffffff8(%ebp),%eax
0x800014d : pushl %eax
0x800014e : call 0x80002bc <__execve>
0x8000153 : addl $0xc,%esp
0x8000156 : movl %ebp,%esp
0x8000158 : popl %ebp
0x8000159 : ret
Dump of assembler code for function __execve:
0x80002bc <__execve>: pushl %ebp
0x80002bd <__execve+1>: movl %esp,%ebp
0x80002bf <__execve+3>: pushl %ebx
0x80002c0 <__execve+4>: movl $0xb,%eax
0x80002c5 <__execve+9>: movl 0x8(%ebp),%ebx
0x80002c8 <__execve+12>: movl 0xc(%ebp),%ecx
0x80002cb <__execve+15>: movl 0x10(%ebp),%edx
0x80002ce <__execve+18>: int $0x80
0x80002d0 <__execve+20>: movl %eax,%edx
0x80002d2 <__execve+22>: testl %edx,%edx
0x80002d4 <__execve+24>: jnl 0x80002e6 <__execve+42>
0x80002d6 <__execve+26>: negl %edx
0x80002d8 <__execve+28>: pushl %edx
0x80002d9 <__execve+29>: call 0x8001a34 <__normal_errno_location>
0x80002de <__execve+34>: popl %edx
0x80002df <__execve+35>: movl %edx,(%eax)
0x80002e1 <__execve+37>: movl $0xffffffff,%eax
0x80002e6 <__execve+42>: popl %ebx
0x80002e7 <__execve+43>: movl %ebp,%esp
0x80002e9 <__execve+45>: popl %ebp
0x80002ea <__execve+46>: ret
0x80002eb <__execve+47>: nop
However on writing the same code on my machine and compiling with
gcc test.c -m32 -g -o test -fno-stack-protector -static
and generating the dump with
objdump -S test > test.dis
I get the following dump for main:
void main(){
8048e24: 55 push %ebp
8048e25: 89 e5 mov %esp,%ebp
8048e27: 83 e4 f0 and $0xfffffff0,%esp
8048e2a: 83 ec 20 sub $0x20,%esp
char *name[2];
name[0] = "/bin/sh";
8048e2d: c7 44 24 18 e8 de 0b movl $0x80bdee8,0x18(%esp)
8048e34: 08
name[1] = NULL;
8048e35: c7 44 24 1c 00 00 00 movl $0x0,0x1c(%esp)
8048e3c: 00
execve(name[0], name, NULL);
8048e3d: 8b 44 24 18 mov 0x18(%esp),%eax
8048e41: c7 44 24 08 00 00 00 movl $0x0,0x8(%esp)
8048e48: 00
8048e49: 8d 54 24 18 lea 0x18(%esp),%edx
8048e4d: 89 54 24 04 mov %edx,0x4(%esp)
8048e51: 89 04 24 mov %eax,(%esp)
8048e54: e8 17 34 02 00 call 806c270 <__execve>
}
And for __execve:
0806c270 <__execve>:
806c270: 53 push %ebx
806c271: 8b 54 24 10 mov 0x10(%esp),%edx
806c275: 8b 4c 24 0c mov 0xc(%esp),%ecx
806c279: 8b 5c 24 08 mov 0x8(%esp),%ebx
806c27d: b8 0b 00 00 00 mov $0xb,%eax
806c282: ff 15 f0 99 0e 08 call *0x80e99f0
806c288: 3d 00 f0 ff ff cmp $0xfffff000,%eax
806c28d: 77 02 ja 806c291 <__execve+0x21>
806c28f: 5b pop %ebx
806c290: c3 ret
806c291: c7 c2 e8 ff ff ff mov $0xffffffe8,%edx
806c297: f7 d8 neg %eax
806c299: 65 89 02 mov %eax,%gs:(%edx)
806c29c: 83 c8 ff or $0xffffffff,%eax
806c29f: 5b pop %ebx
806c2a0: c3 ret
806c2a1: 66 90 xchg %ax,%ax
806c2a3: 66 90 xchg %ax,%ax
806c2a5: 66 90 xchg %ax,%ax
806c2a7: 66 90 xchg %ax,%ax
806c2a9: 66 90 xchg %ax,%ax
806c2ab: 66 90 xchg %ax,%ax
806c2ad: 66 90 xchg %ax,%ax
806c2af: 90 nop
I understand that the article is very old so it may not match exactly with the current standards. In fact i am able make sense of most of the differences. Here is what is bothering me:
From what I know: to make the exec system call I need to put the arguments in specific registers and call the instruction
int 0x80
to send an interrupt. I can see this instruction at address 0x80002ce in the dump given in the article. But I cannot find the same instruction in mine. In place of it I find
call *0x80e99f0
and the address 0x80e99f0 doesn't even exists in my dump. What am I missing here? What is the point of a * before 0x80e99f0. Is the address 0x80e99f0 being dynamically loaded at runtime? If it is true then what is the use of -static flag during compilation and what can I do to make the dump similar to that of the article?
I am running 64 bit ubuntu 14.04 on Intel processor
Edit after getting suggestion to run objdump with -DS flag:
I finally get the hidden address:
080e99f0 <_dl_sysinfo>:
80e99f0: 70 ed jo 80e99df <_dl_load_lock+0x7>
80e99f2: 06 push %es
80e99f3: 08 b0 a6 09 08 07 or %dh,0x70809a6(%eax)
but still can't make any sense.
The address in jo 80e99df points again to something that is hidden in between these lines:
080e99d8 <_dl_load_lock>:
...
80e99e4: 01 00 add %eax,(%eax)
...
As evident from the answer the code actually jumps to the address present in memory location 0x80e99f0 which eventually points to int $0x80 instruction.

Traditionally, Linux used interrupt 0x80 to invoke system calls. Since the PentiumPro, there is an alternative way to invoke a system call: using the SYSENTER instruction (AMD also has its own SYSCALL instruction). This is a more efficient way to invoke a system call.
Choosing which syscall mechanism to use
The linux kernel and glibc have a mechanism to choose between the different ways to invoke a system call.
The kernel sets up a virtual shared library for each process, it's called the VDSO (virtual dynamic shared object), which you can see in the output of cat /proc/<pid>/maps:
$ cat /proc/self/maps
08048000-0804c000 r-xp 00000000 03:04 1553592 /bin/cat
0804c000-0804d000 rw-p 00003000 03:04 1553592 /bin/cat
[...]
b7ee8000-b7ee9000 r-xp b7ee8000 00:00 0 [vdso]
[...]
This vdso, among other things, contains an appropriate system call invocation sequence for the CPU in use, e.g:
ffffe414 <__kernel_vsyscall>:
ffffe414: 51 push %ecx ; \
ffffe415: 52 push %edx ; > save registers
ffffe416: 55 push %ebp ; /
ffffe417: 89 e5 mov %esp,%ebp ; save stack pointer
ffffe419: 0f 34 sysenter ; invoke system call
ffffe41b: 90 nop
ffffe41c: 90 nop ; the kernel will usually
ffffe41d: 90 nop ; return to the insn just
ffffe41e: 90 nop ; past the jmp, but if the
ffffe41f: 90 nop ; system call was interrupted
ffffe420: 90 nop ; and needs to be restarted
ffffe421: 90 nop ; it will return to this jmp
ffffe422: eb f3 jmp ffffe417 <__kernel_vsyscall+0x3>
ffffe424: 5d pop %ebp ; \
ffffe425: 5a pop %edx ; > restore registers
ffffe426: 59 pop %ecx ; /
ffffe427: c3 ret ; return to caller
In arch/x86/vdso/vdso32/ there are implementations using int 0x80, sysenter and syscall, the kernel selects the appropriate one.
To let userspace know that there is a vdso, and where it is located, the kernel sets AT_SYSINFO and AT_SYSINFO_EHDR entries in the auxiliary vector (auxv, the 4th argument to main(), after argc, argv, envp, which is used to pass some information from the kernel to newly started processes). AT_SYSINFO_EHDR points to the ELF header of the vdso, AT_SYSINFO points to the vsyscall implementation:
$ LD_SHOW_AUXV=1 id # tell the dynamic linker ld.so to output auxv values
AT_SYSINFO: 0xb7fd4414
AT_SYSINFO_EHDR: 0xb7fd4000
[...]
glibc uses this information to locate the vsyscall. It stores it into the dynamic loader global _dl_sysinfo, e.g.:
glibc-2.16.0/elf/dl-support.c:_dl_aux_init():
ifdef NEED_DL_SYSINFO
case AT_SYSINFO:
GL(dl_sysinfo) = av->a_un.a_val;
break;
#endif
#if defined NEED_DL_SYSINFO || defined NEED_DL_SYSINFO_DSO
case AT_SYSINFO_EHDR:
GL(dl_sysinfo_dso) = (void *) av->a_un.a_val;
break;
#endif
glibc-2.16.0/elf/dl-sysdep.c:_dl_sysdep_start()
glibc-2.16.0/elf/rtld.c:dl_main:
GLRO(dl_sysinfo) = GLRO(dl_sysinfo_dso)->e_entry + l->l_addr;
and in a field in the header of the TCB (thread control block):
glibc-2.16.0/nptl/sysdeps/i386/tls.h
_head->sysinfo = GLRO(dl_sysinfo)
If the kernel is old and doesn't provide a vdso, glibc provides a default implementation for _dl_sysinfo:
.hidden _dl_sysinfo_int80:
int $0x80
ret
When a program is compiled against glibc, depending on circumstances, a choice is made between different ways of invoking a system call:
glibc-2.16.0/sysdeps/unix/sysv/linux/i386/sysdep.h:
/* The original calling convention for system calls on Linux/i386 is
to use int $0x80. */
#ifdef I386_USE_SYSENTER
# ifdef SHARED
# define ENTER_KERNEL call *%gs:SYSINFO_OFFSET
# else
# define ENTER_KERNEL call *_dl_sysinfo
# endif
#else
# define ENTER_KERNEL int $0x80
#endif
int 0x80 ← the traditional way
call *%gs:offsetof(tcb_head_t, sysinfo) ← %gs points to the TCB, so this jumps indirectly through the pointer to vsyscall stored in the TCB
call *_dl_sysinfo ← this jumps indirectly through the global variable
So, in x86:
system call
↓
int 0x80 / call *%gs:0x10 / call *_dl_sysinfo
│ │
╰─┬──────────┼─────────╮
↓ ↓ ↓
(in vdso) int 0x80 / sysenter / syscall

Try to use objdump -DS or objdump -sS to include the address 0x80e99f0 in your dump.
Local example:
0806bf70 <__execve>:
...
806bf82: ff 15 10 a3 0e 08 call *0x80ea310
At address 0x80ea310 (shown with objdump -sS):
80ea310 10ea0608 60a60908 07000000 7f030000
10ea0608 is address 0x806ea10 little-endian in memory.
You will then see, that the address of _dl_sysinfo_int80 is located there:
0806ea10 <_dl_sysinfo_int80>:
806ea10: cd 80 int $0x80
806ea12: c3 ret
which calls the software interrupt 0x80 (executes the syscall) and returns to the caller then.
call *0x80ea310 is therefore really calling 0x806ea10 (dereferencing a pointer)

Related

How to convert an assembler program to shellcode correctly?

I programmed a program in nasm (x64) which should execute /bin/bash, and that works fine. Then i ran the program with objdump -D and i wrote down the machine code like this: \xbb\x68\x53\x48\xbb\x2f\x62\x69\x6e\x2f\x62\x61\x73\x53\x48\x89\xe7\x50\x57\x48\x89\xe6\xb0\x3b\x0f\x05. Then i ran this with ./shell $(python -c 'print "\xbb\x68\x53\x48\xbb\x2f\x62\x69\x6e\x2f\x62\x61\x73\x53\x48\x89\xe7\x50\x57\x48\x89\xe6\xb0\x3b\x0f\x05"') and i got an illegal instruction. But the assembler program worked fine! Can someone help?
shell.c:
int main(int argc, char **argv) {
int (*func)();
func = (int (*)()) argv[1];
(int)(*func)();
}
bash.asm:
section .text
global start
start:
mov rbx, 0x68
push rbx
mov rbx, 0x7361622f6e69622f
push rbx
mov rdi, rsp
push rax
push rdi
mov rsi, rsp
mov al, 59
syscall
objdump:
./bash: file format elf64-x86-64
Disassembly of section .text:
0000000000401000 <start>:
401000: bb 68 00 00 00 mov $0x68,%ebx
401005: 53 push %rbx
401006: 48 bb 2f 62 69 6e 2f movabs $0x7361622f6e69622f,%rbx
40100d: 62 61 73
401010: 53 push %rbx
401011: 48 89 e7 mov %rsp,%rdi
401014: 50 push %rax
401015: 57 push %rdi
401016: 48 89 e6 mov %rsp,%rsi
401019: b0 3b mov $0x3b,%al
40101b: 0f 05 syscall
You are omitting the zero bytes here:
\xbb\x68\x53\x48\xbb\x2f\x62\x69\x6e\x2f\x62\x61\x73\x53\x48\x89\xe7\x50\x57\x48\x89\xe6\xb0\x3b\x0f\x05
as opposed to
401000: bb 68 00 00 00 mov $0x68,%ebx
The zero bytes are part of the instructions and cannot be skipped. So you have to include them.
The problem is, however, that the zero bytes would terminate the argument string and hence have to be avoided. It is your duty as shellcode designer to construct it in a way, that it does not include byte values that may not occur. In many cases this means no zero bytes, because the shellcode is injected as a C string, but other values may be problematic in other situations, too.

Try to understand calling process in assembly code

I wrote a very simple program in C and try to understand the function calling process.
#include "stdio.h"
void Oh(unsigned x) {
printf("%u\n", x);
}
int main(int argc, char const *argv[])
{
Oh(0x67611c8c);
return 0;
}
And its assembly code seems to be
0000000100000f20 <_Oh>:
100000f20: 55 push %rbp
100000f21: 48 89 e5 mov %rsp,%rbp
100000f24: 48 83 ec 10 sub $0x10,%rsp
100000f28: 48 8d 05 6b 00 00 00 lea 0x6b(%rip),%rax # 100000f9a <_printf$stub+0x20>
100000f2f: 89 7d fc mov %edi,-0x4(%rbp)
100000f32: 8b 75 fc mov -0x4(%rbp),%esi
100000f35: 48 89 c7 mov %rax,%rdi
100000f38: b0 00 mov $0x0,%al
100000f3a: e8 3b 00 00 00 callq 100000f7a <_printf$stub>
100000f3f: 89 45 f8 mov %eax,-0x8(%rbp)
100000f42: 48 83 c4 10 add $0x10,%rsp
100000f46: 5d pop %rbp
100000f47: c3 retq
100000f48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
100000f4f: 00
0000000100000f50 <_main>:
100000f50: 55 push %rbp
100000f51: 48 89 e5 mov %rsp,%rbp
100000f54: 48 83 ec 10 sub $0x10,%rsp
100000f58: b8 8c 1c 61 67 mov $0x67611c8c,%eax
100000f5d: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
100000f64: 89 7d f8 mov %edi,-0x8(%rbp)
100000f67: 48 89 75 f0 mov %rsi,-0x10(%rbp)
100000f6b: 89 c7 mov %eax,%edi
100000f6d: e8 ae ff ff ff callq 100000f20 <_Oh>
100000f72: 31 c0 xor %eax,%eax
100000f74: 48 83 c4 10 add $0x10,%rsp
100000f78: 5d pop %rbp
100000f79: c3 retq
Well, I don't quite understand the argument passing process, since there is only one parameter passed to Oh function, I could under stand this
100000f58: b8 8c 1c 61 67 mov $0x67611c8c,%eax
So what does the the code below do? Why rbp? Isn't it abandoned in X86-64 assembly? If it is a x86 style assembly, how can I generate the x86-64 style assembly using clang? If it is x86, it doesn't matter, could any one explains the below code line by line for me?
100000f5d: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
100000f64: 89 7d f8 mov %edi,-0x8(%rbp)
100000f67: 48 89 75 f0 mov %rsi,-0x10(%rbp)
100000f6b: 89 c7 mov %eax,%edi
100000f6d: e8 ae ff ff ff callq 100000f20 <_Oh>
You might get cleaner code if you turned optimizations on, or you might not. But, here’s what that does.
The %rbp register is being used as a frame pointer, that is, a pointer to the original top of the stack. It’s saved on the stack, stored, and restored at the end. Far from being removed in x86_64, it was added there; the 32-bit equivalent was %ebp.
After this value is saved, the program allocates sixteen bytes off the stack by subtracting from the stack pointer.
There then is a very inefficient series of copies that sets the first argument of Oh() as the second argument of printf() and the constant address of the format string (relative to the instruction pointer) as the first argument of printf(). Remember that, in this calling convention, the first argument is passed in %rdi (or %edi for 32-bit operands) and the second in %rsi This could have been simplified to two instructions.
After calling printf(), the program (needlessly) saves the return value on the stack, restores the stack and frame pointers, and returns.
In main(), there’s similar code to set up the stack frame, then the program saves argc and argv (needlessly), then it moves around the constant argument to Oh into its first argument, by way of %eax. This could have been optimized into a single instruction. It then calls Oh(). On return, it sets its return value to 0, cleans up the stack, and returns.
The code you’re asking about does the following: stores the constant 32-bit value 0 on the stack, saves the 32-bit value argc on the stack, saves the 64-bit pointer argv on the stack (the first and second arguments to main()), and sets the first argument of the function it is about to call to %eax, which it had previously loaded with a constant. This is all unnecessary for this program, but would have been necessary had it needed to use argc and argv after the call, when those registers would have been clobbered. There’s no good reason it used two steps to load the constant instead of one.
As Jester mentions you still have frame pointers on (to aid debugging)so stepping through main:
0000000100000f50 <_main>:
First we enter a new stack frame, we have to save the base pointer and move the stack to the new base. Also, in x86_64 the stack frame has to be aligned to a 16 byte boundary (hence moving the stack pointer by 0x10).
100000f50: push %rbp
100000f51: mov %rsp,%rbp
100000f54: sub $0x10,%rsp
As you mention, x86_64 passes parameters by register, so load the param in to the register:
100000f58: mov $0x67611c8c,%eax
??? Help needed
100000f5d: movl $0x0,-0x4(%rbp)
From here: "Registers RBP, RBX, and R12-R15 are callee-save registers", so if we want to save other resisters then we have to do it ourselves ....
100000f64: mov %edi,-0x8(%rbp)
100000f67: mov %rsi,-0x10(%rbp)
Not really sure why we didn't just load this in %edi where it needs to be for the call to begin with, but we better move it there now.
100000f6b: mov %eax,%edi
Call the function:
100000f6d: callq 100000f20 <_Oh>
This is the return value (passed in %eax), xor is a smaller instruction than load 0, so is a cmmon optimization:
100000f72: xor %eax,%eax
Clean up that stack frame we added earlier (not really sure why we saved those registers on it when we didn't use them)
100000f74: add $0x10,%rsp
100000f78: pop %rbp
100000f79: retq

Why is gcc putting NOPs at the end of functions? [duplicate]

I've been working with C for a short while and very recently started to get into ASM. When I compile a program:
int main(void)
{
int a = 0;
a += 1;
return 0;
}
The objdump disassembly has the code, but nops after the ret:
...
08048394 <main>:
8048394: 55 push %ebp
8048395: 89 e5 mov %esp,%ebp
8048397: 83 ec 10 sub $0x10,%esp
804839a: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%ebp)
80483a1: 83 45 fc 01 addl $0x1,-0x4(%ebp)
80483a5: b8 00 00 00 00 mov $0x0,%eax
80483aa: c9 leave
80483ab: c3 ret
80483ac: 90 nop
80483ad: 90 nop
80483ae: 90 nop
80483af: 90 nop
...
From what I learned nops do nothing, and since after ret wouldn't even be executed.
My question is: why bother? Couldn't ELF(linux-x86) work with a .text section(+main) of any size?
I'd appreciate any help, just trying to learn.
First of all, gcc doesn't always do this. The padding is controlled by -falign-functions, which is automatically turned on by -O2 and -O3:
-falign-functions
-falign-functions=n
Align the start of functions to the next power-of-two greater than n, skipping up to n bytes. For instance,
-falign-functions=32 aligns functions to the next 32-byte boundary, but -falign-functions=24 would align to the next 32-byte boundary only
if this can be done by skipping 23 bytes or less.
-fno-align-functions and -falign-functions=1 are equivalent and mean that functions will not be aligned.
Some assemblers only support this flag when n is a power of two; in
that case, it is rounded up.
If n is not specified or is zero, use a machine-dependent default.
Enabled at levels -O2, -O3.
There could be multiple reasons for doing this, but the main one on x86 is probably this:
Most processors fetch instructions in aligned 16-byte or 32-byte blocks. It can be
advantageous to align critical loop entries and subroutine entries by 16 in order to minimize
the number of 16-byte boundaries in the code. Alternatively, make sure that there is no 16-byte boundary in the first few instructions after a critical loop entry or subroutine entry.
(Quoted from "Optimizing subroutines in assembly
language" by Agner Fog.)
edit: Here is an example that demonstrates the padding:
// align.c
int f(void) { return 0; }
int g(void) { return 0; }
When compiled using gcc 4.4.5 with default settings, I get:
align.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <f>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: b8 00 00 00 00 mov $0x0,%eax
9: c9 leaveq
a: c3 retq
000000000000000b <g>:
b: 55 push %rbp
c: 48 89 e5 mov %rsp,%rbp
f: b8 00 00 00 00 mov $0x0,%eax
14: c9 leaveq
15: c3 retq
Specifying -falign-functions gives:
align.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <f>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: b8 00 00 00 00 mov $0x0,%eax
9: c9 leaveq
a: c3 retq
b: eb 03 jmp 10 <g>
d: 90 nop
e: 90 nop
f: 90 nop
0000000000000010 <g>:
10: 55 push %rbp
11: 48 89 e5 mov %rsp,%rbp
14: b8 00 00 00 00 mov $0x0,%eax
19: c9 leaveq
1a: c3 retq
This is done to align the next function by 8, 16 or 32-byte boundary.
From “Optimizing subroutines in assembly language” by A.Fog:
11.5 Alignment of code
Most microprocessors fetch code in aligned 16-byte or 32-byte blocks. If an importantsubroutine entry or jump label happens to be near the end of a 16-byte block then themicroprocessor will only get a few useful bytes of code when fetching that block of code. Itmay have to fetch the next 16 bytes too before it can decode the first instructions after thelabel. This can be avoided by aligning important subroutine entries and loop entries by 16.
[...]
Aligning a subroutine entry is as simple as putting as many
NOP
's as needed before thesubroutine entry to make the address divisible by 8, 16, 32 or 64, as desired.
As far as I remember, instructions are pipelined in cpu and different cpu blocks (loader, decoder and such) process subsequent instructions. When RET instructions is being executed, few next instructions are already loaded into cpu pipeline. It's a guess, but you can start digging here and if you find out (maybe the specific number of NOPs that are safe, share your findings please.

How to get opcodes of a c program

I know how to get the assembly code of my program using gdb but how do I get the opcode?
I need it to hack a linux server (don't worry it's part of a class I'm having so no real server will be harmed). Actually I was reading this article and I'm wondering how can I get from assembly:
[aleph1]$ gcc -o shellcodeasm -g -ggdb shellcodeasm.c
[aleph1]$ gdb shellcodeasm
(gdb) disassemble main
Dump of assembler code for function main:
0x8000130 <main>: pushl %ebp
0x8000131 <main+1>: movl %esp,%ebp
0x8000133 <main+3>: jmp 0x800015f <main+47>
0x8000135 <main+5>: popl %esi
0x8000136 <main+6>: movl %esi,0x8(%esi)
0x8000139 <main+9>: movb $0x0,0x7(%esi)
0x800013d <main+13>: movl $0x0,0xc(%esi)
0x8000144 <main+20>: movl $0xb,%eax
0x8000149 <main+25>: movl %esi,%ebx
0x800014b <main+27>: leal 0x8(%esi),%ecx
0x800014e <main+30>: leal 0xc(%esi),%edx
0x8000151 <main+33>: int $0x80
0x8000153 <main+35>: movl $0x1,%eax
0x8000158 <main+40>: movl $0x0,%ebx
0x800015d <main+45>: int $0x80
0x800015f <main+47>: call 0x8000135 <main+5>
0x8000164 <main+52>: das
0x8000165 <main+53>: boundl 0x6e(%ecx),%ebp
0x8000168 <main+56>: das
0x8000169 <main+57>: jae 0x80001d3 <__new_exitfn+55>
0x800016b <main+59>: addb %cl,0x55c35dec(%ecx)
End of assembler dump.
the following:
testsc.c
------------------------------------------------------------------------------
char shellcode[] =
"\xeb\x2a\x5e\x89\x76\x08\xc6\x46\x07\x00\xc7\x46\x0c\x00\x00\x00"
"\x00\xb8\x0b\x00\x00\x00\x89\xf3\x8d\x4e\x08\x8d\x56\x0c\xcd\x80"
"\xb8\x01\x00\x00\x00\xbb\x00\x00\x00\x00\xcd\x80\xe8\xd1\xff\xff"
"\xff\x2f\x62\x69\x6e\x2f\x73\x68\x00\x89\xec\x5d\xc3";
The system is linux x86 and the language I will be using C. I'd really like an automated way, but a manual solution would work too.
I mean how do I convert %ebp, %esi, %esp etc.. Is there a map I can use? or an automated programm?
Here you go:
Disassembly of section .data:
00000000 <shellcode>:
0: eb 2a jmp 2c <shellcode+0x2c>
2: 5e pop %esi
3: 89 76 08 mov %esi,0x8(%esi)
6: c6 46 07 00 movb $0x0,0x7(%esi)
a: c7 46 0c 00 00 00 00 movl $0x0,0xc(%esi)
11: b8 0b 00 00 00 mov $0xb,%eax
16: 89 f3 mov %esi,%ebx
18: 8d 4e 08 lea 0x8(%esi),%ecx
1b: 8d 56 0c lea 0xc(%esi),%edx
1e: cd 80 int $0x80
20: b8 01 00 00 00 mov $0x1,%eax
25: bb 00 00 00 00 mov $0x0,%ebx
2a: cd 80 int $0x80
2c: e8 d1 ff ff ff call 2 <shellcode+0x2>
31: 2f das
32: 62 69 6e bound %ebp,0x6e(%ecx)
35: 2f das
36: 73 68 jae a0 <shellcode+0xa0>
38: 00 89 ec 5d c3 00 add %cl,0xc35dec(%ecx)
Note how the last 00 in that add %cl instruction comes from the string null terminator byte; it is not explicit.
How I got this was that I simply compiled your declaration with
gcc testsc.c -c
and then
objdump -D testsc.o
You can use:
gcc -S -c tst.c -o -
or
gcc -g -ggdb -c tst.c
objdump -S tst.o
to get the disassembly of your program with the opcodes.
To get the disassembly of your char array, you can use:
gcc -c tst.c
objdump -D -j .data tst.o
Found it! First disassemble then type :
x/bx hit enter and get one by one the hex representation of the assembly commands!
Create a small assembly file, say code.s. Then put the following inside:
.text
.byte 0xeb, 0x2a, 0x5e, ..
Assemble it with as code.s -o code.o and use objdump to disassemble the result.

GCC 4.3/4.4 vs MSC 6 on i386 optimization for size fail

I am not sure what am I doing wrong, but I've tried reading manuals about calling conventions of GCC and found nothing useful there. My current problem is GCC generates excessively LARGE code for a very simple operation, like shown below.
main.c:
#ifdef __GNUC__
// defines for GCC
typedef void (* push1)(unsigned long);
#define PUSH1(P,A0)((push1)P)((unsigned long)A0)
#else
// defines for MSC
typedef void (__stdcall * push1)(unsigned long);
#define PUSH1(P,A0)((push1)P)((unsigned long)A0)
#endif
int main() {
// pointer to nasm-linked exit syscall "function".
// will not work for win32 target, provided as an example.
PUSH1(0x08048200,0x7F);
}
Now, let's build and dump it with gcc: gcc -c main.c -Os;objdump -d main.o:
main.o: file format elf32-i386
Disassembly of section .text:
00000000 <.text>:
0: 8d 4c 24 04 lea 0x4(%esp),%ecx
4: 83 e4 f0 and $0xfffffff0,%esp
7: ff 71 fc pushl -0x4(%ecx)
a: b8 00 82 04 08 mov $0x8048200,%eax
f: 55 push %ebp
10: 89 e5 mov %esp,%ebp
12: 51 push %ecx
13: 83 ec 10 sub $0x10,%esp
16: 6a 7f push $0x7f
18: ff d0 call *%eax
1a: 8b 4d fc mov -0x4(%ebp),%ecx
1d: 83 c4 0c add $0xc,%esp
20: c9 leave
21: 8d 61 fc lea -0x4(%ecx),%esp
24: c3 ret
That's the minimum size code I am able to get... If I don't specify -O* or specify other values, it will be 0x29 + bytes long.
Now, let's build it with ms c compiler v 6 (yea, one of year 98 iirc): wine /mnt/ssd/msc/6/cl /c /TC main.c;wine /mnt/ssd/msc/6/dumpbin /disasm main.obj:
Dump of file main.obj
File Type: COFF OBJECT
_main:
00000000: 55 push ebp
00000001: 8B EC mov ebp,esp
00000003: 6A 7F push 7Fh
00000005: B8 00 82 04 08 mov eax,8048200h
0000000A: FF D0 call eax
0000000C: 5D pop ebp
0000000D: C3 ret
How do I make GCC generate the similar by size code? any hints, tips? Don't you agree resulting code should be small as that? Why does GCC append so much useless code? I thought it'd be smarter than such old stuff like msc6 when optimizing for size. What am I missing here?
main() is special here: gcc is doing some extra work to make the stack 16-byte aligned at the entry point of the program. So the size of the result aren't directly comparable... try renaming main() to f() and you'll see gcc generates drastically different code.
(The MSVC-compiled code doesn't need to care about alignment because Windows has different rules for stack alignment.)
This is the best reference I can get. I'm on Windows now and too lazy to login to my Linux to test. Here (MinGW GCC 4.5.2), the code is smaller than yours. One difference is the calling convention, stdcall of course has a few bytes advantage over cdecl (default on GCC if not specified or with -O1 and I guess with -Os, too) to clean up the stack.
Here's the way I compile and the result (source code is purely copy pasted from your post)
gcc -S test.c:
_main:
pushl %ebp #
movl %esp, %ebp #,
andl $-16, %esp #,
subl $16, %esp #,
call ___main #
movl $127, (%esp) #,
movl $134513152, %eax #, tmp59
call *%eax # tmp59
leave
ret
gcc -c -o test.o test.c && objdump -d test.o:
test.o: file format pe-i386
Disassembly of section .text:
00000000 <_main>:
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 83 e4 f0 and $0xfffffff0,%esp
6: 83 ec 10 sub $0x10,%esp
9: e8 00 00 00 00 call e <_main+0xe>
e: c7 04 24 7f 00 00 00 movl $0x7f,(%esp)
15: b8 00 82 04 08 mov $0x8048200,%eax
1a: ff d0 call *%eax
1c: c9 leave
1d: c3 ret
1e: 90 nop
1f: 90 nop

Resources