Prevent __start entry point from being optimized out - c

With the following CFLAGS:
-Wall -Werror -Wextra -pedantic -std=c99 -O3 -nostartfiles -nodefaultlibs
my __start entry point (notice -nostartfiles) is successfully compiled and put into an output executable.
However, when I add the -flto flag, both the entry point and the functions called only by it are optimized out. Moreover, the subsequent linking is performed with neither an error nor a warning, but with an incorrect (random) entry point.
The question is how to prevent the __start function from being optimized out. I'm also curious why the linker “forgets” about the external dependency on my entry point when the default one is absent.
My GCC version is gcc (i686-posix-dwarf-rev1, Built by MinGW-W64 project) 4.9.2.
UPD:
Source code (fixed with the help of @FUZxxl, who wrote about prepended underscores in the Windows ABI):
#include <windows.h>
/* Custom program entry point (the build uses -nostartfiles, so no CRT startup
   code is linked in): shows a message box and then terminates the process. */
void _start()
{
/* Message box with the text "Hello world.", an empty caption and an OK button. */
MessageBox(NULL, TEXT("Hello world."), TEXT(""), MB_OK);
/* Terminate the process explicitly with exit code 0. */
ExitProcess(0);
}
Assembly output emitted by a linker (-S):
Non--flto version:
Disassembly of section .text:
00401000 <__start>:
401000: 83 ec 1c sub $0x1c,%esp
401003: c7 44 24 0c 00 00 00 movl $0x0,0xc(%esp)
40100a: 00
40100b: c7 44 24 08 00 20 40 movl $0x402000,0x8(%esp)
401012: 00
401013: c7 44 24 04 0d 20 40 movl $0x40200d,0x4(%esp)
40101a: 00
40101b: c7 04 24 00 00 00 00 movl $0x0,(%esp)
401022: ff 15 54 40 40 00 call *0x404054
401028: 83 ec 10 sub $0x10,%esp
40102b: c7 04 24 00 00 00 00 movl $0x0,(%esp)
401032: ff 15 4c 40 40 00 call *0x40404c
401038: 90 nop
401039: 90 nop
40103a: 90 nop
40103b: 90 nop
40103c: 90 nop
40103d: 90 nop
40103e: 90 nop
40103f: 90 nop
00401040 <__CTOR_LIST__>:
401040: ff (bad)
401041: ff (bad)
401042: ff (bad)
401043: ff 00 incl (%eax)
401045: 00 00 add %al,(%eax)
...
00401048 <__DTOR_LIST__>:
401048: ff (bad)
401049: ff (bad)
40104a: ff (bad)
40104b: ff 00 incl (%eax)
40104d: 00 00 add %al,(%eax)
-flto version (notice the lack of _start here, just a bunch of thunks for API entries):
Disassembly of section .text:
00401000 <_ExitProcess@4>:
401000: ff 25 4c 30 40 00 jmp *0x40304c
401006: 90 nop
401007: 90 nop
00401008 <_MessageBoxA@16>:
401008: ff 25 54 30 40 00 jmp *0x403054
40100e: 90 nop
40100f: 90 nop
00401010 <__CTOR_LIST__>:
401010: ff (bad)
401011: ff (bad)
401012: ff (bad)
401013: ff 00 incl (%eax)
401015: 00 00 add %al,(%eax)
...
00401018 <__DTOR_LIST__>:
401018: ff (bad)
401019: ff (bad)
40101a: ff (bad)
40101b: ff 00 incl (%eax)
40101d: 00 00 add %al,(%eax)

With all the exotic/embedded-related options you've set, you have to ensure that your symbol is seen as your entrypoint and not garbage collected by linker optimizations (--gc-sections also does that: collecting "useless" sections)
You can end up with a fully empty .elf file since no section is reachable.
To tell the linker that you are using that symbol as an entrypoint (and to keep the linker from eliding it!), just add
-Wl,-e__start
option to your link command (or write a linker spec file where you declare your symbol, but the command line option is easier)

Related

How to prevent gcc from reordering x86 frame pointer saving/setup instructions?

During my profiling with flamegraph, I found that callstacks are sometimes broken even when all codebases are compiled with the -fno-omit-frame-pointer flag. By checking the binary generated by gcc, I noticed gcc may reorder x86 frame pointer saving/setup instructions (i.e., push %rbp; mov %rsp, %rbp), sometimes even after ret instructions of some branches. As shown in the example below, push %rbp; mov %rsp, %rbp are put at the bottom of the function. It leads to incomplete and misleading callstacks when perf happens to sample instructions in the function before frame pointers are properly set.
C code:
/*
 * Look up the socket stored in descriptor slot fd, take its sp_lock, and
 * hand the socket back through *ps.
 *
 * Returns 0 with *ps set to the socket, or -1 with errno = EBADF when
 * fd >= MAXSOCK or the slot's type is not FH_SOCKET. On success the lock is
 * still held on return; nothing in this function releases it.
 */
int flextcp_fd_slookup(int fd, struct socket **ps)
{
struct socket *s;
/* NOTE(review): only the upper bound of fd is checked here; a negative fd
   would index fhs[] out of range -- confirm callers never pass one. */
if (fd >= MAXSOCK || fhs[fd].type != FH_SOCKET) {
errno = EBADF;
return -1;
}
uint32_t lock_val = 1;
s = fhs[fd].data.s;
/*
 * Spin until the lock is obtained:
 *   1: swap lock_val (initially 1) with s->sp_lock; a previous value of 0
 *      means the lock was free, so jump to 3.
 *   2: otherwise pause and re-read s->sp_lock until it is 0, then retry
 *      the swap at 1.
 * NOTE(review): [locked] is declared write-only ("=m") although the asm
 * also reads it; "+m" may be intended -- confirm.
 */
asm volatile (
"1:\n"
"xchg %[locked], %[lv]\n"
"test %[lv], %[lv]\n"
"jz 3f\n"
"2:\n"
"pause\n"
"cmpl $0, %[locked]\n"
"jnz 2b\n"
"jmp 1b\n"
"3:\n"
: [locked] "=m" (s->sp_lock), [lv] "=q" (lock_val)
: "[lv]" (lock_val)
: "memory");
*ps = s;
return 0;
}
CMake Debug Profile:
0000000000007c73 <flextcp_fd_slookup>:
7c73: f3 0f 1e fa endbr64
7c77: 55 push %rbp
7c78: 48 89 e5 mov %rsp,%rbp
7c7b: 48 83 ec 20 sub $0x20,%rsp
7c7f: 89 7d ec mov %edi,-0x14(%rbp)
7c82: 48 89 75 e0 mov %rsi,-0x20(%rbp)
7c86: 81 7d ec ff ff 0f 00 cmpl $0xfffff,-0x14(%rbp)
7c8d: 7f 1b jg 7caa <flextcp_fd_slookup+0x37>
7c8f: 8b 45 ec mov -0x14(%rbp),%eax
7c92: 48 98 cltq
7c94: 48 c1 e0 04 shl $0x4,%rax
7c98: 48 89 c2 mov %rax,%rdx
7c9b: 48 8d 05 86 86 00 00 lea 0x8686(%rip),%rax # 10328 <fhs+0x8>
7ca2: 0f b6 04 02 movzbl (%rdx,%rax,1),%eax
7ca6: 3c 01 cmp $0x1,%al
7ca8: 74 12 je 7cbc <flextcp_fd_slookup+0x49>
7caa: e8 31 b9 ff ff callq 35e0 <__errno_location@plt>
7caf: c7 00 09 00 00 00 movl $0x9,(%rax)
7cb5: b8 ff ff ff ff mov $0xffffffff,%eax
7cba: eb 53 jmp 7d0f <flextcp_fd_slookup+0x9c>
7cbc: c7 45 f4 01 00 00 00 movl $0x1,-0xc(%rbp)
7cc3: 8b 45 ec mov -0x14(%rbp),%eax
7cc6: 48 98 cltq
7cc8: 48 c1 e0 04 shl $0x4,%rax
7ccc: 48 89 c2 mov %rax,%rdx
7ccf: 48 8d 05 4a 86 00 00 lea 0x864a(%rip),%rax # 10320 <fhs>
7cd6: 48 8b 04 02 mov (%rdx,%rax,1),%rax
7cda: 48 89 45 f8 mov %rax,-0x8(%rbp)
7cde: 48 8b 55 f8 mov -0x8(%rbp),%rdx
7ce2: 8b 45 f4 mov -0xc(%rbp),%eax
7ce5: 87 82 c0 00 00 00 xchg %eax,0xc0(%rdx)
7ceb: 85 c0 test %eax,%eax
7ced: 74 0d je 7cfc <flextcp_fd_slookup+0x89>
7cef: f3 90 pause
7cf1: 83 ba c0 00 00 00 00 cmpl $0x0,0xc0(%rdx)
7cf8: 75 f5 jne 7cef <flextcp_fd_slookup+0x7c>
7cfa: eb e9 jmp 7ce5 <flextcp_fd_slookup+0x72>
7cfc: 89 45 f4 mov %eax,-0xc(%rbp)
7cff: 48 8b 45 e0 mov -0x20(%rbp),%rax
7d03: 48 8b 55 f8 mov -0x8(%rbp),%rdx
7d07: 48 89 10 mov %rdx,(%rax)
7d0a: b8 00 00 00 00 mov $0x0,%eax
7d0f: c9 leaveq
7d10: c3 retq
CMake Release Profile:
0000000000007d80 <flextcp_fd_slookup>:
7d80: f3 0f 1e fa endbr64
7d84: 81 ff ff ff 0f 00 cmp $0xfffff,%edi
7d8a: 7f 44 jg 7dd0 <flextcp_fd_slookup+0x50>
7d8c: 48 63 ff movslq %edi,%rdi
7d8f: 48 8d 05 6a 85 00 00 lea 0x856a(%rip),%rax # 10300 <fhs>
7d96: 48 c1 e7 04 shl $0x4,%rdi
7d9a: 48 01 c7 add %rax,%rdi
7d9d: 80 7f 08 01 cmpb $0x1,0x8(%rdi)
7da1: 75 2d jne 7dd0 <flextcp_fd_slookup+0x50>
7da3: 48 8b 17 mov (%rdi),%rdx
7da6: b8 01 00 00 00 mov $0x1,%eax
7dab: 87 82 c0 00 00 00 xchg %eax,0xc0(%rdx)
7db1: 85 c0 test %eax,%eax
7db3: 74 0d je 7dc2 <flextcp_fd_slookup+0x42>
7db5: f3 90 pause
7db7: 83 ba c0 00 00 00 00 cmpl $0x0,0xc0(%rdx)
7dbe: 75 f5 jne 7db5 <flextcp_fd_slookup+0x35>
7dc0: eb e9 jmp 7dab <flextcp_fd_slookup+0x2b>
7dc2: 31 c0 xor %eax,%eax
7dc4: 48 89 16 mov %rdx,(%rsi)
7dc7: c3 retq
7dc8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
7dcf: 00
7dd0: 55 push %rbp
7dd1: 48 89 e5 mov %rsp,%rbp
7dd4: e8 b7 b7 ff ff callq 3590 <__errno_location@plt>
7dd9: c7 00 09 00 00 00 movl $0x9,(%rax)
7ddf: b8 ff ff ff ff mov $0xffffffff,%eax
7de4: 5d pop %rbp
7de5: c3 retq
7de6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
7ded: 00 00 00
Is there any way to prevent gcc from reordering these two instructions?
Edit: I use the default toolchain (gcc-11.2.0 + glibc 2.35) on Ubuntu 22.04. Sorry that a reproducible example is not available.
Edit: Add source code of the example function.
Try -fno-shrink-wrap
This looks like "shrink-wrap" optimization: only doing the function prologue in a code path where it's needed. The usual benefit is to run an early-out check before the prologue, not saving/restoring a bunch of registers on that path through the function.
But here, GCC decided to only do the prologue (setting up a frame pointer) if it had to call another function. That function is __errno_location in the error-return path. Oops. :P (And GCC correctly realized that's the uncommon case, and put it out-of-line after the ret through the fast path. So the fast path can be a straight line with no taken branches, other than inside your asm(). It's not a separate function, it's just tail-duplication of the one you showed source for.)
The main path through the function is very tiny, just a few C assignment statements and an asm() statement. GCC doesn't have a clear idea of how big an asm block is (although I think it has some heuristics, it is still rather willing to inline one). And it has no idea if there might be loops or any significant time spent in an asm block.
This is a known issue: GCC bug #98018 suggested that GCC should have an option to force frame-pointer setup at the actual top of a function, because there currently isn't an option that's 100% reliable, other than disabling optimization, which is not usable. (Thanks to @Margaret Bloom for finding & linking this.)
As comment 6 on that GCC bug mentions, disabling shrink-wrapping is part of what's necessary to make sure GCC sets up the frame pointer at the top of the function itself, not just inside some if that needs the prologue.
That GCC issue seems to be considering a feature that would stop function inlining, so backtraces would fully reflect the C abstract machine's nesting of function calls. That goes beyond what you're looking for, which I think is just to have frame pointers set up on entry to functions that exist in the asm after optimization.
Disabling shrink-wrapping will force the whole prologue to happen there, including push of other regs, if there were any. Not just the frame pointer.
But here there aren't any others. Still, with optimization enabled in general, losing shrink-wrapping is probably pretty minor.

I compile a nearly empty .c file. What's in its .text section? [duplicate]

This question already has answers here:
GCC: Empty program == 23202 bytes?
(10 answers)
Closed 1 year ago.
I write a nothing.c, which is just one line as follows
int main(){} /* intentionally empty: main contains no statements */
Then I compile it using command gcc nothing.c -o nothing
Here's what I get using command readelf -x .text nothing
Hex dump of section '.text':
0x00001040 f30f1efa 31ed4989 d15e4889 e24883e4 ....1.I..^H..H..
0x00001050 f050544c 8d055601 0000488d 0ddf0000 .PTL..V...H.....
0x00001060 00488d3d c1000000 ff15722f 0000f490 .H.=......r/....
0x00001070 488d3d99 2f000048 8d05922f 00004839 H.=./..H.../..H9
0x00001080 f8741548 8b054e2f 00004885 c07409ff .t.H..N/..H..t..
0x00001090 e00f1f80 00000000 c30f1f80 00000000 ................
0x000010a0 488d3d69 2f000048 8d35622f 00004829 H.=i/..H.5b/..H)
0x000010b0 fe4889f0 48c1ee3f 48c1f803 4801c648 .H..H..?H...H..H
0x000010c0 d1fe7414 488b0525 2f000048 85c07408 ..t.H..%/..H..t.
0x000010d0 ffe0660f 1f440000 c30f1f80 00000000 ..f..D..........
0x000010e0 f30f1efa 803d252f 00000075 2b554883 .....=%/...u+UH.
0x000010f0 3d022f00 00004889 e5740c48 8b3d062f =./...H..t.H.=./
0x00001100 0000e829 ffffffe8 64ffffff c605fd2e ...)....d.......
0x00001110 0000015d c30f1f00 c30f1f80 00000000 ...]............
0x00001120 f30f1efa e977ffff fff30f1e fa554889 .....w.......UH.
0x00001130 e5b80000 00005dc3 0f1f8400 00000000 ......].........
0x00001140 f30f1efa 41574c8d 3da32c00 00415649 ....AWL.=.,..AVI
0x00001150 89d64155 4989f541 544189fc 55488d2d ..AUI..ATA..UH.-
0x00001160 942c0000 534c29fd 4883ec08 e88ffeff .,..SL).H.......
0x00001170 ff48c1fd 03741f31 db0f1f80 00000000 .H...t.1........
0x00001180 4c89f24c 89ee4489 e741ff14 df4883c3 L..L..D..A...H..
0x00001190 014839dd 75ea4883 c4085b5d 415c415d .H9.u.H...[]A\A]
0x000011a0 415e415f c366662e 0f1f8400 00000000 A^A_.ff.........
0x000011b0 f30f1efa c3 .....
So what does it do?
So what does it do?
You can see what it does:
objdump -d nothing
Disassembly of section .text:
0000000000001040 <_start>:
1040: 31 ed xor %ebp,%ebp
1042: 49 89 d1 mov %rdx,%r9
1045: 5e pop %rsi
1046: 48 89 e2 mov %rsp,%rdx
1049: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
104d: 50 push %rax
104e: 54 push %rsp
104f: 4c 8d 05 3a 01 00 00 lea 0x13a(%rip),%r8 # 1190 <__libc_csu_fini>
1056: 48 8d 0d d3 00 00 00 lea 0xd3(%rip),%rcx # 1130 <__libc_csu_init>
105d: 48 8d 3d c1 00 00 00 lea 0xc1(%rip),%rdi # 1125 <main>
1064: ff 15 76 2f 00 00 call *0x2f76(%rip) # 3fe0 <__libc_start_main@GLIBC_2.2.5>
106a: f4 hlt
106b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
0000000000001070 <deregister_tm_clones>:
1070: 48 8d 3d b1 2f 00 00 lea 0x2fb1(%rip),%rdi # 4028 <__TMC_END__>
1077: 48 8d 05 aa 2f 00 00 lea 0x2faa(%rip),%rax # 4028 <__TMC_END__>
... etc.
The compiler injects info when you are compiling the source code. This highly depends on the operating system and the compiler you are using.
For example, on a macOS, the compiler injects the so-called 'unwind info' which does something with unwinding the stack when there is an exception.
To find out what the compiler injects into your .text section besides the empty main, you should generate a .map file, in which you will see clearly what's going on. The next question will be why the compiler injects this extra section.
To generate a .map file with GNU ld, use the following command (the macOS linker spells the option -map instead of -Map):
gcc -Wl,-Map,nothing.map nothing.c -o nothing

Why did ld turn my 5 lines of library-less C into a 100MB binary?

I'm trying to develop some very low-level x86 code following this document. I wrote the following C program:
/* Stores the character 'X' into the byte at address 0xb8000
   (presumably the start of the VGA text-mode buffer, per the variable name
   -- confirm against the referenced document). */
void main()
{
char* video_memory = (char*) 0xb8000;
*video_memory = 'X';
}
I compile and link it like so:
gcc -m32 -fno-pie -c main.c -o main.o
ld -m elf_i386 -o main.bin -Ttext 513 --oformat binary main.o
This produces a binary called main.bin which is over a hundred megabytes. I disassembled that binary and it's basically my code (ten or so lines), then a hundred meg of zeros, and then some kind of footer.
The extra bytes are all unnecessary, because I used head to snip off the ones that weren't my code and it still ran fine.
I'm using 32-bit flags because my test machine is an old 32-bit laptop, but you can get similar (but less extreme) behavior in 64-bit. This script:
gcc -fno-pie -c main.c -o main.o
ld -o main.bin -Ttext 513 --oformat binary main.o
produces a main.bin of over 4 MB. Again the pattern is the same: my code, 4 meg of zeros, and then a footer. A little bit of noise in between my code and the zeros. Here's the disassembled 4MB file:
0: f3 0f 1e fa endbr64
4: 55 push %ebp
5: 48 dec %eax
6: 89 e5 mov %esp,%ebp
8: 48 dec %eax
9: c7 45 f8 00 80 0b 00 movl $0xb8000,-0x8(%ebp)
10: 48 dec %eax
11: 8b 45 f8 mov -0x8(%ebp),%eax
14: c6 00 58 movb $0x58,(%eax)
17: 90 nop
18: 5d pop %ebp
19: c3 ret
...
aea: 00 00 add %al,(%eax)
aec: 00 14 00 add %dl,(%eax,%eax,1)
aef: 00 00 add %al,(%eax)
af1: 00 00 add %al,(%eax)
af3: 00 00 add %al,(%eax)
af5: 01 7a 52 add %edi,0x52(%edx)
af8: 00 01 add %al,(%ecx)
afa: 78 10 js 0xb0c
afc: 01 1b add %ebx,(%ebx)
afe: 0c 07 or $0x7,%al
b00: 08 90 01 00 00 1c or %dl,0x1c000001(%eax)
b06: 00 00 add %al,(%eax)
b08: 00 1c 00 add %bl,(%eax,%eax,1)
b0b: 00 00 add %al,(%eax)
b0d: f3 f4 repz hlt
b0f: ff (bad)
b10: ff 1a lcall *(%edx)
b12: 00 00 add %al,(%eax)
b14: 00 00 add %al,(%eax)
b16: 45 inc %ebp
b17: 0e push %cs
b18: 10 86 02 43 0d 06 adc %al,0x60d4302(%esi)
b1e: 51 push %ecx
b1f: 0c 07 or $0x7,%al
b21: 08 00 or %al,(%eax)
...
3ffaeb: 00 00 add %al,(%eax)
3ffaed: 04 00 add $0x0,%al
3ffaef: 00 00 add %al,(%eax)
3ffaf1: 10 00 adc %al,(%eax)
3ffaf3: 00 00 add %al,(%eax)
3ffaf5: 05 00 00 00 47 add $0x47000000,%eax
3ffafa: 4e dec %esi
3ffafb: 55 push %ebp
3ffafc: 00 02 add %al,(%edx)
3ffafe: 00 00 add %al,(%eax)
3ffb00: c0 04 00 00 rolb $0x0,(%eax,%eax,1)
3ffb04: 00 03 add %al,(%ebx)
3ffb06: 00 00 add %al,(%eax)
3ffb08: 00 00 add %al,(%eax)
3ffb0a: 00 00 add %al,(%eax)
...
The giant binary file works, but it's ugly and I'd like to understand what's going on.
I'm doing the compilation/linking on Ubuntu 20.04 on a 64-bit machine. Tool versions:
gcc version 9.3.0 (Ubuntu 9.3.0-10ubuntu2)
GNU ld (GNU Binutils for Ubuntu) 2.34

gcc generates unnecessary (?) instructions

I decided to compile a very basic C program and take a look at the generated code with objdump -d.
/* Terminates immediately by calling exit(0); argc and argv are not used. */
int main(int argc, char *argv[]) {
/* NOTE(review): no #include <stdlib.h> is shown for exit() -- confirm the
   full file declares it. */
exit(0);
}
After compiling it with gcc test.c -s -o test.o and then disassembling with objdump -d my text segment looked like this:
Disassembly of section .text:
0000000000001050 <.text>:
1050: 31 ed xor %ebp,%ebp
1052: 49 89 d1 mov %rdx,%r9
1055: 5e pop %rsi
1056: 48 89 e2 mov %rsp,%rdx
1059: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
105d: 50 push %rax
105e: 54 push %rsp
105f: 4c 8d 05 4a 01 00 00 lea 0x14a(%rip),%r8 # 11b0 <__cxa_finalize#plt+0x170>
1066: 48 8d 0d e3 00 00 00 lea 0xe3(%rip),%rcx # 1150 <__cxa_finalize#plt+0x110>
106d: 48 8d 3d c1 00 00 00 lea 0xc1(%rip),%rdi # 1135 <__cxa_finalize#plt+0xf5>
1074: ff 15 66 2f 00 00 callq *0x2f66(%rip) # 3fe0 <__cxa_finalize#plt+0x2fa0>
107a: f4 hlt
107b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
1080: 48 8d 3d a9 2f 00 00 lea 0x2fa9(%rip),%rdi # 4030 <__cxa_finalize#plt+0x2ff0>
1087: 48 8d 05 a2 2f 00 00 lea 0x2fa2(%rip),%rax # 4030 <__cxa_finalize#plt+0x2ff0>
108e: 48 39 f8 cmp %rdi,%rax
1091: 74 15 je 10a8 <__cxa_finalize#plt+0x68>
1093: 48 8b 05 3e 2f 00 00 mov 0x2f3e(%rip),%rax # 3fd8 <__cxa_finalize#plt+0x2f98>
109a: 48 85 c0 test %rax,%rax
109d: 74 09 je 10a8 <__cxa_finalize#plt+0x68>
109f: ff e0 jmpq *%rax
10a1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10a8: c3 retq
10a9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10b0: 48 8d 3d 79 2f 00 00 lea 0x2f79(%rip),%rdi # 4030 <__cxa_finalize#plt+0x2ff0>
10b7: 48 8d 35 72 2f 00 00 lea 0x2f72(%rip),%rsi # 4030 <__cxa_finalize#plt+0x2ff0>
10be: 48 29 fe sub %rdi,%rsi
10c1: 48 c1 fe 03 sar $0x3,%rsi
10c5: 48 89 f0 mov %rsi,%rax
10c8: 48 c1 e8 3f shr $0x3f,%rax
10cc: 48 01 c6 add %rax,%rsi
10cf: 48 d1 fe sar %rsi
10d2: 74 14 je 10e8 <__cxa_finalize#plt+0xa8>
10d4: 48 8b 05 15 2f 00 00 mov 0x2f15(%rip),%rax # 3ff0 <__cxa_finalize#plt+0x2fb0>
10db: 48 85 c0 test %rax,%rax
10de: 74 08 je 10e8 <__cxa_finalize#plt+0xa8>
10e0: ff e0 jmpq *%rax
10e2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
10e8: c3 retq
10e9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10f0: 80 3d 39 2f 00 00 00 cmpb $0x0,0x2f39(%rip) # 4030 <__cxa_finalize#plt+0x2ff0>
10f7: 75 2f jne 1128 <__cxa_finalize#plt+0xe8>
10f9: 55 push %rbp
10fa: 48 83 3d f6 2e 00 00 cmpq $0x0,0x2ef6(%rip) # 3ff8 <__cxa_finalize#plt+0x2fb8>
1101: 00
1102: 48 89 e5 mov %rsp,%rbp
1105: 74 0c je 1113 <__cxa_finalize#plt+0xd3>
1107: 48 8b 3d 1a 2f 00 00 mov 0x2f1a(%rip),%rdi # 4028 <__cxa_finalize#plt+0x2fe8>
110e: e8 2d ff ff ff callq 1040 <__cxa_finalize#plt>
1113: e8 68 ff ff ff callq 1080 <__cxa_finalize#plt+0x40>
1118: c6 05 11 2f 00 00 01 movb $0x1,0x2f11(%rip) # 4030 <__cxa_finalize#plt+0x2ff0>
111f: 5d pop %rbp
1120: c3 retq
1121: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
1128: c3 retq
1129: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
1130: e9 7b ff ff ff jmpq 10b0 <__cxa_finalize#plt+0x70>
1135: 55 push %rbp
1136: 48 89 e5 mov %rsp,%rbp
1139: 48 83 ec 10 sub $0x10,%rsp
113d: 89 7d fc mov %edi,-0x4(%rbp)
1140: 48 89 75 f0 mov %rsi,-0x10(%rbp)
1144: bf 00 00 00 00 mov $0x0,%edi
1149: e8 e2 fe ff ff callq 1030 <exit#plt>
114e: 66 90 xchg %ax,%ax
1150: 41 57 push %r15
1152: 4c 8d 3d 8f 2c 00 00 lea 0x2c8f(%rip),%r15 # 3de8 <__cxa_finalize#plt+0x2da8>
1159: 41 56 push %r14
115b: 49 89 d6 mov %rdx,%r14
115e: 41 55 push %r13
1160: 49 89 f5 mov %rsi,%r13
1163: 41 54 push %r12
1165: 41 89 fc mov %edi,%r12d
1168: 55 push %rbp
1169: 48 8d 2d 80 2c 00 00 lea 0x2c80(%rip),%rbp # 3df0 <__cxa_finalize#plt+0x2db0>
1170: 53 push %rbx
1171: 4c 29 fd sub %r15,%rbp
1174: 48 83 ec 08 sub $0x8,%rsp
1178: e8 83 fe ff ff callq 1000 <exit#plt-0x30>
117d: 48 c1 fd 03 sar $0x3,%rbp
1181: 74 1b je 119e <__cxa_finalize#plt+0x15e>
1183: 31 db xor %ebx,%ebx
1185: 0f 1f 00 nopl (%rax)
1188: 4c 89 f2 mov %r14,%rdx
118b: 4c 89 ee mov %r13,%rsi
118e: 44 89 e7 mov %r12d,%edi
1191: 41 ff 14 df callq *(%r15,%rbx,8)
1195: 48 83 c3 01 add $0x1,%rbx
1199: 48 39 dd cmp %rbx,%rbp
119c: 75 ea jne 1188 <__cxa_finalize#plt+0x148>
119e: 48 83 c4 08 add $0x8,%rsp
11a2: 5b pop %rbx
11a3: 5d pop %rbp
11a4: 41 5c pop %r12
11a6: 41 5d pop %r13
11a8: 41 5e pop %r14
11aa: 41 5f pop %r15
11ac: c3 retq
11ad: 0f 1f 00 nopl (%rax)
11b0: c3 retq
As you can see, the part that was actually written by me occupies very little space.
The same program (if we ignore the fact that the main function is also treated as a function in C) in Assembly:
# Minimal entry point: issues system call number 60 with a first argument of 0
# (the exit(0) equivalent of the C program above, done directly via syscall).
.global _start
.text
_start: mov $60, %rax # system call number goes in rax
xor %rdi, %rdi # first argument = 0
syscall
Assembled, linked and disassembled with gcc -c demo.s && ld demo.o -o demo && objdump -d demo:
Disassembly of section .text:
0000000000401000 <_start>:
401000: 48 c7 c0 3c 00 00 00 mov $0x3c,%rax
401007: 48 31 ff xor %rdi,%rdi
40100a: 0f 05 syscall
The question is: what purpose do all these instructions serve and is there a way to generate code without them?
While I was writing the question I noticed that the C program calls exit() from the linked library whereas in Assembly I do it directly with a syscall. I don't think it is important in this case though.
gcc generates unnecessary (?) instructions
Yes, because you invoked GCC without asking for any compiler optimizations.
My recommendation: compile with
gcc -fverbose-asm -O2 -S test.c
then look inside the generated test.s assembler code.
BTW, most of the code is from crt0, which is given by, not emitted by, gcc. Build your executable with gcc -O2 -v test.c -o testprog to understand what GCC really does. Read documentation of GCC internals.
Since GCC is free software, you are allowed to look inside its source code and improve it. But the crt0 stuff is tricky, and operating system specific.
Consider also reading about linkers and loaders, about ELF executables, and How to write shared libraries, and the Linux Assembler HowTo.
gcc -s strips symbol names out of the final executable so you can't tell where different parts of the machine code came from.
Most of it is not from your main. To just see that, look at gcc -S output (asm source), e.g. on https://godbolt.org/. How to remove "noise" from GCC/clang assembly output?
Most of that is the CRT (C RunTime) startup code that eventually calls your main after initializing the standard library. (e.g. allocating memory for stdio buffers and so on.) It gets linked in regardless of how efficient your main is. e.g. compiling an empty int main(void){} with gcc -Os (optimize for size) will barely make it any smaller.
You could in theory compile with gcc -nostdlib and write your own _start that uses inline asm to make an exit system call.
See also
A Whirlwind Tutorial on Creating Really Teensy ELF Executables for Linux
How Get arguments value using inline assembly in C without Glibc? (getting command line args complicates the exercise of writing your own _start, but the answers there show how).
A C program does a lot of stuff before calling the main function. It has to initialize the .data and .bss segments, set up the stack, go through the constructors and destructors (yes, gcc has special attributes for such functions in C), and initialize the library.
gcc destructor and constructor functions:
/* Declares funcname as a constructor: called before main. */
void __attribute__ ((constructor)) funcname(void);
/* Declares funcname as a destructor: called on exit from the program. */
void __attribute__ ((destructor)) funcname(void);
You may have as many constructors and destructors as you wish.
Constructors are called before the call to the main function; destructors are called on exit from the program (after main terminates).
https://gcc.gnu.org/onlinedocs/gcc-4.7.0/gcc/Function-Attributes.html#Function-Attributes

How to correct relative addressing after memory copy function to user space?

Hi I'm trying to jump from kernel level to user level using "sysexit", and prepared the following user function:
/* Prints a status message through color_printk and then loops forever;
   this function never returns. */
void user_level_function(void)
{
color_printk(RED, BLACK, "user_level_function task is running\n");
/* Busy-wait in place. */
while(1);
}
However, GCC compiled it as follows:
ffff80000010322f <user_level_function>:
ffff80000010322f: 55 push %rbp
ffff800000103230: 48 89 e5 mov %rsp,%rbp
ffff800000103233: 41 57 push %r15
ffff800000103235: 48 83 ec 08 sub $0x8,%rsp
ffff800000103239: 48 8d 0d f9 ff ff ff lea -0x7(%rip),%rcx # ffff800000103239 <user_level_function+0xa>
ffff800000103240: 49 bb af d9 00 00 00 movabs $0xd9af,%r11
ffff800000103247: 00 00 00
ffff80000010324a: 4c 01 d9 add %r11,%rcx
ffff80000010324d: 48 b8 a0 01 00 00 00 movabs $0x1a0,%rax
ffff800000103254: 00 00 00
ffff800000103257: 48 8d 14 01 lea (%rcx,%rax,1),%rdx
ffff80000010325b: be 00 00 00 00 mov $0x0,%esi
ffff800000103260: bf 00 00 ff 00 mov $0xff0000,%edi
ffff800000103265: 49 89 cf mov %rcx,%r15
ffff800000103268: b8 00 00 00 00 mov $0x0,%eax
ffff80000010326d: 49 b8 ca 4a ff ff ff movabs $0xffffffffffff4aca,%r8
ffff800000103274: ff ff ff
ffff800000103277: 49 01 c8 add %rcx,%r8
ffff80000010327a: 41 ff d0 callq *%r8
ffff80000010327d: eb fe jmp ffff80000010327d <user_level_function+0x4e>
The assembly code above shows that the function addresses its data and its call target relative to its current (compiled) address.
Therefore, after I memory-copy the function to user space (0x800000) and execute it, the addressing breaks because of the relative addressing.
/*
 * Copies 1024 bytes between user_level_function and address 0x800000, then
 * executes wrmsr followed by sysexit with RDX = 0x800000 and RCX = 0xa00000.
 * Returns 0 if execution reaches the end of the function.
 */
unsigned long do_execve(void)
{
color_printk(RED,BLACK,"do_execve task is running\n");
/* NOTE(review): the arguments read as (source, destination, length), the
   reverse of the standard memcpy(dest, src, n) order -- presumably a
   project-specific memcpy; confirm against its declaration. */
memcpy(user_level_function,(void *)0x800000,1024);
PUSH_ALL;
/* Inputs: EDX:EAX = high/low 32 bits of KERNEL_CS, ECX = 0x174 (the MSR index
   consumed by wrmsr). The asm then loads RDX and RCX itself before sysexit;
   the 0x48 byte is emitted directly in front of the sysexit opcode.
   NOTE(review): RDX and RCX are overwritten inside the asm although they are
   declared as inputs only -- confirm this is acceptable here. */
__asm__ __volatile__ (
"wrmsr \n\t"
"movq $0x800000, %%rdx \n\t"
"movq $0xa00000, %%rcx \n\t"
".byte 0x48 \n\t"
"sysexit \n\t"
:
:"d"((unsigned long)KERNEL_CS >> 32),
"a"((unsigned long)KERNEL_CS & 0xffffffff),
"c"(0x174)
:"memory"
);
POP_ALL;
return 0;
}
Therefore I would like to ask if there is a way to solve it?
Thanks a lot!!
Perhaps the -fno-pic compiler option is what you want?

Resources