Injected 64 bit shellcode won't execute syscall - c

I am currently trying to inject code that will print helloworld into a vulnerable program. I have succeeded in injecting the code by storing it in one of the environment variables and overwriting the rip register to point to that code.
Here is my assembly and op codes:
0000000000000000 <_start>:
0: eb 17 jmp 19 <stack_setup>
0000000000000002 <execute>:
2: 48 31 c0 xor %rax,%rax
5: b0 01 mov $0x1,%al
7: 48 31 ff xor %rdi,%rdi
a: 48 ff c7 inc %rdi
d: 5e pop %rsi
e: b2 0f mov $0xf,%dl
10: 0f 05 syscall
12: b0 3c mov $0x3c,%al
14: 48 ff cf dec %rdi
17: 0f 05 syscall
0000000000000019 <stack_setup>:
19: e8 e4 ff ff ff callq 2 <execute>
1e: 48 rex.W
1f: 65 gs
20: 6c insb (%dx),%es:(%rdi)
21: 6c insb (%dx),%es:(%rdi)
22: 6f outsl %ds:(%rsi),(%dx)
23: 2c 20 sub $0x20,%al
25: 77 6f ja 96 <stack_setup+0x7d>
27: 72 6c jb 95 <stack_setup+0x7c>
29: 64 21 0a and %ecx,%fs:(%rdx)
2c: 0d .byte 0xd
The bytes after the callq at offset 19 are not instructions: they are the data emitted by the directive db "Hello, world!", 0x0a, 0x0d, which the disassembler has decoded as if they were code. After examining the execution of the instructions at offsets 0x2-0x10 with gdb, I have noticed that they properly prepare the arguments to print the string Hello, World. However, upon executing the syscall instruction, nothing happens. The same goes for the instructions at offsets 0x12-0x17.
I have also tried to compile and link the assembly code to execute it independently and it works properly. In addition to that, I have compiled the vulnerable program with the options
-fno-stack-protector -z execstack
So that I can run code located on the stack. I have also disabled address space layout randomization (randomize_va_space). I want to ask whether there is something else I need to do so that the code I injected executes successfully.

The reason might be that the write syscall returns an error code. It could be for many reasons, but maybe rdx is not all zeros, so you should xor %rdx, %rdx before loading 15 there (actually you should load 14 only, for "Hello, World!\n").
Anyway, the error from write would be returned in rax, and be a negative number (-errno), thus when you do
mov $0x3c,%al
dec %rdi
syscall
That will be an invalid syscall, and errno 38 for ENOSYS will be returned.
The exit I got to work anyway by doing
xor %rax, %rax
mov $0x3c,%al
syscall
I recommend that you run the program with strace and see from its output why exactly the write is failing.
Here's a version of shell code that works for me:
0000000000000000 <main>:
0: eb 1d jmp 1f <stack_setup>
0000000000000002 <execute>:
2: 48 31 c0 xor %rax,%rax
5: b0 01 mov $0x1,%al
7: 48 31 ff xor %rdi,%rdi
a: 48 ff c7 inc %rdi
d: 5e pop %rsi
e: 48 31 d2 xor %rdx,%rdx
11: b2 0e mov $0xe,%dl
13: 0f 05 syscall
15: 48 31 c0 xor %rax,%rax
18: b0 3c mov $0x3c,%al
1a: 48 ff cf dec %rdi
1d: 0f 05 syscall
000000000000001f <stack_setup>:
1f: e8 de ff ff ff callq 2 <execute>
24: 48 rex.W
25: 65 6c gs insb (%dx),%es:(%rdi)
27: 6c insb (%dx),%es:(%rdi)
28: 6f outsl %ds:(%rsi),(%dx)
29: 2c 20 sub $0x20,%al
2b: 57 push %rdi
2c: 6f outsl %ds:(%rsi),(%dx)
2d: 72 6c jb 9b <stack_setup+0x7c>
2f: 64 21 0a and %ecx,%fs:(%rdx)

Related

Creating a print function in C 32-bit protected mode

I've been trying to develop a small OS and managed to switch into protected mode in order to write C code instead of assembly. Since this means I can't use interrupt 10h anymore, I have to write chars to the video memory address. So I tried creating a new print function to easily print out whole strings instead of printing each char separately. That's where the problems came in: for some reason, while printing single chars with the printChar function works, this new print function doesn't work, no matter what I try.
Here's my C Code:
void print(char* message, int offset);
void printChar(char character, int offset);
/* Entry point of the test: draws 'M' through printChar at offset 2, then
 * "Test String" through print starting at offset 4, and never returns. */
void start() {
printChar('M', 2);
print("Test String", 4);
/* Spin forever once both writes are done. */
while (1) {
}
}
/* Writes the NUL-terminated string msg one character at a time via printChar.
 * Character i is passed the offset (i * 2) + offset, so consecutive characters
 * are two bytes apart. An empty string writes nothing. */
void print(char* msg, int offset) {
for (int i = 0; msg[i] != '\0'; i++)
{
printChar(msg[i], (i * 2) + offset);
}
}
/* Stores one character plus the byte 0x0f relative to address 0xB8000:
 * the character goes to vidmem + offset + 1 and 0x0f to vidmem + offset + 2.
 * NOTE(review): 0x0f is presumably the text attribute, and text-mode cells are
 * normally (character, attribute) pairs starting at an even offset, which would
 * make the +1/+2 here off by one -- this looks like the "weird offset" the
 * author's edit refers to; confirm before reusing.
 * NOTE(review): vidmem is not declared volatile, so an optimizing compiler may
 * be free to drop or reorder these stores. */
void printChar(char character, int offset) {
unsigned char* vidmem = (unsigned char*)0xB8000;
*(vidmem + offset + 1) = character;
*(vidmem + offset + 2) = 0x0f;
}
I then use these commands to convert my code to binary and put it onto the second sector of a floppy disk with sectedit.
gcc -c test.c
objcopy -O binary -j .text test.o test.bin
Also here's the assembly code generated, when using objdump -d test.o
0000000000000000 <start>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: 48 83 ec 20 sub $0x20,%rsp
8: ba 02 00 00 00 mov $0x2,%edx
d: b9 4d 00 00 00 mov $0x4d,%ecx
12: e8 73 00 00 00 call 8a <printChar>
17: ba 04 00 00 00 mov $0x4,%edx
1c: 48 8d 05 00 00 00 00 lea 0x0(%rip),%rax # 23 <start+0x23>
23: 48 89 c1 mov %rax,%rcx
26: e8 02 00 00 00 call 2d <print>
2b: eb fe jmp 2b <start+0x2b>
000000000000002d <print>:
2d: 55 push %rbp
2e: 48 89 e5 mov %rsp,%rbp
31: 48 83 ec 30 sub $0x30,%rsp
35: 48 89 4d 10 mov %rcx,0x10(%rbp)
39: 89 55 18 mov %edx,0x18(%rbp)
3c: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
43: eb 29 jmp 6e <print+0x41>
45: 8b 45 fc mov -0x4(%rbp),%eax
48: 8d 14 00 lea (%rax,%rax,1),%edx
4b: 8b 45 18 mov 0x18(%rbp),%eax
4e: 01 c2 add %eax,%edx
50: 8b 45 fc mov -0x4(%rbp),%eax
53: 48 63 c8 movslq %eax,%rcx
56: 48 8b 45 10 mov 0x10(%rbp),%rax
5a: 48 01 c8 add %rcx,%rax
5d: 0f b6 00 movzbl (%rax),%eax
60: 0f be c0 movsbl %al,%eax
63: 89 c1 mov %eax,%ecx
65: e8 20 00 00 00 call 8a <printChar>
6a: 83 45 fc 01 addl $0x1,-0x4(%rbp)
6e: 8b 45 fc mov -0x4(%rbp),%eax
71: 48 63 d0 movslq %eax,%rdx
74: 48 8b 45 10 mov 0x10(%rbp),%rax
78: 48 01 d0 add %rdx,%rax
7b: 0f b6 00 movzbl (%rax),%eax
7e: 84 c0 test %al,%al
80: 75 c3 jne 45 <print+0x18>
82: 90 nop
83: 90 nop
84: 48 83 c4 30 add $0x30,%rsp
88: 5d pop %rbp
89: c3 ret
000000000000008a <printChar>:
8a: 55 push %rbp
8b: 48 89 e5 mov %rsp,%rbp
8e: 48 83 ec 10 sub $0x10,%rsp
92: 89 c8 mov %ecx,%eax
94: 89 55 18 mov %edx,0x18(%rbp)
97: 88 45 10 mov %al,0x10(%rbp)
9a: 48 c7 45 f8 00 80 0b movq $0xb8000,-0x8(%rbp)
a1: 00
a2: 8b 45 18 mov 0x18(%rbp),%eax
a5: 48 98 cltq
a7: 48 8d 50 01 lea 0x1(%rax),%rdx
ab: 48 8b 45 f8 mov -0x8(%rbp),%rax
af: 48 01 c2 add %rax,%rdx
b2: 0f b6 45 10 movzbl 0x10(%rbp),%eax
b6: 88 02 mov %al,(%rdx)
b8: 8b 45 18 mov 0x18(%rbp),%eax
bb: 48 98 cltq
bd: 48 8d 50 02 lea 0x2(%rax),%rdx
c1: 48 8b 45 f8 mov -0x8(%rbp),%rax
c5: 48 01 d0 add %rdx,%rax
c8: c6 00 0f movb $0xf,(%rax)
cb: 90 nop
cc: 48 83 c4 10 add $0x10,%rsp
d0: 5d pop %rbp
d1: c3 ret
d2: 90 nop
d3: 90 nop
d4: 90 nop
d5: 90 nop
d6: 90 nop
d7: 90 nop
d8: 90 nop
d9: 90 nop
da: 90 nop
db: 90 nop
dc: 90 nop
dd: 90 nop
de: 90 nop
df: 90 nop
edit: The problem basically lay in me not doing this on a Linux distribution, and in not having properly set up everything I would have needed to do it on Windows. Huge thanks to MichaelPetch, who explained the problems to me. I've now switched to a Linux VM and, after slightly correcting the code, it works (as the comments pointed out, my offset was weird; I used that offset because it worked in the broken setup I had, but normally it shouldn't).

OS development, issues with floating point division resulting NaN

I apologize in advance if my question isn't formatted in the best way, I'm new to asking questions here.
I've been interested in learning about operating system development lately, and I have run into some strange issues regarding floating point division in C. Even something as simple as 4.0f / 2.0f gives me a NaN result. I suspect this may have something to do with the compiler, however I don't know how to verify that, I would very much appreciate help in fixing this, as I've been at this for a few hours and have made little to no progress with google searches.
Github for the project, if you'd like to build it: https://github.com/AsherBearce/ToyOperatingSystem
The relevant parts of my project are as follows:
kernel/kernelmain.c:
#include "screen.h"
/* Kernel entry point: enables the cursor, clears the screen, runs a small
 * floating-point division check and prints the outcome, then spins forever.
 * enableCursor, clearScreen and print are declared outside this snippet
 * (presumably in screen.h). */
void main(){
enableCursor(1, 14);
clearScreen();
/* The literals carry an f suffix, so they are float constants that are
 * converted to double when the variables are initialised. */
double a = 4.0f;
double b = 2.0f;
double c = a / b;
/* ans is not read again anywhere in this function. */
double ans = 2.0f;
//Division is the ONLY operation that isn't yielding the correct results, in fact c turns out to be NaN!
if (c == 2.0f){
/* The explicit \0 is in addition to the terminator the string literal already has. */
char string[] = "Hardcoded values were correct\n\0";
print(string);
}
/* Printed regardless of the comparison result above. */
char out[] = "End output\0";
print(out);
/* Never return from the kernel entry point. */
while (1){
}
}
boot/bootsector.asm
;-----------------------------------------------------------------------
; Boot sector: prints a greeting, loads the kernel from disk to
; KERNEL_OFFSET, switches the CPU into protected mode and jumps to the
; kernel. Assembled as a flat binary for load address 0x7c00.
; _printString, diskload, gdt_pointer and DATA_SEG are not defined here;
; presumably they come from the three %include files near the end.
;-----------------------------------------------------------------------
org 0x7c00
bits 16
mov ax, HELLO_MSG ;Print a simple hello message :D
call _printString
xor ax, ax
;Here, we'll load the kernel into RAM
call LoadKernel
;Enter protected mode
; This call does not come back: EnterProtMode ends in a far jump, not a ret.
call EnterProtMode
EnterProtMode:
cli ;Disable interrupts
lgdt [gdt_pointer] ;Load the GDT register with the start address of the GDT
mov eax, cr0
or al, 1 ;Set PE (protection enable) bit in CR0
mov cr0, eax
; Far jump with selector 08h; execution continues at Stage2 below.
jmp 08h:Stage2 ;Jump to stage 2
; LoadKernel: sets bx = KERNEL_OFFSET, dh = 16 and dl = [BOOT_DRIVE], then
; calls diskload and returns to the caller.
LoadKernel:
mov bx, KERNEL_OFFSET ;Load the kernel offset into bx
mov dh, 16 ;Load 16 sectors
; NOTE(review): BOOT_DRIVE is initialised to 0 below and never written in the
; visible code, so this always passes drive 0 -- confirm that is intended.
mov dl, [BOOT_DRIVE] ;The disk to read from
call diskload ;Load the kernel
ret
; Everything from here on is assembled as 32-bit code.
; NOTE(review): that includes the three %include files further down; if
; printUtils.asm / diskload.asm hold 16-bit routines they need their own
; "bits 16" (or must be included above this line) -- verify.
bits 32
KERNEL_OFFSET equ 0x1000
BOOT_DRIVE: db 0
; Stage2: first code reached after the far jump. Loads DATA_SEG into every
; data/stack segment register, places the stack at 0x90000, then jumps to
; the kernel at KERNEL_OFFSET.
Stage2:
mov ax, DATA_SEG
mov ds, ax
mov es, ax
mov fs, ax
mov gs, ax
mov ss, ax
mov ebp, 0x90000
mov esp, ebp
;Kernel entry here
jmp KERNEL_OFFSET ;Jump (not call) to the kernel; nothing returns here
%include 'boot/printUtils.asm'
%include 'boot/gdt.asm'
%include 'boot/diskload.asm'
HELLO_MSG: db "Booted successfully, loading kernel.", 0
; Pad with zeros up to byte 510, then emit the two-byte 0xaa55 boot signature
; so the sector is exactly 512 bytes.
times 510 - ($ - $$) db 0
dw 0xaa55
Makefile
# Names of the two images this Makefile produces
BOOTOUTPUT = boot.bin
OSOUTPUT = os.bin
# Every C source and header found below the current directory; each .c file
# maps to a .o file next to it
SRCS = $(shell find . -name '*.c')
CINC = $(shell find . -name '*.h')
COBJS = $(patsubst %.c, %.o, $(SRCS))
OBJDIR = build
#Final step in the build process: boot sector first, kernel image right after it
$(OSOUTPUT): kernel.bin $(BOOTOUTPUT)
	cat $(BOOTOUTPUT) kernel.bin > $(OSOUTPUT)
#Assemble the boot sector code
$(BOOTOUTPUT): boot/bootsector.asm
	nasm -f bin boot/bootsector.asm -o $(BOOTOUTPUT)
#Compile all the kernel C files ($< is the .c source, $@ the .o being built);
#any header change rebuilds every object
%.o:%.c $(CINC)
	gcc -m32 -ffreestanding -fno-pie -fno-stack-protector -nostdlib -c $< -o $@
#Assemble the IRQ code
irq.o: kernel/irq.asm
	nasm kernel/irq.asm -f elf32 -o irq.o
#Assemble the kernel entry code
kernelEntry.o: boot/kernelEntry.asm
	nasm boot/kernelEntry.asm -f elf32 -o kernelEntry.o
#Link all the .o files with the kernel entry; kernelEntry.o is listed first so
#that it comes first in $^ and therefore first on the ld command line
kernel.bin: kernelEntry.o irq.o $(COBJS)
	ld -melf_i386 -o kernel.bin -Ttext 0x1000 $^ --oformat binary
#run and clean are commands, not files
.PHONY: run clean
run:
	qemu-system-x86_64 -fda $(OSOUTPUT)
clean:
	rm -f *.bin *.o $(COBJS)
Edit: I've decided to include the disassembly for kernelmain.c
kernelmain.o: file format elf32-i386
Disassembly of section .text:
00000000 <main>:
0: f3 0f 1e fb endbr32
4: 8d 4c 24 04 lea 0x4(%esp),%ecx
8: 83 e4 f0 and $0xfffffff0,%esp
b: ff 71 fc pushl -0x4(%ecx)
e: 55 push %ebp
f: 89 e5 mov %esp,%ebp
11: 51 push %ecx
12: 83 ec 54 sub $0x54,%esp
15: e8 fc ff ff ff call 16 <main+0x16>
1a: 83 ec 08 sub $0x8,%esp
1d: 6a 0e push $0xe
1f: 6a 01 push $0x1
21: e8 fc ff ff ff call 22 <main+0x22>
26: 83 c4 10 add $0x10,%esp
29: e8 fc ff ff ff call 2a <main+0x2a>
2e: e8 fc ff ff ff call 2f <main+0x2f>
33: dd 05 00 00 00 00 fldl 0x0
39: dd 5d f0 fstpl -0x10(%ebp)
3c: dd 05 08 00 00 00 fldl 0x8
42: dd 5d e8 fstpl -0x18(%ebp)
45: dd 45 f0 fldl -0x10(%ebp)
48: dc 75 e8 fdivl -0x18(%ebp)
4b: dd 5d e0 fstpl -0x20(%ebp)
4e: dd 05 08 00 00 00 fldl 0x8
54: dd 5d d8 fstpl -0x28(%ebp)
57: dd 45 e0 fldl -0x20(%ebp)
5a: dd 05 08 00 00 00 fldl 0x8
60: df e9 fucomip %st(1),%st
62: dd d8 fstp %st(0)
64: 7a 56 jp bc <main+0xbc>
66: dd 45 e0 fldl -0x20(%ebp)
69: dd 05 08 00 00 00 fldl 0x8
6f: df e9 fucomip %st(1),%st
71: dd d8 fstp %st(0)
73: 75 47 jne bc <main+0xbc>
75: c7 45 ac 48 61 72 64 movl $0x64726148,-0x54(%ebp)
7c: c7 45 b0 63 6f 64 65 movl $0x65646f63,-0x50(%ebp)
83: c7 45 b4 64 20 76 61 movl $0x61762064,-0x4c(%ebp)
8a: c7 45 b8 6c 75 65 73 movl $0x7365756c,-0x48(%ebp)
91: c7 45 bc 20 77 65 72 movl $0x72657720,-0x44(%ebp)
98: c7 45 c0 65 20 63 6f movl $0x6f632065,-0x40(%ebp)
9f: c7 45 c4 72 72 65 63 movl $0x63657272,-0x3c(%ebp)
a6: c7 45 c8 74 0a 00 00 movl $0xa74,-0x38(%ebp)
ad: 83 ec 0c sub $0xc,%esp
b0: 8d 45 ac lea -0x54(%ebp),%eax
b3: 50 push %eax
b4: e8 fc ff ff ff call b5 <main+0xb5>
b9: 83 c4 10 add $0x10,%esp
bc: c7 45 cc 45 6e 64 20 movl $0x20646e45,-0x34(%ebp)
c3: c7 45 d0 6f 75 74 70 movl $0x7074756f,-0x30(%ebp)
ca: c7 45 d4 75 74 00 00 movl $0x7475,-0x2c(%ebp)
d1: 83 ec 0c sub $0xc,%esp
d4: 8d 45 cc lea -0x34(%ebp),%eax
d7: 50 push %eax
d8: e8 fc ff ff ff call d9 <main+0xd9>
dd: 83 c4 10 add $0x10,%esp
e0: eb fe jmp e0 <main+0xe0>
After looking around at various resources, I found this forum page on OSdev here:
forum.osdev.org/viewtopic.php?f=1&t=21813 that described the process for first checking for an FPU, and then initializing it. Turns out that for whatever reason, my target platform does not have an FPU, which I'm guessing is the reason for the undefined behavior.

gcc generates unnecessary (?) instructions

I decided to compile a very basic C program and take a look at the generated code with objdump -d.
/* Minimal test program: ignores its arguments and terminates at once by
 * calling exit(0).
 * NOTE(review): no #include <stdlib.h> is visible in this snippet, so as shown
 * exit() is used without a declaration. */
int main(int argc, char *argv[]) {
exit(0);
}
After compiling it with gcc test.c -s -o test.o and then disassembling with objdump -d my text segment looked like this:
Disassembly of section .text:
0000000000001050 <.text>:
1050: 31 ed xor %ebp,%ebp
1052: 49 89 d1 mov %rdx,%r9
1055: 5e pop %rsi
1056: 48 89 e2 mov %rsp,%rdx
1059: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
105d: 50 push %rax
105e: 54 push %rsp
105f: 4c 8d 05 4a 01 00 00 lea 0x14a(%rip),%r8 # 11b0 <__cxa_finalize@plt+0x170>
1066: 48 8d 0d e3 00 00 00 lea 0xe3(%rip),%rcx # 1150 <__cxa_finalize@plt+0x110>
106d: 48 8d 3d c1 00 00 00 lea 0xc1(%rip),%rdi # 1135 <__cxa_finalize@plt+0xf5>
1074: ff 15 66 2f 00 00 callq *0x2f66(%rip) # 3fe0 <__cxa_finalize@plt+0x2fa0>
107a: f4 hlt
107b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
1080: 48 8d 3d a9 2f 00 00 lea 0x2fa9(%rip),%rdi # 4030 <__cxa_finalize@plt+0x2ff0>
1087: 48 8d 05 a2 2f 00 00 lea 0x2fa2(%rip),%rax # 4030 <__cxa_finalize@plt+0x2ff0>
108e: 48 39 f8 cmp %rdi,%rax
1091: 74 15 je 10a8 <__cxa_finalize@plt+0x68>
1093: 48 8b 05 3e 2f 00 00 mov 0x2f3e(%rip),%rax # 3fd8 <__cxa_finalize@plt+0x2f98>
109a: 48 85 c0 test %rax,%rax
109d: 74 09 je 10a8 <__cxa_finalize@plt+0x68>
109f: ff e0 jmpq *%rax
10a1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10a8: c3 retq
10a9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10b0: 48 8d 3d 79 2f 00 00 lea 0x2f79(%rip),%rdi # 4030 <__cxa_finalize@plt+0x2ff0>
10b7: 48 8d 35 72 2f 00 00 lea 0x2f72(%rip),%rsi # 4030 <__cxa_finalize@plt+0x2ff0>
10be: 48 29 fe sub %rdi,%rsi
10c1: 48 c1 fe 03 sar $0x3,%rsi
10c5: 48 89 f0 mov %rsi,%rax
10c8: 48 c1 e8 3f shr $0x3f,%rax
10cc: 48 01 c6 add %rax,%rsi
10cf: 48 d1 fe sar %rsi
10d2: 74 14 je 10e8 <__cxa_finalize@plt+0xa8>
10d4: 48 8b 05 15 2f 00 00 mov 0x2f15(%rip),%rax # 3ff0 <__cxa_finalize@plt+0x2fb0>
10db: 48 85 c0 test %rax,%rax
10de: 74 08 je 10e8 <__cxa_finalize@plt+0xa8>
10e0: ff e0 jmpq *%rax
10e2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
10e8: c3 retq
10e9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
10f0: 80 3d 39 2f 00 00 00 cmpb $0x0,0x2f39(%rip) # 4030 <__cxa_finalize@plt+0x2ff0>
10f7: 75 2f jne 1128 <__cxa_finalize@plt+0xe8>
10f9: 55 push %rbp
10fa: 48 83 3d f6 2e 00 00 cmpq $0x0,0x2ef6(%rip) # 3ff8 <__cxa_finalize@plt+0x2fb8>
1101: 00
1102: 48 89 e5 mov %rsp,%rbp
1105: 74 0c je 1113 <__cxa_finalize@plt+0xd3>
1107: 48 8b 3d 1a 2f 00 00 mov 0x2f1a(%rip),%rdi # 4028 <__cxa_finalize@plt+0x2fe8>
110e: e8 2d ff ff ff callq 1040 <__cxa_finalize@plt>
1113: e8 68 ff ff ff callq 1080 <__cxa_finalize@plt+0x40>
1118: c6 05 11 2f 00 00 01 movb $0x1,0x2f11(%rip) # 4030 <__cxa_finalize@plt+0x2ff0>
111f: 5d pop %rbp
1120: c3 retq
1121: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
1128: c3 retq
1129: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
1130: e9 7b ff ff ff jmpq 10b0 <__cxa_finalize@plt+0x70>
1135: 55 push %rbp
1136: 48 89 e5 mov %rsp,%rbp
1139: 48 83 ec 10 sub $0x10,%rsp
113d: 89 7d fc mov %edi,-0x4(%rbp)
1140: 48 89 75 f0 mov %rsi,-0x10(%rbp)
1144: bf 00 00 00 00 mov $0x0,%edi
1149: e8 e2 fe ff ff callq 1030 <exit@plt>
114e: 66 90 xchg %ax,%ax
1150: 41 57 push %r15
1152: 4c 8d 3d 8f 2c 00 00 lea 0x2c8f(%rip),%r15 # 3de8 <__cxa_finalize@plt+0x2da8>
1159: 41 56 push %r14
115b: 49 89 d6 mov %rdx,%r14
115e: 41 55 push %r13
1160: 49 89 f5 mov %rsi,%r13
1163: 41 54 push %r12
1165: 41 89 fc mov %edi,%r12d
1168: 55 push %rbp
1169: 48 8d 2d 80 2c 00 00 lea 0x2c80(%rip),%rbp # 3df0 <__cxa_finalize@plt+0x2db0>
1170: 53 push %rbx
1171: 4c 29 fd sub %r15,%rbp
1174: 48 83 ec 08 sub $0x8,%rsp
1178: e8 83 fe ff ff callq 1000 <exit@plt-0x30>
117d: 48 c1 fd 03 sar $0x3,%rbp
1181: 74 1b je 119e <__cxa_finalize@plt+0x15e>
1183: 31 db xor %ebx,%ebx
1185: 0f 1f 00 nopl (%rax)
1188: 4c 89 f2 mov %r14,%rdx
118b: 4c 89 ee mov %r13,%rsi
118e: 44 89 e7 mov %r12d,%edi
1191: 41 ff 14 df callq *(%r15,%rbx,8)
1195: 48 83 c3 01 add $0x1,%rbx
1199: 48 39 dd cmp %rbx,%rbp
119c: 75 ea jne 1188 <__cxa_finalize@plt+0x148>
119e: 48 83 c4 08 add $0x8,%rsp
11a2: 5b pop %rbx
11a3: 5d pop %rbp
11a4: 41 5c pop %r12
11a6: 41 5d pop %r13
11a8: 41 5e pop %r14
11aa: 41 5f pop %r15
11ac: c3 retq
11ad: 0f 1f 00 nopl (%rax)
11b0: c3 retq
As you can see, the part that was actually written by me occupies very little space.
The same program (if we ignore the fact that the main function is also treated as a function in C) in Assembly:
# Smallest equivalent of the C program: performs exit(0) directly as a
# system call, with no C runtime linked in (AT&T syntax, x86-64).
.global _start
.text
_start: mov $60, %rax    # system call number 60 (0x3c), the exit call
xor %rdi, %rdi    # first argument = 0, the exit status
syscall
Assembled, linked and disassembled with gcc -c demo.s && ld demo.o -o demo && objdump -d demo:
Disassembly of section .text:
0000000000401000 <_start>:
401000: 48 c7 c0 3c 00 00 00 mov $0x3c,%rax
401007: 48 31 ff xor %rdi,%rdi
40100a: 0f 05 syscall
The question is: what purpose do all these instructions serve and is there a way to generate code without them?
While I was writing the question I noticed that the C program calls exit() from the linked library whereas in Assembly I do it directly with a syscall. I don't think it is important in this case though.
gcc generates unnecessary (?) instructions
Yes, because you invoked GCC without asking for any compiler optimizations.
My recommendation: compile with
gcc -fverbose-asm -O2 -S test.c
then look inside the generated test.s assembler code.
BTW, most of the code is from crt0, which is given by, not emitted by, gcc. Build your executable with gcc -O2 -v test.c -o testprog to understand what GCC really does. Read documentation of GCC internals.
Since GCC is free software, you are allowed to look inside its source code and improve it. But the crt0 stuff is tricky, and operating system specific.
Consider also reading about linkers and loaders, about ELF executables, and How to write shared libraries, and the Linux Assembler HowTo.
gcc -s strips symbol names out of the final executable so you can't tell where different parts of the machine code came from.
Most of it is not from your main. To just see that, look at gcc -S output (asm source), e.g. on https://godbolt.org/. How to remove "noise" from GCC/clang assembly output?
Most of that is the CRT (C RunTime) startup code that eventually calls your main after initializing the standard library. (e.g. allocating memory for stdio buffers and so on.) It gets linked in regardless of how efficient your main is. e.g. compiling an empty int main(void){} with gcc -Os (optimize for size) will barely make it any smaller.
You could in theory compile with gcc -nostdlib and write your own _start that uses inline asm to make an exit system call.
See also
A Whirlwind Tutorial on Creating Really Teensy ELF Executables for Linux
How Get arguments value using inline assembly in C without Glibc? (getting command line args complicates the exercise of writing your own _start, but the answers there show how).
A C program does a lot of work before calling the main function. It has to initialize the .data and .bss segments, set up the stack, go through the constructors and destructors (yes, gcc has special attributes for such functions even in C), and initialize the library.
gcc destructor and constructor functions:
void __attribute__ ((constructor)) funcname(void);
void __attribute__ ((destructor)) funcname(void);
you may have as many constructors and destructors as you wish.
constructors are called before call to the main function, destructors on exit from the program (after the main termination)
https://gcc.gnu.org/onlinedocs/gcc-4.7.0/gcc/Function-Attributes.html#Function-Attributes

Dealing with callee registers going onto stack when calling functions passing several arguments

I have a question on a homework assignment for an Assembly class. I'm not looking for an answer by any means, just some guidance with how it works. Based on my understanding, I can't determine what exactly is happening.
Consider a function P, which generates local values a-c by simple local computation
and d-f by calling Q(), R(), and S().
long P(long x,long y,long z) {
long a = ...;
long b = ...;
long c = ...;
long d = ...;
long e = ...;
long f = ...;
return d + e + f;
}
0000000000000022 <P>:
22: 55 push %rbp
23: 53 push %rbx
24: 48 83 ec 20 sub $0x20,%rsp
28: 48 83 c7 01 add $0x1,%rdi
2c: 48 89 7c 24 18 mov %rdi,0x18(%rsp)
31: 48 83 c6 02 add $0x2,%rsi
35: 48 89 74 24 10 mov %rsi,0x10(%rsp)
3a: 48 83 c2 03 add $0x3,%rdx
3e: 48 89 54 24 08 mov %rdx,0x8(%rsp)
43: 48 8d 74 24 10 lea 0x10(%rsp),%rsi
48: 48 8d 7c 24 18 lea 0x18(%rsp),%rdi
4d: b8 00 00 00 00 mov $0x0,%eax
52: e8 00 00 00 00 callq 57
57: 48 89 c3 mov %rax,%rbx
5a: 48 8d 74 24 08 lea 0x8(%rsp),%rsi
5f: 48 8d 7c 24 10 lea 0x10(%rsp),%rdi
64: b8 00 00 00 00 mov $0x0,%eax
69: e8 00 00 00 00 callq 6e
6e: 48 89 c5 mov %rax,%rbp
71: 48 8d 74 24 18 lea 0x18(%rsp),%rsi
76: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi
7b: b8 00 00 00 00 mov $0x0,%eax
80: e8 00 00 00 00 callq 85
85: 48 01 eb add %rbp,%rbx
88: 48 01 d8 add %rbx,%rax
8b: 48 83 c4 20 add $0x20,%rsp
8f: 5b pop %rbx
90: 5d pop %rbp
91: c3 retq
Find the size of the stack in bytes.
Identify the assembly
statement(s) that allocate and free the local stack.
Identify which
local values get stored in callee-saved registers.
Identify which
local values get stored on the stack.
Explain why the program could
not store all of the local values in callee-saved registers.
I understand the concept of pushing rbx and rbp on the stack to make room for other local variables later on. I understand how space on the stack is allocated on line 24. Then the arguments passed into P are altered and stored on the stack. My issue starts at line 43.
Lines 43 and 48 create pointers to positions on the stack, correct? Then line 4d sets eax (or rax) to 0. Then on line 57 we set rbx to rax (0), and the next 3 lines I'm completely confused about. We create more pointers into the stack and store the addresses in rsi and rdi. Wouldn't this overwrite what we did in lines 43 and 48? And then it sets eax (rax) to 0 again on line 64, but eax was already 0 and nothing changed with it.
This repeats with the next call on line 69 as mentioned above. By the time you get to line 85 and 88 to me it seems like it would just be 0 + 0 + 0.
On a side note, shouldn't each 'callq' end with a 'ret'? For example, shouldn't there be a 'ret' after line 64 and 7b?
I've reached the point where it feels like something is missing from the code, but I wanted to check first because it seems more likely that I'm not understanding some core fundamental principle.
Thank you in advance for any friendly nudge in the right direction to figure this out!
The code you're looking at is unlinked code. The three call instructions do not simply jump to the next instruction,* they will be filled in by the linker with the offset to an actual function. So you cannot simply ignore their behavior as you have been.
The behavior of a function call is dependent on the ABI, as Anders mentioned. In particular, RSI and RDI should be assumed to be overwritten, and RAX contains the return value of the function.
* Call instructions in x86 are relative to the next instruction. So an offset of 0 in a call instruction causes the disassembler to display the next instruction as the target. This is typical for unlinked code.
You should be studying the ABI that describes the interface between higher level and ASM.
This is just an example link below. You need to find the ABI for the architecture and compiler you are using.
https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html

Assembly - js versus ja instruction

So the goal is for me to write out the C code that corresponds to this assembly :
0: 85 f6 test %esi,%esi
2: 78 13 js 17 <part3+0x17>
4: 83 fe 07 cmp $0x7,%esi
7: 77 14 ja 1d <part3+0x1d>
9: 8d 0c f5 00 00 00 00 lea 0x0(,%rsi,8),%ecx
10: 48 d3 ff sar %cl,%rdi
13: 48 89 f8 mov %rdi,%rax
16: c3 retq
17: b8 00 00 00 00 mov $0x0,%eax
1c: c3 retq
1d: b8 00 00 00 00 mov $0x0,%eax
22: c3 retq
I am a little confused because the first loop testing the %esi register ends before the second loop ends.
Is the second if statement comparing %esi to 7 inside the first loop? or is this a if , else if situation??
Let me sum up, what's already been said
0: 85 f6 test %esi,%esi
2: 78 13 js 17 <part3+0x17>
this is " if (esi < 0) goto 17; "
4: 83 fe 07 cmp $0x7,%esi
7: 77 14 ja 1d <part3+0x1d>
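this is " if (esi > 7) goto 1d; " (ja is the unsigned "above" test, whereas js tests the sign bit; since negative values were already handled by the js, esi is non-negative here and the unsigned test gives the same result as a signed > 7)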
9: 8d 0c f5 00 00 00 00 lea 0x0(,%rsi,8),%ecx
"ecx = 8*rsi" (not that obvious, but this lea is "just" a multiplication)
10: 48 d3 ff sar %cl,%rdi
rdi >> cl; // not cx, but cx is safe to be <= 7*8, so that's the same
13: 48 89 f8 mov %rdi,%rax
16: c3 retq
return rdi;
17: b8 00 00 00 00 mov $0x0,%eax
1c: c3 retq
17: "return 0"
1d: b8 00 00 00 00 mov $0x0,%eax
22: c3 retq
1d: another "return 0"
so the C-Code is:
{
if (esi < 0) return 0;
if (esi > 7) return 0;
return rdi >> ( 8 * rsi );
}
PS: the 2 "return 0" (17 and 1d) give a clear indication that, in the C-code, the two ifs were NOT combined into one
PPS: the C Code was obviously not compiled with optimization :P

Resources