I know the x64 calling convention, first four arguments are in rcx, rdx, r8, r9, rest are on the stack. But my question is how push these arguments?
call_func PROC
push rbp
mov rbp, rsp
mov rbx, rcx ; move C function address to rbx
mov rcx, 1 ; some dummy value
mov rdx, 2 ; some dummy value
mov r8, 3 ; some dummy value
mov r9, 4 ; some dummy value
; and now I want to push fifth argument, but how?
call rbx ; call the function
mov rsp, rbp
pop rbp
ret
call_func ENDP
I have tried mov QWORD PTR [rsp + 20h], 1 but when returning form this asm function the RIP register is set to weird value, like 0x0000000000000001. I know that the RIP register is instruction pointer, but why it is modifying it?
I have tried one more thing, let the function take 6 arguments and when I pass sixth argument like mov QWORD PTR [rsp + 28h], 1 the app is fine, sixth argument is passed, fifth has weird value.
As Jester said, before pushing arguments I need to allocate space for these arguments including shadow space.
Final working code:
call_func PROC
push rbp
mov rbp, rsp
sub rsp, 32 ; allocate shadow space 'padding'
sub rsp, 16 ; allocate space for fifth and sixth argument
mov r11, rcx ; move C function address to r11
mov rcx, 1 ; some dummy value
mov rdx, 2 ; some dummy value
mov r8, 3 ; some dummy value
mov r9, 4 ; some dummy value
mov QWORD PTR [rsp + 20h], 5 ; push fifth argument
mov QWORD PTR [rsp + 28h], 6 ; push sixth argument
call r11 ; call the function
mov rsp, rbp
pop rbp
ret
call_func ENDP
Related
I am able to take a reverse shell program in assembly, compile it using ld or link (visual studio), use objdump, get the shellcode, (yes it has no null-bytes), and I am able to use this in a dropper, a simple call to it works fine such as
#include <stdio.h
#include <windows.h>
int main() {
char *shellcode = "myshellcodegoesinhere";
printf("shellcode length: %i", strlen(shellcode));
void * lpAlloc = VirtualAlloc(0, sizeof shellcode, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
memcpy(lpAlloc, shellcode, strlen(shellcode));
((void(*)())lpAlloc)();
return 0;
}
However, when it comes to 64-bits, I am able to execute my programs as an exe successfully but not in a dropper like above. I literally wrote a 64-bit reverse shell in windows and it works perfectly (havent found another one online that actually works) but when I convert it to shellcode, it doesn't work in my above dropper. And yes I removed all the null bytes from it, it was quite a challenge. So then I decided to see if another simple program would behave the same way and sure enough it does. I took a simple swap mouse button program and rewrote it to remove the null bytes out of it, runs perfectly from an exe but not in a dropper. I know that my 32-bit reverse shell works in a 64-bit system. That is not the point here. The point is any 64-bit application is not able to be used as shellcode on a 64-bit machine. Here is the swap mouse button program in assembly.
BITS 64
SECTION .text
global _start
_start:
sub RSP, 0x28 ; 40 bytes of shadow space
and RSP, 0FFFFFFFFFFFFFFF0h ; Align the stack to a multiple of 16 bytes
; Parse PEB and find kernel32
xor rcx, rcx ; RCX = 0
mov rax, [gs:rcx + 0x60] ; RAX = PEB
mov rax, [rax + 0x18] ; RAX = PEB->Ldr
mov rsi, [rax + 0x20] ; RSI = PEB->Ldr.InMemOrder
lodsq ; RAX = Second module
xchg rax, rsi ; RAX = RSI, RSI = RAX
lodsq ; RAX = Third(kernel32)
mov rbx, [rax + 0x20] ; RBX = Base address
; Parse kernel32 PE
xor r8, r8 ; Clear r8
mov r8d, [rbx + 0x3c] ; R8D = DOS->e_lfanew offset
mov rdx, r8 ; RDX = DOS->e_lfanew
add rdx, rbx ; RDX = PE Header
; start a loop to inc edx 0x88 times to reach the export directory
xor rcx, rcx
xor rax, rax
mov al, 0x88 ; 136 bytes is needed to add to edx to reach the export directory
inc_edx:
inc byte edx
dec al
cmp al, cl
jne inc_edx
mov r8d, [edx] ; R8D = Offset export table
add r8, rbx ; R8 = Export table
xor rsi, rsi ; Clear RSI
mov esi, [r8 + 0x20] ; RSI = Offset namestable
add rsi, rbx ; RSI = Names table
xor rcx, rcx ; RCX = 0
mov r9, 0x41636f7250746547 ; GetProcA
; Loop through exported functions and find GetProcAddress
Get_Function:
inc rcx ; Increment the ordinal
xor rax, rax ; RAX = 0
mov eax, [rsi + rcx * 4] ; Get name offset
add rax, rbx ; Get function name
cmp QWORD [rax], r9 ; GetProcA ?
jnz Get_Function
xor rsi, rsi ; RSI = 0
mov esi, [r8 + 0x24] ; ESI = Offset ordinals
add rsi, rbx ; RSI = Ordinals table
mov cx, [rsi + rcx * 2] ; Number of function
xor rsi, rsi ; RSI = 0
mov esi, [r8 + 0x1c] ; Offset address table
add rsi, rbx ; ESI = Address table
xor rdx, rdx ; RDX = 0
mov edx, [rsi + rcx * 4] ; EDX = Pointer(offset)
add rdx, rbx ; RDX = GetProcAddress
mov rdi, rdx ; Save GetProcAddress in RDI
; Use GetProcAddress to find the address of LoadLibrary
mov rcx, 0x41797261 ; aryA
push rcx ; Push on the stack
mov rcx, 0x7262694c64616f4c ; LoadLibr
push rcx ; Push on stack
mov rdx, rsp ; LoadLibraryA
mov rcx, rbx ; kernel32.dll base address
sub rsp, 0x20 ; Allocate stack space for function call
call rdi ; Call GetProcAddress
mov rsi, rax ; LoadLibrary saved in RSI
xor rcx, rcx
push dword 0x41416c6c ; ll
;push dword rcx ; Push on the stack
sub word [rsp + 0x2], 0x4141
mov rcx, 0x642e323372657375 ; user32.d
push rcx ; Push on stack
mov rcx, rsp ; user32.dll
sub rsp, 0x20 ; Allocate stack space for function call
call rsi ; Call LoadLibraryA
mov r15, rax ; Base address of user32.dll in R15
; Call GetProcAddress(user32.dll, "SwapMouseButton")
mov rcx, 0x416e6f7474754265 ; eButton
push rcx ; Push on the stack
sub byte [rsp + 0x7], 0x41
mov rcx, 0x73756f4d70617753 ; SwapMous
push rcx ; Push on stack
mov rdx, rsp ; SwapMouseButton
mov rcx, r15 ; User32.dll base address
sub rsp, 0x20 ; Allocate stack space for function call
call rdi ; Call GetProcAddress
mov r15, rax ; SwapMouseButton in R15
; Call SwapMouseButton(true)
xor rcx, rcx ; true
inc cl
call r15 ; SwapMouseButton(true)
; Call GetProcAddress(kernel32.dll, "ExitProcess")
xor rcx, rcx ; RCX = 0
push dword 0x41737365 ; ess
sub byte [rsp + 0x3], 0x41
push rcx ; Push on the stack
mov rcx, 0x636f725074697845 ; ExitProc
push rcx ; Push on stack
mov rdx, rsp ; ExitProcess
mov rcx, rbx ; Kernel32.dll base address
sub rsp, 0x20 ; Allocate stack space for function call
call rdi ; Call GetProcAddress
; Call ExitProcess(0)
xor rcx, rcx ; Exit code 0
call rax ; ExitProcess(0)
Save this as sw64.s
link.exe (visual studio x64 native tools command prompt)
I have two linkers I use that work just fine. both link and Golink.exe
nasm -f win64 sw64.s && link sw64.obj /SUBSYSTEM:CONSOLE /OUT:sw64.exe /LARGEADDRESSAWARE:NO /ENTRY:_start && sw64.exe
or using Golink.exe
nasm -f win64 sw64.s && c:\Golink\GoLink.exe /console /entry _start sw64.obj /fo sw64.exe && sw64.exe
I then convert the shellcode using objdump and filter with awk and sed to produce \x??\x?? output and use this in my dropper. I've done it for my 32-bit reverse shell and it works like a charm but not for 64-bit. I would like to understand why this isn't working. Thank you.
converting 64-bit to shellcode does not work, but 32-bit does
Here is a basic program I written on the godbolt compiler, and it's as simple as:
#include<stdio.h>
void main()
{
int a = 10;
int *p = &a;
printf("%d", *p);
}
The results after compilation I get:
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-12], 10
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
nop
leave
ret
Question: Pushing the rbp, making the stack frame by making a 16 byte block, how from a register, a value is moved to a stack location and vice versa, how the job of LEA is to figure out the address, I got this part.
Problem:
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov rax, QWORD PTR [rbp-8]
mov eax, DWORD PTR [rax]
Lea -> getting address of rbp-12 into rax,
then moving the value which is the address of rbp-12 into rax,
but next line again says, move to rax, the value of rbp-8. This seems ambiguous. Then again moving the value of rax to eax. I don't understand the amount of work here. Why couldn't I have done
lea rax, [rbp-12]
mov QWORD PTR [rbp-8], rax
mov eax, QWORD PTR [rbp-8]
and be done with it? coz on the original line, rbp-12's address is stored onto rax, then rax stored to rbp-8. then rbp-8 stored again into rax, and then again rax is stored into eax? couldn't we have just copied the rbp-8 directly to eax? i guess not. But my question is why?
I know there is de-referencing in pointers, so How LEA helps grabbing the address of rbp-12, I understand, but on the next parts, when did it went from grabbing values from addresses I completely lost. And also, after that, I didn't understand any of the asm lines.
You're seeing very un-optimized code. Here's my line-by-line interpretation:
.LC0:
.string "%d" ; Format string for printf
main:
push rbp ; Save original base pointer
mov rbp, rsp ; Set base pointer to beginning of stack frame
sub rsp, 16 ; Allocate space for stack frame
mov DWORD PTR [rbp-12], 10 ; Initialize variable 'a'
lea rax, [rbp-12] ; Load effective address of 'a'
mov QWORD PTR [rbp-8], rax ; Store address of 'a' in 'p'
mov rax, QWORD PTR [rbp-8] ; Load 'p' into rax (even though it's already there - heh!)
mov eax, DWORD PTR [rax] ; Load 32-bit value of '*p' into eax
mov esi, eax ; Load value to print into esi
mov edi, OFFSET FLAT:.LC0 ; Load format string address into edi
mov eax, 0 ; Zero out eax (not sure why -- likely printf call protocol)
call printf ; Make the printf call
nop ; No-op (not sure why)
leave ; Remove the stack frame
ret ; Return
Compilers, when not optimizing, generate code like this as they parse the code you gave them. It's doing a lot of unnecessary stuff, but it is quicker to generate and makes using a debugger easier.
Compare this with the optimized code (-O2):
.LC0:
.string "%d" ; Format string for printf
main:
mov esi, 10 ; Don't need those variables -- just a 10 to pass to printf!
mov edi, OFFSET FLAT:.LC0 ; Load format string address into edi
xor eax, eax ; It's a few cycles faster to xor a register with itself than to load an immediate 0
jmp printf ; Just jmp to printf -- it will handle the return
The optimizer found that the variables weren't necessary, so no stack frame is created. Nothing is left but the printf call! And that's done as a jmp since nothing else need be done here when the printf is complete.
I am trying to understand how a variable sized static array work internally:
Following is a fixed size static array in C and its Assembly equivalent;
int main()
{
int arr[2] = {3};
}
================
main:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-8], 0
mov DWORD PTR [rbp-8], 2
mov eax, 0
pop rbp
ret
However a variable sized array is shown below
int main()
{
int varSize ;
int Arr[varSize];
}
=================
main:
push rbp
mov rbp, rsp
sub rsp, 32
mov rax, rsp
mov rcx, rax
mov eax, DWORD PTR [rbp-4]
movsx rdx, eax
sub rdx, 1
mov QWORD PTR [rbp-16], rdx
movsx rdx, eax
mov r8, rdx
mov r9d, 0
movsx rdx, eax
mov rsi, rdx
mov edi, 0
cdqe
lea rdx, [0+rax*4]
mov eax, 16
sub rax, 1
add rax, rdx
mov edi, 16
mov edx, 0
div rdi
imul rax, rax, 16
sub rsp, rax
mov rax, rsp
add rax, 3
shr rax, 2
sal rax, 2
mov QWORD PTR [rbp-24], rax
mov rsp, rcx
mov eax, 0
leave
ret
I am seeing a whole lot of assembly instructions if I declare a variable sized array. Can some one explain how is this flexibility of variable size achieved?
Same mechanism as alloca() - allocate memory by decreasing the stack pointer, with the assumption that the stack is big enough and/or the OS will grow it as needed.
There might be a bit of an issue when the requested size is over a memory page and the stack is near its end. Normally, the OS grows the stack by setting up a guard page at the stack top and watching for faults in that area, but that assumes that the stack grows more or less sequentially (by pushes and function calls). If the decreased stack pointer overshoots the guard page, it might end up pointing at a bogus location. I'm not sure what does the compiler do about that possibility.
section .data
text db 'Put a number',10,0
scanform db '%d'
number dw 0
section .text
extern printf,scanf
global main
main:
push rbp
mov rbp,rsp
push rdi
push rsi
push rbx
mov rdi,text
mov rax,0
call printf
mov rsi,number
mov rdi,scanform
mov rax,0
call scanf
pop rbx
pop rsi
pop rdi
ret
This is my code I write a other codes all day and I do not have problem with these but now when I call scanf, write program received signal SIGSEV, segfault... Specified first and last line in different files. I do not understand this message can someone help me?
You have the following issues:
You forgot to pop rbp.
You misalign the stack which needs to be 16 byte aligned.
You do not zero terminate your format string (thanks to Paul for pointing this out).
You use %d which writes a 4 byte integer but you only allocated 2 bytes with dw.
It is recommended to align integers to 4 bytes.
A possible fixed version:
section .data
number dd 0
text db 'Put a number',10,0
scanform db '%d', 0
section .text
extern printf,scanf
global main
main:
push rbp
mov rbp,rsp
push rdi
push rsi
push rbx
push rbx ; for alignment
mov rdi,text
mov rax,0
call printf
mov rsi,number
mov rdi,scanform
mov rax,0
call scanf
pop rbx
pop rbx
pop rsi
pop rdi
pop rbp
ret
Since rsi and rdi are caller-saved registers and rbx is not touched, you can simplify the code. I also changed to xor zeroing and rip-relative addressing as follows:
section .data
number dd 0
text db 'Put a number',10,0
scanform db '%d', 0
section .text
extern printf,scanf
global main
main:
push rbp
lea rdi, [rel text]
xor eax, eax
call printf
lea rsi, [rel number]
lea rdi, [rel scanform]
xor eax, eax
call scanf
pop rbp
ret
I'm stuck at figuring out to copy the string source to target, which should be initialized to all zeroes. It appears as though I need to find the size of the string, start a counter register, and push stringitem[counter] to the stack, increment counter register. I can't figure out how to even get started, let alone search for a word in the string.
Thanks!
bits 64
global main
extern printf
section .text
main:
; function setup
push rbp
mov rbp, rsp
sub rsp, 32
;
lea rdi, [rel message]
mov al, 0
call printf
;
lea rdi, [rel source]
mov al, 0
call printf
;
;mov edi, source
;mov esi, target
;lea rdi, [esi]
;mov al, 0
;call printf
;mov ecx,sizeof source -1
; mov esi,0
;L1:
; mov eax,source[esi];
; push eax
; inc esi
; loop L1
; function return
mov eax, 0
add rsp, 32
pop rbp
ret
section .data
message: db 'Project',0x0D,0x0a,'Author',0x0D,0x0a,0
source: db 0x0D,0x0a,"I can't figure out how to copy this text to target.",0x0D,0x0a,0
target: db '0000000000000000000000000000000000000000000',0x0D,0x0a,0
For your data memory layout this will do
lea rdi, [rel target]
lea rsi, [rel source]
mov rcx, target-source
cld
rep movsb
Otherwise as Jester said, a simple byte-to-byte copy will also do
lea rdi, [rel target]
lea rsi, [rel source]
cld
.copy:
lodsb
stosb
test al, al
jnz .copy