Linux writing jump to protect memory using mprotect - c

I am writing a hook for a function using the basic x86 method of inserting a 5-byte jump. My code is rusty, but I think I have the logic right. I get a segmentation fault when I run it with the LD_PRELOAD environment variable set. I am basically using a replacement function, a hook function, and a trampoline function to modify and return the original address. The code is below.
foo.h
#ifndef foo_h__
#define foo_h__
extern void foo(const char*);
#endif // foo_h_
foo.c
#include <stdio.h>
/* Write the given string, followed by a newline, to standard output. */
void foo(const char *str)
{
    puts(str);
}
main.c
#include <stdio.h>
#include "foo.h"
/* Announce the test, then call foo() for iterations 1 through 199,
 * printing the iteration number before each call. */
int main(void)
{
    const char *message = "I am a shared lib!\n";

    puts("This is a shared library test...");
    for (int iteration = 1; iteration != 200; ++iteration) {
        printf("%d time!\n", iteration);
        foo(message);
    }
    return 0;
}
hook.c
# include <stdio.h>
# include <unistd.h>
# define __USE_GNU
# include <dlfcn.h>
# include <stdint.h>
# include <sys/mman.h>
const char*str = "Hooked! ma fucker!\n";
/* Bookkeeping shared by hook(), fooHooked() and the interposed foo(). */
struct hookdata
{
/* Address of the function being hooked; assigned from dlsym() in hook(). */
int64_t*origFunc;
/* Address of the replacement; assigned from &fooHooked in hook(). */
int64_t*newFunc;
/* String that fooHooked() passes on to foo_trampoline. */
const char*s;
/* Function pointer invoked by fooHooked(); nothing visible here assigns it. */
void (*foo_trampoline)(const char*str);
/* NOTE(review): hkd is a file-scope pointer without an initializer, so it
 * starts out NULL, and no visible code points it at storage -- confirm it is
 * allocated before any hkd-> access. */
}*hkd;
/* Replacement body for foo(): print the caller's string, then call the
 * trampoline with the string stored in the hook data. */
void fooHooked(const char *str)
{
    puts(str);

    const char *forwarded = hkd->s;
    hkd->foo_trampoline(forwarded);
}
/* Look up "foo", make the start of its page writable, and store a jump
 * encoding at the function's first bytes so that it redirects to fooHooked. */
void hook(void)
{
//Get pointers to the original and new functions and calculate the jump offset
// NOTE(review): the handle passed to dlsym() is RTLD_NOW, which is a dlopen()
// flag; dlsym() normally takes RTLD_NEXT/RTLD_DEFAULT or a dlopen() handle --
// confirm. The result is not checked for NULL.
hkd->origFunc = dlsym(RTLD_NOW, "foo");
hkd->newFunc = (int64_t*) &fooHooked;
// Both operands are int64_t *, so the "+ 5" and the subtraction are counted in
// int64_t elements rather than bytes.
int64_t offset = hkd->newFunc - (hkd->origFunc + 5);
//Make the memory containing the original function writable
//Code from http://stackoverflow.com/questions/20381812/mprotect-always-returns-invalid-arguments
size_t pageSize = sysconf(_SC_PAGESIZE);
uintptr_t start = (uintptr_t) hkd->origFunc;
uintptr_t end = start + 1;
// Round the function address down to the start of its page.
uintptr_t pageStart = start & -pageSize;
// The range runs from the page start through the first byte of the function.
// The return value of mprotect() is not checked.
mprotect((void *) pageStart, end - pageStart,
PROT_READ | PROT_WRITE | PROT_EXEC);
//Insert the jump instruction at the beginning of the original function
// Opcode 0xe9 goes in the low byte with the offset shifted above it; the
// result is narrowed to a 4-byte int32_t.
int32_t instruction = 0xe9 | offset << 8;
// The store goes through an int64_t *, so sizeof(int64_t) bytes are written.
*hkd->origFunc = instruction;
}
/* Interposed foo(): install the hook unless the value read through
 * hkd->origFunc already equals 0xe9, in which case report it instead. */
void foo(const char *str)
{
    if (*hkd->origFunc != 0xe9) {
        hook();
        return;
    }
    printf("hook detected!");
}

The combination of page access flags PROT_READ | PROT_WRITE | PROT_EXEC violates W^X protection, so that may be the first problem at hand. In a first step first setting PROT_READ | PROT_WRITE for replacing the function preamble and then restoring it to PROT_READ | PROT_EXEC will probably solve that problem.

You are storing your 5-byte instruction in a 4-byte type; you need something like this:
unsigned char instr[5];
instr[0] = 0xe9;
*(int32_t*)(&instr[1]) = offset;
memcpy(hkd->origFunc, instr, 5);

Related

Failed to read pci csr via mmap using uint64_t pointer [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed last month.
Improve this question
I'm trying to reading PCI CSR (Configuration Space Register) on my system via open,mmap /dev/mem.
I met some problems when using 8 byte length reading
Here is the minimal working example of my code
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
/* Print the current source line and file plus errno and its message to
 * stderr, then terminate the process with exit status 1. */
#define FATAL                                                         \
    do {                                                              \
        const int fatal_errno_ = errno;                               \
        fprintf(stderr, "Error at line %d, file %s (%d) [%s]\n",      \
                __LINE__, __FILE__, fatal_errno_,                     \
                strerror(fatal_errno_));                              \
        exit(1);                                                      \
    } while(0)
#define PAGE_SIZE 4096UL
#define PAGE_MASK (PAGE_SIZE - 1)
/* One read request and its result, as used by rw_worker(). */
typedef struct rw_config rw_config;
struct rw_config {
/* Address handed to _mmio_read_worker(), used there as an offset into /dev/mem. */
uint64_t address;
/* Value returned by _mmio_read_worker() for that address. */
uint64_t data;
};
/*
 * Map the /dev/mem page that contains `address`, read one 64-bit value at
 * that address and return it.  A failing open() or mmap() terminates the
 * process through FATAL.
 *
 * The value is fetched with a single uint64_t load; a 32-bit and a 64-bit
 * view of the same location are also printed for comparison.
 */
static uint64_t _mmio_read_worker(uint64_t address) {
    int fd;
    void *map_base = NULL;
    void *map_address = NULL;
    uint64_t result = 0UL;

    if((fd = open("/dev/mem", O_RDONLY | O_SYNC)) < 0) FATAL;

    // PAGE_SIZE = 4096UL
    // PAGE_MASK = (PAGE_SIZE - 1) = 4095UL
    // The file offset is rounded down to the start of the containing page.
    map_base = mmap(NULL,
                    PAGE_SIZE,
                    PROT_READ,
                    MAP_SHARED,
                    fd,
                    (address & ~PAGE_MASK));
    if(map_base == MAP_FAILED) FATAL;

    // Byte-wise arithmetic on a char pointer; arithmetic on void * is a
    // compiler extension, not standard C.
    map_address = (unsigned char *)map_base + (address & PAGE_MASK);
    result = *(uint64_t *)map_address;
    printf("uint32_t 0x%016x, uint64_t 0x%016lx\n",
           (*(uint32_t *)map_address),
           (*(uint64_t *)map_address));

    // Release the mapping as well as the descriptor so calls do not leak.
    munmap(map_base, PAGE_SIZE);
    close(fd);
    return result;
}
/* Read the value located at cfg->address and store it in cfg->data. */
void rw_worker(rw_config *cfg) {
    const uint64_t value = _mmio_read_worker(cfg->address);

    cfg->data = value;
}
int main(int argc, char *argv[]) {
rw_config *cfg = malloc(sizeof(rw_config));
cfg->address = 0x80000000;
cfg->data = 0x0;
rw_worker(cfg);
return 0;
}
Reading the address = 0x80000000 which is pci mmio base address.
The output of my code is as follows:
uint32_t 0x0000000009a28086, uint64_t 0xffffffffffffffff
And I try to using gdb to get some information.
(gdb) printf "0x%llx\n",(*(uint64_t *)map_address)
0x10000009a28086
# before assigning 'result'
(gdb) printf "0x%llx\n",result
0x0
(gdb) next
# after assigning 'result'
(gdb) printf "0x%llx\n",result
0xffffffffffffffff
(gdb) print map_address
$2 = (void *) 0x7ffff7ffb000
(gdb) x/1xg 0x7ffff7ffb000
0x7ffff7ffb000: 0x0010000009a28086
I guess I am failing to cast the (void *) to (uint64_t *) and dereference it correctly, but why?
The value storage in map_address is correct, am I using the wrong way to get the value?
After reading the replies from other members, I read some documents that may be related to this bug, and the following are some of my insights:
I tried testing with another address which NOT in PCI CSR(Configuration Space Register), and got the correct value. So I think this bug is related to hardware registers rather than software implementation
In EDK II UEFI Writer Guide link, using 64bits read on PCI BAR(Base Address Register, which is a part of PCI CSR) may cause an alignment fault, you should use 2x of 32bits read to achieve 64bits read. Although in the example it is not enforced that the whole CSR has this limitation, but I think there is already a good reason for this bug.
PCIe config space must be read using 1, 2, or 4-byte accesses.
8-byte accesses are not permitted.
This is specified in the PCIe spec, section 2.2.7.1: "For Configuration Requests, Length[9:0] must be 00 0000 01b." (Length is specified in DW.)
My experience is that 8-byte accesses always return FF in all bytes.

SIGSEGV, after mmapping area

I am trying to revive process from core dump after SIGQUIT.
I really want that piece of virtual memory, yet I get SIGSEGV when I try to map it.
EDIT: This area isn't free: 0xf75d2000 - 0xf7774000, but still i want to have it.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <ucontext.h>
#include <elf.h>
#include <sys/procfs.h>
#include <sys/user.h>
#include <linux/unistd.h>
#include <linux/unistd.h>
#include <asm/ldt.h>
#include <signal.h>
bool flag = false;
int argc2;
char ** argv2;
/* Copy /proc/self/maps to stdout, then attempt the fixed-address anonymous
 * mapping at 0xf75d2000.  A failed mmap() is reported on stderr.
 * Always returns 0. */
int main2(){
    FILE * file = fopen("/proc/self/maps", "r");
    if (file) {
        /* getc() returns an int: a char cannot hold EOF distinctly from a
         * valid byte, and never equals EOF where char is unsigned. */
        int c;
        while ((c = getc(file)) != EOF)
            putchar(c);
        fclose(file);
    }
    fflush(stdout);
    void* res = mmap((void*)(0xf75d2000), 0x001a5000, PROT_EXEC | PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
    if (res == MAP_FAILED)
        perror("mmap");
    return 0;
}
/* Entry point.  On the first pass (flag == false) it maps a fixed region,
 * points the saved stack pointer into it and resumes through setcontext().
 * On the resumed pass (flag == true) it runs main2() and exits with its
 * return value. */
int main(int argc, char ** argv){
argc2 = argc;
argv2 = argv;
ucontext_t cont;
/* setcontext(&cont) below resumes execution right after this call. */
getcontext (&cont);
if(!flag){
/* 81920 (0x14000) bytes at 0x34B000, i.e. the range 0x34B000-0x35F000. */
void* a = mmap((void*)0x34B000, 81920, PROT_EXEC | PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
if(a == MAP_FAILED){
printf("mmapfail");
return 1;
}
/* Move the saved stack pointer to an address inside the region mapped above. */
cont.uc_mcontext.gregs[REG_ESP] = 0x355000;
/* Set before setcontext() so the resumed pass takes the else branch. */
flag = true;
setcontext(&cont);
} else{
exit(main2());
}
}
I'm compiling it with:
gcc -static -Wl,-Ttext=0x4A9480,--build-id=none,-Tdata=0x639480,--section-start=.plt=0x3B9480,--section-start=.rel.plt=0x3AF480,--section-start=.note.ABI-tag=0x39B480 main.c -o main -m32
The address you are trying to map (0xf75d2000) is above the userspace/kernel split in virtual memory. If your kernel is configured with CONFIG_VMSPLIT_3G, you can't map arbitrary addresses above 0xc0000000.
The existing mappings were setup in kernel to expose the vDSO space (to assist with system calls).
Of course you get a SEGV. You map things with MAP_FIXED into some address that doesn't belong to you, then you pull the stack from under your feet. You cannot do this.
The address space is not yours to mess around in. MAP_FIXED is only safe for overwriting earlier mappings. You can possibly play around in it in a single experiment where you'll throw away the program afterwards, but any other use is just not going to work.
Right now your call to setcontext will crash because it doesn't know where to return. Do you even know how function calls and the stack interact? Your call to setcontext saves the return address on the stack, then setcontext changes the stack pointer then it tries to return and dies because it reads 0 as the return address (or setcontext maybe saves the old stack pointer in some other register and will restore it from that register before it returns and what crashes is your other mmap that overwrites the real stack). Please don't do this. Your only chance to reliably change stacks without being the operating system is to set up a signal handler with sigaltstack, catch that signal and never return from the signal handler.
But since you're mapping the memory for your new stack with MAP_FIXED into some random address you'll probably overwrite some other important data structure and it still won't work.
The address space needs to be claimed before other areas are claimed, therefore it needs to be claimed in the executable's metadata.
Create a section in assembly language, then specify its address in a command-line argument to the linker.
For example:
#include <stdio.h>
/* Symbol defined by the assembly below; the linker command line given with
 * this example places its section at a fixed address. */
extern char mem[];
/* Emit a section named "fixed" that holds the global symbol `mem` followed
 * by 0x20000000 zero bytes. */
asm (R"(
.section fixed, "aw", #nobits
.global mem
mem:
.zero 0x20000000
)");
/* Print the address that `mem` ended up at. */
int main() {
printf("mem = %p\n", mem);
}
Compile and link with:
gcc -O2 -Wl,--section-start=fixed=0x40000000 -fno-pie -no-pie test.c
Unfortunately, using GCC's __attribute__((section("fixed"))) on a variable definition results in an executable bloated with zeros.

mmap file returns a pointer to an inaccessible place in memory

I have this program that is supposed to mmap a file in read-write mode and be able to edit its contents. Also the file this is written for is about 40-50 GB, so I need mmap64. The problem is, while mmap64 does not return an error, the address it returns is not accessible.
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <unistd.h>
typedef unsigned long long u64;
/* Read each of the sz bytes starting at the address held in the integer p. */
void access_test(u64 p, u64 sz)
{
    char tmp;
    u64 offset = 0;

    while (offset < sz) {
        tmp = *(char*)(p + offset);
        offset++;
    }
}
/* Map the file named by argv[1] read/write, read every byte of it through
 * access_test(), then close the descriptor and unmap the region.
 * Returns 0 on success and 1 after reporting any failing call. */
int main(int argc, char *argv[])
{
int fd;
/* NOTE(review): p receives the mmap64() result as an integer, not a pointer. */
long long int sz, p;
struct stat buf;
/* NOTE(review): the third argument is written in hex (0x0666) and no O_CREAT
 * flag is passed -- confirm a mode argument is intended here at all. */
fd = open(argv[1], O_RDWR, 0x0666);
if (fd == -1) {
perror("open");
return 1;
}
/* NOTE(review): buf is a struct stat but is passed to fstat64(); the return
 * value is not checked. */
fstat64(fd, &buf);
sz = buf.st_size;
printf("File size: 0x%016llx\n", sz);
/* Shared read/write mapping of the whole file starting at offset 0. */
p = mmap64 (0, buf.st_size, PROT_READ | PROT_WRITE , MAP_SHARED, fd, 0);
if (p == -1) {
perror ("mmap");
return 1;
}
access_test(p,sz);
if (close (fd) == -1) {
perror ("close");
return 1;
}
if (munmap ((void*)p, buf.st_size) == -1) {
perror ("munmap");
return 1;
}
return 0;
}
The result of this is on a small file:
$ ./testmmap minicom.log
File size: 0x0000000000000023
[1] 8282 segmentation fault (core dumped) ./testmmap minicom.log
The same goes for the big one.
Always enable warnings when you compile
Here is the result with warnings enabled:
$ gcc mmp.c -Wall -g
mmp.c: In function ‘access_test’:
mmp.c:18:10: warning: variable ‘tmp’ set but not used [-Wunused-but-set-variable]
char tmp;
^
mmp.c: In function ‘main’:
mmp.c:36:5: warning: implicit declaration of function ‘fstat64’ [-Wimplicit-function-declaration]
fstat64(fd, &buf);
^
mmp.c:40:5: warning: implicit declaration of function ‘mmap64’ [-Wimplicit-function-declaration]
p = mmap64 (0, buf.st_size, PROT_READ | PROT_WRITE , MAP_SHARED, fd, 0);
The last two warnings here are extremely important. They say there is no prototype for mmap64. C therefore gives you a default prototype, and it is wrong, at least for the mmap64() call (since the prototype will return an int, which cannot represent a pointer on a 64-bit Linux host)
The argument to fstat64() is a struct stat64 too BTW, which is another issue.
Make the specific 64-bit functions available
If you want to make the fstat64()/mmap64() functions available, you need to compile the code with the _LARGEFILE_SOURCE and _LARGEFILE64_SOURCE macros defined (see information here), so you should compile this as e.g.:
gcc -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE mmp.c -Wall -g
Or use #define _FILE_OFFSET_BITS 64 before including any headers.
There is, however, no need to do this. Just call the normal fstat() and mmap() and define _FILE_OFFSET_BITS as 64 when compiling, e.g.:
gcc -D_FILE_OFFSET_BITS=64 mmp.c -Wall -g
This will enable support for large files, and e.g. translate the mmap() call to mmap64() if it is needed (e.g. if you're on a 32-bit host).
If you are trying to mmap() an 50 GB file, you anyway need to be on a 64-bit host, and on a 64-bit Linux host there's no need for any of this - mmap() and fstat() handles large files without any need to do anything.
Use pointers
The next issue is you're assigning the return value of mmap() to an integer. This might happen to work, but the code does look odd because of it. If you want to treat the thing as a char *, assign it to a char *. Don't play tricks with casting pointers around to a 64-bit integer type.
E.g. your access function should be:
/*
 * Read every one of the sz bytes starting at p, to check that the whole
 * region can be accessed.
 *
 * The loads go through a volatile-qualified pointer so that an optimizing
 * compiler cannot discard them as unused.
 */
void access_test(char *p, u64 sz)
{
    const volatile char *bytes = p;

    for (u64 i = 0; i < sz; i++) {
        (void)bytes[i];
    }
}
And p should be declared as char *p; in main(), or use uint8_t *p; if you intend to treat the data as binary data.

Copying part of the stack and using mmap to map it to the current process

I want my program to do the following:
Open a new file.
Copy a (page-aligned) portion of the stack that includes the current frame pointer address to the file.
Map the contents of the file back into the process's address space in the same range as that of the original portion of the stack, so that the process will use the file for that part of its stack rather than the region of memory the system had originally allocated to it for the stack.
Below is my code. I am getting a segmentation fault on the call to mmap, specifically where mmap makes the system call with vsyscall. (I am working with gcc 4.4.3, glibc 2.11.1, under Ubuntu Server (x86-64). I have compiled and run both with 64-bit and 32-bit configurations, with the same results.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <assert.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#define PAGE_SIZE 0x1000
#define FILENAME_LENGTH 0x10
#if defined ARCH && ARCH == 32
#define PAGE_SIZE_COMPLEMENT 0xfffff000
#define UINT uint32_t
#define INT int32_t
#define BP "ebp"
#define SP "esp"
#define X_FORMAT "%x"
#else
#define PAGE_SIZE_COMPLEMENT 0xfffffffffffff000
#define UINT uint64_t
#define INT int64_t
#define BP "rbp"
#define SP "rsp"
#define X_FORMAT "%lx"
#endif
#define PAGE_ROUND_UP(v) (((v) + PAGE_SIZE - 1) & PAGE_SIZE_COMPLEMENT)
#define PAGE_ROUND_DOWN(v) ((v) & PAGE_SIZE_COMPLEMENT)
UINT stack_low, stack_high, stack_length;
/* Follow the chain of saved frame pointers starting from the current frame
 * and store the last non-zero one, rounded up to a page boundary, in the
 * global stack_high. */
void find_stack_high(void) {
UINT bp = 0;
UINT raw_stack_high = 0;
/* Set the global stack high to the best
* approximation.
*/
/* Copy the frame-pointer register named by BP into the local bp. */
asm volatile ("mov %%"BP", %0" : "=m"(bp));
/* The word stored at address bp is taken as the next bp; stop when it is 0.
 * NOTE(review): assumes the build keeps frame pointers so that this chain is
 * valid -- confirm compiler flags. */
while (bp) {
raw_stack_high = bp;
bp = *(UINT *)bp;
}
stack_high = PAGE_ROUND_UP(raw_stack_high);
}
/* Create a temporary file from the template "tmp.XXXXXX", unlink its name
 * right away and return the open descriptor.  Exits the process if
 * mkstemp() fails. */
int file_create(void) {
    char template_name[FILENAME_LENGTH] = "tmp.XXXXXX";
    const int fd = mkstemp(template_name);

    if (fd == -1) {
        perror("file_create:mkstemp");
        exit(EXIT_FAILURE);
    }
    unlink(template_name);
    return fd;
}
/* Write the stack page that contains the current frame pointer to a
 * temporary file, then map that file over the same page with MAP_FIXED.
 * Exits with EXIT_FAILURE if write() or mmap() fails. */
int main(void) {
int fd, bytes_written;
UINT bp;
off_t offset;
printf("In main\n");
fd = file_create();
printf("fd %d\n", fd);
find_stack_high();
// Get the current frame pointer.
asm volatile ("mov %%"BP", %0" : "=m" (bp));
// Store page boundary below
// frame pointer as end of potentially shared stack.
stack_low = PAGE_ROUND_DOWN(bp);
stack_length = stack_high - stack_low;
printf("start "X_FORMAT" end "X_FORMAT" length "X_FORMAT"\n",
stack_low, stack_high, stack_length);
// Only one page (PAGE_SIZE bytes) starting at stack_low is copied, not the
// whole stack_length computed above.
bytes_written =
write(fd, (const void *)stack_low, PAGE_SIZE);
if (bytes_written != PAGE_SIZE) {
perror("main: write");
fprintf(stderr, "Num bytes: %x\n", bytes_written);
exit(EXIT_FAILURE);
}
offset = 0;
// MAP_FIXED replaces the existing mapping at stack_low, which is the page
// holding this function's own frame pointer.
if (mmap((void *)stack_low, PAGE_SIZE, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_FIXED | MAP_GROWSDOWN, fd, offset) ==
MAP_FAILED) {
perror("file_copy: mmap");
exit(EXIT_FAILURE);
}
close(fd);
return EXIT_SUCCESS;
}
Thanks!
The stack changes (e.g. the return address for the mmap call) after you copied it. I can think of 2 possible ways around this:
Write asm that doesn't need the stack to perform the new mapping.
Call into a function with some huge local data so that the working stack is on a different page from the pages you're mapping over. Then, you could map over the lower addresses with a second call to mmap once this function returns.
Whatever you do, this is a horrible hack and probably a bad idea..
Tried turning on execute permission? In any case, the symptom suggests that you've managed to map in over the top of the stack, destroying the return pointer.

Linux Kernel: System call hooking example

I'm trying to write some simple test code as a demonstration of hooking the system call table.
"sys_call_table" is no longer exported in 2.6, so I'm just grabbing the address from the System.map file, and I can see it is correct (Looking through the memory at the address I found, I can see the pointers to the system calls).
However, when I try to modify this table, the kernel gives an "Oops" with "unable to handle kernel paging request at virtual address c061e4f4" and the machine reboots.
This is CentOS 5.4 running 2.6.18-164.10.1.el5. Is there some sort of protection or do I just have a bug? I know it comes with SELinux, and I've tried putting it in to permissive mode, but it doesn't make a difference
Here's my code:
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
/* Replacement open handler: log the call, then hand the same arguments to
 * the saved original handler and return its result. */
asmlinkage int our_sys_open(const char *file, int flags, int mode)
{
    int result;

    printk("A file was opened\n");
    result = original_call(file, flags, mode);
    return result;
}
/*
 * Module entry point: save the current __NR_open entry of the system call
 * table and store our_sys_open in its place.
 *
 * Returns 0 to report a successful load.
 */
int init_module()
{
    // sys_call_table address in System.map
    sys_call_table = (void*)0xc061e4e0;
    original_call = sys_call_table[__NR_open];
    // Hook: Crashes here
    sys_call_table[__NR_open] = our_sys_open;
    // A non-void init_module() must return a status to the module loader.
    return 0;
}
/* Module exit point: put the saved handler back into the __NR_open slot. */
void cleanup_module()
{
    void **open_slot = &sys_call_table[__NR_open];

    *open_slot = original_call;
}
I finally found the answer myself.
http://www.linuxforums.org/forum/linux-kernel/133982-cannot-modify-sys_call_table.html
The kernel was changed at some point so that the system call table is read only.
cypherpunk:
Even if it is late but the Solution
may interest others too: In the
entry.S file you will find: Code:
.section .rodata,"a"
#include "syscall_table_32.S"
sys_call_table -> ReadOnly You have to
compile the Kernel new if you want to
"hack" around with sys_call_table...
The link also has an example of changing the memory to be writable.
nasekomoe:
Hi everybody. Thanks for replies. I
solved the problem long ago by
modifying access to memory pages. I
have implemented two functions that do
it for my upper level code:
#include <asm/cacheflush.h>
/* Two implementations of the same pair of helpers, selected at build time.
 * NOTE(review): KERN_2_6_24 is not defined in the visible code; presumably
 * the build defines it for the older kernels -- confirm. */
#ifdef KERN_2_6_24
#include <asm/semaphore.h>
/* Request VM_READ | VM_WRITE protection on the page that holds _addr.
 * Returns the result of change_page_attr(). */
int set_page_rw(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ | VM_WRITE;
return change_page_attr(pg, 1, prot);
}
/* Request VM_READ-only protection on the page that holds _addr.
 * Returns the result of change_page_attr(). */
int set_page_ro(long unsigned int _addr)
{
struct page *pg;
pgprot_t prot;
pg = virt_to_page(_addr);
prot.pgprot = VM_READ;
return change_page_attr(pg, 1, prot);
}
#else
#include <linux/semaphore.h>
/* Mark one page starting at _addr read/write; returns set_memory_rw()'s result. */
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(_addr, 1);
}
/* Mark one page starting at _addr read-only; returns set_memory_ro()'s result. */
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(_addr, 1);
}
#endif // KERN_2_6_24
Here's a modified version of the original code that works for me.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
/* Hooked open handler: emit a log line, then forward the unchanged
 * arguments to original_call and pass its return value back. */
asmlinkage int our_sys_open(const char *file, int flags, int mode)
{
    int rc;

    printk("A file was opened\n");
    rc = original_call(file, flags, mode);
    return rc;
}
/* Request VM_READ | VM_WRITE protection on the single page that holds
 * _addr.  Returns whatever change_page_attr() returns. */
int set_page_rw(long unsigned int _addr)
{
    pgprot_t prot;

    prot.pgprot = VM_READ | VM_WRITE;
    return change_page_attr(virt_to_page(_addr), 1, prot);
}
/*
 * Module entry point: make the system call table page writable, save the
 * current __NR_open entry and store our_sys_open in its place.
 *
 * Returns 0 on success, or the error from set_page_rw(), in which case the
 * table is left untouched.
 */
int init_module()
{
    int rc;

    // sys_call_table address in System.map
    sys_call_table = (void*)0xc061e4e0;
    original_call = sys_call_table[__NR_open];

    // set_page_rw() takes an integer address, so convert explicitly.
    rc = set_page_rw((long unsigned int)sys_call_table);
    if (rc)
        return rc;

    sys_call_table[__NR_open] = our_sys_open;
    return 0;
}
/* Module exit point: write original_call back into the __NR_open entry. */
void cleanup_module()
{
    void **slot = sys_call_table + __NR_open;

    *slot = original_call;
}
Thanks Stephen, your research here was helpful to me. I had a few problems, though, as I was trying this on a 2.6.32 kernel, and getting WARNING: at arch/x86/mm/pageattr.c:877 change_page_attr_set_clr+0x343/0x530() (Not tainted) followed by a kernel OOPS about not being able to write to the memory address.
The comment above the mentioned line states:
// People should not be passing in unaligned addresses
The following modified code works:
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
Note that this still doesn't actually set the page as read/write in some situations. The static_protections() function, which is called inside of set_memory_rw(), removes the _PAGE_RW flag if:
It's in the BIOS area
The address is inside .rodata
CONFIG_DEBUG_RODATA is set and the kernel is set to read-only
I found this out after debugging why I still got "unable to handle kernel paging request" when trying to modify the address of kernel functions. I was eventually able to solve that problem by finding the page table entry for the address myself and manually setting it to writable. Thankfully, the lookup_address() function is exported in version 2.6.26+. Here is the code I wrote to do that:
/* Set the _PAGE_RW bit in the page table entry that maps addr.
 * Does nothing when no entry is found for addr. */
void set_addr_rw(unsigned long addr) {
    unsigned int level;
    pte_t *pte = lookup_address(addr, &level);

    /* lookup_address() yields NULL when addr is not mapped. */
    if (!pte)
        return;
    /* Only touch the entry when the write bit is actually clear. */
    if (!(pte->pte & _PAGE_RW))
        pte->pte |= _PAGE_RW;
}
/* Clear the _PAGE_RW bit in the page table entry that maps addr.
 * Does nothing when no entry is found for addr. */
void set_addr_ro(unsigned long addr) {
    unsigned int level;
    pte_t *pte = lookup_address(addr, &level);

    /* lookup_address() yields NULL when addr is not mapped. */
    if (!pte)
        return;
    pte->pte = pte->pte &~_PAGE_RW;
}
Finally, while Mark's answer is technically correct, it'll cause problems when run inside Xen. If you want to disable write-protect, use the read/write cr0 functions. I define macros for them like this:
/* Rewrite CR0 with bit 16 (0x10000) cleared, using read_cr0()/write_cr0(). */
#define GPF_DISABLE write_cr0(read_cr0() & (~ 0x10000))
/* Rewrite CR0 with bit 16 (0x10000) set again. */
#define GPF_ENABLE write_cr0(read_cr0() | 0x10000)
Hope this helps anyone else who stumbles upon this question.
Note that the following will also work instead of using change_page_attr, and it cannot be deprecated:
/* Read CR0 and, if bit 16 (0x00010000) is set, write it back with that bit
 * cleared.  Leaves CR0 untouched when the bit is already clear. */
static void disable_page_protection(void) {
    unsigned long cr0;

    asm volatile("mov %%cr0,%0" : "=r" (cr0));
    if (!(cr0 & 0x00010000))
        return;

    cr0 &= ~0x00010000;
    asm volatile("mov %0,%%cr0": : "r" (cr0));
}
/* Read CR0 and, if bit 16 (0x00010000) is clear, write it back with that
 * bit set.  Leaves CR0 untouched when the bit is already set. */
static void enable_page_protection(void) {
    unsigned long cr0;

    asm volatile("mov %%cr0,%0" : "=r" (cr0));
    if (cr0 & 0x00010000)
        return;

    cr0 |= 0x00010000;
    asm volatile("mov %0,%%cr0": : "r" (cr0));
}
If you are dealing with kernel 3.4 or later (it may also work with earlier kernels; I didn't test it), I would recommend a smarter way to acquire the system call table location.
For example
#include <linux/module.h>
#include <linux/kallsyms.h>
static unsigned long **p_sys_call_table;
/* Aquire system calls table address */
p_sys_call_table = (void *) kallsyms_lookup_name("sys_call_table");
That's it. No addresses, it works fine with every kernel I've tested.
The same way you can use a not exported Kernel function from your module:
static int (*ref_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
ref_access_remote_vm = (void *)kallsyms_lookup_name("access_remote_vm");
Enjoy!
As others have hinted, the whole story is a bit different now on modern kernels. I'll be covering x86-64 here, for syscall hijacking on modern arm64 refer to this other answer of mine. Also NOTE: this is plain and simple syscall hijacking. Non-invasive hooking can be done in a much nicer way using kprobes.
Since Linux v4.17, x86 (both 64 and 32 bit) now uses syscall wrappers that take a struct pt_regs * as the only argument (see commit 1, commit 2). You can see arch/x86/include/asm/syscall.h for the definitions.
Additionally, as others have described already in different answers, the simplest way to modify sys_call_table is to temporarily disable CR0 WP (Write-Protect) bit, which could be done using read_cr0() and write_cr0(). However, since Linux v5.3, [native_]write_cr0 will check sensitive bits that should never change (like WP) and refuse to change them (commit). In order to work around this, we need to write CR0 manually using inline assembly.
Here is a working kernel module (tested on Linux 5.10 and 5.18) that does syscall hijacking on modern Linux x86-64 considering the above caveats and assuming that you already know the address of sys_call_table (if you also want to find that in the module, see Proper way of getting the address of non-exported kernel symbols in a Linux kernel module):
// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/**
* Test syscall table hijacking on x86-64. This module will replace the `read`
* syscall with a simple wrapper which logs every invocation of `read` using
* printk().
*
* Tested on Linux x86-64 v5.10, v5.18.
*
* Usage:
*
* sudo cat /proc/kallsyms | grep sys_call_table # grab address
* sudo insmod syscall_hijack.ko sys_call_table_addr=0x<address_here>
*/
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <linux/errno.h> // EINVAL
#include <asm/special_insns.h> // {read,write}_cr0()
#include <asm/processor-flags.h> // X86_CR0_WP
#include <asm/unistd.h> // __NR_*
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
static sys_call_ptr_t *real_sys_call_table;
static sys_call_ptr_t original_read;
static unsigned long sys_call_table_addr;
module_param(sys_call_table_addr, ulong, 0);
MODULE_PARM_DESC(sys_call_table_addr, "Address of sys_call_table");
// Since Linux v5.3 [native_]write_cr0 won't change "sensitive" CR0 bits, need
// to re-implement this ourselves.
/* Load val directly into CR0 with a single mov instruction.  The asm
 * statement lists val as a read-write register operand and declares a
 * "memory" clobber. */
static void write_cr0_unsafe(unsigned long val)
{
asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
}
/* Replacement read handler: log the di, si and dx register values from
 * regs, then call the saved original handler and return its result. */
static long myread(const struct pt_regs *regs)
{
	long ret;

	pr_info("read(%ld, 0x%lx, %lx)\n", regs->di, regs->si, regs->dx);
	ret = original_read(regs);
	return ret;
}
/*
 * Module init: store myread in the __NR_read slot of the system call table
 * whose address was passed in the sys_call_table_addr parameter, keeping the
 * previous entry in original_read.
 *
 * Returns 0 on success, or -EINVAL when no table address was supplied.
 */
static int __init modinit(void)
{
	unsigned long old_cr0;

	// The parameter is 0 unless given on the insmod command line; using it
	// as a table pointer would dereference a NULL-based address.
	if (!sys_call_table_addr) {
		pr_err("sys_call_table_addr parameter is required\n");
		return -EINVAL;
	}

	real_sys_call_table = (typeof(real_sys_call_table))sys_call_table_addr;
	pr_info("init\n");

	// Temporarily disable CR0 WP to be able to write to read-only pages
	old_cr0 = read_cr0();
	write_cr0_unsafe(old_cr0 & ~(X86_CR0_WP));

	// Overwrite syscall and save original to be restored later
	original_read = real_sys_call_table[__NR_read];
	real_sys_call_table[__NR_read] = myread;

	// Restore CR0 WP
	write_cr0_unsafe(old_cr0);
	pr_info("init done\n");
	return 0;
}
/* Module exit: with CR0 WP cleared for the duration of the write, put
 * original_read back into the __NR_read slot, then restore CR0. */
static void __exit modexit(void)
{
	unsigned long saved_cr0;
	unsigned long cr0_without_wp;

	pr_info("exit\n");

	saved_cr0 = read_cr0();
	cr0_without_wp = saved_cr0 & ~(X86_CR0_WP);
	write_cr0_unsafe(cr0_without_wp);

	// Restore original syscall
	real_sys_call_table[__NR_read] = original_read;

	write_cr0_unsafe(saved_cr0);
	pr_info("goodbye\n");
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_DESCRIPTION("Test syscall table hijacking on x86-64.");
MODULE_AUTHOR("Marco Bonelli");
MODULE_LICENSE("Dual MIT/GPL");

Resources