How to use timers in Linux kernel device drivers? - c

I want to implement a counter in Linux device drivers which increments after every fixed interval of time. I want to do this with the help of timers. A sample code snippet would be very useful.

Have a look at following article IBM Developerworks: Timers and Lists
There is a small example of how to use Linux kernel timers (included it here for convenience, comments are from myself, removed printk messages)
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>
MODULE_LICENSE("GPL");
static struct timer_list my_timer;
/*
 * Expiry handler for my_timer (old-style callback signature).
 *
 * data: the value given as the third argument of setup_timer().
 */
void my_timer_callback(unsigned long data)
{
	/* the periodic work goes here */
}
/*
 * Module entry point: attach my_timer_callback to my_timer and arm the
 * timer so that it expires 200 ms after load.
 *
 * Always returns 0.
 */
int init_module(void)
{
	unsigned long expiry;

	/* bind the handler; 0 is the data value handed to the callback */
	setup_timer(&my_timer, my_timer_callback, 0);

	/* first (and only) expiry: 200 msecs from now, expressed in jiffies */
	expiry = jiffies + msecs_to_jiffies(200);
	mod_timer(&my_timer, expiry);

	return 0;
}
/*
 * Module exit point: make sure my_timer can no longer fire before the
 * module's code is unloaded.
 *
 * del_timer_sync() is used rather than del_timer(): besides deactivating a
 * pending timer it also waits for a handler that is currently executing on
 * another CPU, so my_timer_callback() cannot still be running when the
 * module text is freed.
 */
void cleanup_module(void)
{
	del_timer_sync(&my_timer);
}

Around Linux kernel 4.15 release, void setup_timer(timer, function, data); became obsolete with an intent to remove it completely.
Instead, now we have to use
void timer_setup(
struct timer_list *timer,
void (*callback)(struct timer_list *),
unsigned int flags
);
This can be found in linux/timer.h file.
Here's a full example of module_with_timer.c
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
MODULE_LICENSE("GPL");
static struct timer_list my_timer;
/*
 * Expiry handler for my_timer (timer_setup()-style signature).
 *
 * timer: the timer_list that expired; not used here.
 */
void my_timer_callback(struct timer_list *timer)
{
	printk(KERN_ALERT
	       "This line is printed after 5 seconds.\n");
}
/*
 * Module entry point: announce the load, bind my_timer_callback to my_timer
 * and arm the timer to expire 5000 ms from now.
 *
 * Always returns 0.
 */
static int init_module_with_timer(void)
{
	unsigned long expiry;

	printk(KERN_ALERT "Initializing a module with timer.\n");

	/* timer_setup() is declared in linux/timer.h; flags = 0 */
	timer_setup(&my_timer, my_timer_callback, 0);

	expiry = jiffies + msecs_to_jiffies(5000);
	mod_timer(&my_timer, expiry);

	return 0;
}
/*
 * Module exit point: log a farewell and stop my_timer.
 *
 * del_timer_sync() replaces del_timer() so that, on SMP systems, a callback
 * that is already running on another CPU has finished before the module
 * (and with it my_timer_callback) is removed from memory.
 */
static void exit_module_with_timer(void)
{
	printk(KERN_ALERT "Goodbye, cruel world!\n");
	del_timer_sync(&my_timer);
}
module_init(init_module_with_timer);
module_exit(exit_module_with_timer);
And the Makefile is
# Kbuild object list: build module_with_timer.c as a loadable module
obj-m := module_with_timer.o

# Get the current kernel version number (can be overridden on the command line)
KVERSION ?= $(shell uname -r)

.PHONY: all clean

# Use $(MAKE) so that flags such as -j and -n propagate to the kernel build.
all:
	$(MAKE) -C /lib/modules/$(KVERSION)/build M=$(PWD) modules

clean:
	$(MAKE) -C /lib/modules/$(KVERSION)/build M=$(PWD) clean
Note: In real-life programming, it is better to check the version of the kernel we are compiling for and then use the appropriate function (setup_timer or timer_setup) to initialize and start the timer.
References:
https://lwn.net/Articles/735887/

Depending on what you exactly want to do, you can directly use jiffies to measure time, as it has been suggested in the comments. You can also use kernel timers, and given the information in your question, they seem to be a better fit.
The kernel timers API is quite intuitive:
#include <linux/timer.h>
/* Abridged view of the timer descriptor, showing the old (unsigned long) callback style. */
struct timer_list {
/* ... */
unsigned long expires; /* presumably the expiry time in jiffies — confirm against linux/timer.h */
void (*function)(unsigned long); /* handler to call; takes a single unsigned long argument */
unsigned long data; /* presumably the argument handed to function — confirm against linux/timer.h */
};
void init_timer(struct timer_list *timer);
struct timer_list TIMER_INITIALIZER(_function, _expires, _data);
void add_timer(struct timer_list * timer);
int del_timer(struct timer_list * timer);
So you would just need to define a timer function and then initialize and start the timer.
You have several sources to further learn about this topic:
Understanding the Linux Kernel. This book is a sort of bible for the kernel. It is somewhat outdated in some areas, but still a really good source of information.
Linux Device Drivers. This is a very useful book when developing device drivers. There is an online version too here. The chapter dealing with time, timers, etc. is chapter 7. This book may be also a bit outdated since it is from 2005 too.
Linux Kernel Development. I have not checked this book, but the good point is that it is much newer (from 2010), so you may find some updated information compared to the previous two books.

Related

Get NFS client IP address in a kernel module

I'm working on a kernel module to track the operations performed by NFS clients on my server.
I can intercept the file operations using a hacky way (hijacking the vfs layer) but I can't retrieve the IP address of the client.
Is there any information that might be stored in the current task that I can use to obtain the IP address of the NFS client performing an operation?
I know from digging into the source code that nfsd stores a struct nfsd_net in the struct super_block's s_fs_info field, but I can only retrieve it as a struct net pointer. And in nfsd's implementation net_generic method is being used to get the struct nfsd_net pointer (using nfsd_net_id which is the pernet_operations's id).
Can I obtain this id somehow? and if yes, can I use the struct nfsd_net in my kernel module? Is it defined somewhere other than the fs/nfsd/netns.h?
Edit
I'm using this approach to hijack the open function. I'm writing this for kernel version 4.15.0. Here's the code of the kernel module:
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/version.h>
#include <linux/proc_fs.h>
#include <linux/cred.h>
#include <linux/sched.h>
#include <linux/preempt.h>
#include <linux/uaccess.h>
#include <linux/xattr.h>
MODULE_LICENSE("GPL");
/*
 * Architecture-specific trampoline template copied over a function's entry
 * point by hook():
 *   POFF  - byte offset of the address placeholder inside jmp_code
 *   CSIZE - number of trampoline bytes copied (and saved/restored)
 *   PSIZE - integer type used to store the destination address
 * NOTE(review): the #else branch is taken for every non-i386 target, but the
 * bytes are x86-64 specific — confirm no other architecture is built.
 */
#if defined(__i386__)
#define POFF 1
#define CSIZE 6
// push address, addr, ret
char *jmp_code="\x68\x00\x00\x00\x00\xc3";
typedef unsigned int PSIZE;
#else
#define POFF 2
#define CSIZE 12
// mov address to register rax, jmp rax. for normal x64 convention
char *jmp_code="\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00\xff\xe0";
typedef unsigned long PSIZE;
#endif
DEFINE_SPINLOCK(root_open_lock);
int (*orig_root_open) (struct inode *, struct file *);
void *orig_root_open_code;
void hook(void *src_func,void *dst_addr){
barrier();
write_cr0(read_cr0() & (~0x10000));
memcpy(src_func,jmp_code,CSIZE);
*(PSIZE *)&(((unsigned char*)src_func)[POFF])=(PSIZE)dst_addr;
write_cr0(read_cr0() | 0x10000);
barrier();
}
void save_and_hook(void **p_reserve,void *src_func,void *dst_addr){
barrier();
write_cr0(read_cr0() & (~0x10000));
*p_reserve=kmalloc(CSIZE,GFP_KERNEL);
// save origin code
memcpy(*p_reserve,src_func,CSIZE);
hook(src_func,dst_addr);
write_cr0(read_cr0() | 0x10000);
barrier();
}
/*
 * Undo a hook by copying the CSIZE saved bytes in *p_reserve back over the
 * start of src_func.
 *
 * p_reserve: address of the backup buffer pointer filled by save_and_hook().
 * src_func:  function whose entry bytes are restored.
 *
 * Does nothing when there is no backup (NULL pointer), instead of copying
 * from a NULL buffer. CR0 bit 0x10000 (write-protect) is cleared only for
 * the duration of the copy.
 */
void fix(void **p_reserve, void *src_func)
{
	if (!p_reserve || !*p_reserve || !src_func)
		return;

	barrier();
	write_cr0(read_cr0() & (~0x10000));
	memcpy(src_func, *p_reserve, CSIZE);
	write_cr0(read_cr0() | 0x10000);
	barrier();
}
/*
 * Stand-in that runs in place of the hijacked open routine.
 *
 * Logs the call, then, while holding root_open_lock, restores the saved
 * original bytes, calls the original routine and re-installs the trampoline.
 *
 * x, fp: passed through unchanged to orig_root_open.
 * Returns whatever orig_root_open returned.
 *
 * NOTE(review): orig_root_open is called with a spinlock held; if it can
 * sleep this is unsafe — confirm.
 * NOTE(review): other CPUs entering the patched function while its bytes are
 * being rewritten are not synchronised by this lock — confirm.
 */
int fake_root_open(struct inode *x, struct file *fp)
{
int ret;
printk("vfshijack: hijacked open\n"); // I need to find the client ip here.
barrier();
spin_lock(&root_open_lock);
fix(&orig_root_open_code, orig_root_open); /* put the original entry bytes back */
ret = orig_root_open(x, fp);
hook(orig_root_open, fake_root_open); /* re-arm the redirect for the next call */
spin_unlock(&root_open_lock);
barrier();
return ret;
}
/*
 * Module entry point: open "/" to discover the open routine of its
 * file_operations and, if there is one, redirect it to fake_root_open().
 *
 * Returns 0 on success, or the negative errno reported by filp_open() when
 * "/" cannot be opened (previously a bare -1, which the module loader
 * reports as a misleading "Operation not permitted").
 */
int vfs_init(void)
{
	struct file *fp = filp_open("/", O_DIRECTORY|O_RDONLY, 0);

	if (IS_ERR(fp))
		return PTR_ERR(fp);

	orig_root_open = fp->f_op->open;
	if (orig_root_open)
		save_and_hook(&orig_root_open_code, orig_root_open, fake_root_open);

	/* the file was only needed to reach f_op */
	filp_close(fp, NULL);

	printk("vfshijack: vfshijack loaded\n");
	return 0;
}
void vfs_exit(void)
{
if(orig_root_open)
{
fix(&orig_root_open_code, orig_root_open);
}
printk("vfshijack: vfshijack unloaded\n");
}
module_init(vfs_init);
module_exit(vfs_exit);
You can try to get needed information from linux kernel tracing tools without hooking kernel binary with some custom assembly. There are perf, ftrace, trace-cmd for most kernel versions, and stap and lttng for more custom versions. Some documentation to start: https://www.kernel.org/doc/html/v4.18/trace/index.html "Linux Tracing Technologies"
There are several tracepoints defined in nfsd:
# modprobe nfsd
# modprobe nfs
# perf list tracepoint|grep nfs
# find /sys/kernel/debug/tracing/events -type d|grep nfsd
# trace-cmd list -e nfsd:read_start -F
nfsd/read_start and nfsd/write_start tracepoints are good places to start. Both should have access to request structure rqstp with address rq_addr and fh pointer (but some eBPF or stap scripting may be useful)
https://elixir.bootlin.com/linux/v4.15/source/fs/nfsd/vfs.c#L1020
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,...)
trace_read_start(rqstp, fhp, offset, vlen);
trace_read_opened(rqstp, fhp, offset, vlen);
trace_read_io_done(rqstp, fhp, offset, vlen);
trace_read_done(rqstp, fhp, offset, vlen);
I have no complete example of trace-cmd or stap usage for nfs daemon tracing.
Systemtap (stap) has some examples of nfsd statistics:
https://github.com/jav/systemtap/blob/master/testsuite/systemtap.examples/index.txt
# stap nfsd_unlink.stp -c "sleep 0.2"
The nfsdtop.stp script gathers and displays NFS lookups
https://github.com/larytet/SystemTap/blob/master/testsuite/systemtap.examples/network/nfsdtop.stp

Instrument linux kernel local interrupt handling on x86-64

I would like to write some code (for example as a small kernel module) to instrument local interrupts on Linux running on the x86-64 architecture, i.e. I would like to write some kind of handler that is called by the kernel every time a local interrupt is triggered by the APIC.
The handler would then check whether a certain process is currently running and inspect said process' memory.
I realize that what I am trying to do may not be good engineering practice, but my aim is to create a hacky one-off solution for exploration/research purposes.
In the ideal case, there would be some kind of function similar to request_irq [1] (which as far as I can tell is used for handling interrupts from devices like keyboards, network cards, ...) allowing me to tell the kernel to run my code every time a local timer interrupt occurs.
Does anybody have any pointers on how to accomplish this?
Does the kernel provide an API for registering a handler for local interrupts?
If not, I could directly modify the kernel's source code for handling interrupts. Where in the kernel would I find this code?
Edit: Here is what my research so far has found.
TLDR:request_irq is not the way to hook local timer interrupts.
According to Understanding the Linux Kernel, Table 4.2 [2], the interrupt vector 0xef is allocated to Local APIC timer interrupts. The kernel source confirms this [3].
Since request_irq takes an interrupt vector as its first argument, let's try registering a handler for this vector inside a kernel module:
#include <linux/module.h>
#include <linux/kernel.h> // included for KERN_INFO
#include <linux/init.h> // included for __init and __exit macros
#include <linux/interrupt.h> // included for IRQF_ and request_irq
#include <linux/irqreturn.h> // included for IRQ_NONE
#include <asm/irq_vectors.h> // included for LOCAL_TIMER_VECTOR
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Instrument local timer interrupts");
static int DEVICE_COOKIE = 1;
static int count = 0;
/*
 * Interrupt handler passed to request_irq(): counts the invocation and
 * reports IRQ_NONE.
 *
 * irq: interrupt number (unused).
 * dev: cookie given to request_irq() (unused).
 */
static irqreturn_t handler(int irq, void *dev)
{
	count += 1;

	return IRQ_NONE;
}
/*
 * Module entry point: try to register handler() for LOCAL_TIMER_VECTOR via
 * request_irq() and log the outcome.
 *
 * Returns 0 on success; on failure the error code from request_irq() is
 * propagated unchanged (previously a bare -1, which hides the real cause
 * from the module loader).
 */
static int __init test_init(void)
{
	int status;

	printk(KERN_INFO "Running request_irq\n");

	status = request_irq(LOCAL_TIMER_VECTOR,
			     &handler,
			     IRQF_TIMER,
			     "foobar",
			     &DEVICE_COOKIE);
	if (status != 0) {
		printk(KERN_INFO "Failed to install handler. error code: %d\n",
		       status);
		return status;
	}

	printk(KERN_INFO "Successfully installed handler\n");
	return 0;
}
/*
 * Module exit point: release the handler registered with DEVICE_COOKIE and
 * report how many interrupts were counted.
 */
static void __exit test_cleanup(void)
{
	/* same (vector, cookie) pair that was handed to request_irq() */
	free_irq(LOCAL_TIMER_VECTOR, &DEVICE_COOKIE);

	printk(KERN_INFO "Goodbye kernel. I saw %d interrupts.\n",
	       count);
}
module_init(test_init);
module_exit(test_cleanup);
When we try to insert the module with insmod, request_irq returns -EINVAL:
[74890.287173] Running request_irq
[74890.287174] Failed to install handler. error code: -22
So where does the -EINVAL come from?
Reading through the kernel source, we find that request_irq is just a wrapper around request_threaded_irq [4]. request_threaded_irq calls irq_to_desc and returns -EINVAL if the call fails. We can easily check whether this is the case with another small kernel module:
#include <linux/kernel.h> // included for KERN_INFO
#include <linux/init.h> // included for __init and __exit macros
#include <linux/interrupt.h> // included for IRQF_ and request_irq
#include <linux/irqreturn.h> // included for IRQ_NONE
#include <asm/irq_vectors.h> // included for LOCAL_TIMER_VECTOR
#include <linux/irqnr.h> // included for irq_to-desc
#include <linux/irqdesc.h> // included for irq_desc
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Call irq_to_desc(LOCAL_TIMER_VECTOR)");
/*
 * Module entry point: check whether irq_to_desc() yields a descriptor for
 * LOCAL_TIMER_VECTOR.
 *
 * Returns 0 when a descriptor exists. When it does not, a message is logged
 * (now newline-terminated so it is not merged with the next log record) and
 * -EINVAL is returned, the same code request_threaded_irq() reports for a
 * missing descriptor, instead of a bare -1.
 */
static int __init test_init(void)
{
	struct irq_desc *desc = irq_to_desc(LOCAL_TIMER_VECTOR);

	if (!desc) {
		printk(KERN_INFO "irq_to_desc(LOCAL_TIMER_VECTOR) failed\n");
		return -EINVAL;
	}

	return 0;
}
/* Module exit point: test_init() acquires nothing, so there is nothing to undo. */
static void __exit test_cleanup(void)
{
	/* intentionally empty */
}
module_init(test_init);
module_exit(test_cleanup);
And indeed it fails:
[75787.142533] irq_to_desc(LOCAL_TIMER_VECTOR) failed
[1] www.makelinux.net/books/lkd2/ch06lev1sec3
[2] www.safaribooksonline.com/library/view/understanding-the-linux/0596005652/ch04s06.html
[3] lxr.free-electrons.com/source/arch/x86/include/asm/irq_vectors.h?v=4.8#L108
[4] lxr.free-electrons.com/source/kernel/irq/manage.c?v=4.8#L1634

Create dispatch table registering functions across multiple source files in C

How can I implement a dynamic dispatch table in C
It's essentially the same question as the linked issue, so ...
As your Strategy.c obviously already knows about the strategy
instances by name (#include "XYstrategy.h") you could go the whole
mile and use the header files instead of the implementation files to
communicate your strategy to the central dispatcher:
This is contrary to the clear intent in the question. This was an example of how he could do it statically, but wanted to have modules dynamically register themselves at compile time.
Let me try providing an example I'm struggling with for my own purposes...
I have a micro-controller which I want to use to read a variety of sensors that report temperature and/or humidity. I have a central core program which takes care of formatting the returned data and submitting it to a web server where it is recorded in an RRD.
Rather than build a large monolithic program which contains all the different functions for each sensor type, I want to be able to build a specific subset into the software loaded onto the micro-controller which corresponds to the sensors installed on that particular controller.
To do this I would like to be able to write a generic driver for each sensor that has three functions:
bool _sensor_startup();
bool _read_sensor(float *temp, float *humidity, uint8_t max_count, uint8_t *count);
bool _sensor_shutdown();
The sensor_startup function will take care of powering up the sensors, making sure that they are properly configured and in a state of readiness for read_sensor to be called. If this process fails for any reason, it returns false, otherwise, it returns true.
The read_sensor function will cause up to max_count sensors to be read with their results stored in the arrays pointed to by temp and humidity, respectively. The number of sensors read will be stored in count.
The sensor_shutdown function will do any housekeeping necessary to return the sensors and supporting electronics into their lowest power consumption configuration.
Each of these is contained in a separate .c file which may have a corresponding .h file to define relevant constants, call relevant libraries, etc.
I'd like to have a master Sensor.h file which is included by the .c or .h files and which defines:
typedef struct { startup_func, read_func, shutdown_func } sensor_driver_entry;
extern sensor_driver_entry sensor_table[];
Then I'd like each Driver file to be able to use a macro (or a function) to register the type-specific functions in the next open slot in sensor_table at compile time.
I'd like sensor table to be declared in the global namespace of Sensor.c as:
sensor_driver_entry sensor_table[MAX_SENSOR_TYPES];
(MAX_SENSOR_TYPES would be defined in Sensor.h reflecting the maximum possible number of drivers that could be selected).
Is this even possible? If so, can someone provide a syntactic example? In this specific case, I'm coding in the Particle Dev environment for a Particle Photon, but I'd like it if I could make the code also portable to the Arduino IDE to use it with ESP8266 boards as well.
One possibility is to make use of constructors. Below is a simple example with two drivers registering their functions respectively.
If the application is compiled with both drivers (gcc main.c driver1.c driver2.c) the output shows both driver functions registered:
driver1_init
driver2_init
driver1_func
driver2_func
If only the first driver is compiled in (gcc main.c driver1.c) the output shows only that driver's function registered:
driver1_init
driver1_func
driver.h
#ifndef DRIVER_H
#define DRIVER_H

/* Signature of a function a driver registers in the dispatch table. */
typedef void (*driver_func_t)(void);

/* One slot of the dispatch table. */
typedef struct { driver_func_t func; } driver_entry_t;

/* Capacity of driver_table. */
#define MAX_TYPES 10

/* Dispatch table, defined in main.c and filled by the drivers' constructors. */
extern driver_entry_t driver_table[MAX_TYPES];

/* Number of slots of driver_table currently in use. */
extern unsigned int num_driver_entries;

#endif /* DRIVER_H */
main.c
#include <stdio.h>
#include "driver.h"
driver_entry_t driver_table[MAX_TYPES];
unsigned int num_driver_entries;
int main (void)
{
unsigned int ix;
for (ix = 0; ix < num_driver_entries; ix++) {
driver_table[ix].func();
}
return 0;
}
driver1.c
#include <stdio.h>
#include "driver.h"
/* Driver 1's registered entry point: prints its own name followed by a newline. */
void driver1_func (void)
{
    puts(__func__);
}
void driver1_init (void) __attribute__ ((constructor));
void driver1_init (void)
{
printf("%s\n", __FUNCTION__);
driver_table[num_driver_entries++].func = driver1_func;
}
driver2.c
#include <stdio.h>
#include "driver.h"
/* Driver 2's registered entry point: prints its own name followed by a newline. */
void driver2_func (void)
{
    puts(__func__);
}
void driver2_init (void) __attribute__ ((constructor));
void driver2_init (void)
{
printf("%s\n", __FUNCTION__);
driver_table[num_driver_entries++].func = driver2_func;
}

Timer interrupt in C

I have to write a C program that uses the timer interrupt. The program will run on DOSBox.
The requirements are simple:
The program calls a function (written by me) on each timer interrupt. The interrupt handler is to be installed with the setvect and getvect functions.
For example, it would be very nice to have C code that prints "Hello world\n" to the screen once a second without any sleep or delay function. I mean that the code that prints "hello world" must be in a function, and the program must call this function on each interrupt.
It is very hard to find an example of this case. Can you point me to an example? Thanks for any help.
#include<signal.h>
#include<sys/time.h>
#include <unistd.h>
#include <assert.h>
#include <stdio.h>
#define PERIOD 999999
static sigset_t block ;
void timer_handler ();
static void init(void) __attribute__((constructor));

/*
 * Constructor (runs before main): installs timer_handler for SIGVTALRM and
 * starts a repeating ITIMER_VIRTUAL timer with a period of PERIOD
 * microseconds. Also initialises the file-scope signal set `block` to
 * contain SIGVTALRM.
 *
 * The system calls are made outside assert(): an expression inside assert()
 * is removed entirely when NDEBUG is defined, which would silently skip the
 * handler installation. Their results are still asserted, so a debug build
 * aborts on failure as before.
 */
static void init(void)
{
	struct sigaction act = {0};
	struct itimerval period;
	int rc;

	sigemptyset(&block);
	sigaddset(&block, SIGVTALRM);

	act.sa_handler = timer_handler;
	rc = sigaction(SIGVTALRM, &act, NULL);
	assert(rc == 0);

	/* first expiry and reload value are both PERIOD microseconds */
	period.it_interval.tv_sec = 0;
	period.it_interval.tv_usec = PERIOD;
	period.it_value = period.it_interval;
	rc = setitimer(ITIMER_VIRTUAL, &period, NULL);
	assert(rc == 0);

	(void)rc;	/* silence "unused" warnings when NDEBUG strips the asserts */
}
/*
 * SIGVTALRM handler: writes "Hi\n" to file descriptor 1 using write().
 *
 * sig: signal number (unused).
 */
void timer_handler(int sig)
{
	static const char greeting[] = "Hi\n";

	(void)sig;
	write(STDOUT_FILENO, greeting, sizeof greeting - 1);
}
/*
 * Spin forever; all visible work happens in timer_handler(). The timer is
 * ITIMER_VIRTUAL, so the loop deliberately burns CPU time rather than
 * sleeping.
 */
int main(void)
{
	for (;;)
		;

	return 0;
}
Just an advice.
In general, it is good practice not to call functions like print inside an interrupt handler. In your case, because your code is simple, it is not really a problem, but you can run into priority issues in more complex programs if you spend too much time in interrupt handlers.

Linux Kernel: System call hooking example

I'm trying to write some simple test code as a demonstration of hooking the system call table.
"sys_call_table" is no longer exported in 2.6, so I'm just grabbing the address from the System.map file, and I can see it is correct (Looking through the memory at the address I found, I can see the pointers to the system calls).
However, when I try to modify this table, the kernel gives an "Oops" with "unable to handle kernel paging request at virtual address c061e4f4" and the machine reboots.
This is CentOS 5.4 running 2.6.18-164.10.1.el5. Is there some sort of protection or do I just have a bug? I know it comes with SELinux, and I've tried putting it in to permissive mode, but it doesn't make a difference
Here's my code:
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
/*
 * Replacement for the open system call: logs a message and then forwards
 * the call, with unchanged arguments, to the saved original handler.
 *
 * Returns whatever original_call returns.
 */
asmlinkage int our_sys_open(const char* file, int flags, int mode)
{
	int rc;

	printk("A file was opened\n");

	rc = original_call(file, flags, mode);
	return rc;
}
/*
 * Module entry point: remember the original open handler and replace its
 * slot in the system call table with our_sys_open.
 *
 * Returns 0. The original fell off the end of a non-void function, so the
 * module loader received an indeterminate value.
 *
 * NOTE: the table address is hard-coded from one specific System.map; the
 * store into the table faults when the table lives in read-only memory.
 */
int init_module(void)
{
	// sys_call_table address in System.map
	sys_call_table = (void*)0xc061e4e0;
	original_call = sys_call_table[__NR_open];

	// Hook: Crashes here
	sys_call_table[__NR_open] = our_sys_open;

	return 0;
}
/* Module exit point: put the saved original handler back into the open slot. */
void cleanup_module(void)
{
	/* undo the replacement made in init_module() */
	sys_call_table[__NR_open] = original_call;
}
I finally found the answer myself.
http://www.linuxforums.org/forum/linux-kernel/133982-cannot-modify-sys_call_table.html
The kernel was changed at some point so that the system call table is read only.
cypherpunk:
Even if it is late but the Solution
may interest others too: In the
entry.S file you will find: Code:
.section .rodata,"a"
#include "syscall_table_32.S"
sys_call_table -> ReadOnly You have to
compile the Kernel new if you want to
"hack" around with sys_call_table...
The link also has an example of changing the memory to be writable.
nasekomoe:
Hi everybody. Thanks for replies. I
solved the problem long ago by
modifying access to memory pages. I
have implemented two functions that do
it for my upper level code:
#include <asm/cacheflush.h>
#ifdef KERN_2_6_24
#include <asm/semaphore.h>
/*
 * Request VM_READ | VM_WRITE protection for the single page containing
 * _addr, via change_page_attr().
 *
 * Returns the result of change_page_attr().
 */
int set_page_rw(long unsigned int _addr)
{
	pgprot_t prot;

	prot.pgprot = VM_READ | VM_WRITE;

	return change_page_attr(virt_to_page(_addr), 1, prot);
}
/*
 * Request VM_READ protection for the single page containing _addr, via
 * change_page_attr().
 *
 * Returns the result of change_page_attr().
 */
int set_page_ro(long unsigned int _addr)
{
	pgprot_t prot;

	prot.pgprot = VM_READ;

	return change_page_attr(virt_to_page(_addr), 1, prot);
}
#else
#include <linux/semaphore.h>
/*
 * Make one page starting at _addr writable via set_memory_rw().
 *
 * Returns the result of set_memory_rw().
 */
int set_page_rw(long unsigned int _addr)
{
	int status = set_memory_rw(_addr, 1);

	return status;
}
/*
 * Make one page starting at _addr read-only via set_memory_ro().
 *
 * Returns the result of set_memory_ro().
 */
int set_page_ro(long unsigned int _addr)
{
	int status = set_memory_ro(_addr, 1);

	return status;
}
#endif // KERN_2_6_24
Here's a modified version of the original code that works for me.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/unistd.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
void **sys_call_table;
asmlinkage int (*original_call) (const char*, int, int);
/*
 * Replacement for the open system call: logs a message and then forwards
 * the call, with unchanged arguments, to the saved original handler.
 *
 * Returns whatever original_call returns.
 */
asmlinkage int our_sys_open(const char* file, int flags, int mode)
{
	int rc;

	printk("A file was opened\n");

	rc = original_call(file, flags, mode);
	return rc;
}
/*
 * Request VM_READ | VM_WRITE protection for the single page containing
 * _addr, via change_page_attr().
 *
 * Returns the result of change_page_attr().
 */
int set_page_rw(long unsigned int _addr)
{
	pgprot_t prot;

	prot.pgprot = VM_READ | VM_WRITE;

	return change_page_attr(virt_to_page(_addr), 1, prot);
}
/*
 * Module entry point: remember the original open handler, make the page
 * holding the system call table writable and install our_sys_open.
 *
 * Returns 0 on success, or the non-zero result of set_page_rw() when the
 * page could not be made writable (in which case the table is left
 * untouched rather than faulting on the store). The original had no return
 * statement at all and passed the table pointer to set_page_rw() without
 * converting it to the unsigned long the function expects.
 */
int init_module(void)
{
	int rc;

	// sys_call_table address in System.map
	sys_call_table = (void*)0xc061e4e0;
	original_call = sys_call_table[__NR_open];

	rc = set_page_rw((unsigned long)sys_call_table);
	if (rc)
		return rc;

	sys_call_table[__NR_open] = our_sys_open;
	return 0;
}
/* Module exit point: put the saved original handler back into the open slot. */
void cleanup_module(void)
{
	/* undo the replacement made in init_module() */
	sys_call_table[__NR_open] = original_call;
}
Thanks Stephen, your research here was helpful to me. I had a few problems, though, as I was trying this on a 2.6.32 kernel, and getting WARNING: at arch/x86/mm/pageattr.c:877 change_page_attr_set_clr+0x343/0x530() (Not tainted) followed by a kernel OOPS about not being able to write to the memory address.
The comment above the mentioned line states:
// People should not be passing in unaligned addresses
The following modified code works:
int set_page_rw(long unsigned int _addr)
{
return set_memory_rw(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
int set_page_ro(long unsigned int _addr)
{
return set_memory_ro(PAGE_ALIGN(_addr) - PAGE_SIZE, 1);
}
Note that this still doesn't actually set the page as read/write in some situations. The static_protections() function, which is called inside of set_memory_rw(), removes the _PAGE_RW flag if:
It's in the BIOS area
The address is inside .rodata
CONFIG_DEBUG_RODATA is set and the kernel is set to read-only
I found this out after debugging why I still got "unable to handle kernel paging request" when trying to modify the address of kernel functions. I was eventually able to solve that problem by finding the page table entry for the address myself and manually setting it to writable. Thankfully, the lookup_address() function is exported in version 2.6.26+. Here is the code I wrote to do that:
/*
 * Set _PAGE_RW in the page table entry that maps addr.
 *
 * Does nothing when lookup_address() finds no entry for addr (it returns
 * NULL in that case, which was previously dereferenced). The entry is only
 * written when the bit is actually clear; the old test, pte & ~_PAGE_RW,
 * checked the other bits of the entry instead of the RW bit.
 */
void set_addr_rw(unsigned long addr)
{
	unsigned int level;
	pte_t *pte = lookup_address(addr, &level);

	if (!pte)
		return;

	if (!(pte->pte & _PAGE_RW))
		pte->pte |= _PAGE_RW;
}
/*
 * Clear _PAGE_RW in the page table entry that maps addr.
 *
 * Does nothing when lookup_address() finds no entry for addr (it returns
 * NULL in that case, which was previously dereferenced).
 */
void set_addr_ro(unsigned long addr)
{
	unsigned int level;
	pte_t *pte = lookup_address(addr, &level);

	if (!pte)
		return;

	pte->pte = pte->pte & ~_PAGE_RW;
}
Finally, while Mark's answer is technically correct, it will cause problems when run inside Xen. If you want to disable write-protect, use the read/write cr0 functions. I wrap them in macros like this:
/*
 * GPF_DISABLE clears bit 0x10000 of CR0 (write-protect off);
 * GPF_ENABLE sets it again (write-protect on).
 * NOTE(review): the expansions are bare expressions, not wrapped in
 * do { } while (0); use each macro only as a complete statement.
 */
#define GPF_DISABLE write_cr0(read_cr0() & (~ 0x10000))
#define GPF_ENABLE write_cr0(read_cr0() | 0x10000)
Hope this helps anyone else who stumbles upon this question.
Note that the following will also work instead of using change_page_attr, and it cannot be deprecated:
/*
 * Clear bit 0x00010000 of CR0 (write-protect) if it is currently set.
 * CR0 is read and written directly with mov instructions; nothing is
 * written when the bit is already clear.
 */
static void disable_page_protection(void)
{
	const unsigned long wp_bit = 0x00010000;
	unsigned long cr0;

	asm volatile("mov %%cr0,%0" : "=r" (cr0));
	if (!(cr0 & wp_bit))
		return;

	cr0 &= ~wp_bit;
	asm volatile("mov %0,%%cr0": : "r" (cr0));
}
/*
 * Set bit 0x00010000 of CR0 (write-protect) if it is currently clear.
 * CR0 is read and written directly with mov instructions; nothing is
 * written when the bit is already set.
 */
static void enable_page_protection(void)
{
	const unsigned long wp_bit = 0x00010000;
	unsigned long cr0;

	asm volatile("mov %%cr0,%0" : "=r" (cr0));
	if (cr0 & wp_bit)
		return;

	cr0 |= wp_bit;
	asm volatile("mov %0,%%cr0": : "r" (cr0));
}
If you are dealing with kernel 3.4 and later (it can also work with earlier kernels, I didn't test it) I would recommend a smarter way to acquire the system call table location.
For example
#include <linux/module.h>
#include <linux/kallsyms.h>
static unsigned long **p_sys_call_table;
/* Acquire the system call table address */
p_sys_call_table = (void *) kallsyms_lookup_name("sys_call_table");
That's it. No addresses, it works fine with every kernel I've tested.
The same way you can use a not exported Kernel function from your module:
static int (*ref_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write);
ref_access_remote_vm = (void *)kallsyms_lookup_name("access_remote_vm");
Enjoy!
As others have hinted, the whole story is a bit different now on modern kernels. I'll be covering x86-64 here, for syscall hijacking on modern arm64 refer to this other answer of mine. Also NOTE: this is plain and simple syscall hijacking. Non-invasive hooking can be done in a much nicer way using kprobes.
Since Linux v4.17, x86 (both 64 and 32 bit) now uses syscall wrappers that take a struct pt_regs * as the only argument (see commit 1, commit 2). You can see arch/x86/include/asm/syscall.h for the definitions.
Additionally, as others have described already in different answers, the simplest way to modify sys_call_table is to temporarily disable CR0 WP (Write-Protect) bit, which could be done using read_cr0() and write_cr0(). However, since Linux v5.3, [native_]write_cr0 will check sensitive bits that should never change (like WP) and refuse to change them (commit). In order to work around this, we need to write CR0 manually using inline assembly.
Here is a working kernel module (tested on Linux 5.10 and 5.18) that does syscall hijacking on modern Linux x86-64 considering the above caveats and assuming that you already know the address of sys_call_table (if you also want to find that in the module, see Proper way of getting the address of non-exported kernel symbols in a Linux kernel module):
// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/**
* Test syscall table hijacking on x86-64. This module will replace the `read`
* syscall with a simple wrapper which logs every invocation of `read` using
* printk().
*
* Tested on Linux x86-64 v5.10, v5.18.
*
* Usage:
*
* sudo cat /proc/kallsyms | grep sys_call_table # grab address
* sudo insmod syscall_hijack.ko sys_call_table_addr=0x<address_here>
*/
#include <linux/init.h> // module_{init,exit}()
#include <linux/module.h> // THIS_MODULE, MODULE_VERSION, ...
#include <linux/kernel.h> // printk(), pr_*()
#include <asm/special_insns.h> // {read,write}_cr0()
#include <asm/processor-flags.h> // X86_CR0_WP
#include <asm/unistd.h> // __NR_*
#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
typedef long (*sys_call_ptr_t)(const struct pt_regs *);
static sys_call_ptr_t *real_sys_call_table;
static sys_call_ptr_t original_read;
static unsigned long sys_call_table_addr;
module_param(sys_call_table_addr, ulong, 0);
MODULE_PARM_DESC(sys_call_table_addr, "Address of sys_call_table");
// Since Linux v5.3 [native_]write_cr0 won't change "sensitive" CR0 bits, need
// to re-implement this ourselves.
/*
 * Load val into CR0 with a raw mov instruction instead of calling
 * write_cr0().
 *
 * NOTE(review): val is a read-write ("+r") operand and "memory" is listed as
 * clobbered; presumably this prevents the compiler from moving memory
 * accesses across the CR0 write — confirm.
 */
static void write_cr0_unsafe(unsigned long val)
{
asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
}
/*
 * Replacement for the read syscall entry: logs the di, si and dx registers
 * of the caller and then delegates to the saved original handler.
 *
 * regs: register snapshot passed to syscall wrappers.
 * Returns whatever original_read returns.
 */
static long myread(const struct pt_regs *regs)
{
	long ret;

	pr_info("read(%ld, 0x%lx, %lx)\n",
		regs->di, regs->si, regs->dx);

	ret = original_read(regs);
	return ret;
}
/*
 * Module entry point: replace the __NR_read slot of the system call table
 * located at sys_call_table_addr with myread(), saving the original entry
 * in original_read.
 *
 * Returns 0 on success, or -EINVAL when the sys_call_table_addr module
 * parameter was not supplied: it defaults to 0, and indexing a table at
 * address 0 would fault inside the kernel.
 */
static int __init modinit(void)
{
	unsigned long old_cr0;

	if (!sys_call_table_addr) {
		pr_err("sys_call_table_addr parameter is required\n");
		return -EINVAL;
	}

	real_sys_call_table = (typeof(real_sys_call_table))sys_call_table_addr;

	pr_info("init\n");

	// Temporarily disable CR0 WP to be able to write to read-only pages
	old_cr0 = read_cr0();
	write_cr0_unsafe(old_cr0 & ~(X86_CR0_WP));

	// Overwrite syscall and save original to be restored later
	original_read = real_sys_call_table[__NR_read];
	real_sys_call_table[__NR_read] = myread;

	// Restore CR0 WP
	write_cr0_unsafe(old_cr0);

	pr_info("init done\n");
	return 0;
}
/*
 * Module exit point: write the saved original handler back into the
 * __NR_read slot, with CR0 WP cleared only for the duration of the store.
 */
static void __exit modexit(void)
{
	unsigned long saved_cr0;

	pr_info("exit\n");

	/* drop write-protect, restore the table entry, re-instate old CR0 */
	saved_cr0 = read_cr0();
	write_cr0_unsafe(saved_cr0 & ~(X86_CR0_WP));
	real_sys_call_table[__NR_read] = original_read;
	write_cr0_unsafe(saved_cr0);

	pr_info("goodbye\n");
}
module_init(modinit);
module_exit(modexit);
MODULE_VERSION("0.1");
MODULE_DESCRIPTION("Test syscall table hijacking on x86-64.");
MODULE_AUTHOR("Marco Bonelli");
MODULE_LICENSE("Dual MIT/GPL");

Resources