Modify the user stack pointer in kernel mode

Modify the user stack pointer in kernel mode - c

I am writing a char device that takes as input with ioctl a function pointer and a buffer pointer.
I want to modify the user machine context so that back in user mode, that function is executed with a new stack pointed by that buffer pointer.
What I have done is the following :
long ioctl_funcs(struct file *filp,unsigned int cmd, unsigned long arg)
{
int ret = 0;
switch(cmd) {
case IOCTL_SET_FUN:
printk(KERN_INFO "start\n");
struct myarg* a;
a = (struct myarg*) arg;
struct pt_regs* regs = task_pt_regs(current);
regs->ip = a->func;// func is a function implemented in user space
regs->sp = a->stack;// stack is the buffer allocated in user space with malloc
break;
}
return ret;
}
The good news is that the function is activated, the bad one is that the stack is the same (I have used gdb to test it).
In particular even if : regs->sp = 0; the new function is executed when it should crash since it should have no stack.
It seems the assignment of the stack pointer in this way is ineffective.
Why? How should I correctly assign the stack pointer?
The linux kernel version is : 3.18.106 and it is executed on Virtual Box.

Related

How to implement a dynamic–grow stack with pthread?

From APUE, I learned about guardsize and stackaddr of a stack. If a rsp pointer is going to be lower than stackaddr, the rsp enters the guard stack area, and a signal emits to notify the program.
I am wondering if it is possible to implement dynamic–grow(which grows dynamically) stack using this feature. Can you show
how?

The stack can be grown "dynamically"
Set up an alternate signal stack.
Set up signal handler for SIGSEGV with SA_ONSTACK to use the alternate stack.
When the program runs out of stack, it will be issued a SIGSEGV signal.
The signal handler can use getrlimit/setrlimit with RLIMIT_STACK to change the stack size.
getrlimit and setrlimit are not explicitly mentioned in man signal-safety but I don't see why they would be a problem in a signal handler.
Here is a sample test program. It is annotated. Invoke with -d to dynamically increase the stack size from within the signal handler:
// stacktest.c -- test dynamically growing stack
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <signal.h>
#include <assert.h>
#include <setjmp.h>
#include <sys/time.h>
#include <sys/resource.h>
volatile int opt_t = 0;
volatile int opt_d = 0;
volatile int opt_i = 0;
void *top; // initial top of stack
// signal handler
volatile int may_grow = 1; // grow stack within signal handler
volatile int err1 = 0; // error on get
volatile int err2 = 0; // error on set
volatile int hitno; // number of signals
jmp_buf jbuf;
// alternate signal stack
unsigned char altstk[64 * 4096] __attribute__((aligned(4096)));
// xprtstr -- print string in signal handler
void
xprtstr(const char *str)
{
size_t len = strlen(str);
write(1,str,len);
}
// xprtstr -- print number in signal handler
void
xprtnum(unsigned long val,const char *sym)
{
static const char *hex = "0123456789ABCDEF";
xprtstr(" ");
xprtstr(sym);
xprtstr("=");
char buf[100];
char *bp = &buf[50];
*bp-- = 0;
for (int idx = 0; idx < 16; ++idx, --bp) {
*bp = hex[val & 0x0F];
val >>= 4;
}
++bp;
xprtstr(bp);
}
// sigfault -- SIGSEGV handler
void
sigfault(int signo,siginfo_t *info,void *vp)
{
++hitno;
xprtstr("ISR hit");
xprtnum(signo,"signo");
xprtnum(hitno,"hitno");
xprtstr("\n");
struct rlimit rlim;
do {
if (opt_t) {
xprtstr("ISR test\n");
siglongjmp(jbuf,1);
break;
}
// dynamically grow the stack
if (may_grow) {
xprtstr("ISR grow\n");
may_grow = 0;
err1 = getrlimit(RLIMIT_STACK,&rlim);
xprtnum(rlim.rlim_cur,"rlim_cur");
xprtnum(rlim.rlim_cur / 1024,"rlim_cur");
xprtstr("\n");
rlim.rlim_cur += 8 * 1024 * 1024;
err2 = setrlimit(RLIMIT_STACK,&rlim);
getrlimit(RLIMIT_STACK,&rlim);
xprtnum(rlim.rlim_cur,"rlim_cur");
xprtnum(rlim.rlim_cur / 1024,"rlim_cur");
xprtstr("\n");
break;
}
// stop the program
xprtstr("ISR stop\n");
siglongjmp(jbuf,2);
} while (0);
}
// loop -- recursive function to overflow stack
void
loop(unsigned char *old)
{
// get some space on the stack
unsigned char cur[4096];
// get current stack frame address
void *frame = __builtin_frame_address(0);
// get amount of space used on stack
size_t dif = top - frame;
// show where we are
printf("loop: top=%p frame=%p dif=%8.8zX/%zu may_grow=%d\n",
top,frame,dif,dif / 1024,may_grow);
// keep consuming more stack
// NOTE: we don't actually use cur/old but ensure that the compiler won't
// optimize it away
loop(cur);
}
#define SHOWFLAGS(_msk) \
if (flags & (_msk)) \
printf(" " #_msk)
void
showsa(const struct sigaction *sa,const char *who)
{
int flags = sa->sa_flags;
// show the signal flags
printf("showflags: %s flags=%8.8X",who,flags);
SHOWFLAGS(SA_NOCLDSTOP);
SHOWFLAGS(SA_NOCLDWAIT);
SHOWFLAGS(SA_NODEFER);
SHOWFLAGS(SA_ONSTACK);
SHOWFLAGS(SA_RESETHAND);
SHOWFLAGS(SA_RESTART);
SHOWFLAGS(SA_SIGINFO);
// show the signal mask
for (int signo = 1; signo < 32; ++signo) {
if (sigismember(&sa->sa_mask,signo))
printf(" S%d",signo);
}
printf("\n");
}
// setup_signal -- set up signal handler
void
setup_signal(int signo)
{
struct sigaction sa;
assert(sigaction(signo,NULL,&sa) == 0);
sa.sa_sigaction = sigfault;
showsa(&sa,"BEF");
sa.sa_flags |= SA_SIGINFO;
sa.sa_flags |= SA_ONSTACK;
sa.sa_flags &= ~SA_RESETHAND;
assert(sigaction(signo,&sa,NULL) == 0);
assert(sigaction(signo,NULL,&sa) == 0);
showsa(&sa,"AFT");
}
// setup_altstack -- set up alternate signal stack
void
setup_altstack(void)
{
stack_t ss;
sigaltstack(NULL,&ss);
printf("setup_altstack: ss_sp=%p ss_flags=%8.8X ss_size=%zu\n",
ss.ss_sp,ss.ss_flags,ss.ss_size);
void *sp = altstk;
sp += 16 * 4096;
printf("altstk=%p sp=%p\n",altstk,sp);
ss.ss_sp = sp;
ss.ss_size = sizeof(altstk) / 2;
ss.ss_flags = SS_ONSTACK;
assert(sigaltstack(&ss,NULL) == 0);
sigaltstack(NULL,&ss);
printf("setup_altstack: ss_sp=%p ss_flags=%8.8X ss_size=%zu\n",
ss.ss_sp,ss.ss_flags,ss.ss_size);
if (ss.ss_sp != sp) {
printf("setup_altstack: failed\n");
exit(99);
}
}
#define SHOWOPT(_opt,_reason) \
if (_opt) \
printf(#_opt " -- " _reason)
int
main(int argc,char **argv)
{
setlinebuf(stdout);
--argc;
++argv;
for (; argc > 0; --argc, ++argv) {
char *cp = *argv;
if (*cp != '-')
break;
cp += 2;
switch (cp[-1]) {
case 'd':
opt_d = ! opt_d;
break;
case 'i':
opt_i = ! opt_i;
break;
case 't':
opt_t = ! opt_t;
break;
}
}
// show the options
SHOWOPT(opt_d,"dynamically grow stack within signal handler");
SHOWOPT(opt_i,"grow stack initially");
SHOWOPT(opt_t,"test sigsetjmp/siglongjmp");
// set up alternate signal stack and signal handler
if (opt_d || opt_t) {
setup_altstack();
setup_signal(SIGSEGV);
}
// test our sigsetjmp/siglongjmp
if (opt_t) {
for (int try = 1; try <= 2; ++try) {
if (! sigsetjmp(jbuf,1)) {
printf("main: ptr try=%d\n",try);
unsigned int *ptr = NULL;
*ptr = 23;
}
else
printf("main: resume try=%d\n",try);
}
opt_t = 0;
}
// set up large stack outside of signal handler
if (opt_i) {
struct rlimit rlim;
getrlimit(RLIMIT_STACK,&rlim);
printf("rlim_cur=%lu/%lu\n",rlim.rlim_cur,rlim.rlim_cur / 1024);
rlim.rlim_cur = 32 * 1024 * 1024;
setrlimit(RLIMIT_STACK,&rlim);
getrlimit(RLIMIT_STACK,&rlim);
printf("rlim_cur=%lu/%lu\n",rlim.rlim_cur,rlim.rlim_cur / 1024);
}
printf("main: loop\n");
// dynamically grow the stack
may_grow = opt_d;
// top of stack
top = __builtin_frame_address(0);
if (! sigsetjmp(jbuf,1))
loop(NULL);
else
printf("main: stop\n");
return 0;
}
UPDATE:
This describes how to make "the" stack grow dynamically (very interesting!), but I take the question to be about the stacks of threads other than the initial one, or at least inclusive of those stacks. I don't think this answer addresses them. – 
John Bollinger
Although the question was tagged with pthreads, I'm not sure if OP actually was talking about subthreads doing this.
I'm not sure that this can be done for subthreads (created via pthread_create). At least not by using a default pthread_create call.
Without special pthread_attr_t values, the default for pthread_create is to malloc a stack [of a default size]. With attributes, the caller might set a larger stack size. And/or the caller will [usually] do an explicit malloc and pass the address along (with the size).
AFAICT, from reading glibc source, pthread_create will not set up a guard area if the user provides a stack pointer. Setting up the guard area [if there is to be one] is the responsibility of the caller.
But, in either case, if the stack overflows (with a guard area), a signal will be generated (SIGSEGV ?).
But, what can one do at that point???
The [pthread internal] function that calls the user's start_routine, the start_routine itself, and any functions that the start routine has called, already have pointers to things on that "old" stack [we must assume this].
So, the per-thread stack can not be moved (i.e. no realloc).
The only way to [possibly] do this is for the caller to provide an explicit stack pointer [and size]. As mentioned, the caller must set up the guard pages (via mprotect, I assume).
Although userfaultfd et. al. might be usable/preferable, I'm going to assume that the caller must use an explicit mmap call (vs. using the heap malloc/realloc).
The main thread's stack [generally] grows downward from the top of the virtual memory space. It can grow until all physical memory is used up and the paging disk is full.
However, for a per-thread (subthread) stack, the caller must decide on the maximum per-thread stack size before creating the thread. More on this below.
It can set up a stack (via mmap) of a smaller initial size. Once thread is created, the base/top stack address must remain constant.
Edit: The following is modified by additional thoughts below.
If a fault occurs, the signal handler could try an extension of the stack. There may be a few ways to do this. My best guess for this is:
It may have to copy/save the existing stack data [somewhere ;-)] if the remap below does not copy the existing data the way realloc does.
Temporarily undo any guard pages.
unmap/remap the stack at the same address but with a larger size (via mremap and/or mmap using MREMAP_FIXED/MAP_FIXED).
If necessary, copy back the stack data onto the "new" stack.
Set up new guard pages.
return from the signal handler [and hope ;-)].
As I said, subthread stacks can't grow "infinitely" as the main thread stack can (via setrlimit).
The sizes/addresses in the example below are not "to scale" ...
Consider two threads (e.g. tA and tB) that start with a small size:
tA's stack at xxxx1000 with size 1000
tB's stack at xxxx2000 with size 1000
If tB hits its guard page and faults, what happens? It has no room the extend its stack downward without colliding with the top of tA's stack.
So, we must map the stacks with sufficient space to grow to the "maximum" and we must know that beforehand.
We need to "space out" the stack addresses so they have room to grow [even if not all pages are mapped initially].
Let's assume that the "maximum" size is 10000. A mapping that would work is:
tA's stack at xxxx1000 with size 1000
tB's stack at xxx11000 with size 1000
Now, if tB hits its guard page, it can extend its stack up to the maximum of 10000
Additional thoughts ...
We probably must initially map the entire maximum region. Otherwise, other unrelated mmap calls may grab space in the proposed extension area. Either mmap from the heap manager, shared memory mappings, or even mmap calls done when setting up other threads.
So, we don't need to use mremap to ever increase the size of the region. Rather, we should use madvise with MADV_DONTNEED on the area that we're not currently using.
If we do this, we don't need a signal handler or guard pages to invoke it to extend the area. Just doing MADV_DONTNEED will keep resource usage low(er).
The thread function can (after popping the stack a bit) release the "popped" area with MADV_DONTNEED.
Other MADV_* options might be better.
All of the above strikes me as craziness! The only use would be a [hugely] multithreaded app that is doing deeply recursive functions.
But, if that's the case, it would be better to convert the recursive functions into ones that don't use actual program stack based recursion. Rather, the function manages an array of structs as a software controlled [pseudo] "stack". Each struct has all the variables that were function/automatic scope in the program stack frame.
The size of this array can be controlled with realloc. Or, we can implement the "stack" as a linked list of these structs with a "slab" allocation scheme. With the slab scheme, all threads could share the same slab allocator. This has the advantage that the amount of memory required can be smaller than if each thread had its own/private slab allocator.
So, IMO, don't do this on the real, per-thread program stack!
Okay, so that's what I've come up with. I've not written code for it or tested it. That's an exercise I think I'll leave to the reader ;-)

How to pass a char *array (belonging to the user address space) to a tasklet or workqueue in a kernel module?

I’m writing a device driver. If someone calls the write operation I want it to be deferred (using tasklet or workqueue). The code should be something like that:
static ssize_t dev_write(struct file *filp, const char *buff, size_t len, loff_t *off) {
packed_work *the_task;
the_task = kzalloc(sizeof(packed_work), GFP_ATOMIC);
if (the_task == NULL) {
printk(KERN_ERR "%s: tasklet buffer allocation failure\n", MODNAME);
return -1;
}
the_task->buffer = the_task;
the_task->buff = buff;
the_task->len = len;
INIT_WORK(&(the_task->the_work), (void*)deferred_write);
schedule_work(&the_task->the_work);
return len;
}
void deferred_write(struct work_struct *data) {
printk(“the text: %s\n”, container_of(data, packed_work, the_work)->buff);
//copy_from_user(&(the_object->stream_content), container_of(data, packed_work, the_work)->buff, len);
kfree((void*)container_of(data,packed_work,the_work));
}
And the struct looks like this:
typedef struct _packed_work{
void *buffer;
const char *buff;
size_t len;
struct work_struct the_work;
} packed_work;
The problem is that the kernel crashes. It crashes even before the copy_from_user (that’s why I commented it). In the deferred_write() I can print the length of the string but not the string itself. Is it a problem because the buffer is in the user space memory?
I know that, as a workaround, I can copy the user buffer in the task struct (using the copy_from_user() in the function write()) and then use the strcpy() in the deferred_write() function. But I really would like to use the copy_from_user() in deferred_write(). Is it possible? What can I do?

Even if it is possible (and there is surely a way), the user process has probably changed the contents of the buffer in the time before deferred_write runs. Notice that user programs often allocate these buffers on the stack, so they get overwritten when the function that called write returns and calls other functions.
Even worse: the user process could have unmapped the buffer, or it could have exited.
So you should not delay reading the buffer. You should read the buffer inside the write call and not anywhere else.

Pass struct to xv6 system call

I'm aware that we are not able to pass parameters to xv6 system call directly and we are forced to use it's built in methods.
But all examples and questions in this site is about how to send integer to system call. Which it's answer is using argint() method.
But my question is, is there anyway to pass "struct" to a xv6 system call? Are there any bulit-in methods for this purpose too?
If there is, could you please say a simple example?

Passing a struct through system call is possible.
While one can't pass a struct itself as a system call parameter, passing a pointer to it is possible and will allow using it as both an input or output parameter.
Allowing to use as argument the data itself and not a pointer to it will damage the requirement of the system calls mechanism- as passing data must be implemented in a generic way to allow all data types to (as well as future structs) be used.
Let's have a look on an existing implementation of the system call fstat.
int fstat(int fd, struct stat *st);
fstat requires a file descriptor number as an input and outputs a matching stats information using struct stat.
struct stat {
short type; // Type of file
int dev; // File system's disk device
uint ino; // Inode number
short nlink; // Number of links to file
uint size; // Size of file in bytes
};
Although fstat uses a struct pointer as an output parameter, using it as an input will be similar.
The function sys_fstat in kernel code starts the implementation of fstat system call (XV6's convention is to handle parameter fetching from user space by sys_* functions).
int sys_fstat(void)
{
struct file *f;
struct stat *st;
if(argfd(0, 0, &f) < 0 || argptr(1, (void*)&st, sizeof(*st)) < 0)
return -1;
return filestat(f, st);
}
This function first gets a corresponding struct file to the file descriptor number received by the first fstat function argument (using argfd). Then, fetches the struct stat pointer received by the second fstat function argument using argptr and saves the given pointer in a local (function scope) pointer variable.
At this point, all arguments were fetched and can be used by the kernel implementation.
Note: Although the struct stat pointer is a user-space pointer (located on the lower half of the virtual space), it is safe for the kernel to use it here because when the kernel is serving a process' system call, it uses the process' own paging table.

Although the above answer is correct but i prefered to write my own solutions to make it more usable for other viwers.
i used argptr to pass a pointer-to-struct to my system call.
in sysproc.c:
int sys_counts (void){
struct countTable *ct;
argptr (0 , (void*)&ct ,sizeof(*ct));
return counts(ct);
}
in proc.c:
int counts (struct countTable *ct){
for (int i=0 ; i<22 ; i++){
(ct->system_calls)[i] = count_calls[i] ;
}
return 22;
}
and finally in my user-space-program:
int main (){
struct countTable *ct = malloc (sizeof (struct countTable));
// call system call
counts(ct);
exit();
}

Although one of the answers is acceptable I wrote my answer that is clear and complete.
Note that passing an argument to system-call directly is impossible. we will use argptr to do that.
In userspace, we define a struct that we want to work with. in a user-level file like test.c
#include "types.h"
#include "stat.h"
#include "user.h"
struct Data
{
...
int id; // some fields
...
};
int main(int argc, char *argv[])
{
struct Data *data = malloc(sizeof(struct Data));
// call the systemcall
doSomeWork((void *)data);
exit();
}
In sysproc.c we define system-call and use argptr to get arguments:
int sys_doSomeWork(void){
struct Data *data;
argptr(0, (void *)&data, sizeof(*data));
return doSomeWork((void *)data);
}
and in proc.c we can write the functionality of system-call:
int doSomeWork(void *data){
// cast to (struct Data *)
struct Data *my_data = (struct Data *)data;
...
// work with my_data
...
return 0;
}
and to make Data struct accessible inside sysproc.c and proc.c we define Data struct inside defs.h:
struct Data
{
...
int id; // some fields
...
};

copy_from_user gives null pointer

I'm attempting to write an Open RG kernel module that, at given intervals, sends a message up to the user space. To this end, I need the kernel to hold a pointer to a static buffer in the user space where this message will be stored. I'm having trouble sending the pointer to the kernel.
The user space function call is something like this (simplified, obviously):
typedef struct {
char msg[MAX_BOOT_MSG];
} msg_t;
static msg_t common_mem;
void user_space_func() {
openrg_module_ctrl(KOS_CDT_TEST, TEST_IOCTL_SET_COMMON_MEM, &common_mem.msg);
}
The kernel space usage is like this:
static void* msg_write;
static int do_ioctl(kos_chardev_t *context, unsigned int cmd,
unsigned long data) {
switch (cmd)
{
case TEST_IOCTL_SET_COMMON_MEM:
received_ioctl = 1;
int ret = copy_from_user(&msg_write, (void *)data, sizeof(char*));
printk("setting common mem to %p, received %d\n", msg_write, ret);
return 0;
}
default:
return -1;
}
The output is setting common mem to 0000000000000000, received 0. I see that common_mem.msg isn't NULL. Any idea what I'm doing wrong?

data is the address of the buffer, so by reading from that address, you are copying the contents of the buffer.
Please note that memory in user space can be moved or swapped out, so this address is valid only for the duration of the system call; you must not store the address for later usage.
Better allocate some memory in your driver, and allow the application to access it with mmap.

copy_to_user vs memcpy

I have always been told(In books and tutorials) that while copying data from kernel space to user space, we should use copy_to_user() and using memcpy() would cause problems to the system. Recently by mistake i have used memcpy() and it worked perfectly fine with out any problems. Why is that we should use copy_to_user instead of memcpy()
My test code(Kernel module) is something like this:
static ssize_t test_read(struct file *file, char __user * buf,
size_t len, loff_t * offset)
{
char ani[100];
if (!*offset) {
memset(ani, 'A', 100);
if (memcpy(buf, ani, 100))
return -EFAULT;
*offset = 100;
return *offset;
}
return 0;
}
struct file_operations test_fops = {
.owner = THIS_MODULE,
.read = test_read,
};
static int __init my_module_init(void)
{
struct proc_dir_entry *entry;
printk("We are testing now!!\n");
entry = create_proc_entry("test", S_IFREG | S_IRUGO, NULL);
if (!entry)
printk("Failed to creats proc entry test\n");
entry->proc_fops = &test_fops;
return 0;
}
module_init(my_module_init);
From user-space app, i am reading my /proc entry and everything works fine.
A look at source code of copy_to_user() says that it is also simple memcpy() where we are just trying to check if the pointer is valid or not with access_ok and doing memcpy.
So my understanding currently is that, if we are sure about the pointer we are passing, memcpy() can always be used in place of copy_to_user.
Please correct me if my understanding is incorrect and also, any example where copy_to_user works and memcpy() fails would be very useful. Thanks.

There are a couple of reasons for this.
First, security. Because the kernel can write to any address it wants, if you just use a user-space address you got and use memcpy, an attacker could write to another process's pages, which is a huge security problem. copy_to_user checks that the target page is writable by the current process.
There are also some architecture considerations. On x86, for example, the target pages must be pinned in memory. On some architectures, you might need special instructions. And so on. The Linux kernels goal of being very portable requires this kind of abstraction.

This answer may be late but anyway copy_to_user() and it's sister copy_from_user() both do some size limits checks about user passed size parameter and buffer sizes so a read method of:
char name[] = "This message is from kernel space";
ssize_t read(struct file *f, char __user *to, size_t size, loff_t *loff){
int ret = copy_to_user(to, name, size);
if(ret){
pr_info("[+] Error while copying data to user space");
return ret;
}
pr_info("[+] Finished copying data to user space");
return 0;
}
and a user space app read as read(ret, buffer, 10); is OK but replace 10 with 35 or more and kernel will emit this error:
Buffer overflow detected (34 < 35)!
and cause the copy to fail to prevent memory leaks. Same goes for copy_from_user() which will also make some kernel buffer size checks.
That's why you have to use char name[] and not char *name since using pointer(not array) makes determining size not possible which will make kernel emit this error:
BUG: unable to handle page fault for address: ffffffffc106f280
#PF: supervisor write access in kernel mode
#PF: error_code(0x0003) - permissions violation
Hope this answer is helpful somehow.