I'm trying to trigger system call in kernel space and it works fine if the system call does not take arguments such as getpid().
The method how I do it:
get the address of system table
static void **syscall_table;
use it with system call number you want and as a function pointer:
typedef long (*sys_call_ptr_t)(const struct __user pt_regs *);
// call system call
((sys_call_ptr_t *)syscall_table)[system_call_number](reg);
if system call have argument, store them into regs before calling it:
struct __user pt_regs *reg = kmalloc....;
reg->di = ...
reg->si = ...
Currently, I'm trying to use write but it fails.
write(int fd, const void *buf, size_t count);
For buf, I've tried both user space address and kernel space address. count may not be a problem. So, I guess problem maybe occur in file descriptor (maybe fd is different between in lower level's and user space's). For basic testing, I only want to write text into terminal, so fd should be 1 (at least in user space).
There're two questions here:
In some reason, I need to stick to the method calling syscall described above. Is it reasonable or any step I miss and cause failure of using write?
If something wrong when I called write? Does the problem come from fd? If so, how do I get the corresponding fd with 1 in user space?
Foreword
By definition, a system call is a service offered by the kernel to user space applications. Code that is already running inside the kernel should not call
a service intended for user space. Hence, doing so is discouraged.
First try with a kernel space buffer
The write() system call is defined in fs/read_write.c. It calls ksys_write() which calls vfs_write():
/*
 * vfs_write() - validate a write request on an open file and pass it on to
 * __vfs_write().
 *
 * Returns the value produced by __vfs_write(), or a negative error code when
 * one of the preliminary checks fails (-EBADF, -EINVAL, -EFAULT, or whatever
 * rw_verify_area() reports).
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
/* The file must carry the FMODE_WRITE mode bit. */
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
/* The file must also carry the FMODE_CAN_WRITE mode bit. */
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
/* The source buffer must pass access_ok(); otherwise the call fails here. */
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (!ret) {
/* Clamp oversized requests to MAX_RW_COUNT. */
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
file_start_write(file);
ret = __vfs_write(file, buf, count, pos);
/* Only a write that transferred data triggers notification/accounting. */
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
}
return ret;
}
[...]
/*
 * ksys_write() - look up the file behind @fd and write to it via vfs_write().
 *
 * Returns the result of vfs_write(), or -EBADF when fdget_pos() yields no
 * file for @fd.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
/* When the file has a position, operate on a local copy of it. */
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
/* Store the updated position back only if the write did not fail. */
if (ret >= 0 && ppos)
f.file->f_pos = pos;
fdput_pos(f);
}
return ret;
}
/* write() system call entry: forwards its three arguments to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
The file descriptor passed as first parameter is not a problem. The value passed from user space is used to retrieve the file structure of the output file (in ksys_write()). But the second parameter must reference a user space memory area.
In vfs_write(), a check is done on the second parameter:
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
access_ok() checks if the buffer is in the user-level space. Hence, if you
pass an address referencing the kernel space, the returned code from write() will be -EFAULT (-14).
The example below is a simple module calling the write() system call with a kernel space buffer. On x86_64, the convention for the parameters of the system calls are:
RDI = arg#0
RSI = arg#1
RDX = arg#2
R10 = arg#3
R8 = arg#4
R9 = arg#5
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
MODULE_LICENSE("GPL");
typedef int (* syscall_wrapper)(struct pt_regs *);
unsigned long sys_call_table_addr;
#define DEV_NAME "[DEVICE2]"
#define DEV_STR DEV_NAME "String from driver"
static char buf[1024];
/*
 * Module entry point: resolve sys_call_table, fetch the write() handler from
 * it and invoke that handler with a kernel-space buffer. The return code of
 * the call is only logged; it does not affect module loading.
 *
 * Returns 0 on success, or -ENOENT if sys_call_table cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper write_syscall;
	/* Zero all registers so the handler never reads stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table#%lx\n", sys_call_table_addr);
	if (!sys_call_table_addr) {
		/* A zero address would be dereferenced as the table below. */
		printk(KERN_ERR DEV_NAME "sys_call_table not found\n");
		return -ENOENT;
	}

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];

	/*
	 * Call to write() system call with a kernel space buffer.
	 * Arguments go in di/si/dx (arg#0..arg#2).
	 */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;
	param.si = (unsigned long)buf;
	param.dx = strlen(buf);
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	return 0;
}
/* Module exit point: nothing to tear down, so only log the unload. */
static void __exit device2_exit(void)
{
	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
module_init(device2_init);
module_exit(device2_exit);
At module insertion time, we can verify that the system call returns -EFAULT:
$ sudo insmod ./device2.ko
$ dmesg
[15716.262977] [DEVICE2]module has been loaded
[15716.270566] [DEVICE2]sys_call_table#ffffffff926013a0
[15716.270568] [DEVICE2]write() with a kernel space buffer = -14
But the same module with a system call like dup() which involves a file descriptor but no user space buffers, this works. Let's change the previous code with:
/*
 * Module entry point: resolve sys_call_table and call, through it, write()
 * with a kernel-space buffer, then dup(1), close(0) and dup(1) again. Every
 * return code is only logged.
 *
 * Returns 0 on success, or -ENOENT if sys_call_table cannot be resolved.
 */
static int __init device2_init(void)
{
	syscall_wrapper write_syscall;
	syscall_wrapper dup_syscall;
	syscall_wrapper close_syscall;
	/* Zero all registers so the handlers never read stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table#%lx\n", sys_call_table_addr);
	if (!sys_call_table_addr) {
		/* A zero address would be dereferenced as the table below. */
		printk(KERN_ERR DEV_NAME "sys_call_table not found\n");
		return -ENOENT;
	}

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];
	dup_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_dup];
	close_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_close];

	/* Call to write() system call with a kernel space buffer */
	snprintf(buf, sizeof(buf), "%s\n", DEV_STR);
	param.di = 1;
	param.si = (unsigned long)buf;
	param.dx = strlen(buf);
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() with a kernel space buffer = %d\n", rc);

	/* Call to dup() system call */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	/* Call to close() system call */
	param.di = 0;
	rc = (*close_syscall)(&param);
	printk(KERN_INFO DEV_NAME "close() = %d\n", rc);

	/* Call to dup() system call ==> Must return 0 as it is available */
	param.di = 1;
	rc = (*dup_syscall)(&param);
	printk(KERN_INFO DEV_NAME "dup() = %d\n", rc);

	return 0;
}
The result of dup() is OK:
$ sudo insmod ./device2.ko
$ dmesg
[17444.098469] [DEVICE2]module has been loaded
[17444.106935] [DEVICE2]sys_call_table#ffffffff926013a0
[17444.106937] [DEVICE2]write() with a kernel space buffer = -14
[17444.106939] [DEVICE2]dup() = 4
[17444.106940] [DEVICE2]close() = 0
[17444.106940] [DEVICE2]dup() = 0
The first call to dup() returns 4 because the current process is insmod. The latter opened the module file and got file descriptor 3. Hence, the first available file descriptor is 4. The second call to dup() returns 0 because we closed the file descriptor 0.
Second try with a user space buffer
To use a user space buffer, let's add some file operations to the kernel module (open(), release() and write()). In the write() entry point we echo back what is passed from user space into stderr (file descriptor 2) using the user space buffer passed to the write() entry point:
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/ptrace.h>
#include <linux/socket.h>
#include <linux/kallsyms.h>
#include <linux/cdev.h>
MODULE_LICENSE("GPL");
typedef int (* syscall_wrapper)(struct pt_regs *);
static unsigned long sys_call_table_addr;
#define DEV_NAME "[DEVICE2]"
static syscall_wrapper write_syscall;
/*
 * write() entry point of the device: echoes the caller's buffer to file
 * descriptor 2 by invoking the write() system call handler with the same
 * buffer pointer and length.
 *
 * @filp: file being written (unused)
 * @buff: buffer pointer received from the write() caller
 * @len:  number of bytes in @buff
 * @off:  file offset (unused)
 *
 * Always returns @len so the caller considers the whole buffer consumed; the
 * result of the echoing write() is only logged.
 */
static ssize_t device2_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
	/* Zero all registers so the handler never reads stack garbage. */
	struct pt_regs param = { 0 };
	int rc;

	printk(KERN_INFO DEV_NAME "write %p, %zu\n", buff, len);

	/* Call to write() system call to echo the write to stderr */
	param.di = 2;
	param.si = (unsigned long)buff;
	param.dx = len;
	rc = (*write_syscall)(&param);
	printk(KERN_INFO DEV_NAME "write() = %d\n", rc);

	return len; // <-------------- To stop the write
}
/* open() entry point: logs the call and reports success. */
static int device2_open(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "open\n");

	return 0;
}
/* release() entry point: logs the call and reports success. */
static int device2_release(struct inode *inode, struct file *file)
{
	printk(KERN_INFO DEV_NAME "released\n");

	return 0;
}
/* File operations implemented by this character device. */
static const struct file_operations fops = {
	.owner   = THIS_MODULE,
	.open    = device2_open,
	.release = device2_release,
	.write   = device2_write,
};
struct cdev *device_cdev;
dev_t deviceNumbers;
/*
 * Module entry point: reserve a dynamically numbered character device,
 * register its file operations, then resolve the write() system call handler
 * used by device2_write().
 *
 * Returns 0 on success or a negative errno; on failure everything acquired so
 * far is released again.
 */
static int __init device2_init(void)
{
	int rc;

	printk(KERN_INFO DEV_NAME "module has been loaded\n");

	// This returns the major number chosen dynamically in deviceNumbers
	rc = alloc_chrdev_region(&deviceNumbers, 0, 1, DEV_NAME);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error registering: %d\n", rc);
		return rc;
	}

	device_cdev = cdev_alloc();
	if (!device_cdev) {
		rc = -ENOMEM;
		goto err_region;
	}
	/*
	 * cdev_alloc() hands back an already initialised object that frees
	 * itself on its last reference. Calling cdev_init() on it would
	 * re-initialise it as a static cdev and leak the allocation, so only
	 * the owner and the operations are filled in here.
	 */
	device_cdev->owner = THIS_MODULE;
	device_cdev->ops = &fops;

	rc = cdev_add(device_cdev, deviceNumbers, 1);
	if (rc < 0) {
		printk(KERN_ALERT DEV_NAME "Error adding cdev: %d\n", rc);
		goto err_cdev;
	}
	printk(KERN_INFO DEV_NAME "initialized (major number is %d)\n", MAJOR(deviceNumbers));

	sys_call_table_addr = kallsyms_lookup_name("sys_call_table");
	printk(KERN_INFO DEV_NAME "sys_call_table#%lx\n", sys_call_table_addr);
	if (!sys_call_table_addr) {
		/* A zero address would be dereferenced as the table below. */
		printk(KERN_ERR DEV_NAME "sys_call_table not found\n");
		rc = -ENOENT;
		goto err_cdev;
	}

	write_syscall = ((syscall_wrapper *)sys_call_table_addr)[__NR_write];
	printk(KERN_INFO DEV_NAME "write_syscall#%p\n", write_syscall);

	return 0;

err_cdev:
	cdev_del(device_cdev);
err_region:
	unregister_chrdev_region(deviceNumbers, 1);
	return rc;
}
/*
 * Module exit point: remove the character device and give back its device
 * numbers. Without this, the registered cdev would keep pointing at the
 * file operations of a module that is no longer loaded.
 */
static void __exit device2_exit(void)
{
	cdev_del(device_cdev);
	unregister_chrdev_region(deviceNumbers, 1);
	printk(KERN_INFO DEV_NAME "module has been unloaded\n");
}
module_init(device2_init);
module_exit(device2_exit);
The loading of the module:
$ sudo insmod device2.ko
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table#ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall#0000000030394929
Make the device entry in the file system to be able to write into it:
$ sudo mknod /dev/device2 c 508 0
$ sudo chmod 666 /dev/device2
$ sudo ls -l /dev/device2
crw-rw-rw- 1 root root 508, 0 janv. 24 16:55 /dev/device2
The writing into the device triggers the expected echo on stderr:
$ echo "qwerty for test purposes" > /dev/device2
qwerty for test purposes
$ echo "another string" > /dev/device2
another string
$ dmesg
[ 2255.183196] [DEVICE2]module has been loaded
[ 2255.183202] [DEVICE2]initialized (major number is 508)
[ 2255.193255] [DEVICE2]sys_call_table#ffffffffbcc013a0
[ 2255.193256] [DEVICE2]write_syscall#0000000030394929
[ 2441.674250] [DEVICE2]open
[ 2441.674268] [DEVICE2]write 0000000032fb5249, 25
[ 2441.674281] [DEVICE2]write() = 25
[ 2441.674286] [DEVICE2]released
[ 2475.538140] [DEVICE2]open
[ 2475.538159] [DEVICE2]write 0000000032fb5249, 15
[ 2475.538171] [DEVICE2]write() = 15
[ 2475.538175] [DEVICE2]released
Related
I have an assignment, where I have to create a proc_entry which can be written to(by user) and read from(by kernel).
The motive is that the kernel code should be able to read the value in the proc_entry, and use it later as a threshold for number of files opened by a process. If a process has opened more files that this threshold, it will be penalized in the scheduler. As user can also change value inside this proc_entry, thus, kernel will use this threshold dynamically.
Most of the codes I have seen online, tell how to create a module that will create such an entry, and another module that given the path of this entry, will read the string present.
The code for the module to create I/O proc_entry is- (Using the below module, I am able to create a proc_entry "/proc/my_proc_entry_write", to which, I can write using- "echo 200 > /proc/my_proc_entry_write", and read this value via "cat /proc/my_proc_entry_write")-
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <asm/types.h>
#define DATA_SIZE 3000000 // We can keep 1024 bytes of data with us.
#define MY_PROC_ENTRY "my_proc_entry_write"
#define PROC_FULL_PATH "/proc/my_proc_entry_write"
struct proc_dir_entry *proc;
int len;
char *msg = NULL;
/*
* Function to write to the proc. Here we free get the new value from buffer,
* count from the buffer and then overwrite the data in our file.
*
* Note that - you can have any other implementation as well for this, all you have to
* ensure that you comply with the expectations of the write() system calls
* like filling in the buffer, and returning the numbers of character written.
*/
/*
 * Write handler of the proc entry: copies @count bytes from user space into
 * the buffer attached to the entry, replaces the last byte with '\0' and
 * records the resulting string length in the global len.
 *
 * @filp:   proc file being written
 * @buffer: user-space source buffer; only accessed through copy_from_user()
 * @count:  number of bytes to store
 * @pos:    file position, set to @count on success
 *
 * Returns @count on success, 0 for an empty write, or -EFAULT when @count
 * exceeds DATA_SIZE or the user buffer cannot be read.
 */
static ssize_t my_proc_write(struct file *filp, const char __user * buffer, size_t count, loff_t *pos)
{
	char *data = PDE_DATA(file_inode(filp)); // gives data pointer of file
	size_t i;

	if (count > DATA_SIZE) {
		return -EFAULT;
	}
	/* Nothing to store; also keeps data[count - 1] below from underflowing. */
	if (count == 0) {
		return 0;
	}

	printk(KERN_INFO "Printing the data passed. Count is %zu", count);
	printk(KERN_INFO "Writing to proc");
	if (copy_from_user(data, buffer, count)) {
		return -EFAULT;
	}

	/* Dump the kernel copy: the user pointer must never be dereferenced. */
	for (i = 0; i < count; i++) {
		printk(KERN_INFO "Index: %zu . Character: %c Ascii: %d", i, data[i], data[i]);
	}

	data[count - 1] = '\0';
	printk(KERN_INFO "msg has been set to %s", data);
	printk(KERN_INFO "Message is: ");
	for (i = 0; i < count; i++) {
		printk(KERN_INFO "\n Index: %zu . Character: %c", i, data[i]);
	}

	*pos = count; // length written to be copied to pos at end
	len = count - 1; // len is length of string. count is len+1, to accommodate the \0.
	return count;
}
/*
* Function to read the proc entry, here we copy the data from our proc entry
* to the buffer passed.
*/
/*
 * Read handler of the proc entry: copies the stored string (len characters
 * plus the terminating '\0') to user space, honouring the caller's buffer
 * size and the current file offset.
 *
 * @filp:  proc file being read
 * @buf:   user-space destination buffer
 * @count: capacity of @buf in bytes
 * @offp:  file offset, advanced by the number of bytes copied
 *
 * Returns the number of bytes copied, 0 at end of data (or when no data is
 * attached or @count is 0), or a negative errno if the copy fails.
 */
ssize_t my_proc_read(struct file *filp,char *buf, size_t count, loff_t *offp )
{
	char f_arr[128] = { 0 };
	char *data = PDE_DATA(file_inode(filp));
	char *f_path;
	ssize_t copied;
	int i;

	/* Debug aid: log the path of the dentry being read. */
	f_path = dentry_path_raw(filp->f_path.dentry, f_arr, 128);
	printk(KERN_ERR"f_path: %s\n",f_path);

	/* Log the same buffer again, starting at its first non-zero byte. */
	for (i = 0; i < 128 && f_arr[i] == 0; i++)
		;
	if (i != 128) {
		printk(KERN_ERR"f_path_2: %s\n", &f_arr[i]);
	}

	if ((int) (*offp) > len) {
		return 0;
	}
	printk(KERN_INFO "Reading the proc entry, len of the file is %d", len);
	if (!data) {
		printk(KERN_INFO "NULL DATA");
		return 0;
	}
	if (count == 0) {
		printk(KERN_INFO "Read of size zero, doing nothing.");
		return 0;
	}
	printk(KERN_INFO "Read of size %d", (int) count);

	/*
	 * simple_read_from_buffer() never copies more than @count bytes, takes
	 * the offset into account and advances it; len + 1 includes the '\0'.
	 */
	copied = simple_read_from_buffer(buf, count, offp, data, len + 1);
	if (copied < 0) {
		printk(KERN_INFO "Error in copying data.");
		return copied;
	}

	/* Log the kernel copy: @buf is a user pointer and must not be printed. */
	printk(KERN_INFO "Read data : %s", data);
	printk(KERN_INFO "Successfully copied data.");
	return copied;
}
/*
* The file_operations structure. This is the glue layer which associates the
* proc entry to the read and write operations.
*/
/* Handlers invoked when the proc entry is written to or read from. */
struct file_operations proc_fops = {
	.write = my_proc_write,
	.read  = my_proc_read,
};
/*
* This function will create the proc entry. This function will allocate some
* data where the data will be written incase of a write to the proc entry. The
* same memory will be used to serve the reads. * Initially the function fills
* the data with DATA which has "100".
* The important function to see here is the proc_create_data, this function
* will take the proc entry name and create it with the given permissions
* (0666). We also need to pass the file_operations structure which has the
* function pointers to the functions which needs to be called when read or
* write is called on the file.
The last argument has the pointer to the data
* associated with the file. (So, by "char *data = PDE_DATA(file_inode(filp));", the 'data' is actually 'msg')
*/
/*
 * Allocate the DATA_SIZE-byte buffer behind the proc entry, preload it with
 * "100", and create the entry with that buffer as its private data.
 *
 * Returns 0 on success, -1 if the allocation or the entry creation fails. On
 * failure no memory stays allocated.
 */
int create_new_proc_entry(void) {
	const char *DATA = "100";
	int i;

	len = strlen(DATA);
	msg = kmalloc((size_t) DATA_SIZE, GFP_KERNEL);
	if (msg == NULL) {
		return -1;
	}
	printk(KERN_INFO "Allocated memory for msg");

	/* Copy the initial value including its terminating '\0'. */
	memcpy(msg, DATA, len + 1);
	for (i = 0; i < len + 1; i++) {
		printk(KERN_INFO "%c", msg[i]);
		if (msg[i] == '\0') {
			printk(KERN_INFO "YES");
		}
	}

	proc = proc_create_data(MY_PROC_ENTRY, 0666, NULL, &proc_fops, msg);
	if (!proc) {
		/* Nobody else references the buffer yet: release it. */
		kfree(msg);
		msg = NULL;
		return -1;
	}
	return 0;
}
/* The init function of the module. Does nothing other than calling the
* create_new_proc_entry. */
/*
 * Module init: create the proc entry.
 * Returns 0 on success, -1 if create_new_proc_entry() reports a failure.
 */
int proc_init (void) {
	return create_new_proc_entry() ? -1 : 0;
}
/* Function to remove the proc entry. Call this when the module unloads. */
void proc_cleanup(void) {
remove_proc_entry(MY_PROC_ENTRY, NULL);
}
MODULE_LICENSE("GPL");
module_init(proc_init);
module_exit(proc_cleanup);
Makefile for the above module is(assuming the code of module is written in file proc_write_read.c):-
# Out-of-tree build of the proc_write_read module against the running kernel.
# NOTE(review): every $(MAKE) recipe line below starts with '#', so as shown
# the modules, modules_install and kernel_clean targets run nothing. Confirm
# whether a leading tab or '@' was lost when this file was pasted.
MYPROC=proc_write_read
obj-m += $(MYPROC).o
# Kernel build tree of the currently running kernel.
export KROOT=/lib/modules/$(shell uname -r)/build
#export KROOT=/lib/modules/$(uname)3.2.0-23-generic/build
allofit: modules
modules: clean
#$(MAKE) -C $(KROOT) M=$(PWD) modules
modules_install:
#$(MAKE) -C $(KROOT) M=$(PWD) modules_install
kernel_clean:
#$(MAKE) -C $(KROOT) M=$(PWD) clean
clean: kernel_clean
rm -rf Module.symvers modules.order
# Clear the kernel log, then load the module.
insert: modules
dmesg -c
insmod proc_write_read.ko
# Clean the build outputs, then unload the module.
remove: clean
rmmod proc_write_read
The module that reads this proc_entry, like a file is-
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <asm/uaccess.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/proc_fs.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/types.h>
int proc_init(void)
{
struct file *f; ssize_t ret = -EBADF;int i;
char* f_path;
i=0;
f=NULL;
char buf[128]; char f_arr[128];
mm_segment_t fs;
i=0;
while(i<128)
{
buf[i] = 0;
f_arr[i]=0;
i++;
}
// To see in /var/log/messages that the module is operating
//printk(KERN_INFO "read_file- my module is loaded\n");
f = filp_open("/proc/my_proc_entry_write", O_RDONLY, 0);
if (IS_ERR(f)) {
printk(KERN_ERR "Error in filp_open: %p ; %d\n", f,PTR_ERR(f));
return 100000;
}
f_path=dentry_path_raw(f->f_path.dentry,f_arr,128);
printk(KERN_ERR"f_path_in_proc_entry_read: %s\n",f_path);
printk(KERN_ERR"kernel Below1!!!\n");
// Get current segment descriptor
fs = get_fs();
printk(KERN_ERR"kernel Below2!!!\n");
// Set segment descriptor associated to kernel space
set_fs(get_ds());
printk(KERN_ERR"kernel Below3!!!\n");
// Read the file
if (f!=NULL) {
if ((f->f_op)!=NULL && (f->f_op->read)!=NULL){
ret=f->f_op->read(f, buf, 128, &f->f_pos);
printk(KERN_ERR"kernel Below4!!!\n");
}
else
return 100000;
}
else
return 100000;
// Restore segment descriptor
set_fs(fs);
// See what we read from file
printk(KERN_ERR "kernel Read my_proc_entry_write buf:%s\n",buf);
int val=0;
sscanf(buf, "%d", &val);
printk(KERN_ERR"kernel val: %d\n",val);
filp_close(f,NULL);
return val;
}
/* Module exit: nothing to release, so only log the unload. */
void proc_cleanup(void)
{
	printk(KERN_INFO "My module is unloaded\n");
}
module_init(proc_init);
module_exit(proc_cleanup);
Makefile for the above module(name=read_file_in_kernel) is-
# Out-of-tree build of the read_file_in_kernel module against the build tree
# of the currently running kernel.
obj-m += read_file_in_kernel.o
# Build the module.
all:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
# Remove the build outputs.
clean:
make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
My question is- Both these modules work fine as userspace modules, via insmod, but if I copy-paste this code to the source code of kernel 4.19.200 (not exact copy-paste, just the main parts as functions), and use the 2nd module,i.e, read_file_in_kernel's code to access the proc value, I am getting an error, i.e, kernel is not able to boot.
On running gdb through the kernel, I found that the code in read_file_in_kernel went through the below code, thrice (i.e, when update_curr function in linux kernel called this function 3 times),i.e, f is being returned as an error every time-
if (IS_ERR(f)) {
printk(KERN_ERR "Error in filp_open: %p ; %d\n", f,PTR_ERR(f));
return 100000;
}
In the 4th call to read_file_in_kernel, the kernel froze on -
f = filp_open("/proc/my_proc_entry_write", O_RDONLY, 0);
Same case with gdb.
I don't know what I am doing wrong here. Is it that /proc/my_proc_entry_write is not being created during bootup when it is read, and so, filp_open is not able to open that proc_entry to be read.
I even tried removing the first module from kernel entirely, running it separately from user-space to create a proc_entry(my_proc_entry_write) beforehand, that will be loaded by default every time the kernel boots up. But still, same error is coming.
What correction should I make to this?
If this is not the way to create a dynamic proc_entry that can be written to by the user and read by the kernel, what is?
In the case of linux kernel device drivers there is the file_operations struct, or fops struct, which allows the driver to define handlers for various file operations.
My question is about the .release fop handler.
I know the release handler will only be called when the last file descriptor (fd) for the file object is closed (or munmapped). This is done when fput is called on the file and the file->f_count reaches 0.
However - I am unclear on if other file operations can be running simultaneously in a another thread when release is entered.
For example:
could 1 thread of a process be inside the ioctl handler for the file (or fd), while another thread of the same process is inside of the release handler?
Can release be a factor in race conditions for the file object?
could 1 thread of a process be inside the ioctl handler for the file (or fd), while another thread of the same process is inside of the release handler?
No. The release entry point is called when the reference counter on the
file entry reaches 0. ioctl() increments the reference counter on the file. So, the release entry point will not be called while an ioctl() is in progress.
Foreword
The source code discussed below is:
GLIBC 2.31
Linux 5.4
GLIBC's pthread management
The GLIBC's pthread_create() actually involves a clone() system call with
the following flags:
CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID
According to the manual of clone(), the CLONE_FILES flag makes the threads of a process
share the same file descriptor table. Any file descriptor created by
one thread is also valid in the other threads. Similarly, if one thread closes a file descriptor, or changes its associated flags (using the fcntl() F_SETFD operation), the other threads are also affected.
clone() on the kernel side
When clone() is passed CLONE_FILES, the files_struct is not duplicated but a reference counter is incremented. As a consequence, the task structures of both threads point on the same files_struct (files field):
. The task structure is defined in include/linux/sched.h:
struct task_struct {
[...]
/* Open file information: */
struct files_struct *files; /// <==== Table of open files shared between thread
[...]
. In kernel/fork.c, the clone() service calls copy_files() to increment the reference counter on the files_struct
/*
 * copy_files() - give the new task @tsk its open-file table.
 *
 * With CLONE_FILES set in @clone_flags the current task's files_struct is
 * kept and only its reference counter is incremented; otherwise dup_fd()
 * creates a copy which is stored in tsk->files.
 *
 * Returns 0, or the error reported through dup_fd() when it yields no table.
 */
static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
{
struct files_struct *oldf, *newf;
int error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
goto out;
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count); // <==== Ref counter incremented: files_struct is shared
goto out;
}
/* No sharing requested: duplicate the table for the new task. */
newf = dup_fd(oldf, &error);
if (!newf)
goto out;
tsk->files = newf;
error = 0;
out:
return error;
}
. The files_struct is defined in include/linux/fdtable.h:
/*
* Open file table structure
*/
struct files_struct {
/*
* read mostly part
*/
atomic_t count; // <==== Reference counter
bool resize_in_progress;
wait_queue_head_t resize_wait;
struct fdtable __rcu *fdt;
struct fdtable fdtab;
/*
* written part on a separate cache line in SMP
*/
spinlock_t file_lock ____cacheline_aligned_in_smp;
unsigned int next_fd;
unsigned long close_on_exec_init[1];
unsigned long open_fds_init[1];
unsigned long full_fds_bits_init[1];
struct file __rcu * fd_array[NR_OPEN_DEFAULT];
ioctl() operation
ioctl() system call is defined fs/ioctl.c. It calls fdget() first to increment the reference counter on the file entry, do the requested operation and then call fdput()
/*
 * ksys_ioctl() - look up the file behind @fd and run the ioctl on it.
 *
 * The file obtained with fdget() is handed back with fdput() only after the
 * operation has completed.
 *
 * Returns -EBADF when @fd maps to no file, the error from
 * security_file_ioctl() if it is non-zero, otherwise the result of
 * do_vfs_ioctl().
 */
int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
{
int error;
struct fd f = fdget(fd);
if (!f.file)
return -EBADF;
/* The security hook runs first; the ioctl proceeds only if it returns 0. */
error = security_file_ioctl(f.file, cmd, arg);
if (!error)
error = do_vfs_ioctl(f.file, fd, cmd, arg);
fdput(f);
return error;
}
/* ioctl() system call entry: forwards its three arguments to ksys_ioctl(). */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
return ksys_ioctl(fd, cmd, arg);
}
The file entry is defined in include/linux/fs.h. Its reference counter is the f_count field:
struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
struct inode *f_inode; /* cached value */
const struct file_operations *f_op;
/*
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
enum rw_hint f_write_hint;
atomic_long_t f_count; // <===== Reference counter
unsigned int f_flags;
[...]
} __randomize_layout
__attribute__((aligned(4)));
Example
Here is a simple device driver into which the file operations merely display a message when they are triggered. The ioctl() entry makes the caller sleep 5 seconds:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/cdev.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/delay.h>
MODULE_LICENSE("GPL");
#define DEVICE_NAME "device"
static int device_open(struct inode *, struct file *);
static int device_release(struct inode *, struct file *);
static ssize_t device_read(struct file *, char *, size_t, loff_t *);
static ssize_t device_write(struct file *, const char *, size_t, loff_t *);
static long int device_ioctl(struct file *, unsigned int, unsigned long);
static int device_flush(struct file *, fl_owner_t);
/* File operations implemented by the demo device. */
static const struct file_operations fops = {
	.owner          = THIS_MODULE,
	.open           = device_open,
	.flush          = device_flush,
	.release        = device_release,
	.read           = device_read,
	.write          = device_write,
	.unlocked_ioctl = device_ioctl,
};
struct cdev *device_cdev;
dev_t deviceNumbers;
/*
 * Module entry point: reserve a dynamically numbered character device and
 * register its file operations.
 *
 * Returns 0 on success or a negative errno; on failure the reserved device
 * numbers are released again.
 */
static int __init init(void)
{
	// This returns the major number chosen dynamically in deviceNumbers
	int ret = alloc_chrdev_region(&deviceNumbers, 0, 1, DEVICE_NAME);

	if (ret < 0) {
		printk(KERN_ALERT "Error registering: %d\n", ret);
		return ret;
	}

	device_cdev = cdev_alloc();
	if (!device_cdev) {
		ret = -ENOMEM;
		goto err_region;
	}
	/*
	 * cdev_alloc() hands back an already initialised object that frees
	 * itself on its last reference. Calling cdev_init() on it would
	 * re-initialise it as a static cdev and leak the allocation, so only
	 * the owner and the operations are filled in here.
	 */
	device_cdev->owner = THIS_MODULE;
	device_cdev->ops = &fops;

	ret = cdev_add(device_cdev, deviceNumbers, 1);
	if (ret < 0) {
		printk(KERN_ALERT "Error adding cdev: %d\n", ret);
		cdev_del(device_cdev);
		goto err_region;
	}

	printk(KERN_INFO "Device initialized (major number is %d)\n", MAJOR(deviceNumbers));
	return 0;

err_region:
	unregister_chrdev_region(deviceNumbers, 1);
	return ret;
}
/* Module exit point: release the device numbers, drop the cdev, log it. */
static void __exit cleanup(void)
{
	unregister_chrdev_region(deviceNumbers, 1);
	cdev_del(device_cdev);

	printk(KERN_INFO "Device unloaded\n");
}
/* open() entry point: logs the call and reports success. */
static int device_open(struct inode *inode, struct file *file)
{
	printk(KERN_INFO "Device open\n");

	return 0;
}
/* flush() entry point: logs the call and reports success. */
static int device_flush(struct file *file, fl_owner_t id)
{
	printk(KERN_INFO "Device flush\n");

	return 0;
}
/* release() entry point: logs the call and reports success. */
static int device_release(struct inode *inode, struct file *file)
{
	printk(KERN_INFO "Device released\n");

	return 0;
}
/*
 * write() entry point: logs the call and discards the data, returning len so
 * the caller sees the whole buffer as written.
 */
static ssize_t device_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
	printk(KERN_INFO "Device write\n");

	return len;
}
/* read() entry point: logs the call and returns 0, i.e. no data. */
static ssize_t device_read(struct file *filp, char *buff, size_t len, loff_t * off)
{
	printk(KERN_INFO "Device read\n");

	return 0;
}
/*
 * ioctl() entry point: logs entry, sleeps via msleep_interruptible(5000),
 * logs exit and returns 0. The command and its argument are ignored.
 */
static long int device_ioctl(struct file *file, unsigned int ioctl_num, unsigned long ioctl_param)
{
	printk(KERN_INFO "Device ioctl enter\n");

	msleep_interruptible(5000);

	printk(KERN_INFO "Device ioctl out\n");

	return 0;
}
module_init(init);
module_exit(cleanup);
Here is a user space program which involves the main thread and a secondary one. The main thread opens the above device and waits for the secondary thread to start (barrier) before closing the device after 1 second. Meanwhile, the secondary thread calls ioctl() on the above device which makes it sleep 5 seconds. Then it calls ioctl() a second time before exiting.
The expected behavior is to make the main thread close the device file while the secondary thread is running the ioctl().
#include <stdio.h>
#include <pthread.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <errno.h>
static int dev_fd;
static pthread_barrier_t barrier;
/*
 * Secondary thread body: synchronise with the main thread on the barrier,
 * then issue ioctl(dev_fd, 0) twice, printing the return code and errno after
 * each call. @arg is unused; always returns NULL.
 */
void *entry(void *arg)
{
	int attempt;

	printf("Thread running...\n");

	// Rendez-vous with main thread
	pthread_barrier_wait(&barrier);

	for (attempt = 0; attempt < 2; attempt++) {
		int rc = ioctl(dev_fd, 0);

		printf("rc = %d, errno = %d\n", rc, errno);
	}

	return NULL;
}
/*
 * Open the device, start the secondary thread, wait for it on the barrier,
 * then close the device one second later while the thread is expected to be
 * inside its first ioctl(). Finally wait for the thread to finish.
 *
 * Returns 0 on success, 1 if the device cannot be opened or the thread
 * cannot be created.
 */
int main(void)
{
	pthread_t tid;
	int rc;

	dev_fd = open("/dev/device", O_RDWR);
	if (dev_fd < 0) {
		/* Without a valid descriptor the experiment is meaningless. */
		perror("open(/dev/device)");
		return 1;
	}

	pthread_barrier_init(&barrier, NULL, 2);

	rc = pthread_create(&tid, NULL, entry, NULL);
	if (rc != 0) {
		/* pthread_create() returns the error number instead of setting errno. */
		fprintf(stderr, "pthread_create failed: %d\n", rc);
		pthread_barrier_destroy(&barrier);
		close(dev_fd);
		return 1;
	}

	pthread_barrier_wait(&barrier);
	sleep(1);
	close(dev_fd);

	pthread_join(tid, NULL);
	pthread_barrier_destroy(&barrier);
	return 0;
}
Installation of the kernel module:
$ sudo insmod ./device.ko
$ dmesg
[13270.589766] Device initialized (major number is 237)
$ sudo mknod /dev/device c 237 0
$ sudo chmod 666 /dev/device
$ ls -l /dev/device
crw-rw-rw- 1 root root 237, 0 janv. 27 10:55 /dev/device
The execution of the program shows that the first ioctl() makes the thread wait 5 seconds. But the second returns in error with EBADF (9) because meanwhile the device file has been closed by the main thread:
$ gcc p1.c -lpthread
$ ./a.out
Thread running...
rc = 0, errno = 0
rc = -1, errno = 9
In the kernel log, we can see that the close() in the main thread merely triggered a flush() operation on the device while the first ioctl() was still in progress in the secondary thread. Then, once the first ioctl() returned, the internals of the kernel freed the file entry (reference counter dropped to 0) and so, the second ioctl() did not reach the device as the file descriptor no longer referenced an opened file. Hence, the EBADF error on the second call:
[13270.589766] Device initialized (major number is 237)
[13656.862951] Device open <==== Open() in the main thread
[13656.863315] Device ioctl enter <==== 1st ioctl() in secondary thread
[13657.863523] Device flush <==== 1 s later, flush() = close() in the main thread
[13661.941238] Device ioctl out <==== 5 s later, the 1st ioctl() returns
[13661.941244] Device released <==== The file is released because the reference counter reached 0
Obviously, this is an unsurprising newbie's question after a lot of trouble with kernel programming. I am trying to write a program that exposes a driver file in the /dev folder for reading and writing (I realize it is a rather unsafe idea, but I really need to go ahead with this experiment). Let's look at the module source code:
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <asm/uaccess.h>
MODULE_LICENSE("GPL");
int init_module(void); // driver file initialization as opening it
void cleanup_module(void); // exec files removal ahead of shutting driver file
static int device_open(struct inode *, struct file *); // driver file opening
static int device_release(struct inode *, struct file *); // return of system resource control
static ssize_t device_read(struct file *, char *, size_t, loff_t *); // reading from driver file
static ssize_t device_write(struct file *, const char *, size_t, loff_t *); // writing into driver file
#define SUCCESS 1
#define DEVICE_NAME "sample device"
#define BUF_LEN 80
static int Major; // device's major number
static int Device_Open = 0; // device access counter
static char message[BUF_LEN]; // buffer for both read and write operations
static char *message_ptr;
// list of basic operations executable by driver
/* Basic operations the driver implements. */
static struct file_operations ops = {
	.open    = device_open,
	.release = device_release,
	.read    = device_read,
	.write   = device_write,
};
/*
 * Module entry point: register the character device with a dynamically
 * assigned major number, stored in Major.
 *
 * Returns 0 on success (module init functions follow the 0 / negative errno
 * convention, so a positive value must not be returned), or the negative
 * error from register_chrdev().
 */
int init_module(void)
{
	Major = register_chrdev(0, DEVICE_NAME, &ops); // major number assignment

	// evaluate whether driver file is accessible
	if (Major < 0) {
		printk(KERN_ALERT "Device registration attempt failed\n");
		return Major;
	}

	return 0;
}
/* Module exit point: unregister the character device and log it. */
void cleanup_module(void)
{
	// cancelling driver registration in file system before exit
	unregister_chrdev(Major, DEVICE_NAME);

	printk(KERN_ALERT "Driver file of /dev/%s c %d 0 has been destroyed\n", DEVICE_NAME, Major);
}
static int device_open(struct inode * node, struct file * file)
{
printk(KERN_INFO "Trying access /dev/%s c %d 0\n", DEVICE_NAME, Major);
static int counter = 0; // access counter initializing
// file control evaluation
if(Device_Open)
return -EBUSY;
Device_Open++; // increment counter to avert driver's immanent running
sprintf(message, "This sentence displayed %d times\n", counter++);
message_ptr = message;
try_module_get(THIS_MODULE);
return SUCCESS;
}
/*
 * .release callback: marks the device as free again and drops the module
 * reference taken in device_open().
 *
 * Returns 0, following the 0 / negative-errno convention of the other callbacks.
 */
static int device_release(struct inode * node, struct file * file)
{
    printk(KERN_INFO "Trying closure of /dev/%s c %d 0\n", DEVICE_NAME, Major);
    Device_Open--; // allow the next open
    module_put(THIS_MODULE);
    return 0;
}
/*
 * .read callback: copies the remaining part of message to the user buffer,
 * one byte at a time, advancing message_ptr.
 *
 * ch  - destination buffer in user space
 * num - capacity of that buffer in bytes
 *
 * Returns the number of bytes copied, 0 once the message is exhausted, or
 * -EFAULT if the user buffer cannot be written.
 */
static ssize_t device_read(struct file * file, char * ch, size_t num, loff_t * off)
{
    ssize_t read_bytes = 0;

    printk(KERN_INFO "Trying read from /dev/%s c %d 0\n", DEVICE_NAME, Major);
    if (*message_ptr == 0)
        return 0;
    while (num && *message_ptr) {
        /* put_user() fails on a bad user address; do not ignore that. */
        if (put_user(*message_ptr, ch))
            return -EFAULT;
        message_ptr++;
        ch++;
        num--;
        read_bytes++;
    }
    /* %zd / %zu match ssize_t / size_t; %d would be a format mismatch. */
    printk(KERN_INFO "%zd bytes read, %zu bytes to be handled\n", read_bytes, num);
    return read_bytes;
}
/*
 * .write callback: copies at most BUF_LEN - 1 bytes from user space into a
 * local buffer and logs them.
 *
 * Returns the number of bytes consumed (possibly fewer than len, i.e. a
 * short write), or -EFAULT if the user buffer cannot be read.
 */
static ssize_t device_write(struct file *filp, const char *buff, size_t len, loff_t * off)
{
    char message_from_user[BUF_LEN];

    /* Clamp first: an unchecked len would overflow the stack buffer. */
    if (len > sizeof(message_from_user) - 1)
        len = sizeof(message_from_user) - 1;
    if (copy_from_user(message_from_user, buff, len))
        return -EFAULT;
    /* User data is not guaranteed to be NUL-terminated; %s needs it. */
    message_from_user[len] = '\0';
    printk(KERN_INFO "length of message:%zu message:'%s'\n", len, message_from_user);
    return len;
}
To test reading/writing, I use this code:
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <linux/unistd.h>
extern int errno;
/*
 * Opens /dev/dev, writes a fixed message to it, reads the driver's reply and
 * prints the reply one character per line.
 *
 * Returns 0 on success and 1 if open, write or read fails.
 */
int main()
{
    int fd; // file descriptor of the device node
    ssize_t cnt; // signed: write()/read() return -1 on error
    char inputBuffer[30] = "Device file is open"; // write operation buffer
    char outputBuffer[50]; // read operation buffer

    printf("Continuing with basics of Linux drivers...\n");
    fd = open("/dev/dev", O_RDWR);
    if (fd == -1) {
        /* Nothing to close here: fd is not a valid descriptor. */
        printf("File opening isn't completed\n");
        return 1;
    }
    printf("Driver file is open now\n");

    // write into the device
    cnt = write(fd, inputBuffer, sizeof(inputBuffer));
    if (cnt < 0) {
        perror("write");
        close(fd);
        return 1;
    }
    printf("Driver got written %zd bytes\n", cnt);

    // read from the device
    cnt = read(fd, outputBuffer, sizeof(outputBuffer));
    if (cnt < 0) {
        perror("read");
        close(fd);
        return 1;
    }
    printf("Driver received %zd bytes\n", cnt);

    // display the received message; cnt is non-negative here, so the bound is safe
    for (ssize_t i = 0; i < cnt; i++) {
        printf("%c", outputBuffer[i]);
        printf("%s", "\n");
    }

    close(fd);
    printf("Driver file is close\n");
    return 0;
}
Although the module was built and the device file was created with mknod (I run it on Ubuntu 18.04), I'm stuck at the write operation, probably because I misunderstand how driver calls work between user space and kernel space. Once I start my program, the output is as follows:
Continuing with basics of Linux drivers...
Driver file is open now
Driver got written -1 bytes
After the last line of output, the system becomes unresponsive (no reaction until I power off the PC). I suspect a memory-management problem or, more probably, some property of the driver file. However, read / write / execute permissions have been granted, so access restrictions should not be the cause. Hopefully someone can point out what is wrong in the code posted here.
Seeing your code you don't handle the writing part.
/* Write handler stub: logs the attempt and rejects every write with -EINVAL. */
static ssize_t device_write(struct file * file, const char * ch, size_t num, loff_t * off)
{
    const ssize_t denied = -EINVAL;

    printk(KERN_ALERT "Operation denied\n");
    return denied;
}
Thus there is no way your module can possibly work.
But your crash comes from memory accesses in your reading function (check this with strace). I will let you work out the issue yourself. dmesg should help (or, if your system panics, you can make the log persistent so that you can debug it after rebooting).
I have developed a simple Linux kernel module to which I want to send a char message from a user-space program.
This is the module :
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
/* Module metadata: license, author, description and version. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Gaston");
MODULE_DESCRIPTION("A simple Linux char driver");
MODULE_VERSION("0.1");
/* Zero-initialised 256-byte buffer; exer_write() prints it with %s. */
static char message[256] = {0};
/*
 * .open callback: only logs that the device was opened.
 *
 * The return type is int (not ssize_t) so the function matches the type of
 * the .open member of struct file_operations. Always returns 0.
 */
int exer_open(struct inode *pinode, struct file *pfile) {
    printk(KERN_INFO "Device has been opened\n");
    return 0;
}
/* .read callback: transfers nothing and reports a zero-byte read. */
ssize_t exer_read(struct file *pfile, char __user *buffer, size_t length, loff_t *offset) {
    const ssize_t nread = 0;

    return nread;
}
/*
 * .write callback: copies the user's bytes into message and logs them.
 *
 * buffer - source data in user space (must not be dereferenced directly)
 * length - number of bytes the caller wants to write
 *
 * Returns length on success, -EINVAL if the data does not fit into message
 * together with a terminating NUL, or -EFAULT if the user buffer is unreadable.
 */
ssize_t exer_write(struct file *pfile, const char __user *buffer, size_t length, loff_t *offset) {
    /* Leave one byte for the NUL terminator that %s relies on. */
    if (length > sizeof(message) - 1)
        return -EINVAL;
    if (copy_from_user(message, buffer, length))
        return -EFAULT;
    message[length] = '\0';
    printk(KERN_INFO "Received %s characters from the user\n", message);
    /* Report the bytes consumed; returning 0 would tell the caller nothing was written. */
    return length;
}
/*
 * .release callback: only logs that the device was closed.
 *
 * The return type is int (not ssize_t) so the function matches the type of
 * the .release member of struct file_operations. Always returns 0.
 */
int exer_close(struct inode *pinode, struct file *pfile) {
    printk(KERN_INFO "Device successfully closed\n");
    return 0;
}
/* Callback table registered for major 240 by exer_simple_module_init(). */
struct file_operations exer_file_operations = {
    .owner   = THIS_MODULE,
    .open    = exer_open,
    .release = exer_close,
    .read    = exer_read,
    .write   = exer_write,
};
int exer_simple_module_init(void) {
printk(KERN_INFO "Initializing the LKM\n");
register_chrdev(240, "Simple Char Drv", &exer_file_operations);
return 0;
}
/* Module exit: releases the major-240 registration made at init. */
void exer_simple_module_exit(void) {
    static const char drv_name[] = "Simple Char Drv";

    unregister_chrdev(240, drv_name);
}
/* Register exer_simple_module_init / exer_simple_module_exit as the module's init and exit functions. */
module_init(exer_simple_module_init);
module_exit(exer_simple_module_exit);
And this is my C program:
#include<stdio.h>
#include<stdlib.h>
#include<errno.h>
#include<fcntl.h>
#include<string.h>
#include<unistd.h>
#define BUFFER_LENGTH 256
/*
 * Opens /dev/char_device, reads one line from stdin and writes it to the
 * device.
 *
 * Returns 0 on success, the errno of a failed open()/write(), or
 * EXIT_FAILURE if no input could be read.
 */
int main()
{
    int fd;
    ssize_t ret; // write() returns ssize_t
    char stringToSend[BUFFER_LENGTH];

    fd = open("/dev/char_device", O_RDWR); // Open the device with read/write access
    if (fd < 0)
    {
        int err = errno; // save it: perror() may overwrite errno
        perror("Failed to open the device...");
        return err;
    }
    printf("Type in a short string to send to the kernel module:\n");
    /* fgets() is bounded by the buffer size and, unlike scanf("%s"), keeps spaces. */
    if (fgets(stringToSend, sizeof stringToSend, stdin) == NULL)
    {
        fprintf(stderr, "No input could be read.\n");
        close(fd);
        return EXIT_FAILURE;
    }
    stringToSend[strcspn(stringToSend, "\n")] = '\0'; // drop the trailing newline
    printf("Writing message to the device [%s].\n", stringToSend);
    ret = write(fd, stringToSend, strlen(stringToSend)); // Send the string to the LKM
    if (ret < 0)
    {
        int err = errno;
        perror("Failed to write the message to the device.");
        close(fd);
        return err;
    }
    close(fd);
    return 0;
}
After inserting the module with insmod, when I execute the program and examine the kernel logs using the tail -f /var/log/messages command, I can see:
Oct 1 13:57:37 auth.info login[306]: root login on 'ttyS0'
Oct 1 13:58:22 user warn kernel: exer_simple_char_drv: loading out-of-tree module taints kernel.
Oct 1 13:58:22 user.info kernel: Initializing the LKM
Oct 1 13:58:35 user.info kernel: Device has been opened
Oct 1 13:58:39 user.info kernel: Received characters from the user
Oct 1 13:58:39 user.info kernel: Device successfully closed
Same thing when I run dmesg :
exer_simple_char_drv: loading out-of-tree module taints kernel.
Initializing the LKM
Device has been opened
Received characters from the user
Device successfully closed
The problem is that I am not able to see the message that I entered manually when I executed my C program. What am I missing here, please?
First problem: you never modify message.
Then, you cannot use directly the user memory in the kernel context
You have to translate it before: copy_from_user is for that.
Your write function could look like this:
/* Largest payload, in bytes, that exer_write() accepts in one call. */
#define MAX 256
/* message holds up to MAX payload bytes plus one extra byte.
The extra byte leaves room for the terminating NUL that the `%s` formatter needs. */
static char message[MAX+1] ="";
/*
 * .write callback: copies up to MAX bytes from user space into message,
 * NUL-terminates the copy and logs it.
 *
 * Returns length on success, -EINVAL if length exceeds MAX, or -EFAULT if
 * the user buffer cannot be read.
 */
ssize_t exer_write(struct file *pfile, const char __user *buffer, size_t length, loff_t *offset) {
    if (length > MAX)
        return -EINVAL;
    if (copy_from_user(message, buffer, length) != 0)
        return -EFAULT;
    /* Terminate at the new length, otherwise the tail of an earlier, longer
       message would still be printed by %s. */
    message[length] = '\0';
    printk(KERN_INFO "Received %s characters from the user\n", message);
    /* Report the bytes consumed; returning 0 would tell the caller nothing was written. */
    return length;
}
Below is my kernel module, which I tested with a C program. Now, instead of using a C application, I want to write a shell script that performs read and write operations on my kernel module. How can I do that?
Thanks in advance.
#include <linux/init.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <asm/uaccess.h>
/* Name passed to register_chrdev() and device_create(). */
#define DEVICE_NAME "mydevice"
/* Name passed to class_create(). */
#define CLASS_NAME "device"
/* Module metadata: license, author, description and version. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("ABC");
MODULE_DESCRIPTION("A simple Linux character driver");
MODULE_VERSION("0.1");
/* Major number returned by register_chrdev() at init. */
static int majorNumber;
/* Text stored by dev_write() and handed out by dev_read(). */
static char message[256] = {0};
/* Number of bytes of message that dev_read() copies to the user. */
static short size_of_message;
/* Count of successful dev_open() calls; only used in the log message. */
static int numberOpens = 0;
/* Handles returned by class_create() and device_create(). */
static struct class* character_deviceClass = NULL;
static struct device* character_deviceDevice = NULL;
/* Forward declarations of the file_operations callbacks. */
static int dev_open(struct inode *, struct file *);
static int dev_release(struct inode *, struct file *);
static ssize_t dev_read(struct file *, char *, size_t, loff_t *);
static ssize_t dev_write(struct file *, const char *, size_t, loff_t *);
/* Taken with mutex_trylock() in dev_open() and released in dev_release(). */
static DEFINE_MUTEX(devicedev_mutex);
static struct file_operations fops =
{
.open = dev_open,
.read = dev_read,
.write = dev_write,
.release = dev_release,
};
/*
 * Module init: registers a dynamically allocated major number, creates the
 * device class and then the device itself.
 *
 * Each failure path undoes the steps that already succeeded, in reverse
 * order. Returns 0 on success or a negative errno.
 */
static int __init character_device_init(void){
    int err;

    printk(KERN_INFO "Shell: Initializing the character device LKM\n");

    majorNumber = register_chrdev(0, DEVICE_NAME, &fops);
    if (majorNumber < 0) {
        printk(KERN_ALERT "character device failed to register a major number\n");
        return majorNumber;
    }
    printk(KERN_INFO "character device: registered correctly with major number %d\n", majorNumber);

    character_deviceClass = class_create(THIS_MODULE, CLASS_NAME);
    if (IS_ERR(character_deviceClass)) {
        printk(KERN_ALERT "Failed to register device class\n");
        err = PTR_ERR(character_deviceClass);
        goto err_unregister_chrdev;
    }
    printk(KERN_INFO "character device: device class registered correctly\n");

    character_deviceDevice = device_create(character_deviceClass, NULL, MKDEV(majorNumber, 0), NULL, DEVICE_NAME);
    if (IS_ERR(character_deviceDevice)) {
        printk(KERN_ALERT "Failed to create the device\n");
        err = PTR_ERR(character_deviceDevice);
        goto err_destroy_class;
    }
    printk(KERN_INFO "character device: device class created correctly\n");

    /*
     * No mutex_init() here: devicedev_mutex is defined with DEFINE_MUTEX and is
     * therefore already initialised. Re-initialising it after device_create()
     * could wipe a lock taken by an open that raced with module load.
     */
    return 0;

err_destroy_class:
    class_destroy(character_deviceClass);
err_unregister_chrdev:
    unregister_chrdev(majorNumber, DEVICE_NAME);
    return err;
}
/*
 * Module exit: tears down everything character_device_init() created, in
 * reverse order of creation.
 */
static void __exit character_device_exit(void){
    mutex_destroy(&devicedev_mutex);
    device_destroy(character_deviceClass, MKDEV(majorNumber, 0));
    /* class_destroy() unregisters the class itself; a separate
       class_unregister() call would unregister it twice. */
    class_destroy(character_deviceClass);
    unregister_chrdev(majorNumber, DEVICE_NAME);
    printk(KERN_INFO "character device: Goodbye from the LKM!\n");
}
/*
 * .open callback: allows a single opener at a time.
 * Takes devicedev_mutex without blocking; on success bumps and logs the open
 * counter and returns 0, otherwise logs the conflict and returns -EBUSY.
 */
static int dev_open(struct inode *inodep, struct file *filep){
    if (mutex_trylock(&devicedev_mutex)) {
        numberOpens++;
        printk(KERN_INFO "character device: Device has been opened %d time(s)\n", numberOpens);
        return 0;
    }

    printk(KERN_ALERT "Character device: Device in use by another process");
    return -EBUSY;
}
/*
 * .read callback: hands the stored message to user space.
 *
 * buffer - destination in user space
 * len    - capacity of that buffer; never more than len bytes are copied
 *
 * Returns the number of bytes copied (0 when nothing is stored, which the
 * caller sees as end-of-file) or -EFAULT if the user buffer is not writable.
 * Bytes that did not fit stay queued for the next read.
 */
static ssize_t dev_read(struct file *filep, char *buffer, size_t len, loff_t *offset){
    size_t count = size_of_message;
    unsigned long not_copied;

    /* Respect the caller's buffer size instead of always copying the whole message. */
    if (count > len)
        count = len;

    not_copied = copy_to_user(buffer, message, count);
    if (not_copied) {
        printk(KERN_INFO "character device: Failed to send %lu characters to the user\n", not_copied);
        return -EFAULT;
    }
    printk(KERN_INFO "character device: Sent %zu characters to the user\n", count);

    /* Keep the unread tail at the front of the buffer for the next call. */
    size_of_message -= count;
    memmove(message, message + count, size_of_message);

    /* Return the real byte count; returning 0 here made every read look like EOF. */
    return count;
}
/*
 * .write callback: stores the user's bytes in message followed by the
 * suffix "(<len> letters)" and records the total length for dev_read().
 *
 * buffer - source data in user space (must not be dereferenced directly)
 * len    - number of bytes to store
 *
 * Returns len on success, -EINVAL if the data plus suffix would not fit in
 * message, or -EFAULT if the user buffer cannot be read.
 */
static ssize_t dev_write(struct file *filep, const char *buffer, size_t len, loff_t *offset){
    /* Space kept free for "(<len> letters)" and its NUL: at most 20 digits + 10 + 1. */
    enum { SUFFIX_ROOM = 32 };

    if (len > sizeof(message) - SUFFIX_ROOM)
        return -EINVAL;
    /* A user pointer cannot be handed to a %s formatter; copy it in first. */
    if (copy_from_user(message, buffer, len))
        return -EFAULT;
    /* scnprintf() returns the number of characters actually written. */
    size_of_message = len + scnprintf(message + len, sizeof(message) - len, "(%zu letters)", len);
    printk(KERN_INFO "character device: Received %zu characters from the user\n", len);
    return len;
}
/* .release callback: releases the mutex taken in dev_open(), logs the close and returns 0. */
static int dev_release(struct inode *inodep, struct file *filep){
    struct mutex *const open_lock = &devicedev_mutex;

    mutex_unlock(open_lock);
    printk(KERN_INFO "character device: Device successfully closed\n");
    return 0;
}
/* Register character_device_init / character_device_exit as the module's init and exit functions. */
module_init(character_device_init);
module_exit(character_device_exit);
Let's say that you registered with major number 254 and minor number 1 (the actual code given in your question logs the allocated numbers to dmesg, so check there). If you didn't have udev or similar configured to create a /dev/mydevice for you, you could do so yourself:
mknod /dev/mydevice c 254 1 # substitute the real allocated values
At that point, opening it is the same as with anything else:
# file descriptor number 3 is arbitrary, but the same number needs to be reused later
# don't use 0-2, which are reserved for stdin/stdout/stderr
exec 3<>/dev/mydevice
...and reads and writes are similarly conventional:
echo "This is a write" >&3
read varname <&3 # read until newline from device