Linux Kernel vfs_write function confusion - c

I was looking at the old Linux kernel code (3.10.1), particularly the IO path.
So when the IO enters the VFS layer, the function vfs_write() is called.
Here I can see a call to file->f_op->write(), which is a blocking call as the man page of the system call write() says.
The other option in the code is when file->f_op->write pointer is not defined, in that case vfs_write() calls do_sync_write().
do_sync_write() goes ahead and calls filp->f_op->aio_write(), which is an async call as the man page of aio_write() explains.
Now, my question is, why was the function do_sync_write() named "sync", when it clearly goes on to call an async IO function?
I might be missing something probably, or there was a blunder made here back in those times?
Function definitions for reference,
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret >= 0) {
count = ret;
file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
}
return ret;
}
ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
kiocb.ki_left = len;
kiocb.ki_nbytes = len;
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
return ret;
}

why was the function do_sync_write() named "sync", when it clearly goes on to call an async IO function?
It calls async function and then waits for its completion with
ret = wait_on_sync_kiocb(&kiocb);
So from the view of the caller of do_sync_write function, the whole function behavior is synced.

Related

C : How to use a struct, declared and defined in function A, in another function B

I have a problem in my C code.
In fact, I have declared struct task_struct *t under write_pid function.
t will receive the pid of a specified task. This is the code of the function :
static ssize_t write_pid(struct file *pfile, const char __user *buffer,
size_t length, loff_t *offset)
{
char mybuf[10];
int pid = 0;
struct task_struct *t;
struct siginfo info;
/* read the value from user space */
if(length > 10)
return -EINVAL;
copy_from_user(mybuf, buffer, length);
sscanf(mybuf, "%d", &pid);
printk("pid = %d\n", pid);
/* send the signal */
memset(&info, 0, sizeof(struct siginfo));
info.si_signo = SIG_TEST;
info.si_code = SI_QUEUE; // this is bit of a trickery: SI_QUEUE is normally used by sigqueue from user space,
// and kernel space should use SI_KERNEL. But if SI_KERNEL is used the real_time data
// is not delivered to the user space signal handler function.
info.si_int = 260; //real time signals may have 32 bits of data.
rcu_read_lock();
t = pid_task(find_vpid(pid), PIDTYPE_PID); //find the task_struct associated with this pid
if(t == NULL){
printk("no such pid\n");
rcu_read_unlock();
return -ENODEV;
}
rcu_read_unlock();
return length;
}
Now I want to use t in another function as an argument. This is a part of function B code named read_pid to more clarify the problem :
static ssize_t read_pid(struct file *pfile, char __user *buffer,
size_t length, loff_t *offset)
{
size_t buf_size = 0;
char *buf = NULL;
ssize_t total = 0;
int yalv;
int ret;
struct siginfo info;
ret = send_sig_info(SIG_TEST, &info, t); //send the signal
if (ret < 0) {
printk("error sending signal\n");
kfree(buf);
return ret;
}
}
As you can see, t is used as an argument for send_sig_info function.
How can I do that ? Thank you.
The short answer is: You can't !
A local variable inside write_pid can be passed to functions that is called inside write_pid but once write_pid returns the local variable no longer exists and therefore can't be used.
If you want to use t after write_pid has returned, you'll have to make it a variable of the caller of write_pid and then pass the address of that variable.
Something like:
static ssize_t write_pid(struct file *pfile, const char __user *buffer,
size_t length, loff_t *offset,
struct task_struct **pt)
{
....
*pt = pid_task(....)
....
}
And call it like:
struct task_struct *t;
write_pid(............, &t);
Now you can pass t to read_pid

Linux device driver buffering strategy

Lets assume that I have an external device that is constantly pushing data into a small buffer in my driver. I'm using a wait queue where an interrupt handler wakes up a waiting user process (similar to LDD (3rd edition) - Implementing a Handler).
irq_handler_t irq_handler(int irq, void *dev_id, struct pt_regs *regs)
{
flag = 1;
wake_up_interruptible(&wq);
return IRQ_HANDLED;
}
ssize_t my_read(struct file *dev, char __user *buf, size_t count, loff_t *f_pos)
{
wait_event_interruptible(wq, flag != 0);
flag = 0;
copy_to_user(usr_buf, drv_buf, count);
}
/***********************User program***********************/
while(1)
{
read(fid, buffer, size);
//do stuff with data
}
The user program calls read and it waits till the interrupt gets new data from the external device. Since the external device may push data at a faster than this code can execute, what mechanisms can I use to ensure data is not overwritten before the user program copies it? Would a ring buffer like structure work here? Its not clear how to implement it.
Thanks
Yes, a ring buffer would work.
You simply have to fill the buffer from the interrupt handler and you will read it from the my_read callback.
A really naive and really really inefficient implementation could be (untested):
static irqreturn_t irq_handler(int irq, void *dev_id)
{
struct my_dev *dev = dev_id;
buf[buf_wr] = read_device(dev);
buf_wr++;
if (buf_wr >= BUFSIZE)
buf_wr = 0;
wake_up(&wq);
return IRQ_HANDLED;
}
static ssize_t my_read(struct file *file, char __user *ubuf,
size_t sz, loff_t *ppos)
{
int n, ret;
ret = wait_event_interruptible(wq,
buf_wr != buf_rd);
if (ret)
return ret;
n = buf_wr - buf_rd;
if (n < 0)
n += BUFSIZE;
n = min(count, n);
ret = copy_to_user(ubuf, buf, n);
buf_rd += n;
if (buf_rd >= BUFSIZE)
buf_rd -= BUFSIZE;
if (ret)
return ret;
*ppos += n;
return 1;
}
You may also want to use DMA or mmap or both to get something more efficient.

need help understanding the read part of read_write.c

Hi I was hoping someone might help me understand the read section of the read_write.c kernel file when I look at it I don't really understand a thing.
I can't really tell which part is actually reading the file considering there are several instances where read functions are called. I ask because I have to know where to modify it and how for an assignment I have where I have to modify the output of the read without actually modifying the file.
By the way I am using the latest version of the Linux kernel from kernel.org version 4.9 any and all help is appreciated thank you. Below is where I believe the read is happening.
typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
};
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
loff_t *ppos, iter_fn_t fn, int flags)
{
struct kiocb kiocb;
ssize_t ret;
if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
return -EOPNOTSUPP;
init_sync_kiocb(&kiocb, filp);
if (flags & RWF_HIPRI)
kiocb.ki_flags |= IOCB_HIPRI;
if (flags & RWF_DSYNC)
kiocb.ki_flags |= IOCB_DSYNC;
if (flags & RWF_SYNC)
kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
kiocb.ki_pos = *ppos;
ret = fn(&kiocb, iter);
BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
unsigned long nr_segs, unsigned long fast_segs,
struct iovec *fast_pointer,
struct iovec **ret_pointer)
{
unsigned long seg;
ssize_t ret;
struct iovec *iov = fast_pointer;
/*
* SuS says "The readv() function *may* fail if the iovcnt argument
* was less than or equal to 0, or greater than {IOV_MAX}. Linux has
* traditionally returned zero for zero segments, so...
*/
if (nr_segs == 0) {
ret = 0;
goto out;
}
/*
* First get the "struct iovec" from user memory and
* verify all the pointers
*/
if (nr_segs > UIO_MAXIOV) {
ret = -EINVAL;
goto out;
}
if (nr_segs > fast_segs) {
iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
printk(KERN_DEBUG "Hello from read_write.c\n");
printk(KERN_DEBUG "Inside the copy check uvector method\n");
if (iov == NULL) {
ret = -ENOMEM;
goto out;
}
}
if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
ret = -EFAULT;
goto out;
}
/*
* According to the Single Unix Specification we should return EINVAL
* if an element length is < 0 when cast to ssize_t or if the
* total length would overflow the ssize_t return value of the
* system call.
*
* Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
* overflow case.
*/
ret = 0;
for (seg = 0; seg < nr_segs; seg++) {
void __user *buf = iov[seg].iov_base;
ssize_t len = (ssize_t)iov[seg].iov_len;
/* see if we we're about to use an invalid len or if
* it's about to overflow ssize_t */
if (len < 0) {
ret = -EINVAL;
goto out;
}
if (type >= 0
&& unlikely(!access_ok(vrfy_dir(type), buf, len))) {
ret = -EFAULT;
goto out;
}
if (len > MAX_RW_COUNT - ret) {
len = MAX_RW_COUNT - ret;
iov[seg].iov_len = len;
}
ret += len;
}
out:
*ret_pointer = iov;
return ret;
}
/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
loff_t *ppos, io_fn_t fn, int flags)
{
ssize_t ret = 0;
if (flags & ~RWF_HIPRI)
return -EOPNOTSUPP;
while (iov_iter_count(iter)) {
struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
if (nr < 0) {
if (!ret)
ret = nr;
break;
}
ret += nr;
if (nr != iovec.iov_len)
break;
iov_iter_advance(iter, nr);
}
return ret;
}
static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
unsigned long nr_segs, loff_t *pos,
int flags)
{
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;
io_fn_t fn;
iter_fn_t iter_fn;
ret = import_iovec(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
if (ret < 0)
return ret;
tot_len = iov_iter_count(&iter);
if (!tot_len)
goto out;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
if (type == READ) {
fn = file->f_op->read;
iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
iter_fn = file->f_op->write_iter;
file_start_write(file);
}
if (iter_fn)
ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
if (type != READ)
file_end_write(file);
out:
kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file);
else
fsnotify_modify(file);
}
return ret;
}
ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
unsigned long vlen, loff_t *pos, int flags)
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
return do_readv_writev(READ, file, vec, vlen, pos, flags);
}
EXPORT_SYMBOL(vfs_readv);
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
}
if (ret > 0)
add_rchar(current, ret);
inc_syscr(current);
return ret;
}
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, int flags)
{
struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
f = fdget(fd);
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
ret = vfs_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
}
if (ret > 0)
add_rchar(current, ret);
inc_syscr(current);
return ret;
}
static ssize_t compat_do_readv_writev(int type, struct file *file,
const struct compat_iovec __user *uvector,
unsigned long nr_segs, loff_t *pos,
int flags)
{
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;
io_fn_t fn;
iter_fn_t iter_fn;
ret = compat_import_iovec(type, uvector, nr_segs,
UIO_FASTIOV, &iov, &iter);
if (ret < 0)
return ret;
tot_len = iov_iter_count(&iter);
if (!tot_len)
goto out;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
if (type == READ) {
fn = file->f_op->read;
iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
iter_fn = file->f_op->write_iter;
file_start_write(file);
}
if (iter_fn)
ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
if (type != READ)
file_end_write(file);
out:
kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file);
else
fsnotify_modify(file);
}
return ret;
}
static size_t compat_readv(struct file *file,
const struct compat_iovec __user *vec,
unsigned long vlen, loff_t *pos, int flags)
{
ssize_t ret = -EBADF;
if (!(file->f_mode & FMODE_READ))
goto out;
ret = -EINVAL;
if (!(file->f_mode & FMODE_CAN_READ))
goto out;
ret = compat_do_readv_writev(READ, file, vec, vlen, pos, flags);
out:
if (ret > 0)
add_rchar(current, ret);
inc_syscr(current);
return ret;
}
static size_t do_compat_readv(compat_ulong_t fd,
const struct compat_iovec __user *vec,
compat_ulong_t vlen, int flags)
{
struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
if (!f.file)
return -EBADF;
pos = f.file->f_pos;
ret = compat_readv(f.file, vec, vlen, &pos, flags);
if (ret >= 0)
f.file->f_pos = pos;
fdput_pos(f);
return ret;
}
COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
compat_ulong_t, vlen)
{
return do_compat_readv(fd, vec, vlen, 0);
}
static long do_compat_preadv64(unsigned long fd,
const struct compat_iovec __user *vec,
unsigned long vlen, loff_t pos, int flags)
{
struct fd f;
ssize_t ret;
if (pos < 0)
return -EINVAL;
f = fdget(fd);
if (!f.file)
return -EBADF;
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
ret = compat_readv(f.file, vec, vlen, &pos, flags);
fdput(f);
return ret;
}
Apart from bad formatting you can easily see that do_readv and do_preadv both call vfs_readv. In those functions no hint is seen that they do some reading on their own.
You can also see that vfs_readv doesn't do any reading but only calls do_readv_writev.
The actual reading is done here:
if (type == READ) {
fn = file->f_op->read;
iter_fn = file->f_op->read_iter;
}
...
if (iter_fn)
ret = do_iter_readv_writev(file, &iter, pos, iter_fn, flags);
else
ret = do_loop_readv_writev(file, &iter, pos, fn, flags);
Well, it is not the actual reading, but it is the closest to reading that you can get from your code snippet.
What happens in these functions and more important, what was stored in iter_fn and fn is not visible from your code.
And I am no Linux expert to tell you more details.

In my read function myread it is continuously printing the read data when i do cat /driver/mydriver. I need to print only once how to do that

I wrote h into driver by doing echo:
echo -n h /dev/mydriver
When I do cat /dev/mydriver, myread function is printing h continuously. I wanted to print once. How to do that.
static char m;
static ssize_t myread(struct file *f, char __user *buf, size_t len, loff_t *off)
{
printk(KERN_INFO "Read()\n");
if (copy_to_user(buf, &m, 1) != 0)
return -EFAULT;
else
return 1;
}
static ssize_t my_write(struct file *f, const char __user *buf, size_t len, loff_t *off)
{
printk(KERN_INFO "Write()\n");
if (copy_from_user(&c, buf + len – 1, 1) != 0)
return -EFAULT;
else
return len;
}
If you want to use standard tools (such as cat) with your custom drivers, do not forget to set offset (*loff_t off) correctly. Your read function should look something like this:
static ssize_t myread(struct file *f, char __user *buf, size_t len, loff_t *off)
{
printk(KERN_INFO "Read()\n");
/* You have just a single char in your buffer, so only 0 offset is valid */
if(*off > 0)
return 0; /* End of file */
if (copy_to_user(buf, &m, 1))
return -EFAULT;
*off++;
return 1;
}
You have to think about how you want your device to work... Will what you write to it be available to multiple processes? Or should what you write be removed once it's been read?
The latter is of course easier, and can simple be implemented by clearing the variable m in the myread function. If it's zero, then return zero from the myread function.

How can I allocate memory in the Linux kernel for a char* type string?

I'm trying to allocate some memory for a char* as follows.
static ssize_t memo_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos){
ssize_t retval = -ENOMEM;
printk("write function\n");
if((data = kmalloc(strlen(buf), GFP_KERNEL)) == NULL)
printk("kmalloc fail\n");
if(copy_from_user(data, buf, strlen(buf))){
retval = -EFAULT;
goto out;
}
*f_pos += strlen(buf);
retval = strlen(buf);
out:
return retval;
}
'data' is declared in a header file as
char *data;
When I call the write function, the 'kmalloc fail' line isn't reached, which leads me to believe the kmalloc succeeded, however the data isn't displayed when I try to read from the 'data' variable again.
More confusingly, if I get rid of the kmalloc bit altogether, the data can be read from the driver. Although the problem then is it is followed by a load of other data because i don't have the opportunity to memset() it.
Am I using kmalloc correctly? Presumably not. How should I be doing this?
Additionally, my read function is as follows.
static ssize_t memo_read(struct file *f, char __user *buf,
size_t count, loff_t *f_pos){
ssize_t retval = 0;
printk("read function\n");
printk("data = %s\n", data);
if(*f_pos >= strlen(data)){
printk("EOF\n");
goto out;
}
if(copy_to_user(buf, data, strlen(data))){
retval = -EFAULT;
goto out;
}
printk("copy_to_user success\n");
*f_pos += strlen(data);
retval = strlen(data);
out:
return retval;
}
Thanks.
You should be using strlen_user() on the userspace pointer, instead of strlen() - and you should only call it once, and keep the result around (otherwise, you have a potential kernel exploit, because a second userspace thread could change the buffer while you're working on it).
Alternatively, you could use strncpy_from_user().
Apart from that, the kmalloc looks OK.
(But really, as ephemient says, you should rethink your whole approach and use the count argument instead of treating the input as a string).
Since you can't rely on the data written to a file being nul-terminated strings, you'll need to keep a data_len length parameter around alongside the data. Then your read/write implementations would be along these lines:
static char *data = NULL;
static size_t data_len;
static DEFINE_MUTEX(data_mutex);
static ssize_t memo_read(struct file *f, char __user *buf, size_t count, loff_t *f_pos
{
ssize_t retval = 0;
char *start;
mutex_lock(&data_mutex);
if (!data)
{
retval = -EINVAL; /* Or whatever you want to do here... */
goto out;
}
if (*f_pos >= data_len)
goto out; /* EOF */
start = data + *f_pos;
retval = data_len - *f_pos;
if (retval > count)
retval = count;
if (copy_to_user(buf, start, retval))
{
retval = -EFAULT;
goto out;
}
*f_pos += retval;
out:
mutex_unlock(&data_mutex);
return retval;
}
static ssize_t memo_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos)
{
ssize_t retval = -ENOMEM;
mutex_lock(&data_mutex);
if (data)
kfree(data);
data = kmalloc(count, GFP_KERNEL);
if (!data)
goto out;
if (copy_from_user(data, buf, count))
{
kfree(data);
retval = -EFAULT;
goto out;
}
*f_pos = count;
retval = count;
data_len = count;
out:
mutex_unlock(&data_mutex);
return retval;
}
Don't forget to kfree(data) in your error cases...
In any case, buf is a pointer to user memory, so DON'T call strlen(buf). You must copy_from_user first. Why not
data = kmalloc(count);
copy_from_user(data, buf, count);
?
Your read handler assumes that data is a NUL-terminated string. When you were using an array, this may have been true by accident, but you never actually ensure this in your write handler. My guess is that copy_to_user fails.
Here's a working example of a "memo" module that I wrote up just now, using kmalloc:
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/uaccess.h>
static char *data;
static size_t len;
static ssize_t
memo_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
ssize_t copy_len = min(len - min(len, *ppos), count);
ssize_t retval;
if (copy_to_user(buf, data + *ppos, copy_len)) {
retval = -EFAULT;
goto out;
}
*ppos += copy_len;
retval = copy_len;
out:
return retval;
}
static ssize_t
memo_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
ssize_t retval;
char *newdata;
newdata = kmalloc(count, GFP_KERNEL);
if (!newdata) {
retval = -ENOMEM;
goto out;
}
if (copy_from_user(newdata, buf, count)) {
retval = -EFAULT;
goto out;
}
kfree(data);
data = newdata;
newdata = NULL;
retval = len = count;
out:
kfree(newdata);
return retval;
}
static const struct file_operations memo_fops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.read = memo_read,
.write = memo_write,
};
static struct miscdevice memo_misc = { MISC_DYNAMIC_MINOR, "memo", &memo_fops };
static int __init memo_init(void)
{
int result;
result = misc_register(&memo_misc);
if (result < 0)
return -ENODEV;
return 0;
}
static void __exit memo_exit(void)
{
misc_deregister(&memo_misc);
kfree(data);
return;
}
module_init(memo_init);
module_exit(memo_exit);
MODULE_AUTHOR("ephemient");
MODULE_LICENSE("GPL");
Of course this is missing locking and other safety precautions, but I hope this helps.

Resources