I implemented a memory mapping via mmap. My kernel module writes something into this memory and a userspace application reads it. In short, I allocate 0x10000 bytes of memory (with kcalloc on the kernel side and with mmap on the userspace side). Then I write something to the address offsets 0x0, 0xf00 and 0xf000 using memcpy. On the kernel side I can read back the memory correctly. But on the userspace side the content of the first 0x1000 bytes is repeated throughout the whole memory (16 times). But why?
Here comes the code of the kernel module:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/mm.h>
#define DEV_MODULENAME "expdev"
#define DEV_CLASSNAME "expdevclass"
static int majorNumber;
static struct class *devClass = NULL;
static struct device *devDevice = NULL;
#ifndef VM_RESERVED
# define VM_RESERVED (VM_DONTEXPAND | VM_DONTDUMP)
#endif
/*
 * Per-open-file bookkeeping for the mmap example: the kernel buffer that is
 * exposed to user space and a counter of the mappings that refer to it.
 */
struct mmap_info
{
/* kernel buffer handed out through mmap(); allocated in fops_open() */
char *data;
/* incremented by the VMA open callback, decremented by the close callback */
int reference;
};
/*
 * VMA open callback: record one more mapping of the data set in the
 * mmap_info that is stored in the VMA's private data.
 */
static void
dev_vm_ops_open( struct vm_area_struct *vma )
{
    struct mmap_info *mi = vma->vm_private_data;

    /* one more user of this data set */
    mi->reference += 1;
}
/*
 * VMA close callback: drop the mapping count that dev_vm_ops_open() raised
 * on the mmap_info stored in the VMA's private data.
 */
static void
dev_vm_ops_close( struct vm_area_struct *vma )
{
    struct mmap_info *mi = vma->vm_private_data;

    /* one user of this data set is gone */
    mi->reference -= 1;
}
static int
dev_vm_ops_fault( struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct page *page;
struct mmap_info *info;
info = (struct mmap_info *)vma->vm_private_data;
if (!info->data)
{
printk("No data\n");
return 0;
}
page = virt_to_page(info->data);
get_page(page);
vmf->page = page;
return 0;
}
/*
 * VMA callbacks installed by fops_mmap(): mapping reference counting on
 * open/close and on-demand page lookup in the fault handler.
 */
static const struct vm_operations_struct dev_vm_ops =
{
.open = dev_vm_ops_open,
.close = dev_vm_ops_close,
.fault = dev_vm_ops_fault,
};
/*
 * mmap() file operation: wires the VMA to the per-file mmap_info and to the
 * dev_vm_ops callbacks.  No pages are mapped here; they are provided one by
 * one by the fault handler.  Always returns 0.
 */
int
fops_mmap( struct file *filp,
           struct vm_area_struct *vma)
{
    vma->vm_private_data = filp->private_data;
    vma->vm_flags = vma->vm_flags | VM_RESERVED;
    vma->vm_ops = &dev_vm_ops;

    /* account for this first mapping by invoking the open callback directly */
    dev_vm_ops_open(vma);

    return 0;
}
/*
 * release() file operation: frees the data buffer and the mmap_info that
 * fops_open() attached to the file.
 *
 * The buffer comes from kcalloc(), so it has to be returned with kfree();
 * free_page() is only valid for memory obtained from the page allocator.
 * Always returns 0.
 */
int
fops_close( struct inode *inode,
            struct file *filp)
{
    struct mmap_info *info = filp->private_data;

    kfree(info->data);
    kfree(info);
    filp->private_data = NULL;
    return 0;
}
/*
 * open() file operation: allocates the per-file mmap_info together with a
 * zeroed 0x10000 byte buffer, writes three marker strings into the buffer
 * (offsets 0x0, 0xf00 and 0xf000) and attaches the info to the file.
 *
 * Returns 0 on success or -ENOMEM when either allocation fails; nothing is
 * leaked on the failure paths.
 */
int
fops_open( struct inode *inode,
           struct file *p_file)
{
    struct mmap_info *info;

    info = kmalloc(sizeof(*info), GFP_KERNEL);
    if (info == NULL)
        goto err_nomem;
    /* kmalloc() does not zero memory; the open/close callbacks count from here */
    info->reference = 0;

    /* allocating memory on the heap for the data */
    info->data = kcalloc(0x10000, sizeof(char), GFP_KERNEL);
    if (info->data == NULL)
    {
        kfree(info);
        goto err_nomem;
    }

    printk(KERN_INFO " > ->data: 0x%16p\n", info->data);
    /* 52 = length of the literal including its terminating NUL */
    memcpy(info->data, "Initial entry on mapped memory by the kernel module", 52);
    memcpy((info->data)+0xf00, "Somewhere", 9);
    memcpy((info->data)+0xf000, "Somehow", 7);
    printk(KERN_INFO " > ->data: %c%c%c\n",
           *(info->data+0xf000+0),
           *(info->data+0xf000+1),
           *(info->data+0xf000+2));

    /* assign this info struct to the file */
    p_file->private_data = info;
    return 0;

err_nomem:
    printk(KERN_ERR "insufficient memory\n");
    /* file operations report errors as negative errno values */
    return -ENOMEM;
}
static const struct file_operations dev_fops =
{
.open = fops_open,
.release = fops_close,
.mmap = fops_mmap,
};
/*
 * Module entry point: registers the character device with a dynamically
 * chosen major number, creates the device class and the device node.
 *
 * Returns 0 on success or the negative error code of the step that failed;
 * everything set up before the failing step is undone again.
 */
static int __init
_module_init(void)
{
    int ret = 0;

    // Try to dynamically allocate a major number for the device
    majorNumber = register_chrdev(0, DEV_MODULENAME, &dev_fops);
    if (majorNumber < 0)
    {
        printk(KERN_ALERT "Failed to register a major number.\n");
        // hand the real reason to the caller instead of a generic -EIO
        return majorNumber;
    }

    // Register the device class
    devClass = class_create(THIS_MODULE, DEV_CLASSNAME);
    if (IS_ERR(devClass))
    {
        printk(KERN_ALERT "Failed to register device class.\n");
        ret = PTR_ERR(devClass);
        goto goto_unregister_chrdev;
    }

    // Create and register the device
    devDevice = device_create(devClass,
                              NULL,
                              MKDEV(majorNumber, 0),
                              NULL,
                              DEV_MODULENAME);
    if (IS_ERR(devDevice))
    {
        printk(KERN_ALERT "Failed to create the device.\n");
        ret = PTR_ERR(devDevice);
        goto goto_class_destroy;
    }

    printk(KERN_INFO "Module registered.\n");
    return ret;

    // Error handling - unwind in reverse order of setup
goto_class_destroy:
    class_destroy(devClass);
goto_unregister_chrdev:
    unregister_chrdev(majorNumber, DEV_MODULENAME);
    return ret;
}
/*
 * Module exit point: removes the device node, the class and the character
 * device registration, in reverse order of _module_init().
 */
static void __exit
_module_exit(void)
{
    device_destroy(devClass, MKDEV(majorNumber, 0));
    /*
     * class_destroy() unregisters the class itself; an additional
     * class_unregister() beforehand would unregister it twice.
     */
    class_destroy(devClass);
    unregister_chrdev(majorNumber, DEV_MODULENAME);
    printk(KERN_INFO "Module unregistered.\n");
}
module_init(_module_init);
module_exit(_module_exit);
MODULE_LICENSE("GPL");
Here comes the code of the application:
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
#define PAGE_SIZE (0x10000)
/*
 * Test application: maps PAGE_SIZE bytes of /dev/expdev, prints the three
 * marker strings the module wrote (offsets 0x0, 0xf00, 0xf000) and then dumps
 * the mapping byte by byte.
 *
 * Returns 0 on success and -1 when the buffer, the device or the mapping
 * cannot be obtained.
 */
int main ( int argc, char **argv )
{
    int fd = -1;
    int rc = -1;
    char *address = NULL;
    char *sbuff = NULL;
    char tbuff[32];
    time_t t = time(NULL);
    struct tm *now = localtime(&t);
    int i;

    /* timestamp printed in front of the first read; format chosen here,
     * the original listing never defined tbuff */
    if (now == NULL || strftime(tbuff, sizeof(tbuff), "%Y-%m-%d %H:%M:%S", now) == 0)
        snprintf(tbuff, sizeof(tbuff), "%s", "unknown time");

    sbuff = calloc(PAGE_SIZE, sizeof(char));
    if (sbuff == NULL)
    {
        perror("calloc failed");
        return -1;
    }

    fd = open("/dev/expdev", O_RDWR);
    if (fd < 0)
    {
        perror("Open call failed");
        goto out_free;
    }

    address = mmap( NULL,
                    PAGE_SIZE,
                    PROT_READ|PROT_WRITE,
                    MAP_SHARED,
                    fd,
                    0);
    if (address == MAP_FAILED)
    {
        perror("mmap operation failed");
        goto out_close;
    }

    printf("%s: first userspace read\n", tbuff);
    /* sbuff is zero-filled and far larger than 80 bytes, so it stays terminated */
    memcpy(sbuff, address, 80);
    printf("Initial message: %s\n", sbuff);
    memcpy(sbuff, address+0xf00, 80);
    printf("Initial message: %s\n", sbuff);
    memcpy(sbuff, address+0xf000, 80);
    printf("Initial message: %s\n", sbuff);

    for (i = 0; i < PAGE_SIZE; i++)
    {
        /* %p expects a void pointer */
        printf("%16p: %c\n", (void *)(address+i), address[i]);
    }

    if (munmap(address, PAGE_SIZE) == -1)
    {
        perror("Error un-mmapping the file");
    }
    rc = 0;

out_close:
    close(fd);
out_free:
    free(sbuff);
    return rc;
}
and this is the output of the application:
0x7fe61b522000: I
0x7fe61b522001: n
0x7fe61b522002: i
0x7fe61b522003: t
0x7fe61b522004: i
0x7fe61b522005: a
0x7fe61b522006: l
...
0x7fe61b522f00: S
0x7fe61b522f01: o
0x7fe61b522f02: m
0x7fe61b522f03: e
0x7fe61b522f04: w
0x7fe61b522f05: h
0x7fe61b522f06: e
0x7fe61b522f07: r
0x7fe61b522f08: e
...
0x7fe61b523000: I
0x7fe61b523001: n
0x7fe61b523002: i
0x7fe61b523003: t
0x7fe61b523004: i
0x7fe61b523005: a
0x7fe61b523006: l
...
0x7fe61b523f00: S
0x7fe61b523f01: o
0x7fe61b523f02: m
0x7fe61b523f03: e
0x7fe61b523f04: w
0x7fe61b523f05: h
0x7fe61b523f06: e
0x7fe61b523f07: r
0x7fe61b523f08: e
...
0x7fe61b524000: I
0x7fe61b524001: n
0x7fe61b524002: i
0x7fe61b524003: t
0x7fe61b524004: i
0x7fe61b524005: a
0x7fe61b524006: l
...
It seems to me, that the repetition comes with the size of one page. But this makes no sense to me.
EDIT 1:
Add Somewhere to the output. Note: Only Somehow never occurs!
EDIT 2:
Corrected fault handler. It now takes the page offset of the faulting access (vmf->pgoff) into account. Now it runs like a charm. Thanks to Tsyvarev!
static int
dev_vm_ops_fault( struct vm_area_struct *vma,
struct vm_fault *vmf)
{
struct page *page;
struct mmap_info *info;
info = (struct mmap_info *)vma->vm_private_data;
if (!info->data)
{
printk("No data\n");
return 0;
}
page = virt_to_page((info->data)+(vmf->pgoff*PAGE_SIZE));
get_page(page);
vmf->page = page;
return 0;
}
> But on userspace side the content of the first 0x1000 ...
0x1000 is the size of the page mapped with
page = virt_to_page(info->data);
get_page(page);
vmf->page = page;
The .fault callback of struct vm_operations_struct is called for every page (4096 bytes) that is accessed by the user but not mapped yet.
So your code just maps the first 4096 bytes (0x1000) of data to every page that user space accesses.
Related
Driver's mmap() entry point not getting called.
This is the source code of my device driver:
/*
 * Misc device descriptor registered by my_init(): named "mymma", with a
 * dynamically assigned minor number and my_fops as its file operations.
 */
struct miscdevice my_dev = {
.minor = MISC_DYNAMIC_MINOR,
.name = "mymma",
.fops = &my_fops,
};
/*
 * File operations of the misc device.  Only .mmap is provided; my_mmap itself
 * is not part of this listing.
 */
static const struct file_operations my_fops = {
.owner = THIS_MODULE,
.mmap = my_mmap,
};
/* Module entry point: forwards to my_init() and returns its result. */
static int __init my_module_init(void)
{
return my_init();
}
/* Module exit point: forwards to my_exit(), which is not part of this listing. */
static void __exit my_module_exit(void)
{
my_exit();
}
/*
 * Registers the misc device my_dev.
 *
 * Returns 0 on success, otherwise the error code reported by
 * misc_register() after logging the failure.
 */
int my_init(void)
{
    int err = misc_register(&my_dev);

    if (err != 0) {
        printk(KERN_ERR "Unable to register \"my mma\" misc device\n");
        return err;
    }

    printk("kernel module installed\n");
    return 0;
}
But my driver's mmap() entry point is not getting called.
This is the user space program calling it:
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>
/*
 * Maps N ints from /dev/mymma, writes and prints them, then unmaps again.
 * Returns 0 on success and 1 on any failure.
 */
int main(){
    const int N = 5;
    int fd = open("/dev/mymma", O_RDONLY);

    if (fd < 0) {
        perror("open /dev/mymma");
        /* a failure must not be reported as success */
        return 1;
    }
    printf("helllo\n");

    /*
     * No MAP_ANONYMOUS: with that flag mmap() ignores fd and only hands out
     * anonymous memory, so the driver's mmap entry point is never reached.
     * A MAP_PRIVATE mapping only needs the descriptor to be open for reading,
     * so O_RDONLY together with PROT_WRITE is acceptable here.
     */
    int *ptr = mmap(NULL, N * sizeof(int),
                    PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (ptr == MAP_FAILED) {
        printf("Mapping Failed\n");
        return 1;
    }

    for (int i = 0; i < N; i++)
        ptr[i] = i * 10;
    for (int i = 0; i < N; i++)
        printf("[%d] ", ptr[i]);
    printf("\n");

    /* unmap exactly the length that was mapped */
    int err = munmap(ptr, N * sizeof(int));
    if (err != 0) {
        printf("UnMapping Failed\n");
        return 1;
    }
    return 0;
}
Provide the mmap() entry point of your driver.
I can notice that the device node is opened RDONLY but you are calling mmap() with PROT_READ/WRITE. Moreover MAP_ANONYMOUS makes mmap() ignore the file descriptor: you are merely allocating some space in memory. That is why you don't reach your driver.
so I have this code
this is what I am doing in ioctl implementation
/* Copy the struct aa argument from user space into the kernel-side `value`. */
if( copy_from_user(&value ,(struct aa*) arg, sizeof(value)) )
{
pr_err("Data Write : Err!\n");
}
/*
 * NOTE(review): value.a holds the address that the user program stored in
 * it (a pointer to its float array).  Below it is cast to a kernel pointer
 * and dereferenced directly; the pointed-to data is never brought into the
 * kernel with copy_from_user().
 */
__u64 a=value.a;
__u32 *b=(__u32 *)a;
pr_info("wow Value = [%x]\n", (int)b[0]);
But I am passing floats from userspace, and the values I pass are not printed correctly by printk.
this is my program
/*
 * Argument block exchanged with the driver through ioctl().  In this program
 * `a` carries the address of a float array, stored as a 64-bit integer.
 */
struct aa
{
uint64_t a;
};
/* ioctl request codes; the driver listing defines the same two requests. */
#define WR_VALUE _IOW('a','a',struct aa*)
#define RD_VALUE _IOR('a','b',struct aa*)
/*
 * Sends the address of a small float array to /dev/etx_device with WR_VALUE
 * and reads the stored struct aa back with RD_VALUE.
 * Returns 0 on success and 1 on any failure.
 */
int main()
{
    struct aa a;
    float *f=(float[]){2,2,3};
    int fd;
    int32_t number;

    /* go through uintptr_t so the pointer-to-integer conversion is well defined */
    a.a=(uint64_t)(uintptr_t)f;
    printf("sizeof = %zu\n",sizeof(*f));
    printf("*********************************\n");
    printf("*******WWW.EmbeTronicX.com*******\n");
    printf("\nOpening Driver\n");
    fd = open("/dev/etx_device", O_RDWR);
    if(fd < 0) {
        printf("Cannot open device file...\n");
        return 1;
    }

    printf("Enter the Value to send\n");
    /* the number only paces the program; what is sent is the array address in a.a */
    if (scanf("%d",&number) != 1) {
        printf("Invalid input\n");
        close(fd);
        return 1;
    }

    printf("Writing Value to Driver\n");
    if (ioctl(fd, WR_VALUE, (struct aa *) &a) < 0) {
        perror("ioctl(WR_VALUE)");
        close(fd);
        return 1;
    }

    printf("Reading Value from Driver\n");
    if (ioctl(fd, RD_VALUE, (struct aa*) &a) < 0) {
        perror("ioctl(RD_VALUE)");
        close(fd);
        return 1;
    }
    /* print what the driver returned; the former `value` was never assigned */
    printf("Value is 0x%llx\n", (unsigned long long)a.a);

    printf("Closing Driver\n");
    close(fd);
    return 0;
}
full code
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kdev_t.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include<linux/slab.h> //kmalloc()
#include<linux/uaccess.h> //copy_to/from_user()
#include <linux/ioctl.h>
#include <asm/fpu/api.h>
/*
 * Argument block exchanged with user space through ioctl(); `a` is a 64-bit
 * value (the user program stores a user-space address in it).
 */
struct aa
{
__u64 a;
};
/* ioctl request codes; the user program defines the same two requests. */
#define WR_VALUE _IOW('a','a',struct aa *)
#define RD_VALUE _IOR('a','b',struct aa *)
/* Last struct aa stored by WR_VALUE; handed back to user space by RD_VALUE. */
struct aa value;
/* Device number, filled in by alloc_chrdev_region() in etx_driver_init(). */
dev_t dev = 0;
/* Device class created in etx_driver_init() and destroyed in etx_driver_exit(). */
static struct class *dev_class;
/* Character device object initialised with `fops` in etx_driver_init(). */
static struct cdev etx_cdev;
/*
** Function Prototypes
**
** NOTE(review): etx_read() and etx_write() are declared here, but no
** definition of them appears in this listing and `fops` does not use them.
*/
static int __init etx_driver_init(void);
static void __exit etx_driver_exit(void);
static int etx_open(struct inode *inode, struct file *file);
static int etx_release(struct inode *inode, struct file *file);
static ssize_t etx_read(struct file *filp, char __user *buf, size_t len,loff_t * off);
static ssize_t etx_write(struct file *filp, const char *buf, size_t len, loff_t * off);
static long etx_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/*
** File operation structure: open, release and ioctl handlers of the device
*/
static struct file_operations fops =
{
.owner = THIS_MODULE,
.open = etx_open,
.unlocked_ioctl = etx_ioctl,
.release = etx_release,
};
/*
** This function will be called when we open the Device file.
** It performs no work and always returns 0.
*/
static int etx_open(struct inode *inode, struct file *file)
{
return 0;
}
/*
** This function will be called when we close the Device file.
** It performs no work and always returns 0.
*/
static int etx_release(struct inode *inode, struct file *file)
{
return 0;
}
static long etx_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
switch(cmd) {
case WR_VALUE:
if( copy_from_user(&value ,(struct aa*) arg, sizeof(value)) )
{
pr_err("Data Write : Err!\n");
}
__u64 a=value.a;
__u32 *b=(__u32 *)a;
pr_info("wow Value = [%x]\n", (int)b[0]);
kernel_fpu_end();
break;
case RD_VALUE:
if( copy_to_user((struct aa*) arg, &value, sizeof(value)) )
{
pr_err("Data Read : Err!\n");
}
break;
default:
pr_info("Default\n");
break;
}
return 0;
}
/*
** Module Init function
**
** Allocates a device number, registers the cdev, creates the class and the
** device node.  Returns 0 on success or a negative error code.  On failure
** every step that already succeeded is undone.
*/
static int __init etx_driver_init(void)
{
    struct device *device;
    int ret;

    /*Allocating Major number*/
    ret = alloc_chrdev_region(&dev, 0, 1, "etx_Dev");
    if (ret < 0) {
        pr_err("Cannot allocate major number\n");
        return ret;
    }
    pr_info("Major = %d Minor = %d \n",MAJOR(dev), MINOR(dev));

    /*Creating cdev structure*/
    cdev_init(&etx_cdev,&fops);

    /*Adding character device to the system*/
    ret = cdev_add(&etx_cdev,dev,1);
    if (ret < 0) {
        pr_err("Cannot add the device to the system\n");
        goto r_region;
    }

    /*Creating struct class; failure is reported as an ERR_PTR, not NULL*/
    dev_class = class_create(THIS_MODULE,"etx_class");
    if (IS_ERR(dev_class)) {
        pr_err("Cannot create the struct class\n");
        ret = PTR_ERR(dev_class);
        goto r_cdev;
    }

    /*Creating device; failure is reported as an ERR_PTR, not NULL*/
    device = device_create(dev_class,NULL,dev,NULL,"etx_device");
    if (IS_ERR(device)) {
        pr_err("Cannot create the Device 1\n");
        ret = PTR_ERR(device);
        goto r_class;
    }

    pr_info("Device Driver Insert...Done!!!\n");
    return 0;

r_class:
    class_destroy(dev_class);
r_cdev:
    cdev_del(&etx_cdev);
r_region:
    unregister_chrdev_region(dev,1);
    return ret;
}
/*
** Module exit function
**
** Undoes etx_driver_init() in reverse order: device node, class, cdev and
** finally the device number region.
*/
static void __exit etx_driver_exit(void)
{
device_destroy(dev_class,dev);
class_destroy(dev_class);
cdev_del(&etx_cdev);
unregister_chrdev_region(dev, 1);
pr_info("Device Driver Remove...Done!!!\n");
}
module_init(etx_driver_init);
module_exit(etx_driver_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("EmbeTronicX <embetronicx#gmail.com>");
MODULE_DESCRIPTION("Simple Linux device driver (IOCTL)");
MODULE_VERSION("1.5");
What am I doing wrong in the kernel ioctl? In userspace I cast the float pointer to uint64_t (a.a=(uint64_t)f;) and passed struct a to the kernel. Now, in the ioctl implementation, I want to read the elements of float *f that were passed from userspace.
I have this function in my NIC driver, and it appears in /proc/kallsyms (see https://stackoverflow.com/a/67766463/4808760) together with its address. This is the function:
/*
 * NAPI poll callback: runs the TX handler, then the RX handler, both with the
 * given budget.  When RX used less than the budget and napi_complete_done()
 * returns true, interrupts are enabled again.  Returns the RX work count.
 */
static int rtl8169_poll(struct napi_struct *napi, int budget)
{
    struct rtl8169_private *priv = container_of(napi, struct rtl8169_private, napi);
    struct net_device *ndev = priv->dev;
    int done;

    rtl_tx(ndev, priv, budget);
    done = rtl_rx(ndev, priv, budget);

    /* budget used up: stay in polling mode */
    if (done >= budget)
        return done;

    if (napi_complete_done(napi, done))
        rtl_irq_enable(priv);

    return done;
}
appears as
ffffffffc02d2210 t rtl8169_poll [r8169]
and this is my ebpf program
/*
 * kprobe on rtl8169_poll(): writes two lines to the trace pipe each time the
 * probed function is entered.
 *
 * bpf_trace_printk() takes the size of the format string as its second
 * argument, followed by the values to format.  The format is kept in a stack
 * array so that sizeof() yields the string size.
 */
SEC("kprobe/rtl8169_poll")
int bpf_prog2(struct pt_regs *ctx)
{
    /*
     * First argument of the probed function.  For rtl8169_poll() that is the
     * napi pointer, so this is only its truncated value, not a syscall number.
     */
    int sc_nr = (int)PT_REGS_PARM1(ctx);
    char fmt[] = "HELLO from FWDALI %d %d";

    bpf_trace_printk(fmt, sizeof(fmt), 1, sc_nr);
    bpf_trace_printk(fmt, sizeof(fmt), 2, sc_nr);
    return 0;
}
And this is my simple userspace code
// SPDX-License-Identifier: GPL-2.0
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <sys/resource.h>
#include <fcntl.h>
#ifdef __mips__
#define MAX_ENTRIES 6000 /* MIPS n64 syscalls start at 5000 */
#else
#define MAX_ENTRIES 1024
#endif
/* install fake seccomp program to enable seccomp code path inside the kernel,
* so that our kprobe attached to seccomp_phase1() can be triggered
*/
/*
 * Copies everything that arrives on the kernel trace pipe to stdout.
 * Returns immediately when the pipe cannot be opened; otherwise it loops
 * forever.
 */
void read_trace_pipe(void)
{
    static char buf[4096];
    int trace_fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY, 0);

    if (trace_fd < 0)
        return;

    for (;;) {
        /* leave room for the terminating NUL */
        ssize_t got = read(trace_fd, buf, sizeof(buf) - 1);

        if (got <= 0)
            continue;
        buf[got] = 0;
        puts(buf);
    }
}
/*
 * Installs a one-instruction seccomp filter that allows every system call.
 * A failing prctl() is reported with perror() and otherwise ignored.
 */
static void install_accept_all_seccomp(void)
{
    struct sock_filter allow_all[] = {
        BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
    };
    const unsigned short count =
        (unsigned short)(sizeof(allow_all)/sizeof(allow_all[0]));
    struct sock_fprog prog = {
        .len = count,
        .filter = allow_all,
    };

    if (prctl(PR_SET_SECCOMP, 2, &prog) != 0)
        perror("prctl");
}
/*
 * Loader: opens "<argv[1]>_kern.o", loads it, attaches the program named
 * "bpf_prog2" and then prints the kernel trace pipe.
 *
 * The "progs" program-array map is only needed by objects that use tail
 * calls.  When the object has no such map, the registration step is skipped
 * instead of aborting before the trace pipe is ever read.
 *
 * Returns 1 when the object name argument is missing, otherwise 0.
 */
int main(int ac, char **argv)
{
    struct bpf_link *link = NULL;
    struct bpf_program *prog;
    struct bpf_object *obj;
    int key, fd, progs_fd;
    const char *section;
    char filename[256];
    FILE *f;

    if (ac < 2) {
        fprintf(stderr, "usage: %s <object-name-prefix>\n",
                ac > 0 ? argv[0] : "loader");
        return 1;
    }

    snprintf(filename, sizeof(filename), "%s_kern.o", argv[1]);
    obj = bpf_object__open_file(filename, NULL);
    if (libbpf_get_error(obj)) {
        fprintf(stderr, "ERROR: opening BPF object file failed\n");
        return 0;
    }

    prog = bpf_object__find_program_by_name(obj, "bpf_prog2");
    if (!prog) {
        printf("finding a prog in obj file failed\n");
        goto cleanup;
    }

    /* load BPF program */
    if (bpf_object__load(obj)) {
        fprintf(stderr, "ERROR: loading BPF object file failed\n");
        goto cleanup;
    }

    link = bpf_program__attach(prog);
    if (libbpf_get_error(link)) {
        fprintf(stderr, "ERROR: bpf_program__attach failed\n");
        link = NULL;
        goto cleanup;
    }

    /* optional: only objects that use tail calls define this map */
    progs_fd = bpf_object__find_map_fd_by_name(obj, "progs");
    if (progs_fd >= 0) {
        bpf_object__for_each_program(prog, obj) {
            section = bpf_program__section_name(prog);
            /* register only syscalls to PROG_ARRAY */
            if (sscanf(section, "kprobe/%d", &key) != 1)
                continue;
            fd = bpf_program__fd(prog);
            bpf_map_update_elem(progs_fd, &key, &fd, BPF_ANY);
        }
    }

    install_accept_all_seccomp();
    f = popen("dd if=/dev/zero of=/dev/null count=5", "r");

    /* only returns when the trace pipe cannot be opened */
    read_trace_pipe();

    if (f)
        pclose(f);

cleanup:
    bpf_link__destroy(link);
    bpf_object__close(obj);
    return 0;
}
So I would like someone to take a look at the code above and explain what exactly I need to add to my eBPF program for the kprobe, and what I need to do in my userspace loader program.
I am still having a tough time with the many sources that claim it is simple: just use the magical line SEC("kprobe/rtl8169_poll"), load the program from userspace, and it is done. I have not looked much further into eBPF yet, since it already fails for me on this simple function hook.
this link gave me the idea that I can hook to this function https://stackoverflow.com/a/67766463/4808760
I tried to use perf_event_open() to track all store instructions and get their access addresses. I found that I only get non-zero addresses when I set attr.precise_ip > 0. But when I ran the same program in a VM instead of on the host, the error message was "Operation not supported". I can work around this by setting precise_ip = 0 in the VM, but then every address I get is zero. I don't understand why precise_ip is related to the sample address, which is not pointed out in the documentation, and I also don't understand why I can't set precise_ip = 1 in the VM while I can do it on the host. Can anybody help me?
FYI: I use the -cpu host option when I start the VM with qemu-system-x86_64.
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/fcntl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#define PERF_PAGES (1 + (1 << 16))
/*
 * In-memory view of one PERF_RECORD_SAMPLE record as read from the ring
 * buffer by scan_thread(): the record header followed by the sampled fields.
 * NOTE(review): the fields are meant to follow the order perf uses for the
 * sample_type set in startup() (IP, TID, ADDR, WEIGHT, PHYS_ADDR); confirm
 * against perf_event_open(2) whenever sample_type changes.
 */
struct perf_sample {
struct perf_event_header header;
__u64 ip;
__u32 pid, tid; /* if PERF_SAMPLE_TID */
__u64 addr; /* if PERF_SAMPLE_ADDR */
__u64 weight; /* if PERF_SAMPLE_WEIGHT */
/* __u64 data_src; /\* if PERF_SAMPLE_DATA_SRC *\/ */
__u64 phy_addr;
};
/*
 * Thin wrapper that issues the perf_event_open system call through
 * syscall() and returns its result.
 */
int perf_event_open(struct perf_event_attr *attr,pid_t pid,int cpu,int group_fd,unsigned long flags)
{
    long ret = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);

    return ret;
}
/*
 * Synthetic workload executed between two scans of the sample buffer.
 *
 * The arithmetic is done on unsigned values, because the products overflow
 * and signed overflow is undefined behaviour.  The accumulator is volatile
 * so that every update is really performed as a memory access; otherwise the
 * whole loop has no observable effect and may be removed by the compiler.
 */
void workload()
{
    volatile unsigned int c = 0;
    unsigned int i;

    for (i = 0; i < 100000000u; i++)
    {
        c += i * i;
        c -= i * 100u;
        c += i * i * i / 100u;
    }
}
/*
 * Opens a sampling perf event for the calling process on any CPU: raw event
 * 0x82d0, one sample every 1000 events, user space only.
 *
 * Returns the perf file descriptor, or -1 after printing the error.
 */
int startup()
{
    struct perf_event_attr attr;
    int fd;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    attr.type = PERF_TYPE_RAW;
    attr.config = 0x82d0;
    attr.config1 = 0;
    attr.sample_period = 1000;
    attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_WEIGHT | PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR ;
    attr.precise_ip = 1; // when i set attr.precise_ip = 0 , all the addr = 0;
    attr.disabled = 0;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;
    attr.exclude_callchain_kernel = 1;
    attr.exclude_callchain_user = 1;

    fd = perf_event_open(&attr, 0, -1, -1, 0);
    if (fd >= 0)
        return fd;

    perror("Cannot open perf fd!");
    return -1;
}
void scan_thread(struct perf_event_mmap_page *p)
{
char *pbuf = (char *)p + p->data_offset;
__sync_synchronize();
printf("%d,\n", p->data_size);
if(p->data_head == p->data_tail) {
return;
}
struct perf_event_header *ph = (void *)(pbuf + (p->data_tail % p->data_size));
struct perf_sample* ps;
switch(ph->type) {
case PERF_RECORD_SAMPLE:
ps = (struct perf_sample*)ph;
// assert(ps != NULL);
if(ps == NULL)
{
printf("null\n");
}
if(ps!= NULL && ps->addr != 0) {
printf("ip %lx\n", ps->ip);
printf("tid %d\n", ps->tid);
printf("addr: %lx \n", ps->addr);
}
//printf("addr, %lx\n", ps->addr);
//printf("phy addr, %lx\n", ps->phy_addr);
break;
default:
printf("type %d\n", ph->type);
break;
}
}
/*
 * Opens the perf event, maps its ring buffer (1 metadata page plus 2^16 data
 * pages), enables the event and then alternates forever between running the
 * workload and scanning the buffer.
 *
 * Returns 1 when the event cannot be opened or the buffer cannot be mapped.
 */
int main()
{
    int fd = startup();

    if (fd < 0)
    {
        /* startup() already reported the reason */
        return 1;
    }

    size_t mmap_size = sysconf(_SC_PAGESIZE) * PERF_PAGES;
    struct perf_event_mmap_page *p = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
    {
        perror("Cannot mmap perf buffer!");
        close(fd);
        return 1;
    }

    // start to perf
    if (ioctl(fd,PERF_EVENT_IOC_ENABLE,0) < 0)
    {
        perror("Cannot enable perf event!");
    }

    while(1)
    {
        workload();
        scan_thread(p);
        sleep(1);
    }
}
I have shared memory segment created in kernel using mmap. I need to access this mapped memory from both kernel and user space. What mechanism should I use to protect the memory from concurrent access ?
I want to have something like:
Kernel module:
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/mm.h>
#ifndef VM_RESERVED
# define VM_RESERVED (VM_DONTEXPAND | VM_DONTDUMP)
#endif
struct dentry *file;
/*
 * Per-open-file state of the mmap example: the page shared with user space
 * and a counter of the mappings that refer to it.
 */
struct mmap_info
{
/* page obtained with get_zeroed_page() in mmapfop_open() */
char *data;
/* incremented in mmap_open(), decremented in mmap_close() */
int reference;
};
/* VMA open callback: count one more mapping in the file's mmap_info. */
void mmap_open(struct vm_area_struct *vma)
{
    struct mmap_info *mi = vma->vm_private_data;

    mi->reference += 1;
}
/* VMA close callback: count one mapping less in the file's mmap_info. */
void mmap_close(struct vm_area_struct *vma)
{
    struct mmap_info *mi = vma->vm_private_data;

    mi->reference -= 1;
}
static int mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page;
struct mmap_info *info;
info = (struct mmap_info *)vma->vm_private_data;
if (!info->data)
{
printk("No data\n");
return 0;
}
page = virt_to_page(info->data);
get_page(page);
vmf->page = page;
return 0;
}
/* VMA callbacks installed by op_mmap(). */
struct vm_operations_struct mmap_vm_ops =
{
.open = mmap_open,
.close = mmap_close,
.fault = mmap_fault,
};
/*
 * mmap() file operation: connects the VMA with the file's mmap_info and the
 * mmap_vm_ops callbacks; pages are provided later by the fault handler.
 * Always returns 0.
 */
int op_mmap(struct file *filp, struct vm_area_struct *vma)
{
    vma->vm_private_data = filp->private_data;
    vma->vm_flags = vma->vm_flags | VM_RESERVED;
    vma->vm_ops = &mmap_vm_ops;

    /* account for this first mapping by invoking the open callback directly */
    mmap_open(vma);

    return 0;
}
/*
 * release() file operation: gives back the shared page and the mmap_info
 * that mmapfop_open() attached to the file.  Always returns 0.
 */
int mmapfop_close(struct inode *inode, struct file *filp)
{
    struct mmap_info *mi = filp->private_data;
    unsigned long page_addr = (unsigned long)mi->data;

    free_page(page_addr);
    kfree(mi);
    filp->private_data = NULL;

    return 0;
}
int mmapfop_open(struct inode *inode, struct file *filp)
{
struct mmap_info *info = kmalloc(sizeof(struct mmap_info), GFP_KERNEL);
info->data = (char *)get_zeroed_page(GFP_KERNEL);
memcpy(info->data, "hello from kernel this is file: ", 32);
memcpy(info->data + 32, filp->f_dentry->d_name.name, strlen(filp->f_dentry->d_name.name));
/* assign this info struct to the file */
filp->private_data = info;
return 0;
}
/* File operations of the debugfs file created in mmapexample_module_init(). */
static const struct file_operations mmap_fops = {
.open = mmapfop_open,
.release = mmapfop_close,
.mmap = op_mmap,
};
/*
 * Module entry point: creates the debugfs file "mmap_example" (mode 0644)
 * with mmap_fops as its file operations and remembers the dentry in `file`.
 * Always returns 0.
 */
static int __init mmapexample_module_init(void)
{
file = debugfs_create_file("mmap_example", 0644, NULL, NULL, &mmap_fops);
return 0;
}
/* Module exit point: removes the debugfs file created at init time. */
static void __exit mmapexample_module_exit(void)
{
debugfs_remove(file);
}
module_init(mmapexample_module_init);
module_exit(mmapexample_module_exit);
MODULE_LICENSE("GPL");
User space:
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#define PAGE_SIZE 4096
/*
 * Maps one page of the debugfs file, prints the message placed there by the
 * kernel, overwrites part of it and prints it again.
 * Returns 0 on success and -1 when the file cannot be opened or mapped.
 */
int main ( int argc, char **argv )
{
    int configfd;
    char * address = NULL;

    configfd = open("/sys/kernel/debug/mmap_example", O_RDWR);
    if(configfd < 0)
    {
        perror("Open call failed");
        return -1;
    }

    address = mmap(NULL, PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, configfd, 0);
    if (address == MAP_FAILED)
    {
        perror("mmap operation failed");
        /* do not leave the descriptor open on the error path */
        close(configfd);
        return -1;
    }

    printf("Initial message: %s\n", address);
    memcpy(address + 11 , "*user*", 6);
    printf("Changed message: %s\n", address);

    /* drop the mapping before closing the file */
    if (munmap(address, PAGE_SIZE) == -1)
    {
        perror("munmap operation failed");
    }
    close(configfd);
    return 0;
}
but with locks.
Kernel space and user space have no shared mechanisms for concurrent access protection. If you want them, you need to implement them by yourself.
It can be some sort of mutex, implemented within your kernel module and accessed from user space via special ioctl requests:
Kernel:
/* Wait queue on which lockers sleep; its internal spinlock also protects my_mutex_val. */
DECLARE_WAIT_QUEUE_HEAD(wq);
/* 0 = unlocked, 1 = locked; only changed while wq.lock is held. */
int my_mutex_val = 0;
/*
 * Lock mutex.
 *
 * May be used directly by the kernel or via 'ioctl(MY_CMD_LOCK)' by user.
 *
 * Sleeps on wq until my_mutex_val is 0, then sets it to 1; all of this
 * happens under wq.lock.
 *
 * NOTE(review): the result of wait_event_interruptible_locked() is ignored,
 * so my_mutex_val is set to 1 even when the wait ended for another reason
 * than the condition becoming true (e.g. a signal) - confirm and consider
 * reporting that case to the caller.
 */
void my_mutex_lock(void)
{
spin_lock(&wq.lock);
wait_event_interruptible_locked(&wq, my_mutex_val == 0);
my_mutex_val = 1;
spin_unlock(&wq.lock);
}
/*
 * Unlock mutex.
 *
 * May be used directly by the kernel or via 'ioctl(MY_CMD_UNLOCK)' by user.
 *
 * Marks the mutex as free and wakes up the waiters sleeping in
 * my_mutex_lock().
 */
void my_mutex_unlock(void)
{
    spin_lock(&wq.lock);
    my_mutex_val = 0;
    /*
     * wq.lock is already held here, so the _locked variant is required:
     * plain wake_up() acquires wq.lock itself and would deadlock.
     */
    wake_up_locked(&wq);
    spin_unlock(&wq.lock);
}
long unlocked_ioctl (struct file * filp, unsigned int cmd, unsigned long val)
{
switch(cmd) {
case MY_CMD_LOCK:
my_mutex_lock();
break;
case MY_CMD_UNLOCK:
my_mutex_unlock();
break;
}
}
User:
/*
 * Illustrative pseudo-code only (not compilable as written): the shared data
 * is read between a lock request and an unlock request sent to the driver.
 * A real ioctl() call also takes the file descriptor of the opened device as
 * its first argument.
 */
int main()
{
...
ioctl(MY_CMD_LOCK);
<read data>
ioctl(MY_CMD_UNLOCK);
...
}
It can also be some sort of spinlock whose value is stored in the mmap-ed area (so that it is visible to both kernel space and user space).
In any case, kernel module should be prepared for the case, when user space application doesn't follow locking conventions. This, probably, would cancel any expectation about mmap-ed area content, generated by the kernel, but kernel module shouldn't crash in that case. [This is why standard kernel's struct mutex is not used in the code above: user space may use it incorrectly].
The problem with the ioctl is you need a kernel switch every time you want to access the share info->data. If that is okay then the ioctl is good - but then why not just do a standard character read/write file operation instead?
You can also try a lock-less mechanism. In the shared info->data area add a barrier variable. When the user needs access, it will do an atomic_compare_and_xchg on the barrier variable until it is set to 0 (unused) and then set it to 1. When the kernel needs access it will do the same but set it to 2. See the gcc atomic builtin documentation.