Writing to page mapped dmas in kernel - c

I've been working on modifying the intel ixgbe kernel driver to function with my PCIe device (FPGA but that's not super important). The kernel and the PCIe device all negotiate quite well, configuration headers are passed along and communication seems to function. However attempting to write DMA_FROM_DEVICE I have a slight problem that I don't understand and I'm hoping for help.
rx_ring->desc = dma_alloc_coherent(dev, ///This function allocates dma space of size size for handle dma on device dev with flag GFP KERNEL
rx_ring->size,
&rx_ring->dma, ///This dma handle may be cast to unsigned integer of the same bus width and given to dev as the DMA base address
GFP_KERNEL);
page = dev_alloc_pages(0);
dma = dma_map_page(rx_ring->dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
//Writing to the PCI device the base address to place data into.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
//This will perfectly read data I place onto the PCIe bus.
rx_ring->desc->wb.upper.length
//This seems to read some garbage memory.
dma_sync_single_range_for_cpu(rx_ring->dev,
rx_buffer->dma,
rx_buffer->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
unsigned char *va = page_address(page) + rx_buffer->page_offset;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
//Some code later
dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
new_buff->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
I've tried to purge code down to just the points of interest but here's the brief run down. I allocate space for the dma creating the virtual and bus address via the dma_alloc_coherent function. I create a page of memory for the dma and map this page to the dma via the dev_alloc_pages and dma_map_page commands. I pass the dma bus address to my PCIe device so it can write to the proper offset via the writel commands (I know iowrite32 but this is on redhat).
From here there are 2 ways that the origonal ixgbe driver reads data from the PCIe bus. First it directly reads from the dma's allocated virtual address (desc), but this is only used for configuration information (in the driver I am working off of). The second method is via use page_address(page) to I believe get a virtual address for the page of memory. The problem is there is only garbage memory there.
So here is my confusion. Where is page pointing to and how do I place data into page via the PCI bus? I assumed that dma_map_page would sort of merge the 2 virtual addresses into 1 so my write into the dma's bus address would collide into the page but this doesn't seem to be the case. What base address should my PCI device be writing from to align into this page of memory?
I'm working on redhat, specifically Centos kernel version 3.10.0 which makes for some problems since redhat kernel is very different from base kernel but hopefully someone can help. Thank you for any pointers.
EDIT: Added dma_sync calls which I forgot to include in original post.
EDIT2: Added a more complete code base. As a note I'm still not including some of the struct definitions or top function calls (like probe for instance), but hopefully this will be a lot more complete. Sorry for how long it is.
//These functions are called during configuration
int acc_setup_rx_resources(struct acc_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
int numa_node = -1;
int size;
size = sizeof(struct acc_rx_buffer) * rx_ring->count;
if (rx_ring->q_vector)
numa_node = rx_ring->q_vector->numa_node;
rx_ring->rx_buffer_info = vzalloc_node(size, numa_node);
if (!rx_ring->rx_buffer_info)
rx_ring->rx_buffer_info = vzalloc(size);
if (!rx_ring->rx_buffer_info)
goto err;
/* Round up to nearest 4K */
rx_ring->size = rx_ring->count * sizeof(union acc_adv_rx_desc);
rx_ring->size = ALIGN(rx_ring->size, 4096);
set_dev_node(dev, numa_node);
rx_ring->desc = dma_alloc_coherent(dev,
rx_ring->size,
&rx_ring->dma,
GFP_KERNEL);
set_dev_node(dev, orig_node);
if (!rx_ring->desc)
rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
&rx_ring->dma, GFP_KERNEL);
if (!rx_ring->desc)
goto err;
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
return 0;
err:
vfree(rx_ring->rx_buffer_info);
rx_ring->rx_buffer_info = NULL;
dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n");
return -ENOMEM;
}
static bool acc_alloc_mapped_page(struct acc_ring *rx_ring,
struct acc_rx_buffer *bi)
{
struct page *page = bi->page;
dma_addr_t dma = bi->dma;
if (likely(page))
return true;
page = dev_alloc_pages(0);
if(unlikely(!page)){
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
/* map page for use */
dma = dma_map_page(rx_ring->dev, page, 0,
acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
if (dma_mapping_error(rx_ring->dev, dma)) {
__free_pages(page, acc_rx_pg_order(rx_ring));
bi->page = NULL;
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
bi->dma = dma;
bi->page = page;
bi->page_offset = 0;
page_ref_add(page, USHRT_MAX - 1); //This seems to exist in redhat kernel but not 3.10 base kernel... keep?
return true;
}
void acc_alloc_rx_buffers(struct acc_ring *rx_ring, u16 cleaned_count)
{
union acc_adv_rx_desc *rx_desc;
struct acc_rx_buffer *bi;
u16 i = rx_ring->next_to_use;
printk(KERN_INFO "acc Attempting to allocate rx buffers\n");
/* nothing to do */
if (!cleaned_count)
return;
rx_desc = ACC_RX_DESC(rx_ring, i);
bi = &rx_ring->rx_buffer_info[i];
i -= rx_ring->count;
do {
if (!acc_alloc_mapped_page(rx_ring, bi)){
printk(KERN_INFO "acc Failed to allocate and map the page to dma\n");
break;
}
printk(KERN_INFO "acc happily allocated and mapped page to dma\n");
/*
* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info.
*/
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
rx_desc++;
bi++; ///Move to the next buffer
i++;
if (unlikely(!i)) {
rx_desc = ACC_RX_DESC(rx_ring, 0);
bi = rx_ring->rx_buffer_info;
i -= rx_ring->count;
}
/* clear the hdr_addr for the next_to_use descriptor */
rx_desc->read.hdr_addr = 0;
cleaned_count--;
} while (cleaned_count);
i += rx_ring->count;
if (rx_ring->next_to_use != i)
acc_release_rx_desc(rx_ring, i);
}
//This function is called via a napi_schedule command which fires when an MSI interrupt is thrown from my PCIe device (all works fine).
int acc_poll(struct napi_struct *napi, int budget)
{
struct acc_q_vector *q_vector =
container_of(napi, struct acc_q_vector, napi);
struct acc_adapter *adapter = q_vector->adapter;
struct acc_ring *ring;
int per_ring_budget;
bool clean_complete = true;
e_dev_info("Landed in acc_poll\n");
e_dev_info("Attempting to read register space 0x00=%x\t0x04=%x\n", \
readl(q_vector->adapter->hw.hw_addr), readl(q_vector->adapter->hw.hw_addr+0x04));
e_dev_info("Attempting to write to pci ctl\n");
e_dev_info("Target address %.8x%.8x\n",q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF);
e_dev_info("Attempted page address %.8x%.8x\n",virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF);
writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr+ACC_PCI_IPCONT_DATA_OFFSET); //These are supposed to be iowrite64 but it seems iowrite64 is different in redhat and only supports the copy function (to,from,size). yay redhat think different.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, q_vector->adapter->hw_region2.hw_addr+0x10+ACC_PCI_IPCONT_DATA_OFFSET);
writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x14+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0xFF00000000000000, q_vector->adapter->hw_region2.hw_addr+0x18+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x20+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x28+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0003344000005500, q_vector->adapter->hw_region2.hw_addr+0x30+ACC_PCI_IPCONT_DATA_OFFSET);
//Send the start command to the block
writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr);
acc_for_each_ring(ring, q_vector->tx)
clean_complete &= !!acc_clean_tx_irq(q_vector, ring);
if (q_vector->rx.count > 1)
per_ring_budget = max(budget/q_vector->rx.count, 1);
else
per_ring_budget = budget;
acc_for_each_ring(ring, q_vector->rx){
e_dev_info("Calling clean_rx_irq\n");
clean_complete &= acc_clean_rx_irq(q_vector, ring,
per_ring_budget);
}
/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;
e_dev_info("Clean complete\n");
/* all work done, exit the polling mode */
napi_complete(napi);
if (adapter->rx_itr_setting & 1)
acc_set_itr(q_vector);
if (!test_bit(__ACC_DOWN, &adapter->state))
acc_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));
e_dev_info("Exiting acc_poll\n");
return 0;
}
static bool acc_clean_rx_irq(struct acc_q_vector *q_vector,
struct acc_ring *rx_ring,
const int budget)
{
printk(KERN_INFO "acc Entered clean_rx_irq\n");
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = acc_desc_unused(rx_ring); /// First pass this is count-1 because ntc and ntu are 0 so this is 512-1=511
printk(KERN_INFO "acc RX irq Clean count = %d\n", cleaned_count);
do {
union acc_adv_rx_desc *rx_desc;
struct sk_buff *skb;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= ACC_RX_BUFFER_WRITE) { //When the clean count is >16 allocate some more buffers to get the clean count down. First pass this happens.
acc_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
rx_desc = ACC_RX_DESC(rx_ring, rx_ring->next_to_clean);
printk(KERN_INFO "acc inside RX do while, acquired description\n");
printk(KERN_INFO "acc Everything I can about the rx_ring desc (acc_rx_buffer). status_error=%d\t \
length=%d\n", rx_desc->wb.upper.status_error, rx_desc->wb.upper.length);
if (!acc_test_staterr(rx_desc, ACC_RXD_STAT_DD))
break;
printk(KERN_INFO "acc inside RX past status_error check\n");
/*
* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* RXD_STAT_DD bit is set
*/
rmb();
/* retrieve a buffer from the ring */
skb = acc_fetch_rx_buffer(rx_ring, rx_desc);
/* exit if we failed to retrieve a buffer */
if (!skb)
break;
printk(KERN_INFO "acc successfully retrieved a buffer\n");
cleaned_count++;
/* place incomplete frames back on ring for completion */
if (acc_is_non_eop(rx_ring, rx_desc, skb))
continue;
/* verify the packet layout is correct */
if (acc_cleanup_headers(rx_ring, rx_desc, skb))
continue;
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
/* populate checksum, timestamp, VLAN, and protocol */
acc_process_skb_fields(rx_ring, rx_desc, skb);
acc_rx_skb(q_vector, skb); ///I believe this sends data to the kernel network stuff and then the generic OS
/* update budget accounting */
total_rx_packets++;
} while (likely(total_rx_packets < budget));
printk(KERN_INFO "acc rx irq exited the while loop\n");
u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.packets += total_rx_packets;
rx_ring->stats.bytes += total_rx_bytes;
u64_stats_update_end(&rx_ring->syncp);
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;
if (cleaned_count)
acc_alloc_rx_buffers(rx_ring, cleaned_count);
printk(KERN_INFO "acc rx irq returning happily\n");
return (total_rx_packets < budget);
}
static struct sk_buff *acc_fetch_rx_buffer(struct acc_ring *rx_ring,
union acc_adv_rx_desc *rx_desc)
{
struct acc_rx_buffer *rx_buffer;
struct sk_buff *skb;
struct page *page;
printk(KERN_INFO "acc Attempting to fetch rx buffer\n");
rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
page = rx_buffer->page; //This page is set by I think acc_add_rx_frag... hard to tell. yes the page is created there and kind of linked to the dma via dma_map_page
prefetchw(page); ///Prefetch the page cacheline for writing
skb = rx_buffer->skb; ///This does the mapping between skb and dma page table I believe.
if (likely(!skb)) {
printk(KERN_INFO "acc attempting to allocate netdrv space for page.\n");
void *page_addr = page_address(page) + //get the virtual page address of this page.
rx_buffer->page_offset;
/* prefetch first cache line of first page */
prefetch(page_addr);
#if L1_CACHE_BYTES < 128
prefetch(page_addr + L1_CACHE_BYTES);
#endif
/* allocate a skb to store the frags */
skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
ACC_RX_HDR_SIZE);
if (unlikely(!skb)) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
return NULL;
}
/*
* we will be copying header into skb->data in
* pskb_may_pull so it is in our interest to prefetch
* it now to avoid a possible cache miss
*/
prefetchw(skb->data);
/*
* Delay unmapping of the first packet. It carries the
* header information, HW may still access the header
* after the writeback. Only unmap it when EOP is
* reached
*/
if (likely((rx_desc, ACC_RXD_STAT_EOP)))
goto dma_sync;
ACC_CB(skb)->dma = rx_buffer->dma;
} else {
if (acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP))
acc_dma_sync_frag(rx_ring, skb);
dma_sync:
/* we are reusing so sync this buffer for CPU use */
printk(KERN_INFO "acc attempting to sync the dma and the device.\n");
dma_sync_single_range_for_cpu(rx_ring->dev, //Sync to the pci device, this dma buffer, at this page offset, this ring, for device to DMA transfer
rx_buffer->dma,
rx_buffer->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
}
/* pull page into skb */
if (acc_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
//This is again temporary to try and create blockers around the problem.
return skb;
/* hand second half of page back to the ring */
acc_reuse_rx_page(rx_ring, rx_buffer);
} else if (ACC_CB(skb)->dma == rx_buffer->dma) {
/* the page has been released from the ring */
ACC_CB(skb)->page_released = true;
} else {
/* we are not reusing the buffer so unmap it */
dma_unmap_page(rx_ring->dev, rx_buffer->dma,
acc_rx_pg_size(rx_ring),
DMA_FROM_DEVICE);
}
/* clear contents of buffer_info */
rx_buffer->skb = NULL;
rx_buffer->dma = 0;
rx_buffer->page = NULL;
printk(KERN_INFO "acc returning from fetch_rx_buffer.\n");
return skb;
}
static bool acc_add_rx_frag(struct acc_ring *rx_ring,
struct acc_rx_buffer *rx_buffer,
union acc_adv_rx_desc *rx_desc,
struct sk_buff *skb)
{
printk(KERN_INFO "acc Attempting to add rx_frag from page.\n");
struct page *page = rx_buffer->page;
unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
#if (PAGE_SIZE < 8192)
unsigned int truesize = acc_rx_bufsz(rx_ring);
#else
unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
unsigned int last_offset = acc_rx_pg_size(rx_ring) -
acc_rx_bufsz(rx_ring);
#endif
if ((size <= ACC_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
printk(KERN_INFO "acc Inside the size check.\n");
unsigned char *va = page_address(page) + rx_buffer->page_offset;
printk(KERN_INFO "page:%p\tpage_address:%p\tpage_offset:%d\n",page,page_address(page),rx_buffer->page_offset);
printk(KERN_INFO "acc First 4 bytes of string:%x %x %x %x\n",va[0],va[1],va[2],va[3]); //FIXME: I can now read this page table but there is still no meaningful data in it. (appear to be reading garbage)
printk(KERN_INFO "acc 32 bytes in:%x %x %x %x\n",va[32],va[33],va[34],va[35]);
return true;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
/* we can reuse buffer as-is, just make sure it is local */
if (likely(page_to_nid(page) == numa_node_id()))
return true;
/* this page cannot be reused so discard it */
put_page(page);
return false;
}
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
rx_buffer->page_offset, size, truesize);
/* avoid re-using remote pages */
if (unlikely(page_to_nid(page) != numa_node_id()))
return false;
#if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
if (unlikely(page_count(page) != 1))
return false;
/* flip page offset to other buffer */
rx_buffer->page_offset ^= truesize;
/*
* since we are the only owner of the page and we need to
* increment it, just set the value to 2 in order to avoid
* an unecessary locked operation
*/
atomic_set(&page->_count, 2);
#else
/* move offset up to the next cache line */
rx_buffer->page_offset += truesize;
if (rx_buffer->page_offset > last_offset)
return false;
/* bump ref count on page before it is given to the stack */
get_page(page);
#endif
return true;
}

Related

ARM32, phys_to_virt, Unable to handle kernel paging request at virtual address

I'm working on implementing a variant of https://apenwarr.ca/log/20190216. Long story short, the main idea is to have a space in memory where to keep informations and to retrieve this information after a soft reboot/panic.
In my case, I just want to keep some variables from a reboot to another. So I've worked on a simple variant of this mechanism to do the job. The code is simply a copy paste from the original patch with some raw adaptations. I've added a syscall to enter kernel mode to execute this code (not shown here).
struct logbits {
int magic; /* needed to verify the memory across reboots */
int state;
int nb_reboot;
};
#define PERSIST_SEARCH_START 0
#ifdef CONFIG_NO_BOOTMEM
#define PERSIST_SEARCH_END 0x5e000000
#else
#define PERSIST_SEARCH_END 0xfe000000
#endif
#define PERSIST_SEARCH_JUMP (4*1024)
#define PERSIST_MAGIC 0xba5eba11
/*
* arm uses one memory model, mips uses another
*/
phys_addr_t physmem_reserve(phys_addr_t size) {
#ifdef CONFIG_NO_BOOTMEM
phys_addr_t alloc;
alloc = memblock_find_in_range_node(size, SMP_CACHE_BYTES,
PERSIST_SEARCH_START, PERSIST_SEARCH_END,
NUMA_NO_NODE);
if (!alloc) return alloc;
if (memblock_reserve(alloc, size)) {
pr_err("info_keeper: memblock_reserve failed\n");
return 0;
}
return alloc;
#else
unsigned long where;
for (where = PERSIST_SEARCH_END - size;
where >= PERSIST_SEARCH_START && where <= PERSIST_SEARCH_END - size;
where -= PERSIST_SEARCH_JUMP) {
if (reserve_bootmem(where, size, BOOTMEM_EXCLUSIVE))
continue;
else
return where;
}
return 0;
#endif
}
struct logbits *log_buf_alloc(char **new_logbuf)
{
char *buf;
phys_addr_t alloc;
unsigned long size = sizeof(struct logbits);
unsigned long full_size = size;
struct logbits *new_logbits;
alloc = physmem_reserve(full_size);
if (alloc) {
printk(KERN_INFO "info_keeper: memory reserved # 0x%08x\n", alloc);
buf = phys_to_virt(alloc);
if(buf){
*new_logbuf = buf;
new_logbits = (void*)buf;
printk(KERN_INFO "info_keeper: memory virtual # 0x%08x\n", buf);
if (new_logbits->magic != PERSIST_MAGIC) {
printk(KERN_INFO "info_keeper: header invalid, " "cleared.\n");
memset(buf, 0, full_size);
memset(new_logbits, 0, sizeof(*new_logbits));
new_logbits->magic = PERSIST_MAGIC;
} else {
printk(KERN_INFO "info_keeper: header valid; " "state=%d\n" "nb_reboot=%d\n", new_logbits->state, new_logbits->nb_reboot);
}
return new_logbits;
}else{
printk(KERN_ERR "info_keeper: failed to get phys to virt");
buf = alloc_bootmem(full_size);
*new_logbuf = buf;
new_logbits = (struct logbits*)(buf);
memset(buf, 0, full_size);
}
} else {
/* replace the buffer */
printk(KERN_ERR "info_keeper: failed to reserve bootmem " "area. disabled.\n");
buf = alloc_bootmem(full_size);
*new_logbuf = buf;
new_logbits = (struct logbits*)(buf);
memset(buf, 0, full_size);
}
return new_logbits;
}
Upon execution, the physmem_reserve function is successful and returns a memory region. Then I get a physical to virtual memory mapping from phys_to_virt. Then, when I try to access the memory, I get this Unable to handle kernel paging request at virtual address error.
Here is a sample output :
[ 42.489639] info_keeper: memory reserved # 0x5dffffc0
[ 42.494781] info_keeper: memory virtual # 0x0dffffc0
[ 42.499778] Unable to handle kernel paging request at virtual address 0dffffc0
Any idea on what is happening ?

Problem with writing from kernel to user space - linux device driver

I'm trying to write a simple Raspberry Pi GPIO driver, with four switches connected to four of the GPIO pins, that reads each switch state. The problem is, I'm not sure how to write from kernel to user space, I'm not getting anything when I insert my device kernel module and try to read the device file with cat command.
The device_read function is as follows:
static ssize_t gpio_driver_read(struct file *filp, char *buf, size_t len, loff_t *f_pos)
{
/* Size of valid data in gpio_driver - data to send in user space. */
int data_size = 0;
/* Counter for 'for' loop. */
int i;
/* Print to kernel space. */
printk(KERN_INFO "Reading active Switch state...\n");
for (i = 0; i < 4; i = i+1)
{
printk(KERN_INFO "Loop number %d...\n", i);
/* TODO: fill gpio_driver_buffer here. */
if (i == 0 && mySwitches[0])
sprintf(gpio_driver_buffer, "gpio_driver: gpio12 value: %d\n", GetGpioPinValue(GPIO_12));
else if (i == 1 && mySwitches[1])
sprintf(gpio_driver_buffer, "gpio_driver: gpio16 value: %d\n", GetGpioPinValue(GPIO_16));
else if (i == 2 && mySwitches[2])
sprintf(gpio_driver_buffer, "gpio_driver: gpio20 value: %d\n", GetGpioPinValue(GPIO_20));
else if (i == 3 && mySwitches[3])
sprintf(gpio_driver_buffer, "gpio_driver: gpio21 value: %d\n", GetGpioPinValue(GPIO_21));
printk(KERN_INFO "%s\n", gpio_driver_buffer);
/* Get size of valid data. */
data_size = strlen(gpio_driver_buffer);
printk(KERN_INFO "%d\n", data_size);
/* Send data to user space. */
if (copy_to_user(buf, gpio_driver_buffer, data_size) != 0)
{
return -EFAULT;
}
}
return 0;
}
gpio_driver_buffer is an array of some default size (I put it to 80).
mySwitches is an array of 4 elements, each one with value 0 or 1 (I'm passing that as an argument when inserting the kernel module, 1 meaning I want to watch the state of the switch and 0 meaning I'm not watching the switch).
GetGpioPinValue is a function that returns switch state.
The problem is, when I try to read the device file with cat command, I'm not getting anything. However, as you can see, I kind of debugged the program with printk commands and everything is written correctly in kernel space. Where could the problem be?
It doesn't look like you are ever writing to the actual file. Since you don't mention how you are generating the file, I'm assuming you are writing to an arbitrary file, not one created by the driver for /proc or something.
Review the post here: Read/write files within a Linux kernel module
You can try this:
int file_write(struct file *file, unsigned long long offset, unsigned char *data, unsigned int size)
{
mm_segment_t oldfs;
int ret;
oldfs = get_fs();
set_fs(get_ds());
ret = vfs_write(file, data, size, &offset);
set_fs(oldfs);
return ret;
}
Then call it instead of 'copy_to_user':
/* Send data to user space. */
if (file_write(filep, 0, gpio_driver_buffer, data_size) != 0)
{
return -EFAULT;
}
Have a look at the sample code here.

Pinning user space buffer for DMA from Linux kernel

I'm writing driver for devices that produce around 1GB of data per second. Because of that I decided to map user buffer allocated by application directly for DMA instead of copying through intermediate kernel buffer.
The code works, more or less. But during long-run stress testing I see kernel oops with "bad page state" initiated by unrelated applications (for instance updatedb), probably when kernel wants to swap some pages:
[21743.515404] BUG: Bad page state in process PmStabilityTest pfn:357518
[21743.521992] page:ffffdf844d5d4600 count:19792158 mapcount:0 mapping: (null) index:0x12b011e012d0132
[21743.531829] flags: 0x119012c01220124(referenced|lru|slab|reclaim|uncached|idle)
[21743.539138] raw: 0119012c01220124 0000000000000000 012b011e012d0132 012e011e011e0111
[21743.546899] raw: 0000000000000000 012101300131011c 0000000000000000 012101240123012b
[21743.554638] page dumped because: page still charged to cgroup
[21743.560383] page->mem_cgroup:012101240123012b
[21743.564745] bad because of flags: 0x120(lru|slab)
[21743.569555] BUG: Bad page state in process PmStabilityTest pfn:357519
[21743.576098] page:ffffdf844d5d4640 count:18219302 mapcount:18940179 mapping: (null) index:0x0
[21743.585318] flags: 0x0()
[21743.587859] raw: 0000000000000000 0000000000000000 0000000000000000 0116012601210112
[21743.595599] raw: 0000000000000000 011301310127012f 0000000000000000 012f011d010d011a
[21743.603336] page dumped because: page still charged to cgroup
[21743.609108] page->mem_cgroup:012f011d010d011a
...
Entering kdb (current=0xffff8948189b2d00, pid 6387) on processor 6 Oops: (null)
due to oops # 0xffffffff9c87f469
CPU: 6 PID: 6387 Comm: updatedb.mlocat Tainted: G B OE 4.10.0-42-generic #46~16.04.1-Ubuntu
...
Details:
The user buffer consists of frames and neither the buffer not the frames are page-aligned. The frames in buffer are used in circular manner for "infinite" live data transfers. For each frame I get memory pages via get_user_pages_fast, then convert it to scatter-gatter table with sg_alloc_table_from_pages and finally map for DMA using dma_map_sg.
I rely on sg_alloc_table_from_pages to bind consecutive pages into one DMA descriptor to reduce size of S/G table sent to device. Devices are custom built and utilize FPGA. I took inspiration from many drivers doing similar mapping, especially video drivers i915 and radeon, but no one has all the stuff on one place so I might overlook something.
Related functions (pin_user_buffer and unpin_user_buffer are called upon separate IOCTLs):
static int pin_user_frame(struct my_dev *cam, struct udma_frame *frame)
{
const unsigned long bytes = cam->acq_frame_bytes;
const unsigned long first =
( frame->uaddr & PAGE_MASK) >> PAGE_SHIFT;
const unsigned long last =
((frame->uaddr + bytes - 1) & PAGE_MASK) >> PAGE_SHIFT;
const unsigned long offset =
frame->uaddr & ~PAGE_MASK;
int nr_pages = last - first + 1;
int err;
int n;
struct page **pages;
struct sg_table *sgt;
if (frame->uaddr + bytes < frame->uaddr) {
pr_err("%s: attempted user buffer overflow!\n", __func__);
return -EINVAL;
}
if (bytes == 0) {
pr_err("%s: user buffer has zero bytes\n", __func__);
return -EINVAL;
}
pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
if (!pages) {
pr_err("%s: can't allocate udma_frame.pages\n", __func__);
return -ENOMEM;
}
sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
if (!sgt) {
pr_err("%s: can't allocate udma_frame.sgt\n", __func__);
err = -ENOMEM;
goto err_alloc_sgt;
}
/* (rw == READ) means read from device, write into memory area */
err = get_user_pages_fast(frame->uaddr, nr_pages, READ == READ, pages);
if (err < nr_pages) {
nr_pages = err;
if (err > 0) {
pr_err("%s: can't pin all %d user pages, got %d\n",
__func__, nr_pages, err);
err = -EFAULT;
} else {
pr_err("%s: can't pin user pages\n", __func__);
}
goto err_get_pages;
}
for (n = 0; n < nr_pages; ++n)
flush_dcache_page(pages[n]); //<--- Is this needed?
err = sg_alloc_table_from_pages(sgt, pages, nr_pages, offset, bytes,
GFP_KERNEL);
if (err) {
pr_err("%s: can't build sg_table for %d pages\n",
__func__, nr_pages);
goto err_alloc_sgt2;
}
if (!dma_map_sg(&cam->pci_dev->dev, sgt->sgl, sgt->nents, DMA_FROM_DEVICE)) {
pr_err("%s: can't map %u sg_table entries for DMA\n",
__func__, sgt->nents);
err = -ENOMEM;
goto err_dma_map;
}
frame->pages = pages;
frame->nr_pages = nr_pages;
frame->sgt = sgt;
return 0;
err_dma_map:
sg_free_table(sgt);
err_alloc_sgt2:
err_get_pages:
for (n = 0; n < nr_pages; ++n)
put_page(pages[n]);
kfree(sgt);
err_alloc_sgt:
kfree(pages);
return err;
}
static void unpin_user_frame(struct my_dev *cam, struct udma_frame *frame)
{
int n;
dma_unmap_sg(&cam->pci_dev->dev, frame->sgt->sgl, frame->sgt->nents,
DMA_FROM_DEVICE);
sg_free_table(frame->sgt);
kfree(frame->sgt);
frame->sgt = NULL;
for (n = 0; n < frame->nr_pages; ++n) {
struct page *page = frame->pages[n];
set_page_dirty_lock(page);
mark_page_accessed(page); //<--- Without this the Oops are more frequent
put_page(page);
}
kfree(frame->pages);
frame->pages = NULL;
frame->nr_pages = 0;
}
static void unpin_user_buffer(struct my_dev *cam)
{
if (cam->udma_frames) {
int n;
for (n = 0; n < cam->udma_frame_count; ++n)
unpin_user_frame(cam, &cam->udma_frames[n]);
kfree(cam->udma_frames);
cam->udma_frames = NULL;
}
cam->udma_frame_count = 0;
cam->udma_buffer_bytes = 0;
cam->udma_buffer = NULL;
cam->udma_desc_count = 0;
}
static int pin_user_buffer(struct my_dev *cam)
{
int err;
int n;
const u32 acq_frame_count = cam->acq_buffer_bytes / cam->acq_frame_bytes;
struct udma_frame *udma_frames;
u32 udma_desc_count = 0;
if (!cam->acq_buffer) {
pr_err("%s: user buffer is NULL!\n", __func__);
return -EFAULT;
}
if (cam->udma_buffer == cam->acq_buffer
&& cam->udma_buffer_bytes == cam->acq_buffer_bytes
&& cam->udma_frame_count == acq_frame_count)
return 0;
if (cam->udma_buffer)
unpin_user_buffer(cam);
udma_frames = kcalloc(acq_frame_count, sizeof(*udma_frames),
GFP_KERNEL | __GFP_ZERO);
if (!udma_frames) {
pr_err("%s: can't allocate udma_frame array for %u frames\n",
__func__, acq_frame_count);
return -ENOMEM;
}
for (n = 0; n < acq_frame_count; ++n) {
struct udma_frame *frame = &udma_frames[n];
frame->uaddr =
(unsigned long)(cam->acq_buffer + n * cam->acq_frame_bytes);
err = pin_user_frame(cam, frame);
if (err) {
pr_err("%s: can't pin frame %d (out of %u)\n",
__func__, n + 1, acq_frame_count);
for (--n; n >= 0; --n)
unpin_user_frame(cam, frame);
kfree(udma_frames);
return err;
}
udma_desc_count += frame->sgt->nents; /* Cannot overflow */
}
pr_debug("%s: total udma_desc_count=%u\n", __func__, udma_desc_count);
cam->udma_buffer = cam->acq_buffer;
cam->udma_buffer_bytes = cam->acq_buffer_bytes;
cam->udma_frame_count = acq_frame_count;
cam->udma_frames = udma_frames;
cam->udma_desc_count = udma_desc_count;
return 0;
}
Related structures:
struct udma_frame {
unsigned long uaddr; /* User address of the frame */
int nr_pages; /* Nr. of pages covering the frame */
struct page **pages; /* Actual pages covering the frame */
struct sg_table *sgt; /* S/G table describing the frame */
};
struct my_dev {
...
u8 __user *acq_buffer; /* User-space buffer received via IOCTL */
...
u8 __user *udma_buffer; /* User-space buffer for image */
u32 udma_buffer_bytes; /* Total image size in bytes */
u32 udma_frame_count; /* Nr. of items in udma_frames */
struct udma_frame
*udma_frames; /* DMA descriptors per frame */
u32 udma_desc_count; /* Total nr. of DMA descriptors */
...
};
Questions:
How to properly pin user buffer pages and mark them as not movable?
If one frame ends and next frame starts in the same page, is it correct to handle it as two independent pages, i.e. pin the page twice?
The data comes from device to user buffer and app is supposed to not write to its buffer, but I have no control over it. Can I use DMA_FROM_DEVICE or rather
use DMA_BIDIRECTIONAL just in case?
Do I need to use something like SetPageReserved/ClearPageReserved or mark_page_reserved/free_reserved_page?
Is IOMMU/swiotlb somehow involved? E.g. i915 driver doesn't use sg_alloc_table_from_pages if swiotlb is active?
What the difference between set_page_dirty, set_page_dirty_lock and SetPageDirty functions?
Thanks for any hint.
PS: I cannot change the way the application gets the data without breaking our library API maintained for many years. So please do not advise e.g. to mmap kernel buffer...
Why do you put "READ == READ" as the third paramter? You need put flag there.
err = get_user_pages_fast(frame->uaddr, nr_pages, READ == READ, pages);
You need put "FOLL_LONGTERM" here, and FOLL_PIN is set by get_user_pages_fast internally. See https://www.kernel.org/doc/html/latest/core-api/pin_user_pages.html#case-2-rdma
In addition, you need take care of cpu and device memory coherence. Just call "dma_sync_sg_for_device(...)" before dma transfer, and "dma_sync_sg_for_cpu(...)" after dma transfer.

ENC28J60 Stops receiving

I'm currently using an stm32f405 and an ENC28J60 and lwip as tcp/ip stack. Everything runs fine at startup but after about a minute or so the ENC stops receiving packets. Transmitting keeps working fine. I've tried both polling it and using interrupts.
I'm using https://github.com/wolfgangr/enc28j60 to communicate to the ENC. And this is the code that handles incoming packets:
while (true) {
eventmask_t mask = chEvtWaitAnyTimeout(ALL_EVENTS, LWIP_PACKET_POLL_INTERVAL);
if(mask & ENC_INTERRUPT_ID)
{
/* Handle ENC28J60 interrupt */
ENC_IRQHandler(&encHandle);
/* Reenable interrupts */
ENC_EnableInterrupts(EIE_INTIE);
}
if (mask & PERIODIC_LINK_TIMER_ID)
{
bool current_link_status = ((encHandle.LinkStatus) & PHSTAT2_LSTAT) != 0;
if (current_link_status != prev_link_status) {
if (current_link_status) {
dhcp_start(&thisif);
}
else {
dhcp_stop(&thisif);
}
}
prev_link_status = current_link_status;
}
/* Check if new frames where received */
struct pbuf *p;
while ((p = low_level_input(&thisif)) != NULL) {
struct eth_hdr *ethhdr = p->payload;
switch (htons(ethhdr->type)) {
/* IP or ARP packet? */
case ETHTYPE_IP:
case ETHTYPE_ARP:
/* full packet send to tcpip_thread to process */
if (tcpip_input(p, &thisif) == ERR_OK)
break;
LWIP_DEBUGF(NETIF_DEBUG, ("ethernetif_input: IP input error\n"));
default:
pbuf_free(p);
}
}
}
Function low_level_input:
static struct pbuf *low_level_input(struct netif *netif) {
struct pbuf *p = NULL;
struct pbuf *q;
uint16_t len;
uint8_t *buffer;
uint32_t bufferoffset = 0;
if (!ENC_GetReceivedFrame(&encHandle)) {
return NULL;
}
/* Obtain the size of the packet and put it into the "len" variable. */
len = encHandle.RxFrameInfos.length;
buffer = (uint8_t *)encHandle.RxFrameInfos.buffer;
if (len > 0)
{
/* We allocate a pbuf chain of pbufs from the Lwip buffer pool */
p = pbuf_alloc(PBUF_RAW, len, PBUF_POOL);
}
if (p != NULL)
{
bufferoffset = 0;
for(q = p; q != NULL; q = q->next)
{
/* Copy data in pbuf */
memcpy( (uint8_t*)((uint8_t*)q->payload), (uint8_t*)((uint8_t*)buffer + bufferoffset), q->len);
bufferoffset = bufferoffset + q->len;
}
}
return p;
}
After a while the function ENC_GetReceivedFrame keeps returning false, even if I know for sure some packets should have been received.
I've debugged the function (found in enc28j60.c) and this line:
pktcnt = enc_rdbreg(handle, ENC_EPKTCNT);
pktcnt is always 0. I've looked at the SPI bus with a logic analyzer and the ENC truly anwsers 0. The SPI bus works fine.
Just before this happens some packets are received that are not flagged as RXSTAT_OK (look at line 1259 in enc28j60.c)
I've been at this for day's now, and truly have no ideas left.
I encountered a similar problem..
The EPKTCNT register was times to times decreased with no reason( without setting the ECON2_PKTDEC bit).
I noticed that when it happened it was after setting the ECON2_AUTOINC bit.
Not every time ECON2_AUTOINC was set but often.
I just set ECON2_AUTOINC at the initialization of the ENC28J60, no more during the reading process.
Since EPKTCNT stopped to decrease with no reason.
Hope it can help

Bus error when reading register using mmap

I developed a kernel module which allocates some kernel memory and remaps it to physical registers of an FPGA when user software opens the device, it also writes in a hardware register which triggers an interruption which is finally used by the probing functions of the kernel to detect the IRQ number at module init, which is 61 in my case. (I got to this point reading the excellent LDD3 book from O'Reilly, but since I'm a newbie in the kernel world I have some trouble getting my driver working well)
Thereby, I am accessing hardware registers from the kernel itself and from the user space using a small soft that I named "regedit". To access the registers from the kernel I used ioremap and I wrote the mmap function to allow regedit to access the registers from the user space using remap_pfn_range.
My first problem is that I suppose there is a better way than separately calling ioremap and remap_pfn_range to do the same thing, but I don't know how to allocate memory, remap it, access it from the kernel and by the same time provide it to the user space.
My second problem is that when I install the module, I see that my driver is able to read and write the registers using ioremap because I successfully detect the IRQ number by triggering an interrupt (by writing the register at offset 0), and when opening the device, my irq handler routine is called and successfully acknowledges the interrupt by writing 0 in the register. But, because there is a but, when I try to read the same register using my soft regedit, I get a bus error.
My guess is that only three registers are physically wired (offsets 0, 4 and 8) and maybe when I think I'm reading a single 32 bits register, the kernel is in fact reading a larger buffer (PAGE_SIZE aligned I presume) and accesses a forbidden area. (To prove that the problem is from my driver I used /dev/mem in my regedit soft, and it's working fine)
I am using a linux kernel 3.12 on a Xilinx Zynq ZC702 board using a processor ARM Cortex A9.
Here is the code of my driver :
Driver Header File
#ifndef DRIVER_H_
#define DRIVER_H_
/* --------------------------------------------------------------
* External References
* ------------------------------------------------------------*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/cdev.h>
#include <linux/moduleparam.h>
/* --------------------------------------------------------------
* Application Includes
* ------------------------------------------------------------*/
/* --------------------------------------------------------------
* Constants Definition
* ------------------------------------------------------------*/
#define MODULE_NAME "mydriver"
#define DEFAULT_MAJOR_NUMBER 0 // If zero, major number will be automatically allocated
#define DEFAULT_MINOR_NUMBER 0
#define NB_DEVICES 1 // Number of devices to register
/*
* Hardware defines
*/
#define NB_PAGES 256 // Number of pages of the memory mapping
#define REG_IRQ 0x43C00000 // IRQ register address
/*
* Modules params
*/
static unsigned int irq_param = 0;
/*
* Kernel module information
*/
MODULE_LICENSE("GPL");
MODULE_AUTHOR("AwaX");
MODULE_VERSION("0.1");
MODULE_ALIAS(MODULE_NAME);
MODULE_DESCRIPTION("Kernel module which handles the hardware interrupts and process them");
module_param(irq_param, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP);
MODULE_PARM_DESC(irq_param, "The IRQ line number to be used");
/* --------------------------------------------------------------
* Macros Definition
* ------------------------------------------------------------*/
#define LOG(kernLvl, str, tag) printk(kernLvl "%-6.6s %s() : "str"\n", tag, (char*) __func__);
#define LOGA(kernLvl, str, tag,...) printk(kernLvl "%-6.6s %s() : "str"\n", tag, (char*) __func__, __VA_ARGS__);
#define LOG_TRACE(str) LOG(KERN_DEBUG, str, "KTRACE")
#define LOG_DEBUG(str) LOG(KERN_INFO, str, "KDEBUG")
#define LOG_INFO(str) LOG(KERN_NOTICE, str, "KINFO")
#define LOG_IT(str) LOG(KERN_NOTICE, str, "IT")
#define LOG_WARN(str) LOG(KERN_WARNING, str, "KWARN")
#define LOG_ERROR(str) LOG(KERN_ERR, str, "KERROR")
#define LOG_FATAL(str) LOG(KERN_ALERT, str, "KFATAL")
#define LOG_TRACE_(str,...) LOGA(KERN_DEBUG, str, "KTRACE", __VA_ARGS__)
#define LOG_DEBUG_(str,...) LOGA(KERN_INFO, str, "KDEBUG", __VA_ARGS__)
#define LOG_INFO_(str,...) LOGA(KERN_NOTICE, str, "KINFO", __VA_ARGS__)
#define LOG_IT_(str,...) LOGA(KERN_NOTICE, str, "IT", __VA_ARGS__)
#define LOG_WARN_(str,...) LOGA(KERN_WARNING, str, "KWARN", __VA_ARGS__)
#define LOG_ERROR_(str,...) LOGA(KERN_ERR, str, "KERROR", __VA_ARGS__)
#define LOG_FATAL_(str,...) LOGA(KERN_ALERT, str, "KFATAL", __VA_ARGS__)
/* --------------------------------------------------------------
* Types Definition
* ------------------------------------------------------------*/
/*
* Internal data structure
*/
typedef struct _module_data {
int major; // Major device number
int minor; // Minor device number
dev_t mmap_dev; // Holds device numbers (major and minor)
struct cdev mmap_cdev; // Kernel internal struct representing the device
int *vmalloc_area; // Pointer to the vmalloc'd area - always page aligned
int *kmalloc_area; // Pointer to the kmalloc'd area, rounded up to a page boundary
void *kmalloc_ptr; // Original pointer for kmalloc'd area as returned by kmalloc
// Mapping
volatile int *map_area; // Base address of the registers kernel memory
volatile void *io_area; // Base address of the registers i/o physical memory
// Interrupts
unsigned int irq; // Interrupt number
} module_data;
/* --------------------------------------------------------------
* Functions Definition
* ------------------------------------------------------------*/
#endif /* DRIVER_H_ */
Driver Source File
/* --------------------------------------------------------------
* External References
* ------------------------------------------------------------*/
#include "driver.h"
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/types.h>
#include <linux/io.h>
#include <linux/errno.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <asm-generic/errno-base.h>
/* --------------------------------------------------------------
* Application Includes
* ------------------------------------------------------------*/
/* --------------------------------------------------------------
* Static data
* ------------------------------------------------------------*/
// Internal data
static module_data *_module;
/* --------------------------------------------------------------
* Local Functions Definition
* ------------------------------------------------------------*/
/*
* Module functions
*/
static int __init my_module_init (void);
static void __exit my_module_cleanup (void);
static int my_module_open (struct inode *inode, struct file *filp);
static int my_module_release (struct inode *inode, struct file *filp);
static int my_module_mmap (struct file *filp, struct vm_area_struct *vma);
/*
* Static functions
*/
static void my_vma_open (struct vm_area_struct *vma);
static void my_vma_close (struct vm_area_struct *vma);
static irqreturn_t my_irq_handler (int irq, void *dev_id, struct pt_regs *regs);
static int my_allocate_device (void);
static int my_register_device (void);
static unsigned int my_probe_irq (void);
static int my_mmap_kmem (struct file *filp, struct vm_area_struct *vma);
static int my_mmap_vmem (struct file *filp, struct vm_area_struct *vma);
/*
* Specifies the functions associated with the device operations.
*/
static struct file_operations _module_fops = {
.owner = THIS_MODULE,
.open = my_module_open,
.release = my_module_release,
.mmap = my_module_mmap,
};
/*
* Specifies the functions associated with the remap operations.
*/
static struct vm_operations_struct _module_vmops = {
.open = my_vma_open,
.close = my_vma_close,
};
/* --------------------------------------------------------------
* Functions Implementation
* ------------------------------------------------------------*/
/*****************************************************************************
* Initialization function of the module which allocates the major and minor
* numbers and registers the device to /proc/devices. The creation of the
* device in /dev must be done by an external script.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int __init my_module_init (void) {
unsigned int irqprobe = 0;
int err = 0;
int i = 0;
LOG_INFO_("Initializing module %s", MODULE_NAME);
LOG_INFO_("Module param : irq_param = %u", irq_param);
/*
* Init internal data
*/
_module = kmalloc(sizeof(module_data), GFP_KERNEL);
memset(_module, 0, sizeof(module_data));
if (_module == NULL) {
goto out;
}
_module->major = DEFAULT_MAJOR_NUMBER;
_module->minor = DEFAULT_MINOR_NUMBER;
_module->map_area = NULL;
_module->io_area = NULL;
_module->irq = irq_param;
/*
* Allocate kmalloc memory
*/
_module->kmalloc_ptr = kmalloc((NB_PAGES + 2) * PAGE_SIZE, GFP_KERNEL);
if (_module->kmalloc_ptr == NULL) {
err = -ENOMEM;
goto out_kfree;
}
// Round it up to the page bondary
_module->kmalloc_area = (int *) ((((unsigned long) _module->kmalloc_ptr) + PAGE_SIZE - 1) & PAGE_MASK);
// Use the kernel memory to access registers from the module
_module->map_area = _module->kmalloc_area;
/*
* Allocate vmalloc memory
*/
_module->vmalloc_area = (int *) vmalloc(NB_PAGES * PAGE_SIZE);
if (_module->vmalloc_area == NULL) {
err = -ENOMEM;
goto out_vfree;
}
/*
* Remap physical addresses
*/
_module->io_area = ioremap(REG_IRQ, NB_PAGES * PAGE_SIZE);
if (_module->io_area == NULL) {
LOG_ERROR_("Physical memory remapping failed (base_addr=%#x, size=%#lx)", REG_IRQ, NB_PAGES * PAGE_SIZE);
goto out_iofree;
}
/*
* Allocates the device numbers
*/
err = my_allocate_device();
if (err) {
LOG_ERROR_("Device allocation failed with code : %d", err);
goto out_unalloc_region;
}
// If no IRQ number has been specified
if (_module->irq <= 0) {
// Probes for an IRQ line number
LOG_INFO("Probing IRQ number...");
irqprobe = my_probe_irq();
if (irqprobe == 0) { // Probe failed
LOG_ERROR("IRQ probing failed : cannot find IRQ number");
} else if (irqprobe < 0) { // Probe error
LOG_ERROR_("IRQ probing failed with error code : %d", err);
} else {
// If an irq number is found
LOG_INFO_("IRQ number detected : %u", irqprobe);
_module->irq = irqprobe;
}
} else { // If an irq number has been specified via a module parameter
LOG_INFO_("IRQ number param specified : irq=%u", _module->irq);
}
// Registers the device making it live immediately
err = my_register_device();
if (err) {
LOG_ERROR_("Device register failed with code : %d", err);
goto out_unregister;
}
LOG_INFO_("Module %s initialized successfully !", MODULE_NAME);
return 0;
/*
* Error fallbacks
*/
out_unregister:
LOG_DEBUG_("Deallocating chrdev for %s", MODULE_NAME);
cdev_del(&_module->mmap_cdev);
// Unreserve the pages
LOG_DEBUG("Unreserving memory pages");
for (i = 0; i < NB_PAGES * PAGE_SIZE; i += PAGE_SIZE) {
SetPageReserved(vmalloc_to_page((void *) (((unsigned long) _module->vmalloc_area) + i)));
SetPageReserved(virt_to_page(((unsigned long )_module->kmalloc_area) + i));
}
out_unalloc_region:
LOG_DEBUG_("Unregistering device %s", MODULE_NAME);
unregister_chrdev_region(_module->mmap_dev, NB_DEVICES);
out_iofree:
iounmap(_module->io_area);
out_vfree:
vfree(_module->vmalloc_area);
out_kfree:
kfree(_module->kmalloc_ptr);
out:
return err;
}
/*****************************************************************************
* Cleanup function of the module which unregisters the major number and
* removes the created device from the system.
*
* #return
* void
*****************************************************************************/
static void __exit my_module_cleanup (void) {
LOG_INFO_("Cleaning up module %s", MODULE_NAME);
// Unregisters a range of device numbers.
unregister_chrdev_region(_module->mmap_dev, NB_DEVICES);
LOG_INFO_("Unregistered device %s", MODULE_NAME);
// Free kernel memory
vfree(_module->vmalloc_area);
kfree(_module->kmalloc_ptr);
kfree(_module);
// Remove the cdev from the system, possibly freeing the structure itself
cdev_del(&_module->mmap_cdev);
LOG_INFO_("Deallocated chrdev for %s", MODULE_NAME);
}
/*****************************************************************************
* Open function of the module which makes this module accessible from the
* user space.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_module_open (struct inode *inode, struct file *filp) {
int err = 0;
// If an interrupt line has been requested
if (_module->irq > 0) {
// Registers the interrupt handler to the kernel
err = request_irq(_module->irq, (irq_handler_t) my_irq_handler, 0, MODULE_NAME, _module);
if (err) {
LOG_ERROR_("%s : Cannot get assigned irq %d, request_irq() failed, code=%d", MODULE_NAME, _module->irq, err);
_module->irq = -1;
return err;
} else {
LOG_INFO_("IRQ number %u assigned successfully to module %s", _module->irq, MODULE_NAME);
}
} else {
LOG_ERROR("Invalid IRQ number : the device will not see the hardware interrupts");
}
LOG_INFO_("%s opened successfully", MODULE_NAME);
return 0;
}
/*****************************************************************************
* Close function of the module which releases the use of this module from
* the user space.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_module_release (struct inode *inode, struct file *filp) {
LOG_INFO_("%s closing...", MODULE_NAME);
// Removes the interrupt handler from kernel
LOG_INFO_("Releasing irq number %u", _module->irq);
free_irq(_module->irq, _module);
LOG_INFO_("%s closed", MODULE_NAME);
return 0;
}
/*****************************************************************************
* Creates a new mapping in the virtual address space of the calling process.
* It makes possible for a user process to access physical memory in the
* kernel space.
*
* #param filp
* The file or device.
* #param vma
* The virtual memory area into which the page range is being
* mapped.
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_module_mmap (struct file *filp, struct vm_area_struct *vma) {
unsigned long start = vma->vm_start; // Virtual address where remapping begins
unsigned long end = vma->vm_end;
unsigned long length = end - start;
unsigned long maxLength = NB_PAGES * PAGE_SIZE;
unsigned long pgoff = vma->vm_pgoff; // PFN of the physAddr to which vAddr is mapped
// Checks length - do not allow larger mappings than the number of pages allocated
if (length > maxLength) {
LOG_ERROR_("Specified virtual memory area is too big : 0x%lx , 0x%lx", length, maxLength);
return -EIO;
}
// At offset 0
if (pgoff == 0) {
// we map the vmalloc'd area
LOG_DEBUG_("Allocating virtual memory, start=%#lx, length=%#lx, pgoff=%#lx", start, length, pgoff);
return my_mmap_vmem(filp, vma);
} else {
// we map the kmalloc'd area
LOG_DEBUG_("Allocating kernel memory, start=%#lx, length=%#lx, pgoff=%#lx", start, length, pgoff);
return my_mmap_kmem(filp, vma);
}
return -EIO;
}
static void my_vma_open (struct vm_area_struct* vma) {
LOG_INFO_("%s VMA open, virtAddr=%#lx, physAddr=%#lx", MODULE_NAME, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT);
}
static void my_vma_close (struct vm_area_struct* vma) {
LOG_INFO_("%s VMA close", MODULE_NAME);
}
/*****************************************************************************
* Interrupt handler which :
* - Reads the registers to know the source of the IT,
* - Clears the IT and wakes up the handler task.
*
* #param irq
* The interrupt number requested.
* #param dev_id
* Pointer to the device structure passed to the function
* request_irq() containing internal data (used when the driver
* manages several instances of the same device).
* #param regs
* Used for debug. Holds a snapshot of the processor's context
* before the processor entered interrupted code.
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static irqreturn_t my_irq_handler (int irq, void *dev_id, struct pt_regs *regs) {
module_data *moduleData = dev_id;
LOG_IT_("%s interrupted, interrupt number = %d", MODULE_NAME, irq);
if (moduleData != NULL) {
// Resets irq
iowrite32(0x0, _module->io_area);
wmb();
} else {
LOG_ERROR("Device structure is NULL");
}
return IRQ_HANDLED;
}
/*****************************************************************************
* Allocates major and minor numbers for this device.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_allocate_device (void) {
int err = 0;
// If a non zero major number is specified
if (_module->major) {
// Updates the dev structure used as input for register_chrdev_region
_module->mmap_dev = MKDEV(_module->major, _module->minor);
// Registers a range of device numbers.
err = register_chrdev_region(_module->mmap_dev, NB_DEVICES, MODULE_NAME);
} else { // If major number is zero, then allocates it dynamically
// Allocates a range of char device numbers chosen dynamically
err = alloc_chrdev_region(&_module->mmap_dev, _module->minor, NB_DEVICES, MODULE_NAME);
_module->major = MAJOR(_module->mmap_dev);
}
// Checks result
if (err) {
LOG_ERROR_("cannot get major number %d", _module->major);
return err;
} else {
LOG_INFO_("Registered device %s : major=%d, minor=%d", MODULE_NAME, _module->major, _module->minor);
}
// Initializes cdev and file operations
cdev_init(&_module->mmap_cdev, &_module_fops);
_module->mmap_cdev.owner = THIS_MODULE;
_module->mmap_cdev.ops = &_module_fops;
return 0;
}
/*****************************************************************************
* Registers the device, makes it live immediately, therefore all
* initialization routines must be done before calling this function.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_register_device (void) {
int err = 0;
// Adds the device to the system making it live immediately
err = cdev_add(&_module->mmap_cdev, _module->mmap_dev, NB_DEVICES);
if (err) {
LOG_ERROR("Could not allocate chrdev");
return err;
} else {
LOG_INFO_("Allocated chrdev for %s", MODULE_NAME);
}
return 0;
}
/*****************************************************************************
* This function uses the probe functions of the kernel to find the irq
* number of the hardware device.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static unsigned int my_probe_irq (void) {
unsigned int irq = -1;
int count = 0;
do {
volatile unsigned long mask;
// Reset the interrupts
iowrite32(0x0, _module->io_area);
wmb(); // Memory barrier
// Start kernel probing
mask = probe_irq_on();
// Trigger all interrupts
iowrite32(0xffffffff, _module->io_area);
wmb(); // Memory barrier
// Wait for it
ndelay(1000);
// Try to find which interrupt occurred
irq = probe_irq_off(mask);
} while (irq < 0 && count++ < 5);
return irq;
}
/*****************************************************************************
* Helper function, mmap's the vmalloc'd area which is not physically
* contiguous.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_mmap_vmem (struct file *filp, struct vm_area_struct *vma) {
unsigned long start = vma->vm_start;
unsigned long end = vma->vm_end;
unsigned long pfn = 0;
long length = end - start;
int ret = 0;
// Check length - do not allow larger mappings than the number of pages allocated
if (length > NB_PAGES * PAGE_SIZE) {
LOG_ERROR_("Specified length (%lu) is larger than the number of pages allocated", length);
return -EIO;
}
// Loop over all pages, map it page individually
while (length > 0) {
pfn = vmalloc_to_pfn(_module->vmalloc_area);
ret = remap_pfn_range(vma, start, pfn, PAGE_SIZE, PAGE_SHARED);
if (ret < 0) {
LOG_ERROR_("remap_pfn_range() failed with error %d, addr=%#lx, offset=%#lx", ret, start, pfn);
return ret;
}
start += PAGE_SIZE;
_module->vmalloc_area += PAGE_SIZE;
length -= PAGE_SIZE;
}
vma->vm_ops = &_module_vmops; // Specifies open_vma() and close_vma() functions
my_vma_open(vma); // Calls explicitely open_vma() as its not done by calling mmap()
return 0;
}
/*****************************************************************************
* Helper function, mmap's the kmalloc'd area which is physically contiguous.
*
* #return
* SUCCESS : 0
* FAILURE : Negative error code.
*****************************************************************************/
static int my_mmap_kmem (struct file *filp, struct vm_area_struct *vma) {
unsigned long start = vma->vm_start;
unsigned long end = vma->vm_end;
long length = end - start;
int ret = 0;
// Check length - do not allow larger mappings than the number of pages allocated
if (length > NB_PAGES * PAGE_SIZE) {
LOG_ERROR_("Specified length (%lu) is larger than the number of pages allocated", length);
return -EIO;
}
// Map the whole physically contiguous area in one piece
//pfn = virt_to_phys((void *) _module->kmalloc_area) >> PAGE_SHIFT;
ret = remap_pfn_range(vma, start, vma->vm_pgoff, length, vma->vm_page_prot);
if (ret < 0) {
return ret;
}
vma->vm_ops = &_module_vmops; // Specifies open_vma() and close_vma() functions
my_vma_open(vma); // Calls explicitely open_vma() as its not done by calling mmap()
return 0;
}
/*
* MANDATORY
*
* Used by kernel to load this module and specifies its entry points.
*/
module_init(my_module_init);
module_exit(my_module_cleanup);

Resources