Pinning user space buffer for DMA from Linux kernel

Pinning user space buffer for DMA from Linux kernel - c

I'm writing driver for devices that produce around 1GB of data per second. Because of that I decided to map user buffer allocated by application directly for DMA instead of copying through intermediate kernel buffer.
The code works, more or less. But during long-run stress testing I see kernel oops with "bad page state" initiated by unrelated applications (for instance updatedb), probably when kernel wants to swap some pages:
[21743.515404] BUG: Bad page state in process PmStabilityTest pfn:357518
[21743.521992] page:ffffdf844d5d4600 count:19792158 mapcount:0 mapping: (null) index:0x12b011e012d0132
[21743.531829] flags: 0x119012c01220124(referenced|lru|slab|reclaim|uncached|idle)
[21743.539138] raw: 0119012c01220124 0000000000000000 012b011e012d0132 012e011e011e0111
[21743.546899] raw: 0000000000000000 012101300131011c 0000000000000000 012101240123012b
[21743.554638] page dumped because: page still charged to cgroup
[21743.560383] page->mem_cgroup:012101240123012b
[21743.564745] bad because of flags: 0x120(lru|slab)
[21743.569555] BUG: Bad page state in process PmStabilityTest pfn:357519
[21743.576098] page:ffffdf844d5d4640 count:18219302 mapcount:18940179 mapping: (null) index:0x0
[21743.585318] flags: 0x0()
[21743.587859] raw: 0000000000000000 0000000000000000 0000000000000000 0116012601210112
[21743.595599] raw: 0000000000000000 011301310127012f 0000000000000000 012f011d010d011a
[21743.603336] page dumped because: page still charged to cgroup
[21743.609108] page->mem_cgroup:012f011d010d011a
...
Entering kdb (current=0xffff8948189b2d00, pid 6387) on processor 6 Oops: (null)
due to oops # 0xffffffff9c87f469
CPU: 6 PID: 6387 Comm: updatedb.mlocat Tainted: G B OE 4.10.0-42-generic #46~16.04.1-Ubuntu
...
Details:
The user buffer consists of frames and neither the buffer not the frames are page-aligned. The frames in buffer are used in circular manner for "infinite" live data transfers. For each frame I get memory pages via get_user_pages_fast, then convert it to scatter-gatter table with sg_alloc_table_from_pages and finally map for DMA using dma_map_sg.
I rely on sg_alloc_table_from_pages to bind consecutive pages into one DMA descriptor to reduce size of S/G table sent to device. Devices are custom built and utilize FPGA. I took inspiration from many drivers doing similar mapping, especially video drivers i915 and radeon, but no one has all the stuff on one place so I might overlook something.
Related functions (pin_user_buffer and unpin_user_buffer are called upon separate IOCTLs):
static int pin_user_frame(struct my_dev *cam, struct udma_frame *frame)
{
const unsigned long bytes = cam->acq_frame_bytes;
const unsigned long first =
( frame->uaddr & PAGE_MASK) >> PAGE_SHIFT;
const unsigned long last =
((frame->uaddr + bytes - 1) & PAGE_MASK) >> PAGE_SHIFT;
const unsigned long offset =
frame->uaddr & ~PAGE_MASK;
int nr_pages = last - first + 1;
int err;
int n;
struct page **pages;
struct sg_table *sgt;
if (frame->uaddr + bytes < frame->uaddr) {
pr_err("%s: attempted user buffer overflow!\n", __func__);
return -EINVAL;
}
if (bytes == 0) {
pr_err("%s: user buffer has zero bytes\n", __func__);
return -EINVAL;
}
pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL | __GFP_ZERO);
if (!pages) {
pr_err("%s: can't allocate udma_frame.pages\n", __func__);
return -ENOMEM;
}
sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
if (!sgt) {
pr_err("%s: can't allocate udma_frame.sgt\n", __func__);
err = -ENOMEM;
goto err_alloc_sgt;
}
/* (rw == READ) means read from device, write into memory area */
err = get_user_pages_fast(frame->uaddr, nr_pages, READ == READ, pages);
if (err < nr_pages) {
nr_pages = err;
if (err > 0) {
pr_err("%s: can't pin all %d user pages, got %d\n",
__func__, nr_pages, err);
err = -EFAULT;
} else {
pr_err("%s: can't pin user pages\n", __func__);
}
goto err_get_pages;
}
for (n = 0; n < nr_pages; ++n)
flush_dcache_page(pages[n]); //<--- Is this needed?
err = sg_alloc_table_from_pages(sgt, pages, nr_pages, offset, bytes,
GFP_KERNEL);
if (err) {
pr_err("%s: can't build sg_table for %d pages\n",
__func__, nr_pages);
goto err_alloc_sgt2;
}
if (!dma_map_sg(&cam->pci_dev->dev, sgt->sgl, sgt->nents, DMA_FROM_DEVICE)) {
pr_err("%s: can't map %u sg_table entries for DMA\n",
__func__, sgt->nents);
err = -ENOMEM;
goto err_dma_map;
}
frame->pages = pages;
frame->nr_pages = nr_pages;
frame->sgt = sgt;
return 0;
err_dma_map:
sg_free_table(sgt);
err_alloc_sgt2:
err_get_pages:
for (n = 0; n < nr_pages; ++n)
put_page(pages[n]);
kfree(sgt);
err_alloc_sgt:
kfree(pages);
return err;
}
static void unpin_user_frame(struct my_dev *cam, struct udma_frame *frame)
{
int n;
dma_unmap_sg(&cam->pci_dev->dev, frame->sgt->sgl, frame->sgt->nents,
DMA_FROM_DEVICE);
sg_free_table(frame->sgt);
kfree(frame->sgt);
frame->sgt = NULL;
for (n = 0; n < frame->nr_pages; ++n) {
struct page *page = frame->pages[n];
set_page_dirty_lock(page);
mark_page_accessed(page); //<--- Without this the Oops are more frequent
put_page(page);
}
kfree(frame->pages);
frame->pages = NULL;
frame->nr_pages = 0;
}
static void unpin_user_buffer(struct my_dev *cam)
{
if (cam->udma_frames) {
int n;
for (n = 0; n < cam->udma_frame_count; ++n)
unpin_user_frame(cam, &cam->udma_frames[n]);
kfree(cam->udma_frames);
cam->udma_frames = NULL;
}
cam->udma_frame_count = 0;
cam->udma_buffer_bytes = 0;
cam->udma_buffer = NULL;
cam->udma_desc_count = 0;
}
static int pin_user_buffer(struct my_dev *cam)
{
int err;
int n;
const u32 acq_frame_count = cam->acq_buffer_bytes / cam->acq_frame_bytes;
struct udma_frame *udma_frames;
u32 udma_desc_count = 0;
if (!cam->acq_buffer) {
pr_err("%s: user buffer is NULL!\n", __func__);
return -EFAULT;
}
if (cam->udma_buffer == cam->acq_buffer
&& cam->udma_buffer_bytes == cam->acq_buffer_bytes
&& cam->udma_frame_count == acq_frame_count)
return 0;
if (cam->udma_buffer)
unpin_user_buffer(cam);
udma_frames = kcalloc(acq_frame_count, sizeof(*udma_frames),
GFP_KERNEL | __GFP_ZERO);
if (!udma_frames) {
pr_err("%s: can't allocate udma_frame array for %u frames\n",
__func__, acq_frame_count);
return -ENOMEM;
}
for (n = 0; n < acq_frame_count; ++n) {
struct udma_frame *frame = &udma_frames[n];
frame->uaddr =
(unsigned long)(cam->acq_buffer + n * cam->acq_frame_bytes);
err = pin_user_frame(cam, frame);
if (err) {
pr_err("%s: can't pin frame %d (out of %u)\n",
__func__, n + 1, acq_frame_count);
for (--n; n >= 0; --n)
unpin_user_frame(cam, frame);
kfree(udma_frames);
return err;
}
udma_desc_count += frame->sgt->nents; /* Cannot overflow */
}
pr_debug("%s: total udma_desc_count=%u\n", __func__, udma_desc_count);
cam->udma_buffer = cam->acq_buffer;
cam->udma_buffer_bytes = cam->acq_buffer_bytes;
cam->udma_frame_count = acq_frame_count;
cam->udma_frames = udma_frames;
cam->udma_desc_count = udma_desc_count;
return 0;
}
Related structures:
struct udma_frame {
unsigned long uaddr; /* User address of the frame */
int nr_pages; /* Nr. of pages covering the frame */
struct page **pages; /* Actual pages covering the frame */
struct sg_table *sgt; /* S/G table describing the frame */
};
struct my_dev {
...
u8 __user *acq_buffer; /* User-space buffer received via IOCTL */
...
u8 __user *udma_buffer; /* User-space buffer for image */
u32 udma_buffer_bytes; /* Total image size in bytes */
u32 udma_frame_count; /* Nr. of items in udma_frames */
struct udma_frame
*udma_frames; /* DMA descriptors per frame */
u32 udma_desc_count; /* Total nr. of DMA descriptors */
...
};
Questions:
How to properly pin user buffer pages and mark them as not movable?
If one frame ends and next frame starts in the same page, is it correct to handle it as two independent pages, i.e. pin the page twice?
The data comes from device to user buffer and app is supposed to not write to its buffer, but I have no control over it. Can I use DMA_FROM_DEVICE or rather
use DMA_BIDIRECTIONAL just in case?
Do I need to use something like SetPageReserved/ClearPageReserved or mark_page_reserved/free_reserved_page?
Is IOMMU/swiotlb somehow involved? E.g. i915 driver doesn't use sg_alloc_table_from_pages if swiotlb is active?
What the difference between set_page_dirty, set_page_dirty_lock and SetPageDirty functions?
Thanks for any hint.
PS: I cannot change the way the application gets the data without breaking our library API maintained for many years. So please do not advise e.g. to mmap kernel buffer...

Why do you put "READ == READ" as the third paramter? You need put flag there.
err = get_user_pages_fast(frame->uaddr, nr_pages, READ == READ, pages);
You need put "FOLL_LONGTERM" here, and FOLL_PIN is set by get_user_pages_fast internally. See https://www.kernel.org/doc/html/latest/core-api/pin_user_pages.html#case-2-rdma
In addition, you need take care of cpu and device memory coherence. Just call "dma_sync_sg_for_device(...)" before dma transfer, and "dma_sync_sg_for_cpu(...)" after dma transfer.

Related

ARM32, phys_to_virt, Unable to handle kernel paging request at virtual address

I'm working on implementing a variant of https://apenwarr.ca/log/20190216. Long story short, the main idea is to have a space in memory where to keep informations and to retrieve this information after a soft reboot/panic.
In my case, I just want to keep some variables from a reboot to another. So I've worked on a simple variant of this mechanism to do the job. The code is simply a copy paste from the original patch with some raw adaptations. I've added a syscall to enter kernel mode to execute this code (not shown here).
struct logbits {
int magic; /* needed to verify the memory across reboots */
int state;
int nb_reboot;
};
#define PERSIST_SEARCH_START 0
#ifdef CONFIG_NO_BOOTMEM
#define PERSIST_SEARCH_END 0x5e000000
#else
#define PERSIST_SEARCH_END 0xfe000000
#endif
#define PERSIST_SEARCH_JUMP (4*1024)
#define PERSIST_MAGIC 0xba5eba11
/*
* arm uses one memory model, mips uses another
*/
phys_addr_t physmem_reserve(phys_addr_t size) {
#ifdef CONFIG_NO_BOOTMEM
phys_addr_t alloc;
alloc = memblock_find_in_range_node(size, SMP_CACHE_BYTES,
PERSIST_SEARCH_START, PERSIST_SEARCH_END,
NUMA_NO_NODE);
if (!alloc) return alloc;
if (memblock_reserve(alloc, size)) {
pr_err("info_keeper: memblock_reserve failed\n");
return 0;
}
return alloc;
#else
unsigned long where;
for (where = PERSIST_SEARCH_END - size;
where >= PERSIST_SEARCH_START && where <= PERSIST_SEARCH_END - size;
where -= PERSIST_SEARCH_JUMP) {
if (reserve_bootmem(where, size, BOOTMEM_EXCLUSIVE))
continue;
else
return where;
}
return 0;
#endif
}
struct logbits *log_buf_alloc(char **new_logbuf)
{
char *buf;
phys_addr_t alloc;
unsigned long size = sizeof(struct logbits);
unsigned long full_size = size;
struct logbits *new_logbits;
alloc = physmem_reserve(full_size);
if (alloc) {
printk(KERN_INFO "info_keeper: memory reserved # 0x%08x\n", alloc);
buf = phys_to_virt(alloc);
if(buf){
*new_logbuf = buf;
new_logbits = (void*)buf;
printk(KERN_INFO "info_keeper: memory virtual # 0x%08x\n", buf);
if (new_logbits->magic != PERSIST_MAGIC) {
printk(KERN_INFO "info_keeper: header invalid, " "cleared.\n");
memset(buf, 0, full_size);
memset(new_logbits, 0, sizeof(*new_logbits));
new_logbits->magic = PERSIST_MAGIC;
} else {
printk(KERN_INFO "info_keeper: header valid; " "state=%d\n" "nb_reboot=%d\n", new_logbits->state, new_logbits->nb_reboot);
}
return new_logbits;
}else{
printk(KERN_ERR "info_keeper: failed to get phys to virt");
buf = alloc_bootmem(full_size);
*new_logbuf = buf;
new_logbits = (struct logbits*)(buf);
memset(buf, 0, full_size);
}
} else {
/* replace the buffer */
printk(KERN_ERR "info_keeper: failed to reserve bootmem " "area. disabled.\n");
buf = alloc_bootmem(full_size);
*new_logbuf = buf;
new_logbits = (struct logbits*)(buf);
memset(buf, 0, full_size);
}
return new_logbits;
}
Upon execution, the physmem_reserve function is successful and returns a memory region. Then I get a physical to virtual memory mapping from phys_to_virt. Then, when I try to access the memory, I get this Unable to handle kernel paging request at virtual address error.
Here is a sample output :
[ 42.489639] info_keeper: memory reserved # 0x5dffffc0
[ 42.494781] info_keeper: memory virtual # 0x0dffffc0
[ 42.499778] Unable to handle kernel paging request at virtual address 0dffffc0
Any idea on what is happening ?

Writing to page mapped dmas in kernel

I've been working on modifying the intel ixgbe kernel driver to function with my PCIe device (FPGA but that's not super important). The kernel and the PCIe device all negotiate quite well, configuration headers are passed along and communication seems to function. However attempting to write DMA_FROM_DEVICE I have a slight problem that I don't understand and I'm hoping for help.
rx_ring->desc = dma_alloc_coherent(dev, ///This function allocates dma space of size size for handle dma on device dev with flag GFP KERNEL
rx_ring->size,
&rx_ring->dma, ///This dma handle may be cast to unsigned integer of the same bus width and given to dev as the DMA base address
GFP_KERNEL);
page = dev_alloc_pages(0);
dma = dma_map_page(rx_ring->dev, page, 0, acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
//Writing to the PCI device the base address to place data into.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
//This will perfectly read data I place onto the PCIe bus.
rx_ring->desc->wb.upper.length
//This seems to read some garbage memory.
dma_sync_single_range_for_cpu(rx_ring->dev,
rx_buffer->dma,
rx_buffer->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
unsigned char *va = page_address(page) + rx_buffer->page_offset;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
//Some code later
dma_sync_single_range_for_device(rx_ring->dev, new_buff->dma,
new_buff->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
I've tried to purge code down to just the points of interest but here's the brief run down. I allocate space for the dma creating the virtual and bus address via the dma_alloc_coherent function. I create a page of memory for the dma and map this page to the dma via the dev_alloc_pages and dma_map_page commands. I pass the dma bus address to my PCIe device so it can write to the proper offset via the writel commands (I know iowrite32 but this is on redhat).
From here there are 2 ways that the origonal ixgbe driver reads data from the PCIe bus. First it directly reads from the dma's allocated virtual address (desc), but this is only used for configuration information (in the driver I am working off of). The second method is via use page_address(page) to I believe get a virtual address for the page of memory. The problem is there is only garbage memory there.
So here is my confusion. Where is page pointing to and how do I place data into page via the PCI bus? I assumed that dma_map_page would sort of merge the 2 virtual addresses into 1 so my write into the dma's bus address would collide into the page but this doesn't seem to be the case. What base address should my PCI device be writing from to align into this page of memory?
I'm working on redhat, specifically Centos kernel version 3.10.0 which makes for some problems since redhat kernel is very different from base kernel but hopefully someone can help. Thank you for any pointers.
EDIT: Added dma_sync calls which I forgot to include in original post.
EDIT2: Added a more complete code base. As a note I'm still not including some of the struct definitions or top function calls (like probe for instance), but hopefully this will be a lot more complete. Sorry for how long it is.
//These functions are called during configuration
int acc_setup_rx_resources(struct acc_ring *rx_ring)
{
struct device *dev = rx_ring->dev;
int orig_node = dev_to_node(dev);
int numa_node = -1;
int size;
size = sizeof(struct acc_rx_buffer) * rx_ring->count;
if (rx_ring->q_vector)
numa_node = rx_ring->q_vector->numa_node;
rx_ring->rx_buffer_info = vzalloc_node(size, numa_node);
if (!rx_ring->rx_buffer_info)
rx_ring->rx_buffer_info = vzalloc(size);
if (!rx_ring->rx_buffer_info)
goto err;
/* Round up to nearest 4K */
rx_ring->size = rx_ring->count * sizeof(union acc_adv_rx_desc);
rx_ring->size = ALIGN(rx_ring->size, 4096);
set_dev_node(dev, numa_node);
rx_ring->desc = dma_alloc_coherent(dev,
rx_ring->size,
&rx_ring->dma,
GFP_KERNEL);
set_dev_node(dev, orig_node);
if (!rx_ring->desc)
rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
&rx_ring->dma, GFP_KERNEL);
if (!rx_ring->desc)
goto err;
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
return 0;
err:
vfree(rx_ring->rx_buffer_info);
rx_ring->rx_buffer_info = NULL;
dev_err(dev, "Unable to allocate memory for the Rx descriptor ring\n");
return -ENOMEM;
}
static bool acc_alloc_mapped_page(struct acc_ring *rx_ring,
struct acc_rx_buffer *bi)
{
struct page *page = bi->page;
dma_addr_t dma = bi->dma;
if (likely(page))
return true;
page = dev_alloc_pages(0);
if(unlikely(!page)){
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
/* map page for use */
dma = dma_map_page(rx_ring->dev, page, 0,
acc_rx_pg_size(rx_ring), DMA_FROM_DEVICE);
if (dma_mapping_error(rx_ring->dev, dma)) {
__free_pages(page, acc_rx_pg_order(rx_ring));
bi->page = NULL;
rx_ring->rx_stats.alloc_rx_page_failed++;
return false;
}
bi->dma = dma;
bi->page = page;
bi->page_offset = 0;
page_ref_add(page, USHRT_MAX - 1); //This seems to exist in redhat kernel but not 3.10 base kernel... keep?
return true;
}
void acc_alloc_rx_buffers(struct acc_ring *rx_ring, u16 cleaned_count)
{
union acc_adv_rx_desc *rx_desc;
struct acc_rx_buffer *bi;
u16 i = rx_ring->next_to_use;
printk(KERN_INFO "acc Attempting to allocate rx buffers\n");
/* nothing to do */
if (!cleaned_count)
return;
rx_desc = ACC_RX_DESC(rx_ring, i);
bi = &rx_ring->rx_buffer_info[i];
i -= rx_ring->count;
do {
if (!acc_alloc_mapped_page(rx_ring, bi)){
printk(KERN_INFO "acc Failed to allocate and map the page to dma\n");
break;
}
printk(KERN_INFO "acc happily allocated and mapped page to dma\n");
/*
* Refresh the desc even if buffer_addrs didn't change
* because each write-back erases this info.
*/
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
rx_desc++;
bi++; ///Move to the next buffer
i++;
if (unlikely(!i)) {
rx_desc = ACC_RX_DESC(rx_ring, 0);
bi = rx_ring->rx_buffer_info;
i -= rx_ring->count;
}
/* clear the hdr_addr for the next_to_use descriptor */
rx_desc->read.hdr_addr = 0;
cleaned_count--;
} while (cleaned_count);
i += rx_ring->count;
if (rx_ring->next_to_use != i)
acc_release_rx_desc(rx_ring, i);
}
//This function is called via a napi_schedule command which fires when an MSI interrupt is thrown from my PCIe device (all works fine).
int acc_poll(struct napi_struct *napi, int budget)
{
struct acc_q_vector *q_vector =
container_of(napi, struct acc_q_vector, napi);
struct acc_adapter *adapter = q_vector->adapter;
struct acc_ring *ring;
int per_ring_budget;
bool clean_complete = true;
e_dev_info("Landed in acc_poll\n");
e_dev_info("Attempting to read register space 0x00=%x\t0x04=%x\n", \
readl(q_vector->adapter->hw.hw_addr), readl(q_vector->adapter->hw.hw_addr+0x04));
e_dev_info("Attempting to write to pci ctl\n");
e_dev_info("Target address %.8x%.8x\n",q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF);
e_dev_info("Attempted page address %.8x%.8x\n",virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF);
writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr+ACC_PCI_IPCONT_DATA_OFFSET); //These are supposed to be iowrite64 but it seems iowrite64 is different in redhat and only supports the copy function (to,from,size). yay redhat think different.
writel(q_vector->adapter->rx_ring[0]->dma >> 32, q_vector->adapter->hw_region2.hw_addr+0x08+ACC_PCI_IPCONT_DATA_OFFSET);
writel(q_vector->adapter->rx_ring[0]->dma & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x0C+ACC_PCI_IPCONT_DATA_OFFSET);
writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) >> 32, q_vector->adapter->hw_region2.hw_addr+0x10+ACC_PCI_IPCONT_DATA_OFFSET);
writel(virt_to_bus(page_address(q_vector->adapter->rx_ring[0]->rx_buffer_info[0].page)) & 0xFFFFFFFF, q_vector->adapter->hw_region2.hw_addr+0x14+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0xFF00000000000000, q_vector->adapter->hw_region2.hw_addr+0x18+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x20+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0000000CC0000000, q_vector->adapter->hw_region2.hw_addr+0x28+ACC_PCI_IPCONT_DATA_OFFSET);
writeq(0x0003344000005500, q_vector->adapter->hw_region2.hw_addr+0x30+ACC_PCI_IPCONT_DATA_OFFSET);
//Send the start command to the block
writeq(0x0000000000000001, q_vector->adapter->hw_region2.hw_addr);
acc_for_each_ring(ring, q_vector->tx)
clean_complete &= !!acc_clean_tx_irq(q_vector, ring);
if (q_vector->rx.count > 1)
per_ring_budget = max(budget/q_vector->rx.count, 1);
else
per_ring_budget = budget;
acc_for_each_ring(ring, q_vector->rx){
e_dev_info("Calling clean_rx_irq\n");
clean_complete &= acc_clean_rx_irq(q_vector, ring,
per_ring_budget);
}
/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;
e_dev_info("Clean complete\n");
/* all work done, exit the polling mode */
napi_complete(napi);
if (adapter->rx_itr_setting & 1)
acc_set_itr(q_vector);
if (!test_bit(__ACC_DOWN, &adapter->state))
acc_irq_enable_queues(adapter, ((u64)1 << q_vector->v_idx));
e_dev_info("Exiting acc_poll\n");
return 0;
}
static bool acc_clean_rx_irq(struct acc_q_vector *q_vector,
struct acc_ring *rx_ring,
const int budget)
{
printk(KERN_INFO "acc Entered clean_rx_irq\n");
unsigned int total_rx_bytes = 0, total_rx_packets = 0;
u16 cleaned_count = acc_desc_unused(rx_ring); /// First pass this is count-1 because ntc and ntu are 0 so this is 512-1=511
printk(KERN_INFO "acc RX irq Clean count = %d\n", cleaned_count);
do {
union acc_adv_rx_desc *rx_desc;
struct sk_buff *skb;
/* return some buffers to hardware, one at a time is too slow */
if (cleaned_count >= ACC_RX_BUFFER_WRITE) { //When the clean count is >16 allocate some more buffers to get the clean count down. First pass this happens.
acc_alloc_rx_buffers(rx_ring, cleaned_count);
cleaned_count = 0;
}
rx_desc = ACC_RX_DESC(rx_ring, rx_ring->next_to_clean);
printk(KERN_INFO "acc inside RX do while, acquired description\n");
printk(KERN_INFO "acc Everything I can about the rx_ring desc (acc_rx_buffer). status_error=%d\t \
length=%d\n", rx_desc->wb.upper.status_error, rx_desc->wb.upper.length);
if (!acc_test_staterr(rx_desc, ACC_RXD_STAT_DD))
break;
printk(KERN_INFO "acc inside RX past status_error check\n");
/*
* This memory barrier is needed to keep us from reading
* any other fields out of the rx_desc until we know the
* RXD_STAT_DD bit is set
*/
rmb();
/* retrieve a buffer from the ring */
skb = acc_fetch_rx_buffer(rx_ring, rx_desc);
/* exit if we failed to retrieve a buffer */
if (!skb)
break;
printk(KERN_INFO "acc successfully retrieved a buffer\n");
cleaned_count++;
/* place incomplete frames back on ring for completion */
if (acc_is_non_eop(rx_ring, rx_desc, skb))
continue;
/* verify the packet layout is correct */
if (acc_cleanup_headers(rx_ring, rx_desc, skb))
continue;
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
/* populate checksum, timestamp, VLAN, and protocol */
acc_process_skb_fields(rx_ring, rx_desc, skb);
acc_rx_skb(q_vector, skb); ///I believe this sends data to the kernel network stuff and then the generic OS
/* update budget accounting */
total_rx_packets++;
} while (likely(total_rx_packets < budget));
printk(KERN_INFO "acc rx irq exited the while loop\n");
u64_stats_update_begin(&rx_ring->syncp);
rx_ring->stats.packets += total_rx_packets;
rx_ring->stats.bytes += total_rx_bytes;
u64_stats_update_end(&rx_ring->syncp);
q_vector->rx.total_packets += total_rx_packets;
q_vector->rx.total_bytes += total_rx_bytes;
if (cleaned_count)
acc_alloc_rx_buffers(rx_ring, cleaned_count);
printk(KERN_INFO "acc rx irq returning happily\n");
return (total_rx_packets < budget);
}
static struct sk_buff *acc_fetch_rx_buffer(struct acc_ring *rx_ring,
union acc_adv_rx_desc *rx_desc)
{
struct acc_rx_buffer *rx_buffer;
struct sk_buff *skb;
struct page *page;
printk(KERN_INFO "acc Attempting to fetch rx buffer\n");
rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
page = rx_buffer->page; //This page is set by I think acc_add_rx_frag... hard to tell. yes the page is created there and kind of linked to the dma via dma_map_page
prefetchw(page); ///Prefetch the page cacheline for writing
skb = rx_buffer->skb; ///This does the mapping between skb and dma page table I believe.
if (likely(!skb)) {
printk(KERN_INFO "acc attempting to allocate netdrv space for page.\n");
void *page_addr = page_address(page) + //get the virtual page address of this page.
rx_buffer->page_offset;
/* prefetch first cache line of first page */
prefetch(page_addr);
#if L1_CACHE_BYTES < 128
prefetch(page_addr + L1_CACHE_BYTES);
#endif
/* allocate a skb to store the frags */
skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
ACC_RX_HDR_SIZE);
if (unlikely(!skb)) {
rx_ring->rx_stats.alloc_rx_buff_failed++;
return NULL;
}
/*
* we will be copying header into skb->data in
* pskb_may_pull so it is in our interest to prefetch
* it now to avoid a possible cache miss
*/
prefetchw(skb->data);
/*
* Delay unmapping of the first packet. It carries the
* header information, HW may still access the header
* after the writeback. Only unmap it when EOP is
* reached
*/
if (likely((rx_desc, ACC_RXD_STAT_EOP)))
goto dma_sync;
ACC_CB(skb)->dma = rx_buffer->dma;
} else {
if (acc_test_staterr(rx_desc, ACC_RXD_STAT_EOP))
acc_dma_sync_frag(rx_ring, skb);
dma_sync:
/* we are reusing so sync this buffer for CPU use */
printk(KERN_INFO "acc attempting to sync the dma and the device.\n");
dma_sync_single_range_for_cpu(rx_ring->dev, //Sync to the pci device, this dma buffer, at this page offset, this ring, for device to DMA transfer
rx_buffer->dma,
rx_buffer->page_offset,
acc_rx_bufsz(rx_ring),
DMA_FROM_DEVICE);
}
/* pull page into skb */
if (acc_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
//This is again temporary to try and create blockers around the problem.
return skb;
/* hand second half of page back to the ring */
acc_reuse_rx_page(rx_ring, rx_buffer);
} else if (ACC_CB(skb)->dma == rx_buffer->dma) {
/* the page has been released from the ring */
ACC_CB(skb)->page_released = true;
} else {
/* we are not reusing the buffer so unmap it */
dma_unmap_page(rx_ring->dev, rx_buffer->dma,
acc_rx_pg_size(rx_ring),
DMA_FROM_DEVICE);
}
/* clear contents of buffer_info */
rx_buffer->skb = NULL;
rx_buffer->dma = 0;
rx_buffer->page = NULL;
printk(KERN_INFO "acc returning from fetch_rx_buffer.\n");
return skb;
}
static bool acc_add_rx_frag(struct acc_ring *rx_ring,
struct acc_rx_buffer *rx_buffer,
union acc_adv_rx_desc *rx_desc,
struct sk_buff *skb)
{
printk(KERN_INFO "acc Attempting to add rx_frag from page.\n");
struct page *page = rx_buffer->page;
unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
#if (PAGE_SIZE < 8192)
unsigned int truesize = acc_rx_bufsz(rx_ring);
#else
unsigned int truesize = ALIGN(size, L1_CACHE_BYTES);
unsigned int last_offset = acc_rx_pg_size(rx_ring) -
acc_rx_bufsz(rx_ring);
#endif
if ((size <= ACC_RX_HDR_SIZE) && !skb_is_nonlinear(skb)) {
printk(KERN_INFO "acc Inside the size check.\n");
unsigned char *va = page_address(page) + rx_buffer->page_offset;
printk(KERN_INFO "page:%p\tpage_address:%p\tpage_offset:%d\n",page,page_address(page),rx_buffer->page_offset);
printk(KERN_INFO "acc First 4 bytes of string:%x %x %x %x\n",va[0],va[1],va[2],va[3]); //FIXME: I can now read this page table but there is still no meaningful data in it. (appear to be reading garbage)
printk(KERN_INFO "acc 32 bytes in:%x %x %x %x\n",va[32],va[33],va[34],va[35]);
return true;
memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
/* we can reuse buffer as-is, just make sure it is local */
if (likely(page_to_nid(page) == numa_node_id()))
return true;
/* this page cannot be reused so discard it */
put_page(page);
return false;
}
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
rx_buffer->page_offset, size, truesize);
/* avoid re-using remote pages */
if (unlikely(page_to_nid(page) != numa_node_id()))
return false;
#if (PAGE_SIZE < 8192)
/* if we are only owner of page we can reuse it */
if (unlikely(page_count(page) != 1))
return false;
/* flip page offset to other buffer */
rx_buffer->page_offset ^= truesize;
/*
* since we are the only owner of the page and we need to
* increment it, just set the value to 2 in order to avoid
* an unecessary locked operation
*/
atomic_set(&page->_count, 2);
#else
/* move offset up to the next cache line */
rx_buffer->page_offset += truesize;
if (rx_buffer->page_offset > last_offset)
return false;
/* bump ref count on page before it is given to the stack */
get_page(page);
#endif
return true;
}

Libusb race condition using asynchronous I/O

I'm using libusb to communicate with a Philips ISP1362 configured as a USB device. I am able to successfully loopback data using Synchronous I/O without any problems. For some reason when using Asynchronous I/O there appears to be a race condition.
I am transferring 64-byte packets using back-to-back OUT-IN transfers. Occasionally when I run my program libusb throws a timeout error and some of the loopback data is lost. When analyzing the USB bus using my Beagle 12 I can see the OUT-IN transactions are out of order (i.e. OUT-OUT-IN-TIMEOUT) when it should be (OUT-IN-OUT-IN).
Update The transfers are appearing out of order in the callback function which is strange because they are not coinciding with what is actually on the bus analyzer.
Example 1: (IN-OUT-IN-OUT)
main(): submitting transfer 0, endpoint 1
main(): submitting transfer 1, endpoint 82
main(): submitting transfer 2, endpoint 1
main(): submitting transfer 3, endpoint 82
xfr_cb(): count 0, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 1, status = 0, endpoint = 1, actual_length = 64, completed = 0
xfr_cb(): count 2, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 3, status = 0, endpoint = 1, actual_length = 64, completed = 0
completed
Example 2: (OUT-IN-IN-OUT)
main(): submitting transfer 0, endpoint 1
main(): submitting transfer 1, endpoint 82
main(): submitting transfer 2, endpoint 1
main(): submitting transfer 3, endpoint 82
xfr_cb(): count 0, status = 0, endpoint = 1, actual_length = 64, completed = 0
xfr_cb(): count 1, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 2, status = 0, endpoint = 82, actual_length = 64, completed = 0
xfr_cb(): count 3, status = 0, endpoint = 1, actual_length = 64, completed = 0
completed
Below is a screenshot from the analyzer:
Below is the code:
#include <stdlib.h>
#include <stdio.h>
#include <libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 4 // number of transfers
#define BYTES EP_SIZE*TRANSFERS
#define TIMEOUT 3*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *devh = NULL;
/* use a global variable to keep the context */
static struct libusb_context *usb_context = NULL;
/* count variable */
int count = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
void xfr_cb(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
/* callback - This is called after the transfer has been received by libusb */
fprintf(stderr, "xfr_cb(): count %d, status = %d, endpoint = %x, actual_length = %d, completed = %d\n",
count,
transfer->status,
transfer->endpoint,
transfer->actual_length,
*completed);
if (transfer->status != LIBUSB_TRANSFER_COMPLETED)
{
/* Error! */
fprintf(stderr, "Error: %s\n", libusb_error_name((int)transfer->status));
}
if (count == TRANSFERS-1)
*completed = 1;
count++;
}
int main(int argc, char **argv)
{
int ep_addr;
int completed = 0;
unsigned char *buf;
size_t length = 64;
int n;
int i;
int rc;
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
devh = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!devh)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* allocate memory */
buf = malloc(length);
/* start with OUT transfer */
ep_addr = ep_out;
/* queue up alternating OUT-IN transfers */
for (i = 0; i < TRANSFERS; i++)
{
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE; n++)
{
buf[n] = i+n;
}
/* Set up the transfer object */
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, devh, ep_addr, buf, EP_SIZE, xfr_cb, &completed, TIMEOUT); /* callback data = &completed */
/* Submit the transfer object */
libusb_submit_transfer(transfer);
fprintf(stderr, "main(): submitting transfer %d, endpoint %x\n", i, ep_addr);
/* alternate writing and reading for loopback */
ep_addr = (ep_addr == ep_out) ? ep_in : ep_out;
}
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc < 0)
{
if (rc == LIBUSB_ERROR_INTERRUPTED)
continue;
fprintf(stderr, "Transfer Error: %s", libusb_error_name(rc));
continue;
}
}
fprintf(stderr, "completed\n");
/* Release the interface */
libusb_release_interface(devh, 0);
/* Close the device handle */
if (devh)
libusb_close(devh);
out:
if (devh)
{
libusb_close(devh);
}
libusb_exit(NULL);
return rc;
}
Update 2 I successfully eliminated the timeout. The cause of the libusb timeout is because the Host was sending two consecutive OUT transactions intermittently on the bus.
Analyzer screenshot:
The following is the working code (no timeouts). Ran these thousands of times with no issues
static void LIBUSB_CALL xfr_cb(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
unsigned char *wbuf, *rbuf;
size_t length = 64;
fprintf(stderr, "xfr_cb(): status = %d, endpoint = %x, actual_length = %d\n",
transfer->status,
transfer->endpoint,
transfer->actual_length);
*completed = 1;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
int ep_addr;
int completed = 0;
unsigned char *buf, *wbuf1, *wbuf2, *rbuf1, *rbuf2;
size_t length = 64;
int n;
int m;
int i;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* allocate memory */
wbuf1 = malloc(length);
wbuf2 = malloc(length);
rbuf1 = malloc(length);
rbuf2 = malloc(length);
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE; n++)
wbuf1[n] = n;
for (m = 0; m < EP_SIZE; m++)
wbuf2[m] = m+1;
struct libusb_transfer *transfer1;
struct libusb_transfer *transfer2;
struct libusb_transfer *transfer3;
struct libusb_transfer *transfer4;
/* Set up the transfer object */
transfer1 = libusb_alloc_transfer(0);
transfer2 = libusb_alloc_transfer(0);
transfer3 = libusb_alloc_transfer(0);
transfer4 = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer1, handle, ep_out, wbuf1, EP_SIZE, xfr_cb, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer2, handle, ep_in, rbuf1, EP_SIZE, xfr_cb, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer3, handle, ep_out, wbuf2, EP_SIZE, xfr_cb, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer4, handle, ep_in, rbuf2, EP_SIZE, xfr_cb, &completed, TIMEOUT); /* callback data = &completed */
/* Submit the transfers */
libusb_submit_transfer(transfer1);
libusb_submit_transfer(transfer2);
libusb_submit_transfer(transfer3);
libusb_submit_transfer(transfer4);
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
}
fprintf(stderr, "completed\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
Changing the code as follows (i.e. callback = NULL for transfer 1-3) re-creates intermittent duplicate transactions, as shown in the screenshots.
libusb_fill_bulk_transfer(transfer1, handle, ep_out, wbuf1, EP_SIZE, NULL, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer2, handle, ep_in, rbuf1, EP_SIZE, NULL, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer3, handle, ep_out, wbuf2, EP_SIZE, NULL, NULL, TIMEOUT);
libusb_fill_bulk_transfer(transfer4, handle, ep_in, rbuf2, EP_SIZE, xfr_cb, &completed, TIMEOUT); /* callback data = &completed */
I honestly don't understand why the loop would cause race conditions based on their documentation and examples. Queueing up multiple transfers is actually suggested in one of the libusb examples (sam3u_benchmark.c) and also demonstrated (using loops) in the following .pdfs.
See asynchronous I/O sections:
https://www.socallinuxexpo.org/sites/default/files/presentations/scale_2017_usb.pdf
http://www.signal11.us/oss/elc2014/elc_2014_usb_0.pdf
From my understanding, the use of libusb_handle_events_completed(NULL, &completed) is supposed to resolve synchronization issues. Am I misunderstanding something?
See libusb_handle_events() from multiple threads
http://libusb.sourceforge.net/api-1.0/libusb_mtasync.html
-"This is why libusb-1.0.9 introduces the new libusb_handle_events_timeout_completed() and libusb_handle_events_completed() functions, which handles doing the completion check for you after they have acquired the lock:"
What they need are crystal clear examples of how to use their API if this is the case.
I can add more event checking but something does not seem right here.
Update 3: See accepted answer.

I started reading the documentation in the libusb source code and understood what was happening.
Particularly the section about how libusb deals with packet sizes:
http://libusb.sourceforge.net/api-1.0/libusb_packetoverflow.html
After reading that it clicked for me and I found two ways to accomplish a loopback test with large data size using asynchronous I/O.
The first way is submitting two transfers consecutively with transfer->buffer containing the entire data structure (i.e. total bytes to send and receive). The second way is submitting the two transfers with transfer->buffer containing wMaxPacketSize (e.g. 64-bytes) and having the out and in callback functions submit additional transfers to transceive the rest of the data.
For the second case, extra code needed to be added to keep track of the number of transfers and to set the completed signal when finished. The OUT-IN packet interleaving is handled by libusb and the OS - which was the part I didn't realize. In other words, not every OUT-IN transfer needed to be specified and queued individually.
Here is the asynchronous code along with the transfer rates to my USB device (ISP1362). My USB device controller is an FPGA coded in pure SystemVerilog.
Note: Regarding the transfer rates, I only have double-buffering enabled on BULK_EP_IN. I am assuming the IN-NAK's (# POLL) and transfer rate would improve in the second approach if double-buffering was enabled on BULK_EP_OUT. So this may not be a fair comparison due to device configuration.
First approach: ~1.161 MB/s (~9.288 Mb/s)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include </usr/include/libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 1024*768*3/EP_SIZE // number of transfers
#define TIMEOUT 10*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *handle = NULL;
/* count variables */
unsigned int count = 0;
unsigned int count_in = 0;
unsigned int count_out = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
/* Write and Read buffers */
unsigned char wbuf[EP_SIZE*TRANSFERS];
unsigned char wbuf_tmp[EP_SIZE*TRANSFERS];
unsigned char rbuf[EP_SIZE*TRANSFERS];
unsigned char rbuf_tmp[EP_SIZE*TRANSFERS];
static void LIBUSB_CALL xfr_cb_out(struct libusb_transfer *transfer )
{
memcpy(wbuf+count_out*EP_SIZE, transfer->buffer, EP_SIZE);
}
static void LIBUSB_CALL xfr_cb_in(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
memcpy(rbuf+count_in*EP_SIZE, transfer->buffer, EP_SIZE);
count_in++; // one transfer complete
if (count_in < TRANSFERS)
*completed = 1;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
int completed = 0;
size_t length = 64;
int n;
int m;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* fill the buffer with incrementing data */
for (n = 0; n < TRANSFERS; n++)
{
for (m = 0; m < EP_SIZE; m++)
{
wbuf_tmp[m+n*EP_SIZE] = m+n;
}
}
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_out, wbuf_tmp, EP_SIZE*TRANSFERS, xfr_cb_out, NULL, TIMEOUT);
libusb_submit_transfer(transfer);
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_in, rbuf_tmp, EP_SIZE*TRANSFERS, xfr_cb_in, &completed, TIMEOUT);
libusb_submit_transfer(transfer);
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
}
fprintf(stderr, "completed\n");
int res;
res = memcmp(rbuf, wbuf, sizeof(wbuf));
if (res != 0)
fprintf(stderr, "miscompare\n");
else
fprintf(stderr, "success\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
Second approach: ~755.9 MB/s (~6.047 Mb/s)
include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include </usr/include/libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 1024*768*3/EP_SIZE // number of transfers
#define TIMEOUT 10*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *handle = NULL;
/* count variables */
unsigned int count = 0;
unsigned int count_in = 0;
unsigned int count_out = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
/* Write and Read buffers */
unsigned char wbuf[EP_SIZE*TRANSFERS];
unsigned char *wbuf_tmp;
unsigned char rbuf[EP_SIZE*TRANSFERS];
unsigned char rbuf_tmp[EP_SIZE*TRANSFERS];
static void LIBUSB_CALL xfr_cb_out(struct libusb_transfer *transfer )
{
memcpy(wbuf+count_out*EP_SIZE, transfer->buffer, EP_SIZE);
count_out++; // one transfer complete
if (count_out < TRANSFERS)
{
transfer->buffer = ++wbuf_tmp;
libusb_submit_transfer(transfer);
}
}
static void LIBUSB_CALL xfr_cb_in(struct libusb_transfer *transfer )
{
int *completed = transfer->user_data;
memcpy(rbuf+count_in*EP_SIZE, transfer->buffer, EP_SIZE);
count_in++; // one transfer complete
if (count_in < TRANSFERS)
libusb_submit_transfer(transfer);
else
*completed = 1;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
int completed = 0;
size_t length = 64;
int n;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* allocate memory */
wbuf_tmp = malloc(length*TRANSFERS);
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE*TRANSFERS; n++)
{
wbuf_tmp[n] = n;
}
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_out, wbuf_tmp, EP_SIZE, xfr_cb_out, NULL, TIMEOUT);
libusb_submit_transfer(transfer);
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_in, rbuf_tmp, EP_SIZE, xfr_cb_in, &completed, TIMEOUT);
libusb_submit_transfer(transfer);
/* Handle Events */
while (!completed)
{
rc = libusb_handle_events_completed(NULL, &completed);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
}
fprintf(stderr, "completed\n");
int res;
res = memcmp(rbuf, wbuf, sizeof(wbuf));
if (res != 0)
fprintf(stderr, "miscompare\n");
else
fprintf(stderr, "success\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}

Update: See accepted answer.
The following is an example using Synchronous I/O. I had a lot of trouble getting the transactions to come out in the expected order using Asynchronous I/O. I assume this was due to transfers racing with each other as #Gene had mentioned.
The main gripe I have about the libusb API is the lack of examples to illustrate proper use. The API would lead someone to believe that asynchronous transactions are placed on the bus in the order they are "submitted" and from what I gather this is not true. This functionality would be fine for submitting transactions with all the same packet TOKEN (i.e. OUT or IN).
The following code works for large bulk transfers.
Using Synchronous I/O
#include <stdlib.h>
#include <stdio.h>
#include <libusb-1.0/libusb.h>
/* Change VENDOR_ID and PRODUCT_ID depending on device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define BYTES 1024*768*3 // bytes
#define EP_SIZE 64 // bytes
#define TIMEOUT 5*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *devh = NULL;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in_addr = 0x82;
static int ep_out_addr = 0x01;
int write_chars(unsigned char * data, int length)
{
/* To send a char to the device simply initiate a bulk_transfer to the Endpoint
* with the address ep_out_addr.
*/
int actual_length;
int rc = libusb_bulk_transfer(devh, ep_out_addr, data, length, &actual_length, TIMEOUT);
if (rc < 0)
{
fprintf(stderr, "Error while sending char: %d\n", rc);
return -1;
}
return actual_length;
}
int read_chars(unsigned char * data, int length)
{
/* To receive characters from the device initiate a bulk_transfer to the Entpoint
* with address ep_in_addr
*/
int actual_length;
int rc = libusb_bulk_transfer(devh, ep_in_addr, data, length, &actual_length, TIMEOUT);
if (rc == LIBUSB_ERROR_TIMEOUT)
{
printf("timeout (%d)\n", actual_length);
return -1;
}
else if (rc < 0)
{
fprintf(stderr, "Error while waiting for char: %d\n", rc);
return -1;
}
return actual_length;
}
int main(int argc, char **argv)
{
int rc;
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
devh = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!devh)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* We can now start sending or receiving data to the device */
unsigned char buf[BYTES];
unsigned char rbuf[EP_SIZE];
int len;
int n;
int l;
int res;
// fill buffer
for (n = 0; n < BYTES; n++)
{
buf[n] = 0x00+n;
}
// loopback data, write-read
for (l = 0; l < BYTES/EP_SIZE; l++)
{
len = write_chars(buf+l*EP_SIZE, EP_SIZE);
len = read_chars(rbuf, EP_SIZE);
res = memcmp(rbuf, buf+l*EP_SIZE, sizeof(rbuf));
if (res != 0)
fprintf(stderr, "Miscompare: block %d\n", l);
}
libusb_release_interface(devh, 0);
out:
if (devh)
{
libusb_close(devh);
}
libusb_exit(NULL);
return rc;
}
Using Asynchronous and Synchronous together (i.e. OUT is submitted Asynchronously and IN is Synchronous)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include </usr/include/libusb-1.0/libusb.h>
/* Specify VENDOR_ID and PRODUCT_ID for device */
#define VENDOR_ID 0x0471
#define PRODUCT_ID 0x3630
/* Define number of bytes to transfer */
#define EP_SIZE 64 // bytes
#define TRANSFERS 1024*768*3/EP_SIZE // number of transfers
#define BYTES EP_SIZE*TRANSFERS
#define TIMEOUT 15*1000 // milliseconds
/* Use a global variable to keep the device handle */
static struct libusb_device_handle *handle = NULL;
/* count variable */
unsigned int count_out = 0;
/* The Endpoint addresses are hard-coded. You should use libusb -v to find
* the values corresponding to device
*/
static int ep_in = 0x82;
static int ep_out = 0x01;
unsigned char rbuf[EP_SIZE*TRANSFERS];
unsigned char wbuf[EP_SIZE*TRANSFERS];
static void LIBUSB_CALL xfr_cb_out(struct libusb_transfer *transfer )
{
memcpy(wbuf+count_out*EP_SIZE, transfer->buffer, EP_SIZE);
count_out++;
}
int main(int argc, char **argv)
{
const struct libusb_version *version;
unsigned char *buf, *rbuf_tmp;
size_t length = 64;
int n;
int i;
int rc;
/* Get libusb version */
version = libusb_get_version();
fprintf(stderr, "libusb version: %d.%d.%d.%d\n", version->major, version->minor, version->micro, version->nano);
/* Initialize libusb */
rc = libusb_init(NULL);
if (rc < 0)
{
fprintf(stderr, "Error Initializing libusb: %s\n", libusb_error_name(rc));
exit(1);
}
/* Set debugging output to max level */
libusb_set_debug(NULL, 3);
/* Look for a specific device and open it */
handle = libusb_open_device_with_vid_pid(NULL, VENDOR_ID, PRODUCT_ID);
if (!handle)
{
fprintf(stderr, "Error finding USB device\n");
goto out;
}
/* claim interface */
rc = libusb_claim_interface(handle, 0);
if (rc < 0)
{
fprintf(stderr, "Error claiming interface.\n");
goto out;
}
/* allocate memory */
buf = malloc(length*TRANSFERS);
/* fill the buffer with incrementing data */
for (n = 0; n < EP_SIZE*TRANSFERS; n++)
{
buf[n] = n;
}
/* allocate memory */
rbuf_tmp = malloc(length);
/* set up alternating OUT-IN transfers */
for (i = 0; i < TRANSFERS; i++)
{
struct libusb_transfer *transfer;
transfer = libusb_alloc_transfer(0);
libusb_fill_bulk_transfer(transfer, handle, ep_out, buf+i, EP_SIZE, xfr_cb_out, NULL, TIMEOUT);
libusb_submit_transfer(transfer);
int actual_length;
int rc = libusb_bulk_transfer(handle, ep_in, rbuf_tmp, EP_SIZE, &actual_length, TIMEOUT);
if (rc != LIBUSB_SUCCESS)
{
fprintf(stderr, "Transfer Error: %s\n", libusb_error_name(rc));
break;
}
memcpy(rbuf+i*EP_SIZE, rbuf_tmp, EP_SIZE);
}
fprintf(stderr, "completed\n");
int res;
res = memcmp(rbuf, wbuf, sizeof(wbuf));
if (res != 0)
fprintf(stderr, "miscompare\n");
//* Release the interface */
libusb_release_interface(handle, 0);
/* Close the device handle */
if (handle)
libusb_close(handle);
out:
if (handle)
{
libusb_close(handle);
}
libusb_exit(NULL);
return rc;
}
The above code was an experiment to see if performance increased. Interestingly, the speed difference between the two was negligible.
The version of libusb was 1.0.17.10830

Data bus error when using ioread32 in PCIE driver

I am developing a PCIE device driver for Openwrt, which is also a linux system. Here is a weird situation. After Initializing the driver in probe function, I can read(by ioread32) correct data (preset value:123456) from the buffer address obtained from ioremap_nocache. Then when I try to read it every 1 second in periodic timer interrupt handler, the function ioread32 will crash and the serial console presents a Data bus error. Below are code details.
// content of function my_driver_request_mem, this function is called in probe function
int my_driver_request_mem(struct gps_time *gt) {
u32 start, len;
int ret;
int bar = 0;
u32 flags;
ret = pcim_iomap_regions(gt->pdev, BIT(0), "My Driver");
if (ret) {
gt_log("Fail to request IO mem: err: %d\n", ret);
return ret;
}
// gt is a custom struct, and gt->pdev is the pci_dev struct
// obtained from probe function
start = pci_resource_start(gt->pdev, bar);
len = pci_resource_len(gt->pdev, bar);
flags = pci_resource_flags(gt->pdev, bar);
printk(KERN_ALERT "region start: 0x%x, len: %u\n", start, len);
printk(KERN_ALERT "region flags: 0x%x\n", flags);
gt->buffer = ioremap_nocache(start, len);
gt->buffer_len = len;
gt->buffer_start = start;
return 0;
}
Afte the function above is invoked, I read data through gt->buffer:
u32 d = 0;
d = ioread32(gt->buffer); // this operation does not cause fatal error
printk(KERN_ALERT "initial value is: %u", d);
By reading the console output, the ioread32 here is successful, and the right value 123456 is printed. Then I start a timer to read data multiple times
setup_timer(&gt->g_timer, _gps_timer_handler, gt);
mod_timer(&gt->g_timer, jiffies + msecs_to_jiffies(20000));
printk(KERN_ALERT "GPS_TIME: timer created.\n");
The handler function is quit simple:
void _gps_timer_handler(unsigned long data) {
struct gps_time *gt = (struct gps_time*)data;
u32 d;
d = ioread32(gt->buffer); // fatal error in this line
printk(KERN_ALERT "Value: %u\n", d);
mod_timer(&gt->g_timer, jiffies + msecs_to_jiffies(1000));
}
The ioread32 here will cause a fatal error here, and the error info is:
Data bus error, epc == 82db8030, ra == 8009a000
Oops[#1]
CPU: 0 PID: 853 Comm: dropbearkey Tainted: G W 4.4.14 #2
task: 82dd1900 ti:8209a000 task.ti: 8209a000
...
(bunch of numbers)
...
Status: 1100d403 KERNEL EXL IE
Cause : 8080001c (ExcCode 07)
PrId: 0001974c (MIPS 74Kc)
...
First I though this is because IO process should not be done in interrupt context, so I put ioread32 in a tasklet, and invoke that tasklet by tasklet_schedule, but it still fails.
======== Update
A. Below are my ->probe function:
static int gps_time_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int ret = 0;
struct gps_time *gt;
u8 tmp = 0;
u32 bar_val = 0;
u8 csz = 0;
unsigned long start, end, len, flag;
int bar = 0;
if (gps_time_global_time) {
printk(KERN_ALERT "GPS_TIME: more than one device detected\n");
return -1;
}
gt = gps_time_alloc();
if (gt == NULL) {
printk(KERN_WARNING "GPS_TIME: out of memory\n");
return -ENOMEM;
}
gt->pdev = pdev;
gt->irq = pdev->irq;
ret = pcim_enable_device(pdev);
if (ret) {
printk(KERN_ALERT "GPS_TIME: Fail to enable device %d\n", ret);
goto err;
}
ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
if (ret) {
printk(KERN_WARNING "GPS_TIME: 32-bit DMA not available\n");
return ret;
}
ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
if (ret) {
printk(KERN_WARNING "GPS_TIME: 32-bit DMA consistent DMA enable failed\n");
return ret;
}
my_driver_request_mem(gt);
ret = pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &tmp);
if (ret) {
printk(KERN_ALERT "GPS_TIME: Fail to read cache line size\n");
goto err;
}
if (tmp == 0) {
printk(KERN_ALERT "GPS_TIME: Write pci cache line size\n");
pci_write_config_byte(
pdev, PCI_CACHE_LINE_SIZE, L1_CACHE_BYTES / sizeof(u32));
}
pci_write_config_byte(pdev, PCI_LATENCY_TIMER, 0xa8);
pci_set_master(pdev);
pci_set_drvdata(pdev, gt);
// This function is very simple. I just create timer here. The first ioread32 is also included in this function.
gps_time_init_device(gt);
ret = request_irq(pdev->irq, gps_time_isq, IRQF_SHARED, "gps_time", gt);
if (ret) {
printk(KERN_ALERT "GPS_TIME: Fail to request IRQ: %d", ret);
goto err_irq;
}
return 0;
err_region:
pci_release_regions(pdev);
err_irq:
err:
gps_time_free(gt);
return ret;
}
B. More info about the device:
This device is a self-designed chip with PCIE interface. It is built around a Altera Cyclone IV FPGA. The firmware in the chip does nothing except writing constant 123456 into its memory.

Unable to find MBR type

I have this code which is part of a project source.
This code finds the MBR type: GRUB or LILO, and accordingly sets a flag.
Surprisingly in SLES 10-SP1 (SUSE Linux Enterprise Server), it is unable to determine.
/dev/sda1 is my swap.
/dev/sda2 is where the whole / is there, including the MBR.
Same code works for SLES11 and others.
Here MBR_SIZE is #defined to 0x1be.
int lnxfsGetBootType(int pNumber)
{
int i, retval = -1, ccode;
PartInfo *p = &cpuParts[pNumber];
char buffer[SECTOR_SIZE];
var64 offset = 0;
isdLogFileOut(ZISD_LOG_DEVELOPER,"[lnxGBT]\n");
if (getenv("ZENDEVICE") || gUtilPart == 1) {
offset = p->pOffset; // look at the partition BPB
}
//Now try to find the installed boot loader...
lseek64(p->handle, (var64)offset, SEEK_SET); // either MBR or BPB
ccode = read(p->handle, buffer, SECTOR_SIZE);
for (i=0; i<MBR_SIZE-4;i++) {
if (strncmp(&buffer[i], "LILO", 4) == 0) {
if (offset == 0){
retval = FLAG_LNXFS_LILO;
isdLogFileOut(ZISD_LOG_WARNING,"\tLILO MBR found on %s\n",p->header.deviceName);
} else {
retval = FLAG_LNXFS_LILO; // 10.31.06 _BPB;
isdLogFileOut(ZISD_LOG_WARNING,"\tLILO BPB found on %s\n",p->header.deviceName);
}
}
if (strncmp(&buffer[i], "GRUB", 4) == 0) {
if (offset == 0){
retval = FLAG_LNXFS_GRUB;
isdLogFileOut(ZISD_LOG_WARNING,"\tGRUB MBR found on %s\n",p->header.deviceName);
} else {
retval = FLAG_LNXFS_GRUB; // 10.31.06 _BPB;
isdLogFileOut(ZISD_LOG_WARNING,"\tGRUB BPB found on %s\n",p->header.deviceName);
}
}
}
if (retval == -1) {
isdLogFileOut(ZISD_LOG_WARNING,"\tLILO or GRUB mbr/bpb not found on %s\n",p->header.deviceName);
}
return retval;
} // lnxfsGetBootType
Here partinfo, is a struct of partition type:
//Data structure used internally by the image engine to store information about the
//partitions. It encapsulates the PartHeader struct, whcih is used to store partition
//information in image archives
typedef struct _PartInfo
{
PartHeader header;
int handle; //file handle for reading/writing physical device
var32 flags; //Various flags as needed. Defined above.
var64 pOffset; //offset to partition from start of physical device
int deviceNumber; //index into 'devices' where this partition's
// physical device is located
int archIndex; //for restoring only. Index into imgParts of the
// archive partition this physical partition is
// mapped to
int bytesWritten; //track number of sectors written so the device-level
// cache can be flushed
void *info; //partition-type-specific info struct
/* snip */
The testing is being done with different virtual disk images under VMWare. I've confirmed the disks are formatted with MBR and not GPT.

I'm not sure what you mean when you say it doesn't work. If your point is that your code returns -1, could you show us a copy of the MBR? You can use this command to capture it:
sudo dd if=/dev/sda bs=512 count=1 | xxd
You mention that your MBR is on /dev/sda2. That is very unusual indeed. If you mean that that is where the boot code is installed, that's a totally different thing. The MBR is always held on the first sector of the disk (assuming it is a DOS-format MBR).
I suppose it's possible that the problem in some of the failure cases is a seek failure or a short read. I've made some tweaks to add error handling and simplify a bit.
#define MBR_SIZE 0x1be
int lnxfsGetBootType(int pNumber)
{
int retval = -1, ccode;
PartInfo *p = &cpuParts[pNumber];
char buffer[SECTOR_SIZE];
off64_t offset = 0;
void *plilo, *pgrub;
const char *what = "MBR";
isdLogFileOut(ZISD_LOG_DEVELOPER,"[lnxGBT]\n");
if (getenv("ZENDEVICE") || gUtilPart == 1) {
offset = p->pOffset; // look at the partition BPB
what = "BPB";
}
// Now try to find the installed boot loader...
if (lseek64(p->handle, offset, SEEK_SET) == -1) {
isdLogFileOut(ZISD_LOG_ERROR,"\tFailed to seek to %s: %s\n", what, strerror(errno));
return -1;
}
ccode = read(p->handle, buffer, SECTOR_SIZE);
if (ccode != SECTOR_SIZE) {
isdLogFileOut(ZISD_LOG_ERROR,"\tFailed to read BPB/MBR: %s\n",
strerror(errno));
return -1;
}
plilo = memmem(buffer, ccode, "LILO", 4);
pgrub = memmem(buffer, ccode, "GRUB", 4);
if (plilo) {
retval = FLAG_LNXFS_LILO;
if (pgrub && pgrub < plilo)
retval = FLAG_LNXFS_GRUB;
}
} else if (pgrub) {
retval = FLAG_LNXFS_GRUB;
}
if (-1 == retval) {
isdLogFileOut(ZISD_LOG_WARNING,"\tLILO or GRUB %s not found on %s\n", what, p->header.deviceName);
} else {
isdLogFileOut(ZISD_LOG_WARNING,"\t%s %s not found on %s\n",
(retval == FLAG_LNXFS_GRUB ? "GRUB" : "LILO"),
what, p->header.deviceName);
}
return retval;
} // lnxfsGetBootType