Linux DMA: Using the DMAengine for scatter-gather transactions

Linux DMA: Using the DMAengine for scatter-gather transactions - c

I try to use the DMAengine API from a custom kernel driver to perform a scatter-gather operation. I have a contiguous memory region as source and I want to copy its data in several distributed buffers through a scatterlist structure. The DMA controller is the PL330 one that supports the DMAengine API (see PL330 DMA controller).
My test code is the following:
In my driver header file (test_driver.h):
#ifndef __TEST_DRIVER_H__
#define __TEST_DRIVER_H__
#include <linux/platform_device.h>
#include <linux/device.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>
#include <linux/dmaengine.h>
#include <linux/of_dma.h>
#define SG_ENTRIES 3
#define BUF_SIZE 16
#define DEV_BUF 0x10000000
struct dma_block {
void * data;
int size;
};
struct dma_private_info {
struct sg_table sgt;
struct dma_block * blocks;
int nblocks;
int dma_started;
struct dma_chan * dma_chan;
struct dma_slave_config dma_config;
struct dma_async_tx_descriptor * dma_desc;
dma_cookie_t cookie;
};
struct test_platform_device {
struct platform_device * pdev;
struct dma_private_info dma_priv;
};
#define _get_devp(tdev) (&((tdev)->pdev->dev))
#define _get_dmapip(tdev) (&((tdev)->dma_priv))
int dma_stop(struct test_platform_device * tdev);
int dma_start(struct test_platform_device * tdev);
int dma_start_block(struct test_platform_device * tdev);
int dma_init(struct test_platform_device * tdev);
int dma_exit(struct test_platform_device * tdev);
#endif
In my source that contains the dma functions (dma_functions.c):
#include <linux/slab.h>
#include "test_driver.h"
#define BARE_RAM_BASE 0x10000000
#define BARE_RAM_SIZE 0x10000000
struct ram_bare {
uint32_t * __iomem map;
uint32_t base;
uint32_t size;
};
static void dma_sg_check(struct test_platform_device * tdev)
{
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct device * dev = _get_devp(tdev);
uint32_t * buf;
unsigned int bufsize;
int nwords;
int nbytes_word = sizeof(uint32_t);
int nblocks;
struct ram_bare ramb;
uint32_t * p;
int i;
int j;
ramb.map = ioremap(BARE_RAM_BASE,BARE_RAM_SIZE);
ramb.base = BARE_RAM_BASE;
ramb.size = BARE_RAM_SIZE;
dev_info(dev,"nblocks: %d \n",dma_priv->nblocks);
p = ramb.map;
nblocks = dma_priv->nblocks;
for( i = 0 ; i < nblocks ; i++ ) {
buf = (uint32_t *) dma_priv->blocks[i].data;
bufsize = dma_priv->blocks[i].size;
nwords = dma_priv->blocks[i].size/nbytes_word;
dev_info(dev,"block[%d],size %d: ",i,bufsize);
for ( j = 0 ; j < nwords; j++, p++) {
dev_info(dev,"DMA: 0x%x, RAM: 0x%x",buf[j],ioread32(p));
}
}
iounmap(ramb.map);
}
static int dma_sg_exit(struct test_platform_device * tdev)
{
struct dma_private_info * dma_priv = _get_dmapip(tdev);
int ret = 0;
int i;
for( i = 0 ; i < dma_priv->nblocks ; i++ ) {
kfree(dma_priv->blocks[i].data);
}
kfree(dma_priv->blocks);
sg_free_table(&(dma_priv->sgt));
return ret;
}
int dma_stop(struct test_platform_device * tdev)
{
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct device * dev = _get_devp(tdev);
int ret = 0;
dma_unmap_sg(dev,dma_priv->sgt.sgl,\
dma_priv->sgt.nents, DMA_FROM_DEVICE);
dma_sg_exit(tdev);
dma_priv->dma_started = 0;
return ret;
}
static void dma_callback(void * param)
{
enum dma_status dma_stat;
struct test_platform_device * tdev = (struct test_platform_device *) param;
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct device * dev = _get_devp(tdev);
dev_info(dev,"Checking the DMA state....\n");
dma_stat = dma_async_is_tx_complete(dma_priv->dma_chan,\
dma_priv->cookie, NULL, NULL);
if(dma_stat == DMA_COMPLETE) {
dev_info(dev,"DMA complete! \n");
dma_sg_check(tdev);
dma_stop(tdev);
} else if (unlikely(dma_stat == DMA_ERROR)) {
dev_info(dev,"DMA error! \n");
dma_stop(tdev);
}
}
static void dma_busy_loop(struct test_platform_device * tdev)
{
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct device * dev = _get_devp(tdev);
enum dma_status status;
int status_change = -1;
do {
status = dma_async_is_tx_complete(dma_priv->dma_chan, dma_priv->cookie, NULL, NULL);
switch(status) {
case DMA_COMPLETE:
if(status_change != 0)
dev_info(dev,"DMA status: COMPLETE\n");
status_change = 0;
break;
case DMA_PAUSED:
if (status_change != 1)
dev_info(dev,"DMA status: PAUSED\n");
status_change = 1;
break;
case DMA_IN_PROGRESS:
if(status_change != 2)
dev_info(dev,"DMA status: IN PROGRESS\n");
status_change = 2;
break;
case DMA_ERROR:
if (status_change != 3)
dev_info(dev,"DMA status: ERROR\n");
status_change = 3;
break;
default:
dev_info(dev,"DMA status: UNKNOWN\n");
status_change = -1;
break;
}
} while(status != DMA_COMPLETE);
dev_info(dev,"DMA transaction completed! \n");
}
static int dma_sg_init(struct test_platform_device * tdev)
{
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct scatterlist *sg;
int ret = 0;
int i;
ret = sg_alloc_table(&(dma_priv->sgt), SG_ENTRIES, GFP_ATOMIC);
if(ret)
goto out_mem2;
dma_priv->nblocks = SG_ENTRIES;
dma_priv->blocks = (struct dma_block *) kmalloc(dma_priv->nblocks\
*sizeof(struct dma_block), GFP_ATOMIC);
if(dma_priv->blocks == NULL)
goto out_mem1;
for( i = 0 ; i < dma_priv->nblocks ; i++ ) {
dma_priv->blocks[i].size = BUF_SIZE;
dma_priv->blocks[i].data = kmalloc(dma_priv->blocks[i].size, GFP_ATOMIC);
if(dma_priv->blocks[i].data == NULL)
goto out_mem3;
}
for_each_sg(dma_priv->sgt.sgl, sg, dma_priv->sgt.nents, i)
sg_set_buf(sg,dma_priv->blocks[i].data,dma_priv->blocks[i].size);
return ret;
out_mem3:
i--;
while(i >= 0)
kfree(dma_priv->blocks[i].data);
kfree(dma_priv->blocks);
out_mem2:
sg_free_table(&(dma_priv->sgt));
out_mem1:
ret = -ENOMEM;
return ret;
}
static int _dma_start(struct test_platform_device * tdev,int block)
{
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct device * dev = _get_devp(tdev);
int ret = 0;
int sglen;
/* Step 1: Allocate and initialize the SG list */
dma_sg_init(tdev);
/* Step 2: Map the SG list */
sglen = dma_map_sg(dev,dma_priv->sgt.sgl,\
dma_priv->sgt.nents, DMA_FROM_DEVICE);
if(! sglen)
goto out2;
/* Step 3: Configure the DMA */
(dma_priv->dma_config).direction = DMA_DEV_TO_MEM;
(dma_priv->dma_config).src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
(dma_priv->dma_config).src_maxburst = 1;
(dma_priv->dma_config).src_addr = (dma_addr_t) DEV_BUF;
dmaengine_slave_config(dma_priv->dma_chan, \
&(dma_priv->dma_config));
/* Step 4: Prepare the SG descriptor */
dma_priv->dma_desc = dmaengine_prep_slave_sg(dma_priv->dma_chan, \
dma_priv->sgt.sgl, dma_priv->sgt.nents, DMA_DEV_TO_MEM, \
DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
if (dma_priv->dma_desc == NULL) {
dev_err(dev,"DMA could not assign a descriptor! \n");
goto out1;
}
/* Step 5: Set the callback method */
(dma_priv->dma_desc)->callback = dma_callback;
(dma_priv->dma_desc)->callback_param = (void *) tdev;
/* Step 6: Put the DMA descriptor in the queue */
dma_priv->cookie = dmaengine_submit(dma_priv->dma_desc);
/* Step 7: Fires the DMA transaction */
dma_async_issue_pending(dma_priv->dma_chan);
dma_priv->dma_started = 1;
if(block)
dma_busy_loop(tdev);
return ret;
out1:
dma_stop(tdev);
out2:
ret = -1;
return ret;
}
int dma_start(struct test_platform_device * tdev) {
return _dma_start(tdev,0);
}
int dma_start_block(struct test_platform_device * tdev) {
return _dma_start(tdev,1);
}
int dma_init(struct test_platform_device * tdev)
{
int ret = 0;
struct dma_private_info * dma_priv = _get_dmapip(tdev);
struct device * dev = _get_devp(tdev);
dma_priv->dma_chan = dma_request_slave_channel(dev, \
"dma_chan0");
if (dma_priv->dma_chan == NULL) {
dev_err(dev,"DMA channel busy! \n");
ret = -1;
}
dma_priv->dma_started = 0;
return ret;
}
int dma_exit(struct test_platform_device * tdev)
{
int ret = 0;
struct dma_private_info * dma_priv = _get_dmapip(tdev);
if(dma_priv->dma_started) {
dmaengine_terminate_all(dma_priv->dma_chan);
dma_stop(tdev);
dma_priv->dma_started = 0;
}
if(dma_priv->dma_chan != NULL)
dma_release_channel(dma_priv->dma_chan);
return ret;
}
In my driver source file (test_driver.c):
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/version.h>
#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/of_device.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/interrupt.h>
#include "test_driver.h"
static int dma_block=0;
module_param_named(dma_block, dma_block, int, 0444);
static struct test_platform_device tdev;
static struct of_device_id test_of_match[] = {
{ .compatible = "custom,test-driver-1.0", },
{}
};
static int test_probe(struct platform_device *op)
{
int ret = 0;
struct device * dev = &(op->dev);
const struct of_device_id *match = of_match_device(test_of_match, &op->dev);
if (!match)
return -EINVAL;
tdev.pdev = op;
dma_init(&tdev);
if(dma_block)
ret = dma_start_block(&tdev);
else
ret = dma_start(&tdev);
if(ret) {
dev_err(dev,"Error to start DMA transaction! \n");
} else {
dev_info(dev,"DMA OK! \n");
}
return ret;
}
static int test_remove(struct platform_device *op)
{
dma_exit(&tdev);
return 0;
}
static struct platform_driver test_platform_driver = {
.probe = test_probe,
.remove = test_remove,
.driver = {
.name = "test-driver",
.owner = THIS_MODULE,
.of_match_table = test_of_match,
},
};
static int test_init(void)
{
platform_driver_register(&test_platform_driver);
return 0;
}
static void test_exit(void)
{
platform_driver_unregister(&test_platform_driver);
}
module_init(test_init);
module_exit(test_exit);
MODULE_AUTHOR("klyone");
MODULE_DESCRIPTION("DMA SG test module");
MODULE_LICENSE("GPL");
However, the DMA never calls my callback function and I do not have any idea why it happens. Maybe, I am misunderstanding something...
Could anyone help me?
Thanks in advance.

Caveat: I don't have a definitive solution for you, but merely some observations and suggestions on how to debug this [based on many years of experience writing/debugging linux device drivers].
I presume you believe the callback is not being done because you don't get any printk messages. But, the callback is the only place that has them. But, is the printk level set high enough to see the messages? I'd add a dev_info to your module init, to prove it prints as expected.
Also, you [probably] won't get a callback if dma_start doesn't work as expected, so I'd add some dev_info calls there, too (e.g. before and after the call in step 7). I also notice that not all calls in dma_start check error returns [may be fine or void return, just mentioning in case you missed one]
At this point, it should be noted that there are really two questions here: (1) Did your DMA request start successfully [and complete]? (2) Did you get a callback?
So, I'd split off some code from dma_complete into (e.g.) dma_test_done. The latter does the same checking but only prints the "complete" message. You can call this in a poll mode to verify DMA completion.
So, if you [eventually] get a completion, then the problem reduces to why you didn't get the callback. If, however, you don't [even] get a completion, that's an even more fundamental problem.
This reminds me. You didn't show any code that calls dma_start or how you wait for the completion. I presume that if your callback were working, it would issue a wakeup of some sort that the base level would wait on. Or, the callback would do the request deallocate/cleanup (i.e. more code you'd write)
At step 7, you're calling dma_async_issue_pending, which should call pl330_issue_pending. pl330_issue_pending will call pl330_tasklet.
pl330_tasklet is a tasklet function, but it can also be called directly [to kick off DMA when there are no active requests].
pl330_tasklet will loop on its "work" queue and move any completed items to its "completed" queue. It then tries to start new requests. It then loops on its completed queue and issues the callbacks.
pl330_tasklet grabs the callback pointer, but if it's null it is silently ignored. You've set a callback, but it might be good to verify that where you set the callback is the same place [or propagates to] the place where pl330_tasklet will fetch it from.
When you make the call, everything may be busy, so there are no completed requests, no room to start a new request, so nothing to complete. In that case, pl330_tasklet will be called again later.
So, when dma_async_issue_pending returns, nothing may have happened yet. This is quite probable for your case.
pl330_tasklet tries to start new DMA by calling fill_queue. It will check that a descriptor is not [already] busy by looking at status != BUSY. So, you may wish to verify that yours has the correct value. Otherwise, you'd never get a callback [or even any DMA start].
Then, fill_queue will try to start the request via pl330_submit_req. But, that can return an error (e.g. queue already full), so, again, things are deferred.
For reference, notice the following comment at the top of pl330_submit_req:
Submit a list of xfers after which the client wants notification.
Client is not notified after each xfer unit, just once after all
xfer units are done or some error occurs.
What I'd do is start hacking up pl330.c and add debug messages and cross-checking. If your system is such that pl330 is servicing many other requests, you might limit the debug messages by checking that the device's private data pointer matches yours.
In particular, you'd like to get a message when your request actually gets started, so you could add a debug message to the end of pl330_submit_req
Then, adding messages within pl330_tasklet for requests will help, too.
Those are two good starting points. But, don't be afraid to add more printk calls as needed. You may be surprised by what gets called [or doesn't get called] or in what order.
UPDATE:
If I install the kernel module with the blocking behaviour, everything is initialized well. However, the dma_busy_loop function shows that the DMA descriptor is always IN PROGESS and the DMA transaction never completes. For this reason, the callback function is not executed. What could be happening?
Did a little more research. Cookies are just sequence numbers that increment. For example, if you issue a request that gets broken up into [say] 10 separate scatter/gather operations [descriptors], each one gets a unique cookie value. The cookie return value is the latest/last of the bunch (e.g. 10).
When you're calling (1) dma_async_is_tx_complete, (2) it calls chan->device->device_tx_status, (3) which is pl330_tx_status, (4) which calls dma_cookie_status
Side note/tip: When I was tracking this down, I just kept flipping back and forth between dmaengine.h and pl330.c. It was like: Look at (1), it calls (2). Where is that set? In pl330.c, I presume. So, I grepped for the string and got the name of pl330's function (i.e. (3)). So, I go there, and see that it does (4). So ... Back to dmaengine.h ...
However, when you make the outer call, you're ignoring [setting to NULL] the last two arguments. These can be useful because they return the "last" and "used" cookies. So, even if you don't get full completion, these values could change and show partial progress.
One of them should eventually be >= to the "return" cookie value. (i.e.) The entire operation should be complete. So, this will help differentiate what may be happening.
Also, note that in dmaengine.h, right below dma_async_is_tx_complete, there is dma_async_is_complete. This function is what decides whether to return DMA_COMPLETE or DMA_IN_PROGRESS, based on the cookie value you pass and the "last" and "used" cookie values. It's passive, and not used in the code path [AFAICT], but it does show how to calculate completion yourself.

Related

Alternative to global arrays in C multithreading environment?

does anyone know about an elegant (efficient) alternative to using large global arrays in C for an embedded system, whereby the array is written to in an interrupt service routine and it is read elsewhere asynchronously:
I have no issues with the current implementation, however I was just wondering if it is the best option.
for example:
uint8_t array_data[20] = {0};
volatile bool data_ready = false;
someIsr(void){
for(uint8_t i = 0; i < 20; i++){
array_data[i] = some_other_data[i];
}
data_ready = true;
}
main(void){
for(;;){
if(data_ready){
write_data_somewhere(&array_data[0]);
data_ready = false;
}
}
}
Thanks

Using global arrays is often the best approach unless one would need to use the storage for other purposes when the interrupt routine isn't running. An alternative is to use a global pointer to data that may be stored elsewhere, but one must be very cautious changing that pointer while interrupts are enabled.
An important caveat with your code, by the way: although the Standard regards the implications of a volatile qualifier as implementation-defined, allowing for the possibility that implementations may treat a volatile write as a potential "memory clobber", the authors of gcc require the use of compiler-specific intrinsics to prevent operations on "ordinary" objects from being reordered across operations on volatile-qualified ones.
For example, given:
volatile unsigned short out_count;
int *volatile out_ptr;
int buffer[10];
__attribute__((noinline))
void do_write(int *p, unsigned short count)
{
__asm("");
out_ptr = p;
out_count = count;
do {} while(out_count);
__asm("");
}
void test(void)
{
buffer[0] = 10;
buffer[1] = 20;
do_write(buffer, 2);
buffer[0] = 30;
buffer[1] = 40;
buffer[2] = 50;
do_write(buffer, 3);
}
because the __asm intrinsics don't use gcc-specific syntax to indicate that they might "clobber" the contents of memory in ways the compiler can't understand (even though many compilers support the use of empty __asm intrinsics for that express purpose, and such intrinsics wouldn't really serve any other purpose), and because gcc can see that there's no way that do_write could alter the contents of buffer, it "optimizes out" the code that would store the values 10 and 20 into buffer before the first call to do_write.
Clang doesn't seem quite as bad as gcc. It doesn't seem to reorder writes across volatile writes, it seems to refrain from reordering reads across functions that are not in-line expanded, and it seems to treat empty asm directives as potential memory clobbers, but I I'm not familiar enough with its documentation to know whether such restraint is by design, or merely a consequence of "missed optimizations" which might be "fixed" in future versions.

Consider using translation unit scope rather than global scope. That is declare the array static in the translation unit in which it is used. That translation unit should contain in this case the ISR that writes the data and an access function to read the data. Anything else, including main() should be in other translation units in order that that do not have direct access to the array:
#include <stdbool.h>
#include <stdint.h>
static volatile uint8_t array_data[DATA_LEN] = {0};
static volatile bool data_ready = false;
void someIsr(void)
{
for(uint8_t i = 0; i < 20; i++)
{
array_data[i] = some_other_data[i];
}
data_ready = true;
}
bool getdata( char* dest )
{
bool new_data = data_ready ;
if( data_ready )
{
memcpy( desr, array_data, sizeof(array_data) ) ;
data_ready = false ;
}
}
Then main() in some other translation unit might have:
#include "mydevice.h"
int main( void )
{
uint8_t somewhare[DATA_LEN] = {0};
for(;;)
{
if( getdata( somewhere ) )
{
// process new data
}
}
}
The above is based on your example, and the aim here is to isolate the array so that outside of the ISR the access is enforced to be read-only. In practice it is likely that you will need a "safer" data structure or access method such as a critical-section, double-buffering or a ring buffer so that the data can be accessed without risk of it being modified while it is being read.
This is no less efficient that your original global access, it is simply a restriction of the visibility and accessibility of the array.

As I mentioned in my top comment, one of best ways is to implement a ring queue.
Although I done a few ring queue implementations, here's one I just cooked up for illustration purposes. It is a [cheap] simulation of an Rx ISR for a uart [which is fairly common in embedded systems].
It is fairly complete, but I've not debugged it, so it may have some issues with the queue index calculations.
Anyway, here's the code:
// queue.c -- a ring queue
#include <stdlib.h>
#include <unistd.h>
enum {
QMAX = 1024
};
typedef unsigned char qdata_t; // queue data item
typedef struct {
int qenq; // index for enqueue
int qdeq; // index for dequeue
int qmax; // maximum number of elements in queue
int qover; // number of queue overflows
qdata_t *qbuf; // pointer to queue's buffer
} queue_t;
queue_t *rxisr_q; // pointer to Rx qeueue
// cli -- disable interrupts
void
cli(void)
{
}
// sti -- enable interrupts
void
sti(void)
{
}
// uart_ready -- uart is ready (has Rx data available)
int
uart_ready(void)
{
int rval = rand();
rval = ((rval % 100) > 95);
return rval;
}
// uart_getc -- get character from uart receiver
int
uart_getc(void)
{
int rval = rand();
rval &= 0xFF;
return rval;
}
// qwrap -- increment and wrap queue index
int
qwrap(queue_t *que,int qidx,int inc)
{
int qmax = que->qmax;
qidx += inc;
if (inc > 0) {
if (qidx >= qmax)
qidx -= qmax;
}
else {
if (qidx < 0)
qidx += qmax;
}
return qidx;
}
// qavail_total -- total amount of space available (for enqueue)
int
qavail_total(queue_t *que)
{
int qlen;
qlen = que->qdeq - que->qenq;
if (qlen < 0)
qlen += que->qmax;
qlen -= 1;
return qlen;
}
// qavail_contig -- total amount of space available (for enqueue) [contiguous]
int
qavail_contig(queue_t *que)
{
int qlen;
qlen = que->qdeq - que->qenq;
if (qlen < 0)
qlen = que->qmax - que->qenq;
qlen -= 1;
return qlen;
}
// qready_total -- total amount of space filled (for dequeue)
int
qready_total(queue_t *que)
{
int qlen;
qlen = que->qenq - que->qdeq;
if (qlen < 0)
qlen += que->qmax;
return qlen;
}
// qready_contig -- total amount of space filled (for dequeue) [contiguous]
int
qready_contig(queue_t *que)
{
int qlen;
qlen = que->qenq - que->qdeq;
if (qlen < 0)
qlen = que->qmax - que->qdeq;
return qlen;
}
// qfull -- is queue full?
int
qfull(queue_t *que)
{
int next;
next = qwrap(que,que->qenq,1);
return (next == que->qdeq);
}
// qpush -- push single value
int
qpush(queue_t *que,qdata_t chr)
{
int qenq = que->qenq;
int qnxt;
int push;
qnxt = qwrap(que,qenq,1);
push = (qnxt != que->qdeq);
if (push) {
que->qbuf[qenq] = chr;
que->qenq = qnxt;
}
return push;
}
// qalloc -- allocate a queue
queue_t *
qalloc(int qmax)
{
queue_t *que;
que = calloc(1,sizeof(*que));
que->qbuf = calloc(qmax,sizeof(qdata_t));
return que;
}
// uart_rx_isr -- ISR for uart receiver
void
uart_rx_isr(void)
{
int chr;
queue_t *que;
que = rxisr_q;
while (uart_ready()) {
chr = uart_getc();
#if 0
if (qfull(que)) {
++que->qover;
break;
}
#endif
if (! qpush(que,chr)) {
++que->qover;
break;
}
}
}
int
main(int argc,char **argv)
{
int qlen;
int qdeq;
queue_t *que;
rxisr_q = qalloc(QMAX);
que = rxisr_q;
while (1) {
cli();
qlen = qready_contig(que);
if (qlen > 0) {
qdeq = que->qdeq;
write(1,&que->qbuf[qdeq],qlen);
que->qdeq = qwrap(que,qdeq,qlen);
}
sti();
}
return 0;
}

How to use the ARM PMU in GEM5?

I had a problem initializing the PMU in gem5 for an arm full system with the starter_fs.py in --cpu hpi.
i followed the instructions of this post Using perf_event with the ARM PMU inside gem5 and i managed to solve my problem. I added the patch and configure the system. I am not using perf. I try to access directly the registers and read them. As i see GEM5 has only some register events implemented. Can we add the others as well as :
for example EXC_TAKEN is not implemented. Is the following the way to add them?
self.addEvent(ProbeEvent(self,0x09, cpu, "EXC_TAKEN"))
#0x09: EXC_TAKEN ???
Also, reading the pmu event registers i manage to read them and extract the events but the pmccntr cycle register always returns zero? How gem5 increments this register? What are the steps to read the cycle reggister?
a code that i use to read using perf is the following:
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#define NUM_NODES 100
#define NONE 9999
struct _NODE
{
int iDist;
int iPrev;
};
typedef struct _NODE NODE;
struct _QITEM
{
int iNode;
int iDist;
int iPrev;
struct _QITEM *qNext;
};
typedef struct _QITEM QITEM;
QITEM *qHead = NULL;
int AdjMatrix[NUM_NODES][NUM_NODES];
int g_qCount = 0;
NODE rgnNodes[NUM_NODES];
int ch;
int iPrev, iNode;
int i, iCost, iDist;
void print_path (NODE *rgnNodes, int chNode)
{
if (rgnNodes[chNode].iPrev != NONE)
{
//print_path(rgnNodes, rgnNodes[chNode].iPrev);
}
//printf (" %d", chNode);
fflush(stdout);
}
void enqueue (int iNode, int iDist, int iPrev)
{
QITEM *qNew = (QITEM *) malloc(sizeof(QITEM));
QITEM *qLast = qHead;
if (!qNew)
{
//fprintf(stderr, "Out of memory.\n");
exit(1);
}
qNew->iNode = iNode;
qNew->iDist = iDist;
qNew->iPrev = iPrev;
qNew->qNext = NULL;
if (!qLast)
{
qHead = qNew;
}
else
{
while (qLast->qNext) qLast = qLast->qNext;
qLast->qNext = qNew;
}
g_qCount++;
// ASSERT(g_qCount);
}
void dequeue (int *piNode, int *piDist, int *piPrev)
{
QITEM *qKill = qHead;
if (qHead)
{
// ASSERT(g_qCount);
*piNode = qHead->iNode;
*piDist = qHead->iDist;
*piPrev = qHead->iPrev;
qHead = qHead->qNext;
free(qKill);
g_qCount--;
}
}
int qcount (void)
{
return(g_qCount);
}
int dijkstra(int chStart, int chEnd)
{
for (ch = 0; ch < NUM_NODES; ch++)
{
rgnNodes[ch].iDist = NONE;
rgnNodes[ch].iPrev = NONE;
}
if (chStart == chEnd)
{
//printf("Shortest path is 0 in cost. Just stay where you are.\n");
}
else
{
rgnNodes[chStart].iDist = 0;
rgnNodes[chStart].iPrev = NONE;
enqueue (chStart, 0, NONE);
while (qcount() > 0)
{
dequeue (&iNode, &iDist, &iPrev);
for (i = 0; i < NUM_NODES; i++)
{
if ((iCost = AdjMatrix[iNode][i]) != NONE)
{
if ((NONE == rgnNodes[i].iDist) ||
(rgnNodes[i].iDist > (iCost + iDist)))
{
rgnNodes[i].iDist = iDist + iCost;
rgnNodes[i].iPrev = iNode;
enqueue (i, iDist + iCost, iNode);
}
}
}
}
//printf("Shortest path is %d in cost. ", rgnNodes[chEnd].iDist);
//printf("Path is: ");
//print_path(rgnNodes, chEnd);
//printf("\n");
}
}
int main(int argc, char *argv[]) {
int diff = 0;
uint64_t num_cycles_nominal=0;
uint64_t num_cycles_attack=0;
uint64_t counter_cpu_cycles = 0;
//system("./load-module");
int i,j,k;
FILE *fp;
static int perf_fd_cpu_cycles;
static struct perf_event_attr attr_cpu_cycles;
attr_cpu_cycles.size = sizeof(attr_cpu_cycles);
attr_cpu_cycles.exclude_kernel = 1;
attr_cpu_cycles.exclude_hv = 1;
attr_cpu_cycles.exclude_callchain_kernel = 1;
attr_cpu_cycles.type = PERF_TYPE_RAW;
attr_cpu_cycles.config = 0x11;
/* Open the file descriptor corresponding to this counter. The counter
should start at this moment. */
if ((perf_fd_cpu_cycles = syscall(__NR_perf_event_open, &attr_cpu_cycles, 0, -1, -1, 0)) == -1)
fprintf(stderr, "perf_event_open fail %d %d: %s\n", perf_fd_cpu_cycles, errno, strerror(errno));
if (argc<2) {
//fprintf(stderr, "Usage: dijkstra <filename>\n");
//fprintf(stderr, "Only supports matrix size is #define'd.\n");
}
/* open the adjacency matrix file */
fp = fopen (argv[1],"r");
/* make a fully connected matrix */
for (i=0;i<NUM_NODES;i++) {
for (j=0;j<NUM_NODES;j++) {
/* make it more sparce */
fscanf(fp,"%d",&k);
AdjMatrix[i][j]= k;
}
}
/* Get and close the performance counters. */
read(perf_fd_cpu_cycles, &counter_cpu_cycles, sizeof(counter_cpu_cycles));
//close(perf_fd_cpu_cycles);
printf("Number of cpu_cycles before: %d\n", counter_cpu_cycles);
num_cycles_nominal = counter_cpu_cycles;
/* Get and close the performance counters. */
read(perf_fd_cpu_cycles, &counter_cpu_cycles, sizeof(counter_cpu_cycles));
//close(perf_fd_cpu_cycles);
printf("Number of cpu_cycles after attack: %d\n", counter_cpu_cycles);
num_cycles_attack = counter_cpu_cycles - num_cycles_nominal;
/* finds 10 shortest paths between nodes */
for (i=0,j=NUM_NODES/2;i<100;i++,j++) {
j=j%NUM_NODES;
dijkstra(i,j);
}
read(perf_fd_cpu_cycles, &counter_cpu_cycles, sizeof(counter_cpu_cycles));
close(perf_fd_cpu_cycles);
printf("Number of cpu_cycles end: %d\n", counter_cpu_cycles);
num_cycles_nominal = counter_cpu_cycles - num_cycles_attack;
printf("Number of cpu_cycles nominal: %d\n", num_cycles_nominal);
printf("Number of cpu_cycles attack: %d\n", num_cycles_attack);
exit(0);
}
the problem is that i can read the branch misses with perf having 0x10 instead 0f 0x11 (cycle counters RAW EVENT in GEM5) but using 0x11 for reading the cycles i get zero. When i try to reverse engineer the increment of cycle counter i do the following comments:
when simple/atomic or simple/timing i see that updateCycleCounter is called from the base.hh, also for the 03 cpu model. When HPI and considering that hpi is a MinorCPU model i see that updateCycleCounter is called only in POWER_STATE_ON, but i didnt find in the code a POWER_STATE_ON reference updateCycleCounter(CPU_STATE_ON) which will update the cycle counter. Please help me verify this assumption.
*****The problem was that in the MinorCPU the updateCycleCounter wasnt called for the CPU_STATE_ON which updates the ActiveCycles. It was fixed by the following patch https://gem5-review.googlesource.com/c/public/gem5/+/38095 .

Confusion regarding #interrupt-cells configuration on PCA9555 expander

I'm trying to setup a device tree source file for the first time on my custom platform. On the board is a NXP PCA9555 gpio expander. I'm attempting to setup node for the device and am a bit confused.
Here is where I'm at with the node in the dts file:
ioexp0: gpio-exp#21 {
compatible = "nxp,pca9555";
reg = <21>;
interrupt-parent = <&gpio>;
interrupts = <8 0>;
gpio-controller;
#gpio-cells = <2>;
/*I don't understand the following two lines*/
interrupt-controller;
#interrupt-cells = <2>;
};
I got to this point by using the armada-388-gp.dts source as a guide.
My confusion is on what code processes the #interrupt-cells property. The bindings documentation is not very helpful at all for this chip as it doesn't say anything regarding interrupt cell interpretation.
Looking at the pca953x_irq_setup function in the source code for the pca9555 driver - I don't see anywhere that the #interrupt-cells property is handled. Is this handled in the linux interrupt handling code? I'm just confused as to how I'm suppose to know the meaning of the two interrupt cells.
pca953x_irq_setup for your convenience:
static int pca953x_irq_setup(struct pca953x_chip *chip,
int irq_base)
{
struct i2c_client *client = chip->client;
int ret, i;
if (client->irq && irq_base != -1
&& (chip->driver_data & PCA_INT)) {
ret = pca953x_read_regs(chip,
chip->regs->input, chip->irq_stat);
if (ret)
return ret;
/*
* There is no way to know which GPIO line generated the
* interrupt. We have to rely on the previous read for
* this purpose.
*/
for (i = 0; i < NBANK(chip); i++)
chip->irq_stat[i] &= chip->reg_direction[i];
mutex_init(&chip->irq_lock);
ret = devm_request_threaded_irq(&client->dev,
client->irq,
NULL,
pca953x_irq_handler,
IRQF_TRIGGER_LOW | IRQF_ONESHOT |
IRQF_SHARED,
dev_name(&client->dev), chip);
if (ret) {
dev_err(&client->dev, "failed to request irq %d\n",
client->irq);
return ret;
}
ret = gpiochip_irqchip_add_nested(&chip->gpio_chip,
&pca953x_irq_chip,
irq_base,
handle_simple_irq,
IRQ_TYPE_NONE);
if (ret) {
dev_err(&client->dev,
"could not connect irqchip to gpiochip\n");
return ret;
}
gpiochip_set_nested_irqchip(&chip->gpio_chip,
&pca953x_irq_chip,
client->irq);
}
return 0;
}
This is my first time working with device tree so I'm hoping it's something obvious that I'm just missing.

After looking at all of the comments I did some additional reading and figured out my answer.
I now understand that I was misinterpreting some properties of the device tree. I was previously under the impression that the driver had to specify how all properties were handled. I now see that linux will actually handle many of the generic properties such as gpios or interrupts (which makes a lot of sense).
The documentation on the actual interrupts binding was very helpful, not the documentation for the device driver.
Here is a bit more of a detailed explanation of how the translation from intspec to IRQ_TYPE* happens:
The function of_irq_parse_one copies the interrupt specifier integers to a struct of_phandle_args here. This arg is then passed to irq_create_of_mapping via a consumer function (e.g. of_irq_get). This function then maps these args to a struct irq_fwspec via of_phandle_args_to_fwspec and passes it's fwspec data to irq_create_fwspec_mapping. These functions are all found in irqdomain.c. At this point the irq will belong to an irq_domain or use the irq_default_domain. As far I can tell - the pca853x driver uses the default domain. This domain is often setup by platform specific code. I found mine by searching for irq_domain_ops on cross reference. A lot of these seem to do simple copying of intspec[1] & IRQ_TYPE_SENSE_MASK to the type variable in irq_create_fwspec_mapping via irq_domain_translate. From here the type is set to the irq's irq_data via irqd_set_trigger_type.
of_irq_parse_one:
/**
* of_irq_parse_one - Resolve an interrupt for a device
* #device: the device whose interrupt is to be resolved
* #index: index of the interrupt to resolve
* #out_irq: structure of_irq filled by this function
*
* This function resolves an interrupt for a node by walking the interrupt tree,
* finding which interrupt controller node it is attached to, and returning the
* interrupt specifier that can be used to retrieve a Linux IRQ number.
*/
int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq)
{
struct device_node *p;
const __be32 *intspec, *tmp, *addr;
u32 intsize, intlen;
int i, res;
pr_debug("of_irq_parse_one: dev=%s, index=%d\n", of_node_full_name(device), index);
/* OldWorld mac stuff is "special", handle out of line */
if (of_irq_workarounds & OF_IMAP_OLDWORLD_MAC)
return of_irq_parse_oldworld(device, index, out_irq);
/* Get the reg property (if any) */
addr = of_get_property(device, "reg", NULL);
/* Try the new-style interrupts-extended first */
res = of_parse_phandle_with_args(device, "interrupts-extended",
"#interrupt-cells", index, out_irq);
if (!res)
return of_irq_parse_raw(addr, out_irq);
/* Get the interrupts property */
intspec = of_get_property(device, "interrupts", &intlen);
if (intspec == NULL)
return -EINVAL;
intlen /= sizeof(*intspec);
pr_debug(" intspec=%d intlen=%d\n", be32_to_cpup(intspec), intlen);
/* Look for the interrupt parent. */
p = of_irq_find_parent(device);
if (p == NULL)
return -EINVAL;
/* Get size of interrupt specifier */
tmp = of_get_property(p, "#interrupt-cells", NULL);
if (tmp == NULL) {
res = -EINVAL;
goto out;
}
intsize = be32_to_cpu(*tmp);
pr_debug(" intsize=%d intlen=%d\n", intsize, intlen);
/* Check index */
if ((index + 1) * intsize > intlen) {
res = -EINVAL;
goto out;
}
/* Copy intspec into irq structure */
intspec += index * intsize;
out_irq->np = p;
out_irq->args_count = intsize;
for (i = 0; i < intsize; i++)
out_irq->args[i] = be32_to_cpup(intspec++);
/* Check if there are any interrupt-map translations to process */
res = of_irq_parse_raw(addr, out_irq);
out:
of_node_put(p);
return res;
}
EXPORT_SYMBOL_GPL(of_irq_parse_one)
irq_create_fwspec_mapping:
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
struct irq_domain *domain;
struct irq_data *irq_data;
irq_hw_number_t hwirq;
unsigned int type = IRQ_TYPE_NONE;
int virq;
if (fwspec->fwnode) {
domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
if (!domain)
domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY);
} else {
domain = irq_default_domain;
}
if (!domain) {
pr_warn("no irq domain found for %s !\n",
of_node_full_name(to_of_node(fwspec->fwnode)));
return 0;
}
if (irq_domain_translate(domain, fwspec, &hwirq, &type))
return 0;
/*
* WARN if the irqchip returns a type with bits
* outside the sense mask set and clear these bits.
*/
if (WARN_ON(type & ~IRQ_TYPE_SENSE_MASK))
type &= IRQ_TYPE_SENSE_MASK;
/*
* If we've already configured this interrupt,
* don't do it again, or hell will break loose.
*/
virq = irq_find_mapping(domain, hwirq);
if (virq) {
/*
* If the trigger type is not specified or matches the
* current trigger type then we are done so return the
* interrupt number.
*/
if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
return virq;
/*
* If the trigger type has not been set yet, then set
* it now and return the interrupt number.
*/
if (irq_get_trigger_type(virq) == IRQ_TYPE_NONE) {
irq_data = irq_get_irq_data(virq);
if (!irq_data)
return 0;
irqd_set_trigger_type(irq_data, type);
return virq;
}
pr_warn("type mismatch, failed to map hwirq-%lu for %s!\n",
hwirq, of_node_full_name(to_of_node(fwspec->fwnode)));
return 0;
}
if (irq_domain_is_hierarchy(domain)) {
virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, fwspec);
if (virq <= 0)
return 0;
} else {
/* Create mapping */
virq = irq_create_mapping(domain, hwirq);
if (!virq)
return virq;
}
irq_data = irq_get_irq_data(virq);
if (!irq_data) {
if (irq_domain_is_hierarchy(domain))
irq_domain_free_irqs(virq, 1);
else
irq_dispose_mapping(virq);
return 0;
}
/* Store trigger type */
irqd_set_trigger_type(irq_data, type);
return virq;
}
EXPORT_SYMBOL_GPL(irq_create_fwspec_mapping);

Getting volume value from pulseaudio

I've written this code by looking at various examples: Python pulseaudio monitor, Pavumeter source, async playback example, and Pacat source.
I have successfully connected to a sink and am able to record it, but my problem is, I'm stuck at getting the volume value out. If I try printing value from the read function, I just get a bunch of random numbers at a second's interval.
Now I'm not asking for someone to finish writing the code for me, I'd just like some tips, help so that I could head towards the right direction. How do I retrieve the volume value?
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <pulse/pulseaudio.h>
static int latency = 20000; // start latency in micro seconds
static int sampleoffs = 0;
static short sampledata[300000];
static pa_buffer_attr bufattr;
static int underflows = 0;
static pa_sample_spec ss;
// This callback gets called when our context changes state. We really only
// care about when it's ready or if it has failed
void pa_state_cb(pa_context *c, void *userdata) {
pa_context_state_t state;
int *pa_ready = userdata;
state = pa_context_get_state(c);
switch (state) {
// These are just here for reference
case PA_CONTEXT_UNCONNECTED:
case PA_CONTEXT_CONNECTING:
case PA_CONTEXT_AUTHORIZING:
case PA_CONTEXT_SETTING_NAME:
default:
break;
case PA_CONTEXT_FAILED:
case PA_CONTEXT_TERMINATED:
*pa_ready = 2;
break;
case PA_CONTEXT_READY:
*pa_ready = 1;
break;
}
}
static void stream_read_cb(pa_stream *s, size_t length, void *userdata) {
const void *data;
pa_stream_peek(s, &data, &length);
data = (const unsigned char*) data;
printf("%u", data);
pa_stream_drop(s);
}
int main(int argc, char *argv[]) {
pa_mainloop *pa_ml;
pa_mainloop_api *pa_mlapi;
pa_context *pa_ctx;
pa_stream *recordstream;
int r;
int pa_ready = 0;
int retval = 0;
unsigned int a;
double amp;
int test = 0;
// Create a mainloop API and connection to the default server
pa_ml = pa_mainloop_new();
pa_mlapi = pa_mainloop_get_api(pa_ml);
pa_ctx = pa_context_new(pa_mlapi, "Simple PA test application");
pa_context_connect(pa_ctx, NULL, 0, NULL);
// This function defines a callback so the server will tell us it's state.
// Our callback will wait for the state to be ready. The callback will
// modify the variable to 1 so we know when we have a connection and it's
// ready.
// If there's an error, the callback will set pa_ready to 2
pa_context_set_state_callback(pa_ctx, pa_state_cb, &pa_ready);
// We can't do anything until PA is ready, so just iterate the mainloop
// and continue
while (pa_ready == 0) {
pa_mainloop_iterate(pa_ml, 1, NULL);
}
if (pa_ready == 2) {
retval = -1;
goto exit;
}
ss.rate = 44100;
ss.channels = 2;
ss.format = PA_SAMPLE_U8;
recordstream = pa_stream_new(pa_ctx, "Record", &ss, NULL);
if (!recordstream) {
printf("pa_stream_new failed\n");
}
pa_stream_set_read_callback(recordstream, stream_read_cb, NULL);
r = pa_stream_connect_record(recordstream, NULL, NULL, PA_STREAM_PEAK_DETECT);
if (r < 0) {
printf("pa_stream_connect_playback failed\n");
retval = -1;
goto exit;
}
// Run the mainloop until pa_mainloop_quit() is called
// (this example never calls it, so the mainloop runs forever).
// printf("%s", "Running Loop");
pa_mainloop_run(pa_ml, NULL);
exit:
// clean up and disconnect
pa_context_disconnect(pa_ctx);
pa_context_unref(pa_ctx);
pa_mainloop_free(pa_ml);
return retval;
}

Looking at the original question from UNIX.StackExchange, it looks like you're trying to create a VU meter. It can be done using an envelope detector. You have to read the input values and then average their rectified value. A simple envelope detector can be done as an exponential moving average filter.
float level = 0; // Init time
const float alpha = COEFFICIENT; // See below
...
// Inside sample loop
float input_signal = fabsf(get_current_sample());
level = level + alpha * (input_signal - level);
Here, alpha is the filter coefficient, which can be calculated as:
const float alpha = 1.0 - expf( (-2.0 * M_PI) / (TC * SAMPLE_RATE) );
Where TC is known as the "time constant" parameter, measured in seconds, which defines how fast you want to "follow" the signal. Setting it too short makes the VU meter very "bumpy" and setting it too long will miss transients in the signal. 10 mS is a good value to start from.

pthread_mutex_lock gets stuck

The revelant code may be found here: http://pastebin.com/VbhtQckm
The problem is at line
85. pthread_mutex_lock(ID_retrieval_pool->info->lock);
I'm running the server and it's getting stuck at lock. The memory is allocated, I'm initializing the mutex and it's the only thread who's owning that shared memory.
I did debug with GDB and Valgrind using helgrind tool but did not find any clue.
Possible problems which think may cause this:
mutex is not being initialized (I'm using a block is shared memory which I'm initializing as a mutex);
deadlock? in the man page https://www.sourceware.org/pthreads-
win32/manual/pthread_mutex_init.html says this can cause this;
Please note that this code is for learning purpose.
Edit, the code is:
// common_header.h + common_header.c
#ifndef DATA_TYPES_H
#define DATA_TYPES_H
#include <pthread.h>
#include <errno.h>
#define RETREIVE_ID_KEY 1
typedef enum {
SHM_State_None,
SHM_State_ID_Available,
SHM_State_ID_Not_Available,
} SHM_State;
typedef struct {
pthread_mutex_t *lock; // locked if any thread is modifying data
SHM_State state;
} data_state;
typedef int shmid_t;
typedef struct data_pool {
data_state *info;
shmid_t shm_id;
} data_pool;
// other data structures
extern data_state * data_state_initialize_by_setting_address(void *address)
{
data_state *data = (data_state *)address;
data->lock = (pthread_mutex_t *)address;
pthread_mutex_init(data->lock, NULL);
data->state = SHM_State_None;
return data;
}
extern data_pool * data_pool_initialize_by_setting_address(void *address)
{
data_pool *data = (data_pool *)address;
data->info = data_state_initialize_by_setting_address(address);
data->shm_id = 0; // invalid though, the structure's client has to set a valid one
return data;
}
// other initialization functions
#endif // DATA_TYPES_H
///----------------------------------------------------------------------------------------\\\
// main.c -- Server
#include "common_header.h"
#define SHM_INVALID_ADDRESS (void *)-1
#define SHMGET_RW_FLAGS 0666
#define SHMAT_RW_FLAGS 0
bool initialize_data();
static data_pool *ID_retrieval_pool = NULL;
int main(int argc, char *argv[])
{
if (!initialize_data()) {
return EXIT_FAILURE;
}
// Do other stuff
return EXIT_SUCCESS;
}
bool initialize_data()
{
// some irrelevant initialization code
shmid_t shm_ID = shmget(RETREIVE_ID_KEY,
sizeof(data_pool),
IPC_CREAT | SHMGET_RW_FLAGS);
void *shm_address = shmat(shm_ID, NULL, SHMAT_RW_FLAGS);
if (shm_address == SHM_INVALID_ADDRESS) {
return false;
}
ID_retrieval_pool = data_pool_initialize_by_setting_address(shm_address);
pthread_mutex_lock(ID_retrieval_pool->info->lock);
ID_retrieval_pool->shm_id = get_shared_ID();
ID_retrieval_pool->info->state = SHM_State_ID_Available;
pthread_mutex_unlock(ID_retrieval_pool->info->lock);
// other initialization code
return true;
}

You have an interesting and incorrect way of initializing the mutex:
data->lock = (pthread_mutex_t *)address; /* address == &data */
pthread_mutex_init(data->lock, NULL);
In your code address is the address of the outer struct: this does not actually allocate a usable block of memory.
I suggest you just make the mutex non-pointer and then initialize it:
/* In the struct. */
pthread_mutex_t lock;
/* In your function. */
pthread_mutex_init(&data->lock, NULL);