Within the Linux Kernel (specifically for device drivers), how would I know what variables to lock and when they need locking? In particular, why does the locking in the following code only happen after dev has been set, even though dev points to a global variable scull_devices?
struct scull_qset {
void **data; /* pointer to an array of pointers which each point to a quantum buffer */
struct scull_qset *next;
};
struct scull_dev {
struct scull_qset *data; /* Pointer to first quantum set */
int quantum; /* the current quantum size */
int qset; /* the current array size */
unsigned long size; /* amount of data stored here */
unsigned int access_key; /* used by sculluid and scullpriv */
struct semaphore sem; /* mutual exclusion semaphore */
struct cdev cdev; /* Char device structure initialized in scull_init_module */
};
struct scull_dev *scull_devices; /* allocated dynamically in scull_init_module */
int scull_open(struct inode *inode, struct file *filp)
{
struct scull_dev *dev; /* device information */
dev = container_of(inode->i_cdev, struct scull_dev, cdev);
filp->private_data = dev; /* for other methods */
/* now trim to 0 the length of the device if open was write-only */
if ( (filp->f_flags & O_ACCMODE) == O_WRONLY) {
if (down_interruptible(&dev->sem))
return -ERESTARTSYS;
scull_trim(dev); /* empty out the scull device */
up(&dev->sem);
}
return 0; /* success */
}
If the code for scull_init_module is needed for a more complete picture, here it is:
int scull_major = SCULL_MAJOR;
int scull_minor = 0;
int scull_quantum = SCULL_QUANTUM;
int scull_qset = SCULL_QSET;
int scull_nr_devs = SCULL_NR_DEVS;
int scull_init_module(void)
{
int result, i;
dev_t dev = 0;
/* assigns major and minor numbers (left out for brevity sake) */
/*
* allocate the devices -- we can't have them static, as the number
* can be specified at load time
*/
scull_devices = kmalloc(scull_nr_devs * sizeof(struct scull_dev), GFP_KERNEL);
if (!scull_devices) {
result = -ENOMEM;
goto fail;
}
memset(scull_devices, 0, scull_nr_devs * sizeof(struct scull_dev));
/* Initialize each device. */
for (i = 0; i < scull_nr_devs; i++) {
scull_devices[i].quantum = scull_quantum;
scull_devices[i].qset = scull_qset;
init_MUTEX(&scull_devices[i].sem);
scull_setup_cdev(&scull_devices[i], i);
}
/* some other stuff left out for brevity sake */
return 0; /* succeed */
fail: /* isn't this a little redundant? */
scull_cleanup_module();
return result;
}
/*
* Set up the char_dev structure for this device.
*/
static void scull_setup_cdev(struct scull_dev *dev, int index)
{
int err, devno = MKDEV(scull_major, scull_minor + index);
cdev_init(&dev->cdev, &scull_fops);
dev->cdev.owner = THIS_MODULE;
dev->cdev.ops = &scull_fops;
err = cdev_add (&dev->cdev, devno, 1);
/* Fail gracefully if need be */
if (err)
printk(KERN_NOTICE "Error %d adding scull%d", err, index);
}
Locking in the example has nothing to do with the global scull_devices variable, but the locking is used to protect attributes of one scull_dev.
E.g. assume there exists a read() operation which copies size bytes from data while the mentioned scroll_trim() operation frees data.
So, when process #1 calls open() and process #2 tries to read() from an already opened device at the same time, the read() operation can access freed data and oopses.
That is why you need to protect data against races. Semaphores are one way; mutexes another one which is often more appropriate. Spinlocks and atomic variables might work too.
lock - it is way to protect critical section
critical section - in your driver code, if multiple instances are accessing same area, that is critical section.
multiple instances - it could be thread, regular ioctl cmd(from user space), and softirq and irq. It depends on your driver implementation.
Based on "context", you should use different lock too.
thread context which can sleep -> semaphore/mutex
non-sleeping context -> spinlock
softirq, tasklet -> spin_lock_bh
irq -> spin_lock_irq, spin_lock_irqsave
It is completely based on your requirements.
Let's take an example. If you are working on network driver, your netdev has stats and packet buffer and those needs to be protected by lock since it can be updated by multiple instances like net_rx_softirq, net_tx_softirq, ioctl/netlink from userspace request an so on.
In this case, based on your resource's context, you need to use different lock/mutex and sometimes you need more than 1 lock.
Related
I was wondering how the memcached source code is multithread safe in a multicore enviornement, when the manipulation of many of the _stritem or item structs variables don't seem to utilize a lock when manipulated (instead using refcount to indicate how many threads are using the item).
Line: 519 in memcached.h
typedef struct _stritem {
/* Protected by LRU locks */
struct _stritem *next;
struct _stritem *prev;
/* Rest are protected by an item lock */
struct _stritem *h_next; /* hash chain next */
rel_time_t time; /* least recent access */
rel_time_t exptime; /* expire time */
int nbytes; /* size of data */
unsigned short refcount;
uint16_t it_flags; /* ITEM_* above */
uint8_t slabs_clsid;/* which slab class we're in */
uint8_t nkey; /* key length, w/terminating null and padding */
/* this odd type prevents type-punning issues when we do
* the little shuffle to save space when not using CAS. */
union {
uint64_t cas;
char end;
} data[];
/* if it_flags & ITEM_CAS we have 8 bytes CAS */
/* then null-terminated key */
/* then " flags length\r\n" (no terminating null) */
/* then data with terminating \r\n (no terminating null; it's binary!) */
} item;
Line 851 in memcached.h
#define refcount_incr(it) ++(it->refcount)
#define refcount_decr(it) --(it->refcount)
Wouldn't a lack of locking on the item locks when incrementing/decrementing the variable structure in a multithreaded multicore environment potentially create a race condition?
For example: the variable could be fetched by both threads and placed in separate registers, then incremented in both registered, then replaced back in their memory location. This would cause a increment of only 1. Then if 1 thread decremented, couldn't the object be freed and then cause a problem for the other thread?
Edit: To clarify, given memcached has existed for 17 + years the implementation is most likely bug-free, but I am curious as to how memcached handles this.
I am using tm4c1294+lwip1.4.1+FreeRTOS.
As netconn_alloc() is called for socket communication, it allocates an unused semaphore. the number of semaphore is defined as SYS_SEM_MAX, so it can not be over SYS_SEM_MAX.
However, as semaphores are allocated continuously it reaches SYS_SEM_MAX and stop working since I guess sys_sem_free() does not deallocate it properly
Here is function that creates a semaphore implemented in sys_arch.c
err_t
sys_sem_new(sys_sem_t *sem, u8_t count)
{
void *temp;
u32_t i;
/* Find a semaphore that is not in use. */
for(i = 0; i < SYS_SEM_MAX; i++) {
if(sems[i].queue == 0) {
break;
}
}
if(i == SYS_SEM_MAX) {
#if SYS_STATS
STATS_INC(sys.sem.err);
#endif /* SYS_STATS */
return ERR_MEM;
}
/* Create a single-entry queue to act as a semaphore. */
#if RTOS_FREERTOS
sem->queue = xQueueCreate(1, sizeof(void *));
if(sem->queue == NULL) {
#endif /* RTOS_FREERTOS */
#if SYS_STATS
STATS_INC(sys.sem.err);
#endif /* SYS_STATS */
return ERR_MEM;
}
/* Acquired the semaphore if necessary. */
if(count == 0) {
temp = 0;
xQueueSend(sem->queue, &temp, 0);
}
/* Update the semaphore statistics. */
#if SYS_STATS
STATS_INC(sys.sem.used);
#if LWIP_STATS
if(lwip_stats.sys.sem.max < lwip_stats.sys.sem.used) {
lwip_stats.sys.sem.max = lwip_stats.sys.sem.used;
}
#endif
#endif /* SYS_STATS */
/* Save the queue handle. */
sems[i].queue = sem->queue;
/* Return this semaphore. */
return (ERR_OK);
}
Here is another function that frees semaphore implemented in sys_arch.c
void
sys_sem_free(sys_sem_t *sem)
{
/* Delete Sem , By Jin */
vQueueDelete(sem->queue);
/* Clear the queue handle. */
sem->queue = 0;
/* Update the semaphore statistics. */
#if SYS_STATS
STATS_DEC(sys.sem.used);
#endif /* SYS_STATS */
}
Whenever netconn_free() is called sys_sem_free() deallocates the semaphore, but it does not free the semaphore assigned in sem[] array.
I added vQueueDelete(sem->queue); that was suggested by someone, but still all same.
Not only functions creates/frees semaphore but also functions handling mbox are same as functions above, so functions handling mbox could be wrong as well.
Someone already reported this issue to TI, but it seems they have not solved the problems yet.
Therefore, I may need to implement my own functions handling semaphore/mbox in sys_arch.c, but I don't have any clues so far.
Can anyone give me any ideas? or anything?
Thanks,
Jin
The sys_arch.txt file in /doc is somewhat helpful. Apparently, looking at that document, and at what lwip 1.3.2 used to do, it looks like the ports/tiva-tm4c129/sys_arch.c is incorrect and incomplete.
sys_sem_free() should indeed be doing the vQueueDelete() as you discovered. It should not be doing "sem->queue = 0". If you look at netconn_free() over in src/api/api_msg.c, you can see it calls sys_sem_free() and then sys_sem_set_invalid(). The queue handle will be needed in the second function, and should not be clobbered in the first.
sys_sem_set_invalid() should make a sweep over the sems[] array and if it finds a match to the sem->queue, it should zero out that copy in sems[]. Once that is done, it should set sem->queue to 0.
This, I think, best matches what's in Dunkel's sys_arch.txt document, and fixed the resource leak on my system.
I concur that the mailboxes are in the same shape. Fwiw, I went ahead and modified those in similar fashion to that just described for the sems.
btw, the lwip files I was working with were from the TivaWare_C_Series 2.1.0.12573 third_party folder.
I would like to add an attribute to the buffer description in the PostgreSQL source code, but when I try to initialize it I get an error of: PANIC: stuck spinlock (0x7fc1cddd0cd0) detected at freelist.c:206
The struct is described in buf_internals.h as:
typedef struct sbufdesc
{
BufferTag tag; /* ID of page contained in buffer */
BufFlags flags; /* see bit definitions above */
uint16 usage_count; /* usage counter for clock sweep code */
unsigned refcount; /* # of backends holding pins on buffer */
int wait_backend_pid; /* backend PID of pin-count waiter */
int buf_age; //<<<<<<<<<<< The age of the buffer
slock_t buf_hdr_lock; /* protects the above fields */
int buf_id; /* buffer's index number (from 0) */
int freeNext; /* link in freelist chain */
LWLockId io_in_progress_lock; /* to wait for I/O to complete */
LWLockId content_lock; /* to lock access to buffer contents */
} BufferDesc;
but it gets stuck at line 206 of freelist.c, which is just:
LockBufHdr(buf);
All I've added was an int to the struct and set it to zero in the same place all the other buffers are initialized. How could this cause a spinlock error?
It looks like running make clean first has corrected the issue.
I'm reading APUE and I am confused with thread synchronization of chapter 11. Below is a code snippet.
#define NHASH 29
#define HASH(fp) (((unsigned long)fp)%NHASH)
struct foo *fh[NHASH];
pthread_mutex_t hashlock = PTHREAD_MUTEX_INITIALIZER;
struct foo {
int f_count;
pthread_mutex_t f_lock;
struct foo *f_next; /* protected by hashlock */
int f_id;
/* ... more stuff here ... */
};
struct foo *
foo_alloc(void) /* allocate the object */
{
struct foo *fp;
int idx;
if ((fp = malloc(sizeof(struct foo))) != NULL) {
fp->f_count = 1;
if (pthread_mutex_init(&fp->f_lock, NULL) != 0) {
free(fp);
return(NULL);
}
idx = HASH(fp);
pthread_mutex_lock(&hashlock);
fp->f_next = fh[idx];
fh[idx] = fp;
pthread_mutex_lock(&fp->f_lock);
pthread_mutex_unlock(&hashlock);
/* ... continue initialization ... */
pthread_mutex_unlock(&fp->f_lock);
}
return(fp);
}
My doubts are:
Why place pthread_mutex_lock(&fp->f_lock) before the pthread_mutex_unlock(&hashlock)? Could I place it afterward instead?
Since fp is local variable, could pthread_mutex_lock(&fp->f_lock) and pthread_mutex_unlock(&fp->f_lock) be removed all together?
No, because the actions after the pthread_mutex_lock(&hashlock) expose the newly created structure to other threads by adding it to the the fh list. While the hashlock is held, no-one can access the variable; as soon as the hashlock is released, it becomes accessible to other threads via the hash, but locking the fp_>f_lock mutex prevents anyone messing with fp.
Not with the code as written. If the whole structure was initialized except for the hashing, then you could do without locking the fp->f_lock mutex; just before completing, you'd lock the hashlock, hook the newly allocated item into the hash tables, and then release the hashlock, and you would be safe. If you need any exclusive access after the structure is added into the hash table, you have to acquire its mutex. The way it is written, that's a non-waiting acquisition of the mutex; there is no other process that could have access to the variable.
if ((fp = malloc(sizeof(struct foo))) != NULL) {
fp->f_count = 1;
if (pthread_mutex_init(&fp->f_lock, NULL) != 0) {
free(fp);
return(NULL);
}
idx = HASH(fp);
/* ... complete initialization except for adding to hash table ... */
pthread_mutex_lock(&hashlock);
fp->f_next = fh[idx];
fh[idx] = fp;
pthread_mutex_unlock(&hashlock);
}
So, there is logic behind what is done; it is correct.
I guess that there's a second thread that's looping through the created objects, doing something with them. In this case:
No, because the loop thread might access the newly created object before being initialized at all.
No, because the loop thread might access the newly created object being half initialized.
I have several processes communicating with each through POSIX shared memory on OS X.
My issue is these processes could spawn in any order, and try to initialize the shared memory segment at the same time.
I tried using advisory locks with fcntl and flock but both fail telling me I'm passing an invalid file descriptor (I'm positive the file descriptor is not invalid). So clearly that's out of the picture.
Are there any alternatives to this? Or is there any details about using locks with shared memory that I'm not aware of?
Edit:
My attempt at using locks looks like this:
// Some declarations...
struct Queue {
int index[ENTRIES_PER_QUEUE];
sem_t lock;
sem_t readWait;
sem_t writeSem;
struct Entry slots[ENTRIES_PER_QUEUE];
};
struct ipc_t {
int fd;
char name[512];
struct Queue* queue;
};
ipc_t ipc_create(const char* name, int owner) {
int isInited = 1;
struct Queue* queue;
struct flock lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
.l_start = 0,
.l_len = 0
};
ipc_t conn = malloc(sizeof(struct ipc_t));
sprintf(conn->name, "/arqvenger_%s", name);
conn->fd = shm_open(conn->name, O_CREAT | O_RDWR, 0666);
if (conn->fd == -1) {
free(conn);
perror("shm_open failed");
return NULL;
}
if (fcntl(conn->fd, F_SETLKW, &lock) == -1) {
perror("Tanked...");
}
// Do stuff with the lock & release it
The output I get is:
Tanked...: Bad file descriptor
A common technique is to first call shm_open with O_CREAT|O_EXCL. This will succeed for only one process that then has to do the setup. The others then would have to do the open as before and wait a bit, probably polling, that the setup is finished.
Edit: To show how this could work as discussed in the comments.
struct head {
unsigned volatile flag;
pthread_mutex_t mut;
};
void * addr = 0;
/* try shm_open with exclusive, and then */
if (/* we create the segment */) {
addr = mmap(something);
struct head* h = addr;
pthread_mutex_init(&h->mut, aSharedAttr);
pthread_mutex_lock(&h->mut);
h->flag = 1;
/* do the rest of the initialization, and then */
pthread_mutex_unlock(&h->mut);
} else {
/* retry shm_open without exclusive, and then */
addr = mmap(something);
struct head* h = addr;
/* initialy flag is guaranteed to be 0 */
/* this will break out of the loop whence the new value is written to flag */
while (!h->flag) sched_yield();
pthread_mutex_lock(&h->mut);
pthread_mutex_unlock(&h->mut);
}
I have been able to reproduce the problem. Then I found a sad notice in the standard:
When the file descriptor fildes refers to a shared memory object, the
behavior of fcntl() shall be the same as for a regular file except the
effect of the following values for the argument cmd shall be
unspecified: F_SETFL, F_GETLK, F_SETLK, and F_SETLKW.
So it's likely it's not yet supported. For such a simple setup I would use a semaphore for signaling "the memory is ready".
EDIT
It seems I need to mention semaphores can be created using "O_EXCL" (so there are no races).