Having a race condition in my MPSC ring buffer - c

I was trying to build an MPSC lock-free ring buffer for learning purposes, and I am running into race conditions.
A description of the MPSC ring buffer:
It is guaranteed that poll() is never called when the buffer is empty.
Instead of mod'ing head and tail like a traditional ring buffer, it lets them proceed linearly, and AND's them before using them (since the buffer size is a power of 2, this works ok with overflow).
We keep MAX_PRODUCERS-1 slots open in the queue so that if multiple producers come and see one slot is available and proceed, they can all place their entries.
It uses 32-bit quantities for head and tail, so that it can snapshot them with a 64-bit atomic read without a lock.
My test involves a couple of threads writing some known set of values to the queue, and a consumer thread polling (when the buffer is not empty) and summing all, and verifying the correct result is obtained. With 2 or more producers, I get inconsistent sums (and with 1 producer, it works).
Any help would be much appreciated. Thank you!
Here is the code:
/* One queue slot: a single 32-bit payload value. */
struct ring_buf_entry {
uint32_t seqn;
};
/*
 * Ring buffer state. tail and head are two adjacent 32-bit counters overlaid
 * by the 64-bit `snapshot`, so both can be read with one 64-bit atomic load.
 * The counters are not reduced modulo the size; users mask them with
 * RING_BUF_SIZE-1 when indexing buf.
 */
struct __attribute__((packed, aligned(8))) ring_buf {
union {
struct {
/* low 32 bits of snapshot (little endian); advanced by ring_buf_push */
volatile uint32_t tail;
/* high 32 bits of snapshot (little endian); advanced by ring_buf_poll */
volatile uint32_t head;
};
volatile uint64_t snapshot;
};
/* slot storage, indexed by (counter & (RING_BUF_SIZE-1)) */
volatile struct ring_buf_entry buf[RING_BUF_SIZE];
};
/*
 * Distance from counter y up to counter x, where both are 32-bit counters
 * held in wider integers. When x is numerically below y, x is taken to have
 * wrapped past 2^32. Each argument is evaluated more than once.
 */
#define RING_SUB(x,y) ((x)<(y)?((x)+(1ULL<<32)-(y)):((x)-(y)))
/*
 * Producer side: claim the next slot by advancing tail with a compare-and-swap,
 * then store seqn into the claimed slot. Spins (with `pause`) while the buffer
 * is too full or while the CAS loses to another producer.
 */
static void ring_buf_push(struct ring_buf* rb, uint32_t seqn)
{
size_t pos;
while (1) {
// rely on aligned, packed, and no member-reordering properties
// (one 64-bit atomic load reads tail and head together)
uint64_t snapshot = __atomic_load_n(&(rb->snapshot), __ATOMIC_SEQ_CST);
// little endian.
uint64_t snap_head = snapshot >> 32;
uint64_t snap_tail = snapshot & 0xffffffffULL;
// only try to claim a slot while fewer than RING_BUF_SIZE - MAX_PRODUCERS + 1 are in use
if (RING_SUB(snap_tail, snap_head) < RING_BUF_SIZE - MAX_PRODUCERS + 1) {
uint32_t exp = snap_tail;
// strong CAS: succeeds only if tail still has the value seen in the snapshot
if (__atomic_compare_exchange_n(&(rb->tail), &exp, snap_tail+1, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
pos = snap_tail;
break;
}
}
asm volatile("pause\n": : :"memory");
}
pos &= RING_BUF_SIZE-1;
// NOTE(review): tail was already advanced by the CAS above, so the slot counts
// as occupied before this store executes. A consumer that treats tail != head
// as "data available" could read the slot before the value lands -- this looks
// like the reported race; confirm against the consumer's emptiness check.
rb->buf[pos].seqn = seqn;
asm volatile("sfence\n": : :"memory");
}
/*
 * Consumer side: return a copy of the entry at the current head position, then
 * advance head by one. Performs no emptiness check of its own.
 */
static struct ring_buf_entry ring_buf_poll(struct ring_buf* rb)
{
// copy the slot out before head is advanced
struct ring_buf_entry ret = rb->buf[__atomic_load_n(&(rb->head), __ATOMIC_SEQ_CST) & (RING_BUF_SIZE-1)];
__atomic_add_fetch(&(rb->head), 1, __ATOMIC_SEQ_CST);
return ret;
}

Related

how to copy multiple data elements between CPUs using cacheline atomicity?

I'm trying to implement an atomic copy for multiple data elements between CPUs. I packed multiple elements of data into a single cacheline to manipulate them atomically. So I wrote the following code.
In this code, (compiled with -O3) I aligned a global struct data into a single cacheline, and I set the elements in a CPU followed by a store barrier. It is to make globally visible from the other CPU.
At the same time, on the other CPU, I used a load barrier to access the cache line atomically. My expectation was that the reader (or consumer) CPU would bring the whole cache line of data into its own cache hierarchy (L1, L2, etc.). So, since I do not use a load barrier again until the next read, the elements of the data should all be the same, but it does not work as expected. I can't keep cache-line atomicity in this code. The writer CPU seems to be putting elements into the cache line piece by piece. How is that possible?
#include <emmintrin.h>
#include <pthread.h>
#include "common.h"
#define CACHE_LINE_SIZE 64
/* Seven 32-bit fields (28 bytes); the type is aligned to CACHE_LINE_SIZE. */
struct levels {
uint32_t x1;
uint32_t x2;
uint32_t x3;
uint32_t x4;
uint32_t x5;
uint32_t x6;
uint32_t x7;
} __attribute__((aligned(CACHE_LINE_SIZE)));
/* Shared between the writer loop in main() and the reader in worker_loop(). */
struct levels g_shared;
/*
 * Reader thread: pins itself to CPU 15, then repeatedly copies g_shared into a
 * local and exits the process the first time x1 and x7 differ.
 * param is unused.
 */
void *worker_loop(void *param)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(15, &cpuset);
pthread_t thread = pthread_self();
int status = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
fatal_relog_if(status != 0, status);
struct levels shared;
while (1) {
_mm_lfence();
// NOTE(review): plain struct copy; nothing in this block makes it a single
// atomic access, so a concurrent writer can be observed mid-update.
shared = g_shared;
if (shared.x1 != shared.x7) {
printf("%u %u %u %u %u %u %u\n",
shared.x1, shared.x2, shared.x3, shared.x4, shared.x5, shared.x6, shared.x7);
exit(EXIT_FAILURE);
}
}
// not reached: the loop above only leaves via exit()
return NULL;
}
/*
 * Writer: zeroes g_shared, pins the main thread to CPU 16, starts worker_loop,
 * then loops forever storing an incrementing value into x1..x7 in order,
 * with a store fence after each round.
 */
int main(int argc, char *argv[])
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(16, &cpuset);
pthread_t thread = pthread_self();
memset(&g_shared, 0, sizeof(g_shared));
int status = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
fatal_relog_if(status != 0, status);
pthread_t worker;
int istatus = pthread_create(&worker, NULL, worker_loop, NULL);
fatal_elog_if(istatus != 0);
uint32_t val = 0;
while (1) {
// seven separate 32-bit stores; the reader may run between any two of them
g_shared.x1 = val;
g_shared.x2 = val;
g_shared.x3 = val;
g_shared.x4 = val;
g_shared.x5 = val;
g_shared.x6 = val;
g_shared.x7 = val;
_mm_sfence();
// _mm_clflush(&g_shared);
val++;
}
// not reached: the loop above never terminates
return EXIT_SUCCESS;
}
The output is like below
3782063 3782063 3782062 3782062 3782062 3782062 3782062
UPDATE 1
I updated the code as below using AVX512, but the problem is still here.
#include <emmintrin.h>
#include <pthread.h>
#include "common.h"
#include <immintrin.h>
#define CACHE_LINE_SIZE 64
/**
* Copy 64 bytes from one location to another,
* locations should not overlap.
*
* Uses one aligned 512-bit load followed by one aligned 512-bit store
* (_mm512_load_si512 / _mm512_store_si512), so both pointers are expected
* to be 64-byte aligned.
*/
static inline __attribute__((always_inline)) void
mov64(uint8_t *dst, const uint8_t *src)
{
__m512i zmm0;
zmm0 = _mm512_load_si512((const void *)src);
_mm512_store_si512((void *)dst, zmm0);
}
/* Seven 32-bit fields (28 bytes); the type is aligned to CACHE_LINE_SIZE. */
struct levels {
uint32_t x1;
uint32_t x2;
uint32_t x3;
uint32_t x4;
uint32_t x5;
uint32_t x6;
uint32_t x7;
} __attribute__((aligned(CACHE_LINE_SIZE)));
/* Shared between the writer loop in main() and the reader in worker_loop(). */
struct levels g_shared;
/*
 * Reader thread: pins itself to CPU 15, then repeatedly copies g_shared into a
 * local with mov64() and exits the process the first time x1 and x7 differ;
 * otherwise it prints x1 and x7. param is unused.
 */
void *worker_loop(void *param)
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(15, &cpuset);
pthread_t thread = pthread_self();
int status = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
fatal_relog_if(status != 0, status);
struct levels shared;
while (1) {
// the read side is one 64-byte vector load and store
mov64((uint8_t *)&shared, (uint8_t *)&g_shared);
// shared = g_shared;
if (shared.x1 != shared.x7) {
printf("%u %u %u %u %u %u %u\n",
shared.x1, shared.x2, shared.x3, shared.x4, shared.x5, shared.x6, shared.x7);
exit(EXIT_FAILURE);
} else {
printf("%u %u\n", shared.x1, shared.x7);
}
}
// not reached: the loop above only leaves via exit()
return NULL;
}
/*
 * Writer: zeroes g_shared, pins the main thread to CPU 16, starts worker_loop,
 * then loops forever storing an incrementing value into x1..x7 in order,
 * with a store fence after each round.
 */
int main(int argc, char *argv[])
{
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(16, &cpuset);
pthread_t thread = pthread_self();
memset(&g_shared, 0, sizeof(g_shared));
int status = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
fatal_relog_if(status != 0, status);
pthread_t worker;
int istatus = pthread_create(&worker, NULL, worker_loop, NULL);
fatal_elog_if(istatus != 0);
uint32_t val = 0;
while (1) {
// NOTE(review): the write side is still seven separate 32-bit stores, so a
// 64-byte load on the reader can still land between two of them.
g_shared.x1 = val;
g_shared.x2 = val;
g_shared.x3 = val;
g_shared.x4 = val;
g_shared.x5 = val;
g_shared.x6 = val;
g_shared.x7 = val;
_mm_sfence();
// _mm_clflush(&g_shared);
val++;
}
// not reached: the loop above never terminates
return EXIT_SUCCESS;
}
I used an load barrier to access the cacheline atomically
No, barriers do not create atomicity. They only order your own operations, not stop operations from other threads from appearing between two of our own.
Non-atomicity happens when another thread's store becomes visible between two of our loads. lfence does nothing to stop that.
lfence here is pointless; it just makes the CPU running this thread stall until it drains its ROB/RS before executing the loads. (lfence serializes execution, but has no effect on memory ordering unless you're using NT loads from WC memory e.g. video RAM).
Your options are:
Recognize that this is an X-Y problem and do something that doesn't require 64-byte atomic loads/stores. e.g. atomically update a pointer to non-atomic data. The general case of that is RCU, or perhaps a lock-free queue using a circular buffer.
Or
Use a software lock to get logical atomicity (like _Atomic struct levels g_shared; with C11) for threads that agree to cooperate by respecting the lock.
A SeqLock might be a good choice for this data if it's read more often than it changes, or especially with a single writer and multiple readers. Readers retry when tearing may have been possible; check a sequence number before/after the read, using sufficient memory-ordering. See Implementing 64 bit atomic counter with 32 bit atomics for a C++11 implementation; C11 is easier because C allows assignment from a volatile struct to a non-volatile temporary.
Or hardware-supported 64-byte atomicity:
Intel transactional memory (TSX) available on some CPUs. This would even let you
do an atomic RMW on it, or atomically read from one location and write to another. But more complex transactions are more likely to abort. Putting 4x 16-byte or 2x 32-byte loads into a transaction should hopefully not abort very often even under contention. The same goes for grouping stores into a separate transaction. (Hopefully the compiler is smart enough to end the transaction with the loaded data still in registers, so it doesn't have to be atomically stored to a local on the stack, too.)
There are GNU C/C++ extensions for transactional memory. https://gcc.gnu.org/wiki/TransactionalMemory
AVX512 (allowing a full-cache-line load or store) on a CPU which happens to implement it in a way that makes aligned 64-byte loads/stores atomic. There's no on-paper guarantee that anything wider than an 8-byte load/store is ever atomic on x86, except for lock cmpxchg16b and movdir64b.
In practice we're fairly sure that modern Intel CPUs like Skylake transfer whole cache-lines atomically between cores, unlike AMD. And we know that on Intel (not AMD) a vector load or store that doesn't cross a cache-line boundary does make a single access to L1d cache, transferring all the bits in the same clock cycle. So an aligned vmovaps zmm, [mem] on Skylake-avx512 should in practice be atomic, unless you have an exotic chipset that glues many sockets together in a way that creates tearing. (Multi-socket K10 vs. single-socket K10 is a good cautionary tale: Why is integer assignment on a naturally aligned variable atomic on x86?)
MOVDIR64B - only atomic for the store part, and only supported on Intel Tremont (next-gen Goldmont successor). This still doesn't give you a way to do a 64-byte atomic load. Also it's a cache-bypassing store so not good for inter-core communication latency. I think the use-case is generating a full-size PCIe transaction.
See also SSE instructions: which CPUs can do atomic 16B memory operations? re: lack of atomicity guarantees for SIMD load/store. CPU vendors have for some reason not chosen to provide any written guarantees or ways to detect when SIMD loads/stores will be atomic, even though testing has shown that they are on many systems (when you don't cross a cache-line boundary.)

Ring Buffer with time stamp

I need a ring buffer (in C) which can hold objects of any type at run time (mostly the data will be values of different signals, such as current (sampled every 100 ms and 10 ms), temperature, etc.). I am not sure whether it has to be a fixed size or not, but it needs to be very high performance, and it will run in a multi-tasking embedded environment.
Actually, I need this buffer as a backup: the embedded software will work as normal and save the data into the ring buffer, so that if an error occurs for any reason I have a record of the measured values that I can look at to determine the problem. I also need a time stamp in the ring buffer, meaning that every datum (signal value) stored in the ring buffer is stored together with the time of the measurement.
Any code or ideas would be greatly appreciated. some of the operations required are:
create a ring buffer with specific size.
Link it with the whole software.
put at the tail.
get from the head.
at error, read the data and when its happen (time stamp).
return the count.
overwrite when the buffer is being full.
#include<stdint.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/*
 * Fixed-capacity ring buffer of equally sized items, stored by value.
 *
 * Field naming: data_start is the WRITE position (where rb_push_back stores
 * the next item) and data_end is the READ position (where rb_pop_front takes
 * the oldest item from). When the buffer is full, a push overwrites the
 * oldest item.
 *
 * Not safe for concurrent use; callers must serialise access themselves.
 */
typedef struct ring_buffer
{
    void *buffer;       // backing storage of capacity * size bytes; NULL if unusable
    void *buffer_end;   // one past the last byte of the backing storage
    void *data_start;   // write position (next item is stored here)
    void *data_end;     // read position (oldest item is read from here)
    uint64_t capacity;  // maximum number of items in buffer (0 if init failed)
    uint64_t count;     // number of items in the buffer
    uint64_t size;      // size of each item in the buffer
} ring_buffer;

/* Step a position forward by one item, wrapping at the end of the storage. */
static void *rb_advance(const ring_buffer *rb, void *pos)
{
    char *next = (char *)pos + rb->size;

    if (next == (char *)rb->buffer_end)
        next = rb->buffer;
    return next;
}

/*
 * Initialise rb to hold up to `capacity` items of `size` bytes each.
 *
 * On failure (zero size or capacity, size overflow, or out of memory) the
 * buffer is left in a safe empty state with rb->buffer == NULL and
 * rb->capacity == 0; pushes and pops on it are no-ops.
 */
void rb_init (ring_buffer *rb, uint64_t size, uint64_t capacity )
{
    rb->buffer = NULL;
    rb->buffer_end = NULL;
    rb->data_start = NULL;
    rb->data_end = NULL;
    rb->capacity = 0;
    rb->count = 0;
    rb->size = size;

    // reject empty geometry and capacity * size overflowing size_t
    if (size == 0 || capacity == 0 || capacity > SIZE_MAX / size)
        return;

    rb->buffer = malloc((size_t)(capacity * size));
    if (rb->buffer == NULL)
        return;

    rb->buffer_end = (char *)rb->buffer + capacity * size;
    rb->capacity = capacity;
    rb->data_start = rb->buffer;
    rb->data_end = rb->buffer;
}

/* Release the storage and reset rb to the unusable/empty state. */
void cb_free(ring_buffer *rb)
{
    free(rb->buffer);
    // clear the other fields too, so a stale handle cannot touch freed memory
    rb->buffer = NULL;
    rb->buffer_end = NULL;
    rb->data_start = NULL;
    rb->data_end = NULL;
    rb->capacity = 0;
    rb->count = 0;
}

/*
 * Append a copy of *item (rb->size bytes). If the buffer is already full,
 * the oldest item is overwritten and count stays at capacity.
 */
void rb_push_back(ring_buffer *rb, const void *item)
{
    if (rb->buffer == NULL)
        return;

    memcpy(rb->data_start, item, rb->size);
    rb->data_start = rb_advance(rb, rb->data_start);

    if (rb->count == rb->capacity) {
        // full: the slot just written was the oldest one, so drop it
        rb->data_end = rb_advance(rb, rb->data_end);
    } else {
        rb->count++;
    }
}

/*
 * Remove the oldest item and copy it to *item (rb->size bytes).
 * If the buffer is empty, *item is left untouched.
 */
void rb_pop_front(ring_buffer *rb, void *item)
{
    if (rb->count == 0)
        return;

    memcpy(item, rb->data_end, rb->size);
    rb->data_end = rb_advance(rb, rb->data_end);
    rb->count--;
}
Creating a ring buffer/FIFO with hardcopies of generic type is highly questionable design for embedded systems. You shouldn't need that high level of abstraction for code so close to the hardware.
Either you make a ring buffer with a data type tag (like an enum) plus a void* to data allocated elsewhere, or you make a ring buffer where all data is of the same type. Everything else is most likely confused program design ("XY problem").
You need some means to lock access to the ring buffer internally, to make it thread-safe/interrupt-safe. This, as well as the time stamp, has to be handled internally by the ring buffer ADT.

Algorithm for writing to EEPROM?

I have a memory which is a column of 4 byte rows. I can only write to it in 16 bytes and read is done in 4 bytes (line by line, that is) using I2C.
I am interested in how to write data into the EEPROM: the data that is being written consists of a few different parts of which two can be of variable length. For example, I can have XYYZ or XYYYYZZZZZZZ where each letter is 4 bytes.
My question is, how I should go about this problem to have a general way of writing the message to the memory using 16 byte write that would accommodate the variable nature of the two parts?
Rather than try to work in 4 or 16-byte units, you could consider using a small (21-byte) static cache for the eeprom. Let's assume you have
/* Read the 16-byte page number `page` (address / 16) from the EEPROM into data. */
void eeprom_read16(uint32_t page, uint8_t *data);
/* Write the 16 bytes at data to the EEPROM page number `page` (address / 16). */
void eeprom_write16(uint32_t page, const uint8_t *data);
where page is the address divided by 16, and always operate on 16 byte chunks. The cache itself and its initialization function (you'd call once at power-on) would be
/* State of the single-page write-back cache. */
static uint8_t  eeprom_cache[16];   /* contents of the cached page */
static uint8_t  eeprom_dirty;       /* nonzero once the cache has been modified */
static uint32_t eeprom_page;        /* cached page number; uint16_t suffices for 2 MiB EEPROM */

/* Reset the cache to "no page loaded, nothing to write back". */
static void eeprom_init(void)
{
    eeprom_dirty = 0;
    eeprom_page  = UINT32_C(1) << 31;   /* "None": page 0x80000000, i.e. at 32 GiB */
}
/* Write the cached page back to the EEPROM if it has been modified. */
static void eeprom_flush(void)
{
    if (!eeprom_dirty)
        return;

    eeprom_write16(eeprom_page, eeprom_cache);
    eeprom_dirty = 0;
}
The eeprom_flush() function is only needed if you wish to ensure some data is stored in the EEPROM -- basically, after each complete transaction. You can safely call it at any time.
To access any memory in the EEPROM, you use the accessor functions
/*
 * Read one byte from the EEPROM through the single-page cache.
 *
 * If `address` lies outside the cached page, the cached page is written back
 * first (when dirty) and the page containing `address` is loaded.
 * Returns the byte at `address`.
 */
static inline uint8_t eeprom_get(const uint32_t address)
{
    const uint32_t page = address >> 4;

    if (page != eeprom_page) {
        eeprom_flush();                     /* write back the old page if modified */
        eeprom_read16(page, eeprom_cache);
        eeprom_page = page;
    }
    /* low four bits select the byte within the 16-byte page (mask, not modulo 15) */
    return eeprom_cache[address & 0xFU];
}
/*
 * Write one byte to the EEPROM through the single-page cache.
 *
 * If `address` lies outside the cached page, the cached page is written back
 * first (when dirty) and the page containing `address` is loaded. The new
 * value only reaches the EEPROM when the page is later flushed.
 */
static inline void eeprom_set(const uint32_t address, const uint8_t value)
{
    const uint32_t page = address >> 4;

    if (page != eeprom_page) {
        eeprom_flush();                     /* write back the old page if modified */
        eeprom_read16(page, eeprom_cache);
        eeprom_page = page;
    }
    eeprom_dirty = 1;
    /* low four bits select the byte within the 16-byte page (mask, not modulo 15) */
    eeprom_cache[address & 0xFU] = value;
}
Feel free to omit the inline if you like; it is just an optimization. The static inline above tells a C99 compiler to inline the functions if possible. It might increase your code size a bit, but it should produce faster code (because the compiler can make better optimizations when such small functions are inlined into the code).
Note that you should not use the above in interrupt handlers, because normal code is not prepared for the eeprom page to change mid-operation.
You can mix read and write operations, but that may lead to unnecessary wear on the EEPROM. You can, of course, split the read and write sides to separate caches, if you do mix reads and writes. That would also allow you to safely do EEPROM reads from an interrupt context (although the delay/latency of the I2C access might wreak havoc elsewhere).
Not tailored specifically to your examples, completely untested and relying on having "read 4 bytes from EEPROM" and "write 16 bytes to EEPROM" encapsulated in suitable functions.
/*
 * Low-level primitives this routine relies on. NOTE(review): the names and
 * signatures are assumptions taken from the description ("read 4 bytes from
 * EEPROM", "write 16 bytes to EEPROM"); adapt them to the real driver.
 */
void read_from_eeprom(uint32_t addr, uint8_t *dst);         /* reads 4 bytes at addr */
void write16_to_eeprom(uint32_t addr, const uint8_t *src);  /* writes 16 bytes at 16-byte-aligned addr */

/*
 * Write len bytes from data to the EEPROM starting at byte address start,
 * using only 16-byte aligned page writes.
 *
 * Pages that are only partly covered by the new data (the first and/or last
 * page) are read back first so the bytes outside [start, start + len) keep
 * their current EEPROM contents.
 */
void write_to_eeprom(uint32_t start, size_t len, uint8_t *data) {
    uint8_t buffer[16];
    uint32_t eeprom_dst = start & 0xfffffff0U;  /* address of the page being written */
    size_t done = 0;                            /* bytes of data[] written so far */

    while (done < len) {
        /* offset inside this page where the new data begins (non-zero only on the first page) */
        size_t first = (done == 0) ? (size_t)(start & 0xFU) : 0;
        size_t chunk = 16 - first;

        if (chunk > len - done) {
            chunk = len - done;
        }

        if (chunk < 16) {
            /* partial page: fill the buffer with the existing EEPROM contents */
            for (uint32_t off = 0; off < 16; off += 4) {
                read_from_eeprom(eeprom_dst + off, buffer + off);
            }
        }

        for (size_t ix = 0; ix < chunk; ix++) {
            buffer[first + ix] = data[done + ix];
        }

        write16_to_eeprom(eeprom_dst, buffer);
        done += chunk;
        eeprom_dst += 16;
    }
}

Initializing, constructing and converting struct to byte array causes misalignment

I am trying to design a data structure (I have made it much shorter to save space here but I think you get the idea) to be used for byte level communication:
/* PACKET.H */
/* Intended on-wire sizes in bytes; CM_PACKET_SIZE is their sum (22). */
#define CM_HEADER_SIZE 3
#define CM_DATA_SIZE 16
#define CM_FOOTER_SIZE 3
#define CM_PACKET_SIZE (CM_HEADER_SIZE + CM_DATA_SIZE + CM_FOOTER_SIZE)
// + some other definitions
/* Packet header: three single-byte fields. */
typedef struct cm_header{
uint8_t PacketStart; //Start Indicator 0x5B [
uint8_t DeviceId; //ID Of the device which is sending
uint8_t PacketType;
} CM_Header;
/*
 * Packet footer.
 * NOTE(review): a uint16_t followed by a uint8_t is normally padded to 4 bytes,
 * so sizeof(CM_Footer) may exceed CM_FOOTER_SIZE -- verify on the target.
 */
typedef struct cm_footer {
uint16_t DataCrc; //CRC of the 'Data' part of CM_Packet
uint8_t PacketEnd; //should be 0X5D or ]
} CM_Footer;
//Here I am trying to convert a few u8[4] to u32 (4*u32 = 16 bytes, hence data size)
/*
 * NOTE(review): each Value_0_x is declared as a 2-bit bit-field, so only the low
 * two bits of an assigned byte are kept and the four fields together cover 8
 * bits, not the 32 bits of Value_0. Confirm whether plain uint8_t members were
 * intended.
 */
typedef struct cm_data {
union {
struct{
uint8_t Value_0_0:2;
uint8_t Value_0_1:2;
uint8_t Value_0_2:2;
uint8_t Value_0_3:2;
};
uint32_t Value_0;
};
//same thing for Value_1, 2 and 3
} CM_Data;
/*
 * Whole packet as held in memory.
 * NOTE(review): because CM_Data contains a uint32_t, padding is likely inserted
 * after Header, so sizeof(CM_Packet) need not equal CM_PACKET_SIZE.
 */
typedef struct cm_packet {
CM_Header Header;
CM_Data Data;
CM_Footer Footer;
} CM_Packet;
/* Handler: a device id plus the packet being built. */
typedef struct cm_inittypedef{
uint8_t DeviceId;
CM_Packet Packet;
} CM_InitTypeDef;
/* Outcome of CM_AppendData: a result code and a reason code. */
typedef struct cm_appendresult{
uint8_t Result;
uint8_t Reason;
} CM_AppendResult;
/* Defined in packet.c. */
extern CM_InitTypeDef cmHandler;
The goal here is to make reliable structure for transmitting data over USB interface. At the end the CM_Packet should be converted to an uint8_t array and be given to data transmit register of an mcu (stm32).
In the main.c file I try to init the structure as well as some other stuff related to this packet:
/* MAIN.C */
/* Transmit buffer that receives the serialised packet. */
uint8_t packet[CM_PACKET_SIZE];

/* Initialise the global packet handler, then keep sending its packet over USB CDC. */
int main(void) {
    //use the extern defined in packet.h to init the struct
    cmHandler.DeviceId = 0x01; //assign device id
    CM_Init(&cmHandler); //construct the handler
    //rest of stuff
    while(1) {
        CM_GetPacket(&cmHandler, packet);
        // pass the array itself (decays to uint8_t *); &packet has type
        // uint8_t (*)[CM_PACKET_SIZE], which is not what the callee expects
        CDC_Transmit_FS(packet, CM_PACKET_SIZE);
    }
}
And here is the implementation of packet.h which screws up everything so bad. I added the packet[CM_PACKET_SIZE] to watch but it is like it is just being generated randomly. Sometimes by pure luck I can see in this array some of the values that I am interested in! but it is like 1% of the time!
/* PACKET.C */
/* The global packet handler declared extern in packet.h. */
CM_InitTypeDef cmHandler;

/*
 * Prepare a handler: record the device id in the global cmHandler, stamp a
 * function-static template packet with the start marker, device id and end
 * marker, and copy that template into cm_initer->Packet.
 */
void CM_Init(CM_InitTypeDef *cm_initer) {
    static CM_Packet packetTemplate;
    CM_Header *header = &packetTemplate.Header;

    cmHandler.DeviceId = cm_initer->DeviceId;

    header->DeviceId = cm_initer->DeviceId;
    header->PacketStart = CM_START;
    packetTemplate.Footer.PacketEnd = CM_END;

    cm_initer->Packet = packetTemplate;
}
/*
 * Store the four bytes at data into the packet field group selected by
 * identifier.
 *
 * Returns {CM_APPEND_SUCCESS, 0x00} when identifier is handled, otherwise
 * {CM_APPEND_FAILURE, CM_APPEND_CASE_ERROR} and the packet is left unchanged.
 */
CM_AppendResult CM_AppendData(CM_InitTypeDef *handler, uint8_t identifier,
        uint8_t *data){
    CM_AppendResult outcome;
    CM_Data *payload = &handler->Packet.Data;

    // assume success; only the default branch downgrades it
    outcome.Result = CM_APPEND_SUCCESS;
    outcome.Reason = 0x00;

    switch(identifier){
    case CM_VALUE_0:
        payload->Value_0_0 = data[0];
        payload->Value_0_1 = data[1];
        payload->Value_0_2 = data[2];
        payload->Value_0_3 = data[3];
        break;
    //Also cases for CM_VALUE_0, 1 , 2
    //to build up the CM_Data struct of CM_Packet
    default:
        outcome.Result = CM_APPEND_FAILURE;
        outcome.Reason = CM_APPEND_CASE_ERROR;
        break;
    }
    return outcome;
}
/*
 * Serialise handler->Packet into packet, which must have room for
 * CM_PACKET_SIZE bytes.
 *
 * The fields are written one by one so that compiler-inserted padding in
 * CM_Packet never reaches the wire: 3 header bytes, CM_DATA_SIZE data bytes,
 * then the CRC (low byte first, matching the little-endian target) and the
 * end marker.
 */
void CM_GetPacket(CM_InitTypeDef *handler, uint8_t *packet){
    const CM_Packet *src = &handler->Packet;
    uint8_t *out = packet;

    // the data block is copied as raw bytes, so it must be exactly CM_DATA_SIZE long
    _Static_assert(sizeof(CM_Data) == CM_DATA_SIZE, "CM_Data must be CM_DATA_SIZE bytes");

    *out++ = src->Header.PacketStart;
    *out++ = src->Header.DeviceId;
    *out++ = src->Header.PacketType;

    memcpy(out, &src->Data, CM_DATA_SIZE);
    out += CM_DATA_SIZE;

    *out++ = (uint8_t)(src->Footer.DataCrc & 0xFFu);
    *out++ = (uint8_t)(src->Footer.DataCrc >> 8);
    *out = src->Footer.PacketEnd;
}
So, the problem is that this code gives me random stuff 99% of the time. The packet never has CM_START, the start indicator, set to the value I want. But most of the time it has the CM_END byte set correctly! I am really confused and can't find the reason. Working on an embedded platform which is hard to debug, I am kind of lost here...
If you transfer data to another (different) architecture, do not just pass a structure as a blob. That is the way to hell: endianess, alignment, padding bytes, etc. all can (and likely will) cause trouble.
Better to serialize the struct in a conforming way, possibly using some interpreted control stream so you do not have to write every field out manually. (But still use standard functions to generate that stream.)
Some areas of potential or likely trouble:
CM_Footer: The second field might very well start at a 32- or 64-bit boundary, so the preceding field will be followed by padding. Also, the end of that struct is very likely to be padded by at least 1 byte on a 32-bit architecture to allow for proper alignment if used in an array (the compiler does not care whether you actually need this). It might even be 8-byte aligned.
CM_Data: Here you likely (not guaranteed) get one uint8_t with 4*2 bits, with the ordering not standardized. The field may be followed by 3 unused bytes which are required for the uint32_t interpretation of the union.
How do you guarantee the same endianess (for >uint8_t: high byte first or low byte first?) for host and target?
In general, the structs/unions need not have the same layout for host and target. Even if the same compiler is used, their ABIs may differ, etc. Even if it is the same CPU, there might be other system constraints. Also, for some CPUs, different ABIs (application binary interface) exist.

Error while trying to update array element

I am working on an embedded platform which does not have debugging features. So it is hard to say what is the error source.
I have defined in header file:
/* Packet as held in memory; Data is a pointer to storage owned elsewhere. */
typedef struct cm_packet {
CM_Header Header; //header of packet 3 bytes
uint8_t *Data; //packet data 64 bytes
CM_Footer Footer; //footer of packet 3 bytes
} CM_Packet;
/* Handler: a device id plus the packet being built. */
typedef struct cm_inittypedef{
uint8_t DeviceId;
CM_Packet Packet;
} CM_InitTypeDef;
/* Defined in the implementation file. */
extern CM_InitTypeDef cmHandler;
/* Initialise a handler (fills in its Packet, including the Data pointer). */
void CM_Init(CM_InitTypeDef *handler);
/* Append identifier followed by length bytes from data to handler->Packet.Data. */
CM_AppendResult CM_AppendData(CM_InitTypeDef *handler, uint8_t identifier
, uint8_t *data, uint8_t length);
And somewhere in implementation I have:
/* Next write position inside the packet data buffer used by CM_AppendData. */
uint8_t bufferIndex = 0;

/*
 * Initialise a handler: record the device id in the global cmHandler and give
 * cm_initer a packet with start/end markers, the device id, and an empty data
 * buffer.
 *
 * The data buffer has static storage duration so that Packet.Data stays valid
 * after this function returns. Pointing Data at a local array would leave it
 * dangling. All handlers initialised here share this one buffer.
 */
void CM_Init(CM_InitTypeDef *cm_initer) { //init a handler
    static uint8_t dataBuffer[CM_MAX_DATA_SIZE];
    CM_Packet cmPacket = {0};   // no field is copied out uninitialised

    cmHandler.DeviceId = cm_initer->DeviceId;

    cmPacket.Header.DeviceId = cm_initer->DeviceId;
    cmPacket.Header.PacketStart = CM_START;
    cmPacket.Footer.PacketEnd = CM_END;

    //initialize data array (also clears leftovers from a previous init)
    for (unsigned i = 0; i < CM_MAX_DATA_SIZE; i++) {
        dataBuffer[i] = 0x00;
    }
    bufferIndex = 0;
    cmPacket.Data = dataBuffer;

    cm_initer->Packet = cmPacket;
}
/*
 * Append one record (identifier byte followed by length data bytes) to
 * handler->Packet.Data at the shared write position bufferIndex.
 *
 * The record is never written past the 64-byte data buffer:
 *  - a record that can never fit (1 + length > 64), or a handler without a
 *    data buffer, is rejected;
 *  - if the record does not fit in the remaining space, the current packet is
 *    marked ready and writing restarts at index 0;
 *  - when the buffer becomes exactly full, the packet is marked ready.
 *
 * NOTE(review): as in the original flow, the write index is reset right after
 * PacketReady(), so PacketReady() must consume or copy the buffer before it
 * returns. The result codes are assumed to be the CM_APPEND_* constants from
 * packet.h, and no dedicated "overflow" reason code is visible here -- confirm
 * both.
 */
CM_AppendResult CM_AppendData(CM_InitTypeDef *handler, uint8_t identifier
        , uint8_t *data, uint8_t length){
    enum { DATA_CAPACITY = 64 };    // packet data is 64 bytes
    const unsigned needed = 1u + length;
    CM_AppendResult result;
    uint8_t i;

    if (!handler->Packet.Data || needed > DATA_CAPACITY) {
        result.Result = CM_APPEND_FAILURE;
        result.Reason = 0x00;       // TODO: use a dedicated reason code
        return result;
    }

    // not enough room left: hand off the current packet and start a new one
    if (bufferIndex + needed > DATA_CAPACITY) {
        PacketReady(); //mark packet as ready
        bufferIndex = 0;
    }

    handler->Packet.Data[bufferIndex++] = identifier;
    //now add the data itself
    for(i = 0; i < length; i++) {
        handler->Packet.Data[bufferIndex++] = data[i];
    }

    //reset indexer once the buffer is exactly full
    if(bufferIndex >= DATA_CAPACITY) {
        PacketReady(); //mark packet as ready
        bufferIndex = 0;
    }

    result.Result = CM_APPEND_SUCCESS;
    result.Reason = 0x00;
    return result;
}
The idea is to update the Packet.Data from some other source codes which have access to the handler. For example some other sources can call that Append function to change Packet.Data. But as you see in the code, I have commented the place which causes the micro-controller to go in hard fault mode. I am not sure what is happening here. All I know is exactly at that line micro goes into hard fault mode and never recovers!
This might be a race condition but before anything else I want to know I have written correct c !!! code then I try to rule out other problems.
In function CM_Init, you are setting cmPacket.Data to point to a local array:
uint8_t emptyBuffer[CM_MAX_DATA_SIZE] = {0x00};
cmPacket.Data = emptyBuffer;
Accessing this memory address outside the scope of the function yields undefined behavior.
As #barak manos mentioned, the buffer supplied to Data is allocated on the stack.
When you get to CM_AppendData, you are writing over memory that is no longer dedicated to the buffer.
You may want to use malloc so that the buffer is allocated on the heap instead of on the stack. Just remember to call free so that you are not leaking memory.
If you can't use dynamic allocation, it's possible to dedicate some scratch memory for all the Data uses. It just needs to be static.
Hope that helps :)

Resources