printf causes code crash

printf causes code crash - c

I am working on MODBUS creating library. Library is working fine for testing purpose I am using a Modbus based Power meter. I am facing a very strange problem.
Trying to print value on console, by using printf. By fetching first value side by side I am printing it, on second fetch value is coming in array correctly but on using pritnf code get crashed. :(
I have tried to debug the code also and came to know on using printf for a value second time code get crashed.
Here is part of code:
int wmain(void)
{
FLOAT dataFloat[5] = {0};
CHAR dataBuffer[200] = {0};
// Initializing Structure declared in library header
psModbusRTU pModbus =(psModbusRTU)malloc(sizeof(psModbusRTU));
// Array pointer to store data
BYTE *readData = (BYTE *) malloc(NUMBER_OF_READ_REGISTERS * sizeof(BYTE));
memset(readData, 0, NUMBER_OF_READ_REGISTERS * sizeof(BYTE));
// Function to read the voltage
if (!ModbusRTUReadRegister(pModbus, FC_READ_HOLDING_REGISTERS, VOLTAGE_REGISTER_ADDRESS, NUMBER_OF_READ_REGISTERS, readData))
{
printf("\nERROR Read register error %d", GetLastError());
getchar();
return 0;
}
dataFloat[0] = hexToFloatPointConversion(readData);
printf("\nvoltage : %f", dataFloat[0]); //(THIS IS FIRST PRINTF TILL HERE CODE IS FINE)
/// Function reads the frequnecy
if (!ModbusRTUReadRegister(pModbus, FC_READ_HOLDING_REGISTERS, FREQUENCY_REGISTER_ADDRESS, NUMBER_OF_READ_REGISTERS, readData))
{
printf("\nERROR Read register error %d", GetLastError());
getchar();
return 0;
}
dataFloat[1] = hexToFloatPointConversion(readData); // Saving frequency value in array
printf("\nFrequency : %f", dataFloat[1]); // (THIS IS WHERE PROBLEM ARISES CODE CRASH WITH NO REASON)
hexToFloatPointConversion Function:
FLOAT hexToFloatPointConversion(BYTE *readData)
{
DWORD dataHex = 0;
DWORD exponent = 0;
DWORD fraction = 0;
DWORD signBit = 0;
FLOAT dataFloat = 0;
/// Saving BYTE array data into one DWORD
dataHex = readData[0];
dataHex = dataHex << 8;
dataHex = dataHex + readData[1];
dataHex = dataHex << 8;
dataHex = dataHex + readData[2];
dataHex = dataHex << 8;
dataHex = dataHex + readData[3];
/// Getting sign bit, exponent and fraction
signBit = dataHex >> 31;
exponent = (dataHex >> 23 & 0xFF);
fraction = dataHex & 0x7FFFFF;
/// Calculation for converting data into float.
dataFloat = (pow(2, (exponent - SINGLE_PRECISION_CONSTANT)) * ( 1 + (fraction * pow(2, -23))));
return dataFloat;
}
psModbusRTU structure:
typedef struct sModbusRTU
{
HANDLE hModbus;
BYTE slaveNumber;
BYTE functionCode;
BOOL fModbusInitialize;
BOOL fSlaveAddress;
BOOL fModbusOpen;
BOOL fException;
WCHAR portName[16];
DWORD exceptionData;
DCB dcb;
}sModbusRTU, *psModbusRTU;
Tried to debug also in run time and looked into dis-assembly break point but did not get exactly why it is happening.
Here is the dis-assembly:
40050658 str r3, [sp, #0x14]
4005065C str r5, [sp, #8]
40050660 str r2, [sp]
40050664 str r4, [sp, #4]
40050668 bl 40031078
4005066C __debugbreak_ce //Break point
40050670 mvn r3, #0x37, 24
40050674 eor r3, r3, #0xAB
40050678 ldr r2, [r3]
4005067C movw r3, #0xF7F8
40050680 movt r3, #0xF101
Please help me out, my project is stuck in middle just because of this small problem.
Please respond back soon

Wrong size passed to malloc().
// was psModbusRTU pModbus =(psModbusRTU)malloc(sizeof(psModbusRTU));
// -----------------------------------------------v--------------
psModbusRTU pModbus = (psModbusRTU) malloc(sizeof(*psModbusRTU));
Suggest alternate malloc() idiom. Use sizeof(*target).
Also, unless you compiler requires it (which it shouldn't), recommend dropping the cast
// example
psModbusRTU *pModbus = malloc(sizeof(*pModbus));

The %f specifier is expecting to format a double (8 bytes) and not a float (4 bytes) (See this MSDN page for details. Try casting the values to double in your printf statements, e.g. printf("\nvoltage : %f", (double) dataFloat[0]);

Related

How can I make this loop run faster?

I'm using this code to find the highest temperature pixel in a thermal image and the coordinates of the pixel.
void _findMax(uint16_t *image, int sz, sPixelData *returnPixel)
{
int temp = 0;
for (int i = sz; i > 0; i--)
{
if (returnPixel->temperature < *image)
{
returnPixel->temperature = *image;
temp = i;
}
image++;
}
returnPixel->x_location = temp % IMAGE_HORIZONTAL_SIZE;
returnPixel->y_location = temp / IMAGE_HORIZONTAL_SIZE;
}
With an image size of 640x480 it takes around 35ms to run through this function, which is too slow for what I need it for (under 10ms ideally).
This is executing on an ARM A9 processor running Linux.
The compiler I'm using is ARM v8 32-Bit Linux gcc compiler.
I'm using optimize -O3 and the following compile options: -march=armv7-a+neon -mcpu=cortex-a9 -mfpu=neon-fp16 -ftree-vectorize.
This is the output from the compiler:
000127f4 <_findMax>:
for(int i = sz; i > 0; i--)
127f4: e3510000 cmp r1, #0
{
127f8: e52de004 push {lr} ; (str lr, [sp, #-4]!)
for(int i = sz; i > 0; i--)
127fc: da000014 ble 12854 <_findMax+0x60>
12800: e1d2c0b0 ldrh ip, [r2]
12804: e2400002 sub r0, r0, #2
int temp = 0;
12808: e3a0e000 mov lr, #0
if(returnPixel->temperature < *image)
1280c: e1f030b2 ldrh r3, [r0, #2]!
12810: e153000c cmp r3, ip
returnPixel->temperature = *image;
12814: 81a0c003 movhi ip, r3
12818: 81a0e001 movhi lr, r1
1281c: 81c230b0 strhhi r3, [r2]
for(int i = sz; i > 0; i--)
12820: e2511001 subs r1, r1, #1
12824: 1afffff8 bne 1280c <_findMax+0x18>
12828: e30c3ccd movw r3, #52429 ; 0xcccd
1282c: e34c3ccc movt r3, #52428 ; 0xcccc
12830: e0831e93 umull r1, r3, r3, lr
12834: e1a034a3 lsr r3, r3, #9
12838: e0831103 add r1, r3, r3, lsl #2
1283c: e6ff3073 uxth r3, r3
12840: e04ee381 sub lr, lr, r1, lsl #7
12844: e6ffe07e uxth lr, lr
returnPixel->x_location = temp % IMAGE_HORIZONTAL_SIZE;
12848: e1c2e0b4 strh lr, [r2, #4]
returnPixel->y_location = temp / IMAGE_HORIZONTAL_SIZE;
1284c: e1c230b6 strh r3, [r2, #6]
}
12850: e49df004 pop {pc} ; (ldr pc, [sp], #4)
for(int i = sz; i > 0; i--)
12854: e3a03000 mov r3, #0
12858: e1a0e003 mov lr, r3
1285c: eafffff9 b 12848 <_findMax+0x54>
For clarity after comments:
Each pixel is a unsigned 16 bit integer, image[0] would be the pixel with coordinates 0,0, and the last in the array would have the coordinates 639,479.

This is executing on an ARM A9 processor running Linux.
ARM Cortex-A9 supports Neon.
With this in mind the goal should be to load 8 values (128 bits of pixel data) into a register, then do "compare with the current maximums for each of the 8 places" to get a mask, then use the mask and its inverse to mask out the "too small" old maximums and the "too small" new values; then OR the results to merge the new higher values into the "current maximums for each of the 8 places".
Once that has been done for all pixels (using a loop); you'd want to find the highest value in the "current maximums for each of the 8 places".
However; to find the location of the hottest pixel (rather than just how hot it is) you'd want to split the image into tiles (e.g. maybe 8 pixels wide and 8 pixels tall). This allows you to find the max. temperature within each tile (using Neon); then find the pixel within the hottest tile. Note that for huge images this lends itself to a "multi-layer" approach - e.g. create a smaller image containing the maximum from each tile in the original image; then do the same thing again to create an even smaller image containing the maximum from each "group of tiles", then ...
Making this work in plain C means trying to convince the compiler to auto-vectorize. The alternatives are to use compiler intrinsics or inline assembly. In any of these cases, using Neon to do 8 pixels in parallel (without any branches) could/should improve performance significantly (how much depends on RAM bandwidth).

You should minimize memory access, especially in loops.
Every * or -> could cause unnecessary memory access resulting in serious performance hits.
Local variables are your best friend:
void _findMax(uint16_t *image, int sz, sPixelData *returnPixel)
{
int temp = 0;
uint16_t temperature = returnPixel->temperature;
uint16_t pixel;
for (int i = sz; i > 0; i--)
{
pixel = *image++;
if (temperature < pixel)
{
temperature = pixel;
temp = i;
}
}
returnPixel->temperature = temperature;
returnPixel->x_location = temp % IMAGE_HORIZONTAL_SIZE;
returnPixel->y_location = temp / IMAGE_HORIZONTAL_SIZE;
}
Below is how this can be optimized by utilizing neon:
#include <stdint.h>
#include <arm_neon.h>
#include <assert.h>
static inline void findMax128_neon(uint16_t *pDst, uint16x8_t *pImage)
{
uint16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15;
uint16x4_t dmax;
in0 = vld1q_u16(pImage++);
in1 = vld1q_u16(pImage++);
in2 = vld1q_u16(pImage++);
in3 = vld1q_u16(pImage++);
in4 = vld1q_u16(pImage++);
in5 = vld1q_u16(pImage++);
in6 = vld1q_u16(pImage++);
in7 = vld1q_u16(pImage++);
in8 = vld1q_u16(pImage++);
in9 = vld1q_u16(pImage++);
in10 = vld1q_u16(pImage++);
in11 = vld1q_u16(pImage++);
in12 = vld1q_u16(pImage++);
in13 = vld1q_u16(pImage++);
in14 = vld1q_u16(pImage++);
in15 = vld1q_u16(pImage);
in0 = vmaxq_u16(in1, in0);
in2 = vmaxq_u16(in3, in2);
in4 = vmaxq_u16(in5, in4);
in6 = vmaxq_u16(in7, in6);
in8 = vmaxq_u16(in9, in8);
in10 = vmaxq_u16(in11, in10);
in12 = vmaxq_u16(in13, in12);
in14 = vmaxq_u16(in15, in14);
in0 = vmaxq_u16(in2, in0);
in4 = vmaxq_u16(in6, in4);
in8 = vmaxq_u16(in10, in8);
in12 = vmaxq_u16(in14, in12);
in0 = vmaxq_u16(in4, in0);
in8 = vmaxq_u16(in12, in8);
in0 = vmaxq_u16(in8, in0);
dmax = vmax_u16(vget_high_u16(in0), vget_low_u16(in0));
dmax = vpmax_u16(dmax, dmax);
dmax = vpmax_u16(dmax, dmax);
vst1_lane_u16(pDst, dmax, 0);
}
void _findMax_neon(uint16_t *image, int sz, sPixelData *returnPixel)
{
assert((sz % 128) == 0);
const uint32_t nSector = sz/128;
uint16_t max[nSector];
uint32_t i, s, nMax;
uint16_t *pImage;
for (i = 0; i < nSector; ++i)
{
findMax128_neon(&max[i], (uint16x8_t *) &image[i*128]);
}
s = 0;
nMax = max[0];
for (i = 1; i < nSector; ++i)
{
if (max[i] > nMax)
{
s = i;
nMax = max[i];
}
}
if (nMax < returnPixel->temperature)
{
returnPixel->x_location = 0;
returnPixel->y_location = 0;
return;
}
pImage = &image[s];
i = 0;
while(1) {
if (*pImage++ == nMax) break;
i += 1;
}
i += 128 * s;
returnPixel->temperature = nMax;
returnPixel->x_location = i % IMAGE_HORIZONTAL_SIZE;
returnPixel->y_location = i / IMAGE_HORIZONTAL_SIZE;
}
Beware that the function above assumes sz being a multiple of 128.
And yes, it will run in less than 10ms.

The culprit here is the slow linear search for the highest "temperature". I'm not quite sure how to improve that search algorithm with the information given, if at all possible (could you sort the data in advance?), but you could start with this:
uint16_t max = 0;
size_t found_index = 0;
for(size_t i=0; i<sz; i++)
{
if(max < image[i])
{
max = image[i];
found_index = sz - i - 1; // or whatever makes sense here for the algorithm
}
}
returnPixel->temperature = max;
returnPixel->x_location = found_index % IMAGE_HORIZONTAL_SIZE;
returnPixel->y_location = found_index / IMAGE_HORIZONTAL_SIZE;
This might give a very slight performance gain because of the top to bottom iteration order and not touching unrelated memory returnPixel in the middle of the loop. max should get stored in a register and with luck you might get slightly better cache performance overall. Still, this comes with a branch like the original code, so it is a minor improvement.
Another micro-optimization is to change the parameter to const uint16_t* image - this might give slightly better pointer aliasing, in case returnPixel happens to contain a uint16_t too. image should be const regardless of performance, since const correctness is always good practice.
Further more obscure optimization tricks might be possible if you read image 32 or 64 bits at a time, then come up with a fast look-up method to find the largest image inside that 32/64 bit chunk.

If you have to find the hottest pixel in the image and if there is no structure to the image data itself then I think your stuck with iterating through the pixels. If so you have a number of different ways to make this faster:
As suggested above try loop unrolling and other micro-optimisation tricks, this might give you the performance boost you need
Go parallel, split the array into N chunks and find the MAX[N] for each chunk and then find the largest of the MAX[N] values. You have to be careful here as setting up the parallel processes can take longer than doing the work.
If there is some structure to the image, lots of cold pixels and a hot spot (larger than 1 pixel) that your trying to find say, then there are other techniques you could use.
One approach could be to split the image up into N boxes and then sample each box, The hottest pixel in the hottest box (and maybe in the boxes adjacent too it) would then be your result. However this depends on their being some structure to the image which you can rely on.

The assembly language reveals the compiler is storing in returnPixel->temperature each time a new maximum is found, with the instruction strhhi r3, [r2]. Eliminate this unnecessary store by caching the maximum in a local object and only updating returnPixel->temperature after the loop ends:
uint16_t maximum = returnPixel->temperature;
for (int i = sz; i > 0; i--)
{
if (maximum < *image)
{
maximum = *image;
temp = i;
}
image++;
}
returnPixel->temperature = maximum;
That is unlikely to reduce the execution time as much as you need, but it might if there is some bad cache or memory interaction occurring. It is a very simple change, so try it before moving on to the SIMD vectorizations suggested in other answers.
Regarding vectorization, two approaches are:
Iterate through the image using a vmax instruction to update the maximum value seen so far in each SIMD lane. Then consolidate the lanes to find the overall maximum. Then iterate through the image again looking for that maximum in any lane. (I forget what that architecture has for instructions that would assist in testing whether a comparison produced true in any lane.)
Iterate through the image maintaining three registers: One with the maximum seen so far in each lane, one with a counter of position in the image, and one with, in each lane, a record of the counter value at the time each new maximum was seen. The first can be updated with vmax, as above. The second can be updated with vadd. The third can be updated with vcmp and vbit. After the loop, figure out which lane has the maximum, and get the position of the maximum from the recorded counter for that lane.
Depending on the performance of the necessary instructions, a hybrid approach may be faster:
Set some strip size S. Partition the image into strips of that size. For each strip in the image, find the maximum (using the fast vmax loop described above). If the maximum is greater than seen in previous strips, remember it and the current strip number. After processing the whole image, the task has been reduced to finding the location of the maximum in a particular strip. Use the second loop from the first approach above for that. (For large images, further refinements may be possible, possibly refining the location using a shorter strip size before finding the exact location, depending on cache behavior and other factors.)

In my opinion you cannot improve that algorithim, because any array element can hold the maximum, so you need to do at least one pass through the data, I don't believe you can improve this without going multithreading. you can start several threads (as many as cores/processors you may have) and give each a subset of your image. Once they are finished, you will have as many local maximum values as the number of threads you started. just do a second pass on those vaues to get the maximum total value, and you are finished. But consider the extra workload of creating threads, allocating stack memory for them, and scheduling, as that can be higher if the number of values is low than the workload of running all in a single thread. If you have a thread pool somewhere to provide ready to run threads, and that is something you can get on, then probably you'll be able to finished in one Nth part of the time to run all the loop in one single processor (where N is the number of cores you have on the machine)
Note: using a dimension that is a power of two will save you the job of calculating a quotient and a remainder by solving the division problem with a bit shift and a bit mask. You use it only once in your function, but it's an improving, anyway.

Determine length in bytes of content of a variable

I want to automatically determine length in bytes of a field (addr) that is uint32, based on it's contents. Compiler is GCC. I use this:
uint8 len;
if(addr < 256) len = 1;
else if (addr < 65536) len = 2;
else if (addr < 16777216) len = 3;
else len = 4;
Is there a more efficient way?
This is inside a SPI function for a embedded device. I'm interested in the fastest way except macros, since addr can be a variable.

You can do it using an approach similar to binary search: first compare to 65536, then either to 256 or 16777216, depending on the outcome of the first comparison. This way you always finish in two comparisons, while your code sometimes would require three:
uint8 len = (addr < 65536)
? ((addr < 256) ? 1 : 2)
: ((addr < 16777216) ? 3 : 4);

gcc has a __builtin_ctz() function which can be used like
if (addr == 0)
len = 0;
else
len = (sizeof(int) * 8 - __builtin_ctz(addr) + 7) / 8;
Update:
under ARM, this compiles to
cmp r0, #0
rbitne r0, r0
clzne r0, r0
rsbne r0, r0, #39
lsrne r0, r0, #3
bx lr

Get the position of the highest bit – the only one that counts for your question (from https://stackoverflow.com/a/14085901). Divide by 8 to get the number of bytes.
addr = 0x20424;
printf ("%d\n", (fls(addr)+7)>>3);
It returns 0 when addr == 0.
fls is conforming to POSIX.1-2001, POSIX.1-2008, 4.3BSD. If your current system does not contain it, look at the above link or What is the fastest/most efficient way to find the highest set bit (msb) in an integer in C? for more suggestions to find the highest bit set.

Why does storing a %x value to a variable do funky things when leading value is 8-f?

I am writing a program in c which imitates an LC-3 simulator. One of the objectives of this program is to store a 4 digit hexadecimal value from a file (0000 - ffff), and to convert it to binary, and interpret an LC-3 instruction from it. The following code segment shows how I am storing this value into a variable (which is where the problem seems to lie), and below that is the output I am receiving:
int *strstr(int s, char c);
void initialize_memory(int argc, char *argv[], CPU *cpu) {
FILE *datafile = get_datafile(argc, argv);
// Buffer to read next line of text into
#define DATA_BUFFER_LEN 256
char buffer[DATA_BUFFER_LEN];
int counter = 0;
// Will read the next line (words_read = 1 if it started
// with a memory value). Will set memory location loc to
// value_read
//
int value_read, words_read, loc = 0, done = 0;
char comment;
char *read_success; // NULL if reading in a line fails.
int commentLine =0;
read_success = fgets(buffer, DATA_BUFFER_LEN, datafile);
while (read_success != NULL && !done) {
// If the line of input begins with an integer, treat
// it as the memory value to read in. Ignore junk
// after the number and ignore blank lines and lines
// that don't begin with a number.
//
words_read = sscanf(buffer, "%04x%c", &value_read, &comment);
// if an integer was actually read in, then
// set memory value at current location to
// value_read and increment location. Exceptions: If
// loc is out of range, complain and quit the loop. If
// value_read is outside 0000 and ffff, then it's a
// sentinel -- we should say so and quit the loop.
if (value_read == NULL || comment ==';')
{
commentLine = 1;
}
if (value_read < -65536 || value_read > 65536)
{
printf("Sentinel read in place of Memory location %d: quitting loop\n", loc);
break;
}
else if (value_read >= -65536 && value_read <= 65536)
{
if (commentLine == 0)
{
if (counter == 0)
{
loc = value_read;
cpu -> memLocation = loc;
printf("\nPC location set to: x%04x \n\n", cpu -> memLocation);
counter++;
}
else
{
cpu -> mem[loc] = value_read;
printf("x%04x : x%d\t %04x \t ", loc,loc, cpu -> mem[loc]);
print_instr(cpu, cpu -> mem[loc]);
loc++;
value_read = NULL;
}
}
}
if (loc > 65536)
{
printf("Reached Memory limit, quitting loop.\n", loc);
break;
}
commentLine = 0;
read_success = fgets(buffer, DATA_BUFFER_LEN, datafile);
// Gets next line and continues the loop
}
fclose(datafile);
// Initialize rest of memory
while (loc < MEMLEN) {
cpu -> mem[loc++] = 0;
}
}
My aim is to show the Hex address : decimal address, the hex instruction, binary code, and then at the end, its LC-3 instruction translation. The data I am scanning from the file is the hex instruction:
x1000 : x4096 200c 0010000000001100 LD, R0, 12
x1001 : x4097 1221 0001001000100000 ADD, R1, R0, 0
x1002 : x4098 1401 0001010000000000 ADD, R2, R0, R0
x1003 : x4099 ffff94bf 0000000000000000 NOP
x1004 : x4100 166f 0001011001101110 ADD, R3, R1, 14
x1005 : x4101 1830 0001100000110000 ADD, R4, R0, -16
x1006 : x4102 1b04 0001101100000100 ADD, R5, R4, R4
x1007 : x4103 5d05 0101110100000100 AND, R6, R4, R4
x1008 : x4104 5e3f 0101111000111110 AND, R7, R0, -2
x1009 : x4105 5030 0101000000110000 AND, R0, R0, -16
x100a : x4106 52ef 0101001011101110 AND, R1, R3, 14
x100b : x4107 5fe0 0101111111100000 AND, R7, R7, 0
x100c : x4108 fffff025 0000000000000000 NOP
x100d : x4109 7fff 0111111111111110 STR, R7, R7, -2
As you can see, my problem lies in addresses x1003 and x100c;
As stated in the headline, when storing the hex instruction, if the value is between 8 and f, my best guess is that the scan is interpreting it as a negative value because of the leading value of the first hex digit in binary. If that is the case, it makes perfect sense, but is there a way I can bypass this? And if it isn't the case, what else could be causing this?
I found that if I pass value_read into print_instr() instead of cpu -> mem[loc], then the output works correctly. However, this is only a temporary fix as I need to store that value for later use in the program(for actual execution of the instruction). So the problem seems to arise while storing, and I am unsure as to why.
Additionally, (and this is a side question) though it is not a pressing concern, since I am using %x%c (value_read, comment) to store values from the file, I have been having trouble with the first few lines of the .hex file I am using, in which there is no hex value in the line, but instead just a comment symbol (for those unfamiliar with lc_3 simulators, the ';' is the symbol for comments). Whenever this is the case, I get a hex value of zero, although I wish for it to be NULL(In my program, I implemented a temporary solution because I am not sure how to fix it). I am not an expert in c just yet, and have not been able to find a solution to this problem. If you can help, it would be greatly appreciated, otherwise, it isn't a big issue for what I am trying to achieve with this program, it is more so just for my own knowledge and growth.
Thank you all in advance for your help :)

In a scanf family format string, the %x specifier means to read into an unsigned int. The corresponding argument must have exactly the type unsigned int *.
However you supply an argument of type int *.
This causes undefined behaviour. What you are seeing is the chance interaction between library elements that expect you to follow the rules, and your code that didn't follow the rules.
To fix it, follow the rules. For example, read into an unsigned int variable.
NB. 0 does nothing in the scanf format string; %04x is equivalent to %4x.

May I suppose that cpu->mem is of type array of short or alike? Then sign extension occurs when printing cpu->mem[loc]. Remind that arguments are at least converted to int at printf calls. Symptom is the same as in the following code:
int i;
scanf("%4x",&i);
printf("%x\n",i);
short s = i;
printf("--> %x\n",s);
The short equals to -1 then when you set it to an int it is converted to -1, 0xffffffff (if 32-bits).
Use unsigned short in place.

ARMCC 5 optimization of strtol and strtod

I have a board based on STM32L4 MCU (Ultra Low Power Cortex-M4) for GNSS tracking purposes. I don't use RTOS, so I use a custom scheduler. Compiler and environment is KEIL uVision 5 (compiler 5.05 and 5.06, behavior doesn't change)
The MCU speaks with GNSS module via plain UART and the protocol is a mix of NMEA and AT. GNSS position is given as plain text that must be converted to a pair of float/double coordinates.
To get the double/float value from text, I use strtod (or strtof).
Note that string operations are made in a separate buffer, different from the UART RX one.
The typical string for a latitude on the UART is
4256.45783
which means 42° 56.45783'
to get absolute position in degrees, I use the following formula
42 + 56.45783 / 60
When there is no optimization the code works fine and the position is converted right. When I turn on level 1 optimization (or higher), if I use standard C library I can convert the integer part (42 in the example) and when it comes to convert 56.45783, I get only 56 (so the integer part of minutes until the dot).
If I get rid of standard library and I use a custom strtod function downloaded from ANSI C source library I simply get 0 with ERANGE error.
In other parts of the code I use strtol, which has a strange behavior when L1 optimization is turned ON: when the first digit is 9 and conversion base is 10 it simply skips that 9 going on with the other digits.
So if in the buffer I have 92, I will get just 2 parsed. To get rid of this I simply prepended a sign + to the number and the result is always OK (as far as I can tell). This WA doesn't work with strtod.
Note that I tried to use static, volatile and on-stack variables, behavior doesn't change.
EDIT: I simplified the code in order to get where it goes wrong, as per comments hereafter
C code is like this:
void GnssStringToLatLonDegMin(const char* str, LatLong_t* struc)
{
double dbl = 0.0;
dbl = strtod("56.45783",NULL);
if(struc != NULL)
{
struc->Axis = (float)((dbl / 60.0) + 42.0);
}
}
Level 0 optimization:
559: void GnssStringToLatLonDegMin(const char* str, LatLong_t* struc)
0x08011FEE BDF8 POP {r3-r7,pc}
560: {
0x08011FF0 B570 PUSH {r4-r6,lr}
0x08011FF2 4605 MOV r5,r0
0x08011FF4 ED2D8B06 VPUSH.64 {d8-d10}
0x08011FF8 460C MOV r4,r1
561: double dbl = 0.0;
0x08011FFA ED9F0BF8 VLDR d0,[pc,#0x3E0]
0x08011FFE EEB08A40 VMOV.F32 s16,s0
0x08012002 EEF08A60 VMOV.F32 s17,s1
562: dbl = strtod("56.45783",NULL);
0x08012006 2100 MOVS r1,#0x00
0x08012008 A0F6 ADR r0,{pc}+4 ; #0x080123E4
0x0801200A F7FDFED1 BL.W __hardfp_strtod (0x0800FDB0)
0x0801200E EEB08A40 VMOV.F32 s16,s0
0x08012012 EEF08A60 VMOV.F32 s17,s1
563: if(struc != NULL)
564: {
0x08012016 B1A4 CBZ r4,0x08012042
565: struc->Axis = (float)((dbl / 60.0) + 42.0);
566: }
0x08012018 ED9F0BF5 VLDR d0,[pc,#0x3D4]
0x0801201C EC510B18 VMOV r0,r1,d8
0x08012020 EC532B10 VMOV r2,r3,d0
0x08012024 F7FEF880 BL.W __aeabi_ddiv (0x08010128)
0x08012028 EC410B1A VMOV d10,r0,r1
0x0801202C ED9F0BF2 VLDR d0,[pc,#0x3C8]
0x08012030 EC532B10 VMOV r2,r3,d0
0x08012034 F7FDFFBC BL.W __aeabi_dadd (0x0800FFB0)
0x08012038 EC410B19 VMOV d9,r0,r1
0x0801203C F7FDFF86 BL.W __aeabi_d2f (0x0800FF4C)
0x08012040 6020 STR r0,[r4,#0x00]
567: }
LEVEL 1 optimization
557: void GnssStringToLatLonDegMin(const char* str, LatLong_t* struc)
0x08011FEE BDF8 POP {r3-r7,pc}
558: {
559: double dbl = 0.0;
0x08011FF0 B510 PUSH {r4,lr}
0x08011FF2 460C MOV r4,r1
560: dbl = strtod("56.45783",NULL);
0x08011FF4 2100 MOVS r1,#0x00
0x08011FF6 A0F7 ADR r0,{pc}+2 ; #0x080123D4
0x08011FF8 F7FDFEDA BL.W __hardfp_strtod (0x0800FDB0)
561: if(struc != NULL)
562: {
0x08011FFC 2C00 CMP r4,#0x00
0x08011FFE D010 BEQ 0x08012022
563: struc->Axis = (float)((dbl / 60.0) + 42.0);
564: }
0x08012000 ED9F1BF7 VLDR d1,[pc,#0x3DC]
0x08012004 EC510B10 VMOV r0,r1,d0
0x08012008 EC532B11 VMOV r2,r3,d1
0x0801200C F7FEF88C BL.W __aeabi_ddiv (0x08010128)
0x08012010 ED9F1BF5 VLDR d1,[pc,#0x3D4]
0x08012014 EC532B11 VMOV r2,r3,d1
0x08012018 F7FDFFCA BL.W __aeabi_dadd (0x0800FFB0)
0x0801201C F7FDFF96 BL.W __aeabi_d2f (0x0800FF4C)
0x08012020 6020 STR r0,[r4,#0x00]
565: }
I looked at the disassembly of __hardfp_strtod and __strtod_int called by these functions and, as they are incorporated as binaries, they don't change with respect of optimization level.

Due to optimization, strtod didn't work.
Thanks to #old_timer, I had to make my own strtod function, which works even with optimization level set at level 2.
double simple_strtod(const char* str)
{
int8 inc;
double result = 0.0;
char * c_tmp;
c_tmp = strchr(str, '.');
if(c_tmp != NULL)
{
c_tmp++;
inc = -1;
while(*c_tmp != 0 && inc > -9)
{
result += (*c_tmp - '0') * pow(10.0, inc);
c_tmp++; inc--;
}
inc = 0;
c_tmp = strchr(str, '.');
c_tmp--;
do
{
result += (*c_tmp - '0') * pow(10.0,inc);
c_tmp--; inc++;
}while(c_tmp >= str);
}
return result;
}
It can be further optimized by not calling 'pow' and use something more clever, but just like this it works perfectly.

How to define a stack for each task on an ARM

Today I have a little proble and I think that the origin is about the stack.
This is my problem :
I declare three user task like this :
void task1Function(void) {
print_uart0("-usertask : First task is started...\r\n");
while(1){
//syscall(1);
}
}
void task2Function(void) {
print_uart0("-usertask : Second task is running...\r\n");
//syscall(); /* To return in the kernel's mode */
while(1){
}
}
void task3Function(void) {
print_uart0("-usertask : Third task is running...\r\n");
//syscall(); /* To return in the kernel's mode */
while(1){
}
}
I have an array of three task whose the strucure is below :
typedef struct
{
unsigned int *sp;
unsigned int registers[12];
unsigned int lr;
unsigned int pc;
unsigned int cpsr;
unsigned int mode;
unsigned int num;
void *stack;
int stacksize;
int priority;
int state; /* Running, Ready, Waiting, Suspended */
/* Next and previous task in the queue */
struct taskstruct *qnext, *qprev;
}taskstruct;
Here the inialisation of my tasks :
void init_task(taskstruct * task, void (*function)(void) ){
task->sp = (unsigned int*)&function;
task->registers[0] = 0; // r0
task->registers[1] = 0; // r1
task->registers[2] = 0; // r2
task->registers[3] = 0; // r3
task->registers[4] = 0; // r4
task->registers[5] = 0; // r5
task->registers[6] = 0; // r6
task->registers[7] = 0; // r7
task->registers[8] = 0; // r8
task->registers[9] = 0; // r9
task->registers[10] = 0; // r10
task->registers[11] = 0; // r11
task->registers[12] = 0; // r12
task->lr = 0;
task->pc = 0;
task->cpsr = 0;
task->mode = 0x10;
}
init_task(&task[0],&task1Function);
init_task(&task[1],&task2Function);
init_task(&task[2],&task3Function);
But when I passed the task[0].sp to my activate function, it's always the last declared task that is launched (i.e the third):
.global activate
activate:
LDR r12, [r0]
/*STMFD sp!,{r1-r11,lr}*/
NOP
msr CPSR_c, #0x10 /* User mode with IRQ enabled and FIQ disabled*/
mov pc, r12
So I guess that I have a problem with my user stack and that I have to setup a different stack for each of them, am I right ? In this case, can somebody tell me how I have to proceed ?
Regards, Vincent

Yes, you are right.
You must have separate stacks for each task, since the state of the stack is part of the task's total state.
You could add something like
uint32_t stack[512];
to your taskstruct, and then set/restore the task pointer accordingly when switching tasks, of course. It's hard to provide enough detail in this context.

'Ok but how I can allocate the stack task ? ' - malloc it.
You can economize on mallocs by adding the stack space on the end of the taskstruct - all your current members are fixed-size, so calculating the total size to malloc is easy.
The hard bit with these taskers is getting the interrupt entry correct, ie. when an interrupt handler needs to change a task state and so has to exit via the scheduler. This is especially good fun with nested interrupts. Good luck with that!