lmbench3 rd/wr/rdwr/cp stride size

lmbench3 rd/wr/rdwr/cp stride size - arm

What is actual stride size (in bytes) for lmbench3 rd/wr/rdwr/cp bw_mem.c? In the comment it is mentioned 32byte stride but looking into the implementation it is 16byte stride (int type is 4byte):
wr(iter_t iterations, void *cookie)
{
state_t *state = (state_t *) cookie;
register TYPE *lastone = state->lastone;
while (iterations-- > 0) {
register TYPE *p = state->buf;
while (p <= lastone) {
#define DOIT(i) p[i] = 1;
DOIT(0) DOIT(4) DOIT(8) DOIT(12) DOIT(16) DOIT(20) DOIT(24)
DOIT(28) DOIT(32) DOIT(36) DOIT(40) DOIT(44) DOIT(48) DOIT(52)
DOIT(56) DOIT(60) DOIT(64) DOIT(68) DOIT(72) DOIT(76)
DOIT(80) DOIT(84) DOIT(88) DOIT(92) DOIT(96) DOIT(100)
DOIT(104) DOIT(108) DOIT(112) DOIT(116) DOIT(120) DOIT(124);
p += 128;
}
}
}
References:
https://sourceforge.net/projects/lmbench/files/development/lmbench-3.0-a9/
http://www.bitmover.com/lmbench/get_lmbench.html

Related

Looking for performance improvement of NEON code to match clipping area on the screen

Here is my test code to find 1st clipping area on the screen.
Two subroutines and dummy loops in the code to compare the performance of them.
point_in_neon (NEON version) and point_in (Regular version) does the same thing:
find out the first clipping area (contains given point) in given list and return -1 if there is no matching area.
I expected NEON version is faster than regular version.
Unfortunately, it is slower than regular version. Is there another way to speed it up?
The compiler command is:
${CC} -O2 -ftree-vectorize -o vcomp vcomp.c
Thanks,
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>
#define WIDTH (4096)
#define HEIGHT (4096)
#define CLIPS (32)
static inline uint64_t now(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return tv.tv_sec*1000000+tv.tv_usec;
}
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
const int32_t right[4]={
rs[0].x+rs[0].width-1,
rs[1].x+rs[1].width-1,
rs[2].x+rs[2].width-1,
rs[3].x+rs[3].width-1
}, bottom[4]={
rs[0].y+rs[0].height-1,
rs[1].y+rs[1].height-1,
rs[2].y+rs[2].height-1,
rs[3].y+rs[3].height-1
};
int32x4_t p, r;
uint32x4_t t;
uint32_t res[4];
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//r = <Left0, Left1, Left2, Left3>
r=vld1q_lane_s32(&rs[0].x, r, 0);
r=vld1q_lane_s32(&rs[1].x, r, 1);
r=vld1q_lane_s32(&rs[2].x, r, 2);
r=vld1q_lane_s32(&rs[3].x, r, 3);
//t = (p >= r)
t=vcgeq_s32(p, r);
//r = <Right0, Right1, Right2, Right3>
r=vld1q_s32(&right);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//r = <Top0, Top1, Top2, Top3>
r=vld1q_lane_s32(&rs[0].y, r, 0);
r=vld1q_lane_s32(&rs[1].y, r, 1);
r=vld1q_lane_s32(&rs[2].y, r, 2);
r=vld1q_lane_s32(&rs[3].y, r, 3);
//t = t & (p >= r)
t=vandq_u32(t, vcgeq_s32(p, r));
//r = <Bottom0, Bottom1, Bottom2, Bottom3>
r=vld1q_s32(&bottom);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0;i<len;i++) {
int32_t right=rs[i].x+rs[i].width-1,
bottom=rs[i].y+rs[i].height-1;
if(pt->x>=rs[i].x && pt->x<=right &&
pt->y>=rs[i].y && pt->y<=bottom)
return i;
}
return -1;
}
int32_t main(int32_t argc, char *argv[]) {
rect_t rs[CLIPS];
int32_t i, j;
uint64_t ts0, ts1;
int32_t res[2][CLIPS];
srand((unsigned int)time(NULL));
for(i=0;i<CLIPS;i++) {
rs[i].x=rand()%WIDTH;
rs[i].y=rand()%HEIGHT;
rs[i].width=rand()%WIDTH;
rs[i].height=rand()%HEIGHT;
}
memset(res, 0, sizeof(res));
ts0=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
if(idx>=0)
res[0][idx]=1;
}
}
ts0=now()-ts0;
ts1=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
int32_t k, idx;
point_t p={i, j};
for(k=0, idx=-1;k<CLIPS/4;k++) {
idx=point_in_neon(&p, &rs[k*4]);
if(idx>=0)
break;
}
if(idx>=0)
res[1][k*4+idx]=1;
}
}
ts1=now()-ts1;
/*
for(i=0;i<CLIPS;i++) {
if(res[0][i]!=res[1][i]) {
printf("error.\n");
return 1;
}
}
*/
printf("regular = %lu\n", ts0);
printf("neon = %lu\n", ts1);
return 0;
}

According to Peter Cordes's suggestion, I replaced data loding parts of point_in_neon subroutine with vld4q_s32 intrinsic and subsequent right and bottom calculation can be vectorized. Now the code is shorter and faster than regular version.
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
int32x4x4_t r;
int32x4_t right, bottom, p;
uint32x4_t t;
uint32_t res[4];
/*
r.val[0] = <X0, X1, X2, X3>
r.val[1] = <Y0, Y1, Y2, Y3>
r.val[2] = <Width0, Width1, Width2, Width3>
r.val[3] = <Height0, Height1, Height2, Height3>
*/
r=vld4q_s32(rs);
//right = <Right0, Right1, Right2, Right3>
right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
//bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//t = (p >= left)
t=vcgeq_s32(p, r.val[0]);
//t = t & (right >= p)
t=vandq_u32(t, vcgeq_s32(right, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//t = t & (p >= top)
t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
//t = t & (r >= bottom)
t=vandq_u32(t, vcgeq_s32(bottom, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}

Starting with your original point_in method, we can clean up a little bit here by removing the -1's, and changing <= to <.
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0; i < len; i++)
{
// this is pointless - change your data structures so that
// the rect stores minx/maxx, miny/maxy instead!
int32_t right = rs[i].x + rs[i].width;
int32_t bottom= rs[i].y + rs[i].height;
bool cmp0 = pt->x >= rs[i].x;
bool cmp1 = pt->y >= rs[i].y;
bool cmp2 = pt->x < right;
bool cmp3 = pt->y < bottom;
if(cmp0 & cmp1 & cmp2 & cmp3)
return i;
}
return -1;
}
Next obvious thing to point out:
// your screen size...
#define WIDTH (4096)
#define HEIGHT (4096)
// yet your structures use uint32 as storage???
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
If you can get away with using 16bit integers, this will go at twice the speed (because you can fit 8x 16bit numbers in a SIMD register, v.s. 4x 32bit). Whilst we're at it, we might as well change the data layout to structure of array at the same time.
I'm also going to hoist the pointless p.x + width out, and store it as xmax/ymax instead (removes duplicated computation in your loops).
typedef struct rect_x8_t {
int16x8_t x;
int16x8_t y;
int16x8_t xmax; //< x + width
int16x8_t ymax; //< y + height
} rect_x8_t;
typedef struct point_x8_t {
int16x8_t x;
int16x8_t y;
} point_x8_t;
On the assumption you don't have a number of clips that's divisible by 8, we'll need to pad the number slightly (not a big deal)
// assuming this has already been initialised
rect_t rs[CLIPS];
// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8) + (CLIPS & 7 ? 1 : 0);
// allocate in batches of 8
rect_x8_t rs8[CLIPS8] = {};
// I'm going to do this rubbishly as an pre-process step.
// I don't care too much about efficiency here...
for(uint32_t i = 0; i < CLIPS; ++i) {
rs8[i / 8].x[i & 7] = rs[i].x;
rs8[i / 8].y[i & 7] = rs[I].y;
rs8[i / 8].xmax[i & 7] = rs[i].x + rs[i].width;
rs8[i / 8].ymax[i & 7] = rs[i].y + rs[i].height;
}
I have a couple of concerns here:
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
// This seems wrong? Shouldn't it be p = {j, i} ?
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
// I'm not quite sure what the result says about your
// image data and clip regions???
//
// This seems like a really silly way of asking
// a simple question about the clip regions. The pixels
// don't have any effect here.
if(idx >= 0)
res[0][idx] = 1;
}
}
Anyhow, now refactoring the point_in method to use int16x8_t, we get:
inline int32_t point_in_x8(const point_x8_t pt,
const rect_x8_t* rs,
uint32_t len) {
for(int32_t i = 0; i < len; i++) {
// perform comparisons on 8 rects at a time
uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
uint16x8_t cmp3 = vcltq_s16(pt.y, rs[I].ymax);
// combine to single comparison value
uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);
// use a horizontal max to see if any lanes are true
if(vmaxvq_u16(cmp0123)) {
for(int32_t j = 0; j < 8; ++j) {
if(cmp0123[j])
return 8*i + j;
}
}
}
return -1;
}
Any additional padded elements in the rect_x8_t structs should end up being ignored (since they should be 0/0, 0/0, which will always end up being false).
Then finally...
for(i = 0; i < HEIGHT; i++) {
point_x8_t p;
// splat the y value
p.y = vld1q_dup_s16(i);
for(j = 0; j < WIDTH; j++) {
// splat the x value
p.x = vld1q_dup_s16(j);
int32_t idx = point_in_x8(p, rs8, CLIPS8);
if(idx >= 0)
res[1][idx] = 1;
}
}
The vld4 instruction actually has a fairly high latency. Given that WIDTH * HEIGHT is actually a very big number, pre-swizzling here (as a pre-processing step) makes a lot more sense imho.
HOWEVER
This whole algorithm could be massively improved by simply ignoring the pixels, and working on CLIP regions directly.
A clip region will be false if it is entirely contained by the preceding clip regions
for(i = 0; i < CLIPS; i++) {
// if region is empty, ignore.
if(rs[i].width == 0 || rs[i].height == 0) {
res[0][i] = 0;
continue;
}
// first region will always be true (unless it's of zero size)
if(i == 0) {
res[0][1] = 1;
continue;
}
uint32_t how_many_intersect = 0;
bool entirely_contained = false;
uint32_t intersection_indices[CLIPS] = {};
// do a lazy test first.
for(j = i - 1; j >= 0; --j) {
// if the last region is entirely contained by preceding
// ones, it will be false. exit loop.
if(region_is_entirely_contained(rs[i], rs[j])) {
res[0][i] = 0;
entirely_contained = true;
j = -1; ///< break out of loop
}
else
// do the regions intersect?
if(region_intersects(rs[i], rs[j])) {
intersection_indices[how_many_intersect] = j;
++how_many_intersect;
}
}
// if one region entirely contains this clip region, skip it.
if(entirely_contained) {
continue;
}
// if you only intersect one or no regions, the result is true.
if(how_many_intersect <= 1) {
res[0][i] = 1;
continue;
}
// If you get here, the result is *probably* true, however
// you will need to split this clip region against the previous
// ones to be fully sure. If all regions are fully contained,
// the answer is false.
// I won't implement it, but something like this:
* split rs[i] against each rs[intersection_indices[]].
* Throw away the rectangles that are entirely contained.
* Each bit that remains should be tested against each rs[intersection_indices[]]
* If you find any split rectangle that isn't contained,
set to true and move on.
}

Does fmodf() cause a hardfault in stm32?

I am trying to create a modulated waveform out of 2 sine waves.
To do this I need the modulo(fmodf) to know what amplitude a sine with a specific frequency(lo_frequency) has at that time(t). But I get a hardfault when the following line is executed:
j = fmodf(2 * PI * lo_frequency * t, 2 * PI);
Do you have an idea why this gives me a hardfault ?
Edit 1:
I exchanged fmodf with my_fmodf:
float my_fmodf(float x, float y){
if(y == 0){
return 0;
}
float n = x / y;
return x - n * y;
}
But still the hardfault occurs, and when I debug it it doesn't even jump into this function(my_fmodf).
Heres the whole function in which this error occurs:
int* create_wave(int* message){
/* Mixes the message signal at 10kHz and the carrier at 40kHz.
* When a bit of the message is 0 the amplitude is lowered to 10%.
* When a bit of the message is 1 the amplitude is 100%.
* The output of the STM32 can't be negative, thats why the wave swings between
* 0 and 256 (8bit precision for faster DAC)
*/
static int rf_frequency = 10000;
static int lo_frequency = 40000;
static int sample_rate = 100000;
int output[sample_rate];
int index, mix;
float j, t;
for(int i = 0; i <= sample_rate; i++){
t = i * 0.00000001f; // i * 10^-8
j = my_fmodf(2 * PI * lo_frequency * t, 2 * PI);
if (j < 0){
j += (float) 2 * PI;
}
index = floor((16.0f / (lo_frequency/rf_frequency * 0.0001f)) * t);
if (index < 16) {
if (!message[index]) {
mix = 115 + sin1(j) * 0.1f;
} else {
mix = sin1(j);
}
} else {
break;
}
output[i] = mix;
}
return output;
}
Edit 2:
I fixed the warning: function returns address of local variable [-Wreturn-local-addr] the way "chux - Reinstate Monica" suggested.
int* create_wave(int* message){
static uint16_t rf_frequency = 10000;
static uint32_t lo_frequency = 40000;
static uint32_t sample_rate = 100000;
int *output = malloc(sizeof *output * sample_rate);
uint8_t index, mix;
float j, n, t;
for(int i = 0; i < sample_rate; i++){
t = i * 0.00000001f; // i * 10^-8
j = fmodf(2 * PI * lo_frequency * t, 2 * PI);
if (j < 0){
j += 2 * PI;
}
index = floor((16.0f / (lo_frequency/rf_frequency * 0.0001f)) * t);
if (index < 16) {
if (!message[index]) {
mix = (uint8_t) floor(115 + sin1(j) * 0.1f);
} else {
mix = sin1(j);
}
} else {
break;
}
output[i] = mix;
}
return output;
}
But now I get the hardfault on this line:
output[i] = mix;
EDIT 3:
Because the previous code contained a very large buffer array that did not fit into the 16KB SRAM of the STM32F303K8 I needed to change it.
Now I use a "ping-pong" buffer where I use the callback of the DMA for "first-half-transmitted" and "completly-transmitted":
void HAL_DAC_ConvHalfCpltCallbackCh1(DAC_HandleTypeDef * hdac){
HAL_GPIO_WritePin(GPIOB, GPIO_PIN_3, GPIO_PIN_SET);
for(uint16_t i = 0; i < 128; i++){
new_value = sin_table[(i * 8) % 256];
if (message[message_index] == 0x0){
dac_buf[i] = new_value * 0.1f + 115;
} else {
dac_buf[i] = new_value;
}
}
}
void HAL_DAC_ConvCpltCallbackCh1 (DAC_HandleTypeDef * hdac){
HAL_GPIO_WritePin(GPIOB, GPIO_PIN_3, GPIO_PIN_RESET);
for(uint16_t i = 128; i < 256; i++){
new_value = sin_table[(i * 8) % 256];
if (message[message_index] == 0x0){
dac_buf[i] = new_value * 0.1f + 115;
} else {
dac_buf[i] = new_value;
}
}
message_index++;
if (message_index >= 16) {
message_index = 0;
// HAL_DAC_Stop_DMA (&hdac1, DAC_CHANNEL_1);
}
}
And it works the way I wanted:
But the frequency of the created sine is too low.
I cap at around 20kHz but I'd need 40kHz.
I allready increased the clock by a factor of 8 so that one is maxed out:
.
I can still decrease the counter period (it is 50 at the moment), but when I do so the interrupt callback seems to take longer than the period to the next one.
At least it seems so as the output becomes very distorted when I do that.
I also tried to decrease the precision by taking only every 8th sine value but
I cant do this any more because then the output does not look like a sine wave anymore.
Any ideas how I could optimize the callback so that it takes less time ?
Any other ideas ?

Does fmodf() cause a hardfault in stm32?
It is other code problems causing the hard fault here.
Failing to compile with ample warnings
Best code tip: enable all warnings. #KamilCuk
Faster feedback than Stackoverflow.
I'd expect something like below on a well enabled compiler.
return output;
warning: function returns address of local variable [-Wreturn-local-addr]
Returning a local Object
Cannot return a local array. Allocate instead.
// int output[sample_rate];
int *output = malloc(sizeof *output * sample_rate);
return output;
Calling code will need to free() the pointer.
Out of range array access
static int sample_rate = 100000;
int output[sample_rate];
// for(int i = 0; i <= sample_rate; i++){
for(int i = 0; i < sample_rate; i++){
...
output[i] = mix;
}
Stack overflow?
static int sample_rate = 100000; int output[sample_rate]; is a large local variable. Maybe allocate or try something smaller?
Advanced: loss of precision
A good fmodf() does not lose precision. For a more precise answer consider double math for the intermediate results. An even better approach is more involved.
float my_fmodf(float x, float y){
if(y == 0){
return 0;
}
double n = 1.0 * x / y;
return (float) (x - n * y);
}
Can I not use any function within another ?
Yes. Code has other issues.

1 value every 10uS makes only 100kSPS whis is not too much for this macro. In my designs I generate > 5MSPS signals without any problems. Usually I have one buffer and DMA in circular mode. First I fill the buffer and start generation. When the half transmition DMA interrupt is trigerred I fill the first half of the buffer with fresh data. The the transmition complete interrupt is trigerred I fill the second half and this process repeats all over again.

OpenMP parallel for loop

void calc_mean(float *left_mean, float *right_mean, const uint8_t* left, const uint8_t* right, int32_t block_width, int32_t block_height, int32_t d, uint32_t w, uint32_t h, int32_t i,int32_t j)
{
*left_mean = 0;
*right_mean = 0;
int32_t i_b;
float local_left = 0, local_right = 0;
for (i_b = -(block_height-1)/2; i_b < (block_height-1)/2; i_b++) {
#pragma omp parallel for reduction(+:local_left,local_right)
for ( int32_t j_b = -(block_width-1)/2; j_b < (block_width-1)/2; j_b++) {
// Borders checking
if (!(i+i_b >= 0) || !(i+i_b < h) || !(j+j_b >= 0) || !(j+j_b < w) || !(j+j_b-d >= 0) || !(j+j_b-d < w)) {
continue;
}
// Calculating indices of the block within the whole image
int32_t ind_l = (i+i_b)*w + (j+j_b);
int32_t ind_r = (i+i_b)*w + (j+j_b-d);
// Updating the block means
//*left_mean += *(left+ind_l);
//*right_mean += *(right+ind_r);
local_left += left[ind_l];
local_right += right[ind_r];
}
}
*left_mean = local_left/(block_height * block_width);
*right_mean = local_right/(block_height * block_width);
}
This now makes the program execution longer than non-threaded version. I added private(left,right) but it leads to bad memory access for ind_l.

I think this should get you closer to what you want, although I'm not quite sure about one final part.
float local_left, local_right = 0;
for ( int32_t i_b = -(block_height-1)/2; i_b < (block_height-1)/2; i_b++) {
#pragma omp for schedule(static, CORES) reduction(+:left_mean, +: right_mean)
{
for ( int32_t j_b = -(block_width-1)/2; j_b < (block_width-1)/2; j_b++) {
if (your conditions) continue;
int32_t ind_l = (i+i_b)*w + (j+j_b);
int32_t ind_r = (i+i_b)*w + (j+j_b-d);
local_left += *(left+ind_l);
local_right += *(right+ind_r);
}
}
}
*left_mean = local_left/(block_height * block_width);
*right_mean = local_right/(block_height * block_width);
Part I am unsure of is whether you need the schedule() and how to do two different reductions. I know for one reduction, you can simply do
reduction(+:left_mean)
EDIT: some reference for the schedule() http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-loop.html#Loopschedules
It looks like you do not need this, but using it could produce a better runtime

How to execute faster than "snprintf(mystr, 22, "{%+0.4f,%+0.4f}", (double)3.14159265, (double) 2.718281828459);" on a 32 bit mcu

I've tried a few things, any it seems that at best I'm 1.5x slower than the printf() family of functions, which boggles my mind a bit. I think what I'm up against in this situation is the addressing of my device is 32bit, and I don't have an FPU. I've tried a couple of "ftoa()" implementations and constrained them to only look for 2 digits on the left of the decimal point, and left myself some breadcrumbs as to what the total length is of a larger overall string that I'm trying to build. At the end of the day, it seems like the nature of an array of 8-bit elements on a 32bit system is leading to a bunch of hidden shift operations, bitwise "OR" and bitwise NAND operations that are just slowing things down ridiculously...
Anyone have any general tips for this situation? (other than a re-architect to an 8.24 fixed point design) I've tried the compiler optimizations from wysiwyg to execution speed focused, nothing seems to beat snprintf.
Here's the fastest one that I had tried:
#if (__DEBUG)
#define DATA_FIFO_SIZE (8)
#else
#define DATA_FIFO_SIZE (1024)
#endif
typedef struct
{
int32_t rval[4];
double cval[4];
uint16_t idx;
uint16_t padding; //#attention the compiler was padding with 2 bytes to align to 32bit
} data_fifo_entry;
const char V_ERR_MSG[7] = "ERROR,\0";
static data_fifo_entry data_fifo[DATA_FIFO_SIZE];
static char embed_text[256];
/****
* float to ASCII, adapted from
* https://stackoverflow.com/questions/2302969/how-to-implement-char-ftoafloat-num-without-sprintf-library-function-i#7097567
*
****/
//#attention the following floating point #defs are linked!!
#define MAX_DIGITS_TO_PRINT_FLOAT (6)
#define MAX_SUPPORTED_PRINTABLE_FLOAT (+999999.99999999999999999999999999)
#define MIN_SUPPORTED_PRINTABLE_FLOAT (-999999.99999999999999999999999999)
#define FLOAT_TEST6 (100000.0)
#define FLOAT_TEST5 (10000.0)
#define FLOAT_TEST4 (1000.0)
#define FLOAT_TEST3 (100.0)
#define FLOAT_TEST2 (10.0)
#define FLOAT_TEST1 (1.0)
static inline int ftoa(char *s, const float f_in, const uint8_t precision)
{
float f_p = 0.0001;
float n = f_in;
int neg = (n < 0.0);
int length = 0;
switch (precision)
{
case (1):
{
f_p = 0.1;
break;
}
case (2):
{
f_p = 0.01;
break;
}
case (3):
{
f_p = 0.001;
break;
}
//case (4) is the default assumption
case (5):
{
f_p = 0.00001;
break;
}
case (6):
{
f_p = 0.000001;
break;
}
default: //already assumed, no assignments here
{
break;
}
} /* switch */
// handle special cases
if (isnan(n))
{
strcpy(s, "nan\0");
length = 4;
}
else if ((isinf(n)) || (n >= MAX_SUPPORTED_PRINTABLE_FLOAT) ||
((-1.0 * n) < MIN_SUPPORTED_PRINTABLE_FLOAT))
{
strcpy(s, "inf\0");
length = 4;
}
else if (n == 0.0)
{
int idx;
s[length++] = '+';
s[length++] = '0';
s[length++] = '.';
for (idx = 0; idx < precision; idx++)
{
s[length++] = '0';
}
s[length++] = '\0';
}
else if (((n > 0.0) && (n < f_p)) || ((n < 0.0) && ((-1.0 * n) < f_p)))
{
int idx;
if (n >= 0.0)
{
s[length++] = '+';
}
else
{
s[length++] = '-';
}
s[length++] = '0';
s[length++] = '.';
for (idx = 1; idx < precision; idx++)
{
s[length++] = '0';
}
s[length++] = '\0';
}
else
{
int digit, m;
if (neg)
{
n = -n;
}
// calculate magnitude
if (n >= FLOAT_TEST6)
{
m = 6;
}
else if (n >= FLOAT_TEST5)
{
m = 5;
}
else if (n >= FLOAT_TEST4)
{
m = 4;
}
else if (n >= FLOAT_TEST3)
{
m = 3;
}
else if (n >= FLOAT_TEST2)
{
m = 2;
}
else if (n >= FLOAT_TEST1)
{
m = 1;
}
else
{
m = 0;
}
if (neg)
{
s[length++] = '-';
}
else
{
s[length++] = '+';
}
// set up for scientific notation
if (m < 1.0)
{
m = 0;
}
// convert the number
while (n > f_p || m >= 0)
{
double weight = pow(10.0, m);
if ((weight > 0) && !isinf(weight))
{
digit = floor(n / weight);
n -= (digit * weight);
s[length++] = '0' + digit;
}
if ((m == 0) && (n > 0))
{
s[length++] = '.';
}
m--;
}
s[length++] = '\0';
}
return (length - 1);
} /* ftoa */
static inline void print2_and_idx(int8_t idx1, int8_t idx2, uint16_t fifo_idx)
{
//#attention 10 characters already in the buffer, idx does NOT start at zero
uint8_t idx = V_PREFIX_LENGTH;
char scratch[16] = {'\0'};
char * p_fifo_id;
if ((idx1 >= 0) && (idx1 < MAX_IDX) && (idx2 >= 0) && (idx2 < MAX_IDX) &&
(fifo_idx >= 0) && (fifo_idx < DATA_FIFO_SIZE))
{
ftoa(scratch, data_fifo[fifo_idx].cval[idx1], 4);
memcpy((void *)&embed_text[idx += 7], (void *)scratch, 7);
embed_text[idx++] = ',';
ftoa(scratch, data_fifo[fifo_idx].cval[idx2], 4);
memcpy((void *)&embed_text[idx += 7], (void *)scratch, 7);
embed_text[idx++] = ',';
//!\todo maybe print the .idx as fixed width, zero pad to 5 digits
p_fifo_id = utoa((char *)&embed_text[idx], (unsigned int)data_fifo[fifo_idx].idx, 10);
idx += strlen(p_fifo_id);
embed_text[idx++] = ',';
}
else
{
memcpy((void *)&embed_text[idx], (void *)V_ERR_MSG, 7);
}
} /* print2_and_idx */

Instead of using *printf() with FP arguments, convert the FP values first into scaled integers.
With still calling snprintf(), yet with integer and simple character arguments, my code was about 20x faster than the baseline.
Your mileage may vary. YMMV.
//baseline
void format2double_1(char *mystr, double pi, double e) {
snprintf(mystr, 22, "{%+0.4f,%+0.4f}", pi, e);
//puts(mystr);
}
void format2double_2(char *mystr, double pi, double e) {
int pi_i = (int) lrint(pi * 10000.0);
int api_i = abs(pi_i);
int e_i = (int) lrint(e * 10000.0);
int ae_i = abs(e_i);
snprintf(mystr, 22, "{%c%d.%04d,%c%d.%04d}", //
"+-"[pi_i < 0], api_i / 10000, api_i % 10000, //
"+-"[e_i < 0], ae_i / 10000, ae_i % 10000);
//puts(mystr);
}
[edit]
For a proper -0.0 text, use "+-"[!!signbit(pi)]
[edit]
Some idea for OP to consider as a ftoa() replacement. Central code is lrint(f_in * fscale[precision]); which rounds and scales. Untested.
#define PRINTABLE_MAGNITUDE_LIMIT 1000000
int ftoa_1(char *s, const float f_in, const uint8_t precision) {
int n;
sprintf(s, "%+.*f%n", precision, f_in, &n);
return n;
}
int ftoa_2(char *s, const float f_in, const uint8_t precision) {
float fscale[] = { 1, 10, 100, 1000, 10000, 100000, 1000000 };
long iscale[] = { 1, 10, 100, 1000, 10000, 100000, 1000000 };
assert(precision > 0 && precision < sizeof fscale / sizeof fscale[0]);
// gross range check
if (f_in > -PRINTABLE_MAGNITUDE_LIMIT && f_in < PRINTABLE_MAGNITUDE_LIMIT) {
long value = lrint(f_in * fscale[precision]);
value = labs(value);
long scale = iscale[precision];
long ipart = value / scale;
long fpart = value % scale;
// fine range check
if (ipart < PRINTABLE_MAGNITUDE_LIMIT) {
int n;
sprintf(s, "%c%ld:%0*ld%n", signbit(f_in) ? '-' : '+', ipart, precision,
fpart, &n);
return n;
}
}
// Out of range values need not be of performance concern for now.
return ftoa_1(s, f_in, precision);
}
[edit]
To convert a positive or 0 integer to a string quickly without the need to shift the buffer or reverse it, see below. It also returns the string length for subsequent string building.
// Convert an unsigned to a decimal string and return its length
size_t utoa_length(char *dest, unsigned u) {
size_t len = 0;
if (u >= 10) {
len = utoa_length(dest, u/10);
dest += len;
}
dest[0] = '0' + u%10;
dest[1] = '\0';
return len + 1;
}

In a similar vein of #chux's answer, if the remaining snprintf is still slow you can go down the rabbit hole of hand-composing strings/hand-rendering integers.
char *fmtp04f(char *buf, char *lim, double d) {
// if there's no space at all don't bother
if(buf==lim) return buf;
// 10 characters in maximum 32 bit integer, one for the dot,
// one for the terminating NUL in debug prints
char b[12];
// current position in the buffer
char *bp = b;
// scale and round
int32_t i = lrint(d * 10000.);
// write sign and fix i sign
// (we do have at least one character available in buf)
if(signbit(d)) {
*buf++='-';
i = -i;
} else {
*buf++='+';
}
// *always* write down the last 4 digits, even if they are zeroes
// (they'll become the 4 digits after the decimal dot)
for(; bp!=b+4; ) {
*bp++ = '0' + i%10;
i/=10;
}
*bp++='.';
// write down the remaining digits, writing at least one
do {
*bp++ = '0' + i%10;
i/=10;
} while(i != 0);
// bp is at the character after the last, step back
--bp;
// data is now into b *in reversed order*;
// reverse-copy it into the user-provided buffer
while(buf!=lim) {
*buf++ = *bp;
// check before decrementing, as a pointer to one-before-first
// is not allowed in C
if(bp == b) break;
--bp;
}
if(buf!=lim) *buf=0; // "regular" case: terminate *after*
else lim[-1]=0; // bad case: truncate
return buf;
}
void doformat(char *buf, char *lim, double a, double b) {
if(buf==lim) return; // cannot do anything
*buf++='{';
if(buf==lim) goto end;
buf = fmtp04f(buf, lim, a);
if(buf==lim) return; // already terminated by fmtp04f
*buf++=',';
if(buf==lim) goto end;
buf = fmtp04f(buf, lim, b);
if(buf==lim) return; // idem
*buf++='}';
if(buf==lim) goto end;
*buf++=0;
end:
lim[-1]=0; // always terminate
}
It passes some random tests, so I'm reasonably confident that it is not too wrong.
For some reason, #chux version on my machine (64 bit Linux, gcc 6.3) is generally 2/3 times faster than the baseline, while my version is usually 10/30 times faster than the baseline. I don't know if this is because my snprintf is particularly good or particularly bad. As said above, YMMV.

control display with fbtft modules

I would like to swap upside part and lower part in the display.
like this
| 1 | to | 2 |
| 2 | to | 1 | upper half and lower half will be changed
so I have edited fbtft source (https://github.com/notro/fbtft/wiki/How-it-works, https://github.com/notro/fbtft)
I thought It is possible if I write the changed pixel value( pointer to current pixel value + half size ) in video memory when I send the data to 8 bit bus
my LCD is 240 X 320 resolution using SPI-8bit. device name is adafruit22a , driver name is ili9340.
this is the source I edited
first, in fbtft-core.c
void fbtft_update_display(struct fbtft_par *par, unsigned start_line, unsigned end_line)void fbtft_update_display(struct fbtft_par *par, unsigned start_line, unsigned end_line)
{
size_t offset, len;
struct timespec ts_start, ts_end, ts_fps, ts_duration;
long fps_ms, fps_us, duration_ms, duration_us;
long fps, throughput;
bool timeit = false;
int ret = 0;
unsigned start_line_t; // changed start line( startline + half size (119))
if (unlikely(par->debug & (DEBUG_TIME_FIRST_UPDATE | DEBUG_TIME_EACH_UPDATE))) {
if ((par->debug & DEBUG_TIME_EACH_UPDATE) || \
((par->debug & DEBUG_TIME_FIRST_UPDATE) && !par->first_update_done)) {
getnstimeofday(&ts_start);
timeit = true;
}
}
/* Sanity checks */
if (start_line > end_line) {
dev_warn(par->info->device,
"%s: start_line=%u is larger than end_line=%u. Shouldn't happen, will do full display update\n",
__func__, start_line, end_line);
start_line = 0;
end_line = par->info->var.yres - 1;
}
if (start_line > par->info->var.yres - 1 || end_line > par->info->var.yres - 1) {
dev_warn(par->info->device,
"%s: start_line=%u or end_line=%u is larger than max=%d. Shouldn't happen, will do full display update\n",
__func__, start_line, end_line, par->info->var.yres - 1);
start_line = 0;
end_line = par->info->var.yres - 1;
}
fbtft_par_dbg(DEBUG_UPDATE_DISPLAY, par, "%s(start_line=%u, end_line=%u)\n",
__func__, start_line, end_line);
if (par->fbtftops.set_addr_win)
par->fbtftops.set_addr_win(par, 0, start_line,
par->info->var.xres-1, end_line);
start_line_t= (start_line + 119)%239;
//offset = start_line * par->info->fix.line_length;
offset = start_line_t * par->info->fix.line_length;
len = (end_line - start_line + 1) * par->info->fix.line_length;
ret = par->fbtftops.write_vmem(par, offset, len);
if (ret < 0)
dev_err(par->info->device,
"%s: write_vmem failed to update display buffer\n",
__func__);
if (unlikely(timeit)) {
getnstimeofday(&ts_end);
if (par->update_time.tv_nsec == 0 && par->update_time.tv_sec == 0) {
par->update_time.tv_sec = ts_start.tv_sec;
par->update_time.tv_nsec = ts_start.tv_nsec;
}
ts_fps = timespec_sub(ts_start, par->update_time);
par->update_time.tv_sec = ts_start.tv_sec;
par->update_time.tv_nsec = ts_start.tv_nsec;
fps_ms = (ts_fps.tv_sec * 1000) + ((ts_fps.tv_nsec / 1000000) % 1000);
fps_us = (ts_fps.tv_nsec / 1000) % 1000;
fps = fps_ms * 1000 + fps_us;
fps = fps ? 1000000 / fps : 0;
ts_duration = timespec_sub(ts_end, ts_start);
duration_ms = (ts_duration.tv_sec * 1000) + ((ts_duration.tv_nsec / 1000000) % 1000);
duration_us = (ts_duration.tv_nsec / 1000) % 1000;
throughput = duration_ms * 1000 + duration_us;
throughput = throughput ? (len * 1000) / throughput : 0;
throughput = throughput * 1000 / 1024;
dev_info(par->info->device,
"Display update: %ld kB/s (%ld.%.3ld ms), fps=%ld (%ld.%.3ld ms)\n",
throughput, duration_ms, duration_us,
fps, fps_ms, fps_us);
par->first_update_done = true;
}
}
I changed start_line to start_line_t to move upper half to lower half.
then I write data from start_line_t to length in vedio memory (frame memory)
this is writing from video memory to bus in fbtft-bus.c
#define HALF_OF_LCD 119
#define END_OF_LCD 239
/* 16 bit pixel over 8-bit databus */
int fbtft_write_vmem16_bus8(struct fbtft_par *par, size_t offset, size_t len)
{
u16 *vmem16;
u16 *txbuf16 = (u16 *)par->txbuf.buf;
size_t remain;
size_t to_copy;
size_t tx_array_size;
int i;
int ret = 0;
size_t startbyte_size = 0;
size_t start_offset=0
size_t end_offset=0;
u16 *endMem16;
fbtft_par_dbg(DEBUG_WRITE_VMEM, par, "%s(offset=%zu, len=%zu)\n",
__func__, offset, len);
end_offset= 239 * par->info->fix.line_length;
endMem16=(u16 *)(par->info->screen_base+ end_offset);
remain = len / 2;
vmem16 = (u16 *)(par->info->screen_base + offset);
if (par->gpio.dc != -1)
gpio_set_value(par->gpio.dc, 1);
/* non buffered write */
if (!par->txbuf.buf)
return par->fbtftops.write(par, vmem16, len);
/* buffered write */
tx_array_size = par->txbuf.len / 2;
if (par->startbyte) {
txbuf16 = (u16 *)(par->txbuf.buf + 1);
tx_array_size -= 2;
*(u8 *)(par->txbuf.buf) = par->startbyte | 0x2;
startbyte_size = 1;
}
while(remain)
{
to_copy = remain > tx_array_size ? tx_array_size : remain;
for (i = 0; i < to_copy; i++)
{
txbuf16[i] = cpu_to_be16(*vmem16);
++vmem16;
if(vmem16 == endMem16)
vmem16=(u16 *)(par->info->screen_base);
}
ret = par->fbtftops.write(par, par->txbuf.buf,
startbyte_size + to_copy * 2);
if (ret < 0)
return ret;
remain -= to_copy;
}
return ret;
}
in the code if pointer get reached end of video memory address ,
it moves to initial address in video memory address.( lower half should be written in upper half side).
the result is showing what I want like swapped half display
but the problem is that mouse moving and update display are so slow
even mouse is disappeared in upper half side.
I dont know what is wrong and how should I edit the part of source
I want to control display with frame buffer
anyone help me ? T^T

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

lmbench3 rd/wr/rdwr/cp stride size - arm

Related

Looking for performance improvement of NEON code to match clipping area on the screen

Does fmodf() cause a hardfault in stm32?

OpenMP parallel for loop

How to execute faster than "snprintf(mystr, 22, "{%+0.4f,%+0.4f}", (double)3.14159265, (double) 2.718281828459);" on a 32 bit mcu

control display with fbtft modules

Categories

Resources