void calc_mean(float *left_mean, float *right_mean, const uint8_t* left, const uint8_t* right, int32_t block_width, int32_t block_height, int32_t d, uint32_t w, uint32_t h, int32_t i,int32_t j)
{
*left_mean = 0;
*right_mean = 0;
int32_t i_b;
float local_left = 0, local_right = 0;
for (i_b = -(block_height-1)/2; i_b < (block_height-1)/2; i_b++) {
#pragma omp parallel for reduction(+:local_left,local_right)
for ( int32_t j_b = -(block_width-1)/2; j_b < (block_width-1)/2; j_b++) {
// Borders checking
if (!(i+i_b >= 0) || !(i+i_b < h) || !(j+j_b >= 0) || !(j+j_b < w) || !(j+j_b-d >= 0) || !(j+j_b-d < w)) {
continue;
}
// Calculating indices of the block within the whole image
int32_t ind_l = (i+i_b)*w + (j+j_b);
int32_t ind_r = (i+i_b)*w + (j+j_b-d);
// Updating the block means
//*left_mean += *(left+ind_l);
//*right_mean += *(right+ind_r);
local_left += left[ind_l];
local_right += right[ind_r];
}
}
*left_mean = local_left/(block_height * block_width);
*right_mean = local_right/(block_height * block_width);
}
This now makes the program execution longer than the non-threaded version. I also tried adding private(left, right), but that leads to bad memory accesses through ind_l.
I think this should get you closer to what you want, although I'm not quite sure about one final part.
float local_left = 0, local_right = 0;
for ( int32_t i_b = -(block_height-1)/2; i_b < (block_height-1)/2; i_b++) {
#pragma omp parallel for schedule(static) reduction(+:local_left, local_right)
for ( int32_t j_b = -(block_width-1)/2; j_b < (block_width-1)/2; j_b++) {
if (your conditions) continue;
int32_t ind_l = (i+i_b)*w + (j+j_b);
int32_t ind_r = (i+i_b)*w + (j+j_b-d);
local_left += *(left+ind_l);
local_right += *(right+ind_r);
}
}
*left_mean = local_left/(block_height * block_width);
*right_mean = local_right/(block_height * block_width);
Part I am unsure of is whether you need the schedule(). Note that both sums can go in a single reduction clause, as above; for a single reduction you can simply do
reduction(+:local_left)
EDIT: some reference for the schedule() http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-loop.html#Loopschedules
It looks like you do not need this, but using it could produce a better runtime
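For completeness, here is a minimal sketch (not from the original answer) of the whole function with the parallel region hoisted out of the nested loops and both sums combined in one reduction clause. The signature, indexing and border checks are taken unchanged from the question's calc_mean(); calc_mean_omp is just a placeholder name.
void calc_mean_omp(float *left_mean, float *right_mean,
                   const uint8_t *left, const uint8_t *right,
                   int32_t block_width, int32_t block_height, int32_t d,
                   uint32_t w, uint32_t h, int32_t i, int32_t j)
{
    float local_left = 0.0f, local_right = 0.0f;
    // One parallel region per call; collapse(2) spreads both loops over the threads.
    #pragma omp parallel for collapse(2) reduction(+:local_left, local_right)
    for (int32_t i_b = -(block_height - 1) / 2; i_b < (block_height - 1) / 2; i_b++) {
        for (int32_t j_b = -(block_width - 1) / 2; j_b < (block_width - 1) / 2; j_b++) {
            // Border checking, as in the original
            if (i + i_b < 0 || i + i_b >= (int32_t)h ||
                j + j_b < 0 || j + j_b >= (int32_t)w ||
                j + j_b - d < 0 || j + j_b - d >= (int32_t)w)
                continue;
            int32_t ind_l = (i + i_b) * (int32_t)w + (j + j_b);
            int32_t ind_r = (i + i_b) * (int32_t)w + (j + j_b - d);
            local_left  += left[ind_l];
            local_right += right[ind_r];
        }
    }
    *left_mean  = local_left  / (block_height * block_width);
    *right_mean = local_right / (block_height * block_width);
}
Entering a new parallel region on every iteration of the outer loop, as the question's version does, typically costs more than the work in a small block, which would explain the slowdown you are seeing.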
Here is my test code to find the first clipping area on the screen.
There are two subroutines plus dummy loops in the code to compare their performance.
point_in_neon (NEON version) and point_in (regular version) do the same thing:
find the first clipping area in the given list that contains the given point, and return -1 if there is no matching area.
I expected the NEON version to be faster than the regular version.
Unfortunately, it is slower. Is there another way to speed it up?
The compiler command is:
${CC} -O2 -ftree-vectorize -o vcomp vcomp.c
Thanks,
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <time.h>
#include <arm_neon.h>
#define WIDTH (4096)
#define HEIGHT (4096)
#define CLIPS (32)
static inline uint64_t now(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return tv.tv_sec*1000000+tv.tv_usec;
}
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
const int32_t right[4]={
rs[0].x+rs[0].width-1,
rs[1].x+rs[1].width-1,
rs[2].x+rs[2].width-1,
rs[3].x+rs[3].width-1
}, bottom[4]={
rs[0].y+rs[0].height-1,
rs[1].y+rs[1].height-1,
rs[2].y+rs[2].height-1,
rs[3].y+rs[3].height-1
};
int32x4_t p, r;
uint32x4_t t;
uint32_t res[4];
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//r = <Left0, Left1, Left2, Left3>
r=vld1q_lane_s32(&rs[0].x, r, 0);
r=vld1q_lane_s32(&rs[1].x, r, 1);
r=vld1q_lane_s32(&rs[2].x, r, 2);
r=vld1q_lane_s32(&rs[3].x, r, 3);
//t = (p >= r)
t=vcgeq_s32(p, r);
//r = <Right0, Right1, Right2, Right3>
r=vld1q_s32(right);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//r = <Top0, Top1, Top2, Top3>
r=vld1q_lane_s32(&rs[0].y, r, 0);
r=vld1q_lane_s32(&rs[1].y, r, 1);
r=vld1q_lane_s32(&rs[2].y, r, 2);
r=vld1q_lane_s32(&rs[3].y, r, 3);
//t = t & (p >= r)
t=vandq_u32(t, vcgeq_s32(p, r));
//r = <Bottom0, Bottom1, Bottom2, Bottom3>
r=vld1q_s32(bottom);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0;i<len;i++) {
int32_t right=rs[i].x+rs[i].width-1,
bottom=rs[i].y+rs[i].height-1;
if(pt->x>=rs[i].x && pt->x<=right &&
pt->y>=rs[i].y && pt->y<=bottom)
return i;
}
return -1;
}
int32_t main(int32_t argc, char *argv[]) {
rect_t rs[CLIPS];
int32_t i, j;
uint64_t ts0, ts1;
int32_t res[2][CLIPS];
srand((unsigned int)time(NULL));
for(i=0;i<CLIPS;i++) {
rs[i].x=rand()%WIDTH;
rs[i].y=rand()%HEIGHT;
rs[i].width=rand()%WIDTH;
rs[i].height=rand()%HEIGHT;
}
memset(res, 0, sizeof(res));
ts0=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
if(idx>=0)
res[0][idx]=1;
}
}
ts0=now()-ts0;
ts1=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
int32_t k, idx;
point_t p={i, j};
for(k=0, idx=-1;k<CLIPS/4;k++) {
idx=point_in_neon(&p, &rs[k*4]);
if(idx>=0)
break;
}
if(idx>=0)
res[1][k*4+idx]=1;
}
}
ts1=now()-ts1;
/*
for(i=0;i<CLIPS;i++) {
if(res[0][i]!=res[1][i]) {
printf("error.\n");
return 1;
}
}
*/
printf("regular = %lu\n", ts0);
printf("neon = %lu\n", ts1);
return 0;
}
Following Peter Cordes's suggestion, I replaced the data loading parts of the point_in_neon subroutine with the vld4q_s32 intrinsic, so the subsequent right and bottom calculations can be vectorized. Now the code is shorter and faster than the regular version.
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
int32x4x4_t r;
int32x4_t right, bottom, p;
uint32x4_t t;
uint32_t res[4];
/*
r.val[0] = <X0, X1, X2, X3>
r.val[1] = <Y0, Y1, Y2, Y3>
r.val[2] = <Width0, Width1, Width2, Width3>
r.val[3] = <Height0, Height1, Height2, Height3>
*/
r=vld4q_s32((const int32_t *)rs);
//right = <Right0, Right1, Right2, Right3>
right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
//bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//t = (p >= left)
t=vcgeq_s32(p, r.val[0]);
//t = t & (right >= p)
t=vandq_u32(t, vcgeq_s32(right, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//t = t & (p >= top)
t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
//t = t & (bottom >= p)
t=vandq_u32(t, vcgeq_s32(bottom, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
Starting with your original point_in method, we can clean up a little bit here by removing the -1's, and changing <= to <.
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0; i < len; i++)
{
// this is pointless - change your data structures so that
// the rect stores minx/maxx, miny/maxy instead!
int32_t right = rs[i].x + rs[i].width;
int32_t bottom= rs[i].y + rs[i].height;
bool cmp0 = pt->x >= rs[i].x;
bool cmp1 = pt->y >= rs[i].y;
bool cmp2 = pt->x < right;
bool cmp3 = pt->y < bottom;
if(cmp0 & cmp1 & cmp2 & cmp3)
return i;
}
return -1;
}
Next obvious thing to point out:
// your screen size...
#define WIDTH (4096)
#define HEIGHT (4096)
// yet your structures use uint32 as storage???
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
If you can get away with using 16-bit integers, this will go at twice the speed (because you can fit 8x 16-bit numbers in a SIMD register vs. 4x 32-bit). Whilst we're at it, we might as well change the data layout to structure-of-arrays at the same time.
I'm also going to hoist the pointless x + width computation out, and store it as xmax/ymax instead (this removes duplicated computation in your loops).
typedef struct rect_x8_t {
int16x8_t x;
int16x8_t y;
int16x8_t xmax; //< x + width
int16x8_t ymax; //< y + height
} rect_x8_t;
typedef struct point_x8_t {
int16x8_t x;
int16x8_t y;
} point_x8_t;
On the assumption you don't have a number of clips that's divisible by 8, we'll need to pad the number slightly (not a big deal)
// assuming this has already been initialised
rect_t rs[CLIPS];
// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8) + (CLIPS & 7 ? 1 : 0);
// allocate in batches of 8, zero-filled so padded lanes stay false
rect_x8_t rs8[CLIPS8];
memset(rs8, 0, sizeof(rs8));
// I'm going to do this rubbishly as an pre-process step.
// I don't care too much about efficiency here...
for(uint32_t i = 0; i < CLIPS; ++i) {
rs8[i / 8].x[i & 7] = rs[i].x;
rs8[i / 8].y[i & 7] = rs[i].y;
rs8[i / 8].xmax[i & 7] = rs[i].x + rs[i].width;
rs8[i / 8].ymax[i & 7] = rs[i].y + rs[i].height;
}
I have a couple of concerns here:
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
// This seems wrong? Shouldn't it be p = {j, i} ?
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
// I'm not quite sure what the result says about your
// image data and clip regions???
//
// This seems like a really silly way of asking
// a simple question about the clip regions. The pixels
// don't have any effect here.
if(idx >= 0)
res[0][idx] = 1;
}
}
Anyhow, now refactoring the point_in method to use int16x8_t, we get:
inline int32_t point_in_x8(const point_x8_t pt,
const rect_x8_t* rs,
uint32_t len) {
for(int32_t i = 0; i < len; i++) {
// perform comparisons on 8 rects at a time
uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
uint16x8_t cmp3 = vcltq_s16(pt.y, rs[i].ymax);
// combine to single comparison value
uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);
// use a horizontal max to see if any lanes are true
if(vmaxvq_u16(cmp0123)) {
for(int32_t j = 0; j < 8; ++j) {
if(cmp0123[j])
return 8*i + j;
}
}
}
return -1;
}
Any additional padded elements in the rect_x8_t structs should end up being ignored (since they should be 0/0, 0/0, which will always end up being false).
Then finally...
for(i = 0; i < HEIGHT; i++) {
point_x8_t p;
// splat the y value
p.y = vdupq_n_s16((int16_t)i);
for(j = 0; j < WIDTH; j++) {
// splat the x value
p.x = vdupq_n_s16((int16_t)j);
int32_t idx = point_in_x8(p, rs8, CLIPS8);
if(idx >= 0)
res[1][idx] = 1;
}
}
The vld4 instruction actually has a fairly high latency. Given that WIDTH * HEIGHT is actually a very big number, pre-swizzling here (as a pre-processing step) makes a lot more sense imho.
HOWEVER
This whole algorithm could be massively improved by simply ignoring the pixels, and working on CLIP regions directly.
A clip region will be false if it is entirely contained by the preceding clip regions
for(i = 0; i < CLIPS; i++) {
// if region is empty, ignore.
if(rs[i].width == 0 || rs[i].height == 0) {
res[0][i] = 0;
continue;
}
// first region will always be true (unless it's of zero size)
if(i == 0) {
res[0][i] = 1;
continue;
}
uint32_t how_many_intersect = 0;
bool entirely_contained = false;
uint32_t intersection_indices[CLIPS] = {};
// do a lazy test first.
for(j = i - 1; j >= 0; --j) {
// if the last region is entirely contained by preceding
// ones, it will be false. exit loop.
if(region_is_entirely_contained(rs[i], rs[j])) {
res[0][i] = 0;
entirely_contained = true;
j = -1; ///< break out of loop
}
else
// do the regions intersect?
if(region_intersects(rs[i], rs[j])) {
intersection_indices[how_many_intersect] = j;
++how_many_intersect;
}
}
// if one region entirely contains this clip region, skip it.
if(entirely_contained) {
continue;
}
// if you only intersect one or no regions, the result is true.
if(how_many_intersect <= 1) {
res[0][i] = 1;
continue;
}
// If you get here, the result is *probably* true, however
// you will need to split this clip region against the previous
// ones to be fully sure. If all regions are fully contained,
// the answer is false.
// I won't implement it, but something like this:
* split rs[i] against each rs[intersection_indices[]].
* Throw away the rectangles that are entirely contained.
* Each bit that remains should be tested against each rs[intersection_indices[]]
* If you find any split rectangle that isn't contained,
set to true and move on.
}
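The two helper predicates used above, region_is_entirely_contained() and region_intersects(), are left undefined in the sketch. A minimal version, assuming the rect_t layout from the question (x/y plus width/height) and half-open right/bottom edges, could look like this:
#include <stdbool.h>
// Is "inner" completely inside "outer"? (hypothetical helper for the sketch above)
static inline bool region_is_entirely_contained(rect_t inner, rect_t outer) {
    return inner.x >= outer.x &&
           inner.y >= outer.y &&
           inner.x + (int32_t)inner.width  <= outer.x + (int32_t)outer.width &&
           inner.y + (int32_t)inner.height <= outer.y + (int32_t)outer.height;
}
// Do the two regions overlap at all? (hypothetical helper for the sketch above)
static inline bool region_intersects(rect_t a, rect_t b) {
    return a.x < b.x + (int32_t)b.width  && b.x < a.x + (int32_t)a.width &&
           a.y < b.y + (int32_t)b.height && b.y < a.y + (int32_t)a.height;
}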
My goal is to increase the performance of a code that simulates the n-body problem.
This is the part whose execution time is measured. The two functions that need to be parallelized are calculate_forces() and move_bodies(), but since the loop control variable t is a double I cannot put a #pragma omp parallel for statement there.
t0 = gettime ();
for (t = 0; t < t_end; t += dt)
{
// draw bodies
show_bodies (window);
// computation
calculate_forces ();
move_bodies ();
}
// print out calculation speed every second
t0 = gettime () - t0;
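For reference, the same loop written with an integer counter (the form #pragma omp parallel for requires) would look like the sketch below; the real limitation is that successive time steps depend on each other, so this outer loop has to stay serial either way.
// Hypothetical rewrite of the time loop with an integer control variable.
// t_end, dt, show_bodies(), calculate_forces() and move_bodies() are the
// ones from the code above; the loop itself still cannot be parallelized
// because each step uses the positions produced by the previous one.
long n_steps = (long)(t_end / dt);
for (long step = 0; step < n_steps; step++)
{
    double t = step * dt;   // absolute time, if anything inside needs it
    show_bodies (window);
    calculate_forces ();
    move_bodies ();
}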
The two functions calculate_forces() and move_bodies() with the respective directives that I used are the following:
static void
calculate_forces ()
{
double distance, magnitude, factor, r;
vector_t direction;
int i, j;
#pragma omp parallel private(distance,magnitude,factor,direction)
{
#pragma omp for private(i,j)
for (i = 0; i < n_body - 1; i++)
{
for (j = i + 1; j < n_body; j++)
{
r = SQR (bodies[i].position.x - bodies[j].position.x) + SQR (bodies[i].position.y - bodies[j].position.y);
// avoid numerical instabilities
if (r < EPSILON)
{
// this is not how nature works :-)
r += EPSILON;
}
distance = sqrt (r);
magnitude = (G * bodies[i].mass * bodies[j].mass) / (distance * distance);
factor = magnitude / distance;
direction.x = bodies[j].position.x - bodies[i].position.x;
direction.y = bodies[j].position.y - bodies[i].position.y;
// +force for body i
#pragma omp critical
{
bodies[i].force.x += factor * direction.x;
bodies[i].force.y += factor * direction.y;
// -force for body j
bodies[j].force.x -= factor * direction.x;
bodies[j].force.y -= factor * direction.y;
}
}
}
}
}
static void
move_bodies ()
{
vector_t delta_v, delta_p;
int i;
#pragma omp parallel private(delta_v,delta_p,i)
{
#pragma omp for
for (i = 0; i < n_body; i++)
{
// calculate delta_v
delta_v.x = bodies[i].force.x / bodies[i].mass * dt;
delta_v.y = bodies[i].force.y / bodies[i].mass * dt;
// calculate delta_p
delta_p.x = (bodies[i].velocity.x + delta_v.x / 2.0) * dt;
delta_p.y = (bodies[i].velocity.y + delta_v.y / 2.0) * dt;
// update body velocity and position
#pragma omp critical
{
bodies[i].velocity.x += delta_v.x;
bodies[i].velocity.y += delta_v.y;
bodies[i].position.x += delta_p.x;
bodies[i].position.y += delta_p.y;
}
// reset forces
bodies[i].force.x = bodies[i].force.y = 0.0;
if (bounce)
{
// bounce on boundaries (i.e. it's more like billard)
if ((bodies[i].position.x < -body_distance_factor) || (bodies[i].position.x > body_distance_factor))
bodies[i].velocity.x = -bodies[i].velocity.x;
if ((bodies[i].position.y < -body_distance_factor) || (bodies[i].position.y > body_distance_factor))
bodies[i].velocity.y = -bodies[i].velocity.y;
}
}
}
}
The values of bodies.velocity and bodies.position are changed in the move_bodies() function, but I couldn't use a reduction there.
There is also a checksum function that checks whether the calculated checksum is equal to the reference checksum. It looks like this:
static unsigned long
checksum()
{
unsigned long checksum = 0;
// sum up the rounded body positions
for (int i = 0; i < n_body; i++)
{
checksum += (unsigned long)round(bodies[i].position.x);
checksum += (unsigned long)round(bodies[i].position.y);
}
return checksum;
}
This function uses the previously calculated values of bodies.position.x and bodies.position.y from move_bodies(), which is why I used a critical block while calculating those values; it still didn't seem to yield a correct answer. Can anyone give me some insight into where I am going wrong? Thank you in advance.
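For reference only, and not part of the original post: one way to accumulate the pairwise forces without the critical section is to reduce over plain force arrays (OpenMP 4.5+ supports reductions over array sections). MAX_BODIES is a hypothetical compile-time bound; bodies[], n_body, G, EPSILON are assumed to be the globals from the code above, with <string.h> and <math.h> already included.
static void
calculate_forces_noncritical (void)
{
    // separate accumulation arrays so the reduction clause can apply to them
    static double fx[MAX_BODIES], fy[MAX_BODIES];
    memset (fx, 0, n_body * sizeof (double));
    memset (fy, 0, n_body * sizeof (double));
    #pragma omp parallel for schedule(dynamic) reduction(+:fx[:n_body], fy[:n_body])
    for (int i = 0; i < n_body - 1; i++)
    {
        for (int j = i + 1; j < n_body; j++)
        {
            double dx = bodies[j].position.x - bodies[i].position.x;
            double dy = bodies[j].position.y - bodies[i].position.y;
            double r  = dx * dx + dy * dy;
            if (r < EPSILON)
                r += EPSILON;
            double distance = sqrt (r);
            double factor   = (G * bodies[i].mass * bodies[j].mass) / (r * distance);
            fx[i] += factor * dx;   // +force for body i
            fy[i] += factor * dy;
            fx[j] -= factor * dx;   // -force for body j
            fy[j] -= factor * dy;
        }
    }
    for (int i = 0; i < n_body; i++)
    {
        bodies[i].force.x += fx[i];
        bodies[i].force.y += fy[i];
    }
}
Because every temporary in this sketch is local to the loop body, nothing except the reduced arrays is written by more than one thread.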
I'm trying to parallelize this code using OpenMP.
for(t_step=0;t_step<Ntot;t_step++) {
// current row
if(cur_row + 1 < Npt_x) cur_row++;
else cur_row = 0;
// get data from file which update only the row "cur_row" of array val
read_line(f_u, val[cur_row]);
// computes
for(i=0;i<Npt_x;i++) {
for(j=0;j<Npt_y;j++) {
i_corrected = cur_row - i;
if(i_corrected < 0) i_corrected = Npt_x + i_corrected;
R[i][j] += val[cur_row][0]*val[i_corrected][j]/Ntot;
}
}
}
with
- val and R declared as double **,
- Npt_x and Npt_y are about 500,
- Ntot is about 10^6.
I've done this
for(t_step=0;t_step<Ntot;t_step++) {
// current row
if(cur_row + 1 < Npt_x) cur_row++;
else cur_row = 0;
// get data from file which update only the row "cur_row" of array val
read_line(f_u, val[cur_row]);
// computes
#pragma omp parallel for collapse(2), private(i,j,i_corrected)
for(i=0;i<Npt_x;i++) {
for(j=0;j<Npt_y;j++) {
i_corrected = cur_row - i;
if(i_corrected < 0) i_corrected = Npt_x + i_corrected;
R[i][j] += val[cur_row][0]*val[i_corrected][j]/Ntot;
}
}
}
The problem is that it doesn't seem to be efficient. Is there a way to use OpenMP more efficiently in this case?
Many thanks.
Right now, I would try something like this:
for(t_step=0;t_step<Ntot;t_step++) {
// current row
if(cur_row + 1 < Npt_x)
cur_row++;
else
cur_row = 0;
// get data from file which update only the row "cur_row" of array val
read_line(f_u, val[cur_row]);
// computes
#pragma omp parallel for private(i,j,i_corrected)
for(i=0;i<Npt_x;i++) {
i_corrected = cur_row - i;
if(i_corrected < 0)
i_corrected += Npt_x;
double tmp = val[cur_row][0]/Ntot;
#if defined(_OPENMP) && _OPENMP > 201306
#pragma omp simd
#endif
for(j=0;j<Npt_y;j++) {
R[i][j] += tmp*val[i_corrected][j];
}
}
}
However, since the code will be memory bound, it's not certain that this will get you much parallel speed-up... Worth a try though.
For a game in Gameboy programming, I am using four arrays called top, oldTop, bottom and oldBottom:
struct Point { int x, y; };
struct Rect { struct Point xx, yy; };
Rect top[size], oldTop[size];
Rect bottom[size], oldBottom[i];
where Rect is a struct made of two struct Points, the top-left and the bottom-right corner points.
The idea of the game is to have random-heighted blocks hanging down from the ceiling and rising up from the floor.
It is similar to the copter-classic game. In my infinite while loop, I shift all of the rectangles by one pixel using the following code:
while (1)
{
for (int i = 0; i < size; i++)
{
//in Struct Rect, xx is the top-left corner point, and yy is the bottom right
top[i].xx.x--;
top[i].yy.x--;
bottom[i].xx.x--;
bottom[i].yy.x--;
if (top[i].xx.x < 0)
{
top[i].xx.x += 240;
top[i].yy.x += 240;
}
if (bottom[i].xx.x < 0)
{
bottom[i].xx.x += 240;
bottom[i].yy.x += 240;
}
}
for (int i = 0; i < size; i++)
{
drawRect(oldTop[i], colorBlack);
drawRect(oldBottom[i], colorBlack);
}
/*call delay function that wait for Vertical Blank*/
for(int i = 0; i < size; i++)
{
drawRect(top[i], colorGreen);
drawRect(bottom[i], colorGreen);
oldTop[i] = top[i];
oldBottom[i] = bottom[i];
}
}
The drawRect method uses DMA to draw the rectangle.
With this code, the rectangles should be displayed like the mock-up I drew in Paint (image omitted), but the result I get is different (screenshot omitted).
What is odd is that if I don't draw the bottom row at all, then the top row draws fine. The result only messes up when I draw both. This is really weird because I think the code should be working fine, and it is not very complicated. Is there a specific reason this is happening, and is there a way to remedy it?
Thanks.
The code that I use to draw the rectangle looks like this:
void drawRect(int row, int col, int width, int height){
int i;
for (i=0; i<height; i++)
{
DMA[3].src = &color;
DMA[3].dst = videoBuffer + (row+i)*240 + col;
DMA[3].cnt = DMA_ON | DMA_FIXED_SOURCE | width;
}
}
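As an aside, the drawRect() shown here takes row/col/width/height while the game loop above calls it with a Rect and a colour. A purely hypothetical wrapper that reconciles the two, reusing the same DMA registers and videoBuffer from the snippet and assuming xx is the top-left and yy the bottom-right corner, might look like:
// Hypothetical glue code, not from the original post.
void drawRectColoured(struct Rect rc, unsigned short colour)
{
    static volatile unsigned short fill;   // source word for DMA_FIXED_SOURCE
    fill = colour;
    int width = rc.yy.x - rc.xx.x + 1;     // assumes yy.x >= xx.x
    for (int row = rc.xx.y; row <= rc.yy.y; row++)
    {
        DMA[3].src = &fill;
        DMA[3].dst = videoBuffer + row*240 + rc.xx.x;
        DMA[3].cnt = DMA_ON | DMA_FIXED_SOURCE | width;
    }
}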
Here's a debugging SSCCE (Short, Self-Contained, Correct Example) based on your code. There are assertions in this code that fire; it runs, but is known not to be correct. I've renamed bottom to btm and oldBottom to oldBtm so that the names are symmetric; it makes the code layout more systematic (but is otherwise immaterial).
#include <assert.h>
#include <stdio.h>
typedef struct Point { int x, y; } Point;
typedef struct Rect { struct Point xx, yy; } Rect;
enum { size = 2 };
typedef enum { colourGreen = 0, colourBlack = 1 } Colour;
/*ARGSUSED*/
static void drawRect(Rect r, Colour c)
{
printf(" (%3d)(%3d)", r.xx.x, r.yy.x);
}
int main(void)
{
Rect top[size], oldTop[size];
Rect btm[size], oldBtm[size];
int counter = 0;
for (int i = 0; i < size; i++)
{
top[i].xx.x = 240 - 4 * i;
top[i].xx.y = 0 + 10 + i;
top[i].yy.x = 240 - 14 * i;
top[i].yy.y = 0 + 20 + i;
btm[i].xx.x = 0 + 72 * i;
btm[i].xx.y = 0 + 10 * i;
btm[i].yy.x = 0 + 12 * i;
btm[i].yy.y = 0 + 20 * i;
oldTop[i] = top[i];
oldBtm[i] = btm[i];
}
while (1)
{
if (counter++ > 480) // Limit amount of output!
break;
for (int i = 0; i < size; i++)
{
//in Struct Rect, xx is the top-left corner point, and yy is the bottom right
top[i].xx.x--;
top[i].yy.x--;
btm[i].xx.x--;
btm[i].yy.x--;
if (top[i].xx.x < 0)
{
top[i].xx.x += 240;
top[i].yy.x += 240;
}
if (btm[i].xx.x < 0)
{
btm[i].xx.x += 240;
btm[i].yy.x += 240;
}
}
for (int i = 0; i < size; i++)
{
assert(top[i].xx.x >= 0 && top[i].yy.x >= 0);
assert(btm[i].xx.x >= 0 && btm[i].yy.x >= 0);
}
for (int i = 0; i < size; i++)
{
drawRect(oldTop[i], colourBlack);
drawRect(oldBtm[i], colourBlack);
}
/*call delay function that wait for Vertical Blank*/
for(int i = 0; i < size; i++)
{
drawRect(top[i], colourGreen);
drawRect(btm[i], colourGreen);
oldTop[i] = top[i];
oldBtm[i] = btm[i];
}
putchar('\n');
}
return(0);
}
As noted in a late comment, one big difference between this and your code is that oldBottom in your code is declared as:
Rect top[size], oldTop[size];
Rect bottom[size], oldBottom[i];
using the size i instead of size. This probably accounts for the array-overwriting issues you see.
There's a second problem though; the assertions in the loop in the middle fire:
(240)(240) ( 0)( 0) (236)(226) ( 72)( 12) (239)(239) (239)(239) (235)(225) ( 71)( 11)
(239)(239) (239)(239) (235)(225) ( 71)( 11) (238)(238) (238)(238) (234)(224) ( 70)( 10)
(238)(238) (238)(238) (234)(224) ( 70)( 10) (237)(237) (237)(237) (233)(223) ( 69)( 9)
(237)(237) (237)(237) (233)(223) ( 69)( 9) (236)(236) (236)(236) (232)(222) ( 68)( 8)
(236)(236) (236)(236) (232)(222) ( 68)( 8) (235)(235) (235)(235) (231)(221) ( 67)( 7)
(235)(235) (235)(235) (231)(221) ( 67)( 7) (234)(234) (234)(234) (230)(220) ( 66)( 6)
(234)(234) (234)(234) (230)(220) ( 66)( 6) (233)(233) (233)(233) (229)(219) ( 65)( 5)
(233)(233) (233)(233) (229)(219) ( 65)( 5) (232)(232) (232)(232) (228)(218) ( 64)( 4)
(232)(232) (232)(232) (228)(218) ( 64)( 4) (231)(231) (231)(231) (227)(217) ( 63)( 3)
(231)(231) (231)(231) (227)(217) ( 63)( 3) (230)(230) (230)(230) (226)(216) ( 62)( 2)
(230)(230) (230)(230) (226)(216) ( 62)( 2) (229)(229) (229)(229) (225)(215) ( 61)( 1)
(229)(229) (229)(229) (225)(215) ( 61)( 1) (228)(228) (228)(228) (224)(214) ( 60)( 0)
Assertion failed: (btm[i].xx.x >= 0 && btm[i].yy.x >= 0), function main, file video.c, line 63.
I think your 'not negative' checks should be revised to:
if (top[i].xx.x < 0)
top[i].xx.x += 240;
if (top[i].yy.x < 0)
top[i].yy.x += 240;
if (btm[i].xx.x < 0)
btm[i].xx.x += 240;
if (btm[i].yy.x < 0)
btm[i].yy.x += 240;
This stops anything going negative. However, it is perfectly plausible that you should simply be checking on the bottom-right x-coordinate (instead of the top-left coordinate) using the original block. Or the wraparound may need to be more complex altogether. That's for you to decipher. But I think that the odd displays occur because you were providing negative values where you didn't intend to and weren't supposed to.
The key points to note here are:
When you're debugging an algorithm, you don't have to use the normal display mechanisms.
When you're debugging, reduce loop sizes where you can (size == 2).
Printing just the relevant information (here, the x-coordinates) helped reduce the output.
Putting the counter code to limit the amount of output simplifies things.
If things are going wrong, look for patterns in what is going wrong early.
I had various versions of the drawRect() function before I got to the design shown, which works well on a wide screen (eg 120x65) terminal window.
I have a simple brute-force recursive solver algorithm that takes a lot of time for bigger values of the OpxCnt variable. For small values of OpxCnt it works like a charm, but it gets very slow as OpxCnt grows. This is to be expected, but is there any optimization, or a different algorithm?
My final goal is this: I want to read all the true values in the map array by executing some number of read operations that have the minimum total operation cost. This is not the same as the minimum number of read operations.
At function completion, there should be no true value left unread.
The map array is populated by some external function; any member may be 1 or 0.
For example:
map[4] = 1;
map[8] = 1;
One read operation with Adr=4, Cnt=5 has the lowest cost (35),
whereas
two read operations with Adr=4, Cnt=1 and Adr=8, Cnt=1 cost 27+27=54.
#include <string.h>
typedef unsigned int Ui32;
#define cntof(x) (sizeof(x) / sizeof((x)[0]))
#define ZERO(x) do{memset(&(x), 0, sizeof(x));}while(0)
typedef struct _S_MB_oper{
Ui32 Adr;
Ui32 Cnt;
}S_MB_oper;
typedef struct _S_MB_code{
Ui32 OpxCnt;
S_MB_oper OpxLst[20];
Ui32 OpxPay;
}S_MB_code;
char map[65536] = {0};
static int opx_ListOkey(S_MB_code *px_kod, char *pi_map)
{
int cost = 0;
char map[65536];
memcpy(map, pi_map, sizeof(map));
for(Ui32 o = 0; o < px_kod->OpxCnt; o++)
{
for(Ui32 i = 0; i < px_kod->OpxLst[o].Cnt; i++)
{
Ui32 adr = px_kod->OpxLst[o].Adr + i;
// ...
if(adr < cntof(map)){map[adr] = 0x0;}
}
}
for(Ui32 i = 0; i < cntof(map); i++)
{
if(map[i] > 0x0){return -1;}
}
// calculate COST...
for(Ui32 o = 0; o < px_kod->OpxCnt; o++)
{
cost += 12;
cost += 13;
cost += (2 * px_kod->OpxLst[o].Cnt);
}
px_kod->OpxPay = (Ui32)cost; return cost;
}
static int opx_FindNext(char *map, int pi_idx)
{
int i;
if(pi_idx < 0){pi_idx = 0;}
for(i = pi_idx; i < 65536; i++)
{
if(map[i] > 0x0){return i;}
}
return -1;
}
static int opx_FindZero(char *map, int pi_idx)
{
int i;
if(pi_idx < 0){pi_idx = 0;}
for(i = pi_idx; i < 65536; i++)
{
if(map[i] < 0x1){return i;}
}
return -1;
}
static int opx_Resolver(S_MB_code *po_bst, S_MB_code *px_wrk, char *pi_map, Ui32 *px_idx, int _min, int _max)
{
int pay, kmax, kmin = 1;
if(*px_idx >= px_wrk->OpxCnt)
{
return opx_ListOkey(px_wrk, pi_map);
}
_min = opx_FindNext(pi_map, _min);
// ...
if(_min < 0){return -1;}
kmax = (_max - _min) + 1;
// must be less than 127 !
if(kmax > 127){kmax = 127;}
// is this recursion the last one ?
if(*px_idx >= (px_wrk->OpxCnt - 1))
{
kmin = kmax;
}
else
{
int zero = opx_FindZero(pi_map, _min);
// ...
if(zero > 0)
{
kmin = zero - _min;
// enforce kmax limit !?
if(kmin > kmax){kmin = kmax;}
}
}
for(int _cnt = kmin; _cnt <= kmax; _cnt++)
{
px_wrk->OpxLst[*px_idx].Adr = (Ui32)_min;
px_wrk->OpxLst[*px_idx].Cnt = (Ui32)_cnt;
(*px_idx)++;
pay = opx_Resolver(po_bst, px_wrk, pi_map, px_idx, (_min + _cnt), _max);
(*px_idx)--;
if(pay > 0)
{
if((Ui32)pay < po_bst->OpxPay)
{
memcpy(po_bst, px_wrk, sizeof(*po_bst));
}
}
}
return (int)po_bst->OpxPay;
}
int main()
{
int _max = -1, _cnt = 0;
S_MB_code best = {0};
S_MB_code work = {0};
// SOME TEST DATA...
map[ 4] = 1;
map[ 8] = 1;
/*
map[64] = 1;
map[72] = 1;
map[80] = 1;
map[88] = 1;
map[96] = 1;
*/
// SOME TEST DATA...
for(int i = 0; i < cntof(map); i++)
{
if(map[i] > 0)
{
_max = i; _cnt++;
}
}
// num of Opx can be as much as num of individual bit(s).
if(_cnt > cntof(work.OpxLst)){_cnt = cntof(work.OpxLst);}
best.OpxPay = 1000000000L; // invalid great number...
for(int opx_cnt = 1; opx_cnt <= _cnt; opx_cnt++)
{
int rv;
Ui32 x = 0;
ZERO(work); work.OpxCnt = (Ui32)opx_cnt;
rv = opx_Resolver(&best, &work, map, &x, -42, _max);
}
return 0;
}
You can use dynamic programming to calculate the lowest cost that covers the first i true values in map[]. Call this f(i). As I'll explain, you can calculate f(i) by looking at all f(j) for j < i, so this will take time quadratic in the number of true values -- much better than exponential. The final answer you're looking for will be f(n), where n is the number of true values in map[].
A first step is to preprocess map[] into a list of the positions of true values. (It's possible to do DP on the raw map[] array, but this will be slower if true values are sparse, and cannot be faster.)
#include <stdio.h>   // for printf() in main() and printSolution()
#include <limits.h>  // for INT_MAX in calcF()

int pos[65537]; // Every position *could* be true; 1-indexed to match scores[]
int nTrue = 0;
void getPosList() {
for (int i = 0; i < 65536; ++i) {
if (map[i]) pos[++nTrue] = i; // 1-indexed, so pos[1] is the first true value
}
}
When we're looking at the subproblem on just the first i true values, what we know is that the ith true value must be covered by a read that ends at i. This block could start at any position j <= i; we don't know, so we have to test all i of them and pick the best. The key property (Optimal Substructure) that enables DP here is that in any optimal solution to the i-sized subproblem, if the read that covers the ith true value starts at the jth true value, then the preceding j-1 true values must be covered by an optimal solution to the (j-1)-sized subproblem.
So: f(i) = min(f(j) + score(pos(j+1), pos(i))), with the minimum taken over all 0 <= j < i. pos(k) refers to the position of the kth true value in map[], and score(x, y) is the score of a read from position x to position y, inclusive.
int score(int x, int y); // forward declaration; defined below
int scores[65537] = {0}; // We effectively start indexing at 1;
                         // scores[0] == 0: covering the first 0 true values costs nothing
// Calculate the minimum score that could allow the first i > 0 true values
// to be read, and store it in scores[i].
// We can assume that all lower values have already been calculated.
void calcF(int i) {
int bestStart, bestScore = INT_MAX;
for (int j = 0; j < i; ++j) { // Always executes at least once
int attemptScore = scores[j] + score(pos[j + 1], pos[i]);
if (attemptScore < bestScore) {
bestStart = j + 1;
bestScore = attemptScore;
}
}
scores[i] = bestScore;
}
int score(int i, int j) {
return 25 + 2 * (j + 1 - i);
}
int main(int argc, char **argv) {
// Set up map[] however you want
getPosList();
for (int i = 1; i <= nTrue; ++i) {
calcF(i);
}
printf("Optimal solution has cost %d.\n", scores[nTrue]);
return 0;
}
Extracting a Solution from Scores
Using this scheme, you can calculate the score of an optimal solution: it's simply f(n), where n is the number of true values in map[]. In order to actually construct the solution, you need to read back through the table of f() scores to infer which choice was made:
void printSolution() {
int i = nTrue;
while (i) {
for (int j = 0; j < i; ++j) {
if (scores[i] == scores[j] + score(pos[j + 1], pos[i])) {
// We know that a read can be made from pos[j + 1] to pos[i] in
// an optimal solution, so let's make it.
printf("Read from %d to %d for cost %d.\n", pos[j + 1], pos[i], score(pos[j + 1], pos[i]));
i = j;
break;
}
}
}
}
There may be several possible choices, but all of them will produce optimal solutions.
Further Speedups
The solution above will work for an arbitrary scoring function. Because your scoring function has a simple structure, it may be that even faster algorithms can be developed.
For example, we can prove that there is a gap width above which it is always beneficial to break a single read into two reads. Suppose we have a read from position x-a to x, and another read from position y to y+b, with y > x. The combined costs of these two separate reads are 25 + 2 * (a + 1) + 25 + 2 * (b + 1) = 54 + 2 * (a + b). A single read stretching from x-a to y+b would cost 25 + 2 * (y + b - x + a + 1) = 27 + 2 * (a + b) + 2 * (y - x). Therefore the single read costs 27 - 2 * (y - x) less. If y - x > 13, this difference goes below zero: in other words, it can never be optimal to include a single read that spans a gap of 13 or more consecutive false values.
To make use of this property, inside calcF(), final reads could be tried in decreasing order of start-position (i.e. in increasing order of width), and the inner loop stopped as soon as any gap width exceeds 12. Because that read and all subsequent wider reads tried would contain this too-large gap and therefore be suboptimal, they need not be tried.
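A rough sketch of that pruning, reusing pos[], scores[] and score() from the code above (illustrative only, not from the original answer):
// Variant of calcF() that tries final reads from narrowest to widest and
// stops once widening the read would swallow a gap of 13+ false values.
void calcF_pruned(int i) {
    int bestScore = INT_MAX;
    for (int j = i - 1; j >= 0; --j) {
        // Moving the read's start from pos[j + 2] back to pos[j + 1] newly
        // spans the gap between those two true values; once it exceeds 12,
        // this read and every wider one must be suboptimal.
        if (j + 2 <= i && pos[j + 2] - pos[j + 1] - 1 > 12)
            break;
        int attemptScore = scores[j] + score(pos[j + 1], pos[i]);
        if (attemptScore < bestScore)
            bestScore = attemptScore;
    }
    scores[i] = bestScore;
}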