Here is my test code to find 1st clipping area on the screen.
Two subroutines and dummy loops in the code to compare the performance of them.
point_in_neon (NEON version) and point_in (Regular version) does the same thing:
find out the first clipping area (contains given point) in given list and return -1 if there is no matching area.
I expected NEON version is faster than regular version.
Unfortunately, it is slower than regular version. Is there another way to speed it up?
The compiler command is:
${CC} -O2 -ftree-vectorize -o vcomp vcomp.c
Thanks,
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
#include <arm_neon.h>
#define WIDTH (4096)
#define HEIGHT (4096)
#define CLIPS (32)
static inline uint64_t now(void) {
struct timeval tv;
gettimeofday(&tv,NULL);
return tv.tv_sec*1000000+tv.tv_usec;
}
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
const int32_t right[4]={
rs[0].x+rs[0].width-1,
rs[1].x+rs[1].width-1,
rs[2].x+rs[2].width-1,
rs[3].x+rs[3].width-1
}, bottom[4]={
rs[0].y+rs[0].height-1,
rs[1].y+rs[1].height-1,
rs[2].y+rs[2].height-1,
rs[3].y+rs[3].height-1
};
int32x4_t p, r;
uint32x4_t t;
uint32_t res[4];
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//r = <Left0, Left1, Left2, Left3>
r=vld1q_lane_s32(&rs[0].x, r, 0);
r=vld1q_lane_s32(&rs[1].x, r, 1);
r=vld1q_lane_s32(&rs[2].x, r, 2);
r=vld1q_lane_s32(&rs[3].x, r, 3);
//t = (p >= r)
t=vcgeq_s32(p, r);
//r = <Right0, Right1, Right2, Right3>
r=vld1q_s32(&right);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//r = <Top0, Top1, Top2, Top3>
r=vld1q_lane_s32(&rs[0].y, r, 0);
r=vld1q_lane_s32(&rs[1].y, r, 1);
r=vld1q_lane_s32(&rs[2].y, r, 2);
r=vld1q_lane_s32(&rs[3].y, r, 3);
//t = t & (p >= r)
t=vandq_u32(t, vcgeq_s32(p, r));
//r = <Bottom0, Bottom1, Bottom2, Bottom3>
r=vld1q_s32(&bottom);
//t = t & (r >= p)
t=vandq_u32(t, vcgeq_s32(r, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0;i<len;i++) {
int32_t right=rs[i].x+rs[i].width-1,
bottom=rs[i].y+rs[i].height-1;
if(pt->x>=rs[i].x && pt->x<=right &&
pt->y>=rs[i].y && pt->y<=bottom)
return i;
}
return -1;
}
int32_t main(int32_t argc, char *argv[]) {
rect_t rs[CLIPS];
int32_t i, j;
uint64_t ts0, ts1;
int32_t res[2][CLIPS];
srand((unsigned int)time(NULL));
for(i=0;i<CLIPS;i++) {
rs[i].x=rand()%WIDTH;
rs[i].y=rand()%HEIGHT;
rs[i].width=rand()%WIDTH;
rs[i].height=rand()%HEIGHT;
}
memset(res, 0, sizeof(res));
ts0=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
if(idx>=0)
res[0][idx]=1;
}
}
ts0=now()-ts0;
ts1=now();
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
int32_t k, idx;
point_t p={i, j};
for(k=0, idx=-1;k<CLIPS/4;k++) {
idx=point_in_neon(&p, &rs[k*4]);
if(idx>=0)
break;
}
if(idx>=0)
res[1][k*4+idx]=1;
}
}
ts1=now()-ts1;
/*
for(i=0;i<CLIPS;i++) {
if(res[0][i]!=res[1][i]) {
printf("error.\n");
return 1;
}
}
*/
printf("regular = %lu\n", ts0);
printf("neon = %lu\n", ts1);
return 0;
}
According to Peter Cordes's suggestion, I replaced data loding parts of point_in_neon subroutine with vld4q_s32 intrinsic and subsequent right and bottom calculation can be vectorized. Now the code is shorter and faster than regular version.
int32_t inline point_in_neon(const point_t *pt, const rect_t rs[4]) {
int32x4x4_t r;
int32x4_t right, bottom, p;
uint32x4_t t;
uint32_t res[4];
/*
r.val[0] = <X0, X1, X2, X3>
r.val[1] = <Y0, Y1, Y2, Y3>
r.val[2] = <Width0, Width1, Width2, Width3>
r.val[3] = <Height0, Height1, Height2, Height3>
*/
r=vld4q_s32(rs);
//right = <Right0, Right1, Right2, Right3>
right=vsubq_s32(vaddq_s32(r.val[0], r.val[2]), vdupq_n_s32(1));
//bottom = <Bottom0, Bottom1, Bottom2, Bottom3>
bottom=vsubq_s32(vaddq_s32(r.val[1], r.val[3]), vdupq_n_s32(1));
//p = <Xp, Xp, Xp, Xp>
p=vld1q_dup_s32(&pt->x);
//t = (p >= left)
t=vcgeq_s32(p, r.val[0]);
//t = t & (right >= p)
t=vandq_u32(t, vcgeq_s32(right, p));
//p = <Yp, Yp, Yp, Yp>
p=vld1q_dup_s32(&pt->y);
//t = t & (p >= top)
t=vandq_u32(t, vcgeq_s32(p, r.val[1]));
//t = t & (r >= bottom)
t=vandq_u32(t, vcgeq_s32(bottom, p));
vst1q_u32(res, t);
if(res[0])
return 0;
else if(res[1])
return 1;
else if(res[2])
return 2;
else if(res[3])
return 3;
return -1;
}
Starting with your original point_in method, we can clean up a little bit here by removing the -1's, and changing <= to <.
int32_t inline point_in(const point_t *pt, const rect_t *rs, uint32_t len) {
int32_t i;
for(i=0; i < len; i++)
{
// this is pointless - change your data structures so that
// the rect stores minx/maxx, miny/maxy instead!
int32_t right = rs[i].x + rs[i].width;
int32_t bottom= rs[i].y + rs[i].height;
bool cmp0 = pt->x >= rs[i].x;
bool cmp1 = pt->y >= rs[i].y;
bool cmp2 = pt->x < right;
bool cmp3 = pt->y < bottom;
if(cmp0 & cmp1 & cmp2 & cmp3)
return i;
}
return -1;
}
Next obvious thing to point out:
// your screen size...
#define WIDTH (4096)
#define HEIGHT (4096)
// yet your structures use uint32 as storage???
typedef struct _rect_t {
int32_t x;
int32_t y;
uint32_t width;
uint32_t height;
} rect_t;
typedef struct _point_t {
int32_t x;
int32_t y;
} point_t;
If you can get away with using 16bit integers, this will go at twice the speed (because you can fit 8x 16bit numbers in a SIMD register, v.s. 4x 32bit). Whilst we're at it, we might as well change the data layout to structure of array at the same time.
I'm also going to hoist the pointless p.x + width out, and store it as xmax/ymax instead (removes duplicated computation in your loops).
typedef struct rect_x8_t {
int16x8_t x;
int16x8_t y;
int16x8_t xmax; //< x + width
int16x8_t ymax; //< y + height
} rect_x8_t;
typedef struct point_x8_t {
int16x8_t x;
int16x8_t y;
} point_x8_t;
On the assumption you don't have a number of clips that's divisible by 8, we'll need to pad the number slightly (not a big deal)
// assuming this has already been initialised
rect_t rs[CLIPS];
// how many batches of 8 do we need?
uint32_t CLIPS8 = (CLIPS / 8) + (CLIPS & 7 ? 1 : 0);
// allocate in batches of 8
rect_x8_t rs8[CLIPS8] = {};
// I'm going to do this rubbishly as an pre-process step.
// I don't care too much about efficiency here...
for(uint32_t i = 0; i < CLIPS; ++i) {
rs8[i / 8].x[i & 7] = rs[i].x;
rs8[i / 8].y[i & 7] = rs[I].y;
rs8[i / 8].xmax[i & 7] = rs[i].x + rs[i].width;
rs8[i / 8].ymax[i & 7] = rs[i].y + rs[i].height;
}
I have a couple of concerns here:
for(i=0;i<HEIGHT;i++) {
for(j=0;j<WIDTH;j++) {
// This seems wrong? Shouldn't it be p = {j, i} ?
point_t p={i, j};
int32_t idx=point_in(&p, rs, CLIPS);
// I'm not quite sure what the result says about your
// image data and clip regions???
//
// This seems like a really silly way of asking
// a simple question about the clip regions. The pixels
// don't have any effect here.
if(idx >= 0)
res[0][idx] = 1;
}
}
Anyhow, now refactoring the point_in method to use int16x8_t, we get:
inline int32_t point_in_x8(const point_x8_t pt,
const rect_x8_t* rs,
uint32_t len) {
for(int32_t i = 0; i < len; i++) {
// perform comparisons on 8 rects at a time
uint16x8_t cmp0 = vcgeq_s16(pt.x, rs[i].x);
uint16x8_t cmp1 = vcgeq_s16(pt.y, rs[i].y);
uint16x8_t cmp2 = vcltq_s16(pt.x, rs[i].xmax);
uint16x8_t cmp3 = vcltq_s16(pt.y, rs[I].ymax);
// combine to single comparison value
uint16x8_t cmp01 = vandq_u16(cmp0, cmp1);
uint16x8_t cmp23 = vandq_u16(cmp2, cmp3);
uint16x8_t cmp0123 = vandq_u16(cmp01, cmp23);
// use a horizontal max to see if any lanes are true
if(vmaxvq_u16(cmp0123)) {
for(int32_t j = 0; j < 8; ++j) {
if(cmp0123[j])
return 8*i + j;
}
}
}
return -1;
}
Any additional padded elements in the rect_x8_t structs should end up being ignored (since they should be 0/0, 0/0, which will always end up being false).
Then finally...
for(i = 0; i < HEIGHT; i++) {
point_x8_t p;
// splat the y value
p.y = vld1q_dup_s16(i);
for(j = 0; j < WIDTH; j++) {
// splat the x value
p.x = vld1q_dup_s16(j);
int32_t idx = point_in_x8(p, rs8, CLIPS8);
if(idx >= 0)
res[1][idx] = 1;
}
}
The vld4 instruction actually has a fairly high latency. Given that WIDTH * HEIGHT is actually a very big number, pre-swizzling here (as a pre-processing step) makes a lot more sense imho.
HOWEVER
This whole algorithm could be massively improved by simply ignoring the pixels, and working on CLIP regions directly.
A clip region will be false if it is entirely contained by the preceding clip regions
for(i = 0; i < CLIPS; i++) {
// if region is empty, ignore.
if(rs[i].width == 0 || rs[i].height == 0) {
res[0][i] = 0;
continue;
}
// first region will always be true (unless it's of zero size)
if(i == 0) {
res[0][1] = 1;
continue;
}
uint32_t how_many_intersect = 0;
bool entirely_contained = false;
uint32_t intersection_indices[CLIPS] = {};
// do a lazy test first.
for(j = i - 1; j >= 0; --j) {
// if the last region is entirely contained by preceding
// ones, it will be false. exit loop.
if(region_is_entirely_contained(rs[i], rs[j])) {
res[0][i] = 0;
entirely_contained = true;
j = -1; ///< break out of loop
}
else
// do the regions intersect?
if(region_intersects(rs[i], rs[j])) {
intersection_indices[how_many_intersect] = j;
++how_many_intersect;
}
}
// if one region entirely contains this clip region, skip it.
if(entirely_contained) {
continue;
}
// if you only intersect one or no regions, the result is true.
if(how_many_intersect <= 1) {
res[0][i] = 1;
continue;
}
// If you get here, the result is *probably* true, however
// you will need to split this clip region against the previous
// ones to be fully sure. If all regions are fully contained,
// the answer is false.
// I won't implement it, but something like this:
* split rs[i] against each rs[intersection_indices[]].
* Throw away the rectangles that are entirely contained.
* Each bit that remains should be tested against each rs[intersection_indices[]]
* If you find any split rectangle that isn't contained,
set to true and move on.
}
#include <stdio.h>
#include <stdlib.h>
int main() {
int i, j, n, maxi = 0;
printf("\n Introduce the number:\n");
scanf("%d", &n);
for (j = 1; j <= n; j++)
{
i = 0;
while (i < j) {
i++;
if (j == i * i) {
if (j > maxi) {
maxi = j;
printf("%d", maxi);
}
}
}
}
return 0;
}
I have to find the greatest perfect square smaller than than a number n, I succeeded in finding all the perfect squares that are smaller than the number n but because each time it finds a perfect square it displays it I couldn't think of any way to compare all the perfect square that were found (or at least that's what I think the problem is) so I would appreciate some help. I already know that you could also solve this problem using a more simpler method ( like the one below ) and if you have any other ideas on how to solve it I'd like to hear them.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int main()
{
int n,j;
printf("\n Your number:\n");
scanf("%d",&n);
j=(int)sqrt(n);
printf("%d",j*j);
return 0;
}
You only need a single loop here. Check if i*i <= n. If so, set maxi to i*i and increment i:
int n, i = 1, sq = 1;
printf("\n Introduce the number:\n");
scanf("%d", &n);
while (i*i <= n) {
sq = i*i;
i++;
}
printf("sq=%d\n", sq);
Find the greatest perfect square that is less than or equal to n
For n>=0, this is akin to finding the integer square root of n.
unsigned greatest_perfect_square(unsigned x) {
unsigned root = usqrt(x);
return root * root;
}
if you have any other ideas on how to solve it I'd like to hear them.
The order of complexity to find the square root is O(bit-width-of-type-n). e.g. 16 iterations.
#include <limits.h>
unsigned usqrt(unsigned x) {
unsigned y = 0;
unsigned xShifted = 0;
const unsigned MSBit = UINT_MAX - UINT_MAX/2;
// This constant relies on no padding and bit width even
const unsigned TwoBitCount_N = sizeof(x) * CHAR_BIT / 2;
for (unsigned TwoBitCount = TwoBitCount_N; TwoBitCount > 0; TwoBitCount--) {
// Shift `xShifted` 2 places left while shifting in the 2 MSbits of x
xShifted <<= 1;
if (x & MSBit) {
xShifted |= 1;
}
x <<= 1;
xShifted <<= 1;
if (x & MSBit) {
xShifted |= 1;
}
x <<= 1;
// Shift the answer 1 bit left
y <<= 1;
// Form test value as y*2 + 1
unsigned Test = (y << 1) | 1;
// If xShifted big enough ...
if (xShifted >= Test) {
xShifted -= Test;
// Increment answer
y |= 1;
}
}
return y;
}
OP's method is far far slower. Even the inner loop takes O(sqrt(n)) time.
Note:
OP's code: j == i * i is subject to overflow and leads to the incorrect answer when j is larger.
j/i == i performs a like test without overflow.
#Jonathan Leffler suggested a Newton-Raphson approximation approach. Some lightly tested code below works quite fast, often taking only a few iterations.
I suspect this is O(log(bit-width-of-type-n)) for the main part, yet of course still O(log(bit-width-of-type-n)) for bit_width().
Both of the functions could be improved.
unsigned bit_width(unsigned x) {
unsigned width = 0;
while (x) {
x /= 2;
width++;
}
return width;
}
unsigned usqrt_NR(unsigned x) {
if (x == 0) {
return 0;
}
unsigned y = 1u << bit_width(x)/2;
unsigned y_previous;
unsigned diff;
unsigned diff1count = 0;;
do {
y_previous = y;
y = (y + x/y)/2;
diff = y_previous < y ? y - y_previous : y_previous - y;
if (diff == 1) diff1count++;
} while (diff > 1 || (diff == 1 && diff1count <= 1));
y = (y_previous + y)/2;
return y;
}
This minimizes the number of multiplications: it looks for the first square which is larger than n, meaning that the perfect square immediately before was the solution.
for (i = 1; i <= n; i++) {
if (i*i > n) {
break;
}
}
i--;
// i*i is your answer
On some platforms it might be useful to exploit the fact that (i+1)*(i+1) = i*i + 2*i + 1, or in other words, if you already have i^2, (i+1)^2 is obtained by adding i to it twice, and incrementing by 1; and at the beginning, 0^2 is 0 to prime the cycle.
for (i = 0, sq = 0; i < n; i++) {
sq += i; // Or on some platforms sq += i<<1 instead of two sums
sq += i; // Some compilers will auto-optimize "sq += 2*i" for the platform
sq++; // Or even sq += ((2*i)|1) as adding 1 to even numbers is OR'ing 1
if (sq > n) {
break;
}
// if sq is declared as signed integer, a possible overflow will
// show it as being negative. This way we can still get a "correct" result
// with i the smallest root that does not overflow.
// In 16-bit arithmetic this is 181, root of 32761; next square would be
// 33124 which cannot be represented in signed 16-bit space.
if (sq < 0) {
break;
}
}
// (i*i) is your answer
I have a 2 dimensional byte array of a bitmap pixel data that I need to send to a thermal printer using ESC/POS via comport. I can do this successfully. However, I need to shift the printed image to the right. Center justify, Right justify, HT, and all other ESC/POS commands have no effect due to the command used to print the bitmap (DC2 * r n [d1.....dn]).
I wish to left pad the array of bytes containing the bitmap in order to shift the printed image to the right. Below are my code lines to print bitmap
private void Print_Bipmap()
{
int x;
int y;
int i;
int RowBytes;
byte n;
Color Pixels;
byte[,] ImageArray = new byte[bitmap.Width, bitmap.Height];
// Calculate output size
RowBytes = (bitmap.Width + 7) / 8;
// Generate body of array
for (y = 0; y < bitmap.Height; y++)
{ // Each row...
for (x = 0; x < (bitmap.Width / 8); x++)
{ // Each 8-pixel block within row...
ImageArray[x, y] = 0;
for (n = 0; n < 8; n++)
{ // Each pixel within block...
Pixels = bitmap.GetPixel(x * 8 + n, y);
if (Pixels.GetBrightness() < 0.5)
{
ImageArray[x, y] += (byte)(1 << (7 - n));
}
}
}
}
comport_writeByte(18); //DC2
comport_writeByte(42); //*
comport_writeByte((byte)bitmap.Height); //r
comport_writeByte((byte)RowBytes); //n
for (y = 0; y < bitmap.Height; y++)
{
for (x = 0; x < RowBytes; x++)
{
comport_writeByte(ImageArray[x, y]); //[d1 ..... dn]
}
}
}
How do I left pad the 2 dimensional array (ImageArray[x, y])?
Many thanks in advance.
void scale_brightness( uint8_t array[],
unsigned int cols,
unsigned int rows,
double scale_factor )
{
for (int x = 0; x < rows; x++)
{
for (int y = 0; y < cols; y++)
{
array[x] = ceil(scale_factor * array[x]);
array[y] = ceil(scale_factor * array[y]);
if (array[x] >= 255 && array[y] >= 255)
{
array[x] = 255;
array[y] = 255;
}
}
}
}
So this function is supposed to multiply each pixel in an image by a scale factor. But for some reason, it's not working. I can't find whats wrong with it. Would anybody be able to help me out with it?
Just unroll the loops on paper and you'll see what's happening...
1st iteration (x = 0, y = 0)
array[0] = ceil(scale_factor * array[0]);
array[0] = ceil(scale_factor * array[0]);
if (array[0] >= 255 && array[0] >= 255)
{
array[0] = 255;
array[0] = 255;
}
Already this is nonsensical, you're doing the same operation twice on the same element and then your if statement checks for the same condition twice on the same element, to then assign 255 twice to that same value.
2nd iteration (x = 0, y = 1)
array[0] = ceil(scale_factor * array[0]);
array[1] = ceil(scale_factor * array[1]);
if (array[0] >= 255 && array[1] >= 255)
{
array[0] = 255;
array[1] = 255;
}
So now you're setting element 0 again even though you just did it in the last iteration, but at least we're considering element 1 now.
By extrapolating we can see that the calculation will be applied for pixels 0 to max(cols, rows). Obviously your array of pixels has many more pixels than that, probably (cols * rows) pixels, so your algorithm ignores most pixels but is applied many times for some pixels, and that's basically why it doesn't work.
Consider array[x] = ceil(scale_factor * array[x]);. If array[x] is 255 and scale_factor is 1.1, then that would be like array[x] = 281;. This is an overflow (281 is too large to fit in an unsigned 8-bit integer). C will discard the highest bits that don't fit, so it would be like array[x] = 25;.
Now consider if(array[x] >= 255) array[x] = 255;. Because it's working with unsigned 8-bit integers (that can only store values in the range from 0 to 255); it's only possible for (array[x] >= 255) to be true when array[x] contains the value 255. It would be equivalent to if(array[x] == 255) array[x] = 255;, which actually does nothing at all. Your if (array[x] >= 255 && array[y] >= 255) { code has the same problem - it does nothing.
Now read Asik's answer - he's right too. You're doing the same broken calculation multiple times on each pixel; and it's going to end up being a bit like a pseudo-random number generator.
For a game in Gameboy programming, I am using four arrays called top, oldTop, bottom and oldBottom:
struct Point { int x, y; };
struct Rect { struct Point xx, yy; };
Rect top[size], oldTop[size];
Rect bottom[size], oldBottom[i];
where Rect is a struct made of two Struct Points, the top-left and the bottom right corner points.
The idea of the game is to have random-heighted blocks top-down from the ceiling and bottom-up from the floor.
It is similar to the copter-classic game. In my infinite while loop, I shift all of the rectangles down by one pixel using the following code
while (1)
{
for (int i = 0; i < size; i++)
{
//in Struct Rect, xx is the top-left corner point, and yy is the bottom right
top[i].xx.x--;
top[i].yy.x--;
bottom[i].xx.x--;
bottom[i].yy.x--;
if (top[i].xx.x < 0)
{
top[i].xx.x += 240;
top[i].yy.x += 240;
}
if (bottom[i].xx.x < 0)
{
bottom[i].xx.x += 240;
bottom[i].yy.x += 240;
}
}
for (int i = 0; i < size; i++)
{
drawRect(oldTop[i], colorBlack);
drawRect(oldBottom[i], colorBlack);
}
/*call delay function that wait for Vertical Blank*/
for(int i = 0; i < size; i++)
{
drawRect(top[i], colorGreen);
drawRect(bottom[i], colorGreen);
oldTop[i] = top[i];
oldBottom[i] = bottom[i];
}
}
The drawRect method uses DMA to draw the rectangle.
with this code, the code should display the rectangles like this: (drew this up in paint)
But the result I get is
What is odd is that if I don't draw the bottom row at all, then the top row draws fine. The result only messes up when I draw both. This is really weird because I think that the code should be working fine, and the code is not very complicated. Is there a specific reason this is happening, and is there a way to remedy this?
Thanks.
The code that I use to draw the rectangle looks like this:
void drawRect(int row, int col, int width, int height){
int i;
for (i=0; i<height; i++)
{
DMA[3].src = &color;
DMA[3].dst = videoBuffer + (row+r)*240 + col);
DMA[3].cnt = DMA_ON | DMA_FIXED_SOURCE | width;
}
}
Here's a debugging SSCCE (Short, Self-Contained, Correct Example) based on your code. There are assertions in this code that fire; it runs, but is known not to be correct. I've renamed bottom to btm and oldBottom to oldBtm so that the names are symmetric; it makes the code layout more systematic (but is otherwise immaterial).
#include <assert.h>
#include <stdio.h>
typedef struct Point { int x, y; } Point;
typedef struct Rect { struct Point xx, yy; } Rect;
enum { size = 2 };
typedef enum { colourGreen = 0, colourBlack = 1 } Colour;
/*ARGSUSED*/
static void drawRect(Rect r, Colour c)
{
printf(" (%3d)(%3d)", r.xx.x, r.yy.x);
}
int main(void)
{
Rect top[size], oldTop[size];
Rect btm[size], oldBtm[size];
int counter = 0;
for (int i = 0; i < size; i++)
{
top[i].xx.x = 240 - 4 * i;
top[i].xx.y = 0 + 10 + i;
top[i].yy.x = 240 - 14 * i;
top[i].yy.y = 0 + 20 + i;
btm[i].xx.x = 0 + 72 * i;
btm[i].xx.y = 0 + 10 * i;
btm[i].yy.x = 0 + 12 * i;
btm[i].yy.y = 0 + 20 * i;
oldTop[i] = top[i];
oldBtm[i] = btm[i];
}
while (1)
{
if (counter++ > 480) // Limit amount of output!
break;
for (int i = 0; i < size; i++)
{
//in Struct Rect, xx is the top-left corner point, and yy is the bottom right
top[i].xx.x--;
top[i].yy.x--;
btm[i].xx.x--;
btm[i].yy.x--;
if (top[i].xx.x < 0)
{
top[i].xx.x += 240;
top[i].yy.x += 240;
}
if (btm[i].xx.x < 0)
{
btm[i].xx.x += 240;
btm[i].yy.x += 240;
}
}
for (int i = 0; i < size; i++)
{
assert(top[i].xx.x >= 0 && top[i].yy.x >= 0);
assert(btm[i].xx.x >= 0 && btm[i].yy.x >= 0);
}
for (int i = 0; i < size; i++)
{
drawRect(oldTop[i], colourBlack);
drawRect(oldBtm[i], colourBlack);
}
/*call delay function that wait for Vertical Blank*/
for(int i = 0; i < size; i++)
{
drawRect(top[i], colourGreen);
drawRect(btm[i], colourGreen);
oldTop[i] = top[i];
oldBtm[i] = btm[i];
}
putchar('\n');
}
return(0);
}
As noted in a late comment, one big difference between this and your code is that oldBottom in your code is declared as:
Rect top[size], oldTop[size];
Rect bottom[size], oldBottom[i];
using the size i instead of size. This probably accounts for array overwriting issues you see.
There's a second problem though; the assertions in the loop in the middle fire:
(240)(240) ( 0)( 0) (236)(226) ( 72)( 12) (239)(239) (239)(239) (235)(225) ( 71)( 11)
(239)(239) (239)(239) (235)(225) ( 71)( 11) (238)(238) (238)(238) (234)(224) ( 70)( 10)
(238)(238) (238)(238) (234)(224) ( 70)( 10) (237)(237) (237)(237) (233)(223) ( 69)( 9)
(237)(237) (237)(237) (233)(223) ( 69)( 9) (236)(236) (236)(236) (232)(222) ( 68)( 8)
(236)(236) (236)(236) (232)(222) ( 68)( 8) (235)(235) (235)(235) (231)(221) ( 67)( 7)
(235)(235) (235)(235) (231)(221) ( 67)( 7) (234)(234) (234)(234) (230)(220) ( 66)( 6)
(234)(234) (234)(234) (230)(220) ( 66)( 6) (233)(233) (233)(233) (229)(219) ( 65)( 5)
(233)(233) (233)(233) (229)(219) ( 65)( 5) (232)(232) (232)(232) (228)(218) ( 64)( 4)
(232)(232) (232)(232) (228)(218) ( 64)( 4) (231)(231) (231)(231) (227)(217) ( 63)( 3)
(231)(231) (231)(231) (227)(217) ( 63)( 3) (230)(230) (230)(230) (226)(216) ( 62)( 2)
(230)(230) (230)(230) (226)(216) ( 62)( 2) (229)(229) (229)(229) (225)(215) ( 61)( 1)
(229)(229) (229)(229) (225)(215) ( 61)( 1) (228)(228) (228)(228) (224)(214) ( 60)( 0)
Assertion failed: (btm[i].xx.x >= 0 && btm[i].yy.x >= 0), function main, file video.c, line 63.
I think your 'not negative' checks should be revised to:
if (top[i].xx.x < 0)
top[i].xx.x += 240;
if (top[i].yy.x < 0)
top[i].yy.x += 240;
if (btm[i].xx.x < 0)
btm[i].xx.x += 240;
if (btm[i].yy.x < 0)
btm[i].yy.x += 240;
This stops anything going negative. However, it is perfectly plausible that you should simply be checking on the bottom-right x-coordinate (instead of the top-left coordinate) using the original block. Or the wraparound may need to be more complex altogether. That's for you to decipher. But I think that the odd displays occur because you were providing negative values where you didn't intend to and weren't supposed to.
The key points to note here are:
When you're debugging an algorithm, you don't have to use the normal display mechanisms.
When you're debugging, reduce loop sizes where you can (size == 2).
Printing just the relevant information (here, the x-coordinates) helped reduce the output.
Putting the counter code to limit the amount of output simplifies things.
If things are going wrong, look for patterns in what is going wrong early.
I had various versions of the drawRect() function before I got to the design shown, which works well on a wide screen (eg 120x65) terminal window.