image proccessing further optimization - c

I'm new to optimization and was given a task to optimize a function that processes an image as much as possible. it takes an image, blurs it and then saves the blurred image, and then continues and sharpens the image, and saves also the sharpened image.
Here is my code:
typedef struct {
unsigned char red;
unsigned char green;
unsigned char blue;
} pixel;
// I delete the other struct because we can do the same operations with use of only addresses
//use macro instead of function is more efficient
#define calculateIndex(i, j, n) ((i)*(n)+(j))
// I combine all the functions in one because it is time consuming
void myfunction(Image *image, char* srcImgpName, char* blurRsltImgName, char* sharpRsltImgName) {
// use variable from type 'register int' is much more efficient from 'int'
register int i,j, ii, jj, sum_red, sum_green, sum_blue;
//using local variable is much more efficient than using pointer to pixels from the original image,and updat its value in each iteration
pixel current_pixel , p;
//dst will point on the first pixel in the image
pixel* dst = (pixel*)image->data;
int squareN = n*n;
//instead of multiply by 3 - I used shift
register int sizeToAllocate = ((squareN)<<1)+(squareN); // use variable from type 'register int' is much more efficient from 'int'
pixel* src = malloc(sizeToAllocate);
register int index;
//memcpy replace the old functions that converts chars to pixels or pixels to chars. it is very efficient and build-in in c libraries
memcpy(src, dst, sizeToAllocate);
///////////////////////////////////////// first step : smooth //////////////////////////////////////////////////////////////////////
/**the smooth blur is step that apply the blur-kernel (matrix of ints) over each pixel in the bouns - and make the image more smooth.
*this function was originally used this matrix :
* [1, 1, 1]
* [1, 1, 1]
* [1, 1, 1]
*because the matrix is full of 1 , we don't really need it - the access to the matrix is very expensive . instead of the matrix I used
*primitive variable.
*/
//the loops are starting with 1 and not with 0 because we need to check only the pixels with 8 neighbors around them
index = calculateIndex(1, 1, n);
for (i = 1 ; i < n - 1; ++i) {
for (j = 1 ; j < n - 1 ; ++j) {
// I used this variables as counters to the colors' values around a specific pixel
sum_red = 0;
sum_green = 0;
sum_blue = 0;
for(ii = i-1; ii <= i+1; ++ii) {
for(jj =j-1; jj <= j+1; ++jj) {
//take care of the [ii,jj] pixel in the matrix
//calculate the adrees of the current pixel
pixel p = src[calculateIndex(ii, jj, n)];
//sum the colors' values of the neighbors of the current pixel
sum_red += p.red;
sum_green += p.green;
sum_blue += p.blue;
}
}
//calculate the avarage of the colors' values around the current pixel - as written in the instructions
sum_red = (((sum_red) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
sum_green = (((sum_green) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
sum_blue = (((sum_blue) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
current_pixel.red = (unsigned char)sum_red;
current_pixel.green = (unsigned char)sum_green;
current_pixel.blue = (unsigned char)sum_blue;
dst[index++] = current_pixel;
}
}
// write result image to file
writeBMP(image, srcImgpName, blurRsltImgName);
//memcpy replace the old functions that converts chars to pixels or pixels to chars. it is very efficient and build-in in c libraries
memcpy(src, dst, sizeToAllocate);
///////////////////////////////////////// second step : sharp //////////////////////////////////////////////////////////////////////
/** I want to sharp the smooth image . In this step I apply the sharpen kernel (matrix of ints) over each pixel in the bouns - and make the image more sharp.
*this function was originally used this matrix :
* [-1, -1, -1]
* [-1, 9, -1]
* [-1, -1, -1]
*because the matrix is full of (-1) , we don't really need it - the access to the matrix is very expensive . instead of the matrix I used
*primitive variable. I operato like that : insted of multiply in (-1) in the end of the step , I define counter initializes with zero , and
*substruct all te colors' values from it. the result is actually the same as multiply by (-1), in more efficient way.
*/
//the loops are starting with 1 and not with 0 because we need to check only the pixels with 8 neighbors around them
for (i = 1 ; i < n-1; ++i) {
for (j = 1 ; j < n-1 ; ++j) {
// I used this variables as counters to the colors' values around a specific pixel
sum_red = 0;
sum_green = 0;
sum_blue = 0;
// Do central pixel first
p=src[calculateIndex(i,j,n)];
sum_red = 10*p.red;
sum_green = 10*p.green;
sum_blue = 10*p.blue;
for(ii =i-1; ii <= i + 1; ++ii) {
for(jj = j-1; jj <= j + 1; ++jj) {
p = src[calculateIndex(ii, jj, n)];
//operate according to the instructions
sum_red -= p.red;
sum_green -= p.green;
sum_blue -= p.blue;
}
}
//each pixel's colors' values must match the range [0,255] - I used the idea from the original code
//the red value must be in the range [0,255]
if (sum_red < 0) {
sum_red = 0;
} else if (sum_red > 255 ) {
sum_red = 255;
}
current_pixel.red = (unsigned char)sum_red;
//the green value must be in the range [0,255]
if (sum_green < 0) {
sum_green = 0;
} else if (sum_green > 255 ) {
sum_green = 255;
}
current_pixel.green = (unsigned char)sum_green;
//the blue value must be in the range [0,255]
if (sum_blue < 0) {
sum_blue = 0;
} else if (sum_blue > 255 ) {
sum_blue = 255;
}
current_pixel.blue = (unsigned char)sum_blue;
// put the updated pixel in [i,j] in the image
dst[calculateIndex(i, j, n)] = current_pixel;
}
}
//free the allocated space to prevent memory leaks
free(src);
// write result image to file
writeBMP(image, srcImgpName, sharpRsltImgName);
}
I wanted to ask about the if statements, is there anything better that can replace those? And also more generally speaking can anyone spot an optimization mistakes here, or can offer his inputs?
Thanks a lot!
updated code:
typedef struct {
unsigned char red;
unsigned char green;
unsigned char blue;
} pixel;
// I delete the other struct because we can do the same operations with use of only addresses
//use macro instead of function is more efficient
#define calculateIndex(i, j, n) ((i)*(n)+(j))
// I combine all the functions in one because it is time consuming
void myfunction(Image *image, char* srcImgpName, char* blurRsltImgName, char* sharpRsltImgName) {
// use variable from type 'register int' is much more efficient from 'int'
register int i,j, ii, jj, sum_red, sum_green, sum_blue;
//using local variable is much more efficient than using pointer to pixels from the original image,and updat its value in each iteration
pixel current_pixel , p;
//dst will point on the first pixel in the image
pixel* dst = (pixel*)image->data;
int squareN = n*n;
//instead of multiply by 3 - I used shift
register int sizeToAllocate = ((squareN)<<1)+(squareN); // use variable from type 'register int' is much more efficient from 'int'
pixel* src = malloc(sizeToAllocate);
register int index;
//memcpy replace the old functions that converts chars to pixels or pixels to chars. it is very efficient and build-in in c libraries
memcpy(src, dst, sizeToAllocate);
///////////////////////////////////////// first step : smooth //////////////////////////////////////////////////////////////////////
/**the smooth blur is step that apply the blur-kernel (matrix of ints) over each pixel in the bouns - and make the image more smooth.
*this function was originally used this matrix :
* [1, 1, 1]
* [1, 1, 1]
* [1, 1, 1]
*because the matrix is full of 1 , we don't really need it - the access to the matrix is very expensive . instead of the matrix I used
*primitive variable.
*/
//the loops are starting with 1 and not with 0 because we need to check only the pixels with 8 neighbors around them
index = calculateIndex(1, 1, n);
for (i = 1 ; i < n - 1; ++i) {
for (j = 1 ; j < n - 1 ; ++j) {
// I used this variables as counters to the colors' values around a specific pixel
sum_red = 0;
sum_green = 0;
sum_blue = 0;
for(ii = i-1; ii <= i+1; ++ii) {
for(jj =j-1; jj <= j+1; ++jj) {
//take care of the [ii,jj] pixel in the matrix
//calculate the adrees of the current pixel
pixel p = src[calculateIndex(ii, jj, n)];
//sum the colors' values of the neighbors of the current pixel
sum_red += p.red;
sum_green += p.green;
sum_blue += p.blue;
}
}
//calculate the avarage of the colors' values around the current pixel - as written in the instructions
sum_red = (((sum_red) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
sum_green = (((sum_green) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
sum_blue = (((sum_blue) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
current_pixel.red = (unsigned char)sum_red;
current_pixel.green = (unsigned char)sum_green;
current_pixel.blue = (unsigned char)sum_blue;
dst[index++] = current_pixel;
}
index += 2;
}
// write result image to file
writeBMP(image, srcImgpName, blurRsltImgName);
//memcpy replace the old functions that converts chars to pixels or pixels to chars. it is very efficient and build-in in c libraries
memcpy(src, dst, sizeToAllocate);
///////////////////////////////////////// second step : sharp //////////////////////////////////////////////////////////////////////
/** I want to sharp the smooth image . In this step I apply the sharpen kernel (matrix of ints) over each pixel in the bouns - and make the image more sharp.
*this function was originally used this matrix :
* [-1, -1, -1]
* [-1, 9, -1]
* [-1, -1, -1]
*because the matrix is full of (-1) , we don't really need it - the access to the matrix is very expensive . instead of the matrix I used
*primitive variable. I operato like that : insted of multiply in (-1) in the end of the step , I define counter initializes with zero , and
*substruct all te colors' values from it. the result is actually the same as multiply by (-1), in more efficient way.
*/
index = calculateIndex(1,1,n);
//the loops are starting with 1 and not with 0 because we need to check only the pixels with 8 neighbors around them
for (i = 1 ; i < n-1; ++i) {
for (j = 1 ; j < n-1 ; ++j) {
// I used this variables as counters to the colors' values around a specific pixel
sum_red = 0;
sum_green = 0;
sum_blue = 0;
// Do central pixel first
p=src[index];
sum_red = 10*p.red;
sum_green = 10*p.green;
sum_blue = 10*p.blue;
for(ii =i-1; ii <= i + 1; ++ii) {
for(jj = j-1; jj <= j + 1; ++jj) {
p = src[calculateIndex(ii, jj, n)];
//operate according to the instructions
sum_red -= p.red;
sum_green -= p.green;
sum_blue -= p.blue;
}
index += 2;
}
//each pixel's colors' values must match the range [0,255] - I used the idea from the original code
//the red value must be in the range [0,255]
if (sum_red < 0) {
sum_red = 0;
} else if (sum_red > 255 ) {
sum_red = 255;
}
current_pixel.red = (unsigned char)sum_red;
//the green value must be in the range [0,255]
if (sum_green < 0) {
sum_green = 0;
} else if (sum_green > 255 ) {
sum_green = 255;
}
current_pixel.green = (unsigned char)sum_green;
//the blue value must be in the range [0,255]
if (sum_blue < 0) {
sum_blue = 0;
} else if (sum_blue > 255 ) {
sum_blue = 255;
}
current_pixel.blue = (unsigned char)sum_blue;
// put the updated pixel in [i,j] in the image
dst[calculateIndex(i, j, n)] = current_pixel;
}
}
//free the allocated space to prevent memory leaks
free(src);
// write result image to file
writeBMP(image, srcImgpName, sharpRsltImgName);
}
------------------------------------------------------------------------------updated code:
typedef struct {
unsigned char red;
unsigned char green;
unsigned char blue;
} pixel;
// I delete the other struct because we can do the same operations with use of only addresses
//use macro instead of function is more efficient
#define calculateIndex(i, j, n) ((i)*(n)+(j))
// I combine all the functions in one because it is time consuming
void myfunction(Image *image, char* srcImgpName, char* blurRsltImgName, char* sharpRsltImgName) {
// use variable from type 'register int' is much more efficient from 'int'
register int i,j, ii, jj, sum_red, sum_green, sum_blue;
//using local variable is much more efficient than using pointer to pixels from the original image,and updat its value in each iteration
pixel current_pixel , p;
//dst will point on the first pixel in the image
pixel* dst = (pixel*)image->data;
int squareN = n*n;
//instead of multiply by 3 - I used shift
register int sizeToAllocate = ((squareN)<<1)+(squareN); // use variable from type 'register int' is much more efficient from 'int'
pixel* src = malloc(sizeToAllocate);
register int index;
//memcpy replace the old functions that converts chars to pixels or pixels to chars. it is very efficient and build-in in c libraries
memcpy(src, dst, sizeToAllocate);
///////////////////////////////////////// first step : smooth //////////////////////////////////////////////////////////////////////
/**the smooth blur is step that apply the blur-kernel (matrix of ints) over each pixel in the bouns - and make the image more smooth.
*this function was originally used this matrix :
* [1, 1, 1]
* [1, 1, 1]
* [1, 1, 1]
*because the matrix is full of 1 , we don't really need it - the access to the matrix is very expensive . instead of the matrix I used
*primitive variable.
*/
//the loops are starting with 1 and not with 0 because we need to check only the pixels with 8 neighbors around them
index = n + 1;
for (i = 1 ; i < n - 1; ++i) {
for (j = 1 ; j < n - 1 ; ++j) {
// I used this variables as counters to the colors' values around a specific pixel
sum_red = 0;
sum_green = 0;
sum_blue = 0;
for(ii = i-1; ii <= i+1; ++ii) {
for(jj =j-1; jj <= j+1; ++jj) {
//take care of the [ii,jj] pixel in the matrix
//calculate the adrees of the current pixel
pixel p = src[calculateIndex(ii, jj, n)];
//sum the colors' values of the neighbors of the current pixel
sum_red += p.red;
sum_green += p.green;
sum_blue += p.blue;
}
}
//calculate the avarage of the colors' values around the current pixel - as written in the instructions
sum_red = (((sum_red) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
sum_green = (((sum_green) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
sum_blue = (((sum_blue) * 0xE38F) >> 19);//instead of dividing by 9 - I used shift because it is more efficient
current_pixel.red = (unsigned char)sum_red;
current_pixel.green = (unsigned char)sum_green;
current_pixel.blue = (unsigned char)sum_blue;
dst[index++] = current_pixel;
}
index += 2;
}
// write result image to file
writeBMP(image, srcImgpName, blurRsltImgName);
//memcpy replace the old functions that converts chars to pixels or pixels to chars. it is very efficient and build-in in c libraries
memcpy(src, dst, sizeToAllocate);
///////////////////////////////////////// second step : sharp //////////////////////////////////////////////////////////////////////
/** I want to sharp the smooth image . In this step I apply the sharpen kernel (matrix of ints) over each pixel in the bouns - and make the image more sharp.
*this function was originally used this matrix :
* [-1, -1, -1]
* [-1, 9, -1]
* [-1, -1, -1]
*because the matrix is full of (-1) , we don't really need it - the access to the matrix is very expensive . instead of the matrix I used
*primitive variable. I operate like that : instead of multiply in (-1) in the end of the step , I define counter initializes with zero , and
*substruct all te colors' values from it. the result is actually the same as multiply by (-1), in more efficient way.
*/
index = calculateIndex(1,1,n);
//the loops are starting with 1 and not with 0 because we need to check only the pixels with 8 neighbors around them
for (i = 1 ; i < n-1; ++i) {
for (j = 1 ; j < n-1 ; ++j) {
// I used this variables as counters to the colors' values around a specific pixel
sum_red = 0;
sum_green = 0;
sum_blue = 0;
// Do central pixel first
p=src[index];
sum_red = 10*p.red;
sum_green = 10*p.green;
sum_blue = 10*p.blue;
for(ii =i-1; ii <= i + 1; ++ii) {
for(jj = j-1; jj <= j + 1; ++jj) {
p = src[calculateIndex(ii, jj, n)];
//operate according to the instructions
sum_red -= p.red;
sum_green -= p.green;
sum_blue -= p.blue;
}
}
//each pixel's colors' values must match the range [0,255] - I used the idea from the original code
//the red value must be in the range [0,255]
if (sum_red < 0) {
sum_red = 0;
} else if (sum_red > 255 ) {
sum_red = 255;
}
current_pixel.red = (unsigned char)sum_red;
//the green value must be in the range [0,255]
if (sum_green < 0) {
sum_green = 0;
} else if (sum_green > 255 ) {
sum_green = 255;
}
current_pixel.green = (unsigned char)sum_green;
//the blue value must be in the range [0,255]
if (sum_blue < 0) {
sum_blue = 0;
} else if (sum_blue > 255 ) {
sum_blue = 255;
}
current_pixel.blue = (unsigned char)sum_blue;
// put the updated pixel in [i,j] in the image
dst[calculateIndex(i, j, n)] = current_pixel;
}
index += 2;
}
//free the allocated space to prevent memory leaks
free(src);
// write result image to file
writeBMP(image, srcImgpName, sharpRsltImgName);
}

Some general optimization guidelines:
If you're running on x86, compile as a 64-bit binary. x86 is really a register-starved CPU. In 32-bit mode you pretty much have only 5 or 6 32-bit general-purpose registers available, and you only get "all" 6 if you compile with optimizations like -fomit-frame-pointer on GCC. In 64-bit mode you'll have 13 or 14 64-bit general-purpose registers.
Get a good compiler and use the highest possible general optimization level.
Profile! Profile! Profile! Actually profile your code so actually know where the performance bottlenecks are. Any guesses about the location of any performance bottlenecks are likely wrong.
Once you find your bottlenecks, examine the actual instructions the compiler produces and look at the bottleneck areas, just to see what's happening. Perhaps the bottleneck is where the compiler had to do a lot of register spilling and filling because of register pressure. This can be really helpful if you can profile down to the instruction level.
Use the insights from the profiling and examination of the generated instructions to improve your code and compile arguments. For example, if you're seeing a lot of register spilling and filling, you need to reduce register pressure, perhaps by manually coalescing loops or disabling prefetching with a compiler option.
Experiment with different page size options. If a single row of pixels is a significant fraction of a page size, reaching into other rows is more likely to reach into another page and result in a TLB miss. Using larger memory pages may significantly reduce this.
Some specific ideas for your code:
Use only one outer loop. You'll have to experiment to find the fastest way to handle your "extra" edge pixels. The fastest way might be to not do anything special, roll right over them like "normal" pixels, and just ignore the values in them later.
Manually unroll the two inner loops - you're only doing 9 pixels.
Don't use calculateIndex() - use the address of the current pixel and find the other pixels simply by subtracting or adding the proper value from the current pixel address. For example, the address of the upper-left pixel in your inner loops would be something like currentPixelAddress - n - 1.
Those would convert your four-deep nested loops into a single loop with very little index calculations needed.

A few ideas - untested.
You have if(ii==i && jj=j) to test for the central pixel in your sharpening loop which you do 9x for every pixel. I think it would be faster to remove that if and do exactly the same for every pixel but then make a correction, outside the loop by adding 10x the central pixel.
// Do central pixel first
p=src[calculateIndex(i,j,n)];
sum_red = 10*p.red;
sum_green = 10*p.green;
sum_blue = 10*p.blue;
for(ii =i-1; ii <= i + 1; ++ii) {
for(jj = j-1; jj <= j + 1; ++jj) {
p = src[calculateIndex(ii, jj, n)];
//operate according to the instructions
sum_red -= p.red;
sum_green -= p.green;
sum_blue -= p.blue;
}
}
Where you do dst[calculateIndex(i, j, n)] = current_pixel;, you can probably calculate the index once before the loop at the start and then just increment the pointer with each write inside the loop - assuming your arrays are contiguous and unpadded.
index=calculateIndex(1,1,n)
for (i = 1 ; i < n - 1; ++i) {
for (j = 1 ; j < n - 1 ; ++j) {
...
dst[index++] = current_pixel;
}
index+=2; // skip over last pixel of this line and first pixel of next line
}
As you move your 3x3 window of 9 pixels across the image, you could "remember" the left-most column of 3 pixels from the previous position, then instead of 9 additions for each pixel, you would do a single subtraction for the left-most column leaving the window and 3 additions for the new column entering the window on the right side, i.e. 4 calculations instead of 9.

Related

Blur filter in C results in only a slightly changed image

i am trying to make a blur filter in c that takes the neighboring pixels of the main pixel, takes the avarage of the rgb values and stores it in the temp array, them changes the image using the temp array values, it seems correct but it is not working as intended, giving an output of a very slightly blured image. I realy dont see my mistake and would be very thankful if someone helped, sorry if i made something horrible, started learning c last week.
i checked this post
Blurring an Image in c pixel by pixel - special cases
but i did not see were i went wrong.
im working with this data struct
BYTE rgbtBlue;
BYTE rgbtGreen;
BYTE rgbtRed;
void blur(int height, int width, RGBTRIPLE image[height][width])
{
// ints to use later
int j;
int p;
RGBTRIPLE temp[height][width];
for(int n = 0; n < height; n++) // loop to check every pixel
{
for(int k = 0; k < width; k++)
{
int widx = 3;
int hghtx = 3;
// conditionals for border cases
int y = 0;
if(n == 0)
{
p = 0;
hghtx = 2;
}
if(n == height - 1)
{
p = -1;
hghtx = 2;
}
if(k == 0)
{
j = 0;
widx = 2;
}
if(k == width - 1)
{
j = -1;
widx = 2;
}
for(int u = 0; u < hghtx; u++) // matrix of pixels around the main pixel using the conditionals gathered before
for(int i = 0; i < widx; i++)
if(y == 1) // takes the average of color and stores it in the RGB temp
{
temp[n][k].rgbtGreen = temp[n][k].rgbtGreen + image[n + p + u][k + j + i].rgbtGreen / (hghtx * widx);
temp[n][k].rgbtRed = temp[n][k].rgbtRed + image[n + p + u][k + j + i].rgbtRed / (hghtx * widx);
temp[n][k].rgbtBlue = temp[n][k].rgbtBlue + image[n + p + u][k + j + i].rgbtBlue / (hghtx * widx);
}
else // get first value of temp
{
temp[n][k].rgbtGreen = (image[n + p + u][k + j + i].rgbtGreen) / (hghtx * widx);
temp[n][k].rgbtRed = (image[n + p + u][k + j + i].rgbtRed) / (hghtx * widx);
temp[n][k].rgbtBlue = (image[n + p + u][k + j + i].rgbtBlue) / (hghtx * widx);
y++;
}
}
}
// changes the original image to the blured one
for(int n = 0; n < height; n++)
for(int k = 0; k < width; k++)
image[n][k] = temp[n][k];
}
I think it's a combination of things.
If the code worked the way you expect, you would be still doing a blur of just 3x3 pixels and that can be hardly noticeable, especially on large images (I'm pretty sure it will be unnoticeable on an image 4000x3000 pixels)
There are some problems with the code.
As #Fe2O3 says, at the end of the first line, widx will change to 2 and stay 2 for the rest of the image.
you are reading from temp[][] without initializing it. I think that if you compile that in release mode (not debug), temp[][] will contain random data and not all zeros as you probably expect. (as #WeatherWane pointed out)
The way you calculate the average of the pixels is weird. If you use a matrix 3x3 pixels, each pixel value shoud be divided by 9 in the final sum. But you divide the first pixel nine times by 2 (in effect doing /256), the second one eight times by 2 (so its pixel/128) etc. until the last one is divided by 2. So basically, it's mostly the value of the bottom right pixel.
also, since your RGB values are just bytes, you may want to divide them first and only then add them, because otherwise, you'll get overflows with wild results.
Try using a debugger to see the values you are actually calculating. It can be quite an eye opener :)

Kiss FFT on a dsPIC33

I have been trying to get KissFFT to work on a dsPIC, however after trying various different ways, the output is not what it should be. I was hoping to get some help to see if there are any configurations that I may be overlooking or if its just somthing i haven't thought of?
I am using a dsPIC33EP256MC202 with the XC16 compiler within MPLABX.
Declarations and memory assignment.
int readings[3] = {0, 0, 0};
kiss_fft_scalar zero;
memset(&zero,0,sizeof(zero));
int size = 128 * 2;
float fin[256];
kiss_fft_cpx in[size];
kiss_fft_cpx out[size];
for (i = 0; i < size; i++) {
in[i].r = zero;
in[i].i = zero;
out[i].r = zero;
out[i].i = zero;
}
kiss_fft_cfg mycfg = kiss_fft_alloc(size*2 ,0 ,NULL,NULL);
Get readings from an accellerometer on the breadboard and populate the float array (using pythagoras to consolidate the 3 axis' into one signal). The input XYZ value are scaled down as they come in anywhere between -2400 and 2400 on average.
while(1)
{
if(iii <= 1){
UART_Write_Text("Collecting...");
}
getOutput(readings);
X = (double)readings[0];
Y = (double)readings[1];
Z = (double)readings[2];
X = X / 50;
Y = Y / 50;
Z = Z / 50;
if(ii <= 256){
fin[ii] = sqrt(X*X + Y*Y + Z*Z);
ii++;
}
else{
i=0;
while(i<255){
fin[i] = fin[i+1];
i++;
}
fin[255] = sqrt(X*X + Y*Y + Z*Z);
}
Once the float array is full of values, populate the real component of the input complex array with the values in the float array. Then perform the Kiss FFT and populate a float array (arrayDFTOUT) with the absolute value of each real and imaginary value of the out array of Kiss FFT, the final loop makes any negative value positive.
if(iii == 255){
iii = 0;
UART_Write_Text("Processing...");
for (i = 0; i < size; i++) {
// samples are type of short
in[i].r = fin[i];
in[i].i = zero;
out[i].r = zero;
out[i].i = zero;
}
kiss_fft(mycfg, in, out);
for(i=0;i<128;i++){
arrayDFTOUT[i] = sqrt((out[i].r*out[i].r) + (out[i].i*out[i].i));
}
arrayDFTOUT[0] = 1;
for(i = 0; i<128; i++){
if(arrayDFTOUT[i] < 0){
arrayDFTOUT[i] = arrayDFTOUT[i] - (arrayDFTOUT[i]*2);
}
}
Finally display the output values through serial using the UART on the breadboard.
for(i = 0; i < 128; i++){
sprintf(temp, "%f,", arrayDFTOUT[i]);
UART_Write_Text(temp);
}
And are the results. All zero's aparet from the first value that was set to 1 after KissFFT had been performed. Any ideas?

2D convolution with a with a kernel which is not center originated

I want to do 2D convolution of an image with a Gaussian kernel which is not centre originated given by equation:
h(x-x', y-y') = exp(-((x-x')^2+(y-y'))/2*sigma)
Lets say the centre of kernel is (1,1) instead of (0,0). How should I change my following code for generation of kernel and for the convolution?
int krowhalf=krow/2, kcolhalf=kcol/2;
int sigma=1
// sum is for normalization
float sum = 0.0;
// generate kernel
for (int x = -krowhalf; x <= krowhalf; x++)
{
for(int y = -kcolhalf; y <= kcolhalf; y++)
{
r = sqrtl((x-1)*(x-1) + (y-1)*(y-1));
gKernel[x + krowhalf][y + kcolhalf] = exp(-(r*r)/(2*sigma));
sum += gKernel[x + krowhalf][y + kcolhalf];
}
}
//normalize the Kernel
for(int i = 0; i < krow; ++i)
for(int j = 0; j < kcol; ++j)
gKernel[i][j] /= sum;
float **convolve2D(float** in, float** out, int h, int v, float **kernel, int kCols, int kRows)
{
int kCenterX = kCols / 2;
int kCenterY = kRows / 2;
int i,j,m,mm,n,nn,ii,jj;
for(i=0; i < h; ++i) // rows
{
for(j=0; j < v; ++j) // columns
{
for(m=0; m < kRows; ++m) // kernel rows
{
mm = kRows - 1 - m; // row index of flipped kernel
for(n=0; n < kCols; ++n) // kernel columns
{
nn = kCols - 1 - n; // column index of flipped kernel
//index of input signal, used for checking boundary
ii = i + (m - kCenterY);
jj = j + (n - kCenterX);
// ignore input samples which are out of bound
if( ii >= 0 && ii < h && jj >= 0 && jj < v )
//out[i][j] += in[ii][jj] * (kernel[mm+nn*29]);
out[i][j] += in[ii][jj] * (kernel[mm][nn]);
}
}
}
}
}
Since you're using the convolution operator you have 2 choices:
Using it Spatial Invariant property.
To so so, just calculate the image using regular convolution filter (Better done using either conv2 or imfilter) and then shift the result.
You should mind the boundary condition you'd to employ (See imfilter properties).
Calculate the shifted result specifically.
You can do this by loops as you suggested or more easily create non symmetric kernel and still use imfilter or conv2.
Sample Code (MATLAB)
clear();
mInputImage = imread('3.png');
mInputImage = double(mInputImage) / 255;
mConvolutionKernel = zeros(3, 3);
mConvolutionKernel(2, 2) = 1;
mOutputImage01 = conv2(mConvolutionKernel, mInputImage);
mConvolutionKernelShifted = [mConvolutionKernel, zeros(3, 150)];
mOutputImage02 = conv2(mConvolutionKernelShifted, mInputImage);
figure();
imshow(mOutputImage01);
figure();
imshow(mOutputImage02);
The tricky part is to know to "Crop" the second image in the same axis as the first.
Then you'll have a shifted image.
You can use any Kernel and any function which applies convolution.
Enjoy.

C Language - General algorithm to read a square matrix, based on the square number of it's side?

So we're reading a matrix and saving it in an array sequentially. We read the matrix from a starting [x,y] point which is provided. Here's an example of some code I wrote to get the values of [x-1,y] [x+1,y] [x,y-1] [x,y+1], which is a cross.
for(i = 0, n = -1, m = 0, array_pos = 0; i < 4; i++, n++, array_pos++) {
if(x+n < filter_matrix.src.columns && x+n >= 0 )
if(y+m < filter_matrix.src.lines && y+m >= 0){
for(k = 0; k < numpixels; k++) {
arrayToProcess[array_pos].rgb[h] = filter_matrix.src.points[x+n][y+m].rgb[h];
}
}
m = n;
m++;
}
(The if's are meant to avoid reading null positions, since it's an image we're reading the origin pixel can be located in a corner. Not relevant to the issue here.)
Now is there a similar generic algorithm which can read ALL the elements around as a square (not just a cross) based on a single parameter, which is the size of the square's side squared?
If it helps, the only values we're dealing with are 9, 25 and 49 (a 3x3 5x5 and 7x7 square).
Here is a generalized code for reading the square centered at (x,y) of size n
int startx = x-n/2;
int starty = y-n/2;
for(int u=0;u<n;u++) {
for(int v=0;v<n;v++) {
int i = startx + u;
int j = starty + v;
if(i>=0 && j>=0 && i<N && j<M) {
printf(Matrix[i][j]);
}
}
}
Explanation: Start from top left value which is (x - n/2, y-n/2) now consider that you are read a normal square matrix from where i and j are indices of Matrix[i][j]. So we just added startx & starty to shift the matrix at (0,0) to (x-n/2,y-n/2).
Given:
static inline int min(int x, int y) { return (x < y) ? x : y; }
static inline int max(int x, int y) { return (x > y) ? x : y; }
or equivalent macros, and given that:
the x-coordinates range from 0 to x_max (inclusive),
the y-coordinates range from 0 to y_max (inclusive),
the centre of the square (x,y) is within the bounds,
the square you are creating has sides of (2 * size + 1) (so size is 1, 2, or 3 for the 3x3, 5x5, and 7x7 cases; or if you prefer to have sq_side = one of 3, 5, 7, then size = sq_side / 2),
the integer types are all signed (so x - size can produce a negative value; if they're unsigned, you will get the wrong result using the expressions shown),
then you can ensure that you are within bounds by setting:
x_lo = max(x - size, 0);
x_hi = min(x + size, x_max);
y_lo = max(y - size, 0);
y_hi = min(y + size, y_max);
for (x_pos = x_lo; x_pos <= x_hi; x_pos++)
{
for (y_pos = y_lo; y_pos <= y_hi; y_pos++)
{
// Process the data at array[x_pos][y_pos]
}
}
Basically, the initial assignments determine the bounds of the the array from [x-size][y-size] to [x+size][y+size], but bounded by 0 on the low side and the maximum sizes on the high end. Then scan over the relevant rectangular (usually square) sub-section of the matrix. Note that this determines the valid ranges once, outside the loops, rather than repeatedly within the loops.
If the integer types are signed, you have ensure you never try to create a negative number during subtraction. The expressions could be rewritten as:
x_lo = x - min(x, size);
x_hi = min(x + size, x_max);
y_lo = y - min(y, size);
y_hi = min(y + size, y_max);
which isn't as symmetric but only uses the min function.
Given the coordinates (x,y), you first need to find the surrounding elements. You can do that with a double for loop, like this:
for (int i = x-1; i <= x+1; i++) {
for (int j = y-1; j <= y+1; j++) {
int elem = square[i][j];
}
}
Now you just need to do a bit of work to make sure that 0 <= i,j < n, where n is the length of a side;
I don't know whether the (X,Y) in your code is the center of the square. I assume it is.
If the side of the square is odd. generate the coordinates of the points on the square. I assume the center is (0,0). Then the points on the squares are
(-side/2, [-side/2,side/2 - 1]); ([-side/2 + 1,side/2], -side/2); (side/2,[side/2 - 1,-side/2]);([side/2 - 1, side/2],-side/2);
side is the length of the square
make use of this:
while(int i<=0 && int j<=0)
for (i = x-1; i <= x+1; i++) {
for (j = y-1; j <= y+1; j++) {
int elem = square[i][j];
}
}
}

Optimize Bilinear Resize Algorithm in C

Can anyone spot any way to improve the speed in the next Bilinear resizing Algorithm?
I need to improve Speed as this is critical, keeping good image quality. Is expected to be used in mobile devices with low speed CPUs.
The algorithm is used mainly for up-scale resizing. Any other faster Bilinear algorithm also would be appreciated. Thanks
void resize(int* input, int* output, int sourceWidth, int sourceHeight, int targetWidth, int targetHeight)
{
int a, b, c, d, x, y, index;
float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
float y_ratio = ((float)(sourceHeight - 1)) / targetHeight;
float x_diff, y_diff, blue, red, green ;
int offset = 0 ;
for (int i = 0; i < targetHeight; i++)
{
for (int j = 0; j < targetWidth; j++)
{
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
a = input[index] ;
b = input[index + 1] ;
c = input[index + sourceWidth] ;
d = input[index + sourceWidth + 1] ;
// blue element
blue = (a&0xff)*(1-x_diff)*(1-y_diff) + (b&0xff)*(x_diff)*(1-y_diff) +
(c&0xff)*(y_diff)*(1-x_diff) + (d&0xff)*(x_diff*y_diff);
// green element
green = ((a>>8)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>8)&0xff)*(x_diff)*(1-y_diff) +
((c>>8)&0xff)*(y_diff)*(1-x_diff) + ((d>>8)&0xff)*(x_diff*y_diff);
// red element
red = ((a>>16)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>16)&0xff)*(x_diff)*(1-y_diff) +
((c>>16)&0xff)*(y_diff)*(1-x_diff) + ((d>>16)&0xff)*(x_diff*y_diff);
output [offset++] =
0x000000ff | // alpha
((((int)red) << 24)&0xff0000) |
((((int)green) << 16)&0xff00) |
((((int)blue) << 8)&0xff00);
}
}
}
Off the the top of my head:
Stop using floating-point, unless you're certain your target CPU has it in hardware with good performance.
Make sure memory accesses are cache-optimized, i.e. clumped together.
Use the fastest data types possible. Sometimes this means smallest, sometimes it means "most native, requiring least overhead".
Investigate if signed/unsigned for integer operations have performance costs on your platform.
Investigate if look-up tables rather than computations gain you anything (but these can blow the caches, so be careful).
And, of course, do lots of profiling and measurements.
In-Line Cache and Lookup Tables
Cache your computations in your algorithm.
Avoid duplicate computations (like (1-y_diff) or (x_ratio * j))
Go through all the lines of your algorithm, and try to identify patterns of repetitions. Extract these to local variables. And possibly extract to functions, if they are short enough to be inlined, to make things more readable.
Use a lookup-table
It's quite likely that, if you can spare some memory, you can implement a "store" for your RGB values and simply "fetch" them based on the inputs that produced them. Maybe you don't need to store all of them, but you could experiment and see if some come back often. Alternatively, you could "fudge" your colors and thus end up with less values to store for more lookup inputs.
If you know the boundaries for you inputs, you can calculate the complete domain space and figure out what makes sense to cache. For instance, if you can't cache the whole R, G, B values, maybe you can at least pre-compute the shiftings ((b>>16) and so forth...) that are most likely deterministic in your case).
Use the Right Data Types for Performance
If you can avoid double and float variables, use int. On most architectures, int would be test faster type for computations because of the memory model. You can still achieve decent precision by simply shifting your units (ie use 1026 as int instead of 1.026 as double or float). It's quite likely that this trick would be enough for you.
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
Could surely use some optimization: you were using x_ration * j-1 just a few cycles earlier, so all you really need here is x+=x_ratio
My random guess (use a profiler instead of letting people guess!):
The compiler has to generate that works when input and output overlap which means it has to do generate loads of redundant stores and loads. Add restrict to the input and output parameters to remove that safety feature.
You could also try using a=b; and c=d; instead of loading them again.
here is my version, steal some ideas. My C-fu is quite weak, so some lines are pseudocodes, but you can fix them.
void resize(int* input, int* output,
int sourceWidth, int sourceHeight,
int targetWidth, int targetHeight
) {
// Let's create some lookup tables!
// you can move them into 2-dimensional arrays to
// group together values used at the same time to help processor cache
int sx[0..targetWidth ]; // target->source X lookup
int sy[0..targetHeight]; // target->source Y lookup
int mx[0..targetWidth ]; // left pixel's multiplier
int my[0..targetHeight]; // bottom pixel's multiplier
// we don't have to calc indexes every time, find out when
bool reloadPixels[0..targetWidth ];
bool shiftPixels[0..targetWidth ];
int shiftReloadPixels[0..targetWidth ]; // can be combined if necessary
int v; // temporary value
for (int j = 0; j < targetWidth; j++){
// (8bit + targetBits + sourceBits) should be < max int
v = 256 * j * (sourceWidth-1) / (targetWidth-1);
sx[j] = v / 256;
mx[j] = v % 256;
reloadPixels[j] = j ? ( sx[j-1] != sx[j] ? 1 : 0)
: 1; // always load first pixel
// if no reload -> then no shift too
shiftPixels[j] = j ? ( sx[j-1]+1 = sx[j] ? 2 : 0)
: 0; // nothing to shift at first pixel
shiftReloadPixels[j] = reloadPixels[i] | shiftPixels[j];
}
for (int i = 0; i < targetHeight; i++){
v = 256 * i * (sourceHeight-1) / (targetHeight-1);
sy[i] = v / 256;
my[i] = v % 256;
}
int shiftReload;
int srcIndex;
int srcRowIndex;
int offset = 0;
int lm, rm, tm, bm; // left / right / top / bottom multipliers
int a, b, c, d;
for (int i = 0; i < targetHeight; i++){
srcRowIndex = sy[ i ] * sourceWidth;
tm = my[i];
bm = 255 - tm;
for (int j = 0; j < targetWidth; j++){
// too much ifs can be too slow, measure.
// always true for first pixel in a row
if( shiftReload = shiftReloadPixels[ j ] ){
srcIndex = srcRowIndex + sx[j];
if( shiftReload & 2 ){
a = b;
c = d;
}else{
a = input[ srcIndex ];
c = input[ srcIndex + sourceWidth ];
}
b = input[ srcIndex + 1 ];
d = input[ srcIndex + 1 + sourceWidth ];
}
lm = mx[j];
rm = 255 - lm;
// WTF?
// Input AA RR GG BB
// Output RR GG BB AA
if( j ){
leftOutput = rightOutput ^ 0xFFFFFF00;
}else{
leftOutput =
// blue element
((( ( (a&0xFF)*tm
+ (c&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((a>>8)&0xFF)*tm
+ ((c>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((a>>16)&0xFF)*tm
+ ((c>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
}
rightOutput =
// blue element
((( ( (b&0xFF)*tm
+ (d&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((b>>8)&0xFF)*tm
+ ((d>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((b>>16)&0xFF)*tm
+ ((d>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
output[offset++] =
// alpha
0x000000ff
| leftOutput
| rightOutput
;
}
}
}

Resources