I need to use MPI and OpenMP (2 different problems) to parallelize a code from the Sbac-Pad marathon (reference: http://lspd.mackenzie.br/marathon/18/problems.html). I am working on the Himeno benchmark. I believe the only part of this code that is worth parallelizing is the jacobi function:
#define MR(mt,n,r,c,d) mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols* mt->mdeps + (c) * mt->mdeps + (d)]
struct Matrix {
float* m;
int mnums;
int mrows;
int mcols;
int mdeps;
};
float
jacobi(int nn, Matrix* a,Matrix* b,Matrix* c,
Matrix* p,Matrix* bnd,Matrix* wrk1,Matrix* wrk2)
{
int i,j,k,n,imax,jmax,kmax;
float gosa,s0,ss;
imax= p->mrows-1;
jmax= p->mcols-1;
kmax= p->mdeps-1;
for(n=0 ; n<nn ; n++){
gosa = 0.0;
for(i=1 ; i<imax; i++)
for(j=1 ; j<jmax ; j++)
for(k=1 ; k<kmax ; k++){
s0= MR(a,0,i,j,k)*MR(p,0,i+1,j, k)
+ MR(a,1,i,j,k)*MR(p,0,i, j+1,k)
+ MR(a,2,i,j,k)*MR(p,0,i, j, k+1)
+ MR(b,0,i,j,k)
*( MR(p,0,i+1,j+1,k) - MR(p,0,i+1,j-1,k)
- MR(p,0,i-1,j+1,k) + MR(p,0,i-1,j-1,k) )
+ MR(b,1,i,j,k)
*( MR(p,0,i,j+1,k+1) - MR(p,0,i,j-1,k+1)
- MR(p,0,i,j+1,k-1) + MR(p,0,i,j-1,k-1) )
+ MR(b,2,i,j,k)
*( MR(p,0,i+1,j,k+1) - MR(p,0,i-1,j,k+1)
- MR(p,0,i+1,j,k-1) + MR(p,0,i-1,j,k-1) )
+ MR(c,0,i,j,k) * MR(p,0,i-1,j, k)
+ MR(c,1,i,j,k) * MR(p,0,i, j-1,k)
+ MR(c,2,i,j,k) * MR(p,0,i, j, k-1)
+ MR(wrk1,0,i,j,k);
ss= (s0*MR(a,3,i,j,k) - MR(p,0,i,j,k))*MR(bnd,0,i,j,k);
gosa+= ss*ss;
MR(wrk2,0,i,j,k)= MR(p,0,i,j,k) + omega*ss;
}
for(i=1 ; i<imax ; i++)
for(j=1 ; j<jmax ; j++)
for(k=1 ; k<kmax ; k++)
MR(p,0,i,j,k)= MR(wrk2,0,i,j,k);
} /* end n loop */
return(gosa);
}
The problem is, this function seems to have a sequential nature, since every iteration over nn is dependent on the previous one. What I tried, using MPI, was making an auxiliary variable for gosa (auxgosa) and using MPI_Reduce after the i j k for loops, like the following (the root process is rank = 0):
//rank is the current process
//size is the total amount of processes
int start = ((imax+1)/size)*rank;
int stop = ((imax+1)/size)*(rank+1)-1;
if(rank == 0){start++;}
for(n=0 ; n<nn ; n++){
gosa = 0.0;
auxgosa = 0.0;
for(i=start ; i<stop; i++)
for(j=1 ; j<jmax ; j++)
for(k=1 ; k<kmax ; k++){
s0= MR(aa,0,i,j,k)*MR(pp,0,i+1,j,k)
+ MR(aa,1,i,j,k)*MR(pp,0,i, j+1,k)
+ MR(aa,2,i,j,k)*MR(pp,0,i, j, k+1)
+ MR(bb,0,i,j,k)
*( MR(pp,0,i+1,j+1,k) - MR(pp,0,i+1,j-1,k)
- MR(pp,0,i-1,j+1,k) + MR(pp,0,i-1,j-1,k) )
+ MR(bb,1,i,j,k)
*( MR(pp,0,i,j+1,k+1) - MR(pp,0,i,j-1,k+1)
- MR(pp,0,i,j+1,k-1) + MR(pp,0,i,j-1,k-1) )
+ MR(bb,2,i,j,k)
*( MR(pp,0,i+1,j,k+1) - MR(pp,0,i-1,j,k+1)
- MR(pp,0,i+1,j,k-1) + MR(pp,0,i-1,j,k-1) )
+ MR(cc,0,i,j,k) * MR(pp,0,i-1,j, k)
+ MR(cc,1,i,j,k) * MR(pp,0,i, j-1,k)
+ MR(cc,2,i,j,k) * MR(pp,0,i, j, k-1)
+ MR(awrk1,0,i,j,k);
ss= (s0*MR(aa,3,i,j,k) - MR(pp,0,i,j,k))*MR(abnd,0,i,j,k);
auxgosa+= ss*ss;
MR(awrk2,0,i,j,k)= MR(pp,0,i,j,k) + omega*ss;
}
MPI_Reduce(&auxgosa,&gosa,1,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD);
for(i=1 ; i<imax ; i++)
for(j=1 ; j<jmax ; j++)
for(k=1 ; k<kmax ; k++)
MR(pp,0,i,j,k)= MR(awrk2,0,i,j,k);
} /* end n loop */
Unfortunately, this didn't work. Could anyone give me some insight about this? I plan to use a similar strategy with OpenMP.
If awrk2 is different from a, p, b, c and wrk1, then there is no loop-carried dependence.
A simple Google search will point you to parallelized versions of the Himeno benchmark (MPI, OpenMP and hybrid MPI+OpenMP versions are available).
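For the OpenMP part mentioned at the end of the question, the consequence is that the i/j/k loops inside a single n iteration can be parallelized directly, with gosa computed through a reduction. A minimal sketch against the serial jacobi above (the s0/ss body is unchanged and elided here; this is an illustration, not the official OpenMP Himeno version):
for(n=0 ; n<nn ; n++){
gosa = 0.0;
#pragma omp parallel for private(j,k,s0,ss) reduction(+:gosa)
for(i=1 ; i<imax; i++)
for(j=1 ; j<jmax ; j++)
for(k=1 ; k<kmax ; k++){
/* ... same s0 and ss computation as above ... */
gosa+= ss*ss;
MR(wrk2,0,i,j,k)= MR(p,0,i,j,k) + omega*ss;
}
#pragma omp parallel for private(j,k)
for(i=1 ; i<imax ; i++)
for(j=1 ; j<jmax ; j++)
for(k=1 ; k<kmax ; k++)
MR(p,0,i,j,k)= MR(wrk2,0,i,j,k);
} /* end n loop */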
I am trying to compute the integral of the function f(x)=(1-x^2)^(1/2) from x=0 to x=1. The answer should be approximately pi/4. I am currently getting 2.
My current implementation of the trapezoidal rule is the following:
double
def_integral(double *f, double *x, int n)
{
double F;
for (int i = 0 ; i < n ; i++) {
F += 0.5 * ( x[i+1] - x[i] ) * ( f[i] + f[i+1] );
}
return F;
}
I'm creating N divisions to approximate the area under the curve between x_0 = 0 and x_N = 1 by looping i from 0 to N with x_i = i / N.
int
main(int argc, char **argv)
{
int N = 1000;
double f_x[N];
double x[N];
for (int i = 0 ; i <= N ; i++) {
double x = i * 1. / N;
f_x[i] = sqrt(1. - pow(x, 2.));
//printf("%.2f %.5f\n", x, f_x[i]); //uncomment if you wanna see function values
}
double F_x = def_integral(f_x, x, N);
printf("The integral is %g\n", F_x);
}
The result of 2 that I am currently getting should depend on the number of divisions N; however, whether I make N=10000 or N=100, I still get 2.
Any suggestions?
In this for loop, you forgot to update the array x as well.
for (int i = 0 ; i <= N ; i++) {
double x = i * 1. / N;
f_x[i] = sqrt(1. - pow(x, 2.));
//printf("%.2f %.5f\n", x, f_x[i]); //uncomment if you wanna see function values
}
So the for loop should be replaced by:
for (int i = 0 ; i <= N ; i++) { // note: x and f_x must then hold N+1 elements
double xi = i * 1. / N;
x[i] = xi;
f_x[i] = sqrt(1. - pow(xi , 2.));
//printf("%.2f %.5f\n", x[i], f_x[i]); //uncomment if you wanna see function values
}
In your main code, you declare both an array x and a local double x inside the loop; only the local one is ever assigned, so the array you pass to def_integral is never filled. Perhaps (it is what I suppose) the real point is that your formula needs x(i+1)-x(i) while you use a constant step. Indeed, x(i+1)-x(i) = step_x is constant, so you do not need each x(i), only the value 1./N.
Other remark: with a constant step, your formula can be simplified to F_x = step_x * ( 0.5*f_x(x0) + f_x(x1) + ... + f_x(xn-1) + 0.5*f_x(xn) ). This helps simplify the code and write a more efficient one.
Everything is commented in the code below. I hope it helps. Best regards.
#include <stdio.h>
#include <math.h>
double
def_integral(double *f, double step_x, int n)
{
double F = 0.; // the accumulator must start at zero
for (int i = 0 ; i < n ; i++) {
F += 0.5 * ( step_x ) * ( f[i] + f[i+1] );
}
return F;
}
int main()
{
int N = 1001; // 1001 abscissas means 1000 intervals (see comment on array size and indices)
double f_x[N]; // not needed for the simplified algorithm
double step_x = 1. / (N - 1); // x(i+1)-x(i) is constant; N-1 = 1000 intervals span [0,1]
for (int i = 0 ; i < N ; i++) { // Note : i<N and not i<=N
double xi = i * step_x; // abscissa calculation
f_x[i] = sqrt((1. - xi )*(1. + xi )); // cf chux comment
}
double F_x = def_integral(f_x, step_x, N - 1); // N-1 intervals, so f[i+1] stays in bounds
printf("The integral is %.10g\n", F_x);
// simplified algorithm
// F_x=step_x*( 0.5*f_x(x0)+f_x(x1)+...+f_x(xn-1)+0.5f_x(xn) )
double xi;
xi=0; // x(0)
F_x=0.5*sqrt((1. - xi )*(1. + xi ));
for (int i=1 ; i<=N-2 ; i++) { // interior abscissas x(1)..x(N-2)
xi=step_x*i;
F_x+=sqrt((1. - xi )*(1. + xi ));
}
xi=step_x*(N-1); // last abscissa x(N-1) = 1
F_x+=0.5*sqrt((1. - xi )*(1. + xi ));
F_x=step_x*F_x;
printf("The integral is %.10g\n", F_x);
}
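For reference, both printf calls should report a value close to pi/4 ≈ 0.7853981634 (remember to link with -lm when compiling), and the remaining error shrinks as N grows.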
I need to pass a two-dimensional array to a function as a single pointer. There are different possible approaches, but due to some constraints (CodeGeneration), I want to pass a single pointer only. I have macros which contain the size of each dimension. I implemented it the following way, but I am not sure it will also work fine for N dimensions:
#define size_1D 3
#define size_2D 3
void fun(int *arr)
{
int i,total_size = size_1D* size_2D;
for(i = 0; i < total_size ; i++)
{
int value = arr[i];
}
}
int main()
{
int arr[size_1D][size_2D] = {{1,2,7},{8,4,9}};
fun(&arr[0][0]);
}
Is there any loophole if I follow the above approach?
void fun(int (*arr)[3]);
or exactly equivalent, but maybe more readable:
void fun(int arr[][3]);
In main, arr is a two-dimensional array with 3 rows and 3 columns. Decayed to a pointer, it has the type of a pointer to an array of 3 elements, so that is what the function needs to receive. Inside the function you can access the data normally, using arr[a][b].
#define size_1D 3
#define size_2D 3
void fun(int arr[][3])
{
for(int i = 0; i < size_1D ; i++) {
for(int j = 0; j < size_2D ; j++) {
int value = arr[i][j];
}
}
}
int main()
{
int arr[size_1D][size_2D] = {{1,2,7},{8,4,9}};
fun(arr);
}
You can specify the sizes as arguments and use a variable length array declaration inside the function parameter list. The compiler will do some of the work for you.
#include <stdlib.h>
void fun(size_t xmax, size_t ymax, int arr[xmax][ymax]);
// is equivalent to
void fun(size_t xmax, size_t ymax, int arr[][ymax]);
// is equivalent to
void fun(size_t xmax, size_t ymax, int (*arr)[ymax]);
void fun(size_t xmax, size_t ymax, int arr[xmax][ymax])
{
for(int i = 0; i < xmax ; i++) {
for(int j = 0; j < ymax ; j++) {
int value = arr[i][j];
}
}
}
int main()
{
int arr[3][4] = {{1,2,7},{8,4,9}};
fun(3, 4, arr);
}
#edit
We know that the array subscript operator is exactly identical to a pointer dereference of the sum:
a[b] <=> *(a + b)
From pointer arithmetic we know that:
type *pnt;
int a;
pnt + a = (typeof(pnt))(void*)((uintptr_t)(void*)pnt + a * sizeof(*pnt))
pnt + a = (int*)(void*)((uintptr_t)(void*)pnt + a * sizeof(type))
And that an array, used as a value, is equal to a pointer to its first element:
type pnt[A];
assert((uintptr_t)pnt == (uintptr_t)&pnt[0]);
assert((uintptr_t)pnt == (uintptr_t)&*(pnt + 0));
assert((uintptr_t)pnt == (uintptr_t)&*pnt);
So:
int arr[A][B];
then:
arr[x][y]
is equivalent to (ignore warnings, kind-of pseudocode):
*(*(arr + x) + y)
*( *(int[A][B])( (uintptr_t)arr + x * sizeof(int[B]) ) + y )
// ---- x * sizeof(int[B]) = x * B * sizeof(int)
*( *(int[A][B])( (uintptr_t)arr + x * B * sizeof(int) ) + y )
// ---- C11 6.5.2.1p3
*( (int[B])( (uintptr_t)arr + x * B * sizeof(int) ) + y )
*(int[B])( (uintptr_t)( (uintptr_t)arr + x * B * sizeof(int) ) + y * sizeof(int) )
// ---- *(int[B])( ... ) = (int)dereference( ... ) = *(int*)( ... )
// ---- loose braces - conversion from size_t to uintptr_t should be safe
*(int*)( (uintptr_t)arr + x * B * sizeof(int) + y * sizeof(int) )
*(int*)( (uintptr_t)arr + ( x * B + y ) * sizeof(int) )
*(int*)( (uintptr_t)( &*arr ) + ( x * B + y ) * sizeof(int) )
// ---- (uintptr_t)arr = (uintptr_t)&arr[0][0]
*(int*)( (uintptr_t)( &*(*(arr + 0) + 0) ) + ( x * B + y ) * sizeof(int) )
*(int*)( (uintptr_t)( &arr[0][0] ) + ( x * B + y ) * sizeof(int) )
*(int*)( (uintptr_t)&arr[0][0] + ( x * B + y ) * sizeof(int) )
// ---- decayed typeof(&arr[0][0]) = int*
*( &arr[0][0] + ( x * B + y ) )
(&arr[0][0])[x * B + y]
So:
arr[x][y] == (&arr[0][0])[x * B + y]
arr[x][y] == (&arr[0][0])[x * sizeof(*arr)/sizeof(**arr) + y]
On a sane architecture where sizeof(uintptr_t) == sizeof(size_t) == sizeof(int*) == sizeof(int**) etc., and where accessing data through an int* pointer is no different from accessing it through an int(*)[B] pointer, you should be safe accessing the array one-dimensionally through a pointer to its first member, as the operations should be equivalent ("safe" with the exception of out-of-bounds accesses, which are never safe).
Note that this is, strictly speaking, undefined behavior according to the C standard and will not work on all architectures. Example: there could be an architecture where data of type int[A] is stored in a different memory bank than int[A][B] data (by hardware, by design). The type of the pointer then tells the compiler which bank to choose, so accessing the same data through a pointer with the same value but a different type leads to UB, as the compiler chooses a different data bank to access the data.
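That said, the equivalence derived above is easy to check in practice. A minimal, self-contained example (my own illustration, mirroring the fun(&arr[0][0]) call from the question; strictly speaking the flat access is the out-of-bounds case discussed in the previous paragraph, but it matches on common compilers):
#include <assert.h>
#include <stdio.h>
#define A 2
#define B 3
int main(void)
{
int arr[A][B] = {{1,2,7},{8,4,9}};
int *flat = &arr[0][0]; /* pointer to the first element */
for (int x = 0; x < A; x++)
for (int y = 0; y < B; y++)
assert(arr[x][y] == flat[x * B + y]); /* arr[x][y] == (&arr[0][0])[x*B+y] */
printf("flat indexing matches 2D indexing\n");
return 0;
}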
I am trying to allocate and copy memory for a flattened 2D array onto the device using cudaMalloc3D, to test the performance of cudaMalloc3D. But when I try to write to the array from the kernel, it fails with an 'an illegal memory access was encountered' error. The program runs fine if I am just reading from the array, but when I try to write to it there is an error. Any help on this will be greatly appreciated. Below is my code and the command for compiling it.
Compile using
nvcc -O2 -arch sm_20 test.cu
Code: test.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in y-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError();\
if( e != cudaSuccess ) {\
printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *unewPtr = (char *) unew.ptr;
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 10;
// pointers for the host side
REAL *unew, *u, *uold, *uFinal, *x, *y;
// allocate memory on the host
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
// pointer for the device side
size_t pitch = NX * sizeof(REAL);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);
// allocate 3D memory on the device
cudaMalloc3D( &d_u, myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();
// initialize grid and wave
meshGrid( x, y );
initWave( u, uold, x, y );
// copy host memory to 3D device memory
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.kind = cudaMemcpyHostToDevice;
// copying u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
cpy3D.dstPtr = d_u;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// copying uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
cpy3D.dstPtr = d_uold;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
cudaThreadSynchronize();
cudaCheckError();
d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}
// copy the memory back to host
cpy3D.kind = cudaMemcpyDeviceToHost;
// copying d_unew to uFinal
cpy3D.srcPtr = d_unew;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
free(u); cudaFree(d_u.ptr);
free(unew); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_uold.ptr);
free(uFinal); free(x); free(y);
return EXIT_SUCCESS;
}
The reason the error doesn't occur on this line:
REAL tmp = unew_row[j]; // no error on this line
is because the compiler is optimizing that line out. It doesn't do anything useful, and so the compiler completely eliminates it. The compiler warning:
xxx.cu(87): warning: variable "tmp" was declared but never referenced
is a hint to that effect.
Your code is very nearly correct. The issue is here:
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
It should be:
REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);
The i variable in your kernel is the width (i.e. X) dimension.
The j variable is the height (i.e. Y) dimension.
The height is the one that refers to which row you are on, therefore the row pitch should be multiplied by the height parameter, i.e. j, not i.
Similarly, although it's not the source of the specific failure for your particular dimensions, this code may not be what you intended either:
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
If, for example, you were intending to compute the offset to the row and then index into the row (perhaps to set every element in the allocation), then I think you would want to use i, not j, as your final index:
REAL tmp = unew_row[i]; // no error on this line
unew_row[i] = 1.2; // this is where I get the error
However, for this particular example, this is not the actual source of the illegal memory access.
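Putting both points together, a corrected kernel would look roughly like this (a sketch reusing the NX, NY, REAL and cudaPitchedPtr declarations from the code above):
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i = blockIdx.x*blockDim.x + threadIdx.x; // column index (width / X)
INT j = blockIdx.y*blockDim.y + threadIdx.y; // row index (height / Y)
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *unewPtr = (char *) unew.ptr;
REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch); // pitch scaled by the row index j
unew_row[i] = 1.2; // column i within that row
}
}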
Can anyone spot any way to improve the speed of the following bilinear resizing algorithm?
I need to improve speed, as this is critical, while keeping good image quality. It is expected to be used on mobile devices with low-speed CPUs.
The algorithm is used mainly for up-scaling. Any other, faster bilinear algorithm would also be appreciated. Thanks.
void resize(int* input, int* output, int sourceWidth, int sourceHeight, int targetWidth, int targetHeight)
{
int a, b, c, d, x, y, index;
float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
float y_ratio = ((float)(sourceHeight - 1)) / targetHeight;
float x_diff, y_diff, blue, red, green ;
int offset = 0 ;
for (int i = 0; i < targetHeight; i++)
{
for (int j = 0; j < targetWidth; j++)
{
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
a = input[index] ;
b = input[index + 1] ;
c = input[index + sourceWidth] ;
d = input[index + sourceWidth + 1] ;
// blue element
blue = (a&0xff)*(1-x_diff)*(1-y_diff) + (b&0xff)*(x_diff)*(1-y_diff) +
(c&0xff)*(y_diff)*(1-x_diff) + (d&0xff)*(x_diff*y_diff);
// green element
green = ((a>>8)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>8)&0xff)*(x_diff)*(1-y_diff) +
((c>>8)&0xff)*(y_diff)*(1-x_diff) + ((d>>8)&0xff)*(x_diff*y_diff);
// red element
red = ((a>>16)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>16)&0xff)*(x_diff)*(1-y_diff) +
((c>>16)&0xff)*(y_diff)*(1-x_diff) + ((d>>16)&0xff)*(x_diff*y_diff);
output [offset++] =
0x000000ff | // alpha
((((int)red) << 24)&0xff0000) |
((((int)green) << 16)&0xff00) |
((((int)blue) << 8)&0xff00);
}
}
}
Off the top of my head:
Stop using floating-point, unless you're certain your target CPU has it in hardware with good performance.
Make sure memory accesses are cache-optimized, i.e. clumped together.
Use the fastest data types possible. Sometimes this means smallest, sometimes it means "most native, requiring least overhead".
Investigate if signed/unsigned for integer operations have performance costs on your platform.
Investigate if look-up tables rather than computations gain you anything (but these can blow the caches, so be careful).
And, of course, do lots of profiling and measurements.
In-Line Cache and Lookup Tables
Cache your computations in your algorithm.
Avoid duplicate computations (like (1-y_diff) or (x_ratio * j))
Go through all the lines of your algorithm, and try to identify patterns of repetitions. Extract these to local variables. And possibly extract to functions, if they are short enough to be inlined, to make things more readable.
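For instance, in the inner loop of the question the four bilinear weights are recomputed for every colour channel; hoisting them once per pixel might look like this (a sketch; the w00..w11 names are mine, everything else is from the question):
float w00 = (1-x_diff)*(1-y_diff);
float w10 = x_diff*(1-y_diff);
float w01 = (1-x_diff)*y_diff;
float w11 = x_diff*y_diff;
blue = (a&0xff)*w00 + (b&0xff)*w10 + (c&0xff)*w01 + (d&0xff)*w11;
green = ((a>>8)&0xff)*w00 + ((b>>8)&0xff)*w10 + ((c>>8)&0xff)*w01 + ((d>>8)&0xff)*w11;
red = ((a>>16)&0xff)*w00 + ((b>>16)&0xff)*w10 + ((c>>16)&0xff)*w01 + ((d>>16)&0xff)*w11;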
Use a lookup-table
It's quite likely that, if you can spare some memory, you can implement a "store" for your RGB values and simply "fetch" them based on the inputs that produced them. Maybe you don't need to store all of them, but you could experiment and see if some come back often. Alternatively, you could "fudge" your colors and thus end up with fewer values to store for more lookup inputs.
If you know the boundaries of your inputs, you can calculate the complete domain space and figure out what makes sense to cache. For instance, if you can't cache the whole R, G, B values, maybe you can at least pre-compute the shifts ((b>>16) and so forth) that are most likely deterministic in your case.
Use the Right Data Types for Performance
If you can avoid double and float variables, use int. On most architectures, int will be the fastest type for computations because of the memory model. You can still achieve decent precision by simply shifting your units (i.e. use 1026 as an int instead of 1.026 as a double or float). It's quite likely that this trick would be enough for you.
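As a minimal sketch of what that looks like for the blue channel of the question (the scale factor 256 and the wx/wy names are my own choice):
int wx = (int)(x_diff * 256); /* fractional x weight, 0..255 */
int wy = (int)(y_diff * 256); /* fractional y weight, 0..255 */
int blue = ( (a&0xff)*(256-wx)*(256-wy)
+ (b&0xff)*wx*(256-wy)
+ (c&0xff)*(256-wx)*wy
+ (d&0xff)*wx*wy ) >> 16; /* divide by 256*256 */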
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
This could surely use some optimization: you were computing x_ratio * (j-1) just a few cycles earlier, so all you really need here is x += x_ratio.
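A sketch of that idea (same variables as the question; fx is a new helper holding the running product, so the truncated x and the fractional x_diff can still be derived from it; watch for accumulated rounding error on very wide targets):
float fx = 0.0f; /* running value of x_ratio * j */
for (int j = 0; j < targetWidth; j++)
{
x = (int)fx; /* integer source column */
x_diff = fx - x; /* fractional part */
/* ... interpolate exactly as before using x and x_diff ... */
fx += x_ratio; /* one addition replaces one multiplication */
}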
My random guess (use a profiler instead of letting people guess!):
The compiler has to generate code that works when input and output overlap, which means it has to generate lots of redundant stores and loads. Add restrict to the input and output parameters to remove that safety feature.
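A sketch of the suggested signature (C99 restrict; the const on input is an extra hint and not required):
void resize(const int* restrict input, int* restrict output,
int sourceWidth, int sourceHeight,
int targetWidth, int targetHeight);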
You could also try using a=b; and c=d; instead of loading them again.
Here is my version; steal some ideas from it. My C-fu is quite weak, so some lines are pseudocode, but you can fix them.
void resize(int* input, int* output,
int sourceWidth, int sourceHeight,
int targetWidth, int targetHeight
) {
// Let's create some lookup tables!
// you can move them into 2-dimensional arrays to
// group together values used at the same time to help processor cache
int sx[0..targetWidth ]; // target->source X lookup
int sy[0..targetHeight]; // target->source Y lookup
int mx[0..targetWidth ]; // left pixel's multiplier
int my[0..targetHeight]; // bottom pixel's multiplier
// we don't have to calc indexes every time, find out when
bool reloadPixels[0..targetWidth ];
bool shiftPixels[0..targetWidth ];
int shiftReloadPixels[0..targetWidth ]; // can be combined if necessary
int v; // temporary value
for (int j = 0; j < targetWidth; j++){
// (8bit + targetBits + sourceBits) should be < max int
v = 256 * j * (sourceWidth-1) / (targetWidth-1);
sx[j] = v / 256;
mx[j] = v % 256;
reloadPixels[j] = j ? ( sx[j-1] != sx[j] ? 1 : 0)
: 1; // always load first pixel
// if no reload -> then no shift too
shiftPixels[j] = j ? ( sx[j-1]+1 == sx[j] ? 2 : 0)
: 0; // nothing to shift at first pixel
shiftReloadPixels[j] = reloadPixels[j] | shiftPixels[j];
}
for (int i = 0; i < targetHeight; i++){
v = 256 * i * (sourceHeight-1) / (targetHeight-1);
sy[i] = v / 256;
my[i] = v % 256;
}
int shiftReload;
int srcIndex;
int srcRowIndex;
int offset = 0;
int lm, rm, tm, bm; // left / right / top / bottom multipliers
int a, b, c, d;
for (int i = 0; i < targetHeight; i++){
srcRowIndex = sy[ i ] * sourceWidth;
tm = my[i];
bm = 255 - tm;
for (int j = 0; j < targetWidth; j++){
// too much ifs can be too slow, measure.
// always true for first pixel in a row
if( shiftReload = shiftReloadPixels[ j ] ){
srcIndex = srcRowIndex + sx[j];
if( shiftReload & 2 ){
a = b;
c = d;
}else{
a = input[ srcIndex ];
c = input[ srcIndex + sourceWidth ];
}
b = input[ srcIndex + 1 ];
d = input[ srcIndex + 1 + sourceWidth ];
}
lm = mx[j];
rm = 255 - lm;
// WTF?
// Input AA RR GG BB
// Output RR GG BB AA
if( j ){
leftOutput = rightOutput ^ 0xFFFFFF00;
}else{
leftOutput =
// blue element
((( ( (a&0xFF)*tm
+ (c&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((a>>8)&0xFF)*tm
+ ((c>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((a>>16)&0xFF)*tm
+ ((c>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
}
rightOutput =
// blue element
((( ( (b&0xFF)*tm
+ (d&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((b>>8)&0xFF)*tm
+ ((d>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((b>>16)&0xFF)*tm
+ ((d>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
output[offset++] =
// alpha
0x000000ff
| leftOutput
| rightOutput
;
}
}
}