OpenCL: Local Memory faster than L1 Cache on CPU?

I wrote an OpenCL kernel that performs a box blur on an input matrix. The implementation was originally written for a GPU and uses local memory to store the neighborhoods of work items in a work group. I then ran the kernel on a CPU and compared its running time to an implementation that relies on the automatic caching of global memory reads instead of manually staging them in local memory first.
Under the assumption that a CPU has no "local memory" and instead uses RAM, using local memory on a CPU should do more harm than good. However, the "local memory" kernel was faster than the one relying on caching by about 10 ms (~112 ms vs. ~122 ms on an 8192x8192 matrix). The work-item, work-group, and "number of values calculated by each work item" settings were optimal for both implementations, since they were found separately for each kernel by an auto-tuner.
The kernels were run on an Intel Xeon E5-1620 v2 CPU using Intel's OpenCL platform available on the host.
What are possible reasons for this?
"Local Memory" kernel: Each work item works on a "block" of values. Each block is copied to shared memory, and its neighborhood is copied to local memory depending on where the block is in the work group so no values are copied twice. Then, after the barrier, the final value is calculated.
The code below is the X-direction kernel; the Y-direction kernel is exactly the same except for the direction in which the values are inspected to calculate the output value.
__kernel void boxblur_x (__read_only __global float* image,
                         __local float* localmem,
                         __write_only __global float* output)
{
    // size of input and output matrix
    int MATRIX_SIZE_Y = IMAGE_HEIGHT;
    int MATRIX_SIZE_X = IMAGE_WIDTH;
    int MATRIX_SIZE   = MATRIX_SIZE_Y * MATRIX_SIZE_X;

    // mask size
    int S_L = MASK_SIZE_LEFT;
    int S_U = 0;
    int S_R = MASK_SIZE_RIGHT;
    int S_D = 0;
    int SHAPE_SIZE_Y = S_U + S_D + 1;
    int SHAPE_SIZE_X = S_L + S_R + 1;
    int SHAPE_SIZE   = SHAPE_SIZE_Y * SHAPE_SIZE_X;

    // tuning parameters
    // ---------------------------------------------------------------
    // work items in y/x dimension per work group
    int NUM_WI_Y = get_local_size(1);
    int NUM_WI_X = get_local_size(0);
    // size of blocks
    int BLOCKHEIGHT = X_BLOCKHEIGHT;
    int BLOCKWIDTH  = X_BLOCKWIDTH;
    // position in matrix
    int GLOBAL_POS_X = get_global_id(0) * BLOCKWIDTH;
    int GLOBAL_POS_Y = get_global_id(1) * BLOCKHEIGHT;
    // local memory size
    int LOCALMEM_WIDTH = S_L + NUM_WI_X * BLOCKWIDTH + S_R;
    // position in local memory
    int LOCAL_POS_X = S_L + get_local_id(0) * BLOCKWIDTH;
    int LOCAL_POS_Y = S_U + get_local_id(1) * BLOCKHEIGHT;

    // copy values to local memory
    for (int i = 0; i < BLOCKHEIGHT; i++)
    {
        for (int j = 0; j < BLOCKWIDTH; j++)
        {
            localmem[(LOCAL_POS_X + j) + (LOCAL_POS_Y + i) * LOCALMEM_WIDTH] =
                image[GLOBAL_POS_X + j + (GLOBAL_POS_Y + i) * MATRIX_SIZE_X];
        }
    }

    // only when all work items have arrived here does computation
    // continue - otherwise, not all needed values might be
    // available in local memory
    barrier (CLK_LOCAL_MEM_FENCE);

    for (int i = 0; i < BLOCKHEIGHT; i++)
    {
        for (int j = 0; j < BLOCKWIDTH; j++)
        {
            float sum = 0;
            for (int b = 0; b <= S_L + S_R; b++)
            {
                sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b)
                                + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];
            }
            // divide by size of mask
            float pixelValue = sum / SHAPE_SIZE;
            // write new pixel value to output image
            output[GLOBAL_POS_X + j + ((GLOBAL_POS_Y + i) * get_global_size(0) * BLOCKWIDTH)] = pixelValue;
        }
    }
}
"L1 Caching kernel": Despite the many defines, it does exactly the same, but relies on global memory caching of the blocks instead of explicitly managing local memory.
#define WG_BLOCK_SIZE_Y ( OUTPUT_SIZE_Y / NUM_WG_Y )
#define WG_BLOCK_SIZE_X ( OUTPUT_SIZE_X / NUM_WG_X )
#define WI_BLOCK_SIZE_Y ( WG_BLOCK_SIZE_Y / NUM_WI_Y )
#define WI_BLOCK_SIZE_X ( WG_BLOCK_SIZE_X / NUM_WI_X )
#define WG_BLOCK_OFFSET_Y ( WG_BLOCK_SIZE_Y * WG_ID_Y )
#define WG_BLOCK_OFFSET_X ( WG_BLOCK_SIZE_X * WG_ID_X )
#define WI_BLOCK_OFFSET_Y ( WI_BLOCK_SIZE_Y * WI_ID_Y )
#define WI_BLOCK_OFFSET_X ( WI_BLOCK_SIZE_X * WI_ID_X )
#define NUM_CACHE_BLOCKS_Y ( WI_BLOCK_SIZE_Y / CACHE_BLOCK_SIZE_Y )
#define NUM_CACHE_BLOCKS_X ( WI_BLOCK_SIZE_X / CACHE_BLOCK_SIZE_X )
#define CACHE_BLOCK_OFFSET_Y ( CACHE_BLOCK_SIZE_Y * ii )
#define CACHE_BLOCK_OFFSET_X ( CACHE_BLOCK_SIZE_X * jj )
#define reorder(j) ( ( (j) / WI_BLOCK_SIZE_X) + ( (j) % WI_BLOCK_SIZE_X) * NUM_WI_X )
#define reorder_inv(j) reorder(j)
#define view( i, j, x, y ) input[ ((i) + (x)) * INPUT_SIZE_X + ((j) + (y)) ]
#define a_wg( i, j, x, y ) view( WG_BLOCK_OFFSET_Y + (i), WG_BLOCK_OFFSET_X + reorder(j), (x), (y) )
#define a_wi( i, j, x, y ) a_wg( WI_BLOCK_OFFSET_Y + (i), WI_BLOCK_OFFSET_X + (j) , (x), (y) )
#define a_cache( i, j, x, y ) a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) , (x), (y) )
#define res(i, j) output[ (i) * OUTPUT_SIZE_X + (j) ]
#define res_wg( i, j ) res( WG_BLOCK_OFFSET_Y + (i) , WG_BLOCK_OFFSET_X + reorder_inv(j) )
#define res_wi( i, j ) res_wg( WI_BLOCK_OFFSET_Y + (i) , WI_BLOCK_OFFSET_X + (j) )
#define res_cache( i, j ) res_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) )
float f_stencil( __global float* input, int ii, int jj, int i, int j )
{
    // indices
    const int WG_ID_X = get_group_id(0);
    const int WG_ID_Y = get_group_id(1);
    const int WI_ID_X = get_local_id(0);
    const int WI_ID_Y = get_local_id(1);

    // computation
    float sum = 0;
    for( int y = 0 ; y < SHAPE_SIZE_Y ; ++y )
        for( int x = 0 ; x < SHAPE_SIZE_X ; ++x )
            sum += a_cache(i, j, y, x);
    return sum / SHAPE_SIZE;
}
__kernel void stencil( __global float* input,
                       __global float* output )
{
    // indices
    const int WG_ID_X = get_group_id(0);
    const int WG_ID_Y = get_group_id(1);
    const int WI_ID_X = get_local_id(0);
    const int WI_ID_Y = get_local_id(1);

    // iteration over cache blocks
    for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
        for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
            // iteration within a cache block
            for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
                for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
                    res_cache( i, j ) = f_stencil( input, ii, jj, i, j );
}

When you combine the "L1 cache" version's loops:
for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
  for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
    for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
      for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
        for( int y = 0 ; y < SHAPE_SIZE_Y ; ++y )    // SHAPE_SIZE_Y = S_U+S_D+1
          for( int x = 0 ; x < SHAPE_SIZE_X ; ++x )  // SHAPE_SIZE_X = S_L+S_R+1
            .... += a_cache(i, j, y, x);
and "local" version:
for (int i = 0; i < BLOCKHEIGHT; i++)
for (int j = 0; j < BLOCKWIDTH; j++)
for (int b = 0; b <= S_L + S_R; b++)
... +=input[...]
"a_cache" has a lot of compute
a_cache(i, j, y, x);
becomes
a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j), x, y )
and that becomes
view( WG_BLOCK_OFFSET_Y + (CACHE_BLOCK_OFFSET_Y + (i)), WG_BLOCK_OFFSET_X + reorder(CACHE_BLOCK_OFFSET_X + (j)), (x), (y) )
and that becomes
view( WG_BLOCK_OFFSET_Y + (CACHE_BLOCK_OFFSET_Y + (i)), WG_BLOCK_OFFSET_X + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X) + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X )
, (x), (y) )
and that becomes
input[ ((WG_BLOCK_OFFSET_Y + (CACHE_BLOCK_OFFSET_Y + (i))) + (x)) * INPUT_SIZE_X + ((WG_BLOCK_OFFSET_X + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X) + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X) + (y)) ]
this is 9 additions + 2 multiplications + 1 modulo + 1 division.
"local" version has
sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b) + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];
which is 4 additions + 3 multiplications but no modulo and no division.
"L1 cache" version needs to keep loop counters for 6 loops and they could be using more cpu-registers or even L1 cache. Data cache size is 128 kB per core or 64 kB per thread. If you launch 1024 threads per core(each core is a work group right?) then 1024 * 6 * 4 = 24kB L1 is needed just for loop counters. This leaves 40kB to use. When you add "const int WG_ID_X" and other variables (5 of them), only 20kB is left. Now add the "f_stencil" functions temporary "stack" variables for its arguments, there may be no L1 cache left, decreasing efficiency. "local" version has about 10-12 variables used(not-used variables maybe optimized out?) and no functions so it may be better for L1.
https://software.intel.com/en-us/node/540486 says:
To reduce the overhead of maintaining a workgroup, you should create work-groups that are as large as possible, which means 64 and more work-items. One upper bound is the size of the accessed data set as it is better not to exceed the size of the L1 cache in a single work group.
and
If your kernel code contains the barrier instruction, the issue of work-group size becomes a tradeoff. The more local and private memory each work-item in the work-group requires, the smaller the optimal work-group size is. The reason is that a barrier also issues copy instructions for the total amount of private and local memory used by all work-items in the work-group, since the state of each work-item that arrived at the barrier is saved before proceeding with another work-item.
You have only 1 barrier in the "local" version, and before that point only 8 variables are in use, so not much memory needs to be copied?
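To take host-side noise out of the comparison, you could also time each kernel directly with OpenCL profiling events. A minimal sketch, assuming context, device, kernel, global_size and local_size are already set up (error handling mostly omitted):

/* Sketch: per-kernel timing with OpenCL profiling events */
cl_int err;
cl_command_queue queue =
    clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);

cl_event evt;
err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
                             global_size, local_size, 0, NULL, &evt);
clWaitForEvents(1, &evt);

cl_ulong t_start, t_end;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START,
                        sizeof(cl_ulong), &t_start, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &t_end, NULL);
printf("kernel time: %.3f ms\n", (t_end - t_start) * 1e-6); // nanoseconds to ms
clReleaseEvent(evt);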

Related

How to use MPI and OpenMP to run a parallel loop

I need to use MPI and OpenMP (2 different problems) to parallelize a code from the Sbac-Pad marathon (reference: http://lspd.mackenzie.br/marathon/18/problems.html). I am working on the himeno benchmark. I believe the only part of this code that is worth parallelizing is the jacobi function:
#define MR(mt,n,r,c,d) mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols * mt->mdeps + (c) * mt->mdeps + (d)]

struct Matrix {
    float* m;
    int mnums;
    int mrows;
    int mcols;
    int mdeps;
};
float
jacobi(int nn, Matrix* a, Matrix* b, Matrix* c,
       Matrix* p, Matrix* bnd, Matrix* wrk1, Matrix* wrk2)
{
    int i, j, k, n, imax, jmax, kmax;
    float gosa, s0, ss;

    imax = p->mrows-1;
    jmax = p->mcols-1;
    kmax = p->mdeps-1;

    for(n=0 ; n<nn ; n++){
        gosa = 0.0;
        for(i=1 ; i<imax ; i++)
            for(j=1 ; j<jmax ; j++)
                for(k=1 ; k<kmax ; k++){
                    s0 = MR(a,0,i,j,k)*MR(p,0,i+1,j,  k)
                       + MR(a,1,i,j,k)*MR(p,0,i,  j+1,k)
                       + MR(a,2,i,j,k)*MR(p,0,i,  j,  k+1)
                       + MR(b,0,i,j,k)
                         *( MR(p,0,i+1,j+1,k) - MR(p,0,i+1,j-1,k)
                          - MR(p,0,i-1,j+1,k) + MR(p,0,i-1,j-1,k) )
                       + MR(b,1,i,j,k)
                         *( MR(p,0,i,j+1,k+1) - MR(p,0,i,j-1,k+1)
                          - MR(p,0,i,j+1,k-1) + MR(p,0,i,j-1,k-1) )
                       + MR(b,2,i,j,k)
                         *( MR(p,0,i+1,j,k+1) - MR(p,0,i-1,j,k+1)
                          - MR(p,0,i+1,j,k-1) + MR(p,0,i-1,j,k-1) )
                       + MR(c,0,i,j,k) * MR(p,0,i-1,j,  k)
                       + MR(c,1,i,j,k) * MR(p,0,i,  j-1,k)
                       + MR(c,2,i,j,k) * MR(p,0,i,  j,  k-1)
                       + MR(wrk1,0,i,j,k);
                    ss = (s0*MR(a,3,i,j,k) - MR(p,0,i,j,k))*MR(bnd,0,i,j,k);
                    gosa += ss*ss;
                    MR(wrk2,0,i,j,k) = MR(p,0,i,j,k) + omega*ss;
                }
        for(i=1 ; i<imax ; i++)
            for(j=1 ; j<jmax ; j++)
                for(k=1 ; k<kmax ; k++)
                    MR(p,0,i,j,k) = MR(wrk2,0,i,j,k);
    } /* end n loop */
    return(gosa);
}
The problem is, this function seems to have a sequential nature, since every iteration of nn is dependent on the previous one. What I tried, using MPI, was making an auxiliary variable for gosa (auxgosa) and using MPI_REDUCE after the i-j-k loops, like the following (the root process is rank 0):
//rank is the current process
//size is the total amount of processes
int start = ((imax+1)/size)*rank;
int stop  = ((imax+1)/size)*(rank+1)-1;
if(rank == 0){start++;}

for(n=0 ; n<nn ; n++){
    gosa = 0.0;
    auxgosa = 0.0;
    for(i=start ; i<stop ; i++)
        for(j=1 ; j<jmax ; j++)
            for(k=1 ; k<kmax ; k++){
                s0 = MR(aa,0,i,j,k)*MR(pp,0,i+1,j,  k)
                   + MR(aa,1,i,j,k)*MR(pp,0,i,  j+1,k)
                   + MR(aa,2,i,j,k)*MR(pp,0,i,  j,  k+1)
                   + MR(bb,0,i,j,k)
                     *( MR(pp,0,i+1,j+1,k) - MR(pp,0,i+1,j-1,k)
                      - MR(pp,0,i-1,j+1,k) + MR(pp,0,i-1,j-1,k) )
                   + MR(bb,1,i,j,k)
                     *( MR(pp,0,i,j+1,k+1) - MR(pp,0,i,j-1,k+1)
                      - MR(pp,0,i,j+1,k-1) + MR(pp,0,i,j-1,k-1) )
                   + MR(bb,2,i,j,k)
                     *( MR(pp,0,i+1,j,k+1) - MR(pp,0,i-1,j,k+1)
                      - MR(pp,0,i+1,j,k-1) + MR(pp,0,i-1,j,k-1) )
                   + MR(cc,0,i,j,k) * MR(pp,0,i-1,j,  k)
                   + MR(cc,1,i,j,k) * MR(pp,0,i,  j-1,k)
                   + MR(cc,2,i,j,k) * MR(pp,0,i,  j,  k-1)
                   + MR(awrk1,0,i,j,k);
                ss = (s0*MR(aa,3,i,j,k) - MR(pp,0,i,j,k))*MR(abnd,0,i,j,k);
                auxgosa += ss*ss;
                MR(awrk2,0,i,j,k) = MR(pp,0,i,j,k) + omega*ss;
            }
    MPI_Reduce(&auxgosa,&gosa,1,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD);
    for(i=1 ; i<imax ; i++)
        for(j=1 ; j<jmax ; j++)
            for(k=1 ; k<kmax ; k++)
                MR(pp,0,i,j,k) = MR(awrk2,0,i,j,k);
} /* end n loop */
Unfortunately, this didn't work. Could anyone give me some insight about this? I plan to use a similar strategy with OpenMP.
If awrk2 is different from a, p, b, c and wrk1, then there is no loop-carried dependence.
A simple Google search will point you to parallelized versions of the Himeno benchmark (MPI, OpenMP and hybrid MPI+OpenMP versions are available).
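For the OpenMP side, the strategy described in the question maps onto a reduction quite directly. A minimal sketch, assuming the MR macro, omega, and the matrices from the original jacobi are in scope; the time loop over n stays sequential, only the spatial sweep is parallelized:

for(n=0 ; n<nn ; n++){
    gosa = 0.0;
    /* i is implicitly private as the loop variable; gosa is summed by the reduction */
    #pragma omp parallel for private(j,k,s0,ss) reduction(+:gosa)
    for(i=1 ; i<imax ; i++)
        for(j=1 ; j<jmax ; j++)
            for(k=1 ; k<kmax ; k++){
                s0 = MR(a,0,i,j,k)*MR(p,0,i+1,j,k)
                   + MR(a,1,i,j,k)*MR(p,0,i,j+1,k)
                   + MR(a,2,i,j,k)*MR(p,0,i,j,k+1)
                   + MR(b,0,i,j,k)
                     *( MR(p,0,i+1,j+1,k) - MR(p,0,i+1,j-1,k)
                      - MR(p,0,i-1,j+1,k) + MR(p,0,i-1,j-1,k) )
                   + MR(b,1,i,j,k)
                     *( MR(p,0,i,j+1,k+1) - MR(p,0,i,j-1,k+1)
                      - MR(p,0,i,j+1,k-1) + MR(p,0,i,j-1,k-1) )
                   + MR(b,2,i,j,k)
                     *( MR(p,0,i+1,j,k+1) - MR(p,0,i-1,j,k+1)
                      - MR(p,0,i+1,j,k-1) + MR(p,0,i-1,j,k-1) )
                   + MR(c,0,i,j,k) * MR(p,0,i-1,j,k)
                   + MR(c,1,i,j,k) * MR(p,0,i,j-1,k)
                   + MR(c,2,i,j,k) * MR(p,0,i,j,k-1)
                   + MR(wrk1,0,i,j,k);
                ss = (s0*MR(a,3,i,j,k) - MR(p,0,i,j,k))*MR(bnd,0,i,j,k);
                gosa += ss*ss;
                MR(wrk2,0,i,j,k) = MR(p,0,i,j,k) + omega*ss;
            }
    #pragma omp parallel for private(j,k)
    for(i=1 ; i<imax ; i++)
        for(j=1 ; j<jmax ; j++)
            for(k=1 ; k<kmax ; k++)
                MR(p,0,i,j,k) = MR(wrk2,0,i,j,k);
} /* end n loop */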

Trapezoidal Integration in C

I am trying to compute the integral of the function f(x) = (1-x^2)^(1/2) from x=0 to x=1. The answer should be approximately pi/4; I am currently getting 2.
My current implementation of the trapezoidal rule is the following:
double
def_integral(double *f, double *x, int n)
{
    double F;
    for (int i = 0 ; i < n ; i++) {
        F += 0.5 * ( x[i+1] - x[i] ) * ( f[i] + f[i+1] );
    }
    return F;
}
I'm creating N divisions to approximate the area under the curve between x_0 = 0 and x_N = 1, looping over i up to N with x_i = i/N.
int
main(int argc, char **argv)
{
    int N = 1000;
    double f_x[N];
    double x[N];
    for (int i = 0 ; i <= N ; i++) {
        double x = i * 1. / N;
        f_x[i] = sqrt(1. - pow(x, 2.));
        //printf("%.2f %.5f\n", x, f_x[i]); //uncomment if you wanna see function values
    }
    double F_x = def_integral(f_x, x, N);
    printf("The integral is %g\n", F_x);
}
The result of 2 that I am currently getting should depend on the number of divisions N; however, no matter whether I make N=10000 or N=100, I still get 2.
Any suggestions?
In this for loop, you forgot to update the array x as well:
for (int i = 0 ; i <= N ; i++) {
    double x = i * 1. / N;
    f_x[i] = sqrt(1. - pow(x, 2.));
    //printf("%.2f %.5f\n", x, f_x[i]); //uncomment if you wanna see function values
}
So the for loop should be replaced by the following (note that x and f_x must then be declared with N+1 elements, since i goes up to N):
for (int i = 0 ; i <= N ; i++) {
    double xi = i * 1. / N;
    x[i] = xi;
    f_x[i] = sqrt(1. - pow(xi, 2.));
    //printf("%.2f %.5f\n", x[i], f_x[i]); //uncomment if you wanna see function values
}
In your main code, you pass the array x to def_integral, but it is never filled: the x computed inside the loop is a local double that shadows it. Moreover (it is what I suppose), your formula needs x(i+1)-x(i), but you use a constant step. Indeed, x(i+1)-x(i) = step_x is constant, so you do not need each x(i), only the value 1./N.
Another remark: with a constant step, your formula can be simplified to F_x = step_x * ( 0.5*f(x_0) + f(x_1) + ... + f(x_{N-1}) + 0.5*f(x_N) ). This helps to simplify the code and to write a more efficient version.
Everything is commented in the code below. I hope it helps. Best regards.
#include <stdio.h>
#include <math.h>

double
def_integral(double *f, double step_x, int n)
{
    double F = 0.;                     // must be initialized before summing
    for (int i = 0 ; i < n ; i++) {
        F += 0.5 * step_x * ( f[i] + f[i+1] );
    }
    return F;
}

int main()
{
    int N = 1000;                      // 1000 intervals means N+1 = 1001 abscissas
    double f_x[N + 1];                 // not needed for the simplified algorithm
    double step_x = 1. / N;            // x(i+1)-x(i) is constant
    for (int i = 0 ; i <= N ; i++) {
        double xi = i * step_x;                  // abscissa calculation
        f_x[i] = sqrt((1. - xi)*(1. + xi));      // cf chux comment
    }
    double F_x = def_integral(f_x, step_x, N);
    printf("The integral is %.10g\n", F_x);

    // simplified algorithm:
    // F_x = step_x * ( 0.5*f(x0) + f(x1) + ... + f(x(N-1)) + 0.5*f(xN) )
    double xi = 0.;                              // x(0)
    F_x = 0.5 * sqrt((1. - xi)*(1. + xi));
    for (int i = 1 ; i <= N-1 ; i++) {
        xi = step_x * i;
        F_x += sqrt((1. - xi)*(1. + xi));
    }
    xi = step_x * N;                             // x(N) = 1
    F_x += 0.5 * sqrt((1. - xi)*(1. + xi));
    F_x = step_x * F_x;
    printf("The integral is %.10g\n", F_x);
}

Passing Two Dimension array to a function as a single pointer

I need to pass a two-dimensional array to a function as a single pointer. There are different approaches, but due to some constraints (code generation) I want to pass a single pointer only. I have macros which contain the size of each dimension. I implemented it the following way, but I am not sure it will work fine for N dimensions as well:
#define size_1D 3
#define size_2D 3

void fun(int *arr)
{
    int i, total_size = size_1D * size_2D;
    for(i = 0; i < total_size ; i++)
    {
        int value = arr[i];
    }
}

int main()
{
    int arr[size_1D][size_2D] = {{1,2,7},{8,4,9}};
    fun(&arr[0][0]);
}
Is there any loophole in the above approach?
void fun(int (*arr)[3]);
or exactly equivalent, but maybe more readable:
void fun(int arr[][3]);
arr is a two-dimensional array with 3 rows and 3 columns. Decayed to a pointer, it has the type of a pointer to an array of 3 elements, so you need to pass a pointer to an array of 3 elements. You can then access the data normally, using arr[a][b].
#define size_1D 3
#define size_2D 3

void fun(int arr[][3])
{
    for(int i = 0; i < size_1D ; i++) {
        for(int j = 0; j < size_2D ; j++) {
            int value = arr[i][j];
        }
    }
}

int main()
{
    int arr[size_1D][size_2D] = {{1,2,7},{8,4,9}};
    fun(arr);
}
You can specify the sizes as arguments and use a variable-length array declaration inside the function parameter list. The compiler will do some of the job for you.
#include <stdlib.h>

void fun(size_t xmax, size_t ymax, int arr[xmax][ymax]);
// is equivalent to
void fun(size_t xmax, size_t ymax, int arr[][ymax]);
// is equivalent to
void fun(size_t xmax, size_t ymax, int (*arr)[ymax]);

void fun(size_t xmax, size_t ymax, int arr[xmax][ymax])
{
    for(int i = 0; i < xmax ; i++) {
        for(int j = 0; j < ymax ; j++) {
            int value = arr[i][j];
        }
    }
}

int main()
{
    int arr[3][4] = {{1,2,7},{8,4,9}};
    fun(3, 4, arr);
}
Edit:
We know that the result of the array subscript operator is exactly identical to a pointer dereference of the sum:
a[b] <=> *(a + b)
From pointer arithmetic we know that:
type *pnt;
int a;
pnt + a = (typeof(pnt))(void*)((uintptr_t)(void*)pnt + a * sizeof(*pnt))
pnt + a = (int*)(void*)((uintptr_t)(void*)pnt + a * sizeof(type))
And that an array is equal, as a value, to a pointer to its first element:
type pnt[A];
assert((uintptr_t)pnt == (uintptr_t)&pnt[0]);
assert((uintptr_t)pnt == (uintptr_t)&*(pnt + 0));
assert((uintptr_t)pnt == (uintptr_t)&*pnt);
So:
int arr[A][B];
then:
arr[x][y]
is equivalent to (ignore warnings, kind-of pseudocode):
*(*(arr + x) + y)
*( *(int[A][B])( (uintptr_t)arr + x * sizeof(int[B]) ) + y )
// ---- x * sizeof(int[B]) = x * B * sizeof(int)
*( *(int[A][B])( (uintptr_t)arr + x * B * sizeof(int) ) + y )
// ---- C11 6.5.2.1p3
*( (int[B])( (uintptr_t)arr + x * B * sizeof(int) ) + y )
*(int[B])( (uintptr_t)( (uintptr_t)arr + x * B * sizeof(int) ) + y * sizeof(int) )
// ---- *(int[B])( ... ) = (int)dereference( ... ) = *(int*)( ... )
// ---- lose the extra parentheses - conversion from size_t to uintptr_t should be safe
*(int*)( (uintptr_t)arr + x * B * sizeof(int) + y * sizeof(int) )
*(int*)( (uintptr_t)arr + ( x * B + y ) * sizeof(int) )
*(int*)( (uintptr_t)( &*arr ) + ( x * B + y ) * sizeof(int) )
// ---- (uintptr_t)arr = (uintptr_t)&arr[0][0]
*(int*)( (uintptr_t)( &*(*(arr + 0) + 0) ) + ( x * B + y ) * sizeof(int) )
*(int*)( (uintptr_t)( &arr[0][0] ) + ( x * B + y ) * sizeof(int) )
*(int*)( (uintptr_t)&arr[0][0] + ( x * B + y ) * sizeof(int) )
// ---- decayed typeof(&arr[0][0]) = int*
*( &arr[0][0] + ( x * B + y ) )
(&arr[0][0])[x * B + y]
So:
arr[x][y] == (&arr[0][0])[x * B + y]
arr[x][y] == (&arr[0][0])[x * sizeof(*arr)/sizeof(**arr) + y]
On a sane architecture, where sizeof(uintptr_t) == sizeof(size_t) == sizeof(int*) == sizeof(int**) etc., and where there is no difference between accessing data through an int* pointer and through an int(*)[B] pointer, you should be safe accessing the array one-dimensionally through a pointer to its first member, as the operations should be equivalent ("safe" with the exception of out-of-bounds accesses, which are never safe).
Note that this is, strictly speaking, undefined behavior according to the C standard and will not work on all architectures. Example: there could be an architecture where data of type int[A] is stored in a different memory bank than int[A][B] data (by hardware, by design). Then the type of the pointer tells the compiler which data bank to choose, so accessing the same data with the same pointer value but a different pointer type leads to UB, as the compiler chooses a different data bank to access the data.
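As a quick sanity check of the equivalence derived above (keeping in mind the strict-aliasing caveat just mentioned), here is a minimal program; the sizes A and B and the name flat are only for illustration:

#include <assert.h>

#define A 2
#define B 3

int main(void)
{
    int arr[A][B] = {{1,2,7},{8,4,9}};
    int *flat = &arr[0][0];    // pointer to the first element

    for (int x = 0; x < A; x++)
        for (int y = 0; y < B; y++)
            assert( arr[x][y] == flat[x * B + y] );

    return 0;
}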

'an illegal memory access' when trying to write to a 2D array allocated using cudaMalloc3D

I am trying to allocate and copy memory of a flattened 2D array onto the device using cudaMalloc3D, to test the performance of cudaMalloc3D. But when I try to write to the array from the kernel, it throws an 'an illegal memory access was encountered' exception. The program runs fine if I am just reading from the array, but when I try to write to it, there is an error. Any help with this will be greatly appreciated. Below is my code and the syntax for compiling it.
Compile using
nvcc -O2 -arch sm_20 test.cu
Code: test.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#define PI 3.14159265

#define NX 8192   /* includes boundary points on both ends */
#define NY 4096   /* includes boundary points on both ends */
#define NZ 1      /* needed for cudaMalloc3D */

#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y

#define LX 4.0    /* length of the domain in x-direction */
#define LY 2.0    /* length of the domain in y-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )

#define IC (i + j*NX)        /* (i,j)   */
#define IM1 (i + j*NX - 1)   /* (i-1,j) */
#define IP1 (i + j*NX + 1)   /* (i+1,j) */
#define JM1 (i + (j-1)*NX)   /* (i,j-1) */
#define JP1 (i + (j+1)*NX)   /* (i,j+1) */

// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
    cudaError_t e = cudaGetLastError();\
    if( e != cudaSuccess ) {\
        printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
        exit(EXIT_FAILURE);\
    }\
}

typedef double REAL;
typedef int INT;
void meshGrid ( REAL *x, REAL *y )
{
    INT i,j;
    REAL a;
    for (j=0; j<NY; j++) {
        a = dx * ( (REAL) j );
        for (i=0; i<NX; i++) {
            x[IC] = dx * ( (REAL) i );
            y[IC] = a;
        }
    }
}

void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{
    INT i,j;
    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
        }
    }
    for (j=1; j<NY-1; j++) {
        for (i=1; i<NX-1; i++) {
            uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
        }
    }
}
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
    INT i,j;
    i = blockIdx.x*blockDim.x + threadIdx.x;
    j = blockIdx.y*blockDim.y + threadIdx.y;
    if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
        char *unewPtr = (char *) unew.ptr;
        REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
        REAL tmp = unew_row[j]; // no error on this line
        unew_row[j] = 1.2;      // this is where I get the error
    }
}
INT main(INT argc, char *argv[])
{
    INT nTimeSteps = 10;

    // pointers for the host side
    REAL *unew, *u, *uold, *uFinal, *x, *y;

    // allocate memory on the host
    unew   = (REAL *)calloc(NX*NY,sizeof(REAL));
    u      = (REAL *)calloc(NX*NY,sizeof(REAL));
    uold   = (REAL *)calloc(NX*NY,sizeof(REAL));
    uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
    x      = (REAL *)calloc(NX*NY,sizeof(REAL));
    y      = (REAL *)calloc(NX*NY,sizeof(REAL));

    // pointers for the device side
    size_t pitch = NX * sizeof(REAL);
    cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
    cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);

    // allocate 3D memory on the device
    cudaMalloc3D( &d_u,    myExtent ); cudaCheckError();
    cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
    cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();

    // initialize grid and wave
    meshGrid( x, y );
    initWave( u, uold, x, y );

    // copy host memory to 3D device memory
    cudaMemcpy3DParms cpy3D = { 0 };
    cpy3D.kind = cudaMemcpyHostToDevice;

    // copying u to d_u
    cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
    cpy3D.dstPtr = d_u;
    cpy3D.extent = myExtent;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    // copying uold to d_uold
    cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
    cpy3D.dstPtr = d_uold;
    cpy3D.extent = myExtent;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    // set up the GPU grid/block model
    dim3 dimGrid  ( N_BLOCKS_X , N_BLOCKS_Y );
    dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );

    for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
        solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
        cudaThreadSynchronize();
        cudaCheckError();

        d_tmp  = d_uold;
        d_uold = d_u;
        d_u    = d_unew;
        d_unew = d_tmp;
    }

    // copy the memory back to host
    cpy3D.kind = cudaMemcpyDeviceToHost;

    // copying d_unew to uFinal
    cpy3D.srcPtr = d_unew;
    cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
    cpy3D.extent = myExtent;
    cudaMemcpy3D( &cpy3D ); cudaCheckError();

    free(u);      cudaFree(d_u.ptr);
    free(unew);   cudaFree(d_unew.ptr);
    free(uold);   cudaFree(d_uold.ptr);
    free(uFinal); free(x); free(y);

    return EXIT_SUCCESS;
}
The reason the error doesn't occur on this line:
REAL tmp = unew_row[j]; // no error on this line
is that the compiler optimizes that line out. It doesn't do anything useful, so the compiler eliminates it completely. The compiler warning:
xxx.cu(87): warning: variable "tmp" was declared but never referenced
is a hint to that effect.
Your code is very nearly correct. The issue is here:
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
It should be:
REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);
The i variable in your kernel is the width (i.e. X) dimension.
The j variable is the height (i.e. Y) dimension.
The height index is the one that selects which row you are on; therefore the row pitch should be multiplied by the height index, i.e. j, not i.
Similarly, although it's not the source of the specific failure for your particular dimensions, this code may not be what you intended either:
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
If, for example, you were intending to compute the offset to the row and then index into the row (perhaps to set every element in the allocation), then I think you would want to use i, not j, as your final index:
REAL tmp = unew_row[i]; // no error on this line
unew_row[i] = 1.2; // this is where I get the error
However, for this particular example, this is not the actual source of the illegal memory access.
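Putting both fixes together, the kernel would look roughly like this (reusing the NX/NY, INT and REAL definitions from the question):

__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
    INT i = blockIdx.x*blockDim.x + threadIdx.x;  // column (X)
    INT j = blockIdx.y*blockDim.y + threadIdx.y;  // row (Y)
    if (i > 0 && i < (NX-1) && j > 0 && j < (NY-1)) {
        char *unewPtr  = (char *) unew.ptr;
        REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch); // j selects the row
        unew_row[i] = 1.2;                                    // i indexes within the row
    }
}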

Optimize Bilinear Resize Algorithm in C

Can anyone spot a way to improve the speed of the following bilinear resizing algorithm?
I need to improve the speed, as this is critical, while keeping good image quality. It is expected to be used on mobile devices with slow CPUs.
The algorithm is used mainly for up-scaling. Any other, faster bilinear algorithm would also be appreciated. Thanks.
void resize(int* input, int* output, int sourceWidth, int sourceHeight, int targetWidth, int targetHeight)
{
    int a, b, c, d, x, y, index;
    float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
    float y_ratio = ((float)(sourceHeight - 1)) / targetHeight;
    float x_diff, y_diff, blue, red, green;
    int offset = 0;
    for (int i = 0; i < targetHeight; i++)
    {
        for (int j = 0; j < targetWidth; j++)
        {
            x = (int)(x_ratio * j);
            y = (int)(y_ratio * i);
            x_diff = (x_ratio * j) - x;
            y_diff = (y_ratio * i) - y;
            index = (y * sourceWidth + x);
            a = input[index];
            b = input[index + 1];
            c = input[index + sourceWidth];
            d = input[index + sourceWidth + 1];
            // blue element
            blue = (a&0xff)*(1-x_diff)*(1-y_diff) + (b&0xff)*(x_diff)*(1-y_diff) +
                   (c&0xff)*(y_diff)*(1-x_diff) + (d&0xff)*(x_diff*y_diff);
            // green element
            green = ((a>>8)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>8)&0xff)*(x_diff)*(1-y_diff) +
                    ((c>>8)&0xff)*(y_diff)*(1-x_diff) + ((d>>8)&0xff)*(x_diff*y_diff);
            // red element
            red = ((a>>16)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>16)&0xff)*(x_diff)*(1-y_diff) +
                  ((c>>16)&0xff)*(y_diff)*(1-x_diff) + ((d>>16)&0xff)*(x_diff*y_diff);
            output[offset++] =
                0x000000ff |  // alpha
                ((((int)red)   << 24)&0xff000000) |
                ((((int)green) << 16)&0xff0000)   |
                ((((int)blue)  <<  8)&0xff00);
        }
    }
}
Off the top of my head:
Stop using floating-point, unless you're certain your target CPU has it in hardware with good performance.
Make sure memory accesses are cache-optimized, i.e. clumped together.
Use the fastest data types possible. Sometimes this means smallest, sometimes it means "most native, requiring least overhead".
Investigate if signed/unsigned for integer operations have performance costs on your platform.
Investigate if look-up tables rather than computations gain you anything (but these can blow the caches, so be careful).
And, of course, do lots of profiling and measurements.
In-Line Cache and Lookup Tables
Cache your computations in your algorithm.
Avoid duplicate computations (like (1-y_diff) or (x_ratio * j))
Go through all the lines of your algorithm, and try to identify patterns of repetitions. Extract these to local variables. And possibly extract to functions, if they are short enough to be inlined, to make things more readable.
Use a lookup-table
It's quite likely that, if you can spare some memory, you can implement a "store" for your RGB values and simply "fetch" them based on the inputs that produced them. Maybe you don't need to store all of them, but you could experiment and see whether some come back often. Alternatively, you could "fudge" your colors and thus end up with fewer values to store for more lookup inputs.
If you know the boundaries of your inputs, you can calculate the complete domain space and figure out what makes sense to cache. For instance, if you can't cache the whole R, G, B values, maybe you can at least pre-compute the shiftings ((b>>16) and so forth) that are most likely deterministic in your case.
Use the Right Data Types for Performance
If you can avoid double and float variables, use int. On most architectures, int is the fastest type for computations because of the memory model. You can still achieve decent precision by simply shifting your units (i.e. use 1026 as an int instead of 1.026 as a double or float). It's quite likely that this trick will be enough for you.
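For illustration, a minimal sketch of that shifting trick in 8.8 fixed point (the constants here are made up):

#include <stdio.h>

int main(void)
{
    // 8.8 fixed point: represent v as (int)(v * 256)
    int a = (int)(1.026 * 256);   // ~= 262
    int b = (int)(3.500 * 256);   // 896
    int product = (a * b) >> 8;   // multiply, then shift back to keep the scale
    printf("1.026 * 3.5 ~= %f\n", product / 256.0);
    return 0;
}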
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
This could surely use some optimization: you were using x_ratio * (j-1) just a few cycles earlier, so all you really need here is x += x_ratio.
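A minimal sketch of that strength reduction (the sizes here are hypothetical; note that accumulating a float like this drifts slightly compared to computing x_ratio * j directly, so fixed-point accumulation, as in the lookup-table answer below, is more robust):

#include <stdio.h>

int main(void)
{
    int sourceWidth = 640, targetWidth = 1280;   // hypothetical sizes
    float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
    float fx = 0.0f;                             // running source x position
    for (int j = 0; j < targetWidth; j++) {
        int   x      = (int)fx;                  // integer source column
        float x_diff = fx - x;                   // fractional interpolation weight
        fx += x_ratio;                           // one add replaces x_ratio * j
        if (j < 4)                               // print the first few steps
            printf("j=%d x=%d x_diff=%f\n", j, x, x_diff);
    }
    return 0;
}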
My random guess (use a profiler instead of letting people guess!): the compiler has to generate code that works even when input and output overlap, which means it generates loads of redundant stores and loads. Add restrict to the input and output parameters to remove that safety feature.
You could also try using a=b; and c=d; instead of loading them again.
Here is my version; steal some ideas from it. My C-fu is quite weak, so some lines are pseudocode, but you can fix them.
void resize(int* input, int* output,
            int sourceWidth, int sourceHeight,
            int targetWidth, int targetHeight
) {
    // Let's create some lookup tables!
    // you can move them into 2-dimensional arrays to
    // group together values used at the same time to help the processor cache
    int sx[0..targetWidth ];  // target->source X lookup
    int sy[0..targetHeight];  // target->source Y lookup
    int mx[0..targetWidth ];  // left pixel's multiplier
    int my[0..targetHeight];  // bottom pixel's multiplier

    // we don't have to calc indexes every time, find out when
    bool reloadPixels[0..targetWidth ];
    bool shiftPixels[0..targetWidth ];
    int shiftReloadPixels[0..targetWidth ]; // can be combined if necessary

    int v; // temporary value

    for (int j = 0; j < targetWidth; j++){
        // (8bit + targetBits + sourceBits) should be < max int
        v = 256 * j * (sourceWidth-1) / (targetWidth-1);
        sx[j] = v / 256;
        mx[j] = v % 256;
        reloadPixels[j] = j ? ( sx[j-1] != sx[j] ? 1 : 0)
                            : 1; // always load first pixel
        // if no reload -> then no shift too
        shiftPixels[j] = j ? ( sx[j-1]+1 == sx[j] ? 2 : 0)
                           : 0; // nothing to shift at first pixel
        shiftReloadPixels[j] = reloadPixels[j] | shiftPixels[j];
    }

    for (int i = 0; i < targetHeight; i++){
        v = 256 * i * (sourceHeight-1) / (targetHeight-1);
        sy[i] = v / 256;
        my[i] = v % 256;
    }

    int shiftReload;
    int srcIndex;
    int srcRowIndex;
    int offset = 0;
    int lm, rm, tm, bm; // left / right / top / bottom multipliers
    int a, b, c, d;
    int leftOutput, rightOutput;

    for (int i = 0; i < targetHeight; i++){
        srcRowIndex = sy[ i ] * sourceWidth;
        tm = my[i];
        bm = 255 - tm;
        for (int j = 0; j < targetWidth; j++){
            // too many ifs can be too slow, measure.
            // always true for first pixel in a row
            if( shiftReload = shiftReloadPixels[ j ] ){
                srcIndex = srcRowIndex + sx[j];
                if( shiftReload & 2 ){
                    a = b;
                    c = d;
                }else{
                    a = input[ srcIndex ];
                    c = input[ srcIndex + sourceWidth ];
                }
                b = input[ srcIndex + 1 ];
                d = input[ srcIndex + 1 + sourceWidth ];
            }
            lm = mx[j];
            rm = 255 - lm;

            // WTF?
            // Input  AA RR GG BB
            // Output RR GG BB AA
            if( j ){
                leftOutput = rightOutput ^ 0xFFFFFF00;
            }else{
                leftOutput =
                    // blue element
                    ((( (  (a&0xFF)*tm
                         + (c&0xFF)*bm )*lm
                       ) & 0xFF0000 ) >> 8)
                    // green element
                    | ((( ( ((a>>8)&0xFF)*tm
                          + ((c>>8)&0xFF)*bm )*lm
                        ) & 0xFF0000 )) // no need to shift
                    // red element
                    | ((( ( ((a>>16)&0xFF)*tm
                          + ((c>>16)&0xFF)*bm )*lm
                        ) & 0xFF0000 ) << 8 )
                    ;
            }
            rightOutput =
                // blue element
                ((( (  (b&0xFF)*tm
                     + (d&0xFF)*bm )*lm
                   ) & 0xFF0000 ) >> 8)
                // green element
                | ((( ( ((b>>8)&0xFF)*tm
                      + ((d>>8)&0xFF)*bm )*lm
                    ) & 0xFF0000 )) // no need to shift
                // red element
                | ((( ( ((b>>16)&0xFF)*tm
                      + ((d>>16)&0xFF)*bm )*lm
                    ) & 0xFF0000 ) << 8 )
                ;

            output[offset++] =
                // alpha
                0x000000ff
                | leftOutput
                | rightOutput
                ;
        }
    }
}
