How to use MPI and OpenMP to run a parallel loop - C
I need to use MPI and OpenMP (as two separate problems) to parallelize code from the SBAC-PAD marathon (reference: http://lspd.mackenzie.br/marathon/18/problems.html). I am working on the Himeno benchmark. I believe the only part of this code that is worth parallelizing is the jacobi function:
#define MR(mt,n,r,c,d) mt->m[(n) * mt->mrows * mt->mcols * mt->mdeps + (r) * mt->mcols * mt->mdeps + (c) * mt->mdeps + (d)]

struct Matrix {
  float* m;
  int mnums;
  int mrows;
  int mcols;
  int mdeps;
};

float
jacobi(int nn, Matrix* a, Matrix* b, Matrix* c,
       Matrix* p, Matrix* bnd, Matrix* wrk1, Matrix* wrk2)
{
  int i,j,k,n,imax,jmax,kmax;
  float gosa,s0,ss;

  imax = p->mrows-1;
  jmax = p->mcols-1;
  kmax = p->mdeps-1;

  for(n=0 ; n<nn ; n++){
    gosa = 0.0;

    for(i=1 ; i<imax ; i++)
      for(j=1 ; j<jmax ; j++)
        for(k=1 ; k<kmax ; k++){
          s0 = MR(a,0,i,j,k)*MR(p,0,i+1,j,  k)
             + MR(a,1,i,j,k)*MR(p,0,i,  j+1,k)
             + MR(a,2,i,j,k)*MR(p,0,i,  j,  k+1)
             + MR(b,0,i,j,k)
              *( MR(p,0,i+1,j+1,k) - MR(p,0,i+1,j-1,k)
               - MR(p,0,i-1,j+1,k) + MR(p,0,i-1,j-1,k) )
             + MR(b,1,i,j,k)
              *( MR(p,0,i,j+1,k+1) - MR(p,0,i,j-1,k+1)
               - MR(p,0,i,j+1,k-1) + MR(p,0,i,j-1,k-1) )
             + MR(b,2,i,j,k)
              *( MR(p,0,i+1,j,k+1) - MR(p,0,i-1,j,k+1)
               - MR(p,0,i+1,j,k-1) + MR(p,0,i-1,j,k-1) )
             + MR(c,0,i,j,k) * MR(p,0,i-1,j,  k)
             + MR(c,1,i,j,k) * MR(p,0,i,  j-1,k)
             + MR(c,2,i,j,k) * MR(p,0,i,  j,  k-1)
             + MR(wrk1,0,i,j,k);

          ss = (s0*MR(a,3,i,j,k) - MR(p,0,i,j,k))*MR(bnd,0,i,j,k);
          gosa += ss*ss;
          /* omega is a global relaxation factor in the original benchmark */
          MR(wrk2,0,i,j,k) = MR(p,0,i,j,k) + omega*ss;
        }

    for(i=1 ; i<imax ; i++)
      for(j=1 ; j<jmax ; j++)
        for(k=1 ; k<kmax ; k++)
          MR(p,0,i,j,k) = MR(wrk2,0,i,j,k);
  } /* end n loop */

  return(gosa);
}
The problem is that this function seems inherently sequential: every iteration of the n loop depends on the previous one. What I tried with MPI was introducing an auxiliary variable for gosa (auxgosa) and calling MPI_Reduce after the i/j/k loops, like the following (the root process is rank 0):
//rank is the current process
//size is the total amount of processes
int start = ((imax+1)/size)*rank;
int stop  = ((imax+1)/size)*(rank+1)-1;
if(rank == 0){start++;}

for(n=0 ; n<nn ; n++){
  gosa = 0.0;
  auxgosa = 0.0;

  for(i=start ; i<stop ; i++)
    for(j=1 ; j<jmax ; j++)
      for(k=1 ; k<kmax ; k++){
        s0 = MR(aa,0,i,j,k)*MR(pp,0,i+1,j,  k)
           + MR(aa,1,i,j,k)*MR(pp,0,i,  j+1,k)
           + MR(aa,2,i,j,k)*MR(pp,0,i,  j,  k+1)
           + MR(bb,0,i,j,k)
            *( MR(pp,0,i+1,j+1,k) - MR(pp,0,i+1,j-1,k)
             - MR(pp,0,i-1,j+1,k) + MR(pp,0,i-1,j-1,k) )
           + MR(bb,1,i,j,k)
            *( MR(pp,0,i,j+1,k+1) - MR(pp,0,i,j-1,k+1)
             - MR(pp,0,i,j+1,k-1) + MR(pp,0,i,j-1,k-1) )
           + MR(bb,2,i,j,k)
            *( MR(pp,0,i+1,j,k+1) - MR(pp,0,i-1,j,k+1)
             - MR(pp,0,i+1,j,k-1) + MR(pp,0,i-1,j,k-1) )
           + MR(cc,0,i,j,k) * MR(pp,0,i-1,j,  k)
           + MR(cc,1,i,j,k) * MR(pp,0,i,  j-1,k)
           + MR(cc,2,i,j,k) * MR(pp,0,i,  j,  k-1)
           + MR(awrk1,0,i,j,k);

        ss = (s0*MR(aa,3,i,j,k) - MR(pp,0,i,j,k))*MR(abnd,0,i,j,k);
        auxgosa += ss*ss;
        MR(awrk2,0,i,j,k) = MR(pp,0,i,j,k) + omega*ss;
      }

  MPI_Reduce(&auxgosa,&gosa,1,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD);

  for(i=1 ; i<imax ; i++)
    for(j=1 ; j<jmax ; j++)
      for(k=1 ; k<kmax ; k++)
        MR(pp,0,i,j,k) = MR(awrk2,0,i,j,k);
} /* end n loop */
Unfortunately, this didn't work. Could anyone give me some insight into this? I plan to use a similar strategy with OpenMP.
If wrk2 is distinct from a, p, b, c and wrk1, then there is no loop-carried dependence inside the i/j/k loops: each inner iteration reads only p (and the coefficient arrays) and writes only wrk2.
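For the OpenMP side, that observation is enough: the i/j/k nest can be parallelized directly with a reduction on gosa. A minimal sketch of the jacobi body, assuming the same Matrix layout and global omega as in the serial code above:

for (n = 0; n < nn; n++) {
    gosa = 0.0;

    /* No loop-carried dependence: p is only read, wrk2 only written.
       The reduction clause gives each thread a private partial gosa. */
    #pragma omp parallel for private(j, k, s0, ss) reduction(+:gosa)
    for (i = 1; i < imax; i++)
        for (j = 1; j < jmax; j++)
            for (k = 1; k < kmax; k++) {
                /* ... same s0/ss computation as in the serial jacobi ... */
                gosa += ss * ss;
                MR(wrk2, 0, i, j, k) = MR(p, 0, i, j, k) + omega * ss;
            }

    /* The copy-back is also fully parallel. */
    #pragma omp parallel for private(j, k)
    for (i = 1; i < imax; i++)
        for (j = 1; j < jmax; j++)
            for (k = 1; k < kmax; k++)
                MR(p, 0, i, j, k) = MR(wrk2, 0, i, j, k);
} /* end n loop */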
A simple Google search will point you to parallelized versions of the Himeno benchmark (MPI, OpenMP, and hybrid MPI+OpenMP versions are all available).
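Two things are missing from the MPI attempt above. First, MPI_Reduce leaves the summed gosa only on rank 0; use MPI_Allreduce if every rank needs it. Second, and more importantly, each sweep reads pp values on the i planes just outside a rank's own slab, so neighbouring ranks must exchange those boundary planes (a halo exchange) before the next iteration, and the copy-back should cover only the planes a rank computed. A rough sketch of the end of each n iteration, assuming each rank owns planes [start, stop) of the full-size arrays and that plane_size = (jmax+1)*(kmax+1) floats per i plane:

/* Sum the partial residuals so every rank sees the global gosa. */
MPI_Allreduce(&auxgosa, &gosa, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

/* Copy back only the planes this rank actually computed. */
for (i = start; i < stop; i++)
    for (j = 1; j < jmax; j++)
        for (k = 1; k < kmax; k++)
            MR(pp, 0, i, j, k) = MR(awrk2, 0, i, j, k);

/* Halo exchange: send the first/last owned plane of pp to the
   neighbouring ranks and receive their boundary planes in return.
   MPI_PROC_NULL turns the sends/receives at the ends into no-ops. */
int up   = (rank > 0)        ? rank - 1 : MPI_PROC_NULL;
int down = (rank < size - 1) ? rank + 1 : MPI_PROC_NULL;

MPI_Sendrecv(&MR(pp, 0, start,    0, 0), plane_size, MPI_FLOAT, up,   0,
             &MR(pp, 0, stop,     0, 0), plane_size, MPI_FLOAT, down, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Sendrecv(&MR(pp, 0, stop - 1, 0, 0), plane_size, MPI_FLOAT, down, 1,
             &MR(pp, 0, start - 1, 0, 0), plane_size, MPI_FLOAT, up,   1,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);

This is a sketch of the communication pattern, not a complete decomposition; in particular the slab bounds should also be fixed so the last rank covers up to imax when imax+1 is not divisible by size.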
Related
Matrix Subset Operations in Array
I have a matrix multiplication problem. We have an image matrix which can have variable size. It is required to calculate C = A*B for every possible nxn window, and C is added to the output image as seen in the figure. The center point of the A matrix is located in the lower triangle, and B is placed diagonally symmetric to A. A windows can overlap, so B windows can overlap too. The figures below give a more detailed picture: the blue X points represent all possible mid points of A. The algorithm should just multiply A by the diagonally mirrored version of A, called B. I did it with lots of for loops, and I need to reduce the number of loops used. What kind of algorithm can be used for this problem? Could you help me, or direct me to an expert? Thanks.

Update:

#define SIZE_ARRAY 20
#define SIZE_WINDOW 5
#define WINDOW_OFFSET 2
#define INDEX_OFFSET 1
#define START_OFFSET_COLUMN 2
#define START_OFFSET_ROW 3
#define END_OFFSET_COLUMN 3
#define END_OFFSET_ROW 2
#define GET_LOWER_DIAGONAL_INDEX_MIN_ROW (START_OFFSET_ROW);
#define GET_LOWER_DIAGONAL_INDEX_MAX_ROW (SIZE_ARRAY - INDEX_OFFSET - END_OFFSET_ROW)
#define GET_LOWER_DIAGONAL_INDEX_MIN_COL (START_OFFSET_COLUMN);
#define GET_LOWER_DIAGONAL_INDEX_MAX_COL (SIZE_ARRAY - INDEX_OFFSET - END_OFFSET_COLUMN)

uint32_t lowerDiagonalIndexMinRow = GET_LOWER_DIAGONAL_INDEX_MIN_ROW;
uint32_t lowerDiagonalIndexMaxRow = GET_LOWER_DIAGONAL_INDEX_MAX_ROW;
uint32_t lowerDiagonalIndexMinCol = GET_LOWER_DIAGONAL_INDEX_MIN_COL;
uint32_t lowerDiagonalIndexMaxCol = GET_LOWER_DIAGONAL_INDEX_MAX_COL;

void parallelMultiplication_Stable_Master()
{
    startTimeStamp = omp_get_wtime();
    #pragma omp parallel for num_threads(8) private(outerIterRow, outerIterCol, rA, cA, rB, cB) shared(inputImage, outputImage)
    for(outerIterRow = lowerDiagonalIndexMinRow; outerIterRow < lowerDiagonalIndexMaxRow; outerIterRow++)
    {
        for(outerIterCol = lowerDiagonalIndexMinCol; outerIterCol < lowerDiagonalIndexMaxCol; outerIterCol++)
        {
            if(outerIterCol + 1 < outerIterRow)
            {
                rA = outerIterRow - WINDOW_OFFSET;
                cA = outerIterCol - WINDOW_OFFSET;
                rB = outerIterCol - WINDOW_OFFSET;
                cB = outerIterRow - WINDOW_OFFSET;
                for(i = outerIterRow - WINDOW_OFFSET; i <= outerIterRow + WINDOW_OFFSET; i++)
                {
                    for(j = outerIterCol - WINDOW_OFFSET; j <= outerIterCol + WINDOW_OFFSET; j++)
                    {
                        for(k = 0; k < SIZE_WINDOW; k++)
                        {
                            #pragma omp critical
                            outputImage[i][j] += inputImage[rA][cA+k] * inputImage[rB+k][cB];
                        }
                        cB++;
                        rA++;
                    }
                    rB++;
                    cA++;
                    printf("Thread Number - %d", omp_get_thread_num());
                }
            }
        }
    }
    stopTimeStamp = omp_get_wtime();
    printArray(outputImage, "Output Image");
    printConsoleNotification(100, startTimeStamp, stopTimeStamp);
}

I am getting a segmentation fault error if I set the thread count to more than 1. What is the trick?
I'm not providing a solution, but some thoughts that may help the OP explore a possible approach.

You can evaluate each element of the resulting C matrix directly from the values of the original matrix, in a way similar to a convolution operation. Consider the following image (sorry if it's confusing): instead of computing each matrix product for every A submatrix, you can evaluate the value of each C_{i,j} from the values in the shaded areas.

Note that C_{i,j} depends only on a small subset of row i, and that the elements of the upper-right triangular submatrix (where the B submatrices are picked) could be copied, and maybe transposed, into a more cache-friendly arrangement.

Alternatively, it may be worth exploring an approach where, for every possible B_{i,j}, all the corresponding elements of C are evaluated.

Edit

Note that you can actually save a lot of calculations (and maybe cache misses) by grouping the terms; see e.g. the first two elements of row i in A. More formally:

C_{i,j}  = A_{i,j-4} · (B_{j-4,i} + B_{j-4,i+1} + B_{j-4,i+2} + B_{j-4,i+3} + B_{j-4,i+4})
C_{i,j} += A_{i,j-3} · (B_{j-3,i-1} + B_{j-3,i+4} + 2·(B_{j-3,i} + B_{j-3,i+1} + B_{j-3,i+2} + B_{j-3,i+3}))
C_{i,j} += A_{i,j-2} · (B_{j-2,i-2} + B_{j-2,i+4} + 2·(B_{j-2,i-1} + B_{j-2,i+3}) + 3·(B_{j-2,i} + B_{j-2,i+1} + B_{j-2,i+2}))
C_{i,j} += A_{i,j-1} · (B_{j-1,i-3} + B_{j-1,i+4} + 2·(B_{j-1,i-2} + B_{j-1,i+3}) + 3·(B_{j-1,i-1} + B_{j-1,i+2}) + 4·(B_{j-1,i} + B_{j-1,i+1}))
C_{i,j} += A_{i,j}   · (B_{j,i-4} + B_{j,i+4} + 2·(B_{j,i-3} + B_{j,i+3}) + 3·(B_{j,i-2} + B_{j,i+2}) + 4·(B_{j,i-1} + B_{j,i+1}) + 5·B_{j,i})
C_{i,j} += A_{i,j+1} · (B_{j+1,i-4} + B_{j+1,i+3} + 2·(B_{j+1,i-3} + B_{j+1,i+2}) + 3·(B_{j+1,i-2} + B_{j+1,i+1}) + 4·(B_{j+1,i-1} + B_{j+1,i}))
C_{i,j} += A_{i,j+2} · (B_{j+2,i-4} + B_{j+2,i+2} + 2·(B_{j+2,i-3} + B_{j+2,i+1}) + 3·(B_{j+2,i-2} + B_{j+2,i-1} + B_{j+2,i}))
C_{i,j} += A_{i,j+3} · (B_{j+3,i-4} + B_{j+3,i+1} + 2·(B_{j+3,i-3} + B_{j+3,i-2} + B_{j+3,i-1} + B_{j+3,i}))
C_{i,j} += A_{i,j+4} · (B_{j+4,i-4} + B_{j+4,i-3} + B_{j+4,i-2} + B_{j+4,i-1} + B_{j+4,i})

If I estimated correctly, this requires something like 60 additions and 25 (possibly fused) multiplications, compared to 125 operations of the form C_{i,j} += A_{i,k} · B_{k,i} spread all over the place. I think that cache locality may have a bigger impact on performance than the mere reduction of operations.

We could also precompute all the values

S_{i,j} = B_{j,i} + B_{j,i+1} + B_{j,i+2} + B_{j,i+3} + B_{j,i+4}

Then the previous formulas become

C_{i,j}  = A_{i,j-4} · S_{j-4,i}
C_{i,j} += A_{i,j-3} · (S_{j-3,i-1} + S_{j-3,i})
C_{i,j} += A_{i,j-2} · (S_{j-2,i-2} + S_{j-2,i-1} + S_{j-2,i})
C_{i,j} += A_{i,j-1} · (S_{j-1,i-3} + S_{j-1,i-2} + S_{j-1,i-1} + S_{j-1,i})
C_{i,j} += A_{i,j}   · (S_{j,i-4} + S_{j,i-3} + S_{j,i-2} + S_{j,i-1} + S_{j,i})
C_{i,j} += A_{i,j+1} · (S_{j+1,i-4} + S_{j+1,i-3} + S_{j+1,i-2} + S_{j+1,i-1})
C_{i,j} += A_{i,j+2} · (S_{j+2,i-4} + S_{j+2,i-3} + S_{j+2,i-2})
C_{i,j} += A_{i,j+3} · (S_{j+3,i-4} + S_{j+3,i-3})
C_{i,j} += A_{i,j+4} · S_{j+4,i-4}
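To make the precomputation concrete: each S_{i,j} is just a 5-wide windowed sum along a row of B, so the whole table can be filled with a running sum in O(rows·cols). A minimal, hypothetical sketch (the names, the n x n dimensions, and the storage layout are mine, not the OP's):

/* Hypothetical sketch: windowed row sums S_{i,j} = B[j][i] + ... + B[j][i+4],
   stored so that S[i][j] matches the subscripts in the formulas above.
   Only windows that fit entirely inside the row are computed. */
void precompute_S(int n, const float B[n][n], float S[n][n])
{
    for (int j = 0; j < n; ++j) {
        /* running 5-wide sum along row j of B */
        float run = B[j][0] + B[j][1] + B[j][2] + B[j][3] + B[j][4];
        S[0][j] = run;
        for (int i = 1; i + 4 < n; ++i) {
            run += B[j][i + 4] - B[j][i - 1];  /* slide the window by one */
            S[i][j] = run;
        }
    }
}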
Here is my take. I wrote this before the OP showed any code, so I'm not following any of their code patterns.

I start with a suitable image struct, just for my own sanity.

struct Image {
    float* values;
    int rows, cols;
};

struct Image image_allocate(int rows, int cols)
{
    struct Image rtrn;
    rtrn.rows = rows;
    rtrn.cols = cols;
    rtrn.values = malloc(sizeof(float) * rows * cols);
    return rtrn;
}

void image_fill(struct Image* img)
{
    ptrdiff_t row, col;
    for(row = 0; row < img->rows; ++row)
        for(col = 0; col < img->cols; ++col)
            img->values[row * img->cols + col] = rand() * (1.f / RAND_MAX);
}

void image_print(const struct Image* img)
{
    ptrdiff_t row, col;
    for(row = 0; row < img->rows; ++row) {
        for(col = 0; col < img->cols; ++col)
            printf("%.3f ", img->values[row * img->cols + col]);
        putchar('\n');
    }
    putchar('\n');
}

A 5x5 matrix multiplication is too small to reasonably dispatch to BLAS. So I write a simple version myself that can be loop-unrolled and/or inlined. This routine could use a couple of micro-optimizations, but let's keep it simple for now.

/** out += left * right for 5x5 sub-matrices */
static void mat_mul_5x5(
    float* restrict out, const float* left, const float* right, int cols)
{
    ptrdiff_t row, col, inner;
    float sum;
    for(row = 0; row < 5; ++row) {
        for(col = 0; col < 5; ++col) {
            sum = out[row * cols + col];
            for(inner = 0; inner < 5; ++inner)
                sum += left[row * cols + inner] * right[inner * cols + col];
            out[row * cols + col] = sum;
        }
    }
}

Now for the single-threaded implementation of the main algorithm. Again, nothing fancy. We just iterate over the lower triangular matrix, excluding the diagonal. I keep track of the top-left corner instead of the center point; that makes the index computation a bit simpler.

void compute_ltr(struct Image* restrict out, const struct Image* in)
{
    ptrdiff_t top, left, end;
    /* if image is not quadratic, find quadratic subset */
    end = out->rows < out->cols ? out->rows : out->cols;
    assert(in->rows == out->rows && in->cols == out->cols);
    memset(out->values, 0, sizeof(float) * out->rows * out->cols);
    for(top = 1; top <= end - 5; ++top)
        for(left = 0; left < top; ++left)
            mat_mul_5x5(out->values + top * out->cols + left,
                        in->values + top * in->cols + left,
                        in->values + left * in->cols + top,
                        in->cols);
}

The parallelization is a bit tricky, because we have to make sure the threads don't overlap in their output matrices. A critical section, atomics, or similar stuff would cost too much performance. A simpler solution is a strided approach: if we always keep the threads 5 rows apart, they cannot interfere. So we simply compute every fifth row, synchronize all threads, then compute the next set of rows, five apart, and so on.

void compute_ltr_parallel(struct Image* restrict out, const struct Image* in)
{
    /* if image is not quadratic, find quadratic subset */
    const ptrdiff_t end = out->rows < out->cols ? out->rows : out->cols;
    assert(in->rows == out->rows && in->cols == out->cols);
    memset(out->values, 0, sizeof(float) * out->rows * out->cols);
    /*
     * Keep the parallel section open for multiple loops to reduce
     * overhead
     */
#   pragma omp parallel
    {
        ptrdiff_t top, left, offset;
        for(offset = 0; offset < 5; ++offset) {
            /* Use dynamic scheduling because the work per row varies */
#           pragma omp for schedule(dynamic)
            for(top = 1 + offset; top <= end - 5; top += 5)
                for(left = 0; left < top; ++left)
                    mat_mul_5x5(out->values + top * out->cols + left,
                                in->values + top * in->cols + left,
                                in->values + left * in->cols + top,
                                in->cols);
        }
    }
}

My benchmark with 1000 iterations of a 1000x1000 image shows 7 seconds for the serial version and 1.2 seconds for the parallelized version on my 8 core / 16 thread CPU.

EDIT for completeness: here are the includes and the main for benchmarking.

#include <assert.h>
#include <stddef.h> /* using ptrdiff_t */
#include <stdlib.h> /* using malloc */
#include <stdio.h>  /* using printf */
#include <string.h> /* using memset */

/* Insert code from above here */

int main()
{
    int rows = 1000, cols = 1000, rep = 1000;
    struct Image in, out;
    in = image_allocate(rows, cols);
    out = image_allocate(rows, cols);
    image_fill(&in);
#   if 1
    do
        compute_ltr_parallel(&out, &in);
    while(--rep);
#   else
    do
        compute_ltr(&out, &in);
    while(--rep);
#   endif
}

Compile with gcc -O3 -fopenmp.

Regarding the comment, and also your way of using OpenMP: don't overcomplicate things with unnecessary directives. OpenMP can figure out how many threads are available itself, and private variables can easily be declared within the parallel section (usually). If you want a specific number of threads, just launch with the appropriate environment variable, e.g. on Linux: OMP_NUM_THREADS=8 ./executable
Accurate method for finding the time complexity of a function
How to find the time complexity of this function?

void f(int n)
{
    for(int i=0; i<n; ++i)
        for(int j=0; j<i; ++j)
            for(int k=i*j; k>0; k/=2)
                printf("~");
}

I took an educated guess of O(n^2 * log n) based on intuition, and it turned out to be correct. But I can't seem to find an accurate explanation for it.
For every value of i, i > 0, the middle loop runs for j = 0, ..., i-1; the j = 0 case contributes nothing (k starts at 0), so there are i-1 non-trivial innermost loops, with k starting respectively at

i*1, i*2, ..., i*(i-1)

Since k is divided by 2 until it reaches 0, each of these innermost loops requires about lg(k) steps. Hence their total cost is

lg(i*1) + lg(i*2) + ... + lg(i*(i-1))
  = lg(i) + (lg(i) + lg(2)) + ... + (lg(i) + lg(i-1))
  = (i-1)*lg(i) + lg(2) + ... + lg(i-1)

Therefore the total is

f(n) ::= sum_{i=1}^{n-1} [ (i-1)*lg(i) + lg(2) + ... + lg(i-1) ]

Let's now bound f(n+1) from above. Since lg(2) + ... + lg(i-1) <= (i-1)*lg(i-1),

f(n+1) <= sum_{i=1}^{n} [ i*lg(i) + (i-1)*lg(i-1) ]
       <= 2 * sum_{i=1}^{n} i*lg(i)
       <= C * integral_0^n x*ln(x) dx    ; integral bound, some constant C
        = C/2 * (n^2*ln(n) - n^2/2)      ; integral of x*ln(x) is x^2/2*ln(x) - x^2/4
        = O(n^2*lg(n))

If we now bound f(n+1) from below, dropping the lg(2) + ... + lg(i-1) part of each term:

f(n+1) >= sum_{i=1}^{n} (i-1)*lg(i)
       >= (1/2) * sum_{i=2}^{n} i*lg(i)  ; since i-1 >= i/2 for i >= 2
       >= C' * integral_1^n x*ln(x) dx   ; integral bound again
        = C' * (n^2*ln(n)/2 - n^2/4 + 1/4)
        = Ω(n^2*lg(n))

Together, f(n) = Θ(n^2*lg(n)).
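As a sanity check, one can count the exact number of innermost iterations and compare against n^2*lg(n). A quick sketch:

#include <math.h>
#include <stdio.h>

/* Count the exact number of innermost iterations of f(n). */
static long long count_steps(int n)
{
    long long steps = 0;
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < i; ++j)
            for (long long k = (long long)i * j; k > 0; k /= 2)
                ++steps;
    return steps;
}

int main(void)
{
    for (int n = 250; n <= 4000; n *= 2) {
        long long s = count_steps(n);
        /* the ratio should settle near a constant if the bound is tight */
        printf("n=%5d  steps=%12lld  steps/(n^2*lg n)=%.4f\n",
               n, s, s / ((double)n * n * log2(n)));
    }
    return 0;
}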
OpenCL: Local Memory faster than L1 Cache on CPU?
I wrote an OpenCL kernel that performs a box blur on an input matrix. The implementation was originally written for a GPU and uses local memory to store the neighborhoods of work items in a work group. Then I ran the kernel on a CPU and compared the running times to an implementation which relied on automatic caching of reads from global memory instead of manually storing them in local memory first. Under the assumption that a CPU has no "local memory" and instead uses RAM, using local memory on a CPU should do more harm than good. However, the "local memory" kernel was faster than the one that relied on caching by 10 ms (~112 ms vs. ~122 ms on an 8192x8192 matrix, with work item / work group / "number of values calculated by each work item" settings deemed optimal for both implementations, since they were found by an auto-tuner for each kernel separately). The kernels were run on an Intel Xeon E5-1620 v2 CPU using the Intel OpenCL platform available on the host. What are the reasons for this?

"Local memory" kernel: each work item works on a "block" of values. Each block is copied to local memory, and its neighborhood is copied to local memory depending on where the block is in the work group, so no values are copied twice. Then, after the barrier, the final value is calculated. The code below is the X-direction kernel; the Y-direction kernel is exactly the same except for the direction in which the values are inspected to calculate the output value.

__kernel void boxblur_x(__read_only __global float* image,
                        __local float* localmem,
                        __write_only __global float* output)
{
    // size of input and output matrix
    int MATRIX_SIZE_Y = IMAGE_HEIGHT;
    int MATRIX_SIZE_X = IMAGE_WIDTH;
    int MATRIX_SIZE   = MATRIX_SIZE_Y * MATRIX_SIZE_X;

    // mask size
    int S_L = MASK_SIZE_LEFT;
    int S_U = 0;
    int S_R = MASK_SIZE_RIGHT;
    int S_D = 0;
    int SHAPE_SIZE_Y = S_U + S_D + 1;
    int SHAPE_SIZE_X = S_L + S_R + 1;
    int SHAPE_SIZE   = SHAPE_SIZE_Y * SHAPE_SIZE_X;

    // tuning parameter
    // ---------------------------------------------------------------
    // work items in y/x dimension per work group
    int NUM_WI_Y = get_local_size(1);
    int NUM_WI_X = get_local_size(0);
    // size of blocks
    int BLOCKHEIGHT = X_BLOCKHEIGHT;
    int BLOCKWIDTH  = X_BLOCKWIDTH;
    // position in matrix
    int GLOBAL_POS_X = get_global_id(0) * BLOCKWIDTH;
    int GLOBAL_POS_Y = get_global_id(1) * BLOCKHEIGHT;
    // localMemory size
    int LOCALMEM_WIDTH = S_L + NUM_WI_X * BLOCKWIDTH + S_R;
    // position in localmem
    int LOCAL_POS_X = S_L + get_local_id(0) * BLOCKWIDTH;
    int LOCAL_POS_Y = S_U + get_local_id(1) * BLOCKHEIGHT;

    // copy values to local memory
    for (int i = 0; i < BLOCKHEIGHT; i++) {
        for (int j = 0; j < BLOCKWIDTH; j++) {
            localmem[(LOCAL_POS_X + j) + (LOCAL_POS_Y + i) * LOCALMEM_WIDTH] =
                image[GLOBAL_POS_X + j + (GLOBAL_POS_Y + i) * MATRIX_SIZE_X];
        }
    }

    // only when all work items have arrived here,
    // computation continues - otherwise, not all needed
    // values might be available in local memory
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int i = 0; i < BLOCKHEIGHT; i++) {
        for (int j = 0; j < BLOCKWIDTH; j++) {
            float sum = 0;
            for (int b = 0; b <= S_L + S_R; b++) {
                sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b)
                                + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];
            }
            // divide by size of mask
            float pixelValue = sum / SHAPE_SIZE;

            // write new pixel value to output image
            output[GLOBAL_POS_X + j
                   + ((GLOBAL_POS_Y + i) * get_global_size(0) * BLOCKWIDTH)] = pixelValue;
        }
    }
}

"L1 caching" kernel: despite the many defines, it does exactly the same, but relies on global memory caching of the blocks instead of explicitly managing local memory.

#define WG_BLOCK_SIZE_Y ( OUTPUT_SIZE_Y / NUM_WG_Y )
#define WG_BLOCK_SIZE_X ( OUTPUT_SIZE_X / NUM_WG_X )
#define WI_BLOCK_SIZE_Y ( WG_BLOCK_SIZE_Y / NUM_WI_Y )
#define WI_BLOCK_SIZE_X ( WG_BLOCK_SIZE_X / NUM_WI_X )
#define WG_BLOCK_OFFSET_Y ( WG_BLOCK_SIZE_Y * WG_ID_Y )
#define WG_BLOCK_OFFSET_X ( WG_BLOCK_SIZE_X * WG_ID_X )
#define WI_BLOCK_OFFSET_Y ( WI_BLOCK_SIZE_Y * WI_ID_Y )
#define WI_BLOCK_OFFSET_X ( WI_BLOCK_SIZE_X * WI_ID_X )
#define NUM_CACHE_BLOCKS_Y ( WI_BLOCK_SIZE_Y / CACHE_BLOCK_SIZE_Y )
#define NUM_CACHE_BLOCKS_X ( WI_BLOCK_SIZE_X / CACHE_BLOCK_SIZE_X )
#define CACHE_BLOCK_OFFSET_Y ( CACHE_BLOCK_SIZE_Y * ii )
#define CACHE_BLOCK_OFFSET_X ( CACHE_BLOCK_SIZE_X * jj )
#define reorder(j) ( ( (j) / WI_BLOCK_SIZE_X) + ( (j) % WI_BLOCK_SIZE_X) * NUM_WI_X )
#define reorder_inv(j) reorder(j)
#define view( i, j, x, y ) input[ ((i) + (x)) * INPUT_SIZE_X + ((j) + (y)) ]
#define a_wg( i, j, x, y ) view( WG_BLOCK_OFFSET_Y + (i), WG_BLOCK_OFFSET_X + reorder(j), (x), (y) )
#define a_wi( i, j, x, y ) a_wg( WI_BLOCK_OFFSET_Y + (i), WI_BLOCK_OFFSET_X + (j), (x), (y) )
#define a_cache( i, j, x, y ) a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j), (x), (y) )
#define res( i, j ) output[ (i) * OUTPUT_SIZE_X + (j) ]
#define res_wg( i, j ) res( WG_BLOCK_OFFSET_Y + (i), WG_BLOCK_OFFSET_X + reorder_inv(j) )
#define res_wi( i, j ) res_wg( WI_BLOCK_OFFSET_Y + (i), WI_BLOCK_OFFSET_X + (j) )
#define res_cache( i, j ) res_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j) )

float f_stencil(__global float* input, int ii, int jj, int i, int j)
{
    // indices
    const int WG_ID_X = get_group_id(0);
    const int WG_ID_Y = get_group_id(1);
    const int WI_ID_X = get_local_id(0);
    const int WI_ID_Y = get_local_id(1);

    // computation
    float sum = 0;
    for (int y = 0; y < SHAPE_SIZE_Y; ++y)
        for (int x = 0; x < SHAPE_SIZE_X; ++x)
            sum += a_cache(i, j, y, x);

    return sum / SHAPE_SIZE;
}

__kernel void stencil(__global float* input, __global float* output)
{
    // indices
    const int WG_ID_X = get_group_id(0);
    const int WG_ID_Y = get_group_id(1);
    const int WI_ID_X = get_local_id(0);
    const int WI_ID_Y = get_local_id(1);

    // iteration over cache blocks
    for (int ii = 0; ii < NUM_CACHE_BLOCKS_Y; ++ii)
        for (int jj = 0; jj < NUM_CACHE_BLOCKS_X; ++jj)
            // iteration within a cache block
            for (int i = 0; i < CACHE_BLOCK_SIZE_Y; ++i)
                for (int j = 0; j < CACHE_BLOCK_SIZE_X; ++j)
                    res_cache(i, j) = f_stencil(input, ii, jj, i, j);
}
When you combine the "L1 cache" version's loops:

for( int ii=0 ; ii < NUM_CACHE_BLOCKS_Y ; ++ii )
 for( int jj=0 ; jj < NUM_CACHE_BLOCKS_X ; ++jj )
  for( int i=0 ; i < CACHE_BLOCK_SIZE_Y ; ++i )
   for( int j=0 ; j < CACHE_BLOCK_SIZE_X ; ++j )
    for( int y = 0 ; y < SHAPE_SIZE_Y ; ++y )   // S_U + S_D + 1
     for( int x = 0 ; x < SHAPE_SIZE_X ; ++x )  // S_L + S_R + 1
      .... += a_cache(i, j, y, x);

and the "local" version's:

for (int i = 0; i < BLOCKHEIGHT; i++)
 for (int j = 0; j < BLOCKWIDTH; j++)
  for (int b = 0; b <= S_L + S_R; b++)
   ... += localmem[...];

you can see that a_cache involves a lot of computation. a_cache(i, j, y, x) becomes

a_wi( CACHE_BLOCK_OFFSET_Y + (i), CACHE_BLOCK_OFFSET_X + (j), x, y )

and that becomes

view( WG_BLOCK_OFFSET_Y + (CACHE_BLOCK_OFFSET_Y + (i)),
      WG_BLOCK_OFFSET_X + reorder(CACHE_BLOCK_OFFSET_X + (j)), (x), (y) )

and that becomes

view( WG_BLOCK_OFFSET_Y + (CACHE_BLOCK_OFFSET_Y + (i)),
      WG_BLOCK_OFFSET_X + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X)
                          + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X ),
      (x), (y) )

and that becomes

input[ ((WG_BLOCK_OFFSET_Y + (CACHE_BLOCK_OFFSET_Y + (i))) + (x)) * INPUT_SIZE_X
       + ((WG_BLOCK_OFFSET_X + ( ( (CACHE_BLOCK_OFFSET_X + (j)) / WI_BLOCK_SIZE_X)
                               + ( (CACHE_BLOCK_OFFSET_X + (j)) % WI_BLOCK_SIZE_X) * NUM_WI_X) + (y)) ]

That is 9 additions + 2 multiplications + 1 modulo + 1 division per access. The "local" version has

sum += localmem[(get_local_id(0) * BLOCKWIDTH + j + b)
                + (get_local_id(1) * BLOCKHEIGHT + i) * LOCALMEM_WIDTH];

which is 4 additions + 3 multiplications, with no modulo and no division.

The "L1 cache" version also needs to keep loop counters for 6 nested loops, which could use more CPU registers or even L1 cache. The data cache size is 128 kB per core, or 64 kB per thread. If you launch 1024 threads per core (each core is a work group, right?), then 1024 * 6 * 4 = 24 kB of L1 is needed just for loop counters. This leaves 40 kB to use. When you add "const int WG_ID_X" and the other variables (5 of them), only 20 kB is left. Now add the temporary "stack" variables for the arguments of f_stencil, and there may be no L1 cache left, decreasing efficiency. The "local" version uses about 10-12 variables (unused variables may be optimized out?) and no functions, so it may be better for L1.

https://software.intel.com/en-us/node/540486 says:

To reduce the overhead of maintaining a workgroup, you should create work-groups that are as large as possible, which means 64 and more work-items. One upper bound is the size of the accessed data set as it is better not to exceed the size of the L1 cache in a single work group.

and

If your kernel code contains the barrier instruction, the issue of work-group size becomes a tradeoff. The more local and private memory each work-item in the work-group requires, the smaller the optimal work-group size is. The reason is that a barrier also issues copy instructions for the total amount of private and local memory used by all work-items in the work-group, since the state of each work-item that arrived at the barrier is saved before proceeding with another work-item.

You have only one barrier in the "local" version, and before that point only 8 variables are used, so not much memory needs to be copied.
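To see how much of that per-access arithmetic is loop-invariant, here is a hypothetical plain-C sketch (names and signature are mine, not from either kernel) of the same access pattern with the division/modulo and row offset hoisted out of the innermost loops; a compiler may or may not manage this on its own through the macro layers:

/* Hypothetical sketch: hoist loop-invariant index arithmetic.
   The caller precomputes row0 (the y part of the a_cache offset) and
   col_reordered (the x part, including the reorder() division/modulo)
   once per (i, j) pair, so the inner loops pay one addition per load
   instead of the ~13 operations of the expanded macro. */
float stencil_sum(const float *input, int input_size_x,
                  int row0, int col_reordered,
                  int shape_y, int shape_x)
{
    float sum = 0.0f;
    for (int y = 0; y < shape_y; ++y) {
        /* one multiply per row instead of per element */
        const float *row = input + (row0 + y) * input_size_x + col_reordered;
        for (int x = 0; x < shape_x; ++x)
            sum += row[x];   /* single indexed load */
    }
    return sum;
}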
Fortran array Index
I am migrating Fortran code to C. I would like to know what the C equivalent of the following statements would be. T is a 2D array of dimension (B,B), and T_indices is an array of indices into T, of dimension (E) with E < B. Ai and Bi are variables.

T(T_indices(:),:) = 1/(Ai/Bi)*T(T_indices(:),:)
T(:,T_indices(:)) = 1/(Ai/Bi)*T(:,T_indices(:))

My proposed C translation:

for (i=0 ; i < E ; i++){
    for (j=0 ; j < B ; j++){
        T[(T_indices[i]-1) * B + j] = 1/(Ai/Bi) * T[(T_indices[i]-1) * B + j];
        T[j * B + (T_indices[i]-1)] = 1/(Ai/Bi) * T[j * B + (T_indices[i]-1)];
    }
}

Is this a correct translation?
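A quick way to check a translation like this is to run it on a small matrix and inspect the result. A minimal, hypothetical sketch follows (B, E, Ai, Bi, the index values, and the row-major layout are assumptions on my part). Note one pitfall: if Ai and Bi are declared as integers in C, 1/(Ai/Bi) truncates, so the factor is computed in floating point here.

#include <stdio.h>

#define B 4   /* matrix dimension (assumption) */
#define E 2   /* number of selected indices (assumption) */

int main(void)
{
    double T[B * B];                 /* row-major: T[r * B + c] */
    int    T_indices[E] = {1, 3};    /* 1-based, as in Fortran */
    double Ai = 2.0, Bi = 5.0;       /* keep these floating-point:      */
    double f  = 1.0 / (Ai / Bi);     /* with ints, Ai/Bi would truncate */

    for (int r = 0; r < B; r++)
        for (int c = 0; c < B; c++)
            T[r * B + c] = 1.0;

    /* T(T_indices(:),:) = f * T(T_indices(:),:)  -- scale selected rows    */
    /* T(:,T_indices(:)) = f * T(:,T_indices(:))  -- scale selected columns */
    for (int i = 0; i < E; i++) {
        int fi = T_indices[i] - 1;   /* Fortran is 1-based, C is 0-based */
        for (int j = 0; j < B; j++) {
            T[fi * B + j] *= f;      /* row    T_indices(i) */
            T[j * B + fi] *= f;      /* column T_indices(i) */
        }
    }

    for (int r = 0; r < B; r++, putchar('\n'))
        for (int c = 0; c < B; c++)
            printf("%6.2f ", T[r * B + c]);
    return 0;
}

One subtlety worth checking against the Fortran: an element where a selected row crosses a selected column is scaled twice by the two successive Fortran statements, and the loop above does the same, so the behaviour matches.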
2D convolution with a with a kernel which is not center originated
I want to do a 2D convolution of an image with a Gaussian kernel which is not centre-originated, given by the equation:

h(x-x', y-y') = exp(-((x-x')^2 + (y-y')^2)/(2*sigma))

Let's say the centre of the kernel is (1,1) instead of (0,0). How should I change my following code for the generation of the kernel and for the convolution?

int krowhalf = krow/2, kcolhalf = kcol/2;
int sigma = 1;

// sum is for normalization
float sum = 0.0;

// generate kernel
for (int x = -krowhalf; x <= krowhalf; x++)
{
    for (int y = -kcolhalf; y <= kcolhalf; y++)
    {
        r = sqrtl((x-1)*(x-1) + (y-1)*(y-1));
        gKernel[x + krowhalf][y + kcolhalf] = exp(-(r*r)/(2*sigma));
        sum += gKernel[x + krowhalf][y + kcolhalf];
    }
}

// normalize the kernel
for (int i = 0; i < krow; ++i)
    for (int j = 0; j < kcol; ++j)
        gKernel[i][j] /= sum;

float **convolve2D(float** in, float** out, int h, int v, float **kernel, int kCols, int kRows)
{
    int kCenterX = kCols / 2;
    int kCenterY = kRows / 2;
    int i, j, m, mm, n, nn, ii, jj;

    for (i = 0; i < h; ++i)                 // rows
    {
        for (j = 0; j < v; ++j)             // columns
        {
            for (m = 0; m < kRows; ++m)     // kernel rows
            {
                mm = kRows - 1 - m;         // row index of flipped kernel
                for (n = 0; n < kCols; ++n) // kernel columns
                {
                    nn = kCols - 1 - n;     // column index of flipped kernel

                    // index of input signal, used for checking boundary
                    ii = i + (m - kCenterY);
                    jj = j + (n - kCenterX);

                    // ignore input samples which are out of bound
                    if (ii >= 0 && ii < h && jj >= 0 && jj < v)
                        //out[i][j] += in[ii][jj] * (kernel[mm+nn*29]);
                        out[i][j] += in[ii][jj] * (kernel[mm][nn]);
                }
            }
        }
    }
}
Since you're using the convolution operator, you have two choices:

1. Use its shift-invariance property. To do so, just calculate the image using a regular convolution filter (better done using either conv2 or imfilter) and then shift the result. You should mind the boundary condition you'd like to employ (see the imfilter properties).

2. Calculate the shifted result specifically. You can do this with loops, as you suggested, or more easily create a non-symmetric kernel and still use imfilter or conv2.

Sample code (MATLAB):

clear();

mInputImage = imread('3.png');
mInputImage = double(mInputImage) / 255;

mConvolutionKernel = zeros(3, 3);
mConvolutionKernel(2, 2) = 1;

mOutputImage01 = conv2(mConvolutionKernel, mInputImage);

mConvolutionKernelShifted = [mConvolutionKernel, zeros(3, 150)];
mOutputImage02 = conv2(mConvolutionKernelShifted, mInputImage);

figure();
imshow(mOutputImage01);

figure();
imshow(mOutputImage02);

The tricky part is knowing how to "crop" the second image on the same axes as the first. Then you'll have a shifted image. You can use any kernel and any function which applies convolution. Enjoy.
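For the loop-based route in C (the language of the question's code), a minimal sketch of generating the off-centre kernel directly is below. It just evaluates the question's formula around the desired centre offset (cx, cy) = (1, 1) and normalizes, leaving convolve2D unchanged; the fixed kernel dimensions are an assumption for the sake of a self-contained example.

#include <math.h>

#define KROW 5
#define KCOL 5

/* Build a Gaussian kernel whose peak sits at offset (cx, cy) from the
   geometric centre, e.g. cx = cy = 1 for a centre of (1,1) instead of (0,0).
   Matches h = exp(-((x-cx)^2 + (y-cy)^2) / (2*sigma)) from the question. */
void make_shifted_gaussian(float gKernel[KROW][KCOL],
                           float sigma, int cx, int cy)
{
    int krowhalf = KROW / 2, kcolhalf = KCOL / 2;
    float sum = 0.0f;

    for (int x = -krowhalf; x <= krowhalf; x++)
        for (int y = -kcolhalf; y <= kcolhalf; y++) {
            float r2 = (float)((x - cx) * (x - cx) + (y - cy) * (y - cy));
            gKernel[x + krowhalf][y + kcolhalf] = expf(-r2 / (2.0f * sigma));
            sum += gKernel[x + krowhalf][y + kcolhalf];
        }

    /* normalize so the kernel sums to 1 */
    for (int i = 0; i < KROW; ++i)
        for (int j = 0; j < KCOL; ++j)
            gKernel[i][j] /= sum;
}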