Loop transformation for data dependence and parallelization - c

I have a nested for loop for iterating over a three-dimensional space (one loop for each dimension). The nested loop forms part of a stencil-based matrix solver which has an operation with a data dependence. I have gone through a lot of links and online material covering the details of loop transformations, and it seems that loop skewing can help me. Though it is pretty straightforward for a 2D grid (consisting of two nested loops), I find it a bit difficult to extend to 3D. The loop looks like this.
# pragma omp parallel num_threads(NTt) default(none) private(i,j,k, mythread, dummy) shared(STA,res_sparse_s,COEFF,p_sparse_s, ap_sparse_s,h_sparse_s,RLL, pipi_sparse, normres_sparse, riri_sparse,riri_sparse2,noemer_sparse, nx, ny, nz, nv, PeriodicBoundaryX, PeriodicBoundaryY, PeriodicBoundaryZ)
mythread = omp_get_thread_num();//0
// loop 1
#pragma omp for reduction(+:pipi_sparse)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
dummy = COEFF[i][j][k][6] * p_sparse_s[i][j][k];
if (PeriodicBoundaryX && i == 1) dummy += COEFF[i][j][k][0] * p_sparse_s[nx ][j][k];
else dummy += COEFF[i][j][k][0] * p_sparse_s[i-1][j][k];
if (PeriodicBoundaryX && i == nx) dummy += COEFF[i][j][k][1] * p_sparse_s[1 ][j][k];
else dummy += COEFF[i][j][k][1] * p_sparse_s[i+1][j][k];
if (PeriodicBoundaryY && j == 1) dummy += COEFF[i][j][k][2] * p_sparse_s[i][ny ][k];
else dummy += COEFF[i][j][k][2] * p_sparse_s[i][j-1][k];
if (PeriodicBoundaryY && j == ny) dummy += COEFF[i][j][k][3] * p_sparse_s[i][ 1][k];
else dummy += COEFF[i][j][k][3] * p_sparse_s[i][j+1][k];
if (PeriodicBoundaryZ && k == 1) dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][nz ];
else dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][k-1];
if (PeriodicBoundaryZ && k == nz) dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][ 1];
else dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][k+1];
ap_sparse_s[i][j][k] = dummy;
pipi_sparse += p_sparse_s[i][j][k] * ap_sparse_s[i][j][k];
// loop 2
#pragma omp for schedule(static, nx/NTt)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
dummy = res_sparse_s[i][j][k];
dummy -= COEFF[i][j][k][7] * RLL[i-1][j][k];
if (PeriodicBoundaryX && i==nx)dummy -= COEFF[i][j][k][8] * RLL[1 ][j][k];
dummy -= COEFF[i][j][k][2] * RLL[i][j-1][k];
if (PeriodicBoundaryY && j==ny) dummy -= COEFF[i][j][k][3] * RLL[i][1 ][k];
dummy -= COEFF[i][j][k][4] * RLL[i][j][k-1];
if (PeriodicBoundaryZ && k==nz) dummy -= COEFF[i][j][k][5] * RLL[i][j][1 ];
RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
// loop 3
#pragma omp for schedule(static, nx/NTt)
for (i=nx; i>=1;i--) for (j=ny; j>=1;j--) for (k=nz; k>=1;k--)
dummy = RLL[i][j][k]*h_sparse_s[i][j][k];
if (PeriodicBoundaryX && i==1) dummy -= COEFF[i][j][k][7] * RLL[nx ][j][k];
dummy -= COEFF[i][j][k][8] * RLL[i+1][j][k];
if (PeriodicBoundaryY && j==1) dummy -= COEFF[i][j][k][2] * RLL[i][ny ][k];
dummy -= COEFF[i][j][k][3] * RLL[i][j+1][k];
if (PeriodicBoundaryZ && k==1) dummy -= COEFF[i][j][k][4] * RLL[i][j][nz ];
dummy -= COEFF[i][j][k][5] * RLL[i][j][k+1];
RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
Loop 1 -> Data dependence of [i][j][k] on [i+1][i-1][j-1][j+1][k-1][k+1] although the values of p_sparse_s are read-only
Loop 2 -> Data dependence of [i][j][k] on [i-1][j-1][k-1]
Loop 3 -> Data dependence of [i][j][k] on [i+1][j+1][k+1]
COEFF[i][j][k][NUM] are just generic coefficients (some constant numbers) defined for each point in the 3D space. There are 9 such coefficients per point, corresponding to the neighboring points, hence COEFF[][][][0], COEFF[][][][1], ..., COEFF[][][][8].
Please find a small code example below that has a data dependence. I have tried to skew the inner k loop with respect to the i and j loops so that the k loop can be vectorised. The problem is that the code gives absolutely correct answers when run serially, but gives weird answers if I enforce parallelism or vectorisation of the inner loop.
typedef double lr;
#define nx 4
#define ny 4
#define nz 4
print3dmatrix(double a[nx+2][ny+2][nz+2])
for(int i=1; i<= nx; i++) {
for(int j=1; j<= ny; j++) {
for(int k=1; k<= nz; k++) {
printf("%f ", a[i][j][k]);
double a[nx+2][ny+2][nz+2];
double b[nx+2][ny+2][nz+2];
// matrix filling
// b is just a copy of a
for(int i=0; i< nx+2; i++) for(int j=0; j< ny+2; j++) for(int k=0; k< nz+2; k++)
a[i][j][k] = rand() % 5;
b[i][j][k] = a[i][j][k];
// loop 1
//#pragma omp parallel for num_threads(1)
for(int i=1; i<= nx; i++) for(int j=1; j<= ny; j++) for(int k=1; k<= nz; k++)
a[i][j][k] = -1*a[i-1][j][k] - 1*a[i][j-1][k] -1 * a[i][j][k-1] + 4 * a[i][j][k];
// loop 2
//#pragma omp parallel for num_threads(1)
for(int i=1; i<= nx; i++)
for(int j=1; j<= ny; j++)
// #pragma omp simd
for(int m=j+1; m<= j+nz; m++)
b[i][j][m-j] = -1*b[i-1][j][m-j] - 1*b[i][j-1][m-j] -1 * b[i][j][m-j-1] + 4 * b[i][j][m-j];
return 0;
see - loop skewing for vectorisation


Clang OpenMP. Find max value in matrix N x N

I need to find max value in matrix using OpenMP. It is my first experience with OpenMP, previously I did this task using pthreads.
I wrote this code but it does not work:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
/*
 * Fill a square nrows x nrows matrix with pseudo-random values in [0, 199].
 *
 * nrows: number of rows (and columns) of the matrix.
 * m:     pointer to the first element of the matrix, stored contiguously in
 *        row-major order; must have room for nrows * nrows ints.
 *
 * Values come from rand(), so the sequence repeats on every run unless the
 * caller seeds the generator with srand().
 */
void MatrixFIller(int nrows, int* m) {
    for (int i = 0; i < nrows; i++) {
        for (int j = 0; j < nrows; j++) {
            m[i * nrows + j] = rand() % 200;
        }
    }
}
#define dimension 9
#define number_of_threads 4
int main() {
int matrix[dimension][dimension];
int local_max=-1;
int final_max=-1;
int j = 0;
MatrixFIller(dimension, &matrix[0][0]);
for (int i = 0; i < dimension; i++) {
for (int j = 0; j < dimension; j++) {
printf("%d\t", matrix[i][j]);
#pragma omp parallel private(local_max)
#pragma omp for
for (j = 0; j < dimension * dimension; j++) {
if (*(matrix + (int)((j) / dimension) * dimension + (j - dimension * ((int)(j / dimension)))) > local_max) {
local_max = *(matrix + (int)((j) / dimension) * dimension + (j - dimension * ((int)((j) / dimension))));
#pragma omp critical
if (local_max > final_max) { final_max = local_max; };
printf("Max value of matrix with dimension %d is %d", dimension, final_max);
The idea is that in the pragma omp for each thread finds its local max, and after that it is compared with the global max value in the pragma omp critical section. Why is this not correct? Thanks!
When entering the parallel region, local_max is uninitialized: the private clause creates variables that are local to each thread and that's it; they are not initialized to any value. If you want them to be initialized with the value local_max had before entering the parallel region, you have to use the firstprivate clause instead.
However, it would actually be better to declare (and initialize) local_max inside the parallel region.
Also, you may have a look at the reduction clause (with the max option), which will make the code even simpler:
#pragma omp parallel for reduction(max:final_max)
for (j = 0; j < dimension * dimension; j++) {
if (*(matrix + (int)((j) / dimension) * dimension + (j - dimension * ((int)(j / dimension)))) > final_max) {
final_max = *(matrix + (int)((j) / dimension) * dimension + (j - dimension * ((int)((j) / dimension))));
Following Laci's comment about the incorrectness of the arithmetic: all of your index calculations look correct but are not easy to read. Since you have a 2D array from the beginning, it is simpler to use two loops, and possibly tell OpenMP to parallelize them both using the collapse clause (and by the way, as far as possible, declare the loop indices within the for(): this avoids always wondering which ones should be declared private):
#pragma omp parallel for reduction(max:final_max) collapse(2)
for (int i = 0; i < dimension; i++) {
for (int j = 0; j < dimension; j++) {
if (matrix[i][j] > final_max) {
final_max = matrix[i][j];

Numbers not randomized after runs

I'm trying to create an openMP program that randomizes double arrays and run the values through the formula: y[i] = (a[i] * b[i]) + c[i] + (d[i] * e[i]) + (f[i] / 2);
If I run the program multiple times I've realised that the Y[] values are the same even though they are supposed to be randomized when the arrays are initialized in the first #pragma omp for . Any Ideas as to why this might be happening?
#include <stdio.h>
#include <stdlib.h>
#define ARRAY_SIZE 10
double randfrom(double min, double max);

/*
 * Return a pseudo-random double in the interval [min, max].
 *
 * The value is rand() scaled by (max - min) / RAND_MAX and shifted by min.
 * Because it relies on rand(), the sequence is identical on every run unless
 * srand() is called with a varying seed, and it must not be called from
 * several threads at once.
 */
double randfrom(double min, double max)
{
    double range = (max - min);
    double div = RAND_MAX / range;
    return min + (rand() / div);
}
int main() {
int i;
double min, max;
int imin, imax;
/*A[10] consists of random number in between 1 and 100
B[10] consists of random number in between 10 and 50
C[10] consists of random number in between 1 and 10
D[10] consists of random number in between 1 and 50
E[10] consists of random number in between 1 and 5
F[10] consists of random number in between 10 and 80*/
#pragma omp parallel
#pragma omp parallel for
for (i = 0; i < ARRAY_SIZE; i++) {
a[i] = randfrom(1, 100);
b[i] = randfrom(10, 50);
c[i] = randfrom(1, 50);
d[i] = randfrom(1, 50);
e[i] = randfrom(1, 5);
f[i] = randfrom(10, 80);
printf("This is the parallel Print\n\n\n");
#pragma omp parallel shared(a,b,c,d,e,f,y) private(i)
#pragma omp for schedule(dynamic) nowait
for (i = 0; i < ARRAY_SIZE; i++) {
/*printf("A[%d]%.2f",i, a[i]);
printf("B[%d]%.2f", i, b[i]);
printf("C[%d]%.2f", i, c[i]);
printf("D[%d]%.2f", i, d[i]);
printf("E[%d]%.2f", i, e[i]);
printf("F[%d]%.2f", i, f[i]);
y[i] = (a[i] * b[i]) + c[i] + (d[i] * e[i]) + (f[i] / 2);
printf("Y[%d]=%.2f\n", i, y[i]);
#pragma omp parallel shared(y, min,imin,max,imax) private(i)
#pragma omp for schedule(dynamic) nowait
for (i = 0; i < ARRAY_SIZE; i++) {
if (i == 0) {
min = y[i];
imin = i;
else {
if (y[i] < min) {
min = y[i];
imin = i;
#pragma omp for schedule(dynamic) nowait
for (i = 0; i < ARRAY_SIZE; i++) {
if (i == 0) {
max = y[i];
imax = i;
else {
if (y[i] > max) {
max = y[i];
imax = i;
printf("min y[%d] = %.2f\nmax y[%d] = %.2f\n", imin, min, imax, max);
return 0;
First of all, I would like to emphasize that OpenMP has significant overheads, so you need a reasonable amount of work in your code, otherwise the overhead is bigger than the gain by parallelization. In your code this is the case, so the fastest solution is to use serial code. However, you mentioned that your goal is to learn OpenMP, so I will show you how to do it.
In the comments on your previous post, @paleonix linked a post (How to generate random numbers in parallel?) which answers your question about random numbers. One of the solutions is to use rand_r.
Your code has a data race when searching for minimum and maximum values of array Y. If you need to find the minimum/maximum value only it is very easy, because you can use reduction like this:
double max=y[0];
#pragma omp parallel for default(none) shared(y) reduction(max:max)
for (int i = 1; i < ARRAY_SIZE; i++) {
if (y[i] > max) {
max = y[i];
But in your case you also need the indices of the minimum and maximum values, so it is a bit more complicated. You have to use a critical section to be sure that other threads cannot change the max, min, imax and imin values while you are updating them. So, it can be done the following way (e.g. for finding the minimum value):
#pragma omp parallel for
for (int i = 0; i < ARRAY_SIZE; i++) {
if (y[i] < min) {
#pragma omp critical
if (y[i] < min) {
min = y[i];
imin = i;
Note that the if (y[i] < min) appears twice, because after the first comparison other threads may change the value of min, so inside the critical region before updating min and imin values you have to check it again. You can do it exactly the same way in the case of finding the maximum value.
Always use your variables at their minimum required scope.
It is also recommended to use the default(none) clause in your OpenMP parallel region, so that you have to explicitly define the sharing attributes of all your variables.
You can fill the array and find its minimum/maximum values in a single loop and print their values in a different serial loop.
If you set min and max before the loop, you can get rid of the extra comparison if (i == 0) used inside the loop.
Putting it together:
// Return min plus rand_r(seed) scaled by (max - min) / RAND_MAX.
// The generator state lives in *seed, which the caller supplies; in the
// snippet that follows, each thread declares its own seed inside the
// parallel region.
// NOTE(review): rand_r is a POSIX function, not ISO C - confirm it is
// declared under the compiler flags in use.
double threadsafe_rand(unsigned int* seed, double min, double max)
double range = (max - min);
double div = RAND_MAX / range;
return min + (rand_r(seed) / div);
In main:
double min=DBL_MAX;
double max=-DBL_MAX;
#pragma omp parallel default(none) shared(a,b,c,d,e,f,y,imin,imax,min,max)
unsigned int seed=omp_get_thread_num();
#pragma omp for
for (int i = 0; i < ARRAY_SIZE; i++) {
a[i] = threadsafe_rand(&seed, 1,100);
b[i] = threadsafe_rand(&seed,10, 50);
c[i] = threadsafe_rand(&seed,1, 10);
d[i] = threadsafe_rand(&seed,1, 50);
e[i] = threadsafe_rand(&seed,1, 5);
f[i] = threadsafe_rand(&seed,10, 80);
y[i] = (a[i] * b[i]) + c[i] + (d[i] * e[i]) + (f[i] / 2);
if (y[i] < min) {
#pragma omp critical
if (y[i] < min) {
min = y[i];
imin = i;
if (y[i] > max) {
#pragma omp critical
if (y[i] > max) {
max = y[i];
imax = i;
// printout
for (int i = 0; i < ARRAY_SIZE; i++) {
printf("Y[%d]=%.2f\n", i, y[i]);
printf("min y[%d] = %.2f\nmax y[%d] = %.2f\n", imin, min, imax, max);
I have updated the code according to @Qubit's and @JérômeRichard's suggestions:
I used the 'Really minimal PCG32 code' / (c) 2014 M.E. O'Neill / from https://www.pcg-random.org/download.html. Note that I do not intend to properly handle the seeding of this simple random number generator. If you would like to do so, please use a complete random number generator library.
I have changed the code to use user defined reductions. Indeed, it makes the code much more efficient, but not really beginner friendly. It would require a very long post to explain it, so if you are interested in the details, please read a book about OpenMP.
I have reduced the number of divisions in threadsafe_rand
The updated code:
#define ARRAY_SIZE 10
// *Really* minimal PCG32 code / (c) 2014 M.E. O'Neill / pcg-random.org
// Licensed under Apache License 2.0 (NO WARRANTY, etc. see website)
// Generator state: `state` is advanced on every draw, `inc` selects the
// stream (its lowest bit is forced to 1 when used).
typedef struct { uint64_t state; uint64_t inc; } pcg32_random_t;

// Draw the next 32-bit value from rng and advance its state.
// Declared `static inline`: a plain `inline` definition in C provides no
// external definition, so a call that is not inlined fails at link time.
static inline uint32_t pcg32_random_r(pcg32_random_t* rng)
{
    uint64_t oldstate = rng->state;
    // Advance internal state
    rng->state = oldstate * 6364136223846793005ULL + (rng->inc|1);
    // Calculate output function (XSH RR), uses old state for max ILP
    uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
    uint32_t rot = oldstate >> 59u;
    return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
}

// Return a double in [min, max], obtained by scaling one 32-bit draw from
// the caller-owned generator `seed`. The reciprocal of UINT32_MAX is
// computed once so that the scaling needs a multiplication, not a division.
static inline double threadsafe_rand(pcg32_random_t* seed, double min, double max)
{
    const double tmp=1.0/UINT32_MAX;
    return min + tmp*(max - min)*pcg32_random_r(seed);
}
struct v{
double value;
int i;
#pragma omp declare reduction(custom_min: struct v: \
omp_out = omp_in.value < omp_out.value ? omp_in : omp_out )\
initializer(omp_priv={DBL_MAX,0} )
#pragma omp declare reduction(custom_max: struct v: \
omp_out = omp_in.value > omp_out.value ? omp_in : omp_out )\
initializer(omp_priv={-DBL_MAX,0} )
int main() {
struct v max={-DBL_MAX,0};
struct v min={DBL_MAX,0};
#pragma omp parallel default(none) shared(a,b,c,d,e,f,y) reduction(custom_min:min) reduction(custom_max:max)
pcg32_random_t seed={omp_get_thread_num()*7842 + time(NULL)%2299, 1234+omp_get_thread_num()};
#pragma omp for
for (int i=0 ; i < ARRAY_SIZE; i++) {
a[i] = threadsafe_rand(&seed, 1,100);
b[i] = threadsafe_rand(&seed,10, 50);
c[i] = threadsafe_rand(&seed,1, 10);
d[i] = threadsafe_rand(&seed,1, 50);
e[i] = threadsafe_rand(&seed,1, 5);
f[i] = threadsafe_rand(&seed,10, 80);
y[i] = (a[i] * b[i]) + c[i] + (d[i] * e[i]) + (f[i] / 2);
if (y[i] < min.value) {
min.value = y[i];
min.i = i;
if (y[i] > max.value) {
max.value = y[i];
max.i = i;
// printout
for (int i = 0; i < ARRAY_SIZE; i++) {
printf("Y[%d]=%.2f\n", i, y[i]);
printf("min y[%d] = %.2f\nmax y[%d] = %.2f\n", min.i, min.value, max.i, max.value);
return 0;

Parallel sections code with nested loops in openmp

I made this parallel code to share the iterations like first and last, first+1 and last-1, ... But I don't know how to improve the code in each of the two parallel sections, because I have an inner loop in each section and I can't think of any way to simplify it. Thanks.
This isn't about which values are stored in x or y. I use this sections design because the requirement is to execute the iterations from 0 to N in the order 0, N, 1, N-1, 2, N-2, ..., but I would like to know if I can optimize the inner loops while maintaining this model.
int x = 0, y = 0,k,i,j,h;
#pragma omp parallel private(i, h) reduction(+:x, y)
#pragma omp sections
#pragma omp section
for (i=0; i<N/2; i++)
C[i] = 0;
for (j=0; j<N; j++)
C[i] += MAT[i][j] * B[j];
x += C[i];
#pragma omp section
for (h=N-1; h>=N/2; h--)
C[h] = 0;
for (k=0; k<N; k++)
C[h] += MAT[h][k] * B[k];
y += C[h];
x = x + y;
Using sections seems like the wrong approach. A pragma omp for seems more appropriate. Also note that you forgot to declare j private.
int x = 0, y = 0,k,i,j;
#pragma omp parallel private(i,j) reduction(+:x, y)
# pragma omp for nowait
for(i=0; i<N/2; i++) {
// local variable to make the life easier on the compiler
int ci = 0;
for(j=0; j<N; j++)
ci += MAT[i][j] * B[j];
x += ci;
C[i] = ci;
# pragma omp for nowait
for(i=N/2; i < N; i++) {
int ci = 0;
for(j=0; j<N; j++)
ci += MAT[i][j] * B[j];
y += ci;
C[i] = ci;
x = x + y;
Also, I'm not sure but if you just want x as your final output, you can simplify the code even further:
int x=0, i, j;
#pragma omp parallel for reduction(+:x) private(i,j)
for(i=0; i < N; ++i)
for(j=0; j < N; ++j)
x += MAT[i][j] * B[j];
The sections construct distributes different tasks to different threads, and each section block marks a different task, so you will not be able to perform the iterations in the order you want. I answered you here:
Distribution of loop iterations between threads with a specific order
But I want to clarify that the requirement to use sections is that each block must be independent of the other blocks.
A section gets only one thread, so you can't make the loops parallel. How about
Make a parallel loop to N at the top level,
then inside each iteration use a conditional to decide whether to accumulate into x,y?
Although @Homer512's solution looks correct to me too.

Actual dIfference between 2 ways of equal parallelism using omp threads

I am trying to parallelize my program using OMP threads .
What I am doing is the following and it works perfectly :
#pragma omp parallel num_threads(threadnum) \
default(none) shared(scoreBoard, nDiag, qlength, dlength) private(nEle, i, si, sj, ai, aj, max)
for (i = 1; i < nDiag; ++i)
if (i <= qlength && i <= dlength) nEle = i;
else if(i <= findmax(qlength, dlength)) nEle = findmin(qlength, dlength);
else nEle = 2*findmin(qlength, dlength) - i + abs(qlength - dlength);
calcfirstele(%si, %sj);
#pragma omp for
for (j = 1; j <= nEle; ++j)
ai = si - j + 1;
aj = sj + j - 1
max = searchmax(ai,aj);
scoreBoard[ai][aj] = max;
But isn't it equal to :
for (i = 1; i < nDiag; ++i)
if (i <= qlength && i <= dlength) nEle = i;
else if(i <= findmax(qlength, dlength)) nEle = findmin(qlength, dlength);
else nEle = 2*findmin(qlength, dlength) - i + abs(qlength - dlength);
calcfirstele(%si, %sj);
#pragma omp parallel num_threads(threadnum) \
default(none) shared(scoreBoard) private(nEle, i, si, sj, ai, aj, max)
#pragma omp for
for (j = 1; j <= nEle; ++j)
ai = si - j + 1;
aj = sj + j - 1
max = searchmax(ai,aj);
scoreBoard[ai][aj] = max;
Why does my program take more time than the serial one when I use the second version, whereas the first version runs a lot faster than the serial one? I can't understand the difference between them.
Your second code is wrong and has undefined behavior.
The reason is that by declaring nEle, si and sj private, you create local (per-thread) versions of these variables without giving them any value. Therefore nEle in particular, which is the upper bound of your for loop, can have any value whatsoever, likely increasing the length of your computation quite dramatically.
In order to fix your code, the snippet you gave should look like this (with a few simplifications, not tested obviously):
// Serial outer loop: nEle, si and sj are computed once per i by the
// encountering thread and are then read (shared by default) by every thread
// of the inner parallel loop, so they must NOT be listed as private there.
for (int i = 1; i < nDiag; ++i) {
    if (i <= qlength && i <= dlength)
        nEle = i;
    else if(i <= findmax(qlength, dlength))
        nEle = findmin(qlength, dlength);
    else
        nEle = 2*findmin(qlength, dlength) - i + abs(qlength - dlength);
    // NOTE(review): the question wrote `%si, %sj`, which is not valid C;
    // presumably the addresses of si and sj are meant - confirm.
    calcfirstele(&si, &sj);
    // ai, aj and max are scratch values written by every iteration, so each
    // thread needs its own copy.
    #pragma omp parallel for num_threads(threadnum) private(ai, aj, max)
    for (int j = 1; j <= nEle; ++j) {
        ai = si - j + 1;
        aj = sj + j - 1;
        max = searchmax(ai,aj);
        scoreBoard[ai][aj] = max;
    }
}

OpenMP with C program

I am having a hard time using OpenMP with C to parallelize this method. I was wondering if anyone could help and possibly tell me what is wrong with my parallelization of this method.
void blur(float **out, float **in) {
// assumes "padding" to avoid messy border cases
int i, j, r, c;
float tmp, term;
term = 1.0 / 157.0;
#pragma omp parallel num_threads(8)
#pragma omp for private(r,c)
for (i = 0; i < N-4; i++) {
for (j = 0; j < N-4; j++) {
tmp = 0.0;
for (r = 0; r < 5; r++) {
for (c = 0; c < 5; c++) {
tmp += in[i+r][j+c] * mask[r][c];
out[i+2][j+2] = term * tmp;
You shall either declare tmp inside the loop:
// at line 11:
float tmp = 0.0;
or specify tmp as a private variable:
// at line 7:
#pragma omp for private(r,c,tmp)
Otherwise it will be treated as a variable shared among the threads.
