Encountering race condition using openMP in C - c

Here is the section of my code where I am running into a race condition. I'm just trying to copy the values of the matrix "matxOriginal" into the matrix "cluster0", but when the code is run on multiple threads using OpenMP, the sample values printed for "cluster0" are different from those in "matxOriginal".
Both matrices have been dynamically allocated, and both are 698 x 9 matrices.
Also, I would like to keep the use and purpose of the "cluster0IndexCounter" variable for a different use outside of what I'm posting. So if you can, please let me know how to make this work.
/* Allocates a matrix as a table of row pointers, each row holding col doubles,
 * and returns the table.
 * NOTE(review): the braces and the body of the j-loop are not present in this
 * excerpt, so how the elements are filled cannot be confirmed from here. */
double **matxGen(int row, int col)
int i=0, j=0;
double **m;
/* NOTE(review): this reserves row*col pointers although only m[0..row-1] are
 * assigned below; row*sizeof(double *) would be enough. Result is unchecked. */
m=(double **) malloc(row*col*sizeof(double *));
for (i; i<row; i++)
/* One row of col doubles; the result is not checked for NULL. */
m[i]=(double *) malloc(col*sizeof(double ));
for (j=0; j<col; j++)
return m;
/*
 * emptyMatxGen - allocate a row x col matrix of doubles with every element 0.0.
 *
 * The matrix is a table of `row` pointers, each pointing at its own array of
 * `col` doubles, so elements are addressed as m[i][j].
 *
 * row, col  matrix dimensions; both must be positive.
 *
 * Returns the row-pointer table, or NULL if a dimension is not positive or
 * any allocation fails (nothing is leaked in that case). The caller owns the
 * result and must free every m[i] and then m.
 *
 * Changes from the excerpt: the pointer table is sized for `row` pointers
 * instead of row*col, every allocation is checked, and rows come from calloc
 * so the elements have a defined value. The original statement that filled
 * the elements is not visible in the excerpt; zero is assumed from the
 * function's name.
 */
double **emptyMatxGen(int row, int col)
{
    if (row <= 0 || col <= 0)
        return NULL;

    double **m = malloc((size_t)row * sizeof *m);
    if (m == NULL)
        return NULL;

    for (int i = 0; i < row; i++) {
        m[i] = calloc((size_t)col, sizeof *m[i]);
        if (m[i] == NULL) {
            /* Unwind the rows allocated so far before reporting failure. */
            while (i-- > 0)
                free(m[i]);
            free(m);
            return NULL;
        }
    }
    return m;
}
/* Driver from the question: creates matxOriginal, matx and cluster0, opens an
 * OpenMP parallel-for over x (with an inner j loop) and prints five values of
 * cluster0[9].
 * NOTE(review): the braces and the body of the j-loop are not present in this
 * excerpt, so the copy into cluster0 itself cannot be reviewed from here. */
int main()
/* NOTE(review): this declaration ends with a comma; its continuation is not
 * shown (presumably it declares cluster0IndexCounter, which the pragma below
 * names but no visible line declares). */
int x, i, j, tid, row=699, col=9,
double **matxOriginal, **matx, **cluster0;
matxOriginal=matxGen(row, col);
matx=matxGen(row, col);
/* Aliases row 99 of matx; no copy is made. */
double *centerPoint0=matx[99];
cluster0=emptyMatxGen(row, col);
/* x, j and tid are private per thread; cluster0IndexCounter is a sum reduction.
 * NOTE(review): under reduction(+) each thread updates its own private copy of
 * cluster0IndexCounter, initialised to 0. If the unseen loop body uses that
 * counter as the destination row index into cluster0, every thread would start
 * writing at row 0 -- confirm against the full loop body. */
#pragma omp parallel for private(x, j, tid) schedule(static) reduction(+:cluster0IndexCounter)
/* Bounds are hard-coded: x covers 0..698 (row-1) and j covers 0..8 (col-1). */
for (x=0; x<=698; x++)
for (j=0; j<9; j++)
printf("cluster0: %f, %f, %f, %f, %f\n", cluster0[9][0], cluster0[9][1], cluster0[9][2], cluster0[9][3], cluster0[9][4]);
return 0;


Perform a triple pointer (C) offloading to NVIDIA GPU with OpenMP

I've been working with a heat transfer code. This code, basically, establishes the initial conditions for a cube and all of its faces. The six faces start at different temperatures, and the code then calculates how the temperature changes across the cube due to the heat transfer between them. Now, I've been trying to offload it to an NVIDIA GPU using OpenMP directives. The code initializes the face conditions using a triple pointer, which is essentially an array of arrays of arrays. Reading a little about this, I've learned that such pointer-based 3D structures are not easily offloaded to GPUs. So my question is whether it is possible to offload these triple-pointer arrays to the GPU, or whether I have to use a flat array instead.
Here I leave the code, which is still working on CPU. Parallel version of the code.
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define N 25 //This defines the number of points per dimension (Cube = N*N*N)
#define NUM_STEPS 6000 //This is the number of simulations time steps
/* writeFile: write one snapshot of the simulation to "heat_<iteration>.txt".
 *
 * The file is CSV: the header line "x,y,z,T" followed by one line per grid
 * point, visiting i, then j, then k over [0, N).
 *
 * iteration  time-step number; used only to build the file name
 * data       N x N x N grid, addressed as data[i][j][k]
 *
 * If the file name cannot be built or the file cannot be opened, a message is
 * printed to stderr and nothing is written. Compared with the excerpt, the
 * name is built with a single bounded snprintf, fopen is checked before the
 * stream is used, and the stream is closed (the excerpt showed no fclose). */
void writeFile(int iteration, double*** data){
    char filename[50];
    int len = snprintf(filename, sizeof filename, "heat_%d.txt", iteration);
    if (len < 0 || (size_t)len >= sizeof filename) {
        fprintf(stderr, "writeFile: cannot build file name for iteration %d\n", iteration);
        return;
    }

    FILE *fp = fopen(filename, "w");
    if (fp == NULL) {
        perror(filename);
        return;
    }

    fprintf(fp, "x,y,z,T\n");
    for(int i=0; i<N; i++){
        for(int j=0; j<N; j++){
            for(int k=0; k<N; k++){
                fprintf(fp, "%d,%d,%d,%f\n", i, j, k, data[i][j][k]);
            }
        }
    }

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fp) != 0) {
        perror(filename);
    }
}
void compute_heat_transfer(double ***arrayOld, double ***arrayNew){
int i,j,k;
/*Compute steady-state solution*/
for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
/*if(nsteps % 100 == 0){
writeFile(nsteps, arrayOld);
#pragma omp parallel shared(arrayNew, arrayOld) private(i,j,k)
#pragma omp for
for(i=1; i<N-1; i++){
for(j=1; j<N-1; j++){
//This is the 6-neighbor stencil computation
arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] + arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
#pragma omp for
for(i=1; i<N-1; i++){
for(j=1; j<N-1; j++){
for(k=1; k<N-1; k++){
arrayOld[i][j][k] = arrayNew[i][j][k];
/* Entry point of the heat-transfer example: allocates two N x N x N grids as
 * nested pointer arrays, sets boundary and interior values of arrayOld, times
 * compute_heat_transfer() and prints the elapsed time.
 * NOTE(review): closing braces and several short lines (inner loops, some
 * NULL checks, the bodies of the final loops) appear to be missing from this
 * excerpt; comments below describe only what is visible. */
int main (int argc, char *argv[]) {
int i,j,k,nsteps;
double mean;
double ***arrayOld; //Variable that will hold the data of the past iteration
double ***arrayNew; //Variable where newly computed data will be stored
/* First level: N pointers to planes for each grid. */
arrayOld = (double***)malloc(N*sizeof(double**));
arrayNew = (double***)malloc(N*sizeof(double**));
/* NOTE(review): only arrayOld is tested in the visible condition. */
if(arrayOld== NULL){
fprintf(stderr, "Out of memory");
for(i=0; i<N;i++){
/* Second level: N row pointers per plane. */
arrayOld[i] = (double**)malloc(N*sizeof(double*));
arrayNew[i] = (double**)malloc(N*sizeof(double*));
fprintf(stderr, "Out of memory");
for(int j=0;j<N;j++){
/* Third level: N doubles per row. */
arrayOld[i][j] = (double*)malloc(N*sizeof(double));
arrayNew[i][j] = (double*)malloc(N*sizeof(double));
fprintf(stderr,"Out of memory");
/*Set boundary values and compute mean boundary values*/
mean = 0.0;
/* Each of the six loops below sets one face of arrayOld and adds the values
 * to mean. NOTE(review): each statement uses an index (j or k) that no visible
 * loop drives; presumably an inner loop line is missing in every case. */
for(i=0; i<N; i++){
/* Face k = 0. */
arrayOld[i][j][0] = 100.0;
mean += arrayOld[i][j][0];
for(i=0; i<N; i++){
/* Face k = N-1. */
arrayOld[i][j][N-1] = 100.0;
mean += arrayOld[i][j][N-1];
for(j=0; j<N; j++){
/* Face i = 0. */
arrayOld[0][j][k] = 100.0;
mean += arrayOld[0][j][k];
for(j=0; j<N; j++){
/* Face i = N-1. */
arrayOld[N-1][j][k] = 100.0;
mean += arrayOld[N-1][j][k];
for(i=0; i<N; i++){
/* Face j = 0. */
arrayOld[i][0][k] = 100.0;
mean += arrayOld[i][0][k];
for(i=0; i<N; i++){
/* Face j = N-1: the only face set to 0.0 instead of 100.0. */
arrayOld[i][N-1][k] = 0.0;
mean += arrayOld[i][N-1][k];
/* Divide the accumulated sum by the number of face points, 6 * N * N. */
mean /= (6.0 * (N*N));
/*Initialize interior values*/
for(i=1; i<N-1; i++){
for(j=1; j<N-1; j++){
for(k=1; k<N-1;k++){
arrayOld[i][j][k] = mean;
/* Wall-clock timing of the solver only. */
double tdata = omp_get_wtime();
compute_heat_transfer(arrayOld, arrayNew);
tdata = omp_get_wtime()-tdata;
printf("Execution time was %f secs\n", tdata);
/* NOTE(review): the bodies of these loops are not shown; presumably they free
 * the rows and planes allocated above. */
for(i=0; i<N;i++){
for(int j=0;j<N;j++){
return 0;
Use variable length arrays with dynamic storage:
double (*arr)[N][N] = calloc(N, sizeof *arr);
Use good old arr[i][j][k] syntax
double *flat = (double*)arr;
Note that this conversion is not guaranteed by the C standard to work.
Though it will very likely work on all platforms capable of using GPUs.
Passing to functions.
VLAs can be parameters of the functions.
void fun(int n, double arr[n][n][n]) {
Example usage (calling the function `fun` declared above):
fun(N, arr);
VLA friendly variant of compute_heat_transfer():
/* compute_heat_transfer (VLA variant): run NUM_STEPS iterations over the
 * interior of an n x n x n grid stored as one contiguous 3-D array.
 *
 * Each step writes into arrayNew, for every interior point (indices 1..n-2 on
 * each axis), the average of its six axis-aligned neighbours in arrayOld, and
 * then copies the interior of arrayNew back into arrayOld. Boundary points
 * are never written.
 *
 * n         number of points per dimension
 * arrayOld  current state; holds the result on return
 * arrayNew  scratch grid of the same shape; `restrict` promises the two
 *           grids do not overlap
 *
 * Compared with the excerpt, the disabled snapshot code is a terminated
 * comment and every loop has its braces, so the function is well-formed. Loop
 * indices are declared in the loops; collapse(3) makes all three private. */
void compute_heat_transfer(int n, double arrayOld[restrict n][n][n], double arrayNew[restrict n][n][n]) {
    /*Compute steady-state solution*/
    for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
        /* Optional snapshot every 100 steps (disabled):
         * if(nsteps % 100 == 0){
         *     writeFile(nsteps, arrayOld);
         * }
         */
        #pragma omp parallel for collapse(3)
        for(int i=1; i<n-1; i++){
            for(int j=1; j<n-1; j++){
                for(int k=1; k<n-1; k++){
                    //This is the 6-neighbor stencil computation
                    arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] + arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
                                         arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
                }
            }
        }
        /* The parallel region above has ended, so arrayNew is complete here. */
        #pragma omp parallel for collapse(3)
        for(int i=1; i<n-1; i++){
            for(int j=1; j<n-1; j++){
                for(int k=1; k<n-1; k++){
                    arrayOld[i][j][k] = arrayNew[i][j][k];
                }
            }
        }
    }
}
The keyword restrict in arrayNew[restrict n][n][n] lets the compiler assume that arrayNew and arrayOld do not alias, which should allow more aggressive optimizations.
Note that arrayNew and arrayOld are pointers to arrays. So rather than copying arrayNew to arrayOld, you could simply swap those pointers, forming a simple kind of double buffering. That should make the code even faster.

Matrix traversal optimization

Given a n x n matrix of ints, I have an algorithm that at each step of a for loop of range n traverses and modifies the matrix. Here is the code:
typedef int **Matrix;
/* Floyd-Warshall style pass over an n x n matrix: for every k, each dist[i][j]
 * is compared with the path through k, dist[k][j] + dist[i][k].
 * NOTE(review): braces, the statement guarded by the `if`, and the body of the
 * trailing i-loop are not present in this excerpt. */
void floyd_slow(Matrix dist, int n)
/* Candidate distance computed inside the condition below. */
int d;
for (int k=0; k<n; k++)
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
/* dist[i][k] is read from column k of the matrix on every inner iteration. */
if ((d=dist[k][j]+dist[i][k])<dist[i][j])
for (int i=0; i<n; i++)
The matrix is built as an array of n*n ints, and for each row index i, dist[i] is the address of row i. (The above code is the standard way to write the Floyd-Warshall algorithm, but my question is not about this algorithm itself.)
The following drawing tries to explain how the matrix is processed:
At each step of the loop of index k, the underlying matrix is traversed line by line.
Now, consider the following transformation of the previous code:
/* One pass over the whole n x n matrix, comparing each dist[i][j] with
 * rowk[j] + colk[i], where rowk and colk are supplied by the caller.
 * NOTE(review): braces and the statement guarded by the `if` are not present
 * in this excerpt. */
void relax(Matrix dist, int n, int* rowk, int* colk)
/* Candidate distance computed inside the condition below. */
int d;
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
if ((d=rowk[j]+colk[i])<dist[i][j])
/* Variant of floyd_slow that, for each k, hands row k and a temporary array
 * colk to relax() instead of indexing dist[i][k] directly.
 * NOTE(review): braces, the failure branch after the malloc check and the
 * bodies of both i-loops are not present in this excerpt. */
void floyd_fast(Matrix dist, int n)
int i, k;
/* Scratch buffer of n ints; the surrounding text says column k is copied
 * into it at each step. */
int* colk=malloc(n*sizeof(int));
if (!colk)
for (k=0; k<n; k++)
/* Row k is used in place: rowk aliases the matrix row, no copy is made. */
int* rowk =dist[k];
/* Body not shown; per the surrounding text this fills colk from column k. */
for (i=0; i<n; i++)
relax(dist, n, rowk, colk);
/* Body not shown. */
for (i=0; i<n; i++)
At every step, the elements of the matrix are accessed in the same order as in the previous algorithm.
The only difference is that at each step k in the exterior loop, the column of index k is copied into a temporary array, cf. the colk malloc above. It results that the element at position (i, k) is read from this array instead of being accessed directly from the matrix.
This innocuous change leads in fact to a significant speedup: you gain a factor 4 if n=1000.
I know that in C it's faster to traverse an array in row-major order, but that is already the case in both versions here. So I was wondering why the speedup is so large. Is it related to cache optimisation?
Complete code
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
typedef int **Matrix;
/* Floyd-Warshall style pass over an n x n matrix: for every k, each dist[i][j]
 * is compared with the path through k, dist[k][j] + dist[i][k].
 * NOTE(review): braces, the statement guarded by the `if`, and the body of the
 * trailing i-loop are not present in this excerpt. */
void floyd_slow(Matrix dist, int n)
/* Candidate distance computed inside the condition below. */
int d;
for (int k=0; k<n; k++)
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
/* dist[i][k] is read from column k of the matrix on every inner iteration. */
if ((d=dist[k][j]+dist[i][k])<dist[i][j])
for (int i=0; i<n; i++)
/* One pass over the whole n x n matrix, comparing each dist[i][j] with
 * rowk[j] + colk[i], where rowk and colk are supplied by the caller.
 * NOTE(review): braces and the statement guarded by the `if` are not present
 * in this excerpt. */
void relax(Matrix dist, int n, int* rowk, int* colk)
/* Candidate distance computed inside the condition below. */
int d;
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
if ((d=rowk[j]+colk[i])<dist[i][j])
/* Variant of floyd_slow that, for each k, hands row k and a temporary array
 * colk to relax() instead of indexing dist[i][k] directly.
 * NOTE(review): braces, the failure branch after the malloc check and the
 * bodies of both i-loops are not present in this excerpt. */
void floyd_fast(Matrix dist, int n)
int i, k;
/* Scratch buffer of n ints; the surrounding text says column k is copied
 * into it at each step. */
int* colk=malloc(n*sizeof(int));
if (!colk)
for (k=0; k<n; k++)
/* Row k is used in place: rowk aliases the matrix row, no copy is made. */
int* rowk =dist[k];
/* Body not shown; per the surrounding text this fills colk from column k. */
for (i=0; i<n; i++)
relax(dist, n, rowk, colk);
/* Body not shown. */
for (i=0; i<n; i++)
/* Prints the n x n matrix to stdout, each element followed by a space, in
 * row-major order.
 * NOTE(review): braces are missing in this excerpt; any per-row line break is
 * not shown. */
void print(Matrix dist, int n)
int i, j;
for (i=0; i<n; i++)
for (j=0; j<n; j++)
printf("%d ", dist[i][j]);
/* Runs floyd_slow on dist and reports the processor time it took, in seconds,
 * on stderr. */
void test_slow(Matrix dist, int n)
/* clock() value taken before the run; the difference is scaled below. */
clock_t now=clock();
floyd_slow(dist, n);
// print(dist, n);
/* NOTE(review): p is not used by any visible line; presumably a clean-up
 * statement that consumed it is missing from this excerpt. */
int *p=dist[0];
fprintf(stderr, "Elapsed slow: %.2f s\n",
(double) (clock() - now) / CLOCKS_PER_SEC);
/* Runs floyd_fast on dist and reports the processor time it took, in seconds,
 * on stderr. */
void test_fast(Matrix dist, int n)
/* clock() value taken before the run; the difference is scaled below. */
clock_t now=clock();
floyd_fast(dist, n);
// print(dist, n);
/* NOTE(review): p is not used by any visible line; presumably a clean-up
 * statement that consumed it is missing from this excerpt. */
int *p=dist[0];
fprintf(stderr, "Elapsed fast: %.2f s\n",
(double) (clock() - now) / CLOCKS_PER_SEC);
/* Allocates an array of n*n ints and returns it.
 * NOTE(review): braces, the failure branch after the malloc check and the body
 * of the fill loop are not present in this excerpt, so the values stored
 * cannot be confirmed from here. */
int * data(int n)
/* Total element count. This local N is unrelated to any macro of that name. */
int N=n*n;
int *t=malloc(N*sizeof(int));
if (!t)
for (int i=0; i<N;i++)
return t;
/* Builds a Matrix (table of n row pointers) over a private copy of the n*n
 * ints in t and returns the table. t itself is only read.
 * NOTE(review): braces, the failure branch and the body of the final loop are
 * not present in this excerpt; per the surrounding text each mat[i] is set to
 * the address of row i. */
Matrix getMatrix(int *t, int n)
int N=n*n;
/* tt: contiguous storage for the copy; mat: the row-pointer table. */
int *tt=malloc(N*sizeof(int));
Matrix mat=malloc(n*sizeof(int*));
if (!tt || !mat)
memcpy(tt, t, N*sizeof(int));
for (int i=0; i<n;i++)
return mat;
/* Benchmark driver: builds two 1000 x 1000 matrices and times floyd_slow and
 * floyd_fast on them. */
int main(void)
int n=1000;
/* NOTE(review): t is not used by any visible line; each getMatrix call below
 * receives the result of a fresh data(n) call instead. The two runs see the
 * same input only if data() is deterministic, and the arrays returned by
 * data(n) are never freed in the visible code -- confirm the intent. */
int *t=data(n);
Matrix mat_slow=getMatrix(data(n), n);
Matrix mat_fast=getMatrix(data(n), n);
test_slow(mat_slow, n);
test_fast(mat_fast, n);
return 0;
Elapsed slow: 0.58 s
Elapsed fast: 0.14 s
Compilation options:
rm floyd
gcc -Wall -O3 -march=native -ffast-math -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.c -o floyd -lm

K-means openmp parallelisation internal compiler error

I'm having troubles trying to parallelize the following function of the k-means algorithm.
The original code is this one:
/* Serial version: for each of the N points (D coordinates each, stored at
 * Punts[i*D+k]) computes a squared distance to each of the C centroids
 * (Centroides[j*D+k]), picks one centroid via min(), and records the point
 * index in that centroid's slice of PC.
 * NOTE(review): braces and some short statements are missing from this
 * excerpt; in particular no visible line changes Sep[m] after the store. */
void PointsToCentroides(int *Punts, int N, int D, int C, double *Centroides, int *PC, int *Sep){
int i, j, k, m;
/* NOTE(review): dist is int while Centroides is double, so each += below
 * converts a double result back to int. */
int dist[C];
for(i=0; i<C; i++)Sep[i]=0;
for(i=0; i<N; i++){
for(j=0; j<C; j++){
dist[j] = 0;
for(k=0; k<D; k++){
dist[j] += (Punts[i*D+k] - Centroides[j*D+k]) * (Punts[i*D+k] - Centroides[j*D+k]);
/* min() is defined elsewhere; presumably it returns the index of the smallest
 * of the C entries -- confirm. */
m = min(dist, C);
/* Centroid m owns PC[m*N .. m*N+N-1]; Sep[m] selects the slot within it. */
PC[m*N+Sep[m]] = i;
I've tried to turn it into a parallel for with a critical section around the last two instructions, but that makes the program run much slower.
The parallel for code is the following:
/* Parallel-for version: the loop over the N points is split across threads,
 * and the store into PC is placed under an omp critical directive.
 * NOTE(review): braces and some short statements are missing from this
 * excerpt, so the exact extent of the critical section is not visible. */
void PointsToCentroides(int *Punts, int N, int D, int C, double *Centroides, int *PC, int *Sep){
/* NOTE(review): j and k are declared outside the parallel loop and are not
 * listed in a private clause, so all threads share them; only the loop
 * variable i is privatised. The inner loops therefore race on j and k.
 * Declaring them inside the loop body (as dist and m are) would avoid this. */
int i, j, k;
for(i=0; i<C; i++)Sep[i]=0;
#pragma omp parallel for shared(PC, Sep)
for(i=0; i<N; i++){
/* Declared inside the loop body, so each iteration has its own dist and m. */
int dist[C], m;
for(j=0; j<C; j++){
dist[j] = 0;
for(k=0; k<D; k++){
dist[j] += (Punts[i*D+k] - Centroides[j*D+k]) * (Punts[i*D+k] - Centroides[j*D+k]);
/* min() is defined elsewhere; presumably it returns the index of the smallest
 * of the C entries -- confirm. */
m = min(dist, C);
/* Serialises updates of the shared PC/Sep bookkeeping. */
#pragma omp critical
PC[m*N+Sep[m]] = i;
I've also tried to make a struct so that each thread has its own data set in which to save the points it works with, but the compiler gives an internal error message. The code is the following:
/* Per-thread-buffer version: each thread gets its own set of C centroid
 * records in ptc[], counts are accumulated per thread, and a final parallel
 * region writes into PC.
 * NOTE(review): braces and some short statements are missing from this
 * excerpt; comments below describe only what is visible. */
void PointsToCentroides(int *Punts, int N, int D, int C, double *Centroides, int *PC, int *Sep){
int i;
for(i=0; i<C; i++){
Sep[i] = 0;
/* One record per centroid: room for N point indices plus a counter. p has a
 * run-time length (N), so this is a variably modified struct member. */
struct centroids{
int p[N];
int count;
/* One kt per thread, each holding C centroid records. NOTE(review): as an
 * automatic variable this occupies roughly threads * C * (N+1) ints of stack. */
struct kt{
struct centroids cent[C];
} ptc[omp_get_max_threads()];
/* Every thread zeroes the counters of its own ptc entry. */
#pragma omp parallel private(i)
for(i=0; i<C; i++) ptc[omp_get_thread_num()].cent[i].count = 0;
#pragma omp parallel for //shared(PC, Sep)
for(i=0; i<N; i++){
/* All scratch variables are local to the iteration, so nothing is shared. */
int dist[C], m, j, k;
for(j=0; j<C; j++){
dist[j] = 0;
for(k=0; k<D; k++){
dist[j] += (Punts[i*D+k] - Centroides[j*D+k]) * (Punts[i*D+k] - Centroides[j*D+k]);
/* min() is defined elsewhere; presumably it returns the index of the smallest
 * of the C entries -- confirm. */
m = min(dist, C);
//PC[m*N+Sep[m]] = i;
/* NOTE(review): only the counter is updated in the visible lines; no visible
 * statement stores i into cent[m].p[]. */
ptc[omp_get_thread_num()].cent[m].count ++;
#pragma omp parallel private(i)
int p = 0;
for(i=0; i<C; i++){
#pragma omp critical
/* NOTE(review): reads p[count], i.e. the slot one past the last counted
 * entry, and Sep[i] is not advanced in the visible lines -- confirm. */
PC[i*N+Sep[i]] = ptc[omp_get_thread_num()].cent[i].p[ptc[omp_get_thread_num()].cent[i].count];
The error message that the compiler is giving is the following one:
user#pc:~/Descargas/HG$ gcc -O3 -g hg_k-means.c -fopenmp
hg_k-means.c: En la función ‘PointsToCentroides._omp_fn.0’:
hg_k-means.c:92:10: error interno del compilador: en emit_move_insn, en expr.c:3698
#pragma omp parallel private(i)
0x75201a emit_move_insn(rtx_def*, rtx_def*)
0x7409f3 extract_bit_field_1
0x7411f5 extract_bit_field(rtx_def*, unsigned long, unsigned long, int, rtx_def*, machine_mode, machine_mode, bool)
0x74e433 expand_expr_real_1(tree_node*, rtx_def*, machine_mode, expand_modifier, rtx_def**, bool)
0x758f7a store_expr_with_bounds(tree_node*, rtx_def*, int, bool, bool, tree_node*)
0x759bce expand_assignment(tree_node*, tree_node*, bool)
0x66d1d0 expand_gimple_stmt_1
0x66d1d0 expand_gimple_stmt
0x66e79f expand_gimple_basic_block
0x673906 execute
Por favor, envíe un informe completo de errores,
con el código preprocesado si es apropiado.
Please include the complete backtrace with any bug report.
Véase <https://gcc.gnu.org/bugs/> para instrucciones.
I'm a bit new to OpenMP parallelisation methods, so any help is welcomed.

Openmp segmentation fault when setting the number of threads through console

I have the following matrix multiplication code that uses OpenMP:
#include <stdio.h>
#include <omp.h>
#include <math.h>
#define N 1000
/* Fills two N x N matrices with the same formula, multiplies them into c with
 * an OpenMP parallel-for over the rows, and prints the elapsed time t2-t1.
 * NOTE(review): braces and the statements that assign t1 and t2 are not
 * present in this excerpt. */
int main()
long int i, j, k;
//long int N = atoi(argv[1]);
double t1, t2;
/* NOTE(review): three N x N arrays of double with automatic storage; with
 * N = 1000 that is 3 * 8,000,000 bytes, which exceeds typical default stack
 * limits (see the answer further down in this thread). */
double a[N][N],b[N][N],c[N][N];
for (i=0; i<N; i++)
for (j=0; j<N; j++)
a[i][j]=b[i][j]=log(i*j/(i*j+1.)+1) +exp(-(i+j)*(i+j+1.));
/* Rows of c are divided among threads; each thread has private i, j, k. */
#pragma omp parallel for shared(a, b, c) private(i, j, k)
for(i=0; i<N; i++){
for(j=0; j<N; j++){
c[i][j] = 0.0;
for(k=0; k<N; k++) c[i][j]+=a[i][k]*b[k][j];
printf("Time=%lf\n", t2-t1);
Now I want to set the number of threads which I want through command line. I do that by using
#include <limits.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define N 1000
/*
 * Multiplies two N x N matrices with OpenMP, using the thread count given as
 * the first command-line argument, and prints the time taken by the multiply.
 *
 * Usage: prog <num_threads>     (num_threads must be a positive integer)
 * Returns 0 on success, 1 if the argument is missing or invalid.
 *
 * Changes from the excerpt:
 *  - argv is declared as char ** (it was char **[], so atoi(argv[1]) received
 *    the wrong pointer type);
 *  - argc is checked before argv[1] is read, and the value is parsed with
 *    strtol so non-numeric or non-positive input is rejected instead of being
 *    passed to num_threads();
 *  - the three matrices have static storage duration; as automatic variables
 *    they need about 24 MB of stack with N = 1000, which is what caused the
 *    segmentation fault.
 * The statements assigning t1 and t2 are not visible in the excerpt; they are
 * placed around the multiplication here.
 */
int main(int argc, char **argv)
{
    static double a[N][N], b[N][N], c[N][N];
    long int i, j, k;
    //long int N = atoi(argv[1]);
    double t1, t2;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <num_threads>\n", argv[0]);
        return 1;
    }

    char *end;
    long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested < 1 || requested > INT_MAX) {
        fprintf(stderr, "invalid thread count: %s\n", argv[1]);
        return 1;
    }
    int t = (int)requested;

    for (i=0; i<N; i++) {
        for (j=0; j<N; j++) {
            a[i][j]=b[i][j]=log(i*j/(i*j+1.)+1) +exp(-(i+j)*(i+j+1.));
        }
    }

    t1 = omp_get_wtime();
    /* Rows of c are divided among t threads; i, j, k are private per thread. */
    #pragma omp parallel for shared(a, b, c) private(i, j, k) num_threads(t)
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            c[i][j] = 0.0;
            for(k=0; k<N; k++) c[i][j]+=a[i][k]*b[k][j];
        }
    }
    t2 = omp_get_wtime();

    printf("Time=%lf\n", t2-t1);
    return 0;
}
Everything is fine, except for one crucial thing: when I try to compute the product of matrices with a dimension greater than roughly 500, I get the error "segmentation fault". Could someone explain the reason for this error?
I don't know anything about openmp, but you are most assuredly blowing up your stack. Default stack space will vary from system to system, but with N == 1000, you are trying to put three 2D arrays totaling 3 million doubles on the stack. Assuming a double is 8 bytes, that's 24 million bytes, or just shy of 22.9MB. There can't be many systems allowing that kind of stack space. Instead, I'd recommend trying to grab that amount of memory from the heap. Something like this:
/* Heap-based replacement for the three N x N automatic arrays: each matrix
 * becomes a table of N row pointers, each row holding N doubles. Element
 * access stays a[i][j]. */
//double a[N][N],b[N][N],c[N][N];
double **a, **b, **c;
a = malloc(sizeof(double*) * N);
b = malloc(sizeof(double*) * N);
c = malloc(sizeof(double*) * N);
/* NOTE(review): braces are missing in this excerpt; presumably all three row
 * allocations belong to the loop body. None of the malloc results is checked. */
for (i=0; i<N; i++)
a[i] = malloc(sizeof(double) * N);
b[i] = malloc(sizeof(double) * N);
c[i] = malloc(sizeof(double) * N);
// do your calculations
/* NOTE(review): the body of this loop is not shown; presumably it frees the
 * rows allocated above. */
for (i=0; i<N; i++)
I've verified on my machine at least, that with N == 1000 I crash right out of the gate with EXC_BAD_ACCESS when trying to place those arrays on the stack. When I dynamically allocate the memory instead as shown above, I get no seg faults.

how to build function that multiply matrices created via struct in C

I am supposed to build a matrix_mul function that displays the result of multiplying two matrices that were created via a struct. My problem is that my function does not multiply the values inside the matrices; instead it returns an integer.
/* Integer matrix stored as a table of row pointers. */
typedef struct {
    int   rows;   /* number of rows */
    int   cols;   /* number of columns */
    int **data;   /* row-pointer table; element (i, j) is data[i][j] */
} matrix;
/* Multiplies the matrices behind the row-pointer tables a and b into a newly
 * allocated table and prints the product.
 * NOTE(review): braces and some short lines are missing from this excerpt; no
 * visible statement returns the `matrix` value the signature promises. */
matrix matrix_mul(int **a, int **b){
/* NOTE(review): a and b are pointers, so sizeof(a) is the size of a pointer,
 * not of the matrix. rowA/rowB evaluate to sizeof(int **)/sizeof(int *) and
 * colA/colB to sizeof(int **)/sizeof(int) divided by that, regardless of the
 * real dimensions. The dimensions cannot be recovered from int ** alone. */
int rowA=(sizeof(a)/sizeof(a[0]));
int colA=(sizeof(a)/sizeof(a[0][0]))/rowA;
int rowB=(sizeof(b)/sizeof(b[0]));
int colB=(sizeof(b)/sizeof(b[0][0]))/rowB;
int i, j, **mulMatrix;
/* Inner dimensions must agree for the product to be defined. */
if(colA != rowB)
printf("error: cannot multiply matrices");
/* NOTE(review): i and j are declared a second time here; presumably this line
 * sits in a nested block (e.g. an else branch) whose braces are not shown. */
int i, j, k;
mulMatrix = (int **)malloc(rowA * sizeof(int *));
for(i=0; i<rowA; i++)
mulMatrix[i] = (int *)malloc(colB * sizeof(int));
for(i=0; i<rowA; ++i){
for(j=0; j<colB; ++j){
for(k=0; k<colA; ++k)
/* NOTE(review): accumulates into storage obtained from malloc; no visible
 * line sets mulMatrix[i][j] to 0 first. */
mulMatrix[i][j]+= a[i][k] * b[k][j];
printf("Matrix: \n\n");
for(i=0; i< rowA; i++){
for(j=0; j< colB; j++){
printf("%d ", mulMatrix[i][j]);
/* Reads the dimensions and elements of two matrices from stdin, storing each
 * as a table of row pointers in a `matrix`, then calls matrix_mul on them.
 * NOTE(review): braces are missing from this excerpt. The return values of
 * scanf and malloc are not checked in the visible lines. */
int main()
int i, j;
matrix a, b;
printf("matrix1 - enter number of rows and cols: ");
scanf("%d %d", &a.rows, &a.cols);
/* Row-pointer table for a; the rows themselves are allocated in the loop. */
a.data = (int **)malloc(a.rows * sizeof(int *));
printf("enter matrix values: ");
for(i=0; i<a.rows; i++){
a.data[i] = (int *)malloc(a.cols * sizeof(int));
for(j=0; j<a.cols; j++){
scanf("%d", &a.data[i][j]);
printf("matrix2 - enter number of rows and cols: ");
scanf("%d %d", &b.rows, &b.cols);
/* Row-pointer table for b; the rows themselves are allocated in the loop. */
b.data = (int **)malloc(b.rows * sizeof(int *));
printf("enter matrix values: ");
for(i=0; i<b.rows; i++){
b.data[i] = (int *)malloc(b.cols * sizeof(int));
for(j=0; j<b.cols; j++){
scanf("%d", &b.data[i][j]);
/* Only the data pointers are passed, so the rows/cols stored in a and b do
 * not reach matrix_mul; its return value is discarded. */
matrix_mul(a.data, b.data);
return 0;
These lines
int rowA=(sizeof(a)/sizeof(a[0]));
int colA=(sizeof(a)/sizeof(a[0][0]))/rowA;
won't give you the results you are looking for.
sizeof(a) is sizeof(int**) and sizeof(a[0]) is sizeof(int*). Since pointers are usually of the same size, rowA will be always set to 1. colA will be set to sizeof(int**)/sizeof(int), which is a constant value. Once again, it's not the value you are hoping to get.
You should pass the matrix objects, a and b to matrix_mul. That'll give you the size of the matrices to use for multiplying them.
matrix matrix_mul(matrix a, matrix b){
Of course, you need to modify the implementation accordingly.
