Offloading a triple pointer (C) to an NVIDIA GPU with OpenMP

I've been working with a heat transfer code. The code establishes the initial conditions for a cube and all of its faces: the six faces start at different temperatures, and the code then calculates how the temperature changes throughout the cube due to the heat transfer between them. Now I'm trying to offload the computation to an NVIDIA GPU using OpenMP directives. The code initializes the face conditions using a triple pointer, which is essentially an array of arrays of arrays. From what I've read, 3D structures like this are not easily offloaded to GPUs. So my question is whether it is possible to offload these triple-pointer arrays to the GPU, or whether I have to use a flatter array form.
Here is the code, which currently works on the CPU; this is the parallel CPU version.
#include <omp.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define N 25 //This defines the number of points per dimension (Cube = N*N*N)
#define NUM_STEPS 6000 //This is the number of simulations time steps
/*writeFile: this function writes simulation results into a file.
* A file is created for each iteration that's passed to the function
* as a parameter. It also takes the triple pointer to the simulation
* data*/
void writeFile(int iteration, double*** data){
    char filename[50];
    char itr[12];
    sprintf(itr, "%d", iteration);
    strcpy(filename, "heat_");
    strcat(filename, itr);
    strcat(filename, ".txt");
    //printf("Filename is %s\n", filename);
    FILE *fp;
    fp = fopen(filename, "w");
    fprintf(fp, "x,y,z,T\n");
    for(int i=0; i<N; i++){
        for(int j=0; j<N; j++){
            for(int k=0; k<N; k++){
                fprintf(fp, "%d,%d,%d,%f\n", i, j, k, data[i][j][k]);
            }
        }
    }
    fclose(fp);
}
void compute_heat_transfer(double ***arrayOld, double ***arrayNew){
    int i,j,k;
    /*Compute steady-state solution*/
    for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
        /*if(nsteps % 100 == 0){
            writeFile(nsteps, arrayOld);
        }*/
        #pragma omp parallel shared(arrayNew, arrayOld) private(i,j,k)
        {
            #pragma omp for
            for(i=1; i<N-1; i++){
                for(j=1; j<N-1; j++){
                    for(k=1; k<N-1; k++){
                        //This is the 6-neighbor stencil computation
                        arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] +
                                             arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
                                             arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
                    }
                }
            }
            #pragma omp for
            for(i=1; i<N-1; i++){
                for(j=1; j<N-1; j++){
                    for(k=1; k<N-1; k++){
                        arrayOld[i][j][k] = arrayNew[i][j][k];
                    }
                }
            }
        }
    }
}
int main (int argc, char *argv[]) {
    int i,j,k,nsteps;
    double mean;
    double ***arrayOld; //Variable that will hold the data of the past iteration
    double ***arrayNew; //Variable where newly computed data will be stored
    arrayOld = (double***)malloc(N*sizeof(double**));
    arrayNew = (double***)malloc(N*sizeof(double**));
    if(arrayOld == NULL){
        fprintf(stderr, "Out of memory");
        exit(0);
    }
    for(i=0; i<N; i++){
        arrayOld[i] = (double**)malloc(N*sizeof(double*));
        arrayNew[i] = (double**)malloc(N*sizeof(double*));
        if(arrayOld[i]==NULL){
            fprintf(stderr, "Out of memory");
            exit(0);
        }
        for(int j=0; j<N; j++){
            arrayOld[i][j] = (double*)malloc(N*sizeof(double));
            arrayNew[i][j] = (double*)malloc(N*sizeof(double));
            if(arrayOld[i][j]==NULL){
                fprintf(stderr, "Out of memory");
                exit(0);
            }
        }
    }
    /*Set boundary values and compute mean boundary values*/
    mean = 0.0;
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            arrayOld[i][j][0] = 100.0;
            mean += arrayOld[i][j][0];
        }
    }
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            arrayOld[i][j][N-1] = 100.0;
            mean += arrayOld[i][j][N-1];
        }
    }
    for(j=0; j<N; j++){
        for(k=0; k<N; k++){
            arrayOld[0][j][k] = 100.0;
            mean += arrayOld[0][j][k];
        }
    }
    for(j=0; j<N; j++){
        for(k=0; k<N; k++){
            arrayOld[N-1][j][k] = 100.0;
            mean += arrayOld[N-1][j][k];
        }
    }
    for(i=0; i<N; i++){
        for(k=0; k<N; k++){
            arrayOld[i][0][k] = 100.0;
            mean += arrayOld[i][0][k];
        }
    }
    for(i=0; i<N; i++){
        for(k=0; k<N; k++){
            arrayOld[i][N-1][k] = 0.0;
            mean += arrayOld[i][N-1][k];
        }
    }
    mean /= (6.0 * (N*N));
    /*Initialize interior values*/
    for(i=1; i<N-1; i++){
        for(j=1; j<N-1; j++){
            for(k=1; k<N-1; k++){
                arrayOld[i][j][k] = mean;
            }
        }
    }
    double tdata = omp_get_wtime();
    compute_heat_transfer(arrayOld, arrayNew);
    tdata = omp_get_wtime()-tdata;
    printf("Execution time was %f secs\n", tdata);
    for(i=0; i<N; i++){
        for(int j=0; j<N; j++){
            free(arrayOld[i][j]);
            free(arrayNew[i][j]);
        }
        free(arrayOld[i]);
        free(arrayNew[i]);
    }
    free(arrayOld);
    free(arrayNew);
    return 0;
}

Use variable length arrays with dynamic storage:
Allocation:
double (*arr)[N][N] = calloc(N, sizeof *arr);
Indexing:
Use good old arr[i][j][k] syntax.
Deallocation:
free(arr);
Flattening:
double *flat = (double*)arr;
Note that this conversion is not guaranteed to work by the C standard, though it will very likely work on all platforms capable of driving GPUs.
Passing to functions:
VLAs can be parameters of functions.
void fun(int n, double arr[n][n][n]) {
    ...
}
Example usage would be:
fun(N, arr);
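Putting the pieces together, a minimal self-contained sketch (my example; the stub fun stands in for any routine taking the cube as a VLA parameter):

#include <stdlib.h>

#define N 25

void fun(int n, double arr[n][n][n])
{
    arr[1][2][3] = 42.0;  /* normal triple indexing also works inside */
}

int main(void)
{
    double (*arr)[N][N] = calloc(N, sizeof *arr); /* one contiguous N*N*N block */
    if (arr == NULL)
        return 1;
    fun(N, arr);   /* arr decays to double (*)[N][N], matching the VLA parameter */
    free(arr);     /* a single free releases the whole cube */
    return 0;
}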
EDIT
VLA-friendly variant of compute_heat_transfer():
void compute_heat_transfer(int n, double arrayOld[restrict n][n][n],
                           double arrayNew[restrict n][n][n]) {
    /*Compute steady-state solution*/
    for(int nsteps=0; nsteps < NUM_STEPS; nsteps++){
        /*if(nsteps % 100 == 0){
            writeFile(nsteps, arrayOld);
        }*/
        #pragma omp parallel for collapse(3)
        for(int i=1; i<n-1; i++){
            for(int j=1; j<n-1; j++){
                for(int k=1; k<n-1; k++){
                    //This is the 6-neighbor stencil computation
                    arrayNew[i][j][k] = (arrayOld[i-1][j][k] + arrayOld[i+1][j][k] +
                                         arrayOld[i][j-1][k] + arrayOld[i][j+1][k] +
                                         arrayOld[i][j][k-1] + arrayOld[i][j][k+1])/6.0;
                }
            }
        }
        #pragma omp parallel for collapse(3)
        for(int i=1; i<n-1; i++){
            for(int j=1; j<n-1; j++){
                for(int k=1; k<n-1; k++){
                    arrayOld[i][j][k] = arrayNew[i][j][k];
                }
            }
        }
    }
}
The keyword restrict in arrayNew[restrict n][n][n] is used to let the compiler assume that arrayNew and arrayOld do not alias. It should let the compiler use more aggressive optimizations.
Note that arrayNew and arrayOld are pointers to arrays. So rather than copying arrayNew into arrayOld, you could simply swap those pointers, forming a kind of simple double buffering. It should make the code even faster; the offload sketch below does exactly that.
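As for the original offloading question: a triple pointer is hard to map to the GPU because the cube consists of 1 + N + N*N separate allocations, each of which would have to be deep-copied and have its pointers translated; with the contiguous VLA (or an explicitly flat 1D array) the whole cube moves in a single transfer. Below is a minimal, untested sketch of what the offloaded version could look like, assuming a compiler with OpenMP target offload support for NVIDIA GPUs (e.g. clang with -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda, or nvc with -mp=gpu; exact flags vary by toolchain). It uses flat indexing, which every offload implementation handles, and swaps pointers instead of copying:

/* Hedged sketch: the heat-transfer loop offloaded with OpenMP target directives.
 * Assumes old_t and new_t are contiguous n*n*n buffers and that new_t's
 * boundary cells were initialized beforehand (e.g. memcpy'd from old_t),
 * because pointer swapping makes each buffer take a turn as the "old" array.
 * Returns whichever buffer holds the final temperatures. */
double *compute_heat_transfer_gpu(int n, double *old_t, double *new_t)
{
    /* Map both buffers once for the whole simulation; they stay resident on
     * the device across all time steps and are copied back only at the end. */
    #pragma omp target data map(tofrom: old_t[0:n*n*n], new_t[0:n*n*n])
    {
        for (int nsteps = 0; nsteps < NUM_STEPS; nsteps++) {
            #pragma omp target teams distribute parallel for collapse(3)
            for (int i = 1; i < n-1; i++)
                for (int j = 1; j < n-1; j++)
                    for (int k = 1; k < n-1; k++)
                        /* 6-neighbor stencil, flat indexing: (i*n + j)*n + k */
                        new_t[(i*n + j)*n + k] =
                            (old_t[((i-1)*n + j)*n + k] + old_t[((i+1)*n + j)*n + k] +
                             old_t[(i*n + j-1)*n + k]   + old_t[(i*n + j+1)*n + k] +
                             old_t[(i*n + j)*n + k-1]   + old_t[(i*n + j)*n + k+1]) / 6.0;
            /* Swap instead of copying; both buffers remain mapped on the device. */
            double *tmp = old_t; old_t = new_t; new_t = tmp;
        }
    }
    return old_t; /* after the last swap, old_t points at the newest data */
}

Which of the two host buffers ends up holding the result depends on the parity of NUM_STEPS, hence the returned pointer.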

Related

Matrix traversal optimization

Given an n x n matrix of ints, I have an algorithm that, at each step of a for loop of range n, traverses and modifies the matrix. Here is the code:
typedef int **Matrix;

void floyd_slow(Matrix dist, int n)
{
    int d;
    for (int k=0; k<n; k++)
    {
        for (int i=0; i<n; i++)
        {
            for (int j=0; j<n; j++)
                if ((d=dist[k][j]+dist[i][k])<dist[i][j])
                    dist[i][j]=d;
        }
    }
    for (int i=0; i<n; i++)
        dist[i][i]=0;
}
The matrix is built as an array of n*n ints, and for each line index i, dist[i] is the address of the row of index i [the above code is the standard way to write the Floyd-Warshall algorithm, but my question is not about the algorithm itself].
At each step of the loop of index k, the underlying matrix is traversed line by line.
Now, consider the following transformation of the previous code:
void relax(Matrix dist, int n, int* rowk, int* colk)
{
    int d;
    for (int i=0; i<n; i++)
        for (int j=0; j<n; j++)
            if ((d=rowk[j]+colk[i])<dist[i][j])
                dist[i][j]=d;
}

void floyd_fast(Matrix dist, int n)
{
    int i, k;
    int* colk=malloc(n*sizeof(int));
    if (!colk)
        exit(EXIT_FAILURE);
    for (k=0; k<n; k++)
    {
        int* rowk=dist[k];
        for (i=0; i<n; i++)
            colk[i]=dist[i][k];
        relax(dist, n, rowk, colk);
    }
    free(colk);
    for (i=0; i<n; i++)
        dist[i][i]=0;
}
At every step, the elements of the matrix are accessed in the same order as in the previous algorithm.
The only difference is that at each step k of the exterior loop, the column of index k is copied into a temporary array, cf. the colk malloc above. As a result, the element at position (i, k) is read from this array instead of being accessed directly through the matrix.
This innocuous change leads in fact to a significant speedup: you gain a factor of 4 when n=1000.
I know that in C it's faster to traverse an array in row-major order, but that is the case in both versions here. So I was wondering why the speedup is so large. Is it related to cache optimization?
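One plausible explanation (my illustration, not a verified answer): in floyd_slow the compiler must assume the store to dist[i][j] can change dist[k][j] or dist[i][k], which forces reloads on every inner iteration and blocks vectorization. With rowk and colk in separate buffers, dist[i][k] becomes the loop-invariant colk[i] and dist[k][j] becomes rowk[j], so the inner loop turns into a clean, vectorizable min over a row:

/* Sketch equivalent to relax(), restructured to show what the temporaries buy:
 * colk[i] is hoisted out of the inner loop, and the inner loop body touches
 * only rowk[] (read) and row[] (read/write), which the compiler can vectorize. */
void relax_hoisted(Matrix dist, int n, const int *rowk, const int *colk)
{
    for (int i = 0; i < n; i++) {
        int *row = dist[i];
        int ci = colk[i];            /* loop-invariant, loaded once per row */
        for (int j = 0; j < n; j++) {
            int d = rowk[j] + ci;
            if (d < row[j])
                row[j] = d;
        }
    }
}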
Complete code
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

typedef int **Matrix;

void floyd_slow(Matrix dist, int n)
{
    int d;
    for (int k=0; k<n; k++)
    {
        for (int i=0; i<n; i++)
        {
            for (int j=0; j<n; j++)
                if ((d=dist[k][j]+dist[i][k])<dist[i][j])
                    dist[i][j]=d;
        }
    }
    for (int i=0; i<n; i++)
        dist[i][i]=0;
}

void relax(Matrix dist, int n, int* rowk, int* colk)
{
    int d;
    for (int i=0; i<n; i++)
        for (int j=0; j<n; j++)
            if ((d=rowk[j]+colk[i])<dist[i][j])
                dist[i][j]=d;
}

void floyd_fast(Matrix dist, int n)
{
    int i, k;
    int* colk=malloc(n*sizeof(int));
    if (!colk)
        exit(EXIT_FAILURE);
    for (k=0; k<n; k++)
    {
        int* rowk=dist[k];
        for (i=0; i<n; i++)
            colk[i]=dist[i][k];
        relax(dist, n, rowk, colk);
    }
    free(colk);
    for (i=0; i<n; i++)
        dist[i][i]=0;
}

void print(Matrix dist, int n)
{
    int i, j;
    for (i=0; i<n; i++)
    {
        for (j=0; j<n; j++)
            printf("%d ", dist[i][j]);
        printf("\n");
    }
}

void test_slow(Matrix dist, int n)
{
    clock_t now=clock();
    floyd_slow(dist, n);
    // print(dist, n);
    int *p=dist[0];
    free(dist);
    free(p);
    fprintf(stderr, "Elapsed slow: %.2f s\n",
            (double) (clock() - now) / CLOCKS_PER_SEC);
}

void test_fast(Matrix dist, int n)
{
    clock_t now=clock();
    floyd_fast(dist, n);
    // print(dist, n);
    int *p=dist[0];
    free(dist);
    free(p);
    fprintf(stderr, "Elapsed fast: %.2f s\n",
            (double) (clock() - now) / CLOCKS_PER_SEC);
}

int * data(int n)
{
    int N=n*n;
    int *t=malloc(N*sizeof(int));
    if (!t)
        exit(EXIT_FAILURE);
    srand(time(NULL));
    for (int i=0; i<N; i++)
        t[i]=(1+rand())%10;
    return t;
}

Matrix getMatrix(int *t, int n)
{
    int N=n*n;
    int *tt=malloc(N*sizeof(int));
    Matrix mat=malloc(n*sizeof(int*));
    if (!tt || !mat)
        exit(EXIT_FAILURE);
    memcpy(tt, t, N*sizeof(int));
    for (int i=0; i<n; i++)
        mat[i]=&tt[i*n];
    return mat;
}

int main(void)
{
    int n=1000;
    int *t=data(n);
    Matrix mat_slow=getMatrix(data(n), n);
    Matrix mat_fast=getMatrix(data(n), n);
    test_slow(mat_slow, n);
    test_fast(mat_fast, n);
    return 0;
}
Output:
Elapsed slow: 0.58 s
Elapsed fast: 0.14 s
Compilation options:
rm floyd
gcc -Wall -O3 -march=native -ffast-math -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.c -o floyd -lm

Encountering race condition using openMP in C

Here is the section of my code where I am running into a race condition. I'm just trying to copy the values of the matrix matxOriginal into the matrix cluster0, but when run on multiple threads using OpenMP, the sample values printed for cluster0 differ from matxOriginal.
Both matrices have been dynamically allocated and are 698 x 9.
Also, I would like to keep the use and purpose of the cluster0IndexCounter variable for a different use outside of what I'm posting. So if you can, please let me know how to make this work.
double **matxGen(int row, int col)
{
    int i=0, j=0;
    double **m;
    m=(double **) malloc(row*col*sizeof(double *));
    for (i; i<row; i++)
    {
        m[i]=(double *) malloc(col*sizeof(double));
        for (j=0; j<col; j++)
        {
            m[i][j]=j+i;
        }
    }
    return m;
}

double **emptyMatxGen(int row, int col)
{
    int i=0, j=0;
    double **m;
    m=(double **) malloc(row*col*sizeof(double *));
    for (i; i<row; i++)
    {
        m[i]=(double *) malloc(col*sizeof(double));
        for (j=0; j<col; j++)
        {
            m[i][j]=0.0;
        }
    }
    return m;
}

int main()
{
    int x, i, j, tid, row=699, col=9,
        cluster0IndexCounter=0;
    double **matxOriginal, **matx, **cluster0;
    matxOriginal=matxGen(row, col);
    matx=matxGen(row, col);
    double *centerPoint0=matx[99];
    cluster0=emptyMatxGen(row, col);
    #pragma omp parallel for private(x, j, tid) schedule(static) reduction(+:cluster0IndexCounter)
    for (x=0; x<=698; x++)
    {
        for (j=0; j<9; j++)
        {
            cluster0[cluster0IndexCounter][j]=matxOriginal[x][j];
        }
        cluster0IndexCounter=cluster0IndexCounter+1;
    }
    printf("cluster0: %f, %f, %f, %f, %f\n", cluster0[9][0], cluster0[9][1], cluster0[9][2], cluster0[9][3], cluster0[9][4]);
    free(cluster0);
    free(matxOriginal);
    free(matx);
    return 0;
}
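Since every iteration of the x loop copies exactly one row, the destination index can be derived from x itself. That removes the race, while the reduction still leaves cluster0IndexCounter holding the total row count at the end. A sketch of the idea (my suggestion, assuming the copy really is unconditional):

#pragma omp parallel for private(j) schedule(static) reduction(+:cluster0IndexCounter)
for (x=0; x<=698; x++)
{
    for (j=0; j<9; j++)
    {
        cluster0[x][j]=matxOriginal[x][j]; /* x is unique per iteration: no clash */
    }
    cluster0IndexCounter=cluster0IndexCounter+1; /* private copy, summed at the join */
}

If the copy ever becomes conditional (as in real cluster assignment), no deterministic index exists and a different scheme is needed, e.g. per-thread buffers merged sequentially afterwards.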

OpenMP segmentation fault when setting the number of threads through the console

I have this matrix multiplication code using OpenMP:
#include <stdio.h>
#include <omp.h>
#include <math.h>

#define N 1000

int main()
{
    long int i, j, k;
    //long int N = atoi(argv[1]);
    double t1, t2;
    double a[N][N], b[N][N], c[N][N];
    for (i=0; i<N; i++)
        for (j=0; j<N; j++)
            a[i][j]=b[i][j]=log(i*j/(i*j+1.)+1) + exp(-(i+j)*(i+j+1.));
    t1=omp_get_wtime();
    #pragma omp parallel for shared(a, b, c) private(i, j, k)
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            c[i][j] = 0.0;
            for(k=0; k<N; k++) c[i][j]+=a[i][k]*b[k][j];
        }
    }
    t2=omp_get_wtime();
    printf("Time=%lf\n", t2-t1);
}
Now I want to set the number of threads from the command line. I do that by using
atoi(argv[])
Namely:
#include <stdio.h>
#include <omp.h>
#include <math.h>

#define N 1000

int main(int argc, char *argv[])
{
    long int i, j, k;
    //long int N = atoi(argv[1]);
    double t1, t2;
    double a[N][N], b[N][N], c[N][N];
    for (i=0; i<N; i++)
        for (j=0; j<N; j++)
            a[i][j]=b[i][j]=log(i*j/(i*j+1.)+1) + exp(-(i+j)*(i+j+1.));
    int t = atoi(argv[1]);
    t1=omp_get_wtime();
    #pragma omp parallel for shared(a, b, c) private(i, j, k) num_threads(t)
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            c[i][j] = 0.0;
            for(k=0; k<N; k++) c[i][j]+=a[i][k]*b[k][j];
        }
    }
    t2=omp_get_wtime();
    printf("Time=%lf\n", t2-t1);
}
Everything is fine, except one crucial thing: when I try to compute the product of matrices with a dimension of more than (roughly) 500, I get a segmentation fault. Could someone clarify the reason for this error?
I don't know anything about openmp, but you are most assuredly blowing up your stack. Default stack space will vary from system to system, but with N == 1000, you are trying to put three 2D arrays totaling 3 million doubles on the stack. Assuming a double is 8 bytes, that's 24 million bytes, or just shy of 22.9MB. There can't be many systems allowing that kind of stack space. Instead, I'd recommend trying to grab that amount of memory from the heap. Something like this:
//double a[N][N],b[N][N],c[N][N];
double **a, **b, **c;
a = malloc(sizeof(double*) * N);
b = malloc(sizeof(double*) * N);
c = malloc(sizeof(double*) * N);
for (i=0; i<N; i++)
{
    a[i] = malloc(sizeof(double) * N);
    b[i] = malloc(sizeof(double) * N);
    c[i] = malloc(sizeof(double) * N);
}

// do your calculations

for (i=0; i<N; i++)
{
    free(a[i]);
    free(b[i]);
    free(c[i]);
}
free(a);
free(b);
free(c);
I've verified on my machine at least, that with N == 1000 I crash right out of the gate with EXC_BAD_ACCESS when trying to place those arrays on the stack. When I dynamically allocate the memory instead as shown above, I get no seg faults.
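A related note (my addition, echoing the VLA idea from the answer to the main question above): instead of N+1 allocations per matrix, each matrix can be a single contiguous heap block with identical a[i][j] indexing and a single free:

double (*a)[N] = malloc(N * sizeof *a); /* N rows of N doubles, one block */
double (*b)[N] = malloc(N * sizeof *b);
double (*c)[N] = malloc(N * sizeof *c);
if (!a || !b || !c)
    exit(EXIT_FAILURE);
/* ... a[i][j], b[i][j], c[i][j] work exactly as before ... */
free(a);
free(b);
free(c);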

Using dgesv in C to determine linear regression with and without intercept

The following code uses the LAPACK routine dgesv in C to calculate a linear regression. It has observations X and predictions Y, with X and Y saved as double arrays. I am wondering:
1) Is this code calculating linear regression with an intercept or not?
2) How can I alter the code so that it does the opposite of 1), in terms of using or not using an intercept?
Here is the code:
#include <stdio.h>

#define N 16 /* number of observations */
#define P 2  /* number of predictors */

void dgesv_(int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);

int main(){
    /* longley dataset from R: Employed (Y), GNP.deflator and Population (X) */
    double Y[N] = {60.323,61.122,60.171,61.187,63.221,63.639,64.989,63.761,66.019,67.857,68.169,66.513,68.655,69.564,69.331,70.551};
    double X[N][P] =
        {{83,107.608},
         {88.5,108.632},
         {88.2,109.773},
         {89.5,110.929},
         {96.2,112.075},
         {98.1,113.27},
         {99,115.094},
         {100,116.219},
         {101.2,117.388},
         {104.6,118.734},
         {108.4,120.445},
         {110.8,121.95},
         {112.6,123.366},
         {114.2,125.368},
         {115.7,127.852},
         {116.9,130.081}};
    int i, j, k, n1=P+1, n2=1, ipiv[P+1], info;
    double X1[N][P+1], XtX[(P+1) * (P+1)], XtY[P+1];
    /* design matrix */
    for (i=0; i<N; i++){
        X1[i][0] = 1;
        for (j=1; j<n1; j++)
            X1[i][j] = X[i][j-1];
    }
    /* t(X1) %*% X1 */
    for (i=0; i<n1; i++){
        for (j=0; j<n1; j++){
            XtX[i*n1+j] = 0;
            for (k=0; k<N; k++)
                XtX[i*n1+j] += X1[k][i] * X1[k][j];
        }
    }
    /* t(X1) %*% Y */
    for (i=0; i<n1; i++){
        XtY[i] = 0;
        for (j=0; j<N; j++){
            XtY[i] += X1[j][i] * Y[j];
        }
    }
    /* XtX is symmetric, no transpose needed before passing to Fortran subroutine */
    dgesv_(&n1, &n2, XtX, &n1, ipiv, XtY, &n1, &info);
    if (info!=0) printf("failure with error %d\n", info);
    /* print beta */
    printf("The regression coefficients: ");
    for (i=0; i<n1; i++){
        printf("%f ", XtY[i]);
    }
    printf("\n");
    return 0;
}
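Regarding 1): the X1[i][0] = 1 column is exactly the intercept column, so the code as posted fits a model with an intercept. For 2), a sketch of the no-intercept variant (my untested edit): skip the design-matrix step and build the normal equations from X directly, solving a P x P system:

/* No-intercept variant: replaces the X1, XtX and XtY steps above. */
n1 = P;                       /* P coefficients, no column of ones */
double XtX0[P * P], XtY0[P];  /* fresh buffers for the smaller system */
for (i=0; i<n1; i++){         /* t(X) %*% X */
    for (j=0; j<n1; j++){
        XtX0[i*n1+j] = 0;
        for (k=0; k<N; k++)
            XtX0[i*n1+j] += X[k][i] * X[k][j];
    }
}
for (i=0; i<n1; i++){         /* t(X) %*% Y */
    XtY0[i] = 0;
    for (j=0; j<N; j++)
        XtY0[i] += X[j][i] * Y[j];
}
dgesv_(&n1, &n2, XtX0, &n1, ipiv, XtY0, &n1, &info);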

Prove that C uses row-major order for arranging two-dimensional arrays

I'm trying to prove that C uses row-major order as its memory layout, so I'm measuring the time to traverse an array in row-major vs. column-major order (the array is filled with i*j, multiplied by 2).
The problem is that something isn't quite right with the algorithm in the loops subrow(array) and subcol(array). Row-major order should produce fewer cache misses and thus be faster than column-major, but I consistently get the opposite. If you run the code you'll get something like this:
Row-major took 6511 miliseconds.
Column-major took 5690 miliseconds.
Please help me sort out the algorithm.
Edit:
It was pointed out that I am not actually accessing the array, so I am not testing access speed. I added sum += array[i][j]; to the subcol and subrow loops, but I am still getting the same consistent result that row-major order performs slower, whereas the opposite should be true. Perhaps something is wrong with the way I am setting i and j in the loops.
Output:
subrow sum = 784293664
Row-major took 6737 miliseconds.
subcol sum = 784293664
Column-major took 6594 miliseconds.
Updated Code:
#include <stdio.h>
#include <sys/time.h>

#define ROW 1000
#define COL 1000

void subrow(int array[ROW][COL]);
void subcol(int array[ROW][COL]);

int main()
{
    int array[ROW][COL];
    int i, j;
    for (i=0; i<ROW; i++) // sets each element to i*j, then multiplies it by 2
    {
        for (j=0; j<COL; j++)
        {
            array[i][j]=i*j;
            array[i][j]=array[i][j]*2;
        }
    }
    subrow(array); //calls the row-major sum function
    subcol(array); //calls the column-major sum function
    return 0;
}

void subrow(int array[ROW][COL])
{
    int i,j;
    struct timeval stop, start;
    gettimeofday(&start, NULL);
    int sum = 0;
    for (i=0; i<ROW; i++)
    {
        for (j=0; j<COL; j++)
        {
            sum += array[i][j];
        }
    }
    printf("subrow sum = %d\n", sum);
    gettimeofday(&stop, NULL);
    printf("Row-major took %lu miliseconds.\n", (stop.tv_usec - start.tv_usec));
    return;
}

void subcol(int array[ROW][COL])
{
    int i,j;
    struct timeval stop, start;
    gettimeofday(&start, NULL);
    int sum = 0;
    for (i=0; i<COL; i++)
    {
        for (j=0; j<ROW; j++)
        {
            sum += array[i][j];
        }
    }
    printf("subcol sum = %d\n", sum);
    gettimeofday(&stop, NULL);
    printf("Column-major took %lu miliseconds.\n", (stop.tv_usec - start.tv_usec));
    return;
}
for (i=0; i<ROW; i++)
{
    for (j=0; j<COL; j++)
    {
    }
}
You're not actually accessing the arrays in these loops. You're just iterating with a couple of int variables. You need to actually read from or write to array[i][j] if you want to test access speed.
For instance:
int sum = 0;
for (i=0; i<ROW; i++)
{
    for (j=0; j<COL; j++)
    {
        sum += array[i][j];
    }
}
// Do something with `sum` so the compiler doesn't optimize it, and the loops
// above, away.
printf("sum = %d\n", sum);
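One more thing worth flagging in the question's timing code (my observation, separate from the access issue): stop.tv_usec - start.tv_usec ignores tv_sec entirely, and the difference is in microseconds, not milliseconds; it wraps or goes negative whenever the two timestamps straddle a second boundary. A fix along these lines:

/* Elapsed wall time in milliseconds, using both fields of struct timeval. */
double elapsed_ms = (stop.tv_sec  - start.tv_sec)  * 1000.0
                  + (stop.tv_usec - start.tv_usec) / 1000.0;
printf("Row-major took %.3f milliseconds.\n", elapsed_ms);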
