First of all, my English level sucks, so sorry if something isn't well written...
I'm learning how to parallelize C code using OpenMP. The algorithm I'm trying to parallelize is the shallow water equations algorithm, and although a simple #pragma omp parallel for on the most critical loop gained me nearly 40% more performance, I know that my implementation is very poor and I'm not milking the cores as I should. The structure of the code is simple: a 'main' that allocates memory, initializes some matrices and arrays, and calls a function called solver that does all the work, which is where I put the #pragma omp parallel for.
I was thinking that I could boost performance by using a parallel section where the memory is allocated and initialized, so every thread has all the data, but when I run the program I don't get any boost, and since I'm a rookie with this I don't know whether my thinking was bad or my implementation was. I'd appreciate some help or a hint that could boost the performance of the algorithm. This is my homework and I don't want someone to do it for me, just a little help that can make me go forward...
I'll paste the code for better understanding:
MAIN FUNCTION (Allocations and initializations)
int main(int argc, char **argv) {
long int i, j, m, n, M, N;
char *ptr;
long int s;
int flag, verbose;
double *Q;
double *x, *y;
double **ffx, **nFx, **ffy, **nFy;
double dx, dt, epsi, delta, dy, tend, tmp, stime;
/* Default values to use: m volumes in the x-direction and n volumes in the y-direction */
M = 1000;
N = 1000;
/* create file and verbose flags */
.......
.......
/* Parse command line options */
.......
.......
epsi = 2.0;
delta = 0.5;
dx = (xend - xstart) / (double) M;
dy = (yend - ystart) / (double) N;
dt = dx / sqrt( 9.81 * 5.0);
tend = 0.1;
/* Add two ghost volumes at each side of the domain */
m = M + 2;
n = N + 2;
/* Allocate memory for the domain */
/* HERE IS WHERE I PUT THE PRAGMA FOR PARALLEL INITIALIZATION AND ALLOCATIONS */
#pragma omp parallel
{
Q = (double *) malloc(m * n * cell_size * sizeof(double));
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* Allocate memory for fluxes */
ffx = (double **) malloc(cell_size * sizeof(double *));
ffy = (double **) malloc(cell_size * sizeof(double *));
nFx = (double **) malloc(cell_size * sizeof(double *));
nFy = (double **) malloc(cell_size * sizeof(double *));
ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
nFy[0] = (double *) malloc(cell_size * n * sizeof(double));
for (i = 0; i < cell_size; i++) {
ffx[i] = ffx[0] + i * m;
nFx[i] = nFx[0] + i * m;
ffy[i] = ffy[0] + i * n;
nFy[i] = nFy[0] + i * n;
}
for (i = 0,tmp= -dx/2 + xstart; i < m; i++, tmp += dx)
x[i] = tmp;
for (i = 0,tmp= -dy/2 + ystart; i < n; i++, tmp += dy)
y[i] = tmp;
/* Set initial Gauss hump */
for (i = 0; i < m; i++) {
for (j = 0; j < n; j++) {
Q(0, i, j) = 4.0;
Q(1, i, j) = 0.0;
Q(2, i, j) = 0.0;
}
}
for (i = 1; i < m-1; i++) {
for (j = 1; j < n-1; j++) {
Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2) + pow(y[j] - yend / 4.0, 2)) /
(pow(delta, 2)));
}
}
}
// Record start time
stime = gettime();
/*THIS IS THE FUNCTION WHERE THE 'WORK' IS DONE*/
solver(Q, ffx, ffy, nFx, nFy, m, n, tend, dx, dy, dt);
}
SOLVER FUNCTION (Critical Section)
/*
This is the main solver routine.
*/
void solver(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double tend, double dx, double dy, double dt) {
double bc_mask[3] = {1.0, -1.0, -1.0};
double time;
int i, j, k, steps;
steps = ceil(tend / dt);
for (i = 0, time = 0.0; i < steps; i++, time += dt) {
/* Apply boundary condition */
#pragma omp parallel for private(j) num_threads (NTHR)
for (k = 0; k < cell_size; k++)
{
for (j = 1; j < n - 1 ; j++)
{
Q(k, 0, j) = bc_mask[k] * Q(k, 1, j);
Q(k, m-1, j) = bc_mask[k] * Q(k, m-2, j);
}
}
#pragma omp parallel for private(j) num_threads (NTHR)
for (k = 0; k < cell_size; k++)
{
for (j = 0; j < m; j++)
{
Q(k, j, 0) = bc_mask[k] * Q(k, j, 1);
Q(k, j, n-1) = bc_mask[k] * Q(k, j, n-2);
}
}
/* Update all volumes with the Lax-Friedrich's scheme */
laxf_scheme_2d(Q, ffx, ffy, nFx, nFy, m, n, dx, dy, dt);
}
}
/*
This is the Lax-Friedrich's scheme for updating volumes
*/
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double dx, double dy, double dt) {
int i, j, k;
/* Calculate and update fluxes in the x-direction */
#pragma omp parallel for private(k,j) num_threads (NTHR)
for (i = 1; i < n; i++) {
fx(Q, ffx, m, n, i);
for (k = 0; k < cell_size; k++)
for (j = 1; j < m; j++)
nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
for (k = 0; k < cell_size; k++)
for (j = 1; j < m-1; j++)
Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
}
/* Calculate and update fluxes in the y-direction */
#pragma omp parallel for private(k,j) num_threads (NTHR)
for (i = 1; i < m; i++) {
fy(Q, ffy, m, n, i);
for (k = 0; k < cell_size; k++)
for (j = 1; j < n; j++)
nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) - dy/dt * (Q(k, i, j) - Q(k, i, j -1)));
for (k = 0; k < cell_size; k++)
for (j = 1; j < n-1; j++)
Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
}
}
As I understand it, there is no data dependency in the loops of the solver function and its sub-functions, and since putting a parallel region around the allocation and data initialization did nothing, I don't know how to continue.
Thanks in advance!
There are multiple problems with your code. First of all, you have data races there, since you write to shared variables, such as Q, x, and y, from all threads. Either do the allocations outside of a parallel region or have them performed by a single thread only (#pragma omp master or #pragma omp single).
Then, you don't parallelize the for loops in the initialization section. In fact, all these loops are executed by all threads over their whole ranges (again with data races and likely a lot of cache contention). You should add work-sharing #pragma omp for directives to these loops (or #pragma omp parallel for if they are outside a parallel region). For nested loops, the collapse clause might be useful.
Also, make sure that there are no data races in the solver() and laxf_scheme_2d() functions. Seemingly, most of the calculation time is spent within laxf_scheme_2d(); however, this function is not run in parallel at all. Does it use OpenMP internally?
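For illustration, a minimal sketch of that suggestion, reusing the question's variable names and the Q() macro (this is my sketch, not the assignment's actual code): the allocations stay serial, and only the initialization loops are work-shared.
Q = (double *) malloc(m * n * cell_size * sizeof(double));  /* serial allocation: no race */
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* ... allocate ffx/ffy/nFx/nFy exactly as before ... */
#pragma omp parallel
{
    #pragma omp for
    for (long int i = 0; i < m; i++)
        x[i] = xstart - dx / 2.0 + i * dx;   /* index form removes the loop-carried tmp */
    #pragma omp for
    for (long int j = 0; j < n; j++)
        y[j] = ystart - dy / 2.0 + j * dy;
    #pragma omp for collapse(2)
    for (long int i = 0; i < m; i++)
        for (long int j = 0; j < n; j++) {
            Q(0, i, j) = 4.0;
            Q(1, i, j) = 0.0;
            Q(2, i, j) = 0.0;
        }
    #pragma omp for collapse(2)
    for (long int i = 1; i < m - 1; i++)
        for (long int j = 1; j < n - 1; j++)
            Q(0, i, j) = 4.0 + epsi * exp(-(pow(x[i] - xend / 4.0, 2)
                                          + pow(y[j] - yend / 4.0, 2)) / pow(delta, 2));
}
Each #pragma omp for ends with an implicit barrier, so x and y are fully filled before the Gauss-hump loop reads them.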
Thank you for the answers. I've seen many problems in my implementation. First of all, the heaviest function, where all the work is done, is laxf_scheme_2d.
About the Q variable, I have this: #define Q(i, j, k) Q[((k) + n * ((j) + m * (i)))]
This is laxf_scheme_2d
void laxf_scheme_2d(double *Q, double **ffx, double **ffy, double **nFx, double **nFy,
int m, int n, double dx, double dy, double dt) {
int i, j, k;
/* Calculate and update fluxes in the x-direction */
#pragma omp for
for (i = 1; i < n; i++) {
fx(Q, ffx, m, n, i);
for (j = 1; j < m; j++)
for (k = 0; k < cell_size; k++)
nFx[k][j] = 0.5 * ((ffx[k][j-1] + ffx[k][j]) -
dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
for (j = 1; j < m-1; j++)
for (k = 0; k < cell_size; k++)
Q(k, j, i) = Q(k, j, i) - dt/dx * ((nFx[k][j+1] - nFx[k][j]));
}
/* Calculate and update fluxes in the y-direction */
#pragma omp for
for (i = 1; i < m; i++) {
fy(Q, ffy, m, n, i);
for (j = 1; j < n; j++)
for (k = 0; k < cell_size; k++)
nFy[k][j] = 0.5 * ((ffy[k][j-1] + ffy[k][j]) -
dy/dt * (Q(k, i, j) - Q(k, i, j -1)));
for (j = 1; j < n-1; j++)
for (k = 0; k < cell_size; k++)
Q(k,i,j) = Q(k,i,j) - dt/dy * ((nFy[k][j+1] - nFy[k][j]));
}
}
Functions fx and fy are very simple, with no data dependencies. I can't put the #pragma omp parallel for above the first for loop because there are data races, but for now I can't see how to change this code to overcome them.
long int i, j, m, n, M, N;
char *ptr;
long int s;
int flag, verbose;
double *Q;
double *x, *y;
double **ffx, **nFx, **ffy, **nFy;
double dx, dt, epsi, delta, dy, tend, tmp, stime;
M = 1000;
N = 1000;
/* Add two ghost volumes at each side of the domain */
m = M + 2;
n = N + 2;
/* Allocate memory for the domain */
Q = (double *) malloc(m * n * cell_size * sizeof(double));
x = (double *) malloc(m * sizeof(double));
y = (double *) malloc(n * sizeof(double));
/* Allocate memory for fluxes */
ffx = (double **) malloc(cell_size * sizeof(double *));
ffy = (double **) malloc(cell_size * sizeof(double *));
nFx = (double **) malloc(cell_size * sizeof(double *));
nFy = (double **) malloc(cell_size * sizeof(double *));
ffx[0] = (double *) malloc(cell_size * m * sizeof(double));
nFx[0] = (double *) malloc(cell_size * m * sizeof(double));
ffy[0] = (double *) malloc(cell_size * n * sizeof(double));
nFy[0] = (double *) malloc(cell_size * n * sizeof(double));
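Regarding the data races: the scratch arrays ffx and nFx are indexed only by [k][j], so concurrent i-iterations overwrite each other's fluxes. One common fix is to give every thread its own scratch buffers. Below is a minimal sketch of the x-direction sweep done that way; it assumes cell_size is a compile-time constant and that fx() writes only into the flux array passed to it (this is my sketch, not code from the assignment).
#pragma omp parallel
{
    int i, j, k;
    /* thread-local scratch, laid out like the shared ffx/nFx */
    double **ffx_p = (double **) malloc(cell_size * sizeof(double *));
    double **nFx_p = (double **) malloc(cell_size * sizeof(double *));
    ffx_p[0] = (double *) malloc(cell_size * m * sizeof(double));
    nFx_p[0] = (double *) malloc(cell_size * m * sizeof(double));
    for (k = 1; k < cell_size; k++) {
        ffx_p[k] = ffx_p[0] + k * m;
        nFx_p[k] = nFx_p[0] + k * m;
    }
    #pragma omp for
    for (i = 1; i < n; i++) {
        fx(Q, ffx_p, m, n, i);            /* fluxes land in thread-local storage */
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m; j++)
                nFx_p[k][j] = 0.5 * ((ffx_p[k][j-1] + ffx_p[k][j])
                                     - dx/dt * (Q(k, j, i) - Q(k, j-1, i)));
        for (k = 0; k < cell_size; k++)
            for (j = 1; j < m-1; j++)
                Q(k, j, i) -= dt/dx * (nFx_p[k][j+1] - nFx_p[k][j]);
    }
    free(ffx_p[0]); free(nFx_p[0]);
    free(ffx_p);    free(nFx_p);
}
The y-direction sweep can be handled the same way with per-thread ffy/nFy buffers.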
Related
I'm trying to write a function that does naive matrix multiplication of two contiguous, row-major arrays. But when I attempt to print each value at the end I get garbage. I'm guessing it's because I've mixed up the proper iterations and scaling needed to jump rows/columns. Does anyone have any advice?
The full code necessary is below:
#include <stdio.h>
#include <stdlib.h>
void dmatmul(double *a, double *b, double *c, int astride, int bstride, int cdim_0, int cdim_1) {
int i, j, p;
for (i = 0; i < cdim_0; i++) {
for (j = 0; j < cdim_1; j++) {
c[i * cdim_1 + j] = 0.0;
for (p = 0; p < (astride); p++) {
c[i * cdim_1 + j] += a[i * (astride) + p] * b[p * (bstride) + j];
}
}
}
}
int main(void) {
double *x, *y, *z;
int xdim_0, xdim_1, ydim_0, ydim_1, zdim_0, zdim_1, i, j;
xdim_0 = 2;
xdim_1 = 4;
ydim_0 = 4;
ydim_1 = 2;
zdim_0 = 2;
zdim_1 = 2;
x = (double *) malloc (xdim_0 * xdim_1 * sizeof(double));
y = (double *) malloc (ydim_0 * ydim_1 * sizeof(double));
z = (double *) malloc (zdim_0 * zdim_1 * sizeof(double));
for (i = 0; i < xdim_0 * xdim_1; i++) {
x[i] = i + 1;
y[i] = 2 * (i + 1);
}
dmatmul(x, y, z, xdim_1, ydim_1, zdim_0, zdim_1);
printf("\nMatrix product of X and Y dimensions: (%d, %d)\n", zdim_0, zdim_1);
printf("Matrix product of X and Y values:");
for (i = 0; i < zdim_0; i++) {
printf("\n");
for (j = 0; j < zdim_1; i++) {
printf("\t%f", z[i * zdim_1 + j]);
}
}
return 0;
}
The primary problem is a typo in the inner for loop doing the printing. You have:
for (j = 0; j < zdim_1; i++)
but you need to increment j, not i:
for (j = 0; j < zdim_1; j++)
Here's my code, which has an independent matrix printing function appropriate for the arrays you're using:
/* SO 7516-7451 */
#include <stdio.h>
#include <stdlib.h>
static void dmatmul(double *a, double *b, double *c, int astride, int bstride, int cdim_0, int cdim_1)
{
int i, j, p;
for (i = 0; i < cdim_0; i++)
{
for (j = 0; j < cdim_1; j++)
{
c[i * cdim_1 + j] = 0.0;
for (p = 0; p < (astride); p++)
{
c[i * cdim_1 + j] += a[i * (astride) + p] * b[p * (bstride) + j];
}
}
}
}
static void mat_print(const char *tag, int rows, int cols, double *matrix)
{
printf("%s (%dx%d):\n", tag, rows, cols);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
printf("%4.0f", matrix[i * cols + j]);
putchar('\n');
}
}
int main(void)
{
int xdim_0 = 2;
int xdim_1 = 4;
int ydim_0 = 4;
int ydim_1 = 2;
int zdim_0 = 2;
int zdim_1 = 2;
double *x = (double *)malloc(xdim_0 * xdim_1 * sizeof(double));
double *y = (double *)malloc(ydim_0 * ydim_1 * sizeof(double));
double *z = (double *)malloc(zdim_0 * zdim_1 * sizeof(double));
for (int i = 0; i < xdim_0 * xdim_1; i++)
{
x[i] = i + 1;
y[i] = 2 * (i + 1);
}
mat_print("X", xdim_0, xdim_1, x);
mat_print("Y", ydim_0, ydim_1, y);
dmatmul(x, y, z, xdim_1, ydim_1, zdim_0, zdim_1);
mat_print("Z", zdim_0, zdim_1, z);
printf("\nMatrix product of X and Y dimensions: (%d, %d)\n", zdim_0, zdim_1);
printf("Matrix product of X and Y values:\n");
for (int i = 0; i < zdim_0; i++)
{
for (int j = 0; j < zdim_1; j++)
printf("\t%f", z[i * zdim_1 + j]);
printf("\n");
}
return 0;
}
I've also initialized the variables as I declared them. The code should, but does not, check that the memory was allocated.
When I ran this code without your printing, I got the correct result, so then I took a good look at that and saw the problem.
X (2x4):
1 2 3 4
5 6 7 8
Y (4x2):
2 4
6 8
10 12
14 16
Z (2x2):
100 120
228 280
Matrix product of X and Y dimensions: (2, 2)
Matrix product of X and Y values:
100.000000 120.000000
228.000000 280.000000
I wrote the code below for tiled matrix multiplication with AVX.
#include <x86intrin.h>
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void fill(double *mat, int n) {
register int i;
register int j;
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
mat[i * n + j] = (double) (i * n + j);
}
}
}
void blocked_mmul_avx(double *A, double *B, double *C, int n) {
register int row, block, chunk, sub_chunk;
int block_size = 4;
for (row = 0; row < n; row++)
for (block = 0; block < n; block += block_size) {
register __m256d c0 = _mm256_setzero_pd();
for (chunk = 0; chunk < n; chunk += block_size) {
for (sub_chunk = 0; sub_chunk < block_size; sub_chunk++) {
register __m256d a = _mm256_broadcast_sd(
&A[row * n + chunk + sub_chunk]);
register __m256d b = _mm256_loadu_pd(
&B[chunk * n + sub_chunk * n + block]);
c0 += a * b;
}
}
_mm256_storeu_pd(&C[row * n + block], c0);
}
}
int main() {
int n = 4096; //n = 512, 1024, 2048, 4096
double *A;
double *B;
double *C;
A = (double*) malloc(n * n * sizeof(double));
B = (double*) malloc(n * n * sizeof(double));
C = (double*) malloc(n * n * sizeof(double));
srand(0);
fill(A, n); // it is a function to load A and B with random numbers
fill(B, n);
printf("n = %d\n\n", n);
clock_t t0 = clock();
blocked_mmul_avx(A, B, C, n);
clock_t t1 = clock();
printf("Execution Time = %0.3f s \n\n", (float) (t1 - t0) / CLOCKS_PER_SEC);
free(A);
free(B);
free(C);
return 0;
}
If I set the size of the block to 4, the code works correctly. But if I increase it to 16, only 1/4 of the entries of the output matrix are correct. I don't know why this is happening.
I checked and there are 16 YMM registers, so I think the block size can be increased to 16. But my experiments show otherwise.
Any idea why this is happening?
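One likely explanation (my observation, not from a posted answer): c0 is a single __m256d, i.e. 4 doubles, so with block_size = 16 only the first 4 of every 16 output columns are ever computed and stored, which matches the 1/4-correct pattern. A 16-wide block needs four accumulators, roughly like this sketch (plain AVX, no FMA, same layout as the question's code):
void blocked_mmul_avx_16(double *A, double *B, double *C, int n) {
    const int block_size = 16;
    for (int row = 0; row < n; row++)
        for (int block = 0; block < n; block += block_size) {
            __m256d acc[4];                              /* 4 x 4 doubles = 16 columns */
            for (int v = 0; v < 4; v++)
                acc[v] = _mm256_setzero_pd();
            for (int chunk = 0; chunk < n; chunk += block_size)
                for (int sub = 0; sub < block_size; sub++) {
                    __m256d a = _mm256_broadcast_sd(&A[row * n + chunk + sub]);
                    for (int v = 0; v < 4; v++) {
                        __m256d b = _mm256_loadu_pd(&B[(chunk + sub) * n + block + 4 * v]);
                        acc[v] = _mm256_add_pd(acc[v], _mm256_mul_pd(a, b));
                    }
                }
            for (int v = 0; v < 4; v++)                  /* store all 16 columns of the block */
                _mm256_storeu_pd(&C[row * n + block + 4 * v], acc[v]);
        }
}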
I know OpenMP shares all variables declared in an outer scope between all workers, and that may be the answer to my question. But I'm really confused about why function omp3 delivers the right result while function omp2 delivers a wrong result.
void omp2(double *A, double *B, double *C, int m, int k, int n) {
for (int i = 0; i < m; ++i) {
#pragma omp parallel for
for (int ki = 0; ki < k; ++ki) {
for (int j = 0; j < n; ++j) {
C[i * n + j] += A[i * k + ki] * B[ki * n + j];
}
}
}
}
void omp3(double *A, double *B, double *C, int m, int k, int n) {
for (int i = 0; i < m; ++i) {
for (int ki = 0; ki < k; ++ki) {
#pragma omp parallel for
for (int j = 0; j < n; ++j) {
C[i * n + j] += A[i * k + ki] * B[ki * n + j];
}
}
}
}
The problem is that there is a race condition in this line:
C[i * n + j] += ...
Different threads can read and write the same memory location (C[i * n + j]) simultaneously, which causes a data race. In omp2 this race can occur because the parallelized index ki does not appear in the index of C, so several threads may update the same element at once; in omp3 the parallelized index is j, which does appear in the index of C, so each thread touches distinct elements.
The solution (as suggested by @Victor Eijkhout) is to reorder your loops and use a local variable to accumulate the sum of the innermost loop. In this case C[i * n + j] is updated only once, so you get rid of the data race, and the outermost loop can be parallelized (which gives the best performance):
#pragma omp parallel for
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
double sum=0;
for (int ki = 0; ki < k; ++ki) {
sum += A[i * k + ki] * B[ki * n + j];
}
C[i * n + j] += sum;
}
}
Note that you can use the collapse(2) clause, which may increase the performance.
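For example, a sketch of the same loop nest with collapse(2) applied to the two outer loops (same variable names as above):
/* The combined i-j iteration space is distributed across the threads. */
#pragma omp parallel for collapse(2)
for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
        double sum = 0;
        for (int ki = 0; ki < k; ++ki)
            sum += A[i * k + ki] * B[ki * n + j];
        C[i * n + j] += sum;
    }
}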
I have written the following function in C
double * transpose(double *M, int n) {
double *T = (double *) malloc(n * n * sizeof(double));
int i, j;
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
T[i + (j * n)] = M[(i * n) + j];
}
}
return T;
}
and I call it as such:
C = transpose(C, n);
where C was previously declared as
double *C = (double *) malloc(n * n * sizeof(double));
and then initialised with values.
How can I, instead of returning T, set my function type to void and then call the equivalent of *M = *T instead of my return statement. In other words, how can I call the function like:
transpose(C, n);
so that *C is pointing to the memory allocation created by *T?
EDIT:
As pointed out by wildplasser below, a more efficient way of transposing the matrix in place would be to swap the {i, j} pairs, except along the diagonal.
Something along the lines of this:
void * transpose(double *M, int n) {
int i, j;
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
if (i != j) {
double temp = M[i + (j * n)];
M[i + (j * n)] = M[(i * n) + j];
M[(i * n) + j] = temp;
}
}
}
}
However, calling this as
transpose(C, n);
does not allow C to keep its transposition after the function. What am I doing wrong here?
Also GCC is giving me the warning
Utilities.c: In function 'transpose':
Utilities.c:34:1: warning: control reaches end of non-void function [-Wreturn-type]
}
^
I have defined my function as void in both the header and source file?
void transpose(double *arr, size_t siz)
{
size_t ii,jj;
for(ii=0;ii<siz;ii++) {
for(jj=ii+1;jj<siz;jj++){
double tmp;
size_t aa,bb;
aa = ii+ siz * jj;
bb = jj+ siz * ii;
tmp = arr[aa];
arr[aa] = arr[bb];
arr[bb] = tmp;
}
}
}
Don't worry about the aa and bb variables. Any decent compiler will optimize them away.
First of all, your code has a memory leak. You overwrite the previous value of C without ever freeing it.
To transpose in place, you just need to swap all the right indexes without swapping any pair twice, so it's similar to the problem of reversing an array in place.
void transpose(double *M, int n) {
int i, j;
double temp;
for(i = 0; i < n; i++) {
for(j = 0; j < i; j++) {
temp = M[i + j * n];
M[i + j * n] = M[j + i * n];
M[j + i * n] = temp;
}
}
}
void transpose(double **C, double *M, int n) {
double *T = (double *) malloc(n * n * sizeof(double));
int i, j;
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
T[i + (j * n)] = M[(i * n) + j];
}
}
*C = T;
}
And call your function like this:
transpose(&C, C, n);
As the comments suggest, free the old C to avoid a memory leak.
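A usage sketch of that (the variable name old is mine, not from the answer):
double *old = C;        /* keep the original pointer so it can still be freed */
transpose(&C, old, n);  /* C now points at the newly allocated transpose */
free(old);              /* release the original matrix */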
I am writing a blocked matrix multiplication algorithm for n x n matrices. My matrices are stored as 1D arrays. My first version of the algorithm works fine:
double * blocked_ijk_matmul(double *A, double *B, int n, int b) {
double *C = (double *) malloc(n * n * sizeof(double));
int i_block, j_block, k_block, i, j, k;
for (i_block = 0; i_block < n; i_block += b) {
for (j_block = 0; j_block < n; j_block += b) {
for (k_block = 0; k_block < n; k_block += b) {
for (i = i_block; i < fmin(i_block + b, n); ++i) {
for (j = j_block; j < fmin(j_block + b, n); ++j) {
for (k = k_block; k < fmin(k_block + b, n); ++k) {
C[(i * n) + j] += A[(i * n) + k] * B[(k * n) + j];
}
}
}
}
}
}
return C;
}
However, in this algorithm C[(i * n) + j] is updated quite a large number of times, depending on the size of the matrices. If I instead try to store this sum and then set C[(i * n) + j] to the total sum when all summations are complete, I get incorrect results:
double * blocked_ijk_matmul(double *A, double *B, int n, int b) {
double *C = (double *) malloc(n * n * sizeof(double));
int i_block, j_block, k_block, i, j, k;
for (i_block = 0; i_block < n; i_block += b) {
for (j_block = 0; j_block < n; j_block += b) {
for (k_block = 0; k_block < n; k_block += b) {
for (i = i_block; i < fmin(i_block + b, n); ++i) {
for (j = j_block; j < fmin(j_block + b, n); ++j) {
double sum = 0;
for (k = k_block; k < fmin(k_block + b, n); ++k) {
sum += A[(i * n) + k] * B[(k * n) + j];
}
C[(i * n) + j] = sum;
}
}
}
}
}
return C;
}
I have not been able to figure out why this is not working for quite some time. Clearly, double sum = 0; and C[(i * n) + j] = sum; need to be placed somewhere else, but I cannot figure out where.
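For reference, one way the accumulation could stay correct (a sketch of mine, not from a posted answer): each (i, j) entry receives one partial sum per k-block, so the block's sum has to be added to C rather than assigned, and C has to start at zero, e.g. with calloc instead of malloc. Assuming the same headers as the question:
double * blocked_ijk_matmul(double *A, double *B, int n, int b) {
    double *C = (double *) calloc(n * n, sizeof(double));   /* zero-initialized */
    int i_block, j_block, k_block, i, j, k;
    for (i_block = 0; i_block < n; i_block += b)
        for (j_block = 0; j_block < n; j_block += b)
            for (k_block = 0; k_block < n; k_block += b)
                for (i = i_block; i < fmin(i_block + b, n); ++i)
                    for (j = j_block; j < fmin(j_block + b, n); ++j) {
                        double sum = 0;
                        for (k = k_block; k < fmin(k_block + b, n); ++k)
                            sum += A[(i * n) + k] * B[(k * n) + j];
                        C[(i * n) + j] += sum;   /* accumulate this k-block's contribution */
                    }
    return C;
}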