/*** Every function seems to work correctly, but after only the first iteration the program dies with a collective abort. Can anyone tell me what the reason is or could be? ***/
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"
const double tolerance = 0.00001;
const int maxit = 10000;
void MPE_decomp1d(int n, int size, int id, int *s, int *e)
{
/*****calculating start and end row for every process*****/
*s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
*e = *s + (n/size)+((n%size)>id);
}
void onedinit(double **a, double **b, double **f, const int nx, const int s, const int e)
{
int i, j;
int ls, le;
ls = s - (s!=0);
le = e + (e!=nx);
/*** setting all the initial values to zero ***/
for (i = ls; i < le; i++)
{
for (j = 0; j < nx; j++)
{
a[i][j] = b[i][j] = f[i][j] = 0.0;
}
}
//***************************Defining Boundary Condition***********************************//
/***setting left boundary to 1***/
for (i = ls; i < le; i++) a[i][0] = b[i][0] = 1;
/*** setting the top boundary row a(0, i) to 2 ***/
if (s==0) for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int s, const int e, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
int rank, coord;
MPI_Status status;
MPI_Comm_rank(comm1d, &rank);
MPI_Cart_coords(comm1d, rank, 1, &coord);
/*** if the process id is odd then send first; if it is even then receive first, to avoid deadlock ***/
if (coord&1)
{
if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}
}
void sweep1d(double **a, double **f, int nx, const int s, const int e, double **b)
{
int i, j;
int rows;
rows = e - s - (s==0) - (e==0);
nx -= 1;
double h = 1.0 / (double)nx;
for (i = 1; i <= rows; i++) for (j = 1; j < nx; j++)
b[i][j] = 0.25 * (a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
return;
}
double diff(double **a, double **b, const int nx, int s, int e)
{
double sum = 0.0;
int i, j;
int st, ed;
st = (s!=0);
ed = e-s+(s!=0);
for (i = st; i < ed; i++) for (j = 0; j < nx; j++)
sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
return sum;
}
int main(int argc, char *argv[])
{
int nx, ny;
int myid, root, numprocs, period=0;
int nbrbottom, nbrtop, s, e, it;
double diffnorm, dwork;
double t1, t2;
double **a, **b, **f;
root = 0;
MPI_Comm comm1d;
MPI_Init(&argc, &argv);;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!myid)
{
/*** for this piece of code nx and ny are assumed to be the same ***/
printf("Enter the number of cells in X & Y direction\n");
scanf("%d %d", &nx, &ny);
nx += 1;
ny += 1;
ny = nx; //forced to follow our assumption;
}
MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);
MPI_Comm_rank(comm1d, &myid);
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
MPE_decomp1d(ny, numprocs, myid, &s, &e);
int ls, le, rows;
int i, j;
ls = s - (s!=0);
le = e + (e!=nx);
rows = le - ls;
a = (double**)malloc(rows*sizeof(double*));
b = (double**)malloc(rows*sizeof(double*));
f = (double**)malloc(rows*sizeof(double*));
for (i = ls; i < le; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
b[i] = (double*)malloc(nx*sizeof(double));
f[i] = (double*)malloc(nx*sizeof(double));
}
onedinit(a, b, f, nx, s, e);
diffnorm = 0.0;
it = 0;
do
{
// printf("%danshu\n", myid);
exchng1(a, nx, s, e, comm1d, nbrbottom, nbrtop);
sweep1d(a, f, nx, s, e, b);
exchng1(b, nx, s, e, comm1d, nbrbottom, nbrtop);
sweep1d(b, f, nx, s, e, a);
dwork = diff(a, b, nx, s, e);
/************printing matrix a after every iteration******/
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
printf("\n");
}
MPI_Barrier(comm1d);
//printf("%lfhehe\n", dwork);
MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
//printf("%dhere\n", myid);
}
while (++it < maxit && diffnorm > tolerance);
MPI_Finalize();
return 0;
}
So just dumping 130 lines of code on SO and asking why it doesn't work is probably not the best way to get good answers - especially when the only actual sentence you write is "every function is working"... if that were the case, you wouldn't have a problem. You need to narrow things down to some more specific case and ask a more specific question.
In this particular case, I've seen lots of code like this in the past while teaching, so it's feasible to see some of what's going on.
First off, you can't do stuff like this:
ls = s - (s!=0);
le = e + (e!=nx);
rows = le - ls;
a = (double**)malloc(rows*sizeof(double*));
/*...*/
for (i = ls; i < le; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
/*...*/
}
If you have 100 rows broken up across 4 processes, and you're (say) MPI task 2, then your s is 50 and e is 75, so ls would be 49 and le would be 76, and you're trying to access a[49..75] even though you've only allocated a with 27 rows! That particular error comes up all over the code and needs to be fixed. You want to be accessing a[0..rows-1].
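To make that concrete, the pattern you want looks roughly like this (a sketch only; locrows is a made-up name for the number of interior rows this rank owns, the two extra rows are the guard cells, and the special-casing of the first and last ranks is ignored):
int rows = locrows + 2;                        /* interior rows plus two guard rows */
double **a = (double**)malloc(rows*sizeof(double*));
for (int i = 0; i < rows; i++)                 /* local indices, always 0..rows-1 */
    a[i] = (double*)malloc(nx*sizeof(double));
/* every later loop over this rank's data also runs over 0..rows-1 */
for (int i = 0; i < rows; i++)
    for (int j = 0; j < nx; j++)
        a[i][j] = 0.0;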
Incidentally, I haven't even checked to see if MPE_decomp1d actually does the right thing. We all go through the phase where we think it's cute in C to put things in one line by using logical expressions multiplied by ternary operators, etc, but seriously, it makes your code unnecessarily tedious to disentangle when someone else has to fix it -- whether it's SOers or yourself 2 months later.
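For what it's worth, here is a sketch of how the same block decomposition reads when it's spelled out (assuming the one-liners really do mean "the first n%size ranks each get one extra row"):
void decomp1d_readable(int n, int size, int id, int *s, int *e)
{
    int base  = n / size;    /* every rank gets at least this many rows */
    int extra = n % size;    /* the first 'extra' ranks get one more */
    if (id < extra) {
        *s = id * (base + 1);
        *e = *s + base + 1;
    } else {
        *s = extra * (base + 1) + (id - extra) * base;
        *e = *s + base;
    }
}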
In exchng1, you're doing unnecessary work. You don't need to check whether nbrbottom or nbrtop are valid; if they aren't, MPI_Cart_shift returns MPI_PROC_NULL, to which sending or receiving is a no-op. So sending/receiving to or from those ranks is harmless, which is a great design decision, because it avoids lots of corner cases in the logic.
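As an aside, comparing the neighbour ranks against -1 isn't the portable test anyway; the value that comes back for a missing neighbour is MPI_PROC_NULL, which is all you need to know:
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
/* On a non-periodic boundary the missing neighbour is MPI_PROC_NULL (not
 * necessarily -1); a send or receive addressed to MPI_PROC_NULL completes
 * immediately and does nothing, so no conditionals around the calls are needed. */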
Similarly, to avoid deadlock you can use MPI_Sendrecv rather than individual Sends and Recvs. That plus the above means that instead of this:
if (coord&1)
{
if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}
you can do this:
MPI_Sendrecv(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
-- way simpler, right?
There's still a problem in the exchange, though; receiving into a[e-s+1] isn't right, although as I've mentioned, I can't be bothered decrypting MPE_decomp1d to figure out why. Presumably you want to be receiving into a[rows-1].
Finally, the MPI_Barrier() is slow and completely unnecessary; there's enough synchronization in the guard-cell exchanges (to say nothing of the Allreduce) that you don't need it.
When all those changes are made, the code runs without memory access problems; you'll have to check that it gives the right answers.
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"
const double tolerance = 0.00001;
const int maxit = 10000;
void MPE_decomp1d(int n, int size, int id, int *rows)
{
int s, e;
s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
e = s + (n/size)+((n%size)>id);
*rows = e - s - (s==0) - (e==0);
}
void onedinit(double **a, double **b, double **f, const int nx, const int rows, const int id, const int nprocs)
{
int i, j;
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++)
{
a[i][j] = b[i][j] = f[i][j] = 0.0;
}
}
for (i = 0; i < rows; i++) a[i][0] = b[i][0] = 1;
if (id == 0)
for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int rows, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
int rank, coord;
MPI_Status status;
MPI_Comm_rank(comm1d, &rank);
MPI_Cart_coords(comm1d, rank, 1, &coord);
/* send data downwards */
MPI_Sendrecv(a[rows-2], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
/* send data upwards */
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[rows-1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
void sweep1d(double **a, double **f, const int nx, const int rows, double **b)
{
int i, j;
double h = 1.0 / (double)nx;
for (i = 1; i < rows-1; i++) for (j = 1; j < nx-1; j++)
b[i][j] =
0.25 * ( a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
return;
}
double diff(double **a, double **b, const int nx, const int rows)
{
double sum = 0.0;
int i, j;
for (i = 0; i < rows; i++) for (j = 0; j < nx; j++)
sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
return sum;
}
int main(int argc, char *argv[])
{
int nx, ny;
int myid, root, numprocs, period=0;
int nbrbottom, nbrtop, it;
double diffnorm, dwork;
double **a, **b, **f;
root = 0;
MPI_Comm comm1d;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!myid)
{
/*** for this piece of code nx and ny are assumed to be the same ***/
printf("Enter the number of cells in X & Y direction\n");
scanf("%d %d", &nx, &ny);
nx += 1;
ny += 1;
ny = nx; //forced to follow our assumption;
}
MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);
MPI_Comm_rank(comm1d, &myid);
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
int rows;
MPE_decomp1d(ny, numprocs, myid, &rows);
int i, j;
a = (double**)malloc(rows*sizeof(double*));
b = (double**)malloc(rows*sizeof(double*));
f = (double**)malloc(rows*sizeof(double*));
for (i = 0; i < rows; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
b[i] = (double*)malloc(nx*sizeof(double));
f[i] = (double*)malloc(nx*sizeof(double));
}
onedinit(a, b, f, nx, rows, myid, numprocs);
diffnorm = 0.0;
it = 0;
do
{
exchng1(a, nx, rows, comm1d, nbrbottom, nbrtop);
sweep1d(a, f, nx, rows, b);
exchng1(b, nx, rows, comm1d, nbrbottom, nbrtop);
sweep1d(b, f, nx, rows, a);
dwork = diff(a, b, nx, rows);
/************printing matrix a after every iteration******/
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
printf("\n");
}
//printf("%lfhehe\n", dwork);
MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
//printf("%dhere\n", myid);
}
while (++it < maxit && diffnorm > tolerance);
MPI_Finalize();
return 0;
}
Related
I have the following code
MPI_Init(NULL, NULL);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int n_chunks = 16;
assert(n % n_chunks == 0);
int chunk_size = n / n_chunks;
int psizes[2] = {0, 0};
MPI_Dims_create(world_size, 2, psizes);
MPI_Datatype *dist_types = (MPI_Datatype *) malloc(world_size * sizeof(MPI_Datatype));
for (int i = 0; i < world_size; i++) {
int sizes[2] = {n, n};
int distribs[2] = {MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC};
int dargs[2] = {chunk_size, chunk_size};
MPI_Type_create_darray(world_size, i, 2,
sizes, distribs, dargs, psizes,
MPI_ORDER_C, MPI_DOUBLE, &dist_types[i]);
MPI_Type_commit(&dist_types[i]);
}
MPI_Request *send_requests;
if (rank == 0) {
send_requests = (MPI_Request *) malloc(world_size * sizeof(MPI_Request));
for (int i = 0; i < world_size; i++) {
MPI_Isend(&A[0][0], 1, dist_types[i],
i, 0, MPI_COMM_WORLD, &send_requests[i]);
}
}
int dist_size;
MPI_Type_size(dist_types[rank], &dist_size);
dist_size /= sizeof(double);
double *D = (double *) malloc(dist_size * sizeof(double));
MPI_Request recv_request;
MPI_Irecv(D, dist_size, MPI_DOUBLE,
0, 0, MPI_COMM_WORLD, &recv_request);
MPI_Wait(&recv_request, MPI_STATUS_IGNORE);
if (rank == 0) {
MPI_Waitall(1, send_requests, MPI_STATUSES_IGNORE);
}
int m = n / psizes[0];
if (rank == 0) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < m; j++) {
printf("%.2lf ", D[i * m + j]);
}
printf("\n");
}
When I print out the matrix D, I don't get a block cyclic view of A as I'd expect. Rather, the entries are all jumbled up and apart from the top row they look quite random.
Hence, my question is: can I generally expect this to work, or are you not really supposed to use MPI_Type_create_darray in this situation? I'm wondering because, from what I could find online, people only mention the function in the context of MPI-IO, and I couldn't locate a single example of it being used in a way similar to what I have.
I'm an MPI novice, so maybe I'm just doing something wrong that's unrelated to the type I'm using. Also, I did read that it's not really ideal to distribute your matrix this way and that one should rather use MPI-IO, but I can't really change that.
1. Goal
I am implementing matrix multiplication by the Fox method for square m x n matrices, AB = C, with world_size = 4 processes arranged as a topological mesh/grid:
P0-P1
| |
P2-P3
where - represents the mesh_r (mesh_rows) communicator and | represents the mesh_c (mesh_columns) communicator, built through the build_mesh procedure.
2. My code
int main(int argc, char *argv[])
{
int process_rank, world_size;
int mesh_rows, mesh_columns;
int mesh_dimension = 2;
int *process_coordinates;
MPI_Comm mesh, mesh_r, mesh_c;
int process_rank_mesh;
int *A, *A_loc;
int *B, *B_loc;
int *C, *C_loc;
int m, n, mloc, nloc;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (process_rank == 0) {
m = n = /*world_size * 1*/ 2; // multiple of world_size = 4
}
MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
A = fill_matrix(A, m, n);
B = fill_matrix(A, m, n);
C = (int*) calloc(m * n, sizeof(int));
if (process_rank == 0)
mesh_rows = 2;
if (is_divisible(world_size, mesh_rows))
mesh_columns = world_size / mesh_rows;
else {
mesh_rows = 1;
mesh_columns = world_size / mesh_rows;
}
MPI_Bcast(&mesh_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&mesh_columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
process_coordinates = (int*) calloc(mesh_dimension, sizeof(int));
build_mesh(&mesh, &mesh_r, &mesh_c, process_rank, world_size, mesh_rows, mesh_columns, process_coordinates);
MPI_Comm_rank(mesh, &process_rank_mesh);
mloc = m / mesh_rows;
nloc = m / mesh_columns;
handle_errors(m, n, world_size, process_rank);
A_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(A, A_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
B_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(B, B_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
C_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(C, C_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
int *A_loc_add = (int*) calloc(mloc * nloc, sizeof(int)); // additional block of A sent by the processes on the diagonal to those in mesh_r
// START BMR
memcpy(A_loc_add, A_loc, sizeof(A_loc) * mloc);
MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, f(process_rank), mesh_r);
// Compute Cij = AB for each process
for (int i = 0; i < m; i++) {
if (process_rank == 0 || process_rank == 3)
C_loc[i] += A_loc[i] * B_loc[i];
else
C_loc[i] += A_loc_add[i] * B_loc[i];
}
for (int i = 1; i < m; i++) {
// Broadcast
memcpy(A_loc_add, A_loc, sizeof(A_loc) * mloc);
MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, !f(process_rank), mesh_r);
// Rolling
int *t_loc = (int*) calloc(mloc * nloc, sizeof(int)); // temporary variable needed for the swap
memcpy(t_loc, B_loc, sizeof(B_loc) * mloc);
MPI_Status status;
int mate; // P0 <-> P2 and P1 <-> P3 - that is, swap B_loc within the mesh_c communicator
if (process_rank == 0) {
mate = 2;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
} else if (process_rank == 1) {
mate = 3;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
} else if (process_rank == 2) {
mate = 0;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
} else if (process_rank == 3) {
mate = 1;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
}
memcpy(B_loc, t_loc, sizeof(t_loc) * mloc);
free(t_loc);
// multiply
dot_product(A_loc_add, C_loc, B_loc, m);
}
// END BMR
MPI_Finalize();
return 0;
}
void dot_product(int *A_loc_add, int *C_loc, int *B_loc, int m)
{
for (int i = 0; i < m; i++)
C_loc[i] += A_loc_add[i] * B_loc[i];
}
int f(int process_rank)
{
if (process_rank == 0 || process_rank == 1)
return 0;
else
return 1;
}
void distribute(int *Mat, int *Mat_loc, int m, int n, int mloc, int nloc, int world_size, int mesh_rows, int mesh_columns)
{
MPI_Datatype square_block;
int stride = n;
int count = mloc;
int block_length = nloc;
MPI_Type_vector(count, block_length, stride, MPI_INT, &square_block);
MPI_Datatype square_block_resized;
MPI_Type_create_resized(square_block, 0, sizeof(int), &square_block_resized);
MPI_Type_commit(&square_block_resized);
int *send_counts = (int*) calloc(world_size, sizeof(int));
int *displs = (int*) calloc(world_size, sizeof(int));
for (int i = 0; i < mesh_rows; i++) {
for (int j = 0; j < mesh_columns; j++) {
send_counts[i * mesh_columns + j] = 1;
displs[i * mesh_columns + j] = i * n * block_length + j * block_length;
}
}
MPI_Scatterv(Mat, send_counts, displs, square_block_resized, Mat_loc, mloc * nloc, MPI_INT, 0, MPI_COMM_WORLD);
}
void handle_errors(int m, int n, int world_size, int process_rank)
{
if (process_rank == 0) {
if (m != n) {
perror("Not square matrices\n");
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}
if (world_size != 4) {
perror("World size must be 4\n");
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}
}
}
bool is_divisible(int dividend, int divisor)
{
return dividend % divisor == 0;
}
void build_mesh(MPI_Comm *mesh, MPI_Comm *mesh_r, MPI_Comm *mesh_c, int process_rank, int world_size,
int mesh_rows, int mesh_columns, int *process_coordinates)
{
int mesh_dimension = 2;
int *mesh_n_dimension;
int mesh_reorder = 0;
int *mesh_period;
int *remain_dims = (int*) calloc(mesh_dimension, sizeof(int));
mesh_n_dimension = (int*) calloc(mesh_dimension, sizeof(int));
mesh_n_dimension[0] = mesh_rows;
mesh_n_dimension[1] = mesh_columns;
mesh_period = (int*) calloc(mesh_dimension, sizeof(int));
mesh_period[0] = mesh_period[1] = 0;
MPI_Cart_create(MPI_COMM_WORLD, mesh_dimension, mesh_n_dimension, mesh_period, mesh_reorder, mesh);
MPI_Cart_coords(*mesh, process_rank, mesh_dimension, process_coordinates);
remain_dims[0] = 0;
remain_dims[1] = 1;
MPI_Cart_sub(*mesh, remain_dims, mesh_r);
remain_dims[0] = 1;
remain_dims[1] = 0;
MPI_Cart_sub(*mesh, remain_dims, mesh_c);
}
int *fill_matrix(int *Mat, int m, int n)
{
int k = 0;
Mat = (int*) calloc(m * n, sizeof(int));
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
Mat[i * n + j] = ++k;
return Mat;
}
3. Result
My algorithm is working fine if m = 2:
C00:
7
C01:
10
C10:
15
C11:
22
as you can see here
but makes mistakes when calculating C for other values of m (e.g. world_size * 1):
C00:
58 92
242 308
C01:
78 120
294 368
C10:
198 260
494 588
C11:
274 344
602 704
whose expected result C is:
Can you help me? You can use this online matrix multiplication tool for debugging and to calculate the expected result. I am 100% sure that the build_mesh and distribute procedures are correct: the bug must be in the BMR section.
I am new to MPI and I am trying to write a mini C program that calculates the percentage ratio of the numbers that the user inputs.
The percentage ratio is calculated by that expression
`δi = ((xi - xmin) / (xmax - xmin)) * 100`.
The numbers that the user inputs are stored in an array of fixed size data[100] and are scattered to all processes (this program is supposed to work only with four processes).
The problem I am facing is that the division doesn't work although all the processes have the data. For example if the user inputs the numbers {1, 2, 3, 4} the expected percentage ratio according to the mathematical expression is {0, 33.3, 66.6, 100} but instead I am getting {0,0,100,100}. This is what I have.
#include <stdio.h>
#include "mpi.h"
int main(int argc, char** argv){
int my_rank;
int total_processes;
int root = 0;
int data[100];
int loc_data[100];
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &total_processes);
int input_size = 0;
if (my_rank == 0){
printf("Input how many numbers: ");
scanf("%d", &input_size);
printf("Input the elements of the array: ");
for(int i=0; i<input_size; i++){
scanf("%d", &data[i]);
}
}
MPI_Bcast(&input_size, 1, MPI_INT, root, MPI_COMM_WORLD);
int loc_num = input_size/total_processes;
MPI_Scatter(&data, loc_num, MPI_INT, loc_data, loc_num, MPI_INT, root, MPI_COMM_WORLD);
int global_max = 0;
int global_min = 0;
MPI_Reduce(&loc_data, &global_max, 1, MPI_INT, MPI_MAX, root, MPI_COMM_WORLD);
MPI_Reduce(&loc_data, &global_min, 1, MPI_INT, MPI_MIN, root, MPI_COMM_WORLD);
float loc_delta[100];
int x = 0;
int y = 0;
float p = 0;
for(int j = 0; j< loc_num; j++){
x = loc_data[j] - global_min;
y = global_max - global_min;
}
MPI_Bcast(&y, 1, MPI_INT, root, MPI_COMM_WORLD);
for(int j = 0; j< loc_num ; j++){
p = (x / y) * 100;
printf("p= %f \n", p);
loc_delta[j] = p;
}
float final_delta[100];
MPI_Gather(&loc_delta, 1, MPI_FLOAT, final_delta, 1, MPI_FLOAT, root, MPI_COMM_WORLD);
if(my_rank == 0){
printf("max number: %d\n", global_max);
printf("min number: %d\n", global_min);
for(int i = 0; i<input_size; i++)
printf("delta[%d]: %.2f | ", i+1, final_delta[i]);
}
printf("\n");
MPI_Finalize();
return 0;
}
There are several issues with your code.
First:
int global_max = 0;
int global_min = 0;
MPI_Reduce(&loc_data, &global_max, 1, MPI_INT, MPI_MAX, root, MPI_COMM_WORLD);
MPI_Reduce(&loc_data, &global_min, 1, MPI_INT, MPI_MIN, root, MPI_COMM_WORLD);
unfortunately,
"MPI does not get the minimum of all elements in the array, you have to do that manually." (source)
Therefore, one first needs to calculate the min and max within each process's array, and then reduce those local results across the processes. Since all processes should end up with the min and max of the whole array, you should use MPI_Allreduce instead of MPI_Reduce. Your code would then look like the following:
int local_max = loc_data[0];
int local_min = loc_data[0];
for(int i = 1; i < loc_num; i++){
local_max = (local_max > loc_data[i]) ? local_max : loc_data[i];
local_min = (local_min < loc_data[i]) ? local_min : loc_data[i];
}
int global_max = local_max;
int global_min = local_min;
MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
MPI_Allreduce(&local_min, &global_min, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
Unless you are assuming that loc_num=1, which you should not, this code
for(int j = 0; j< loc_num; j++){
x = loc_data[j] - global_min;
y = global_max - global_min;
}
overwrites the same x and y on every iteration. Moreover, you should not call MPI_Bcast(&y, 1, MPI_INT, root, MPI_COMM_WORLD); you want all the processes to first calculate their part in parallel based on the formula:
δi = ((xi - xmin) / (xmax - xmin)) * 100.
and only then send their work back to the master process. So each process should apply that formula to its own portion of the input, store the results in an array, and send it back to the master process. Like so:
float loc_delta[100];
float y = global_max - global_min;
for(int j = 0; j< loc_num; j++){
loc_delta[j] = (((float) (loc_data[j] - global_min) / y) * 100.0);
}
float final_delta[100];
MPI_Gather(&loc_delta, loc_num, MPI_FLOAT, final_delta, loc_num, MPI_FLOAT, root, MPI_COMM_WORLD);
Notice that I am casting loc_data[j] - global_min to float in (((float) (loc_data[j] - global_min) / y) * 100.0); otherwise, with integer operands, C would perform integer division and truncate the result.
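(A purely hypothetical two-liner to show why the cast matters: with loc_data[j] = 2, global_min = 1 and a range of 3, integer arithmetic throws the fractional part away.)
int   truncated = ((2 - 1) / 3) * 100;               /* integer division: 0 */
float kept      = ((float)(2 - 1) / 3.0f) * 100.0f;  /* float division: 33.333... */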
I am trying to implement a master/slave scheme that computes the Mandelbrot set and prints it into a PPM file. This is what I have so far:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi/mpi.h>
int calculateMan (double , double ); //calculateMandelbrotSet
MPI_Status status;
struct Number {
double R;
double i;
} Z,C;
const int color;
int colorTemp; //color value
const int max = 1000; //max iteration value
const int ResHeight = 800; //Resolution
const int ResWidth = 800;
double CRMax = 1.5;
double CIMax = 2.0;
double CRMin = -2.5;
double CIMin = -2.0; //Constant values
double colorWidth;
double colorHeight;
int main (int argc, char** argv) {
int rank, size = 0;
int nodos, source, dest;
double startTime, endTime;
//Rank = current process ID
//Size = amount of processes
MPI_Init (&argc, &argv); // starts MPI
startTime = MPI_Wtime();
MPI_Comm_size (MPI_COMM_WORLD, &size); // get number of processes
MPI_Comm_rank (MPI_COMM_WORLD, &rank); // get current process
nodos = size - 1;
if (rank == 0) { // MASTER --------------------------------------
colorHeight = (CIMax - CIMin) / ResHeight;
colorWidth = (CRMax - CRMin) / ResWidth;
FILE *fp;
fp = fopen("Mandelbrot.ppm","w");
fprintf(fp,"P3\n %d\n %d\n %d\n",ResWidth,ResHeight,255); //Magic Number & Header
for (int row = 0; row < ResHeight; row++) {
C.i= CIMin + row*colorHeight;
for (int column = 0; column < ResWidth; column++) {
C.R = CRMin + column*colorWidth;
//data sends
for (dest = 1; dest <= nodos; dest++) {
MPI_Send(&C.R, sizeof(double), MPI_DOUBLE, dest, column, MPI_COMM_WORLD);
MPI_Send(&C.i, sizeof(double), MPI_DOUBLE, dest, column, MPI_COMM_WORLD);
}
}
}
for (int row = 0; row < ResHeight; row++) {
for (int column = 0; column < ResWidth; column++) {
//Recv and print
MPI_Recv(&colorTemp, sizeof(int), MPI_DOUBLE, source, 0, MPI_COMM_WORLD, &status);
fprintf(fp, "%d %d %d\n", colorTemp, 1,3);
}
}
fclose(fp);
} //------------------------- END MASTER
if (rank > 0) // START SLAVE --------------------------------------
{
for (int row = 0; row < ResHeight; row++) {
for (int column = 0; column < ResWidth; column++) {
MPI_Recv(&C.R, sizeof(double), MPI_DOUBLE, 0, column, MPI_COMM_WORLD, &status);
MPI_Recv(&C.i, sizeof(double), MPI_DOUBLE, 0, column, MPI_COMM_WORLD, &status);
colorTemp = calculateMan(C.R, C.i);
MPI_Send(&colorTemp, sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
} // SLAVE END---------------------------------
endTime = MPI_Wtime(); //stop timer
MPI_Finalize(); //end MPI
printf("Time: %.6f\n", endTime-startTime);
exit(0); //end program
}
int calculateMan (double CReal, double CImaginary) {
int i = 0;
Z.R = 0.0;
Z.i = 0.0;
while (((i < max) && (Z.R*Z.R) + (Z.i * Z.i) < 4))
{
double temp = (Z.R * Z.R) - (Z.i * Z.i) + CReal;
Z.i = 2.0 * Z.R * Z.i + CImaginary;
Z.R = temp;
i++;
}
if (i == max)
return 0; //interior is black
else
return 255; //exterior white
}
I am trying to run my program but I cannot figure out why the receive-and-print loop never terminates. Also, can anyone have a look at the code and point out any other issues or things I should look out for, for future reference?
Thanks!
I am new to MPI. I need to write a program for matrix multiplication on a 2D topology (grid). The first matrix (A) is distributed along the x coordinate and the second matrix (B) along the y coordinate. Every process computes one submatrix. I use MPI_Bcast to send the submatrices along the grid dimensions, but after that the program doesn't continue. What did I do wrong?
Here is the code.
#include<stdio.h>
#include<stdlib.h>
#include<mpi/mpi.h>
#define NUM_DIMS 2
#define N 81
#define A(i, j) A[N*(i)+(j)]
#define B(i, j) B[N*(i)+(j)]
#define C(i, j) C[N*(i)+(j)]
#define AA(i, j) AA[k*(i)+(j)]
#define BB(i, j) BB[k*(i)+(j)]
#define CC(i, j) CC[k*(i)+(j)]
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int threadCount;
int threadRank;
MPI_Comm_size(MPI_COMM_WORLD, &threadCount);
int dims[NUM_DIMS] = {0};
// Create the grid
int periods[2] = {0, 0};
MPI_Comm comm_2D;
MPI_Comm comm_1D[2];
MPI_Dims_create(threadCount, NUM_DIMS, dims);
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_2D);
MPI_Comm_rank(comm_2D, &threadRank);
int k = N/dims[1];
double *A = (double*)calloc(N*N, sizeof(double));
double *B = (double*)calloc(N*N, sizeof(double));
double *C = (double*)calloc(N*N, sizeof(double));
double startTime = MPI_Wtime();
int subdims[2];
subdims[0] = 0;
subdims[1] = 1;
MPI_Cart_sub(comm_2D, subdims, &comm_1D[0]);
subdims[0] = 1;
subdims[1] = 0;
MPI_Cart_sub(comm_2D, subdims, &comm_1D[1]);
MPI_Datatype column, matrix;
MPI_Type_vector(N, N / k, N, MPI_DOUBLE, &column);
MPI_Type_create_resized(column, 0, N / k * sizeof(double), &column);
MPI_Type_commit(&column);
double *AA, *BB, *CC;
AA = (double*)calloc(N * k, sizeof(double));
BB = (double*)calloc(N * k, sizeof(double));
CC = (double*)calloc(k * k , sizeof(double));
int threadCoords[2];
MPI_Comm_rank(comm_2D, &threadRank);
MPI_Cart_coords(comm_2D, threadRank, NUM_DIMS, threadCoords);
if (threadCoords[0] == 0) {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
A(i, j) = 1;
B(i, j) = 1;
}
}
}
if (threadCoords[1] == 0) {
MPI_Scatter(A, N * k, MPI_DOUBLE, AA, N * k, MPI_DOUBLE, 0, comm_1D[0]);
}
if (threadCoords[0] == 0) {
int offset[3] = {0, 1, 2};
int send[3] = {1, 1, 1};
MPI_Scatterv(B, send, offset, column, BB, N * k , MPI_DOUBLE, 0, comm_1D[1]);
}
int r = MPI_Bcast(AA, k*N, MPI_DOUBLE, 0, comm_1D[1]);
fprintf(stderr, "r = %d\n", r);
int p = MPI_Bcast(BB, k*N, MPI_DOUBLE, 0, comm_1D[0]);
fprintf(stderr, "p = %d\n", p);
/*...*/
}