1. Goal
I am implementing matrix multiplication with Fox's method for square m x n matrices, AB = C, with world_size = 4 processes arranged as a topological mesh/grid:
P0-P1
| |
P2-P3
where - represents the mesh_r (mesh_rows) communicator and | represents the mesh_c (mesh_columns) communicator, both built through the build_mesh procedure.
2. My code
int main(int argc, char *argv[])
{
int process_rank, world_size;
int mesh_rows, mesh_columns;
int mesh_dimension = 2;
int *process_coordinates;
MPI_Comm mesh, mesh_r, mesh_c;
int process_rank_mesh;
int *A, *A_loc;
int *B, *B_loc;
int *C, *C_loc;
int m, n, mloc, nloc;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (process_rank == 0) {
m = n = /*world_size * 1*/ 2; // multiple of world_size = 4
}
MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
A = fill_matrix(A, m, n);
B = fill_matrix(B, m, n);
C = (int*) calloc(m * n, sizeof(int));
if (process_rank == 0)
mesh_rows = 2;
if (is_divisible(world_size, mesh_rows))
mesh_columns = world_size / mesh_rows;
else {
mesh_rows = 1;
mesh_columns = world_size / mesh_rows;
}
MPI_Bcast(&mesh_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&mesh_columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
process_coordinates = (int*) calloc(mesh_dimension, sizeof(int));
build_mesh(&mesh, &mesh_r, &mesh_c, process_rank, world_size, mesh_rows, mesh_columns, process_coordinates);
MPI_Comm_rank(mesh, &process_rank_mesh);
mloc = m / mesh_rows;
nloc = m / mesh_columns;
handle_errors(m, n, world_size, process_rank);
A_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(A, A_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
B_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(B, B_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
C_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(C, C_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
int *A_loc_add = (int*) calloc(mloc * nloc, sizeof(int)); // additional block of A sent by the processes on the diagonal to the others on mesh_r
// START BMR
memcpy(A_loc_add, A_loc, sizeof(A_loc) * mloc);
MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, f(process_rank), mesh_r);
// Compute Cij = AB for each process
for (int i = 0; i < m; i++) {
if (process_rank == 0 || process_rank == 3)
C_loc[i] += A_loc[i] * B_loc[i];
else
C_loc[i] += A_loc_add[i] * B_loc[i];
}
for (int i = 1; i < m; i++) {
// Broadcast
memcpy(A_loc_add, A_loc, sizeof(A_loc) * mloc);
MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, !f(process_rank), mesh_r);
// Rolling
int *t_loc = (int*) calloc(mloc * nloc, sizeof(int)); // temporary variable needed for the swap
memcpy(t_loc, B_loc, sizeof(B_loc) * mloc);
MPI_Status status;
int mate; // P0 <-> P2 and P1 <-> P3 - that is, the B_loc swap in the mesh_c communicator
if (process_rank == 0) {
mate = 2;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
} else if (process_rank == 1) {
mate = 3;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
} else if (process_rank == 2) {
mate = 0;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
} else if (process_rank == 3) {
mate = 1;
MPI_Send(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD);
MPI_Recv(t_loc, mloc * nloc, MPI_INT, mate, 0, MPI_COMM_WORLD, &status);
}
memcpy(B_loc, t_loc, sizeof(t_loc) * mloc);
free(t_loc);
// multiply
dot_product(A_loc_add, C_loc, B_loc, m);
}
// END BMR
MPI_Finalize();
return 0;
}
void dot_product(int *A_loc_add, int *C_loc, int *B_loc, int m)
{
for (int i = 0; i < m; i++)
C_loc[i] += A_loc_add[i] * B_loc[i];
}
int f(int process_rank)
{
if (process_rank == 0 || process_rank == 1)
return 0;
else
return 1;
}
void distribute(int *Mat, int *Mat_loc, int m, int n, int mloc, int nloc, int world_size, int mesh_rows, int mesh_columns)
{
MPI_Datatype square_block;
int stride = n;
int count = mloc;
int block_length = nloc;
MPI_Type_vector(count, block_length, stride, MPI_INT, &square_block);
MPI_Datatype square_block_resized;
MPI_Type_create_resized(square_block, 0, sizeof(int), &square_block_resized);
MPI_Type_commit(&square_block_resized);
int *send_counts = (int*) calloc(world_size, sizeof(int));
int *displs = (int*) calloc(world_size, sizeof(int));
for (int i = 0; i < mesh_rows; i++) {
for (int j = 0; j < mesh_columns; j++) {
send_counts[i * mesh_columns + j] = 1;
displs[i * mesh_columns + j] = i * n * block_length + j * block_length;
}
}
MPI_Scatterv(Mat, send_counts, displs, square_block_resized, Mat_loc, mloc * nloc, MPI_INT, 0, MPI_COMM_WORLD);
}
void handle_errors(int m, int n, int world_size, int process_rank)
{
if (process_rank == 0) {
if (m != n) {
perror("Not square matrices\n");
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}
if (world_size != 4) {
perror("World size must be 4\n");
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
}
}
}
bool is_divisible(int dividend, int divisor)
{
return dividend % divisor == 0;
}
void build_mesh(MPI_Comm *mesh, MPI_Comm *mesh_r, MPI_Comm *mesh_c, int process_rank, int world_size,
int mesh_rows, int mesh_columns, int *process_coordinates)
{
int mesh_dimension = 2;
int *mesh_n_dimension;
int mesh_reorder = 0;
int *mesh_period;
int *remain_dims = (int*) calloc(mesh_dimension, sizeof(int));
mesh_n_dimension = (int*) calloc(mesh_dimension, sizeof(int));
mesh_n_dimension[0] = mesh_rows;
mesh_n_dimension[1] = mesh_columns;
mesh_period = (int*) calloc(mesh_dimension, sizeof(int));
mesh_period[0] = mesh_period[1] = 0;
MPI_Cart_create(MPI_COMM_WORLD, mesh_dimension, mesh_n_dimension, mesh_period, mesh_reorder, mesh);
MPI_Cart_coords(*mesh, process_rank, mesh_dimension, process_coordinates);
remain_dims[0] = 0;
remain_dims[1] = 1;
MPI_Cart_sub(*mesh, remain_dims, mesh_r);
remain_dims[0] = 1;
remain_dims[1] = 0;
MPI_Cart_sub(*mesh, remain_dims, mesh_c);
}
int *fill_matrix(int *Mat, int m, int n)
{
int k = 0;
Mat = (int*) calloc(m * n, sizeof(int));
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
Mat[i * n + j] = ++k;
return Mat;
}
3. Result
My algorithm is working fine if m = 2:
C00:
7
C01:
10
C10:
15
C11:
22
but it computes C incorrectly for other values of m (e.g. m = world_size * 1 = 4):
C00:
58 92
242 308
C01:
78 120
294 368
C10:
198 260
494 588
C11:
274 344
602 704
whose expected result C (for A = B filled with 1..16 by fill_matrix) is:
90 100 110 120
202 228 254 280
314 356 398 440
426 484 542 600
Can you help me? You can use an online matrix multiplication tool to check the expected result while debugging. I am 100% sure that the build_mesh and distribute procedures are correct: the bug must be in the BMR section.
Related
1. Goal
I have to distribute an array, called A_loc, over a custom communicator (that is, not MPI_COMM_WORLD). Let's suppose we want to distribute an array over the mesh_r communicator:
P0-P1
| |
P2-P3
where - represents the mesh_r (mesh_rows) communicator and | represents the mesh_c (mesh_columns) communicator, both built through the build_mesh procedure.
2. Code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <mpi.h>
bool is_divisible(int, int);
void build_mesh(MPI_Comm*, MPI_Comm*, MPI_Comm*, int, int, int, int, int*);
int *fill_matrix(int*, int, int);
void print_matrix(int*, int, int, int, int);
void handle_errors(int, int, int, int);
void distribute(int*, int*, int, int, int, int, int, int, int);
void debug(int*, int*, int, int, int, int, int, int, int);
int main(int argc, char *argv[])
{
int process_rank, world_size;
int mesh_rows, mesh_columns;
int mesh_dimension = 2;
int *process_coordinates;
MPI_Comm mesh, mesh_r, mesh_c;
int process_rank_mesh;
int *A, *A_loc;
int *B, *B_loc;
int m, n, mloc, nloc;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &process_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
if (process_rank == 0) {
m = n = world_size * 1; // multiple of world_size = 4
}
MPI_Bcast(&m, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
A = fill_matrix(A, m, n);
B = fill_matrix(B, m, n);
if (process_rank == 0)
mesh_rows = 2;
if (is_divisible(world_size, mesh_rows))
mesh_columns = world_size / mesh_rows;
else {
mesh_rows = 1;
mesh_columns = world_size / mesh_rows;
}
MPI_Bcast(&mesh_rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&mesh_columns, 1, MPI_INT, 0, MPI_COMM_WORLD);
process_coordinates = (int*) calloc(mesh_dimension, sizeof(int));
build_mesh(&mesh, &mesh_r, &mesh_c, process_rank, world_size, mesh_rows, mesh_columns, process_coordinates);
MPI_Comm_rank(mesh, &process_rank_mesh);
mloc = m / mesh_rows;
nloc = m / mesh_columns;
handle_errors(m, n, world_size, process_rank);
A_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(A, A_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
B_loc = (int*) calloc(mloc * nloc, sizeof(int));
distribute(B, B_loc, m, n, mloc, nloc, world_size, mesh_rows, mesh_columns);
// I want to re-write this part so I can exploit mesh_r communicator instead of MPI_COMM_WORLD...
int *A_loc_add = (int*) calloc(mloc * nloc, sizeof(int));
if (process_rank == 0) {
MPI_Send(A_loc, mloc * nloc, MPI_INT, 1, 10, MPI_COMM_WORLD);
} else if (process_rank == 3) {
MPI_Send(A_loc, mloc * nloc, MPI_INT, 2, 20, MPI_COMM_WORLD);
}
MPI_Status status;
if (process_rank == 1) {
MPI_Recv(A_loc_add, mloc * nloc, MPI_INT, 0, 10, MPI_COMM_WORLD, &status);
} else if (process_rank == 2) {
MPI_Recv(A_loc_add, mloc * nloc, MPI_INT, 3, 20, MPI_COMM_WORLD, &status);
}
MPI_Finalize();
return 0;
}
void distribute(int *Mat, int *Mat_loc, int m, int n, int mloc, int nloc, int world_size, int mesh_rows, int mesh_columns)
{
MPI_Datatype square_block;
int stride = n;
int count = mloc;
int block_length = nloc;
MPI_Type_vector(count, block_length, stride, MPI_INT, &square_block);
MPI_Datatype square_block_resized;
MPI_Type_create_resized(square_block, 0, sizeof(int), &square_block_resized);
MPI_Type_commit(&square_block_resized);
int *send_counts = (int*) calloc(world_size, sizeof(int));
int *displs = (int*) calloc(world_size, sizeof(int));
for (int i = 0; i < mesh_rows; i++) {
for (int j = 0; j < mesh_columns; j++) {
send_counts[i * mesh_columns + j] = 1;
displs[i * mesh_columns + j] = i * n * block_length + j * block_length;
}
}
MPI_Scatterv(Mat, send_counts, displs, square_block_resized, Mat_loc, mloc * nloc, MPI_INT, 0, MPI_COMM_WORLD);
}
bool is_divisible(int dividend, int divisor)
{
return dividend % divisor == 0;
}
void build_mesh(MPI_Comm *mesh, MPI_Comm *mesh_r, MPI_Comm *mesh_c, int process_rank, int world_size,
int mesh_rows, int mesh_columns, int *process_coordinates)
{
int mesh_dimension = 2;
int *mesh_n_dimension;
int mesh_reorder = 0;
int *mesh_period;
int *remain_dims = (int*) calloc(mesh_dimension, sizeof(int));
mesh_n_dimension = (int*) calloc(mesh_dimension, sizeof(int));
mesh_n_dimension[0] = mesh_rows;
mesh_n_dimension[1] = mesh_columns;
mesh_period = (int*) calloc(mesh_dimension, sizeof(int));
mesh_period[0] = mesh_period[1] = 0;
MPI_Cart_create(MPI_COMM_WORLD, mesh_dimension, mesh_n_dimension, mesh_period, mesh_reorder, mesh);
MPI_Cart_coords(*mesh, process_rank, mesh_dimension, process_coordinates);
remain_dims[0] = 0;
remain_dims[1] = 1;
MPI_Cart_sub(*mesh, remain_dims, mesh_r);
remain_dims[0] = 1;
remain_dims[1] = 0;
MPI_Cart_sub(*mesh, remain_dims, mesh_c);
}
int *fill_matrix(int *Mat, int m, int n)
{
int k = 0;
Mat = (int*) calloc(m * n, sizeof(int));
for (int i = 0; i < m; i++)
for (int j = 0; j < n; j++)
Mat[i * n + j] = ++k;
return Mat;
}
As you can see this works fine, but I would like to rewrite the commented part so I can exploit the mesh_r communicator and distribute A_loc over every process on mesh_r, instead of the hard-coded sends with dest = 1 and dest = 2 over MPI_COMM_WORLD.
Any help?
Instead of sends and receives, you should use a Bcast, as you did in an earlier version of your code. Your problem is that you're not thinking in a distributed manner but trying to keep a global view. By that I mean: after you create the sub-communicator mesh_r, every process seems to be in that communicator, but here is the catch: there are multiple mesh_r communicators, and each process is part of exactly one of them. Each MPI process sees exactly the one mesh_r communicator that it is part of. Thus a single code line MPI_Bcast( ...buffer stuff..., mesh_r ) does multiple broadcasts, one in each grid row.
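For example, a minimal sketch of what that could look like in your main (assuming, as in your build_mesh with reorder = 0, that a process's rank inside its mesh_r equals its column coordinate, so the diagonal block of row i sits at rank i = process_coordinates[0] of that row's communicator):

int row = process_coordinates[0];        /* this process's row index */
int rank_in_row;
MPI_Comm_rank(mesh_r, &rank_in_row);     /* equals the column coordinate under the assumption above */
if (rank_in_row == row)                  /* I own the diagonal block of this row */
    memcpy(A_loc_add, A_loc, mloc * nloc * sizeof(int));
MPI_Bcast(A_loc_add, mloc * nloc, MPI_INT, row, mesh_r);

Every process in a given row passes the same root (row), so each of the mesh_r communicators performs its own broadcast of that row's diagonal block.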
I want to implement Cannon's algorithm with MPI in C, using Cartesian communicators that are shifted with the standard functions, and sending 2-dimensional blocks of the two matrices.
I have tried to follow a couple of tutorials found online, but I realized none of them were implemented the way I wanted: using both 2-dimensional blocks and Cartesian communicators.
EDIT: I have managed to get past the error after realizing that I was using the proc_grid_size variable in the wrong way, confusing the size of the process grid with the block size and reading into unallocated memory.
I am running with 25 processes and two 10x10 matrices stored in two different files.
I am currently trying to implement the shift operations using the MPI_Cart_shift function. But I don't know how to send the block over to the neighbors.
This is my current implementation of this specific part, which is not working (the application just hangs):
MPI_Scatterv(globalAptr, sendcounts, displs, subarrtype, &(a[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
MPI_Scatterv(globalBptr, sendcounts, displs, subarrtype, &(b[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
int nlocal;
int npes, dims[2], periods[2];
int myrank, my2drank, mycoords[2];
int uprank, downrank, leftrank, rightrank, coords[2];
int shiftsource, shiftdest;
MPI_Status status;
MPI_Comm comm_2d;
// Get the communicator related information
MPI_Comm_size(MPI_COMM_WORLD, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// Set up the Cartesian topology
dims[0] = dims[1] = proc_matrix_size;//sqrt(npes);
// Set the periods for wraparound connections
periods[0] = periods[1] = 1;
// Create the Cartesian topology, with rank reordering
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);
// Get the rank and coordinates with respect to the new topology
MPI_Comm_rank(comm_2d, &my2drank);
MPI_Cart_coords(comm_2d, my2drank, 2, mycoords);
// Compute ranks of the up and left shifts
// Get line neighbors (direction = 1, displacement = 1)
MPI_Cart_shift(comm_2d, 1, 1, &leftrank, &rightrank);
// Get column neighbors (direction = 0, displacement = 1)
MPI_Cart_shift(comm_2d, 0, 1, &uprank, &downrank);
// Determine the dimension of the local matrix block
nlocal = block_size;// n / dims[0];
MPI_Cart_shift(comm_2d, 1, -mycoords[1], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(&(a[0][0]), 1, subarrtype,
shiftdest, 1, shiftsource, 1, comm_2d, &status);
MPI_Cart_shift(comm_2d, 0, -mycoords[0], &shiftsource, &shiftdest);
MPI_Sendrecv_replace(&(b[0][0]), 1, subarrtype,
shiftdest, 1, shiftsource, 1, comm_2d, &status);
After closing the application, I discover that the root process is the only one that hangs:
F:\Facultate\AN_4\PDC\Labs\MPI\Cannon\x64\Release>mpiexec -np 25 Cannon.exe a.txt b.txt
mpiexec aborting job...
job aborted:
[ranks] message
[0] job terminated by the user
[1-24] terminated
---- error analysis -----
[0] on DESKTOP-JB1815M
ctrl-c was hit. job aborted by the user.
---- error analysis -----
INITIAL SOLVED CODE:
int malloc2D(int ***array, int n, int m) {
int i;
/* allocate the n*m contiguous items */
int *p = (int*) calloc(n*m, sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int**) calloc(n, sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int free2D(int ***array) {
/* free the memory - the first element of the array is at the start */
free(&((*array)[0][0]));
/* free the pointers into the memory */
free(*array);
return 0;
}
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
if (argc != 3) {
fprintf(stderr, "Not enough arguments passed! Make sure you pass 2 filenames.\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// Find out rank, size
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Declare file pointers
FILE* fa = NULL;
FILE* fb = NULL;
// Declare matrix pointers
int **A = NULL;
int **B = NULL;
int **C = NULL;
// Declare matrix dimensions
int ma = 0, na = 0;
int mb = 0, nb = 0;
// Nr of processes on each line/column in process mesh
int proc_matrix_size = (int)sqrt(world_size);
// Single value for quadratic matrix size
int n = 0;
// Nr of elements on each line/column in local matrix
// of each process
int block_size = 0;
// Open files and read matrices
if (world_rank == 0)
{
fa = fopen(argv[1], "r");
fb = fopen(argv[2], "r");
// Read matrix dimensions
fscanf(fa, "%d %d\n", &ma, &na);
fscanf(fb, "%d %d\n", &mb, &nb);
// Check if matrices are quadratic
if ((ma != na) || (na != mb) || (mb != nb))
{
printf("Invalid matrices dimensions\n");
return 0;
}
n = na;
// Check if sqrt(nr_processes) divides matrix dimension
if ((n % proc_matrix_size != 0) || (world_size % proc_matrix_size != 0))
{
printf("Number of processes does not fit matrix size\n");
return 0;
}
block_size = n / proc_matrix_size;
malloc2D(&A, n, n);
malloc2D(&B, n, n);
malloc2D(&C, n, n);
// Read matrices A & B from file
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(fa, "%d ", &A[i][j]);
fscanf(fb, "%d ", &B[i][j]);
}
fscanf(fa, "\n");
}
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
/*
Divide matrices in blocks and send each block to the corresponding process
*/
// Declare global pointers to matrices
int *globalAptr = NULL;
int *globalBptr = NULL;
int *globalCptr = NULL;
// Declare global return pointers
int *globalA2ptr = NULL;
int *globalB2ptr = NULL;
int **A2 = NULL;
int **B2 = NULL;
// Declare local matrix pointers
int **a = NULL;
int **b = NULL;
int **c = NULL;
malloc2D(&A2, n, n);
malloc2D(&B2, n, n);
if (world_rank == 0)
{
globalAptr = &(A[0][0]);
globalBptr = &(B[0][0]);
globalA2ptr = &(A2[0][0]);
globalB2ptr = &(B2[0][0]);
globalCptr = &(C[0][0]);
}
malloc2D(&a, block_size, block_size);
malloc2D(&b, block_size, block_size);
malloc2D(&c, block_size, block_size);
// Sizes of input global matrix
int sizes[2] = { n, n };
// Sizes of each block
int subsizes[2] = { block_size, block_size };
// Beginning of current block
int starts[2] = { 0,0 };
// Declare subarray type
MPI_Datatype type, subarrtype;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, block_size * sizeof(int), &subarrtype);
MPI_Type_commit(&subarrtype);
// Scatter the A and B to all processes
int* sendcounts = (int*)malloc(proc_matrix_size * proc_matrix_size * sizeof(int));
int* displs = (int*)malloc(proc_matrix_size * proc_matrix_size * sizeof(int));
if (world_rank == 0)
{
for (int i = 0; i < proc_matrix_size * proc_matrix_size; i++)
sendcounts[i] = 1;
int disp = 0;
for (int i = 0; i < proc_matrix_size; i++) {
for (int j = 0; j < proc_matrix_size; j++) {
displs[i * proc_matrix_size + j] = disp;
disp += 1;
}
disp += ((n / proc_matrix_size)-1) * proc_matrix_size;
}
}
MPI_Scatterv(globalAptr, sendcounts, displs, subarrtype, &(a[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
MPI_Scatterv(globalBptr, sendcounts, displs, subarrtype, &(b[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
for (int i = 0; i < block_size; i++) {
for (int j = 0; j < block_size; j++) {
a[i][j] = 10 + a[i][j];
b[i][j] = 10 + b[i][j];
}
}
// It all goes back to process 0
MPI_Gatherv(&(a[0][0]), block_size * block_size, MPI_INT,
globalA2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Gatherv(&(b[0][0]), block_size * block_size, MPI_INT,
globalB2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
OLD:
I would like to mention that, at the moment, I am trying to send blocks over the default communicator, planning to implement the shift operations and the Cartesian communicator after I manage to send the matrix blocks.
The help I need is with regard to the MPI_Scatterv call, which throws the following error:
job aborted: [ranks] message
[0] fatal error Fatal error in MPI_Scatterv: Invalid count, error
stack: MPI_Scatterv(sbuf=0x0000029262048D40, scnts=0x00000292620482B0,
displs=0x0000029262048250, dtype=USER,
rbuf=0x000002926203ED30, rcount=25, MPI_INT, root=0, MPI_COMM_WORLD)
failed Negative count, value is -1912594387
[1-7] terminated
This is the code I have written so far:
#include "stdafx.h"
#include "mpi.h"
#include "stdio.h"
#include "stdlib.h"
#include <assert.h>
#include <cstdlib>
#include <math.h>
int malloc2D(int ***array, int n, int m) {
int i;
/* allocate the n*m contiguous items */
int *p = (int*) malloc(n*m * sizeof(int));
if (!p) return -1;
/* allocate the row pointers into the memory */
(*array) = (int**) malloc(n * sizeof(int*));
if (!(*array)) {
free(p);
return -1;
}
/* set up the pointers into the contiguous memory */
for (i = 0; i<n; i++)
(*array)[i] = &(p[i*m]);
return 0;
}
int free2D(int ***array) {
/* free the memory - the first element of the array is at the start */
free(&((*array)[0][0]));
/* free the pointers into the memory */
free(*array);
return 0;
}
int main(int argc, char* argv[])
{
MPI_Init(&argc, &argv);
if (argc != 3) {
fprintf(stderr, "Not enough arguments passed! Make sure you pass 2 filenames.\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// Find out rank, size
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Declare file pointers
FILE* fa = NULL;
FILE* fb = NULL;
// Declare matrix pointers
int **A = NULL;
int **B = NULL;
int **C = NULL;
// Declare matrix dimensions
int ma = 0, na = 0;
int mb = 0, nb = 0;
// Nr of processes on each line/column in process mesh
int proc_grid_size = (int)sqrt(world_size);
// Single value for quadratic matrix size
int n = 0;
// Nr of elements on each line/column in local matrix
// of each process
int block_size = 0;
// Open files and read matrices
if (world_rank == 0)
{
fa = fopen(argv[1], "r");
fb = fopen(argv[2], "r");
// Read matrix dimensions
fscanf(fa, "%d %d\n", &ma, &na);
fscanf(fb, "%d %d\n", &mb, &nb);
// Check if matrices are quadratic
if ((ma != na) || (na != mb) || (mb != nb))
{
printf("Invalid matrices dimensions\n");
return 0;
}
n = na;
// Check if sqrt(nr_processes) divides matrix dimension
if ((n % proc_grid_size != 0) || (world_size % proc_grid_size != 0))
{
printf("Number of processes does not fit matrix size\n");
return 0;
}
block_size = n / proc_grid_size;
// Initialize matrices
A = (int**)calloc(n, sizeof(int*));
B = (int**)calloc(n, sizeof(int*));
//C = (int**)calloc(n, sizeof(int*));
for (int i = 0; i < n; i++)
{
A[i] = (int*)calloc(n, sizeof(int));
B[i] = (int*)calloc(n, sizeof(int));
//C[i] = (int*)calloc(n, sizeof(int));
}
// Read matrix A from file
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(fa, "%d ", &A[i][j]);
printf("%d ", A[i][j]);
}
fscanf(fa, "\n");
printf("\n");
}
// Read matrix B from file
for (int i = 0; i < n; i++)
{
for (int j = 0; j < n; j++)
{
fscanf(fb, "%d ", &B[i][j]);
printf("%d ", B[i][j]);
}
fscanf(fb, "\n");
printf("\n");
}
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else {
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&block_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
/*
Divide matrices in blocks and send each block to the corresponding process
*/
// Sizes of input global matrix
int sizes[2] = { n, n };
// Sizes of each block
int subsizes[2] = { block_size, block_size };
// Beginning of current block
int starts[2] = { 0,0 };
// Declare subarray type
MPI_Datatype type, subarrtype;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &type);
MPI_Type_create_resized(type, 0, block_size * sizeof(int), &subarrtype);
MPI_Type_commit(&subarrtype);
// Declare global pointers to matrices
int *globalAptr = NULL;
int *globalBptr = NULL;
int **A2 = NULL;
int **B2 = NULL;
malloc2D(&A2, n, n);
malloc2D(&B2, n, n);
// Declare global return pointers
int *globalA2ptr = NULL;
int *globalB2ptr = NULL;
if (world_rank == 0)
{
globalAptr = &(A[0][0]);
globalBptr = &(B[0][0]);
globalA2ptr = &(A2[0][0]);
globalB2ptr = &(B2[0][0]);
}
// Declare local matrix pointers
int **a = NULL;
int **b = NULL;
malloc2D(&a, block_size, block_size);
malloc2D(&b, block_size, block_size);
// Scatter the A and B to all processes
int* sendcounts = (int*)malloc(proc_grid_size * proc_grid_size * sizeof(int));
int* displs = (int*)malloc(proc_grid_size * proc_grid_size * sizeof(int));
if (world_rank == 0)
{
for (int i = 0; i < proc_grid_size * proc_grid_size; i++)
sendcounts[i] = 1;
int disp = 0;
for (int i = 0; i < proc_grid_size; i++) {
for (int j = 0; j < proc_grid_size; j++) {
displs[i * proc_grid_size + j] = disp;
disp += 1;
}
disp += ((block_size) - 1) * proc_grid_size;
}
for (int i = 0; i < proc_grid_size * proc_grid_size; i++)
{
printf("Send cound: %d\n", sendcounts[i]);
}
}
MPI_Scatterv(globalAptr, sendcounts, displs, subarrtype, &(a[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
MPI_Scatterv(globalBptr, sendcounts, displs, subarrtype, &(b[0][0]),
block_size * block_size, MPI_INT,
0, MPI_COMM_WORLD);
// Now each processor has its local array, and can process it
for (int i = 0; i < block_size; i++) {
for (int j = 0; j < block_size; j++) {
a[i][j] = 10 + a[i][j];
b[i][j] = 10 + b[i][j];
}
}
// It all goes back to process 0
MPI_Gatherv(&(a[0][0]), block_size * block_size, MPI_INT,
globalA2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Gatherv(&(b[0][0]), block_size * block_size, MPI_INT,
globalB2ptr, sendcounts, displs, subarrtype,
0, MPI_COMM_WORLD);
MPI_Finalize();
return 0;
}
Thank you very much!
I am new to MPI. I need to write a program for matrix multiplication on a 2D topology (grid). The first matrix (A) is distributed along the x coordinate, and the second matrix (B) is distributed along the y coordinate. Every process computes one submatrix. I use MPI_Bcast to send the submatrices along the dimensions, but after that the program doesn't continue. What did I do wrong?
Here is the code.
#include<stdio.h>
#include<stdlib.h>
#include<mpi/mpi.h>
#define NUM_DIMS 2
#define N 81
#define A(i, j) A[N*(i)+(j)]
#define B(i, j) B[N*(i)+(j)]
#define C(i, j) C[N*(i)+(j)]
#define AA(i, j) AA[k *(i)+(j)] //
#define BB(i, j) BB[k*(i)+(j)]
#define CC(i, j) CC[k*(i)+(j)]
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
int threadCount;
int threadRank;
MPI_Comm_size(MPI_COMM_WORLD, &threadCount);
int dims[NUM_DIMS] = {0};
// Create the grid
int periods[2] = {0, 0};
MPI_Comm comm_2D;
MPI_Comm comm_1D[2];
MPI_Dims_create(threadCount, NUM_DIMS, dims);
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_2D);
MPI_Comm_rank(comm_2D, &threadRank);
int k = N/dims[1];
double *A = (double*)calloc(N*N, sizeof(double));
double *B = (double*)calloc(N*N, sizeof(double));
double *C = (double*)calloc(N*N, sizeof(double));
double startTime = MPI_Wtime();
int subdims[2];
subdims[0] = 0;
subdims[1] = 1;
MPI_Cart_sub(comm_2D, subdims, &comm_1D[0]);
subdims[0] = 1;
subdims[1] = 0;
MPI_Cart_sub(comm_2D, subdims, &comm_1D[1]);
MPI_Datatype column, matrix;
MPI_Type_vector(N, N / k, N, MPI_DOUBLE, &column);
MPI_Type_create_resized(column, 0, N / k * sizeof(double), &column);
MPI_Type_commit(&column);
double *AA, *BB, *CC;
AA = (double*)calloc(N * k, sizeof(double));
BB = (double*)calloc(N * k, sizeof(double));
CC = (double*)calloc(k * k , sizeof(double));
int threadCoords[2];
MPI_Comm_rank(comm_2D, &threadRank);
MPI_Cart_coords(comm_2D, threadRank, NUM_DIMS, threadCoords);
if (threadCoords[0] == 0) {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
A(i, j) = 1;
B(i, j) = 1;
}
}
}
if (threadCoords[1] == 0) {
MPI_Scatter(A, N * k, MPI_DOUBLE, AA, N * k, MPI_DOUBLE, 0, comm_1D[0]);
}
if (threadCoords[0] == 0) {
int offset[3] = {0, 1, 2};
int send[3] = {1, 1, 1};
MPI_Scatterv(B, send, offset, column, BB, N * k , MPI_DOUBLE, 0, comm_1D[1]);
}
int r = MPI_Bcast(AA, k*N, MPI_DOUBLE, 0, comm_1D[1]);
fprintf(stderr, "r = %d\n", r);
int p = MPI_Bcast(BB, k*N, MPI_DOUBLE, 0, comm_1D[0]);
fprintf(stderr, "p = %d\n", p);
/*...*/
}
I am trying to sort different arrays with MPI. Every array is allocated locally.
For example we have {1-7-4-12} {3-7-5-9} {12-15-2-16} {10-8-11-13}
and we want {1-2-3-4}{5-6-7-8}{9-10-11-12}{13-14-15-16}
So I use an odd-even strategy. With 2 processes it works in every case, but when I try with more processes I get values that were not in the input. For my example I can get {23-2-3-4}. I think my problem comes from memory allocation, but I can't find where or what I am doing wrong...
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MASTER 0
#define MIN(a,b) ((a)<(b)?(a):(b))
#define BLOCK_LOW(id,p,n) ((id)*(n)/(p))
#define BLOCK_HIGH(id,p,n) \
(BLOCK_LOW((id)+1,p,n)-1)
#define BLOCK_SIZE(id,p,n) \
(BLOCK_LOW((id)+1, p, n)-BLOCK_LOW(id, p , n))
#define BLOCK_OWNER(index,p,n) \
(((p)*(index+1)-1)/(n))
int nbProcess, id, n; // n = number of values
void printTabByProcess(int *T){
int i = 0;
int size = BLOCK_SIZE(id, nbProcess, n);
printf("Tab n°%d [ ", id, size);
for(i; i < size; i++){
printf(" %d ", T[i]);
}
printf(" ]\n");
}
void fusion(int *t,int deb1,int fin1,int fin2){
int *table1;
int deb2=fin1+1;
int compt1=deb1;
int compt2=deb2;
int i;
table1=(int*)malloc((fin1-deb1+1)*sizeof(int));
for(i=deb1;i<=fin1;i++) {
table1[i-deb1]=t[i];
}
for(i=deb1;i<=fin2;i++){
if(compt1==deb2)
break;
else if(compt2==(fin2+1)){
t[i]=table1[compt1-deb1];
compt1++;
}
else if(table1[compt1-deb1]<t[compt2]){
t[i]=table1[compt1-deb1];
compt1++;
}
else{
t[i]=t[compt2];
compt2++;
}
}
free(table1);
}
void tri_fusion(int*t,int deb,int fin){
if(deb!=fin){
int milieu=(fin+deb)/2;
tri_fusion(t,deb,milieu);
tri_fusion(t,milieu+1,fin);
fusion(t,deb,milieu,fin);
}
}
int* fusion2(int* t1, int* t2, int size1, int size2){
int* buffer = malloc(sizeof(int)*(size1 + size2));
int index1 = 0;
int index2 = 0;
int i = 0;
for(i; i < (size1 + size2) - 1; i++){
if(t1[index1] < t2[index2]){
buffer[i] = t1[index1];
index1++;
}else{
buffer[i] = t2[index2];
index2++;
}
}
if(index1 == size1 - 1 ){
buffer[size1 + size2 - 1] = t1[index1];
}else{
buffer[size1 + size2 - 1] = t2[index2];
}
return buffer;
}
/*
*
* OUR FUNCTION TO PARALLEL SORT
*
*/
void TD_trier(int* T){
MPI_Status status;
int size = BLOCK_SIZE(id, nbProcess, n);
int receive_size = 0;
int* receive;
int* array_tmp;
int i = 0;
tri_fusion(T, 0, size - 1);
MPI_Barrier(MPI_COMM_WORLD);
for(i; i < nbProcess; i++){
if(i%2==0){
if(id % 2 == 1){//send to left
MPI_Send(&size, 1, MPI_INT, id - 1, 1, MPI_COMM_WORLD);
MPI_Send(T, size, MPI_INT, id - 1, 1, MPI_COMM_WORLD);
MPI_Recv(T, size, MPI_INT, id - 1, 1, MPI_COMM_WORLD, &status);
}else {
MPI_Recv(&receive_size, 1, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
receive = malloc(sizeof(int) * size);
MPI_Recv(receive, receive_size, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
array_tmp = fusion2(T, receive, size, receive_size);
MPI_Send(&array_tmp[size], receive_size, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
T = realloc(array_tmp, sizeof(int) * size);
}
if(id == 1){
//~ printTabByProcess(T);
}
}else if(i%2 == 1 && id < nbProcess-1){ //send to right
if(id % 2 == 1){
MPI_Send(&size, 1, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
MPI_Send(T, size, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
//printTabByProcess(T);
MPI_Recv(T, size, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
}else if(id != 0 && id%2 ==0) {
MPI_Recv(&receive_size, 1, MPI_INT, id - 1, 1, MPI_COMM_WORLD, &status);
//receive = malloc(sizeof(int) * size);
MPI_Recv(receive, receive_size, MPI_INT, id - 1, 1, MPI_COMM_WORLD, &status);
//printTabByProcess(receive);
array_tmp = fusion2(T, receive, size, receive_size);
MPI_Send(array_tmp, receive_size, MPI_INT, id - 1, 1, MPI_COMM_WORLD);
printTabByProcess(&array_tmp[2]);
T = array_tmp + size;
printTabByProcess(T);
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
//printTabByProcess(T);
}
int generateRandomValue(){
return rand() % 100;
}
//init array with "random" value
int* TD_init(int n){
int i = 0;
int indiceDerniere = (id+1)*n/nbProcess -1;
int indicePremiere = id*n/nbProcess;
int* arrayLocal;
int localSize = indiceDerniere - indicePremiere +1;
arrayLocal = malloc(sizeof(int)*localSize);
//~ printf("id : %d - nbCase : %d (debut : %d, fin : %d)\n",
//~ id, localSize, indicePremiere, indiceDerniere);
for(i; i < localSize; i++){
arrayLocal[i] = generateRandomValue() - id;
}
printTabByProcess(arrayLocal);
return arrayLocal;
}
int main (int argc, char *argv[]){
//int n = 0;
int *dataLocal;
int dest;
int x;
int success;
MPI_Status status;
srand(time(NULL));
/***** Initializations *****/
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nbProcess); //numtask contient le nombre de processeur
MPI_Comm_rank(MPI_COMM_WORLD, &id); //taskid, determine le numero du processus
//~ printf ("MPI task %d has started...\n", id);
//~ tag2 = 1;
//~ tag1 = 2;
MPI_Barrier (MPI_COMM_WORLD);
/***** Master task only ******/
if (id == MASTER){
printf("Chose a number of value :");
scanf("%d",&n);
/* Send the number of cases */
for (dest=1; dest<nbProcess; dest++) {
MPI_Send(&n, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); //send number of value
}
} /* end of master section */
/***** Non-master tasks only *****/
if (id > MASTER) {
/* Receive the number of cases */
MPI_Recv(&n, 1, MPI_INT, MASTER, 1, MPI_COMM_WORLD, &status);
}
MPI_Barrier (MPI_COMM_WORLD);
dataLocal = TD_init(n);
MPI_Barrier (MPI_COMM_WORLD);
if(id == 0){
printf("__________________________________________\n");
}
TD_trier(dataLocal);
MPI_Finalize();
}
Troubles may come from the fusion2 function: index1 can become greater than size1. In fact, the MPI part works correctly; the code works once these bounds tests are added. Here is a version that is not optimal, but correct:
int* fusion2(int* t1, int* t2, int size1, int size2){
int* buffer = malloc(sizeof(int)*(size1 + size2));
int index1 = 0;
int index2 = 0;
int i = 0;
for(i; i < (size1 + size2) ; i++){
if(index1==size1){
buffer[i] = t2[index2];
index2++;
}else{
if(index2==size2){
buffer[i] = t1[index1];
index1++;
}else{
if(t1[index1] < t2[index2]){
buffer[i] = t1[index1];
index1++;
}else{
buffer[i] = t2[index2];
index2++;
}
}
}
}
return buffer;
}
Watch out for memory management.
For example: did you free T before doing the following?
T = realloc(array_tmp, sizeof(int) * size);
Did you free receive? Did you free array_tmp in the second branch?
I fear memory leaks exist... It might be better to avoid allocation in fusion2, and even in the loops. Allocating array_tmp and receive once at the start, with enough space, might be safer (and faster?).
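For instance, a minimal sketch (reusing the variable names from the even-phase branch of TD_trier; not a full fix of the algorithm) that frees both buffers each round and copies the merged lower half back into T rather than reallocating it:

MPI_Recv(&receive_size, 1, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
receive = malloc(sizeof(int) * receive_size); /* size of the incoming block, not `size` */
MPI_Recv(receive, receive_size, MPI_INT, id + 1, 1, MPI_COMM_WORLD, &status);
array_tmp = fusion2(T, receive, size, receive_size);
MPI_Send(&array_tmp[size], receive_size, MPI_INT, id + 1, 1, MPI_COMM_WORLD);
for (int k = 0; k < size; k++)
    T[k] = array_tmp[k]; /* keep the lower half; T keeps its original storage */
free(receive);
free(array_tmp);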
Bye,
Francis
More: qsort (in stdlib) may be faster for the local sorting.
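For example (compare_ints is a helper name I am introducing, not something from your code), the local sort at the top of TD_trier could use qsort with an integer comparator:

/* comparator for qsort on ints: returns -1, 0 or 1 */
int compare_ints(const void *a, const void *b)
{
    int x = *(const int *)a;
    int y = *(const int *)b;
    return (x > y) - (x < y);
}

and then, instead of tri_fusion(T, 0, size - 1):

qsort(T, size, sizeof(int), compare_ints);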
/*** Every function works correctly, but after only the first iteration it gives a collective abort. Can anyone tell me what is, or could be, the reason? ***/
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"
const double tolerance = 0.00001;
const int maxit = 10000;
void MPE_decomp1d(int n, int size, int id, int *s, int *e)
{
/*****calculating start and end row for every process*****/
*s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
*e = *s + (n/size)+((n%size)>id);
}
void onedinit(double **a, double **b, double **f, const int nx, const int s, const int e)
{
int i, j;
int ls, le;
ls = s - (s!=0);
le = e + (e!=nx);
/*** setting all the initial values to zero ***/
for (i = ls; i < le; i++)
{
for (j = 0; j < nx; j++)
{
a[i][j] = b[i][j] = f[i][j] = 0.0;
}
}
//***************************Defining Boundary Condition***********************************//
/***setting left boundary to 1***/
for (i = ls; i < le; i++) a[i][0] = b[i][0] = 1;
/*** setting the top boundary a(0, i) = b(0, i) = 2 ***/
if (s==0) for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int s, const int e, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
int rank, coord;
MPI_Status status;
MPI_Comm_rank(comm1d, &rank);
MPI_Cart_coords(comm1d, rank, 1, &coord);
/* if the process id is odd then send first, and if even then receive first, to avoid deadlock */
if (coord&1)
{
if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}
}
void sweep1d(double **a, double **f, int nx, const int s, const int e, double **b)
{
int i, j;
int rows;
rows = e - s - (s==0) - (e==0);
nx -= 1;
double h = 1.0 / (double)nx;
for (i = 1; i <= rows; i++) for (j = 1; j < nx; j++)
b[i][j] = 0.25 * (a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
return;
}
double diff(double **a, double **b, const int nx, int s, int e)
{
double sum = 0.0;
int i, j;
int st, ed;
st = (s!=0);
ed = e-s+(s!=0);
for (i = st; i < ed; i++) for (j = 0; j < nx; j++)
sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
return sum;
}
int main(int argc, char *argv[])
{
int nx, ny;
int myid, root, numprocs, period=0;
int nbrbottom, nbrtop, s, e, it;
double diffnorm, dwork;
double t1, t2;
double **a, **b, **f;
root = 0;
MPI_Comm comm1d;
MPI_Init(&argc, &argv);;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!myid)
{
/****** for this piece of code nx and ny are assumed to be the same ******/
printf("Enter the number of cells in X & Y direction\n");
scanf("%d %d", &nx, &ny);
nx += 1;
ny += 1;
ny = nx; //forced to follow our assumption;
}
MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);
MPI_Comm_rank(comm1d, &myid);
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
MPE_decomp1d(ny, numprocs, myid, &s, &e);
int ls, le, rows;
int i, j;
ls = s - (s!=0);
le = e + (e!=nx);
rows = le - ls;
a = (double**)malloc(rows*sizeof(double*));
b = (double**)malloc(rows*sizeof(double*));
f = (double**)malloc(rows*sizeof(double*));
for (i = ls; i < le; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
b[i] = (double*)malloc(nx*sizeof(double));
f[i] = (double*)malloc(nx*sizeof(double));
}
onedinit(a, b, f, nx, s, e);
diffnorm = 0.0;
it = 0;
do
{
// printf("%danshu\n", myid);
exchng1(a, nx, s, e, comm1d, nbrbottom, nbrtop);
sweep1d(a, f, nx, s, e, b);
exchng1(b, nx, s, e, comm1d, nbrbottom, nbrtop);
sweep1d(b, f, nx, s, e, a);
dwork = diff(a, b, nx, s, e);
/************printing matrix a after every iteration******/
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
printf("\n");
}
MPI_Barrier(comm1d);
//printf("%lfhehe\n", dwork);
MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
//printf("%dhere\n", myid);
}
while (++it < maxit && diffnorm > tolerance);
MPI_Finalize();
return 0;
}
So just dumping 130 lines of code on SO and asking why it doesn't work is probably not the best way to get good answers - especially when the only actual sentence you write is "every function is working"... if that were the case, you wouldn't have a problem. You need to narrow things down to some more specific case and get a more specific question.
In this particular case, I've seen lots of code like this in the past while teaching, so it's feasible to see some of what's going on.
First off, you can't do stuff like this:
ls = s - (s!=0);
le = e + (e!=nx);
rows = le - ls;
a = (double**)malloc(rows*sizeof(double*));
/*...*/
for (i = ls; i < le; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
/*...*/
}
If you have 100 rows broken up across 4 processors, and you're (say) MPI Task 2, then your s is 50 and e is 75, and so ls would be 49 and le would be 76, and so you're trying to access a[49..75] even though you've only allocated a of size 27! That particular error comes up all over the code, and needs to be fixed. You want to be accessing a[0..rows-1].
Incidentally, I haven't even checked to see if MPE_decomp1d actually does the right thing. We all go through the phase where we think it's cute in C to put things in one line by using logical expressions multiplied by ternary operators, etc, but seriously, it makes your code unnecessarily tedious to disentangle when someone else has to fix it -- whether it's SOers or yourself 2 months later.
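For what it's worth, here is a sketch of a more readable decomposition that computes the same split as those one-liners under the usual "spread the remainder over the first ranks" convention (the name decomp1d_readable is mine, not from the code):

void decomp1d_readable(int n, int size, int id, int *s, int *e)
{
    int base = n / size;   /* rows every rank gets */
    int rem  = n % size;   /* the first `rem` ranks get one extra row */
    if (id < rem) {
        *s = id * (base + 1);
        *e = *s + base + 1;
    } else {
        *s = rem * (base + 1) + (id - rem) * base;
        *e = *s + base;
    }
}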
In exchng1, you're doing unnecessary work. You don't need to check to see if nbrbottom or nbrtop are valid; if they aren't, MPI_Cart_shift returns MPI_PROC_NULL to which sending or receiving is a no-op. So sending/receiving from those ranks is harmless, which is a great design decision, because it avoids lots of corner cases in the logic.
Similarly, to avoid deadlock you can use MPI_Sendrecv rather than individual Sends and Recvs. That plus the above means that instead of this:
if (coord&1)
{
if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}
you can do this:
MPI_Sendrecv(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
-- way simpler, right?
There's still some problem in the exchange, though; receiving into a[e-s+1] isn't right, although as I've mentioned, I can't be bothered decrypting MPE_decomp1d to figure out why. Presumably you want to be receiving into a[rows-1].
Finally, the MPI_Barrier() is slow and completely unnecessary; there's enough synchronization in the guardcell exchanges (to say nothing of the Allreduce) that you don't need it.
When all those changes are made, the code runs without memory access problems; you'll have to check that it gives the right answers.
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"
const double tolerance = 0.00001;
const int maxit = 10000;
void MPE_decomp1d(int n, int size, int id, int *rows)
{
int s, e;
s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
e = s + (n/size)+((n%size)>id);
*rows = e - s - (s==0) - (e==0);
}
void onedinit(double **a, double **b, double **f, const int nx, const int rows, const int id, const int nprocs)
{
int i, j;
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++)
{
a[i][j] = b[i][j] = f[i][j] = 0.0;
}
}
for (i = 0; i < rows; i++) a[i][0] = b[i][0] = 1;
if (id == 0)
for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int rows, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
int rank, coord;
MPI_Status status;
MPI_Comm_rank(comm1d, &rank);
MPI_Cart_coords(comm1d, rank, 1, &coord);
/* send data downwards */
MPI_Sendrecv(a[rows-2], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
/* send data upwards */
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[rows-1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
void sweep1d(double **a, double **f, const int nx, const int rows, double **b)
{
int i, j;
double h = 1.0 / (double)nx;
for (i = 1; i < rows-1; i++) for (j = 1; j < nx-1; j++)
b[i][j] =
0.25 * ( a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
return;
}
double diff(double **a, double **b, const int nx, const int rows)
{
double sum = 0.0;
int i, j;
for (i = 0; i < rows; i++) for (j = 0; j < nx; j++)
sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
return sum;
}
int main(int argc, char *argv[])
{
int nx, ny;
int myid, root, numprocs, period=0;
int nbrbottom, nbrtop, it;
double diffnorm, dwork;
double **a, **b, **f;
root = 0;
MPI_Comm comm1d;
MPI_Init(&argc, &argv);;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!myid)
{
/****** for this piece of code nx and ny are assumed to be the same ******/
printf("Enter the number of cells in X & Y direction\n");
scanf("%d %d", &nx, &ny);
nx += 1;
ny += 1;
ny = nx; //forced to follow our assumption;
}
MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);
MPI_Comm_rank(comm1d, &myid);
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
int rows;
MPE_decomp1d(ny, numprocs, myid, &rows);
int i, j;
a = (double**)malloc(rows*sizeof(double*));
b = (double**)malloc(rows*sizeof(double*));
f = (double**)malloc(rows*sizeof(double*));
for (i = 0; i < rows; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
b[i] = (double*)malloc(nx*sizeof(double));
f[i] = (double*)malloc(nx*sizeof(double));
}
onedinit(a, b, f, nx, rows, myid, numprocs);
diffnorm = 0.0;
it = 0;
do
{
exchng1(a, nx, rows, comm1d, nbrbottom, nbrtop);
sweep1d(a, f, nx, rows, b);
exchng1(b, nx, rows, comm1d, nbrbottom, nbrtop);
sweep1d(b, f, nx, rows, a);
dwork = diff(a, b, nx, rows);
/************printing matrix a after every iteration******/
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
printf("\n");
}
//printf("%lfhehe\n", dwork);
MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
//printf("%dhere\n", myid);
}
while (++it < maxit && diffnorm > tolerance);
MPI_Finalize();
return 0;
}