I have two MPI processes (ranks 0 and 1), each holding a 3D matrix, of size (x0, y, z) on rank 0 and (x1, y, z) on rank 1, i.e. with different sizes along the x dimension. I would like to send a specific plane (x0 constant, y, z) from the first process to the second and have it replace one of its faces (x1 constant, y, z). The code below seems to work well when the two matrices have identical dimensions (even in x), but it does not send the right face when x0 != x1:
double ***alloc2(int x, int y, int z){
    int i, j;
    double ***array = (double ***) malloc(sizeof(double ***)*x);
    for (i = 0; i < x; i++){
        array[i] = (double **) malloc(sizeof(double*)*y);
        for (j = 0; j < y; j++){
            array[i][j] = (double *) malloc(sizeof(double)*z);
        }
    }
    return array;
}
int main(int argc, char *argv[]){
    MPI_Status status;
    MPI_Comm_size(MPI_COMM_WORLD, &nbr);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Some long code I skipped */
    /* ... */

    MPI_Datatype sub;
    MPI_Type_vector(nL+1, nL+1, nL_thread, MPI_DOUBLE, &sub);
    MPI_Type_commit(&sub);

    if(rank == 0){
        MPI_Send(&c_new[3][0][0], 1, sub, rank+1, 01, MPI_COMM_WORLD);
        MPI_Recv(&c_new[4][0][0], 1, sub, rank+1, 10, MPI_COMM_WORLD, &status);
    }
    if(rank == 1){
        MPI_Recv(&c_new[0][0][0], 1, sub, rank-1, 01, MPI_COMM_WORLD, &status);
        MPI_Send(&c_new[1][0][0], 1, sub, rank-1, 10, MPI_COMM_WORLD);
    }
}
nL is the length in the y and z dimensions, the same for all ranks; nL_thread is the x dimension (in this particular case, nL_thread = 3 for rank 1 and 4 for rank 0). Here I am trying to replace the face (0, y, z) of rank 1 with (3, y, z) of rank 0, and (4, y, z) of rank 0 with (1, y, z) of rank 1.
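A note on the allocation above: each array[i][j] comes from its own malloc, so the 3D data is not one contiguous block, while a derived datatype applied to &c_new[x][0][0] describes fixed offsets from that single address. Below is a minimal sketch of a contiguous alternative, under which the face at constant x is a single run of ny*nz consecutive doubles; the helper name alloc3_contig and the plain-MPI_DOUBLE send in the trailing comment are illustrative additions, not part of the original code.

#include <stdlib.h>

/* Sketch: contiguous 3D allocation indexed as a[x][y][z].
 * All values live in one malloc'd block, so the face at constant x
 * starts at &a[x][0][0] and occupies ny*nz consecutive doubles
 * (here presumably (nL+1)*(nL+1)). */
double ***alloc3_contig(int nx, int ny, int nz){
    double *block = malloc(sizeof(double) * nx * ny * nz);
    double **rows = malloc(sizeof(double *) * nx * ny);
    double ***arr  = malloc(sizeof(double **) * nx);
    for (int i = 0; i < nx; i++){
        arr[i] = &rows[i * ny];
        for (int j = 0; j < ny; j++)
            arr[i][j] = &block[(i * ny + j) * nz];
    }
    return arr;
}

/* With this layout the exchange of one face needs no vector type, e.g. on rank 0:
 * MPI_Send(&c_new[3][0][0], (nL+1)*(nL+1), MPI_DOUBLE, 1, 01, MPI_COMM_WORLD); */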
Below is my code. Currently it tries to distribute the work to be done on a 1D representation of a matrix (a 2D array stored in one dimension). I MPI_Scatter the portion of the array which needs work and store that portion in local_C, which should be the same size as the portion sent. I also broadcast M (columns), Q (used for the gather as the column size), ....
int main(int argc, char *argv[]) {
    int rank;
    int nproc;
    int n_local;
    int N; // rows
    int M; // rows/columns
    int Q; // columns

    MPI_Init(&argc, &argv); /* initialize MPI */
    MPI_Comm comm = MPI_COMM_WORLD;
    MPI_Comm_size(comm, &nproc);
    MPI_Comm_rank(comm, &rank);

    int *matrixA;
    int *matrixB;
    int *matrixC;
    int *local_C;

    // manager core constructs factors of matrix representation
    if (rank == 0) {
        N = atoi(argv[1]);
        M = atoi(argv[2]);
        Q = atoi(argv[3]);
        // check if correct number of input
        if (argc != 4) {
            printf("Enter <filename> <N> <M> <Q>\n");
            exit(1);
        }
        else if (N % nproc != 0) { // check if N is a multiple of the number of processors
            printf("Ensure N is divisible by number of processors: %i\n", nproc);
            exit(1);
        }
        // create matrices of size
        matrixA = malloc(N * M * sizeof(long));
        randomlyFillArray(matrixA, N * M);
        matrixB = malloc(M * Q * sizeof(long));
        randomlyFillArray(matrixB, M * Q);
        // create resulting product matrix of size
        matrixC = malloc(N * Q * sizeof(long));
        // sequential compute
        //computeMatrixProductSequentially(matrixA, matrixB, matrixC, M, N, Q);
        // parallel compute
        // block data
        n_local = N / nproc;
        local_C = malloc(n_local * M * sizeof(long));
        MPI_Bcast(&M, 1, MPI_INT, 0, comm);
        MPI_Bcast(&Q, 1, MPI_INT, 0, comm);
        MPI_Bcast(&n_local, 1, MPI_INT, 0, comm);
        // scatter matrixA for n_local row to cores
        MPI_Scatter(&matrixA, n_local * M, MPI_LONG, &local_C, n_local * M, MPI_LONG, 0, comm);
        // broadcast matrixB to all cores
        MPI_Bcast(&matrixB, 1, MPI_LONG, 0, comm);
    }
    else {
        MPI_Bcast(&M, 1, MPI_INT, 0, comm);
        MPI_Bcast(&Q, 1, MPI_INT, 0, comm);
        MPI_Bcast(&n_local, 1, MPI_INT, 0, comm);
        // scatter recv matrixA row
        MPI_Scatter(&matrixA, n_local * M, MPI_LONG, &local_C, n_local * M, MPI_LONG, 0, comm);
        // broadcast recv matrixB
        MPI_Bcast(&matrixB, 1, MPI_LONG, 0, comm);
        //MPI_Gather();
    }

    MPI_Finalize();
    return 0;
}
Here is the error when trying to compile and run the program.
The purpose, in case it matters, is to multiply two matrices in parallel using 1d arrays.
The problem with your code is that MPI calls take an int*, double*, or whatever-simple-type* argument. Your matrixA is an int*, so using &matrixA makes the buffer an int**. Solution: pass matrixA directly as the buffer.
Also: you are coding as if the scatter operation on non-zero ranks creates the matrix. That is not the case. You need to allocate the array yourself, and MPI will write the values into it.
Another remark: scattering a matrix is not a scalable solution and is bad MPI coding. It introduces both a memory bottleneck, because process zero needs to be able to store all the data, and a time bottleneck, because all other processes have to wait for process zero to construct the matrix. The right way to code this is to let each process construct its own part of the matrix. Always keep your data structure distributed from start to end!
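To make the first two remarks concrete, here is a minimal sketch of what the communication section could look like. It keeps the asker's scatter-based structure rather than the fully distributed construction recommended above; the name local_A, the use of int/MPI_INT throughout (the original mixed int* buffers with sizeof(long) and MPI_LONG), and the M*Q count for the broadcast of matrixB are my choices, not taken from the original.

/* Sketch only: executed by every rank, after rank 0 has parsed argv,
 * allocated and filled matrixA/matrixB, and computed n_local = N / nproc. */
MPI_Bcast(&M, 1, MPI_INT, 0, comm);
MPI_Bcast(&Q, 1, MPI_INT, 0, comm);
MPI_Bcast(&n_local, 1, MPI_INT, 0, comm);

/* Collectives do not allocate receive buffers; every rank must do that itself. */
int *local_A = malloc(n_local * M * sizeof(int));   /* my rows of A            */
local_C = malloc(n_local * Q * sizeof(int));        /* my rows of the product  */
if (rank != 0)
    matrixB = malloc(M * Q * sizeof(int));          /* rank 0 allocated it above */

/* Pass the buffers directly: matrixA, not &matrixA (it is only read on the root). */
MPI_Scatter(matrixA, n_local * M, MPI_INT,
            local_A, n_local * M, MPI_INT, 0, comm);
MPI_Bcast(matrixB, M * Q, MPI_INT, 0, comm);        /* the whole of B to every rank */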
I am trying to find out the time taken by each processor and the total time taken by the whole program, but there seems to be some sort of error. Any suggestions and help would be much appreciated. I had used the same method for another code and it worked there, but I can't seem to figure out the problem in this one.
The code I have written
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mpi.h"

int main(int argc, char** argv){
    int my_rank;
    double time1, time2, duration, global;
    int size;
    float a;
    float b;
    int n;
    float h;
    float local_a;
    float local_b;
    int local_n;
    float integral;
    float total;
    int source;
    int dest = 0;
    int tag = 0;
    MPI_Status status;
    float Trap(float local_a, float local_b, int local_n, float h);

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (my_rank == 0){
        printf("Enter a, b and n \n");
        scanf("%f %f %d", &a, &b, &n);
        for (dest = 1; dest < size; dest++){
            MPI_Send(&a, 1, MPI_FLOAT, dest, tag=0, MPI_COMM_WORLD);
            MPI_Send(&b, 1, MPI_FLOAT, dest, tag=1, MPI_COMM_WORLD);
            MPI_Send(&n, 1, MPI_INT, dest, tag=2, MPI_COMM_WORLD);
        }
    }
    else{
        MPI_Recv(&a, 1, MPI_FLOAT, source, tag=0, MPI_COMM_WORLD, &status);
        MPI_Recv(&b, 1, MPI_FLOAT, source, tag=1, MPI_COMM_WORLD, &status);
        MPI_Recv(&n, 1, MPI_INT, source, tag=2, MPI_COMM_WORLD, &status);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    time1 = MPI_Wtime();

    h = (b-a)/n;
    local_n = n/size;
    local_a = a + my_rank * local_n * h;
    local_b = (local_a + local_n) * h;
    integral = Trap(local_a, local_b, local_n, h);

    if (my_rank == 0){
        total = integral;
        for (source = 1; source < size; source++){
            MPI_Recv(&integral, 1, MPI_FLOAT, source, tag, MPI_COMM_WORLD, &status);
            total += integral;
        }
    }
    else {
        MPI_Send(&integral, 1, MPI_FLOAT, dest, tag, MPI_COMM_WORLD);
    }

    time2 = MPI_Wtime();
    duration = time2 - time1;
    MPI_Reduce(&duration, &global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (my_rank == 0){
        printf("With n = %d trapezoids, our estimate \n", n);
        printf("of the integral from %f to %f = %0.8f\n", a, b, total);
        printf("Global runtime is %f\n", global);
    }
    printf("Runtime at %d is %f \n", my_rank, duration);

    MPI_Finalize();
}

float Trap(float local_a, float local_b, int local_n, float h){
    float integral;
    float x;
    int i;
    float f(float x);
    integral = (f(local_a) + f(local_b))/2.0;
    x = local_a;
    for (int i = 1; i <= local_n-1; i++){
        x += h;
        integral += f(x);
    }
    integral *= h;
}

float f(float x){
    return x*x;
}
The error that it shows
[Sid-Laptop:4987] *** An error occurred in MPI_Recv
[Sid-Laptop:4987] *** reported by process [852688897,2]
[Sid-Laptop:4987] *** on communicator MPI_COMM_WORLD
[Sid-Laptop:4987] *** MPI_ERR_RANK: invalid rank
[Sid-Laptop:4987] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[Sid-Laptop:4987] *** and potentially your MPI job)
Enter a, b and n
[Sid-Laptop:04980] 2 more processes have sent help message help-mpi-errors.txt / mpi_errors_are_fatal
[Sid-Laptop:04980] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
I cannot reproduce your behaviour where removing the Wtime call "fixes" the program, but I suspect what is happening is this:
Your variable "source" is not set. Unset variables have some garbage-value, but they can often be zero. See this question What does uninitialised memory contain?
If your uninitialized source is 0, than it actually has the correct value for the first set of recv-calls. If it is not zero, there is probably no rank with that number, and the call fails.
Answering why the Wtime-call may or may not make it so that on your specific system (compiler+os+hardware+libraries etc) the uninitialized value happens to be zero is hard and also a bit useless. A C-Program that reads an uninitialized variable has so-called "Undefined Behaviour" and can do anything. It is important to understand the concept of undefined behaviour when programming in C. The c-faq describes it like this:
undefined: Anything at all can happen; the Standard imposes no requirements. The program may fail to compile, or it may execute incorrectly (either crashing or silently generating incorrect results), or it may fortuitously do exactly what the programmer intended.
(https://c-faq.com/ansi/undef.html)
This makes C quite different from most programming languages in terms of debugging, and it is the reason commenters advise you to enable compiler warnings and fix them.
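For completeness, a minimal sketch of the kind of fix this implies: the worker ranks know the values come from rank 0, so that rank can be named explicitly instead of relying on the uninitialized source (variable names, tags and the surrounding declarations are those of the question's code).

/* Replacement for the receiving branch: all three sends come from rank 0. */
if (my_rank != 0){
    const int root = 0;   /* the rank that performed the MPI_Send calls */
    MPI_Recv(&a, 1, MPI_FLOAT, root, 0, MPI_COMM_WORLD, &status);
    MPI_Recv(&b, 1, MPI_FLOAT, root, 1, MPI_COMM_WORLD, &status);
    MPI_Recv(&n, 1, MPI_INT,   root, 2, MPI_COMM_WORLD, &status);
}
/* (A single MPI_Bcast of a, b and n from rank 0 would avoid the
 * point-to-point distribution altogether.) */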
I'm trying to get values into my spawned processes using collective MPI functions.
In this case I have an N*N matrix and I want to pass one row to each process, get the values in each process, and sum them.
I'm using this example:
MPI_Scatter of 2D array and malloc
main
int main(int argc, char *argv[]){
    int *n, range, i, j, dato, resultado;
    int *matriz;
    char *nombre_esclave = "esclavo";

    //MPI Section
    int rank, size;
    MPI_Comm hijos;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    matriz = createMatrix(N, N);
    printArray(matriz, N * N);

    //Child process
    MPI_Comm_spawn("slave", MPI_ARGV_NULL, N, MPI_INFO_NULL, 0, MPI_COMM_SELF,
                   &hijos, MPI_ERRCODES_IGNORE);

    // received row will contain N integers
    int *procRow = malloc(sizeof(int) * N);

    MPI_Scatter(matriz, N, MPI_INT,   // send one row, which contains N integers
                procRow, N, MPI_INT,  // receive one row, which contains N integers
                MPI_ROOT, hijos);

    MPI_Finalize();
    return 0;
}
and in slave
slave
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_get_parent(&parent);

if (parent != MPI_COMM_NULL) {
    printf("This is a child process\n");
}

//number of processes in the remote group of comm (integer)
MPI_Comm_remote_size(parent, &size);

int *procRow = malloc(sizeof(int) * N);

//UNABLE TO GET VALUES FROM THE PARENT
//I need to sum all the values of every portion of the matrix
//passed to every child process
MPI_Reduce(procRow, &resultado_global, N, MPI_INT, MPI_SUM, 0, parent);
UPDATE
With MPI_Comm_spawn I create 3 children. In every child I want to get one row of the matrix (I use scatter in the master). Then I use MPI_Reduce to sum every row in the child (that's what I mean by getting values).
UPDATE 2
On the slave I have modified the code, and now I get the rows in every process.
if (parent != MPI_COMM_NULL) {
    //number of processes in the remote group of comm (integer)
    MPI_Comm_remote_size(parent, &size_remote);

    int *matrix = malloc(sizeof(int) * size);
    int *procRow = malloc(sizeof(int) * size);

    MPI_Scatter(matrix, N, MPI_INT, procRow, N, MPI_INT, 0, parent);

    //procRow gets the values correctly from each row of the matrix
    if (procRow != NULL) {
        printf("Process %d; %d %d %d \n", pid, procRow[0], procRow[1], procRow[2]);
    }

    //Unable to sum each row
    MPI_Reduce(procRow, &resultado_global, size, MPI_INT, MPI_SUM, ROOT, parent);
    //MPI_Reduce(procRow, &resultado_global, size, MPI_INT, MPI_SUM, ROOT, MPI_COMM_WORLD);
}
UPDATE 3 (SOLVED)
IN SLAVE
if (parent != MPI_COMM_NULL) {
    //number of processes in the remote group of comm (integer)
    MPI_Comm_remote_size(parent, &size_remote);

    int *matrix = malloc(sizeof(int) * size);
    int *procRow = malloc(sizeof(int) * size);

    MPI_Scatter(matrix, N, MPI_INT, procRow, N, MPI_INT, 0, parent);

    if (procRow != NULL) {
        printf("Process %d; %d %d %d \n", pid, procRow[0], procRow[1], procRow[2]);
        sumaParcial = 0;
        for (int i = 0; i < N; i++)
            sumaParcial = sumaParcial + procRow[i];
    }

    MPI_Reduce(&sumaParcial, &resultado_global, 1, MPI_INT, MPI_SUM, ROOT, parent);
}
IN MASTER
// received row will contain N integers
int *procRow = malloc(sizeof(int) * N);

MPI_Scatter(matriz, N, MPI_INT,   // send one row, which contains N integers
            procRow, N, MPI_INT,  // receive one row, which contains N integers
            MPI_ROOT, hijos);

MPI_Reduce(&sumaParcial, &resultado_global, 1, MPI_INT, MPI_SUM, MPI_ROOT, hijos);

printf("\n GLOBAL RESULT :%d\n", resultado_global);
Any idea?
Thanks
From the edit I suppose that the scatter is working correctly.
Your main confusion seems to be about MPI_Reduce. It does not do any local reduction. According to your graphic, you want to have the values 6, 15, 24 at ranks 0, 1, 2 in the slaves. That is done entirely without MPI, just by iterating over the local rows.
An MPI_Reduce on the rows would lead to the root having [12, 15, 18]. If you just want the total sum 45 at the root of the slaves, you should first sum the values locally and then MPI_Reduce the single value from each rank into a single global value.
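Concretely, a minimal sketch of the slave side (variable names follow the updates above; 0 is assumed to be the rank, in the parent group, of the root of the reduce, i.e. the ROOT macro used there):

/* Sum my own row locally -- no MPI involved: ranks 0, 1, 2 end up with 6, 15, 24. */
int sumaParcial = 0;
for (int i = 0; i < N; i++)
    sumaParcial += procRow[i];

/* Reduce one scalar per slave; the root receives 6 + 15 + 24 = 45.
 * Reducing the rows themselves instead would give the element-wise
 * vector [12, 15, 18], which is not what is wanted here. */
int resultado_global = 0;
MPI_Reduce(&sumaParcial, &resultado_global, 1, MPI_INT, MPI_SUM, 0, parent);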
I am trying to parallelize the following code for the calculation of pi.
My approach is to use a scatter to parallelize the for loop, and then use a reduce to compute the sum and finally print pi.
My code is the following:
#include <stdio.h>
#include <mpi.h>

long num_steps = 100000;
double step = 1.0/100000.0;

int main() {
    int i, myid, size;
    double x, pi, local_sum = 0.0, sum = 0.0;
    double send_vec[num_steps], recv_vect[num_steps];

    // Initialize the MPI environment
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    if (myid == 0){
        int i = 0;
        for (i = 0; i < num_steps; i++){
            send_vec[i] = i;
        }
    }

    MPI_Scatter(send_vec, num_steps/size, MPI_INT, recv_vect,
                num_steps, MPI_INT, 0, MPI_COMM_WORLD);

    for (i = 0; i < num_steps; ++i) {
        x = (recv_vect[i]-0.5)*step;
        local_sum += 4.0/(1.0+x*x);
    }

    MPI_Reduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myid == 0){
        pi = step*sum;
        printf("PI value = %f\n", pi);
    }

    // Finalize the MPI environment.
    MPI_Finalize();
}
The thing is, when I run the program with -np 1 or 2, I get the desired result.
Yet when I run it with 3, 4 or more processes, I get the following error:
PIC_Send(284).........: Negative count, value is -240000
Fatal error in PMPI_Scatter: Invalid count, error stack
The call to MPI_Scatter() needs to be corrected:
MPI_Scatter(send_vec, num_steps/size, MPI_INT, recv_vect,
num_steps, MPI_INT, 0, MPI_COMM_WORLD);
To send doubles, use the datatype MPI_DOUBLE, as you did in the MPI_Reduce().
Since the sendtype is the same as the recvtype, the number of items sent to each process (sendcount) must be equal to the number of items received by each process (recvcount). In the present case, that is num_steps/size.
Finally, the call to MPI_Scatter() will look like:
MPI_Scatter(send_vec, num_steps/size, MPI_DOUBLE, recv_vect,
num_steps/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
Lastly, dynamic memory allocation can be used to avoid using the stack for storing large arrays. Moreover, the allocated space can be decreased so as to reduce the memory footprint:
num_steps = (num_steps/size)*size;
double* send_vec = NULL;
double* recv_vect = NULL;
if (myid == 0){
    /* only the root needs the full array of num_steps values */
    send_vec = malloc(num_steps*sizeof(double));
    if (send_vec == NULL){ fprintf(stderr, "malloc failed\n"); exit(1); }
}
/* every rank only needs its own share of num_steps/size values */
recv_vect = malloc((num_steps/size)*sizeof(double));
if (recv_vect == NULL){ fprintf(stderr, "malloc failed\n"); exit(1); }
...
if (myid == 0){
    free(send_vec);
}
free(recv_vect);
I have an MPI code that implements 2D domain decomposition to compute numerical solutions to a PDE. Currently I write certain 2D distributed arrays out for each process (e.g. array_x--> proc000x.bin). I want to reduce that to a single binary file.
array_0, array_1,
array_2, array_3,
Suppose the above illustrates a cartesian topology with 4 processes (2x2). Each 2D array has dimension (nx + 2, nz + 2). The +2 signifies "ghost" layers added to all sides for communication purposes.
I would like to extract the main arrays (omit the ghost layers) and write them to a single binary file with an order something like,
array_0, array_1, array_2, array_3 --> output.bin
If possible it would be preferable to write it as though I had access to the global grid and was writing row-by-row i.e.,
row 0 of array_0, row 0 of array_1, row 1 of array_0, row 1 of array_1, ....
The code below, in file array_test.c, attempts the former of the two output formats.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

/* 2D array allocation */
float **alloc2D(int rows, int cols);

float **alloc2D(int rows, int cols) {
    int i, j;
    float *data = malloc(rows * cols * sizeof(float));
    float **arr2D = malloc(rows * sizeof(float *));

    for (i = 0; i < rows; i++) {
        arr2D[i] = &(data[i * cols]);
    }

    /* Initialize to zero */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            arr2D[i][j] = 0.0;
        }
    }

    return arr2D;
}

int main(void) {

    /* Creates 5x5 array of floats with padding layers and
     * attempts to write distributed arrays */

    /* Run toy example with 4 processes */
    int i, j, row, col;
    int nx = 5, ny = 5, npad = 1;
    int my_rank, nproc = 4;
    int dim[2] = {2, 2};   /* 2x2 cartesian grid */
    int period[2] = {0, 0};
    int coord[2];
    int reorder = 1;
    float **A = NULL;
    MPI_Comm grid_Comm;

    /* Initialize MPI */
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Establish cartesian topology */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &grid_Comm);

    /* Get cartesian grid indices of processes */
    MPI_Cart_coords(grid_Comm, my_rank, 2, coord);
    row = coord[1];
    col = coord[0];

    /* Add ghost layers */
    nx += 2 * npad;
    ny += 2 * npad;
    A = alloc2D(nx, ny);

    /* Create derived datatype for interior grid (output grid) */
    MPI_Datatype grid;
    int start[2] = {npad, npad};
    int arrsize[2] = {nx, ny};
    int gridsize[2] = {nx - 2 * npad, ny - 2 * npad};

    MPI_Type_create_subarray(2, arrsize, gridsize,
                             start, MPI_ORDER_C, MPI_FLOAT, &grid);
    MPI_Type_commit(&grid);

    /* Fill interior grid */
    for (i = npad; i < nx - npad; i++) {
        for (j = npad; j < ny - npad; j++) {
            A[i][j] = my_rank + i;
        }
    }

    /* MPI IO */
    MPI_File fh;
    MPI_Status status;
    char file_name[100];
    int N, offset;

    sprintf(file_name, "output.bin");

    MPI_File_open(grid_Comm, file_name, MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);

    N = (nx - 2 * npad) * (ny - 2 * npad);
    offset = (row * 2 + col) * N * sizeof(float);

    MPI_File_set_view(fh, offset, MPI_FLOAT, grid, "native",
                      MPI_INFO_NULL);
    MPI_File_write_all(fh, &A[0][0], N, MPI_FLOAT, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Cleanup */
    free(A[0]);
    free(A);
    MPI_Type_free(&grid);
    MPI_Finalize();

    return 0;
}
Compiles with
mpicc -o array_test array_test.c
Runs with
mpiexec -n 4 array_test
While the code compiles and runs, the output is incorrect. I'm assuming that I have misinterpreted the use of the derived datatype and file writing in this case. I'd appreciate some help figuring out my mistakes.
The error you make here is that you have the wrong file view. Instead of creating a type representing the share of the file the current processor is responsible for, you use the mask corresponding to the local data you want to write.
You have actually two very distinct masks to consider:
The mask for the local data, excluding the halo layers; and
The mask for the global data, as it should be once collated into the file.
The former corresponds to this layout:
Here, the data that you want to output to the file for a given process is in dark blue, and the halo layer that should not be written to the file is in lighter blue.
The latter corresponds to this layout:
Here, each colour corresponds to the local data coming from a different process, as distributed on the 2D Cartesian grid.
To understand what you need to create to reach this final result, you have to think backwards:
Your final call to the IO routine should be MPI_File_write_all(fh, &A[0][0], 1, grid, MPI_STATUS_IGNORE);, i.e. the datatype passed there must describe the interior only, excluding the halo boundary. Fortunately, the type grid you created already does exactly that, so we will use it.
But now you have to set the view on the file to allow for this MPI_File_write_all() call. The view must be as described in the second picture, so we will create a new MPI type representing it. For that, MPI_Type_create_subarray() is what we need.
Here is the synopsis of this function:
int MPI_Type_create_subarray(int ndims,
const int array_of_sizes[],
const int array_of_subsizes[],
const int array_of_starts[],
int order,
MPI_Datatype oldtype,
MPI_Datatype *newtype)
Create a datatype for a subarray of a regular, multidimensional array
INPUT PARAMETERS
ndims - number of array dimensions (positive integer)
array_of_sizes
- number of elements of type oldtype in each
dimension of the full array (array of positive integers)
array_of_subsizes
- number of elements of type oldtype in each dimension of
the subarray (array of positive integers)
array_of_starts
- starting coordinates of the subarray in each dimension
(array of nonnegative integers)
order - array storage order flag (state)
oldtype - array element datatype (handle)
OUTPUT PARAMETERS
newtype - new datatype (handle)
For our 2D Cartesian file view, here is what we need for these input parameters (the assembled call is shown right after this list):
ndims: 2, as the grid is 2D
array_of_sizes: the dimensions of the global array to output, namely { nnx*dim[0], nny*dim[1] }
array_of_subsizes: the dimensions of the local share of the data to output, namely { nnx, nny }
array_of_starts: the x,y start coordinates of the local share within the global grid, namely { nnx*coord[0], nny*coord[1] }
order: the ordering is C, so this must be MPI_ORDER_C
oldtype: the data are floats, so this must be MPI_FLOAT
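Assembled into code, using the same names as in the full listing below, the file-view type is built like this:

MPI_Datatype view;
int nnx = nx - 2*npad, nny = ny - 2*npad;           /* interior dimensions          */
int startV[2]    = { coord[0]*nnx, coord[1]*nny };  /* offset into the global grid  */
int arrsizeV[2]  = { dim[0]*nnx, dim[1]*nny };      /* size of the global output    */
int gridsizeV[2] = { nnx, nny };                    /* this rank's share            */
MPI_Type_create_subarray(2, arrsizeV, gridsizeV, startV,
                         MPI_ORDER_C, MPI_FLOAT, &view);
MPI_Type_commit(&view);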
Now that we have our type for the file view, we simply apply it with MPI_File_set_view(fh, 0, MPI_FLOAT, view, "native", MPI_INFO_NULL); and the magic is done.
Your full code becomes:
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

/* 2D array allocation */
float **alloc2D(int rows, int cols);

float **alloc2D(int rows, int cols) {
    int i, j;
    float *data = malloc(rows * cols * sizeof(float));
    float **arr2D = malloc(rows * sizeof(float *));

    for (i = 0; i < rows; i++) {
        arr2D[i] = &(data[i * cols]);
    }

    /* Initialize to zero */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            arr2D[i][j] = 0.0;
        }
    }

    return arr2D;
}

int main(void) {

    /* Creates 5x5 array of floats with padding layers and
     * attempts to write distributed arrays */

    /* Run toy example with 4 processes */
    int i, j, row, col;
    int nx = 5, ny = 5, npad = 1;
    int my_rank, nproc = 4;
    int dim[2] = {2, 2};   /* 2x2 cartesian grid */
    int period[2] = {0, 0};
    int coord[2];
    int reorder = 1;
    float **A = NULL;
    MPI_Comm grid_Comm;

    /* Initialize MPI */
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Establish cartesian topology */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &grid_Comm);

    /* Get cartesian grid indices of processes */
    MPI_Cart_coords(grid_Comm, my_rank, 2, coord);
    row = coord[1];
    col = coord[0];

    /* Add ghost layers */
    nx += 2 * npad;
    ny += 2 * npad;
    A = alloc2D(nx, ny);

    /* Create derived datatype for interior grid (output grid) */
    MPI_Datatype grid;
    int start[2] = {npad, npad};
    int arrsize[2] = {nx, ny};
    int gridsize[2] = {nx - 2 * npad, ny - 2 * npad};

    MPI_Type_create_subarray(2, arrsize, gridsize,
                             start, MPI_ORDER_C, MPI_FLOAT, &grid);
    MPI_Type_commit(&grid);

    /* Fill interior grid */
    for (i = npad; i < nx - npad; i++) {
        for (j = npad; j < ny - npad; j++) {
            A[i][j] = my_rank + i;
        }
    }

    /* Create derived type for file view */
    MPI_Datatype view;
    int nnx = nx - 2*npad, nny = ny - 2*npad;
    int startV[2] = { coord[0]*nnx, coord[1]*nny };
    int arrsizeV[2] = { dim[0]*nnx, dim[1]*nny };
    int gridsizeV[2] = { nnx, nny };

    MPI_Type_create_subarray(2, arrsizeV, gridsizeV,
                             startV, MPI_ORDER_C, MPI_FLOAT, &view);
    MPI_Type_commit(&view);

    /* MPI IO */
    MPI_File fh;

    MPI_File_open(grid_Comm, "output.bin", MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_FLOAT, view, "native", MPI_INFO_NULL);
    MPI_File_write_all(fh, &A[0][0], 1, grid, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Cleanup */
    free(A[0]);
    free(A);
    MPI_Type_free(&view);
    MPI_Type_free(&grid);
    MPI_Finalize();

    return 0;
}