Trying to make FFTW3 MPI work, getting zeros - c

I am trying to make the 2D MPI FFTW example from http://www.fftw.org/doc/2d-MPI-example.html#g_t2d-MPI-example work.
The example code is not complete, so I had to add a few extra lines to make it compile and test it (the code is below).
For some reason the output of the (in-place) FFT is just zeros as far as I can tell: the values printed after the transform are simply 0 0.
Am I using the MPI FFTW in the wrong way? The example code is simple enough.
/* compile with: mpicc simple_mpi_example.c -Wl,-rpath=/usr/local/lib -lfftw3_mpi -lfftw3 -o simple_mpi_example */
#include <stdio.h>
#include <fftw3-mpi.h>

int main(int argc, char **argv){
    const ptrdiff_t N0 = 1000, N1 = 1000;
    fftw_plan plan;
    fftw_complex *data; // local data of course
    ptrdiff_t alloc_local, local_n0, local_0_start, i, j;

    MPI_Init(&argc, &argv);
    fftw_mpi_init();

    /* get local data size and allocate */
    alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
                                         &local_n0, &local_0_start);
    data = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * alloc_local);

    MPI_Barrier(MPI_COMM_WORLD);
    printf("%ld %ld\n", (long) local_0_start, (long) local_n0);
    MPI_Barrier(MPI_COMM_WORLD);

    /* create plan for forward DFT */
    plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
                                FFTW_FORWARD, FFTW_ESTIMATE);

    /* initialize data to some function my_function(x,y) */
    for (i = 0; i < local_n0; ++i)
        for (j = 0; j < N1; ++j){
            data[i*N1 + j][0] = local_0_start + i;
            data[i*N1 + j][1] = i;
        }

    MPI_Barrier(MPI_COMM_WORLD);
    printf("%f %f\n", data[10*N1 + 10][0], data[10*N1 + 10][1]);
    MPI_Barrier(MPI_COMM_WORLD);

    /* compute transforms, in-place, as many times as desired */
    fftw_execute(plan);

    printf("%f %f\n", data[10*N1 + 10][0], data[10*N1 + 10][1]);

    fftw_destroy_plan(plan);
    fftw_free(data);
    MPI_Finalize();
    printf("finalize\n");
    return 0;
}

Related

Problem with calculating the total time in MPI program using MPI_Wtime()

I am trying to find the time taken by each processor and the total time taken to run the whole program, but there seems to be some sort of error. Any suggestions and help would be much appreciated. I used the same method in another code and it worked there, but I can't figure out the problem in this one.
The code I have written
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "mpi.h"

int main(int argc, char** argv){
    int my_rank;
    double time1, time2, duration, global;
    int size;
    float a;
    float b;
    int n;
    float h;
    float local_a;
    float local_b;
    int local_n;
    float integral;
    float total;
    int source;
    int dest = 0;
    int tag = 0;
    MPI_Status status;
    float Trap(float local_a, float local_b, int local_n, float h);

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (my_rank == 0){
        printf("Enter a, b and n \n");
        scanf("%f %f %d", &a, &b, &n);
        for (dest = 1; dest < size; dest++){
            MPI_Send(&a, 1, MPI_FLOAT, dest, tag=0, MPI_COMM_WORLD);
            MPI_Send(&b, 1, MPI_FLOAT, dest, tag=1, MPI_COMM_WORLD);
            MPI_Send(&n, 1, MPI_INT, dest, tag=2, MPI_COMM_WORLD);
        }
    }
    else{
        MPI_Recv(&a, 1, MPI_FLOAT, source, tag=0, MPI_COMM_WORLD, &status);
        MPI_Recv(&b, 1, MPI_FLOAT, source, tag=1, MPI_COMM_WORLD, &status);
        MPI_Recv(&n, 1, MPI_INT, source, tag=2, MPI_COMM_WORLD, &status);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    time1 = MPI_Wtime();

    h = (b-a)/n;
    local_n = n/size;
    local_a = a + my_rank * local_n * h;
    local_b = (local_a + local_n) * h;
    integral = Trap(local_a, local_b, local_n, h);

    if (my_rank == 0){
        total = integral;
        for (source = 1; source < size; source++){
            MPI_Recv(&integral, 1, MPI_FLOAT, source, tag, MPI_COMM_WORLD, &status);
            total += integral;
        }
    }
    else {
        MPI_Send(&integral, 1, MPI_FLOAT, dest, tag, MPI_COMM_WORLD);
    }

    time2 = MPI_Wtime();
    duration = time2 - time1;
    MPI_Reduce(&duration, &global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (my_rank == 0){
        printf("With n = %d trapezoids, our estimate \n", n);
        printf("of the integral from %f to %f = %0.8f\n", a, b, total);
        printf("Global runtime is %f\n", global);
    }
    printf("Runtime at %d is %f \n", my_rank, duration);

    MPI_Finalize();
}

float Trap(float local_a, float local_b, int local_n, float h){
    float integral;
    float x;
    int i;
    float f(float x);

    integral = (f(local_a) + f(local_b))/2.0;
    x = local_a;
    for (int i = 1; i <= local_n-1; i++){
        x += h;
        integral += f(x);
    }
    integral *= h;
}

float f(float x){
    return x*x;
}
The error that it shows
[Sid-Laptop:4987] *** An error occurred in MPI_Recv
[Sid-Laptop:4987] *** reported by process [852688897,2]
[Sid-Laptop:4987] *** on communicator MPI_COMM_WORLD
[Sid-Laptop:4987] *** MPI_ERR_RANK: invalid rank
[Sid-Laptop:4987] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[Sid-Laptop:4987] *** and potentially your MPI job)
Enter a, b and n
[Sid-Laptop:04980] 2 more processes have sent help message help-mpi-errors.txt / mpi_errors_are_fatal
[Sid-Laptop:04980] Set MCA parameter "orte_base_help_aggregate" to 0 to see all help / error messages
I cannot reproduce your behaviour where removing the Wtime call "fixes" the program, but I suspect what is happening is this:
Your variable "source" is not set. Unset variables have some garbage-value, but they can often be zero. See this question What does uninitialised memory contain?
If your uninitialized source is 0, than it actually has the correct value for the first set of recv-calls. If it is not zero, there is probably no rank with that number, and the call fails.
Answering why the Wtime-call may or may not make it so that on your specific system (compiler+os+hardware+libraries etc) the uninitialized value happens to be zero is hard and also a bit useless. A C-Program that reads an uninitialized variable has so-called "Undefined Behaviour" and can do anything. It is important to understand the concept of undefined behaviour when programming in C. The c-faq describes it like this:
undefined: Anything at all can happen; the Standard imposes no requirements. The program may fail to compile, or it may execute incorrectly (either crashing or silently generating incorrect results), or it may fortuitously do exactly what the programmer intended.
(https://c-faq.com/ansi/undef.html)
This makes C really quite different from most programming languages in terms of debugging, and it is the reason commenters advise you to enable compiler warnings and fix them.
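If it helps, here is a minimal, self-contained sketch (my illustration, not your program) of how distributing a, b and n with MPI_Bcast sidesteps the problem entirely: every rank names rank 0 explicitly, so there is no source variable left to forget.

/* Minimal sketch: broadcast the inputs from rank 0 instead of point-to-point
 * sends, so no receiver ever needs a "source" variable. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int my_rank, size;
    float a = 0.0f, b = 0.0f;
    int n = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (my_rank == 0) {
        /* In the real program these would come from scanf(). */
        a = 0.0f; b = 1.0f; n = 1024;
    }

    /* Every rank passes root = 0 explicitly. */
    MPI_Bcast(&a, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&b, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d of %d has a=%f b=%f n=%d\n", my_rank, size, a, b, n);

    MPI_Finalize();
    return 0;
}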

Allocating memory buffer for use in MPI_Pack()

I'm going to use MPI_Pack() to make a message composed of n ints and m doubles. Their positions in the message buffer will be something like this
p1 x ints, q1 x doubles, p2 x ints, q2 x doubles, ..., pN x ints, qN x doubles
where n=p1+p2+...+pN and m=q1+q2+...+qN.
My question: Is the size of this message equal to the size of a message composed of the same number of ints and doubles but with the following order:
n x ints, m x doubles
I'm asking this question because I want to know how much memory should be allocated for the buffer. If the size of the message depends only on the number of ints and doubles and not how they are arranged, then the buffer can be allocated very easily:
MPI_Pack_size(n, MPI_INT, communicator, &k1);
MPI_Pack_size(m, MPI_DOUBLE, communicator, &k2);
buffer = malloc(k1 + k2);
Obviously the following solution is correct:
k = 0;
for (int i = 0; i < N; i++)
{
    MPI_Pack_size(p[i], MPI_INT, communicator, &k1);
    MPI_Pack_size(q[i], MPI_DOUBLE, communicator, &k2);
    k += k1 + k2;
}
buffer = malloc(k);
But for a large N, it may result in an excessively large buffer because, as the official MPI documentation states, the routine MPI_Pack_size()
returns an upper bound, rather than an exact bound, since the exact amount of space needed to pack the message may depend on the context (e.g., first message packed in a packing unit may take more space).
UPDATE: a program I wrote to test whether the order of packing the ints and doubles affects the size of the message.
#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <stdlib.h>
#include <time.h>

#define BUFF_SIZE 200000 /* buffer size in bytes */
#define MY_MPI_REAL MPI_DOUBLE
typedef double real;

int main()
{
    MPI_Init(NULL, NULL);

    int ic = 0, rc = 0; /* counters of int and real numbers */
    int pos = 0;        /* position in the buffer, used in MPI_Pack() calls */

    /* allocate memory of the pack buffer */
    void *buff = malloc(BUFF_SIZE);
    assert(buff);

    /* case 1: packing a large number of pairs of arrays */
    srand(time(NULL));
    for (int i=0; i<100; i++) /* 100 array pairs */
    {
        /* make int and real arrays of random lengths */
        int ik = 99 * ((double)rand() / RAND_MAX) + 1;
        int rk = 99 * ((double)rand() / RAND_MAX) + 1;
        int *iarr = (int *)malloc(ik * sizeof(int));
        assert(iarr);
        double *rarr = (real *)malloc(rk * sizeof(real));
        assert(rarr);
        ic += ik;
        rc += rk;

        /* pack the array pair */
        MPI_Pack(iarr, ik, MPI_INT, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
        MPI_Pack(rarr, rk, MY_MPI_REAL, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);

        free(iarr);
        free(rarr);
    }
    printf("final position for case 1 = %d\n", pos);

    /* case 2: packing a single pair of arrays */
    pos = 0;
    int *iarr = (int *)malloc(ic * sizeof(int));
    assert(iarr);
    double *rarr = (real *)malloc(rc * sizeof(real));
    assert(rarr);
    MPI_Pack(iarr, ic, MPI_INT, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
    MPI_Pack(rarr, rc, MY_MPI_REAL, buff, BUFF_SIZE, &pos, MPI_COMM_WORLD);
    free(iarr);
    free(rarr);
    printf("final position for case 2 = %d\n", pos);

    free(buff);

    printf("sizeof(int) = %ld, sizeof(real) = %ld\n", sizeof(int), sizeof(real));
    printf("num of ints = %d, num of reals = %d\n", ic, rc);
    printf("num of ints x sizeof(int) + num of reals x sizeof(real) = %ld\n", ic*sizeof(int)+rc*sizeof(real));

    MPI_Finalize();
}
I think your worries are misplaced. The only possible overhead I see would be from alignment: maybe a one-time alignment at the start of the buffer, and then maybe per element. However, the pack buffer is counted in bytes, and I just tested it: even packing a single byte does not lead to any padding. So that leads me to suspect that every data type basically takes the exact amount of space.
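For what it's worth, a small sketch along the lines below (my own illustration; N, p[] and q[] are made-up test values) lets you compare the summed per-chunk MPI_Pack_size() bounds against the aggregated bound directly, so you can see on your implementation whether the arrangement matters for the allocation at all.

/* Sketch (my illustration): compare the per-chunk upper bound with the
 * aggregated upper bound returned by MPI_Pack_size(). */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    /* Hypothetical chunk sizes; N, p[] and q[] are made up for the test. */
    enum { N = 4 };
    int p[N] = { 10, 20, 30, 40 };   /* ints per chunk    */
    int q[N] = { 5, 15, 25, 35 };    /* doubles per chunk */
    int n = 0, m = 0, k = 0, k1, k2;

    MPI_Init(&argc, &argv);

    for (int i = 0; i < N; i++) {
        MPI_Pack_size(p[i], MPI_INT, MPI_COMM_WORLD, &k1);
        MPI_Pack_size(q[i], MPI_DOUBLE, MPI_COMM_WORLD, &k2);
        k += k1 + k2;
        n += p[i];
        m += q[i];
    }

    MPI_Pack_size(n, MPI_INT, MPI_COMM_WORLD, &k1);
    MPI_Pack_size(m, MPI_DOUBLE, MPI_COMM_WORLD, &k2);

    printf("per-chunk bound = %d bytes, aggregated bound = %d bytes\n",
           k, k1 + k2);

    MPI_Finalize();
    return 0;
}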

Not quite understanding MPI

I am attempting to make a program that will find the value of PI using MPI.
Currently I can find the sum this way:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUMSTEPS 1000000

int main() {
    int i;
    double x, pi, sum = 0.0;
    struct timespec start, end;

    clock_gettime(CLOCK_MONOTONIC, &start);

    double step = 1.0/(double) NUMSTEPS;
    x = 0.5 * step;
    for (i = 0; i <= NUMSTEPS; i++){
        x += step;
        sum += 4.0/(1.0 + x*x);
    }
    pi = step * sum;

    clock_gettime(CLOCK_MONOTONIC, &end);
    u_int64_t diff = 1000000000L * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;

    printf("PI is %.20f\n", pi);
    printf("elapsed time = %llu nanoseconds\n", (long long unsigned int) diff);
    return 0;
}
But this does not use MPI.
So I have tried to make my own in MPI. My logic is:
Split the 1000000 into equal parts based on how many processors I have
Calculate the values for each range
Send the calculated value back to the master and then divide by the number of processors. I would like to keep the main process free so it does no work, similar to a master-slave system.
Here's what I have currently. It doesn't seem to be working, and the send/receive calls give errors about incompatible variable types for receive and send.
#include <mpi.h>
#include <stdio.h>
#include <string.h>

#define NUMSTEPS 1000000

int main(int argc, char** argv) {
    int comm_sz; //number of processes
    int my_rank; //my process rank

    // Initialize the MPI environment
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // Get the name of the processor
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);

    // Slaves
    if (my_rank != 0) {
        // Process math then send
        int i;
        double x, pi, sum = 0.0;
        double step = 1.0/(double) NUMSTEPS;
        x = 0.5 * step;

        // Find the start and end for the number
        int processors = comm_sz - 1;
        int thread_multi = NUMSTEPS / processors;
        int start = my_rank * thread_multi;
        if((my_rank - 1) != 0){
            start += 1;
        }
        int end = start + thread_multi;
        for (i = start; i <= end; i++){
            x += step;
            sum += 4.0 / (1.0 + x * x);
        }
        pi = step * sum;
        MPI_Send(pi, 1.0, MPI_DOUBLE 1, 0, MPI_COMM_WORLD);
    // Master
    } else {
        // Things in here only get called once.
        double pi = 0.0;
        double total = 0.0;
        for (int q = 1; q < comm_sz; q++) {
            MPI_Recv(pi, 1, MPI_DOUBLE, q, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            total += pi;
            pi = 0.0;
        }
        // Take the added totals and divide by amount of processors that processed, to get the average
        double finished = total / (comm_sz - 1);
        // Print sum here
        printf("Pi Is: %d", finished);
    }
    // Finalize the MPI environment.
    MPI_Finalize();
}
I've currently spent around 3 hours working on this. Never used MPI. Any help would be greatly appreciated.
Try compiling with more compiler warnings enabled and fix them; for instance, -Wall -Wextra should give you excellent clues about what the issues are.
According to MPI_Send documentation the first argument is a pointer, so you seem to be ignoring an automatic "conversion to pointer" error. You have the same issue in the MPI_Recv() call.
You can try to pass pi as &pi in MPI_Recv and MPI_Send and check if that fixes the error.
As a side note, you can declare dummy variables such as pi as local variables inside the master loop to avoid side effects:
for (int q = 1; q < comm_sz; q++) {
    double pi = 0;
    MPI_Recv(&pi, 1, MPI_DOUBLE, q, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    total += pi;
}
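On the sending side, a corrected call would look something like the sketch below. This is only an illustrative stand-alone program, not your full pi code, but it shows the shape of the fixed MPI_Send/MPI_Recv pair: a pointer for the buffer, an integer count, and a comma between the datatype and the destination rank.

/* Minimal sketch (not the full pi program): each non-root rank sends one
 * double to rank 0 with MPI_Send; note the & on the buffer, the int count,
 * and the comma between MPI_DOUBLE and the destination rank. */
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int comm_sz, my_rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    if (my_rank != 0) {
        double pi = 3.14;   /* stand-in for the partial sum */
        MPI_Send(&pi, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
    } else {
        double total = 0.0;
        for (int q = 1; q < comm_sz; q++) {
            double pi = 0.0;
            MPI_Recv(&pi, 1, MPI_DOUBLE, q, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            total += pi;
        }
        /* %f, not %d, for a double */
        printf("total = %f\n", total);
    }

    MPI_Finalize();
    return 0;
}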

Open MPI Waitall() Segmentation Fault

I'm new to MPI and I'm trying to develop a non-blocking program (with Isend and Irecv). The functionality is very basic (it's educational):
There is one process (rank 0) that is the master and receives messages from the slaves (ranks 1 to P-1). The master only receives results.
The slaves generate an array of N random numbers between 0 and R and then do some operations with those numbers (again, it's just for educational purposes; the operations don't make any sense).
This whole process (operations + sending data) is done M times (this is just for comparing different implementations: blocking and non-blocking).
I get a segmentation fault in the master process when calling the MPI_Waitall() function.
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include <math.h>
#include <time.h>

#define M 1000 //Number of times
#define N 2000 //Quantity of random numbers
#define R 1000 //Max value of random numbers

double SumaDeRaices (double*);

int main(int argc, char* argv[]) {
    int yo;   /* rank of process */
    int p;    /* number of processes */
    int dest; /* rank of receiver */

    /* Start up MPI */
    MPI_Init(&argc, &argv);
    /* Find out process rank */
    MPI_Comm_rank(MPI_COMM_WORLD, &yo);
    /* Find out number of processes */
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    MPI_Request reqs[p-1];
    MPI_Status stats[p-1];

    if (yo == 0) {
        int i,j;
        double result;
        clock_t inicio,fin;
        inicio = clock();
        for(i = 0; i<M; i++){ //M times
            for(j = 1; j<p; j++){ //for every slave
                MPI_Irecv(&result, sizeof(double), MPI_DOUBLE, j, i, MPI_COMM_WORLD, &reqs[j-1]);
            }
            MPI_Waitall(p-1,reqs,stats); //wait for all slaves (SEG_FAULT)
        }
        fin = clock()-inicio;
        printf("Tiempo total de ejecucion %f segundos \n", ((double)fin)/CLOCKS_PER_SEC);
    }
    else {
        double* numAleatorios = (double*) malloc( sizeof(double) * ((double) N) ); //array with numbers
        int i,j;
        double resultado;
        dest = 0;
        for(i=0; i<M; i++){ //again, M times
            for(j=0; j<N; j++){
                numAleatorios[j] = rand() % R ;
            }
            resultado = SumaDeRaices(numAleatorios);
            MPI_Isend(&resultado, sizeof(double), MPI_DOUBLE, dest, i, MPI_COMM_WORLD, &reqs[p-1]); //send result to master
        }
    }

    /* Shut down MPI */
    MPI_Finalize();
    exit(0);
} /* main */

double SumaDeRaices (double* valores){
    int i;
    double sumaTotal = 0.0;
    //square roots of the values and their sum
    for(i=0; i<N; i++){
        sumaTotal = sqrt(valores[i]) + sumaTotal;
    }
    return sumaTotal;
}
There are several issues with your code. First and foremost, in your Isend you pass &resultado several times without waiting until the previous non-blocking operation has finished. You are not allowed to reuse the buffer you pass to Isend before you have made sure the operation is complete.
Instead, I recommend using a normal Send, because in contrast to a synchronous send (Ssend), a normal blocking send returns as soon as you can reuse the buffer.
Second, there is no need to use message tags. I recommend just setting the tag to 0; in terms of performance it is simply faster.
Third, the result shouldn't be a simple variable, but an array of size at least (p-1).
Fourth, I do not recommend allocating arrays such as the MPI_Request and MPI_Status arrays on the stack if the size is not a known small number. In this case the array size is (p-1), so you had better use malloc for these data structures.
Fifth, if you do not check the status, use MPI_STATUSES_IGNORE.
Also, instead of sizeof(double) you should specify the number of items (1).
But of course the absolutely best version is just to use MPI_Gather.
Moreover, there is generally no reason not to run the computations on the root node as well.
Here is a slightly rewritten example:
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include <math.h>
#include <time.h>

#define M 1000 //Number of times
#define N 2000 //Quantity of random numbers
#define R 1000 //Max value of random numbers

double SumaDeRaices (double* valores)
{
    int i;
    double sumaTotal = 0.0;
    //square roots of the values and their sum
    for(i=0; i<N; i++) {
        sumaTotal = sqrt(valores[i]) + sumaTotal;
    }
    return sumaTotal;
}

int main(int argc, char* argv[]) {
    int yo; /* rank of process */
    int p;  /* number of processes */

    /* Start up MPI */
    MPI_Init(&argc, &argv);
    /* Find out process rank */
    MPI_Comm_rank(MPI_COMM_WORLD, &yo);
    /* Find out number of processes */
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    double *result;
    clock_t inicio, fin;
    double *numAleatorios;

    if (yo == 0) {
        inicio = clock();
    }

    numAleatorios = (double*) malloc( sizeof(double) * ((double) N) ); //array with numbers
    result = (double *) malloc(sizeof(double) * p);

    for(int i = 0; i<M; i++){ //M times
        for(int j=0; j<N; j++) {
            numAleatorios[j] = rand() % R ;
        }
        double local_result = SumaDeRaices(numAleatorios);
        MPI_Gather(&local_result, 1, MPI_DOUBLE, result, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); //gather results on the master
    }

    if (yo == 0) {
        fin = clock()-inicio;
        printf("Tiempo total de ejecucion %f segundos \n", ((double)fin)/CLOCKS_PER_SEC);
    }

    free(numAleatorios);
    free(result);

    /* Shut down MPI */
    MPI_Finalize();
} /* main */
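A possible compile-and-run line for this example (assuming the file is saved as gather_example.c, a name made up here) would be something like the following; note that -lm is needed because of the sqrt() call:
mpicc -Wall gather_example.c -o gather_example -lm
mpiexec -n 4 ./gather_example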

Writing distributed arrays using MPI-IO and Cartesian topology

I have an MPI code that implements 2D domain decomposition to compute numerical solutions to a PDE. Currently I write certain 2D distributed arrays out for each process (e.g. array_x--> proc000x.bin). I want to reduce that to a single binary file.
array_0, array_1,
array_2, array_3,
Suppose the above illustrates a cartesian topology with 4 processes (2x2). Each 2D array has dimension (nx + 2, nz + 2). The +2 signifies "ghost" layers added to all sides for communication purposes.
I would like to extract the main arrays (omit the ghost layers) and write them to a single binary file with an order something like,
array_0, array_1, array_2, array_3 --> output.bin
If possible, it would be preferable to write it as though I had access to the global grid and was writing row by row, i.e.,
row 0 of array_0, row 0 of array_1, row 1 of array_0, row 1 of array_1, ...
The attempt below, in file array_test.c, tries the former of the two output formats.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

/* 2D array allocation */
float **alloc2D(int rows, int cols);

float **alloc2D(int rows, int cols) {
    int i, j;
    float *data = malloc(rows * cols * sizeof(float));
    float **arr2D = malloc(rows * sizeof(float *));
    for (i = 0; i < rows; i++) {
        arr2D[i] = &(data[i * cols]);
    }

    /* Initialize to zero */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            arr2D[i][j] = 0.0;
        }
    }

    return arr2D;
}

int main(void) {
    /* Creates 5x5 array of floats with padding layers and
     * attempts to write distributed arrays */
    /* Run toy example with 4 processes */
    int i, j, row, col;
    int nx = 5, ny = 5, npad = 1;
    int my_rank, nproc = 4;
    int dim[2] = {2, 2}; /* 2x2 cartesian grid */
    int period[2] = {0, 0};
    int coord[2];
    int reorder = 1;
    float **A = NULL;
    MPI_Comm grid_Comm;

    /* Initialize MPI */
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Establish cartesian topology */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &grid_Comm);

    /* Get cartesian grid indices of processes */
    MPI_Cart_coords(grid_Comm, my_rank, 2, coord);
    row = coord[1];
    col = coord[0];

    /* Add ghost layers */
    nx += 2 * npad;
    ny += 2 * npad;
    A = alloc2D(nx, ny);

    /* Create derived datatype for interior grid (output grid) */
    MPI_Datatype grid;
    int start[2] = {npad, npad};
    int arrsize[2] = {nx, ny};
    int gridsize[2] = {nx - 2 * npad, ny - 2 * npad};
    MPI_Type_create_subarray(2, arrsize, gridsize,
                             start, MPI_ORDER_C, MPI_FLOAT, &grid);
    MPI_Type_commit(&grid);

    /* Fill interior grid */
    for (i = npad; i < nx-npad; i++) {
        for (j = npad; j < ny-npad; j++) {
            A[i][j] = my_rank + i;
        }
    }

    /* MPI IO */
    MPI_File fh;
    MPI_Status status;
    char file_name[100];
    int N, offset;

    sprintf(file_name, "output.bin");
    MPI_File_open(grid_Comm, file_name, MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);

    N = (nx - 2 * npad) * (ny - 2 * npad);
    offset = (row * 2 + col) * N * sizeof(float);
    MPI_File_set_view(fh, offset, MPI_FLOAT, grid, "native",
                      MPI_INFO_NULL);
    MPI_File_write_all(fh, &A[0][0], N, MPI_FLOAT, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Cleanup */
    free(A[0]);
    free(A);
    MPI_Type_free(&grid);
    MPI_Finalize();
    return 0;
}
Compiles with
mpicc -o array_test array_test.c
Runs with
mpiexec -n 4 array_test
While the code compiles and runs, the output is incorrect. I'm assuming that I have misinterpreted the use of the derived datatype and file writing in this case. I'd appreciate some help figuring out my mistakes.
The error you make here is that you have the wrong file view. Instead of creating a type representing the share of the file the current processor is responsible for, you use the mask corresponding to the local data you want to write.
You have actually two very distinct masks to consider:
The mask for the local data, excluding the halo layers; and
The mask for the global data, as it should be once collated into the file.
The former corresponds to this layout:
Here, the data that you want to output to the file for a given process is in dark blue, and the halo layer that should not be written to the file is in lighter blue.
The latter corresponds to this layout:
Here, each colour corresponds to the local data coming from a different process, as distributed on the 2D Cartesian grid.
To understand what you need to create to reach this final result, you have to think backwards:
Your final call to the I/O routine should be MPI_File_write_all(fh, &A[0][0], 1, interior, MPI_STATUS_IGNORE);. So you have to have your interior type defined so as to exclude the halo boundary. Fortunately, the type grid you created already does exactly that, so we will use it.
But now, you have to set the view on the file to allow for this MPI_File_write_all() call. The view must therefore be the second layout described above. We will create a new MPI type representing it, and for that, MPI_Type_create_subarray() is what we need.
Here is the synopsis of this function:
int MPI_Type_create_subarray(int ndims,
                             const int array_of_sizes[],
                             const int array_of_subsizes[],
                             const int array_of_starts[],
                             int order,
                             MPI_Datatype oldtype,
                             MPI_Datatype *newtype)

Create a datatype for a subarray of a regular, multidimensional array

INPUT PARAMETERS
    ndims             - number of array dimensions (positive integer)
    array_of_sizes    - number of elements of type oldtype in each dimension of the full array (array of positive integers)
    array_of_subsizes - number of elements of type oldtype in each dimension of the subarray (array of positive integers)
    array_of_starts   - starting coordinates of the subarray in each dimension (array of nonnegative integers)
    order             - array storage order flag (state)
    oldtype           - array element datatype (handle)

OUTPUT PARAMETERS
    newtype           - new datatype (handle)
For our 2D Cartesian file view, here are what we need for these input parameters:
ndims: 2 as the grid is 2D
array_of_sizes: these are the dimensions of the global array to output, namely { nnx*dim[0], nny*dim[1] }
array_of_subsizes: these are the dimensions of the local share of the data to output, namely { nnx, nny }
array_of_starts: these are the x,y start coordinates of the local share into the global grid, namely { nnx*coord[0], nny*coord[1] }
order: the ordering is C so this must be MPI_ORDER_C
oldtype: data are floats so this must be MPI_FLOAT
Now that we have our type for the file view, we simply apply it with MPI_File_set_view(fh, 0, MPI_FLOAT, view, "native", MPI_INFO_NULL); and the magic is done.
Your full code becomes:
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

/* 2D array allocation */
float **alloc2D(int rows, int cols);

float **alloc2D(int rows, int cols) {
    int i, j;
    float *data = malloc(rows * cols * sizeof(float));
    float **arr2D = malloc(rows * sizeof(float *));
    for (i = 0; i < rows; i++) {
        arr2D[i] = &(data[i * cols]);
    }

    /* Initialize to zero */
    for (i = 0; i < rows; i++) {
        for (j = 0; j < cols; j++) {
            arr2D[i][j] = 0.0;
        }
    }

    return arr2D;
}

int main(void) {
    /* Creates 5x5 array of floats with padding layers and
     * attempts to write distributed arrays */
    /* Run toy example with 4 processes */
    int i, j, row, col;
    int nx = 5, ny = 5, npad = 1;
    int my_rank, nproc = 4;
    int dim[2] = {2, 2}; /* 2x2 cartesian grid */
    int period[2] = {0, 0};
    int coord[2];
    int reorder = 1;
    float **A = NULL;
    MPI_Comm grid_Comm;

    /* Initialize MPI */
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    /* Establish cartesian topology */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, period, reorder, &grid_Comm);

    /* Get cartesian grid indices of processes */
    MPI_Cart_coords(grid_Comm, my_rank, 2, coord);
    row = coord[1];
    col = coord[0];

    /* Add ghost layers */
    nx += 2 * npad;
    ny += 2 * npad;
    A = alloc2D(nx, ny);

    /* Create derived datatype for interior grid (output grid) */
    MPI_Datatype grid;
    int start[2] = {npad, npad};
    int arrsize[2] = {nx, ny};
    int gridsize[2] = {nx - 2 * npad, ny - 2 * npad};
    MPI_Type_create_subarray(2, arrsize, gridsize,
                             start, MPI_ORDER_C, MPI_FLOAT, &grid);
    MPI_Type_commit(&grid);

    /* Fill interior grid */
    for (i = npad; i < nx-npad; i++) {
        for (j = npad; j < ny-npad; j++) {
            A[i][j] = my_rank + i;
        }
    }

    /* Create derived type for file view */
    MPI_Datatype view;
    int nnx = nx - 2*npad, nny = ny - 2*npad;
    int startV[2] = { coord[0]*nnx, coord[1]*nny };
    int arrsizeV[2] = { dim[0]*nnx, dim[1]*nny };
    int gridsizeV[2] = { nnx, nny };
    MPI_Type_create_subarray(2, arrsizeV, gridsizeV,
                             startV, MPI_ORDER_C, MPI_FLOAT, &view);
    MPI_Type_commit(&view);

    /* MPI IO */
    MPI_File fh;
    MPI_File_open(grid_Comm, "output.bin", MPI_MODE_CREATE | MPI_MODE_WRONLY,
                  MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_FLOAT, view, "native", MPI_INFO_NULL);
    MPI_File_write_all(fh, &A[0][0], 1, grid, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    /* Cleanup */
    free(A[0]);
    free(A);
    MPI_Type_free(&view);
    MPI_Type_free(&grid);
    MPI_Finalize();
    return 0;
}
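As a quick sanity check (my addition, not part of the answer), a tiny serial reader like the sketch below can print the collated global grid from output.bin after the 4-process toy run; the global size is dim[0]*nnx by dim[1]*nny, i.e. 10 by 10.

/* Sketch: serial check that reads output.bin back and prints the
 * collated 10x10 global grid row by row. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int rows = 10, cols = 10;   /* dim[0]*nnx and dim[1]*nny for the toy run */
    float buf[100];

    FILE *fp = fopen("output.bin", "rb");
    if (fp == NULL) {
        fprintf(stderr, "could not open output.bin\n");
        return 1;
    }
    if (fread(buf, sizeof(float), rows * cols, fp) != (size_t)(rows * cols)) {
        fprintf(stderr, "short read from output.bin\n");
        fclose(fp);
        return 1;
    }
    fclose(fp);

    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++)
            printf("%5.1f ", buf[i * cols + j]);
        printf("\n");
    }
    return 0;
}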
