Does MPI_Reduce need an existing pointer for the receive buffer?

The MPI documentation asserts that the address of the receive buffer (recvbuf) is significant only at root, meaning that the memory need not be allocated in the other processes. This is confirmed by this question.
int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype,
MPI_Op op, int root, MPI_Comm comm)
At first I thought that recvbuf did not even have to exist: that the memory for recvbuf itself did not have to be allocated (e.g. by dynamic allocation). Unfortunately (it took me a long time to understand my mistake!), it seems that even though the memory it points to does not have to be valid, the pointer itself has to exist.
See below for the code I have in mind, with a version that gives a segfault, and one that does not.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
int main(int argc, char **argv) {
// MPI initialization
int world_rank, world_size;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int n1 = 3, n2 = 10; // Sizes of the 2d arrays
long **observables = (long **) malloc(n1 * sizeof(long *));
for (int k = 0 ; k < n1 ; ++k) {
observables[k] = (long *) calloc(n2, sizeof(long));
for (long i = 0 ; i < n2 ; ++i) {
observables[k][i] = k * i * world_rank; // Whatever
}
}
long **obs_sum; // This will hold the sum on process 0
#ifdef OLD // Version that gives a segfault
if (world_rank == 0) {
obs_sum = (long **) malloc(n2 * sizeof(long *));
for (int k = 0 ; k < n2 ; ++k) {
obs_sum[k] = (long *) calloc(n2, sizeof(long));
}
}
#else // Correct version
// We define all the pointers in all the processes.
obs_sum = (long **) malloc(n2 * sizeof(long *));
if (world_rank == 0) {
for (int k = 0 ; k < n2 ; ++k) {
obs_sum[k] = (long *) calloc(n2, sizeof(long));
}
}
#endif
for (int k = 0 ; k < n1 ; ++k) {
// This is the line that results in a segfault if OLD is defined
MPI_Reduce(observables[k], obs_sum[k], n2, MPI_LONG, MPI_SUM, 0,
MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
// You may free memory here
return 0;
}
Am I interpreting this correctly? What is the rationale behind this behavior?

The problem is not MPI, but the fact that you are passing obs_sum[k] while obs_sum itself has never been defined/allocated on the non-root processes.
for (int k = 0 ; k < n1 ; ++k) {
// This is the line that results in a segfault if OLD is defined
MPI_Reduce(observables[k], obs_sum[k], n2, MPI_LONG, MPI_SUM, 0,
MPI_COMM_WORLD);
}
Even if MPI_Reduce() does not use the value on the non-root ranks, the generated code still has to evaluate obs_sum[k]: it takes obs_sum (uninitialized and not allocated on those ranks), adds k, and dereferences the result to read the pointer that will be passed to MPI_Reduce(). That dereference is what segfaults.
For example, allocating just the array of row pointers is enough to make that expression valid (the root, of course, still needs real row buffers to receive the result):
#else // Correct version
// We define all the pointers in all the processes.
obs_sum = (long **) malloc(n2 * sizeof(long *));
// try commenting out the following lines
// if (world_rank == 0) {
// for (int k = 0 ; k < n2 ; ++k) {
// obs_sum[k] = (long *) calloc(n2, sizeof(long));
// }
// }
#endif
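In fact, since recvbuf is significant only at root, the non-root ranks may pass NULL for it outright. A minimal sketch of the reduce loop written that way (so obs_sum[k] is only ever evaluated on the root, and even the OLD allocation no longer crashes):
// Only the root evaluates obs_sum[k]; every other rank passes NULL, which
// is fine because MPI_Reduce ignores recvbuf on non-root ranks.
for (int k = 0 ; k < n1 ; ++k) {
    long *recv = (world_rank == 0) ? obs_sum[k] : NULL;
    MPI_Reduce(observables[k], recv, n2, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
}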
I would allocate a 2D array as a flat array - I really hate this array-of-arrays representation. Wouldn't this be better?
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
int main(int argc, char **argv) {
// MPI initialization
int world_rank, world_size;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int n1 = 3, n2 = 10; // Sizes of the 2d arrays
long *observables = (long *) malloc(n1*n2*sizeof(long));
for (int k = 0 ; k < n1 ; ++k) {
for (long i = 0 ; i < n2 ; ++i) {
observables[k*n2+i] = k * i * world_rank; // Whatever
}
}
long *obs_sum = NULL; // This will hold the sum on process 0
if (world_rank == 0) {
obs_sum = (long *) malloc(n1*n2*sizeof(long));
}
MPI_Reduce(observables, obs_sum, n1*n2, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
// You may free memory here
return 0;
}
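With the flat layout, the value that used to live in obs_sum[k][i] is read as obs_sum[k*n2 + i] on the root. A small sketch of how the root might inspect the result and release the buffers (this part is not in the snippet above):
// Read the reduced values on the root with flat indexing, then clean up.
if (world_rank == 0) {
    for (int k = 0; k < n1; ++k)
        for (long i = 0; i < n2; ++i)
            printf("sum[%d][%ld] = %ld\n", k, i, obs_sum[k * n2 + i]);
    free(obs_sum);
}
free(observables);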

Related

MPI_Send with type created through MPI_Type_create_darray

I have the following code:
MPI_Init(NULL, NULL);
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int n_chunks = 16;
assert(n % n_chunks == 0);
int chunk_size = n / n_chunks;
int psizes[2] = {0, 0};
MPI_Dims_create(world_size, 2, psizes);
MPI_Datatype *dist_types = (MPI_Datatype *) malloc(world_size * sizeof(MPI_Datatype));
for (int i = 0; i < world_size; i++) {
int sizes[2] = {n, n};
int distribs[2] = {MPI_DISTRIBUTE_CYCLIC, MPI_DISTRIBUTE_CYCLIC};
int dargs[2] = {chunk_size, chunk_size};
MPI_Type_create_darray(world_size, i, 2,
sizes, distribs, dargs, psizes,
MPI_ORDER_C, MPI_DOUBLE, &dist_types[i]);
MPI_Type_commit(&dist_types[i]);
}
MPI_Request *send_requests;
if (rank == 0) {
send_requests = (MPI_Request *) malloc(world_size * sizeof(MPI_Request));
for (int i = 0; i < world_size; i++) {
MPI_Isend(&A[0][0], 1, dist_types[i],
i, 0, MPI_COMM_WORLD, &send_requests[i]);
}
}
int dist_size;
MPI_Type_size(dist_types[rank], &dist_size);
dist_size /= sizeof(double);
double *D = (double *) malloc(dist_size * sizeof(double));
MPI_Request recv_request;
MPI_Irecv(D, dist_size, MPI_DOUBLE,
0, 0, MPI_COMM_WORLD, &recv_request);
MPI_Wait(&recv_request, MPI_STATUS_IGNORE);
if (rank == 0) {
MPI_Waitall(1, send_requests, MPI_STATUSES_IGNORE);
}
int m = n / psizes[0];
if (rank == 0) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < m; j++) {
printf("%.2lf ", D[i * m + j]);
}
printf("\n");
}
When I print out the matrix D, I don't get a block cyclic view of A as I'd expect. Rather, the entries are all jumbled up and apart from the top row they look quite random.
Hence, my question is: can I generally expect this to work, or are you not really supposed to use MPI_Type_create_darray in this situation? I'm wondering because, from what I could find online, people only mention the function in the context of MPI-IO, and I couldn't locate a single example of it being used in a way similar to what I have.
I'm an MPI novice, so maybe I'm just doing something wrong that's unrelated to the type I'm using. Also, I did read that it's not really ideal to distribute your matrix this way and that one should rather use MPI-IO, but I can't really change that.
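For what it's worth, a short sketch (not part of this question) of how the same darray types behave on the receiving side may help: the type describes positions within the full n x n matrix, so receiving into a flat MPI_DOUBLE buffer, as above, packs the owned elements in the type's traversal order rather than as a local block. Receiving with the matching darray type into a full-size buffer instead puts every element back at its global (row, column) position; the buffer name B below is made up:
// Sketch: receive one element of this rank's darray type into an n x n
// buffer; each incoming value lands at its global position, and positions
// this rank does not own are left untouched.
double *B = (double *) calloc(n * n, sizeof(double));
MPI_Request req;
MPI_Irecv(B, 1, dist_types[rank], 0, 0, MPI_COMM_WORLD, &req);
MPI_Wait(&req, MPI_STATUS_IGNORE);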

distributed algorithm in C

I am a beginner in C. I have to create a distributed architecture with the MPI library. The code is the following:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char **argv)
{
int N, w = 1, L = 2, M = 50; // with N number of threads
int T= 2;
int myid;
int buff;
float mit[N][T]; // I initialize a 2d array
for(int i = 0; i < N; ++i){
mit[i][0]= M / (float) N;
for (int j = 1; j < T; ++j){
mit[i][j] = 0;
}
}
float tab[T]; // 1d array
MPI_Status stat;
/*********************************************
start
*********************************************/
MPI_Init(&argc,&argv); // Initialisation
MPI_Comm_size(MPI_COMM_WORLD, &N);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
for(int j = 0; j < T; j++) {
for(int i = 0; i < N; i++) { // I iterate for each slave
if (myid !=0) {
float y = ((float) rand()) / (float) RAND_MAX;
mit[i][j + 1] = mit[i][j]*(1 + w * L * y);
buff=mit[i][j+1];
MPI_Send(&buff, 128, MPI_INT, 0, 0, MPI_COMM_WORLD); // I send the variable buff to the master
buff=0;
}
if( myid == 0 ) { // Master
for(int i = 1; i < N; i++){
MPI_Recv(&buff, 128, MPI_INT, i, 0, MPI_COMM_WORLD, &stat);
tab[j] += buff; // I need to receive all the variables buff sent by the salves, sum them and stock into the tab at the index j
}
printf("\n%.20f\n",tab[j]); // I print the result of the sum at index j
}
}
}
MPI_Finalize();
return 0;
}
I use the command in the terminal: mpicc .c -o my_file to compile the program
Then mpirun -np 101 my_file_c to start the program with 101 threads
But the problem is that I get the following error in the terminal:
It seems that [at least] one of the processes that was started with
> mpirun did not invoke MPI_INIT before quitting (it is possible that
> more than one process did not invoke MPI_INIT -- mpirun was only
> notified of the first one, which was on node n0).
>
> mpirun can *only* be used with MPI programs (i.e., programs that
> invoke MPI_INIT and MPI_FINALIZE). You can use the "lamexec" program
> to run non-MPI programs over the lambooted nodes.
It seems that I have a problem with the master but I don't know why...
Any idea ???
Thank you :)
This behavior is very likely the result of memory corruption.
You cannot do
int buff=mit[i][j+1];
MPI_Send(&buff, 128, MPI_INT, ...);
because &buff is the address of a single int, so asking MPI_Send() to read 128 MPI_INT elements from it runs far past the end of buff.
Depending on what you want to achieve, you can try instead
int buff=mit[i][j+1];
MPI_Send(&buff, 1, MPI_INT, ...);
// ...
MPI_Recv(&buff, 1, MPI_INT, ...);
or
int *buff=&mit[i][j+1];
MPI_Send(buff, 128, MPI_INT, ...);
// fix MPI_Recv()
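Since mit holds floats, another possibility is a matched pair that keeps the value as a float on both sides; a sketch only, reusing the variable names from the question:
// Worker: send exactly one float per step.
float val = mit[i][j] * (1 + w * L * y);
MPI_Send(&val, 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
// Master: receive one float from worker i and accumulate it.
float contrib;
MPI_Recv(&contrib, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &stat);
tab[j] += contrib;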

How to handle MPI sendcount of zero

What is the correct way to handle sendcount = 0 when using MPI_Gatherv (or any other function that requires a sendcount), in particular when setting up the displs argument?
I have data that needs to be received by all processors, but not all processors necessarily have data to send themselves. As an MWE, I tried (on just two processors):
#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>
int main(void)
{
int ntasks;
int thistask;
int n = 0;
int i;
int totcounts = 0;
int *data;
int *rbuf;
int *rcnts;
int *displs;
int *master_data;
int *master_displs;
// Set up mpi
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &ntasks);
MPI_Comm_rank(MPI_COMM_WORLD, &thistask);
// Allocate memory for arrays needed by allgatherv
rbuf = calloc(ntasks, sizeof(int));
rcnts = calloc(ntasks, sizeof(int));
displs = calloc(ntasks, sizeof(int));
master_displs = calloc(ntasks, sizeof(int));
// Initialize the counts and displacement arrays
for(i = 0; i < ntasks; i++)
{
rcnts[i] = 1;
displs[i] = i;
}
// Allocate data on just one task, but not others
if(thistask == 1)
{
n = 3;
data = calloc(n, sizeof(int));
for(i = 0; i < n; i++)
{
data[i] = i;
}
}
// Get n so each other processor knows about what others are sending
MPI_Allgatherv(&n, 1, MPI_INT, rbuf, rcnts, displs, MPI_INT, MPI_COMM_WORLD);
// Now that we know how much data each processor is sending, we allocate the array
// to hold it all
for(i = 0; i < ntasks; i++)
{
totcounts += rbuf[i];
}
master_data = calloc(totcounts, sizeof(int));
// Get displs for master data
master_displs[0] = 0;
for(i = 1; i < ntasks; i++)
{
master_displs[i] = master_displs[i - 1] + rbuf[i - 1];
}
// Send each processor's data to all others
MPI_Allgatherv(&data, n, MPI_INT, master_data, rbuf, master_displs, MPI_INT, MPI_COMM_WORLD);
// Print it out to see if it worked
if(thistask == 0)
{
for(i = 0; i < totcounts; i++)
{
printf("master_data[%d] = %d\n", i, master_data[i]);
}
}
// Free
if(thistask == 1)
{
free(data);
}
free(rbuf);
free(rcnts);
free(displs);
free(master_displs);
free(master_data);
MPI_Finalize();
return 0;
}
The way that I've set up master_displs works when every processor has a non-zero n (that is, when every processor has data to send). In the case above, where task 0 sends nothing, both entries end up being zero, and the results of this program are garbage. How would I set up the master_displs array to ensure that master_data holds the correct information (in this case, just master_data[i] = i, as received from task 1)?
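A minimal sketch of how the zero-count case is usually handled, assuming the variable names from the example above: the displacements are a plain prefix sum of the gathered counts, and a zero count simply repeats the previous displacement, which MPI_Allgatherv accepts; a rank with n == 0 contributes no elements, so its send buffer is never read:
// Displacements as a prefix sum over the gathered counts; zero counts are
// handled naturally (the displacement just repeats).
master_displs[0] = 0;
for (i = 1; i < ntasks; i++)
    master_displs[i] = master_displs[i - 1] + rbuf[i - 1];
// Pass the data pointer itself as the send buffer; ranks with n == 0 send
// nothing, so NULL is fine there.
MPI_Allgatherv(n > 0 ? data : NULL, n, MPI_INT,
               master_data, rbuf, master_displs, MPI_INT, MPI_COMM_WORLD);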

MPI gathering 2D subarrays

I know this has been answered many times before, and there is a comprehensive answer here which I have read and attempted to use, but I just can't get my code to work for some reason.
I have stripped my code down to make it a bit easier to follow; basically, what I am trying to do is have each process initialise a sub-array and work on it, then put the whole big array back together on rank 0. MPI_Gatherv is giving me a segfault and I cannot figure out why.
Any help would be greatly appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#define N 32
void init_lattice(double **site, int row, int col){
int i,j;
for(i=0; i<row; i++){
for(j=0; j<col; j++){
site[i][j]=(drand48()/4294967295.0 + 0.5)*2*M_PI;
}
}
}
int main(int argc, char *argv[]){
int nprocs, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
int dim = 2;
int grid[dim];
grid[0]=0;
grid[1]=0;
// Assign the grid dimensions
MPI_Dims_create(nprocs, dim, grid);
printf("Dim grid: length: %d, width: %d\n", grid[0], grid[1]);
// The new communicator
MPI_Comm comm_grid;
// Allow cyclic behavior
int periodic[dim];
periodic[0] = 1;
periodic[1] = 1;
// Create the communicator
MPI_Cart_create(MPI_COMM_WORLD, dim, grid, periodic, 0, &comm_grid);
int block_len, block_width;
block_len = N/grid[1];
block_width = N/grid[0];
int i, j;
//Create lattice subset
double *data = (double *) malloc (block_len * block_width * sizeof(double));
double **site = (double **) malloc (block_len * sizeof(double *));
for (i = 0; i < block_len; i++)
site[i] = & (data[i * block_width]);
//Initialise lattice
init_lattice(site, block_len, block_width);
MPI_Datatype newtype, subtype;
int sizes[dim];
sizes[0]=N;
sizes[1]=N;
int subsizes[dim];
subsizes[0] = block_len;
subsizes[1] = block_width;
int starts[dim];
starts[0] = 0;
starts[1] = 0;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &newtype);
MPI_Type_create_resized(newtype, 0, N/grid[1]*sizeof(double), &subtype);
MPI_Type_commit(&subtype);
int sendcounts[grid[0]*grid[1]];
int displs[grid[0]*grid[1]];
if (rank == 0) {
for (i=0; i<grid[0]*grid[1]; i++) sendcounts[i] = 1;
int disp = 0;
for (i=0; i<grid[0]; i++) {
for (j=0; j<grid[1]; j++) {
displs[i*grid[0]+j] = disp;
disp += 1;
}
disp += ((N/grid[1])-1)*grid[0];
}
}
//Create global lattice
double *global_data = (double *) malloc (N * N * sizeof(double));
double **global_site = (double **) malloc (N * sizeof(double *));
for (i = 0; i < N; i++)
global_site[i] = & (global_data[i * N]);
MPI_Gatherv(&(site[0][0]), N*N/(grid[0]*grid[1]), MPI_DOUBLE, &(global_site[0][0]), sendcounts, displs, subtype, 0, MPI_COMM_WORLD);
if(rank==0){
printf("Rank: %d\n", rank);
for(i=0; i<N; i++){
for(j=0; j<N; j++){
printf("%.2lf ", global_site[i][j]);
}
printf("\n");
}
}
return 0;
}
EDIT:
Ok so I have changed my array allocations to contiguous memory and everything is working as it should now. Thanks talonmies!
The fundamental problem here is that MPI expects all allocations to be contiguous blocks of memory. Your site and global_site arrays are not; they are arrays of pointers. The MPI routines read past the end of each individual row allocation, which causes your segfault.
If you want to allocate an n x n array to use with the MPI then you need to replace this:
double **global_site;
if(rank==0){
global_site = malloc(sizeof(double *)*(N));
for(i=0; i<N; i++)
global_site[i] = malloc(sizeof(double)*(N));
}
with something like this:
double *global_site = malloc(sizeof(double)*(N * N));
You will obviously need to adjust the rest of your code accordingly.
It seems the only reason you are actually using arrays of pointers is the convenience of [i][j]-style 2D indexing. If you use linear or pitched linear memory, you can easily write a little preprocessor macro or helper function that gives you that style of indexing into row- or column-major ordered storage, which is still compatible with MPI.
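A minimal sketch of such a helper, assuming the contiguous global_data allocation from the code above (the macro name IDX is made up):
// Row-major indexing into a flat N x N allocation.
#define IDX(i, j, ncols) ((i) * (ncols) + (j))
// global_data[IDX(i, j, N)] plays the role of global_site[i][j], and
// global_data itself can be handed to MPI directly.
global_data[IDX(2, 5, N)] = 1.0; // element at row 2, column 5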

MPI Subarray Sending Error

I first initialize a 4x4 matrix and then try to send the first 2x2 block to the slave process using MPI in C. However, the slave process only receives the first row of the block; the second row is filled with random values read from memory. I can't find what is missing. The code of the program is below:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define SIZE 4
int main(int argc, char** argv)
{
int rank, nproc;
const int root = 0;
const int tag = 3;
int** table;
int* datas;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
datas = malloc(SIZE * SIZE * sizeof(int));
table = malloc(SIZE * sizeof(int*));
for (int i = 0; i < SIZE; i++)
table[i] = &(datas[i * SIZE]);
for (int i = 0; i < SIZE; i++)
for (int k = 0; k < SIZE; k++)
table[i][k] = 0;
table[0][1] = 1;
table[0][2] = 2;
table[1][0] = 3;
table[2][3] = 2;
table[3][1] = 3;
table[3][2] = 4;
if (rank == root){
MPI_Datatype newtype;
int sizes[2] = { 4, 4 }; // size of table
int subsizes[2] = { 2, 2 }; // size of sub-region
int starts[2] = { 0, 0 };
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &newtype);
MPI_Type_commit(&newtype);
MPI_Send(&(table[0][0]), 1, newtype, 1, tag, MPI_COMM_WORLD);
}
else{
int* local_datas = malloc(SIZE * SIZE * sizeof(int));
int** local = malloc(SIZE * sizeof(int*));
for (int i = 0; i < SIZE; i++)
local[i] = &(local_datas[i * SIZE]);
MPI_Recv(&(local[0][0]), 4, MPI_INT, root, tag, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
for (int i = 0; i < 2; i++){
for (int k = 0; k < 2; k++)
printf("%3d ", local[i][k]);
printf("\n");
}
}
MPI_Finalize();
return 0;
}
You have instructed the receive operation to put four integer values consecutively in memory and therefore the 2x2 block is converted to a 1x4 row upon receive (since local is 4x4). The second row of local contains random values since the memory is never initialised.
You should either make use of MPI_Type_create_subarray in both the sender and the receiver in order to place the received data in a 2x2 block or redefine local to be a 2x2 matrix instead of 4x4.
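A minimal sketch of the first option, keeping local as 4x4 and reusing root and tag from the question (the name recvtype is made up):
// Describe where the incoming 2x2 block should land inside the receiver's
// 4x4 array, then receive one element of that type.
MPI_Datatype recvtype;
int rsizes[2]    = { SIZE, SIZE }; // full size of local
int rsubsizes[2] = { 2, 2 };       // size of the received block
int rstarts[2]   = { 0, 0 };       // place it at the top-left corner
MPI_Type_create_subarray(2, rsizes, rsubsizes, rstarts, MPI_ORDER_C, MPI_INT, &recvtype);
MPI_Type_commit(&recvtype);
MPI_Recv(&(local[0][0]), 1, recvtype, root, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Type_free(&recvtype);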
