MPI_Waitany causes segmentation fault - c

I am using MPI to distribute images to different processes so that:
Process 0 distributes images to the other processes.
Processes other than 0 process the image and then send the result back to process 0.
Process 0 tries to keep every other process busy: as soon as a process finishes its current image and becomes idle, it is assigned another image to process. The code follows:
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"
#define MAXPROC 16 /* Max number of processes */
#define TOTAL_FILES 7
int main(int argc, char* argv[]) {
int i, nprocs, tprocs, me, index;
const int tag = 42; /* Tag value for communication */
MPI_Request recv_req[MAXPROC]; /* Request objects for non-blocking receive */
MPI_Request send_req[MAXPROC]; /* Request objects for non-blocking send */
MPI_Status status; /* Status object for non-blocking receive */
char myname[MPI_MAX_PROCESSOR_NAME]; /* Local host name string */
char hostname[MAXPROC][MPI_MAX_PROCESSOR_NAME]; /* Received host names */
int namelen;
MPI_Init(&argc, &argv); /* Initialize MPI */
MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* Get nr of processes */
MPI_Comm_rank(MPI_COMM_WORLD, &me); /* Get own identifier */
MPI_Get_processor_name(myname, &namelen); /* Get host name */
myname[namelen++] = (char)0; /* Terminating null byte */
/* First check that we have at least 2 and at most MAXPROC processes */
if (nprocs<2 || nprocs>MAXPROC) {
if (me == 0) {
printf("You have to use at least 2 and at most %d processes\n", MAXPROC);
}
MPI_Finalize(); exit(0);
}
/* if TOTAL_FILES < nprocs then use only TOTAL_FILES + 1 procs */
tprocs = (TOTAL_FILES < nprocs) ? TOTAL_FILES + 1 : nprocs;
int done = -1;
if (me == 0) { /* Process 0 does this */
int send_counter = 0, received_counter;
for (i=1; i<tprocs; i++) {
MPI_Isend(&send_counter, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &send_req[i]);
++send_counter;
/* Receive a message from all other processes */
MPI_Irecv (hostname[i], namelen, MPI_CHAR, MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &recv_req[i]);
}
for (received_counter = 0; received_counter < TOTAL_FILES; received_counter++){
/* Wait until at least one message has been received from any process other than 0*/
MPI_Waitany(tprocs-1, &recv_req[1], &index, &status);
if (index == MPI_UNDEFINED) perror("Errorrrrrrr");
printf("Received a message from process %d on %s\n", status.MPI_SOURCE, hostname[index+1]);
if (send_counter < TOTAL_FILES){ /* if there are still images left to process */
MPI_Isend(&send_counter, 1, MPI_INT, status.MPI_SOURCE, tag, MPI_COMM_WORLD, &send_req[status.MPI_SOURCE]);
++send_counter;
MPI_Irecv (hostname[status.MPI_SOURCE], namelen, MPI_CHAR, MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &recv_req[status.MPI_SOURCE]);
}
}
for (i=1; i<tprocs; i++) {
MPI_Isend(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &send_req[i]);
}
} else if (me < tprocs) { /* all other processes do this */
int y;
MPI_Recv(&y, 1, MPI_INT, 0,tag,MPI_COMM_WORLD,&status);
while (y != -1) {
printf("Process %d: Received image %d\n", me, y);
sleep(me%3+1); /* Let the processes sleep for 1-3 seconds */
/* Send own identifier back to process 0 */
MPI_Send (myname, namelen, MPI_CHAR, 0, tag, MPI_COMM_WORLD);
MPI_Recv(&y, 1, MPI_INT, 0,tag,MPI_COMM_WORLD,&status);
}
}
MPI_Finalize();
exit(0);
}
which is based on this example.
Right now I'm getting a segmentation fault and I'm not sure why. I'm fairly new to MPI, but I can't see a mistake in the code above. It only happens with certain numbers of processes: for example, with TOTAL_FILES = 7 it fails when run with 5, 6, or 7 processes, but works fine with 9 processes or more.
The entire code can be found here. Trying it with 6 processes causes the mentioned error.
To compile and execute :
mpicc -Wall sscce.c -o sscce -lm
mpirun -np 6 sscce

It's not MPI_Waitany that causes the segmentation fault; it's the way you handle the case when all requests in recv_req[] are completed (i.e. index == MPI_UNDEFINED). perror() does not stop the code, so execution continues and then segfaults in the printf statement while trying to access hostname[index+1]. The reason all requests in the array end up completed is that, because MPI_ANY_SOURCE is used in the receive call, the rank of the sender is not guaranteed to equal the index of the request in recv_req[] - simply compare index and status.MPI_SOURCE after MPI_Waitany returns to see it for yourself. Consequently the subsequent calls to MPI_Irecv will, with high probability, overwrite still-pending requests, and the number of requests that MPI_Waitany can complete ends up smaller than the number of results you actually expect.
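One way to repair the work loop, consistent with that diagnosis, is to re-post the receive into the slot that MPI_Waitany has just completed instead of into recv_req[status.MPI_SOURCE]. A sketch, using the question's own variables:
MPI_Waitany(tprocs-1, &recv_req[1], &index, &status);
if (index == MPI_UNDEFINED) {
    fprintf(stderr, "No active receive requests left\n");
    break;                          /* never touch hostname[index+1] here */
}
printf("Received a message from process %d on %s\n",
       status.MPI_SOURCE, hostname[index+1]);
if (send_counter < TOTAL_FILES) {
    MPI_Isend(&send_counter, 1, MPI_INT, status.MPI_SOURCE, tag,
              MPI_COMM_WORLD, &send_req[status.MPI_SOURCE]);
    ++send_counter;
    /* reuse the slot that just completed, not the one indexed by the sender's rank */
    MPI_Irecv(hostname[index+1], namelen, MPI_CHAR, MPI_ANY_SOURCE, tag,
              MPI_COMM_WORLD, &recv_req[index+1]);
}
This way each slot in recv_req[] holds at most one active request at a time.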
Also note that you never wait for the send requests to complete. You are lucky that the Open MPI implementation uses an eager protocol to send small messages, so those get sent even though MPI_Wait(any|all) or MPI_Test(any|all) is never called on the started send requests.
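A sketch of the missing completion calls, again with the question's variables (only the relevant lines are shown):
/* inside the work loop, before reusing a slot in send_req[] */
MPI_Wait(&send_req[status.MPI_SOURCE], MPI_STATUS_IGNORE);
MPI_Isend(&send_counter, 1, MPI_INT, status.MPI_SOURCE, tag,
          MPI_COMM_WORLD, &send_req[status.MPI_SOURCE]);

/* after the final loop that posts the "done" messages */
MPI_Waitall(tprocs-1, &send_req[1], MPI_STATUSES_IGNORE);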

Related

MPI_ERR_BUFFER from MPI_Bsend after removing following print statement?

I have the following code which works:
#include <mpi.h>
#include <stdio.h>
int main(int argc, char** argv) {
int world_rank, world_size;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
int n = 10000;
int ni, i;
double t[n];
int x[n];
int buf[n];
int buf_size = n*sizeof(int);
MPI_Buffer_attach(buf, buf_size);
if (world_rank == 0) {
for (ni = 0; ni < n; ++ni) {
int msg_size = ni;
int msg[msg_size];
for (i = 0; i < msg_size; ++i) {
msg[i] = rand();
}
double time0 = MPI_Wtime();
MPI_Bsend(&msg, msg_size, MPI_INT, 1, 0, MPI_COMM_WORLD);
t[ni] = MPI_Wtime() - time0;
x[ni] = msg_size;
MPI_Barrier(MPI_COMM_WORLD);
printf("P0 sent msg with size %d\n", msg_size);
}
}
else if (world_rank == 1) {
for (ni = 0; ni < n; ++ni) {
int msg_size = ni;
int msg[msg_size];
MPI_Request request;
MPI_Barrier(MPI_COMM_WORLD);
MPI_Irecv(&msg, msg_size, MPI_INT, 0, 0, MPI_COMM_WORLD, &request);
MPI_Wait(&request, MPI_STATUS_IGNORE);
printf("P1 received msg with size %d\n", msg_size);
}
}
MPI_Buffer_detach(&buf, &buf_size);
MPI_Finalize();
}
As soon as I remove the print statements, the program crashes with MPI_ERR_BUFFER: invalid buffer pointer. If I remove only one of the print statements, the other one is still executed, so I believe it crashes at the end of the program. I don't see why it crashes, and the fact that it does not crash when the print statements are present is beyond me...
Would anybody have a clue what is going on here?
You are simply not providing enough buffer space to MPI. In buffered mode, all outstanding messages are stored in the attached buffer space, which is used as a ring buffer. In your code, there can be multiple messages that need to be buffered, regardless of the printf. Note that not even 2*n*sizeof(int) would be enough buffer space - the barriers do not guarantee that the buffer is locally freed even though the corresponding receive has completed. You would have to provide (n*(n-1)/2)*sizeof(int) memory (plus MPI_BSEND_OVERHEAD per message) to be sure, or something in between and hope.
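For illustration, a sketch of what "to be sure" would mean for the attach call in the question (worst case: none of the n messages has been drained from the attach buffer yet):
/* room for payloads of 0, 1, ..., n-1 ints plus the per-message overhead */
int bsend_size = (n*(n-1)/2)*sizeof(int) + n*MPI_BSEND_OVERHEAD;
char *bsend_buf = malloc(bsend_size);      /* needs <stdlib.h> */
MPI_Buffer_attach(bsend_buf, bsend_size);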
Bottom line: Don't use buffered mode.
Generally, use standard blocking send calls and write the application such that it doesn't deadlock. Tune the MPI implementation such that it sends small messages eagerly regardless of the receiver - to avoid wait times on late receivers.
If you want to overlap communication and computation, use nonblocking messages - providing proper memory for each communication.
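As a sketch of that last point, the MPI_Bsend in the rank-0 loop of the question could become a non-blocking send that is completed before msg is reused, so no user-managed buffer is needed at all:
MPI_Request sreq;
double time0 = MPI_Wtime();
MPI_Isend(msg, msg_size, MPI_INT, 1, 0, MPI_COMM_WORLD, &sreq);
/* ... computation could overlap with the transfer here ... */
MPI_Wait(&sreq, MPI_STATUS_IGNORE);        /* msg may be reused only after this */
t[ni] = MPI_Wtime() - time0;
x[ni] = msg_size;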

How to modify MPI blocking send and receive to non-blocking

I am trying to understand the difference between blocking and non-blocking message passing mechanisms in parallel processing using MPI. Suppose we have the following blocking code:
#include <stdio.h>
#include <string.h>
#include "mpi.h"
int main (int argc, char* argv[]) {
const int maximum_message_length = 100;
const int rank_0= 0;
char message[maximum_message_length+1];
MPI_Status status; /* Info about receive status */
int my_rank; /* This process ID */
int num_procs; /* Number of processes in run */
int source; /* Process ID to receive from */
int destination; /* Process ID to send to */
int tag = 0; /* Message ID */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
/* clients processes */
if (my_rank != rank_0) {
sprintf(message, "Hello world from process# %d", my_rank);
MPI_Send(message, strlen(message) + 1, MPI_CHAR, rank_0, tag, MPI_COMM_WORLD);
} else {
/* rank 0 process */
for (source = 0; source < num_procs; source++) {
if (source != rank_0) {
MPI_Recv(message, maximum_message_length + 1, MPI_CHAR, source, tag,
MPI_COMM_WORLD,&status);
fprintf(stderr, "%s\n", message);
}
}
}
MPI_Finalize();
}
Each processor executes its task and sends it back to rank_0 (the receiver). rank_0 runs a loop over the n-1 other processes and prints the results sequentially (step i of the loop may not proceed if the current client hasn't sent its task yet). How do I modify this code to achieve the non-blocking mechanism using MPI_Isend and MPI_Irecv? Do I need to remove the loop in the receiver part (rank_0) and explicitly state MPI_Irecv(..) for each client, i.e.
MPI_Irecv(message, maximum_message_length + 1, MPI_CHAR, source, tag,
MPI_COMM_WORLD, &request);
Thank you.
What you do with non-blocking communication is post the communication and then immediately proceed with your program to do other stuff, which again might be posting more communication. In particular, you can post all receives at once and wait for them to complete only later on.
This is what you would typically do in your scenario here.
Note, however, that this specific setup is a bad example, as it basically just reimplements an MPI_Gather!
Here is how you typically would go about the non-blocking communication in your setup. First, you need some storage for all the messages to end up in, and also a list of request handles to keep track of the non-blocking communication requests, thus your first part of the code needs to be changed accordingly:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "mpi.h"
int main (int argc, char* argv[]) {
const int maximum_message_length = 100;
const int server_rank = 0;
char message[maximum_message_length+1];
char *allmessages;
MPI_Status *status; /* Info about receive status */
MPI_Request *req; /* Non-Blocking Requests */
int my_rank; /* This process ID */
int num_procs; /* Number of processes in run */
int source; /* Process ID to receive from */
int tag = 0; /* Message ID */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
/* clients processes */
if (my_rank != server_rank) {
sprintf(message, "Hello world from process# %d", my_rank);
MPI_Send(message, maximum_message_length + 1, MPI_CHAR, server_rank,
tag, MPI_COMM_WORLD);
} else {
No need for non-blocking sends here. Now we go on and receive all these messages on server_rank. We need to loop over all of them and store a request handle for each of them:
/* rank 0 process */
allmessages = malloc((maximum_message_length+1)*num_procs);
status = malloc(sizeof(MPI_Status)*num_procs);
req = malloc(sizeof(MPI_Request)*num_procs);
for (source = 0; source < num_procs; source++) {
req[source] = MPI_REQUEST_NULL;
if (source != server_rank) {
/* Post non-blocking receive for source */
MPI_Irecv(allmessages+(source*(maximum_message_length+1)),
maximum_message_length + 1, MPI_CHAR, source, tag,
MPI_COMM_WORLD, req+source);
/* Proceed without waiting on the receive */
/* (posting further receives */
}
}
/* Wait on all communications to complete */
MPI_Waitall(num_procs, req, status);
/* Print the messages in order to the screen */
for (source = 0; source < num_procs; source++) {
if (source != server_rank) {
fprintf(stderr, "%s\n",
allmessages+(source*(maximum_message_length+1)));
}
}
}
MPI_Finalize();
}
After posting the non-blocking receives, we need to wait for all of them to complete in order to print the messages in the correct order. To do this, an MPI_Waitall is used, which blocks until all request handles are satisfied. Note that I include the server_rank here for simplicity, but set its request to MPI_REQUEST_NULL initially, so it will be ignored.
If you do not care about the order, you could process the communications as soon as they become available, by looping over the requests and employing MPI_Waitany. That would return as soon as any communication is completed and you could act on the corresponding data.
With MPI_Gather that code would look like this:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "mpi.h"
int main (int argc, char* argv[]) {
const int maximum_message_length = 100;
const int server_rank = 0;
char message[maximum_message_length+1];
char *allmessages;
int my_rank; /* This process ID */
int num_procs; /* Number of processes in run */
int source; /* Process ID to receive from */
int tag = 0; /* Message ID */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
if (my_rank == server_rank) {
allmessages = malloc((maximum_message_length+1)*num_procs);
}
sprintf(message, "Hello world from process# %d", my_rank);
MPI_Gather(message, (maximum_message_length+1), MPI_CHAR,
allmessages, (maximum_message_length+1), MPI_CHAR,
server_rank, MPI_COMM_WORLD);
if (my_rank == server_rank) {
/* Print the messages in order to the screen */
for (source = 0; source < num_procs; source++) {
if (source != server_rank) {
fprintf(stderr, "%s\n",
allmessages+(source*(maximum_message_length+1)));
}
}
}
MPI_Finalize();
}
And with MPI-3 you can even use a non-blocking MPI_Igather.
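A sketch of that variant, reusing the buffers from the MPI_Gather version above (the gathered data may only be read after the wait):
MPI_Request greq;
MPI_Igather(message, (maximum_message_length+1), MPI_CHAR,
            allmessages, (maximum_message_length+1), MPI_CHAR,
            server_rank, MPI_COMM_WORLD, &greq);
/* ... unrelated work can be done here ... */
MPI_Wait(&greq, MPI_STATUS_IGNORE);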
If you don't care about the ordering, the last part (starting with MPI_Waitall) could be done with MPI_Waitany like this:
for (i = 0; i < num_procs-1; i++) {
/* Wait on any next communication to complete */
MPI_Waitany(num_procs, req, &source, status);
fprintf(stderr, "%s\n",
allmessages+(source*(maximum_message_length+1)));
}

MPI: What to do when number of expected MPI_Recv is unkown

I've got many slave nodes which might or might not send messages to the master node, so currently there's no way for the master node to know how many MPI_Recv calls to expect. The slave nodes have to send the minimum number of messages to the master node for efficiency reasons.
I managed to find a cool trick, which sends an additional "done" message when a node is no longer expecting to send any messages. Unfortunately, it doesn't seem to work in my case, where there is a variable number of senders. Any idea on how to go about this? Thanks!
if(rank == 0){ //MASTER NODE
while (1) {
MPI_Recv(&buffer, 10, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == DONE) break;
/* Do stuff */
}
}else{ //MANY SLAVE NODES
if(some conditions){
MPI_Send(&buffer, 64, MPI_INT, root, 1, MPI_COMM_WORLD);
}
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Send(NULL, 1, MPI_INT, root, DONE, MPI_COMM_WORLD);
This is not working; the program seems to still be waiting on an MPI_Recv.
A simpler and more elegant option would be to use the MPI_IBARRIER. Have each worker call all of the sends that it needs to and then call MPI_IBARRIER when it's done. On the master, you can loop on both an MPI_IRECV on MPI_ANY_SOURCE and an MPI_IBARRIER. When the MPI_IBARRIER is done, you know that everyone has finished and you can cancel the MPI_IRECV and move on. The pseudocode would look something like this:
if (master) {
/* Start the barrier. Each process will join when it's done. */
MPI_Ibarrier(MPI_COMM_WORLD, &requests[0]);
do {
/* Do the work */
MPI_Irecv(..., MPI_ANY_SOURCE, &requests[1]);
/* If the index that finished is 1, we received a message.
* Otherwise, we finished the barrier and we're done. */
MPI_Waitany(2, requests, &index, MPI_STATUS_IGNORE);
} while (index == 1);
/* If we're done, we should cancel the receive request and move on. */
MPI_Cancel(&requests[1]);
} else {
/* Keep sending work back to the master until we're done. */
while( ...work is to be done... ) {
MPI_Send(...);
}
/* When we finish, join the Ibarrier. Note that
* you can't use an MPI_Barrier here because it
* has to match with the MPI_Ibarrier above. */
MPI_Ibarrier(MPI_COMM_WORLD, &request);
MPI_Wait(&request, MPI_STATUS_IGNORE);
}
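For completeness, here is a compilable sketch of that pattern. It assumes each worker sends exactly one int result with tag 1, uses MPI_Ssend so a worker only joins the barrier once its message has been matched by the master, and requires an MPI-3 implementation:
#include <mpi.h>
#include <stdio.h>
int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (rank == 0) {
        MPI_Request requests[2];
        MPI_Status st;
        int buffer, index, cancelled;
        MPI_Ibarrier(MPI_COMM_WORLD, &requests[0]);
        MPI_Irecv(&buffer, 1, MPI_INT, MPI_ANY_SOURCE, 1, MPI_COMM_WORLD, &requests[1]);
        for (;;) {
            MPI_Waitany(2, requests, &index, &st);
            if (index != 1)        /* the barrier finished: all workers are done */
                break;
            printf("master received %d from rank %d\n", buffer, st.MPI_SOURCE);
            MPI_Irecv(&buffer, 1, MPI_INT, MPI_ANY_SOURCE, 1, MPI_COMM_WORLD, &requests[1]);
        }
        /* cancel the dangling receive; if a last message slipped in, keep it */
        MPI_Cancel(&requests[1]);
        MPI_Wait(&requests[1], &st);
        MPI_Test_cancelled(&st, &cancelled);
        if (!cancelled)
            printf("master received %d from rank %d\n", buffer, st.MPI_SOURCE);
    } else {
        MPI_Request req;
        int result = 100*rank;     /* stand-in for real work */
        MPI_Ssend(&result, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
        MPI_Ibarrier(MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    }
    MPI_Finalize();
    return 0;
}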
1- You called MPI_Barrier in the wrong place; it should be called after MPI_Send.
2- The root will exit the loop when it has received DONE from all other ranks (size - 1).
The code after some modifications:
#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>
int main(int argc, char** argv)
{
MPI_Init(NULL, NULL);
int size;
MPI_Comm_size(MPI_COMM_WORLD, &size);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Status status;
int DONE = 888;
int buffer = 77;
int root = 0 ;
printf("here is rank %d with size=%d\n" , rank , size);fflush(stdout);
int num_of_DONE = 0 ;
if(rank == 0){ //MASTER NODE
while (1) {
MPI_Recv(&buffer, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
printf("root recev %d from %d with tag = %d\n" , buffer , status.MPI_SOURCE , status.MPI_TAG );fflush(stdout);
if (status.MPI_TAG == DONE)
num_of_DONE++;
printf("num_of_DONE=%d\n" , num_of_DONE);fflush(stdout);
if(num_of_DONE == size -1)
break;
/* Do stuff */
}
}else{ //MANY SLAVE NODES
if(1){
buffer = 66;
MPI_Send(&buffer, 1, MPI_INT, root, 1, MPI_COMM_WORLD);
printf("rank %d sent data.\n" , rank);fflush(stdout);
}
}
if(rank != 0)
{
buffer = 55;
MPI_Send(&buffer, 1, MPI_INT, root, DONE, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
printf("rank %d done.\n" , rank);fflush(stdout);
MPI_Finalize();
return 0;
}
output:
hosam#hosamPPc:~/Desktop$ mpicc -o aa aa.c
hosam#hosamPPc:~/Desktop$ mpirun -n 3 ./aa
here is rank 2 with size=3
here is rank 0 with size=3
rank 2 sent data.
here is rank 1 with size=3
rank 1 sent data.
root recev 66 from 1 with tag = 1
num_of_DONE=0
root recev 66 from 2 with tag = 1
num_of_DONE=0
root recev 55 from 2 with tag = 888
num_of_DONE=1
root recev 55 from 1 with tag = 888
num_of_DONE=2
rank 0 done.
rank 1 done.
rank 2 done.

Search/wait for any transmission in MS-MPI

It's a master - slave situation. How can I make the master process search, in a non-blocking way, for a message transmitted to it? If at the point of searching no message has been transmitted to the master, it will continue with its iterations. However, if there is a message transmitted to it, it will process the message and then continue with the iterations. See the comment inside /* */
int main(int argc, char *argv[])
{
int numprocs,
rank;
MPI_Request request;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(rank == 0) // the searching process
{
for (int i=0; i < 4000000; i++)
{
// do some stuff here; does not matter what
/* see if any message has been transmitted to me at this point
without blocking the process; if at this time it happens to be
transmitted, do something and then continue with the for iterations;
or just continue with for iterations and maybe next time will
have a message which sends me to do something */
}
}
else
{
int flag = 1;
while(flag)
{
// something done that at some point changes flag
}
// send a message to process with rank 0 and don't get stuck here
MPI_Isend(12, 1, MPI_INT, 0, 100, MPI_COMM_WORLD, &request);
// some other stuff done
// wait for message to be transmitted
MPI_Wait(&request, &status);
}
MPI_Finalize();
return 0;
}
One solution is to use MPI_Iprobe() to test if a message is waiting.
On this line, use a pointer instead of "12"
MPI_Isend(12, 1, MPI_INT, 0, 100, MPI_COMM_WORLD, &request);
Add flag=0 here :
while(flag!=0)
{
// something done that at some point changes flag
}
Here goes the code :
#include "mpi.h"
#include "stdio.h"
int main(int argc, char *argv[])
{
int numprocs,
rank;
MPI_Request request;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(rank == 0) // the searching process
{
int i;
for (i=0; i < 4000000; i++)
{
// do some stuff here; does not matter what
//printf("I am still running!\n");
int flag;
MPI_Iprobe(MPI_ANY_SOURCE,100,MPI_COMM_WORLD,&flag,&status);
if(flag!=0){
int value;
MPI_Recv(&value, 1, MPI_INT, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
printf("I (0) received %d \n",value);
}
/* see if any message has been transmitted to me at this point
without blocking the process; if at this time it happens to be
transmitted, do something and then continue with the for iterations;
or just continue with for iterations and maybe next time will
have a message which sends me to do something */
}
}
else
{
int i;
for(i=0;i<42;i++){
int flag = 1;
while(flag!=0)
{
// something done that at some point changes flag
flag=0;
}
int bla=1000*rank+i;
// send a message to process with rank 0 and don't get stuck here
MPI_Isend(&bla, 1, MPI_INT, 0, 100, MPI_COMM_WORLD, &request);
// some other stuff done
printf("I (%d) do something\n",rank);
// wait for message to be transmitted
MPI_Wait(&request, &status);
}
}
MPI_Finalize();
return 0;
}
Bye,
Francis
The non-blocking test for available messages is done using the MPI_Iprobe call. In your case it would look like:
int available;
MPI_Status status;
if(rank == 0) // the searching process
{
for (int i=0; i < 4000000; i++)
{
// do some stuff here; does not matter what
/* see if any message has been transmitted to me at this point
without blocking the process; if at this time it happens to be
transmitted, do something and then continue with the for iterations;
or just continue with for iterations and maybe next time will
have a message which sends me to do something */
// Tag value 100 matches the value used in the send operation
MPI_Iprobe(MPI_ANY_SOURCE, 100, MPI_COMM_WORLD, &available, &status);
if (available)
{
// Message source rank is now available in status.MPI_SOURCE
// Receive the message
MPI_Recv(..., status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, &status);
}
}
}
MPI_ANY_SOURCE is used as a wildcard rank, i.e. it instructs MPI_Iprobe to check for messages from any source. If a corresponding send was posted, then available will be set to true; otherwise it will be set to false. The actual source of the message is also written to the MPI_SOURCE field of the status object. If the available flag indicates the availability of a matching message, one should then post a receive operation in order to receive it. It is important that the rank and the tag are explicitly specified in the receive operation, otherwise a different message could get received instead.
You could also use persistent communication requests. These behave very much like non-blocking operations, with the important difference that they can be restarted multiple times. The same code with a persistent receive request would look like:
if(rank == 0) // the searching process
{
MPI_Request req;
MPI_Status status;
int completed;
// Prepare the persistent connection request
MPI_Recv_init(buffer, buf_size, buf_type,
MPI_ANY_SOURCE, 100, MPI_COMM_WORLD, &req);
// Make the request active
MPI_Start(&req);
for (int i=0; i < 4000000; i++)
{
// do some stuff here; does not matter what
/* see if any message has been transmitted to me at this point
without blocking the process; if at this time it happens to be
transmitted, do something and then continue with the for iterations;
or just continue with for iterations and maybe next time will
have a message which sends me to do something */
// Non-blocking Test for request completion
MPI_Test(&req, &completed, &status);
if (completed)
{
// Message is now in buffer
// Process the message
// ...
// Activate the request again
MPI_Start(&req);
}
}
// Cancel and free the request
MPI_Cancel(&req);
MPI_Request_free(&req);
}
Persistent operations have a slight performance edge over the non-persistent ones, shown in the previous code sample. It is important that buffer is not accessed while the request is active, i.e. after the call to MPI_Start and before MPI_Test signals completion. Persistent send/receive operations also match non-persistent receive/send operations, therefore it is not necessary to change the code of the workers and they can still use MPI_Isend.

mpi collective operations from one communicator to another

I have an application that is parallelized with MPI and is split into a number of different tasks. Each processor is assigned only one task, and the group of processors which is assigned the same task is assigned its own communicator. Periodically, the tasks need to synchronize. Currently, the synchronization is done via MPI_COMM_WORLD, but that has the drawback that no collective operations can be used, since it is not guaranteed that other tasks will ever reach that block of code.
As a more concrete example:
task1: equation1_solver, N nodes, communicator: mpi_comm_solver1
task2: equation2_solver, M nodes, communicator: mpi_comm_solver2
task3: file IO , 1 node , communicator: mpi_comm_io
I would like to MPI_SUM an array on task1 and have the result appear at task3. Is there an efficient way to do this? (my apologies if this is a stupid question, I don't have much experience with creating and using custom MPI communicators)
Charles is exactly right; the intercommunicators allow you to talk between communicators (or, to distinguish "normal" communicators in this context, "intra-communicators", which doesn't strike me as much of an improvement).
I've always found the use of these intercommunicators a little confusing for those new to it. Not the basic ideas, which make sense, but the mechanics of using (say) MPI_Reduce with one of these. The group of tasks doing the reduction specifies the root rank on the remote communicator, so far so good; but within the remote communicator, everyone who is not the root specifies MPI_PROC_NULL as root, whereas the actual root specifies MPI_ROOT. The things one does for backward compatibility, hey?
#include <mpi.h>
#include <stdio.h>
int main(int argc, char **argv)
{
int commnum = 0; /* which of the 3 comms I belong to */
MPI_Comm mycomm; /* Communicator I belong to */
MPI_Comm intercomm; /* inter-communicator */
int cw_rank, cw_size; /* size, rank in MPI_COMM_WORLD */
int rank; /* rank in local communicator */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &cw_rank);
MPI_Comm_size(MPI_COMM_WORLD, &cw_size);
if (cw_rank == cw_size-1) /* last task is IO task */
commnum = 2;
else {
if (cw_rank < (cw_size-1)/2)
commnum = 0;
else
commnum = 1;
}
printf("Rank %d in comm %d\n", cw_rank, commnum);
/* create the local communicator, mycomm */
MPI_Comm_split(MPI_COMM_WORLD, commnum, cw_rank, &mycomm);
const int lldr_tag = 1;
const int intercomm_tag = 2;
if (commnum == 0) {
/* comm 0 needs to communicate with comm 2. */
/* create an intercommunicator: */
/* rank 0 in our new communicator will be the "local leader"
* of this commuicator for the purpose of the intercommuniator */
int local_leader = 0;
/* Now, since we're not part of the other communicator (and vice
* versa) we have to refer to the "remote leader" in terms of its
* rank in COMM_WORLD. For us, that's easy; the remote leader
* in the IO comm is defined to be cw_size-1, because that's the
* only task in that comm. But for them, it's harder. So we'll
* send that task the id of our local leader. */
/* find out which rank in COMM_WORLD is the local leader */
MPI_Comm_rank(mycomm, &rank);
if (rank == 0)
MPI_Send(&cw_rank, 1, MPI_INT, cw_size-1, 1, MPI_COMM_WORLD);
/* now create the inter-communicator */
MPI_Intercomm_create( mycomm, local_leader,
MPI_COMM_WORLD, cw_size-1,
intercomm_tag, &intercomm);
}
else if (commnum == 2)
{
/* there's only one task in this comm */
MPI_Comm_rank(mycomm, &rank); /* rank was not yet set in this branch; it is 0 here */
int local_leader = 0;
int rmt_ldr;
MPI_Status s;
MPI_Recv(&rmt_ldr, 1, MPI_INT, MPI_ANY_SOURCE, lldr_tag, MPI_COMM_WORLD, &s);
MPI_Intercomm_create( mycomm, local_leader,
MPI_COMM_WORLD, rmt_ldr,
intercomm_tag, &intercomm);
}
/* now let's play with our communicators and make sure they work */
if (commnum == 0) {
int max_of_ranks = 0;
/* try it internally; */
MPI_Reduce(&rank, &max_of_ranks, 1, MPI_INT, MPI_MAX, 0, mycomm);
if (rank == 0) {
printf("Within comm 0: maximum of ranks is %d\n", max_of_ranks);
printf("Within comm 0: sum of ranks should be %d\n", max_of_ranks*(max_of_ranks+1)/2);
}
/* now try summing it to the other comm */
/* the "root" parameter here is the root in the remote group */
MPI_Reduce(&rank, &max_of_ranks, 1, MPI_INT, MPI_SUM, 0, intercomm);
}
if (commnum == 2) {
int sum_of_ranks = -999;
int rootproc;
/* get reduction data from other comm */
if (rank == 0) /* am I the root of this reduce? */
rootproc = MPI_ROOT;
else
rootproc = MPI_PROC_NULL;
MPI_Reduce(&rank, &sum_of_ranks, 1, MPI_INT, MPI_SUM, rootproc, intercomm);
if (rank == 0)
printf("From comm 2: sum of ranks is %d\n", sum_of_ranks);
}
if (commnum == 0 || commnum == 2)
MPI_Comm_free(&intercomm);
MPI_Finalize();
}
All you need is to create a new communicator that includes nodes from both of the tasks you want to communicate between. Take a look at MPI groups and communicators. You can find many examples on the net, here for instance.
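A minimal sketch of that approach, reusing the split from the code above (task1 = world ranks 0..(cw_size-1)/2-1, the IO task = world rank cw_size-1; this fragment would go after the MPI_Comm_split and must be executed by every rank in MPI_COMM_WORLD because MPI_Comm_create is collective):
MPI_Group world_group, bridge_group;
MPI_Comm bridge_comm;
int n_solver1 = (cw_size-1)/2;               /* size of task1, as in the split above */
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
/* task1 ranks 0..n_solver1-1 plus the IO rank cw_size-1, given as range triplets */
int ranges[2][3] = { {0, n_solver1-1, 1}, {cw_size-1, cw_size-1, 1} };
MPI_Group_range_incl(world_group, 2, ranges, &bridge_group);
MPI_Comm_create(MPI_COMM_WORLD, bridge_group, &bridge_comm);   /* others get MPI_COMM_NULL */
if (bridge_comm != MPI_COMM_NULL) {
    int my_val = (cw_rank == cw_size-1) ? 0 : 1;   /* IO rank contributes nothing */
    int sum = 0;
    /* group rank n_solver1 is the IO task, so the result lands there */
    MPI_Reduce(&my_val, &sum, 1, MPI_INT, MPI_SUM, n_solver1, bridge_comm);
    MPI_Comm_free(&bridge_comm);
}
MPI_Group_free(&bridge_group);
MPI_Group_free(&world_group);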
