I am attempting to write a parallel program that merge sorts two arrays exchanged between separate processes. In this simplified version, where I am just trying to get the communication working, I want to send a single array (four unsigned integers) from process 0 to process 1, then print both the local and received arrays in process 1. Here is the code. (load_and_distribute simply fills the arrays, and I have checked that both processes do indeed hold four unsigned integers.)
int
main(int argc, char ** argv)
{
int ret;
unsigned int ln, tn;
unsigned int * lvals;
int rank, size;
ret = MPI_Init(&argc, &argv);
assert(MPI_SUCCESS == ret);
/* get information about MPI environment */
ret = MPI_Comm_size(MPI_COMM_WORLD, &size);
assert(MPI_SUCCESS == ret);
ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
assert(MPI_SUCCESS == ret);
load_and_distribute(argv[1], &ln, &lvals);
unsigned int rn;
unsigned int * rvals;
rvals = malloc(4*sizeof(*rvals));
if(rank == 0){
MPI_Send(&lvals, 4, MPI_UNSIGNED, 1, 0, MPI_COMM_WORLD);
}
else if (rank == 1){
rvals[0] = 4;
MPI_Recv(&rvals, 4, MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("lvals = %d %d %d %d\n",lvals[0],lvals[1],lvals[2],lvals[3]);
printf("rvals = %d %d %d %d\n",rvals[0],rvals[1],rvals[2],rvals[3]);
}
ret = MPI_Finalize();
assert(MPI_SUCCESS == ret);
return EXIT_SUCCESS;
}
The send and receive seem to go through without complaint, but when the program attempts to print the rvals values, I get the output below, and I am unsure why.
[hpc5:04355] *** Process received signal ***
[hpc5:04355] Signal: Segmentation fault (11)
[hpc5:04355] Signal code: Address not mapped (1)
[hpc5:04355] Failing at address: 0xe0c4ac
[hpc5:04355] [ 0] /lib64/libpthread.so.0(+0xf370)[0x7f2a8d23c370]
[hpc5:04355] [ 1] ./hms_mpi[0x40165d]
[hpc5:04355] [ 2] /lib64/libc.so.6(__libc_start_main+0xf5)[0x7f2a8ce8db35]
[hpc5:04355] [ 3] ./hms_mpi[0x400c29]
[hpc5:04355] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 4355 on node hpc5 exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
The correct buffers for MPI_Send() and MPI_Recv() are lvals and rvals (i.e. do not take their addresses with the & operator).
Remove the & in your MPI_Send and MPI_Recv calls:
MPI_Send(lvals, 4, MPI_UNSIGNED, 1, 0, MPI_COMM_WORLD);
MPI_Recv(rvals, 4, MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Send is declared like this:
int MPI_Send(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm)
buf: initial address of send buffer (choice)
lvals is already that initial address; &lvals is the address of the pointer variable itself.
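Putting it together, here is a minimal sketch of the corrected exchange, assuming lvals and rvals are allocated and filled exactly as in the question:

if (rank == 0) {
    /* lvals already points at the four values, so pass it directly */
    MPI_Send(lvals, 4, MPI_UNSIGNED, 1, 0, MPI_COMM_WORLD);
} else if (rank == 1) {
    /* likewise, rvals already holds the address of the receive buffer */
    MPI_Recv(rvals, 4, MPI_UNSIGNED, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    printf("rvals = %u %u %u %u\n", rvals[0], rvals[1], rvals[2], rvals[3]);
}

In the original code, MPI_Recv(&rvals, ...) writes the incoming 16 bytes over the rvals pointer itself, so the later rvals[0] dereferences a clobbered pointer, which is the "Address not mapped" segfault in the output.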
In MPI (MPICH) I am trying to use one-sided communication windows. I have a 3D grid topology and an additional communicator i_comm.
MPI_Comm cartcomm;
int periods[3]={1,1,1}, reorder=0, coords[3];
int dims[3]={mesh, mesh, mesh}; //mesh is the size of each dimension
MPI_Dims_create(size, 3, dims);
MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods,reorder, &cartcomm);
MPI_Cart_coords(cartcomm, my_rank, 3, coords);
MPI_Comm i_comm;
int i_remain_dims[3] = {false, true, false};
MPI_Cart_sub(cartcomm, i_remain_dims, &i_comm);
int i_rank;
MPI_Comm_rank(i_comm, &i_rank);
MPI_Win win_PB;
int * PA = (int *) malloc(r*r*sizeof(int)); //r is input size
int * PB = (int *) malloc(r*r*sizeof(int));
/* arrays are initialized*/
Then I create the window and afterwards try to use the get function:
if(i_rank == 0){
MPI_Win_create(PB, r*r*sizeof(int), sizeof(int), MPI_INFO_NULL, i_comm, &win_PB);
}
else{
MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, i_comm, &win_PB);
}
MPI_Win_fence(0, win_PB);
if(i_rank != 0){
MPI_Get(PB, r*r*sizeof(int), MPI_INT, 0, 0, r*r*sizeof(int), MPI_INT, win_PB);
}
MPI_Win_fence(0, win_PB);
With this code I get a long output of errors:
[ana:24006] *** Process received signal ***
[ana:24006] Signal: Segmentation fault (11)
[ana:24006] Signal code: Address not mapped (1)
[ana:24006] Failing at address: 0xa8
Also, without using MPI_Win_fence, the get call fails with MPI_ERR_RMA_SYNC: error executing rma sync, which I am not sure is normal.
What I observed is that if I allocate the arrays in the reverse order, then it works fine:
int * PB = (int *) malloc(r*r*sizeof(int));
int * PA = (int *) malloc(r*r*sizeof(int));
The problem is that I will need to create another communicator and another window for the PA buffer, so just switching the order of those lines does not help in the end.
I would highly appreciate any help to figure out what I am doing wrong.
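One thing worth double-checking: MPI_Get takes its origin and target counts in elements, not bytes, and the target displacement is measured in multiples of the disp_unit passed to MPI_Win_create, so r*r*sizeof(int) in the call above would already read far past both the local buffer and the exposed window. A minimal sketch of the fence/get/fence epoch using element counts, with the same variable names as in the code above:

MPI_Win_fence(0, win_PB);
if (i_rank != 0) {
    /* read r*r ints from rank 0's window, starting at displacement 0 */
    MPI_Get(PB, r*r, MPI_INT, 0, 0, r*r, MPI_INT, win_PB);
}
MPI_Win_fence(0, win_PB);

The fences are also required: with this synchronization model MPI_Get is only valid inside an access epoch opened and closed by MPI_Win_fence, which is why removing them produces MPI_ERR_RMA_SYNC.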
I have the following code written in C with MPI:
#include <mpi.h>
#include <stdio.h>
int main(int argc, char *argv[])
{
int size, rank;
MPI_Status status;
int buf[1000];
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) {
int i = 0;
while (i != 1000) {
buf[i] = i;
i++;
}
MPI_Send(buf, 999, MPI_INT, 1, 1, MPI_COMM_WORLD);
printf("msg has been sent\n");
}
if (rank == 1) {
int sz = sizeof(buf);
int lst = buf[sz-1];
MPI_Recv(buf, 999, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
printf("la taille du buf %d et dernier %d", sz, lst);
}
MPI_Finalize();
}
And after running it, it gives this message:
msg has been sent
[blitzkrieg-TravelMate-P253:03395] *** Process received signal ***
[blitzkrieg-TravelMate-P253:03395] Signal: Segmentation fault (11)
[blitzkrieg-TravelMate-P253:03395] Signal code: Address not mapped (1)
[blitzkrieg-TravelMate-P253:03395] Failing at address: 0xbfee8574
[blitzkrieg-TravelMate-P253:03395] [0] [0xb772d40c]
[blitzkrieg-TravelMate-P253:03395] [1] mpii(main+0x12f) [0x8048883]
[blitzkrieg-TravelMate-P253:03395] [2] /lib/i386-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0xb74c84d3]
[blitzkrieg-TravelMate-P253:03395] [3] mpii() [0x80486c1]
[blitzkrieg-TravelMate-P253:03395] *** End of error message ***
mpirun noticed that process rank 1 with PID 3395 on node blitzkrieg-TravelMate-P253 exited on signal 11 (Segmentation fault).
Any suggestion will help, thanks.
The stack trace shows that the error is not in the MPI_Recv as the question title suggests. The error is actually here:
int sz = sizeof(buf);
int lst = buf[sz-1]; // <---- here
Since buf is an array of int and sizeof(buf) returns its size in bytes, sz is set to 4 times the number of elements in the array. Accessing buf[sz-1] goes way beyond the bounds of buf and into an unmapped memory region above the stack of the process.
You should divide the total size of the array by the size of one of its elements, e.g. the first one:
int sz = sizeof(buf) / sizeof(buf[0]);
int lst = buf[sz-1];
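As a small related sketch: since only 999 of the 1000 elements are sent, the receiver can also ask MPI how many elements actually arrived by calling MPI_Get_count on the returned status, instead of reasoning about the buffer size at all:

MPI_Status status;
int received;

MPI_Recv(buf, 999, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
MPI_Get_count(&status, MPI_INT, &received);   /* number of MPI_INT elements delivered */

/* the last element that actually holds received data is buf[received - 1] */
printf("received %d ints, last one is %d\n", received, buf[received - 1]);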
EDIT: There is no problem with this code in particular. I created a reduced version of my code and this part works perfectly. I still don't understand why it is not working in my full code, since I have everything commented out except this part, but that is probably too specific. Sorry, wrong question.
(I have edited the question and added the error I get at the bottom.)
I'm trying to parallelize a C program.
I'm encountering errors when I try to pass an array allocated with malloc from the master process to the rest of the processes. Or rather, when I try to receive it.
This is the piece of code I'm having trouble with:
if (rank == 0)
{
int *data=(int *) malloc(size*sizeof(int));
int error_code = MPI_Send(data, size, MPI_INT, 1, 1, MPI_COMM_WORLD);
if (error_code != MPI_SUCCESS) {
char error_string[BUFSIZ];
int length_of_error_string;
MPI_Error_string(error_code, error_string, &length_of_error_string);
printf("%3d: %s\n", rank, error_string);
}
printf("Data sent.");
}
else if (rank == 1)
{
int *data=(int *) malloc(size*sizeof(int));
int error_code = MPI_Recv(data, size, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
if (error_code != MPI_SUCCESS) {
char error_string[BUFSIZ];
int length_of_error_string;
MPI_Error_string(error_code, error_string, &length_of_error_string);
printf("%3d: %s\n", rank, error_string);
}
printf("Received.");
}
"Data sent." is printed, followed by a segmentation fault (with memory dump) caused by the second process and "Received" is never printed.
I think I'm not receiving well the data. But I tried several possibilities, I think I have to pass the address of the variable and not just the pointer to the first position, so I thought this was the correct way, but it is not working.
From the error codes nothing gets printed.
Does anyone know what's causing the error and what was my mistake?
Thanks!
EDIT:
This is the exact error:
*** Process received signal ***
Signal: Segmentation fault (11)
Signal code: Address not mapped (1)
*** End of error message ***
EDIT 2:
This code works:
int main(int argc, char* argv[])
{
int size_x = 12;
int size_y = 12;
int rank, size, length;
char nodename[BUFSIZ];
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&size);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Get_processor_name(nodename, &length);
MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
if (rank == 0)
{
int *data=malloc(size*sizeof(int));
int error_code = MPI_Send(data, size, MPI_INT, 1, 1, MPI_COMM_WORLD);
if (error_code != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string;
MPI_Error_string(error_code, error_string, &length_of_error_string);
printf("%3d: %s\n", rank, error_string);
}
printf("Data sent.");
}
else if (rank > 0)
{
int *data=malloc(size*sizeof(int));
int error_code = MPI_Recv(data, size, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
if (error_code != MPI_SUCCESS)
{
char error_string[BUFSIZ];
int length_of_error_string;
MPI_Error_string(error_code, error_string, &length_of_error_string);
printf("%3d: %s\n", rank, error_string);
}
printf("Received.");
}
MPI_Finalize();
return 0;
}
I found the problem: it was not the MPI calls. There was a problem in an earlier function (I forgot an argument in a printf) that I hadn't noticed, and that broke the whole code. Tricky MPI...
EDIT: My question is similar to C, Open MPI: segmentation fault from call to MPI_Finalize(). The segfault does not always happen, especially with low numbers of processes, so if you answer that one instead, that would be great; either way . . .
I was hoping to get some help debugging the following code:
int main(){
long* my_local;
long n, s, f;
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
if(my_rank == 0){
/* Get size n from user */
printf("Total processes: %d\n", comm_sz);
printf("Number of keys to be sorted? ");
fflush(stdout);
scanf("%ld", &n);
/* Broadcast size n to other processes */
MPI_Bcast(&n, 1, MPI_LONG, 0, MPI_COMM_WORLD);
/* Create n/comm_sz keys
NOTE! some processes will have 1 extra key if
n%comm_sz != 0 */
create_Keys(&my_local, my_rank, comm_sz, n, &s, &f);
}
if(my_rank != 0){
/* Receive n from process 0 */
MPI_Bcast(&n, 1, MPI_LONG, 0, MPI_COMM_WORLD);
/* Create n/comm_sz keys */
create_Keys(&my_local, my_rank, comm_sz, n, &s, &f);
}
/* The offending function, f is a long set to num elements of my_local*/
Odd_Even_Tsort(&my_local, my_rank, f, comm_sz);
printf("Process %d completed the function", my_rank);
MPI_Finalize();
return 0;
}
void Odd_Even_Tsort(long** my_local, int my_rank, long my_size, int comm_sz)
{
long nochange = 1;
long phase = 0;
long complete = 1;
MPI_Status Stat;
long your_size = 1;
long* recv_buf = malloc(sizeof(long)*(my_size+1));
printf("rank %d has size %ld\n", my_rank, my_size);
while (complete!=0){
if((phase%2)==0){
if( ((my_rank%2)==0) && my_rank < comm_sz-1){
/* Send right */
MPI_Send(&my_size, 1, MPI_LONG, my_rank+1, 0, MPI_COMM_WORLD);
MPI_Send(*my_local, my_size, MPI_LONG, my_rank+1, 0, MPI_COMM_WORLD);
MPI_Recv(&your_size, 1, MPI_LONG, my_rank+1, 0, MPI_COMM_WORLD, &Stat);
MPI_Recv(&recv_buf, your_size, MPI_LONG, my_rank+1, 0, MPI_COMM_WORLD, &Stat);
}
if( ((my_rank%2)==1) && my_rank < comm_sz){
/* Send left */
MPI_Recv(&your_size, 1, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD, &Stat);
MPI_Recv(&recv_buf, your_size, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD, &Stat);
MPI_Send(&my_size, 1, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD);
MPI_Send(*my_local, my_size, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD);
}
}
phase ++;
complete = 0;
}
printf("Done!\n");
fflush(stdout);
}
And the error I'm getting is:
[ubuntu:04968] *** Process received signal ***
[ubuntu:04968] Signal: Segmentation fault (11)
[ubuntu:04968] Signal code: Address not mapped (1)
[ubuntu:04968] Failing at address: 0xb
--------------------------------------------------------------------------
mpiexec noticed that process rank 1 with PID 4968 on node ubuntu exited on signal 11 (Segmentation fault).
The reason I'm baffled is that the print statements after the function are still displayed, yet if I comment out the function there are no errors. So where the heap am I getting a segmentation fault?? I'm getting the error with mpiexec -n 2 ./a.out and an n larger than 9.
If you want the entire runnable code, let me know. Really, I was hoping not so much for the precise answer as for how to use gdb/valgrind to debug this problem and others like it (and how to read their output).
(And yes, I realize the 'sort' function isn't sorting yet).
The problem here is simple, yet difficult to see unless you use a debugger or print out exhaustive debugging information:
Look at the code where MPI_Recv is called. The recv_buf variable should be supplied as an argument instead of &recv_buf.
MPI_Recv( recv_buf , your_size, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD, &Stat);
The rest seems ok.
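For clarity, a minimal sketch of the corrected receives inside the phase loop (same variables as in the question; &your_size stays as it is because your_size is a plain long, while recv_buf is already a long* and is passed directly):

/* in the even-rank branch (receiving from the right neighbour) */
MPI_Recv(&your_size, 1, MPI_LONG, my_rank+1, 0, MPI_COMM_WORLD, &Stat);
MPI_Recv(recv_buf, your_size, MPI_LONG, my_rank+1, 0, MPI_COMM_WORLD, &Stat);

/* in the odd-rank branch (receiving from the left neighbour) */
MPI_Recv(&your_size, 1, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD, &Stat);
MPI_Recv(recv_buf, your_size, MPI_LONG, my_rank-1, 0, MPI_COMM_WORLD, &Stat);

As for the tooling part of the question: compiling with -g and running each rank under valgrind, for example mpiexec -n 2 valgrind ./a.out, is a common way to get a stack trace with line numbers for this kind of memory error.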
If I am using blocking send and recv (MPI_Send(), MPI_Recv()), how can I make these two operations repeat indefinitely, i.e. keep running over and over again?
Sample code:
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
MPI_Comm_rank (MPI_COMM_WORLD,&rank);
if(rank==0){
rc=MPI_Send(msg,1,MPI_CHAR,1,1,MPI_COMM_WORLD);
rc=MPI_Recv(msg,1,MPI_CHAR,1,1,MPI_COMM_WORLD,&status);
}else{
rc=MPI_Recv(msg,1,MPI_CHAR,0,0,MPI_COMM_WORLD,&status);
rc=MPI_Send(msg,1,MPI_CHAR,0,0,MPI_COMM_WORLD);
}
I have tried putting a while(1) before the if(rank==0) and it did the job, but I see several sends, then several receives, whereas I want it to go send(0), receive(1), send(1), receive(0).
You can code a ring of send-receive operations easily by using MPI_Sendrecv:
int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
int dest, int sendtag, void *recvbuf, int recvcount,
MPI_Datatype recvtype, int source, int recvtag,
MPI_Comm comm, MPI_Status *status)
As you can see, it is just a condensed version of an MPI_Send and an MPI_Recv, but it comes in handy when every process needs to both send and receive something.
The following code works for any number of processes (you can adapt it to your needs):
CODE UPDATE #1 (Using MPI_Sendrecv)
#include <stdio.h>
#include <unistd.h>
#include <mpi.h>
int main (int argc, char *argv[])
{
int size, rank, value, next, prev, sendval, recval;
double t0, t;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
value = 5;
if (size > 1)
{
next = (rank + 1)% size;
prev = (size+rank - 1)% size;
sendval = value + rank;
for (;;)
{
t0 = MPI_Wtime();
MPI_Sendrecv(&sendval, 1, MPI_INT, next, 1, &recval, 1, MPI_INT, prev, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
t = MPI_Wtime();
fprintf(stdout, "[%d of %d]: Sended %d to process %d, Received %d from process %d (MPI_SendRecv Time: %f)\n",rank, size-1, sendval, next, recval, prev, (t - t0));
}
}
MPI_Finalize();
return 0;
}
CODE UPDATE #2 (Using separate MPI_Send/MPI_Recv)
#include <stdio.h>
#include <unistd.h>
#include <mpi.h>
int main (int argc, char *argv[])
{
int size, rank, value, next, prev, sendval, recval;
double s0, s, r, r0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
value = 5;
if (size > 1)
{
next = (rank + 1)% size;
prev = (size+rank - 1)% size;
sendval = value + rank;
for (;;)
{
s0 = MPI_Wtime();
MPI_Send(&sendval, 1, MPI_INT, next, 1, MPI_COMM_WORLD);
s = MPI_Wtime();
fprintf(stdout, "[%d of %d]: Sended %d to process %d (MPI_Send Time: %f)\n", rank, size-1, sendval, next, s-s0);
r0 = MPI_Wtime();
MPI_Recv(&recval, 1, MPI_INT, prev, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
r = MPI_Wtime();
fprintf(stdout, "[%d of %d]: Received %d from process %d (MPI_Recv Time: %f)\n", rank, size-1, recval, prev, r-r0);
}
}
MPI_Finalize();
return 0;
}
Running Example
mpicc -o sendrecv sendrecv.c
mpirun -n 2 sendrecv
[0 of 1]: Sent 5 to process 1, Received 6 from process 1 (MPI_SendRecv Time: 0.000121)
[1 of 1]: Sent 6 to process 0, Received 5 from process 0 (MPI_SendRecv Time: 0.000068)
...
It is impossible to give an accurate answer to that without seeing at least the basic layout of your code. Generally, you would place the send and receive operations inside an infinite loop. Or, if you are hard-pressed for optimal communication costs (or simply feeling adventurous), you could use persistent send and receive operations.
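As a rough sketch of that last option, assuming the same one-char msg buffer and a two-process ping-pong as in the question: MPI_Send_init/MPI_Recv_init set the operations up once, and MPI_Start reuses them on every iteration.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank;
    char msg = 'x';
    MPI_Request reqs[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        /* rank 0 sends first, then receives the reply */
        MPI_Send_init(&msg, 1, MPI_CHAR, 1, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Recv_init(&msg, 1, MPI_CHAR, 1, 0, MPI_COMM_WORLD, &reqs[1]);
    } else if (rank == 1) {
        /* rank 1 receives first, then sends back */
        MPI_Recv_init(&msg, 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Send_init(&msg, 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &reqs[1]);
    }

    if (rank < 2) {
        int i;
        /* restart the same requests every iteration; use for(;;) to repeat forever */
        for (i = 0; i < 10; i++) {
            MPI_Start(&reqs[0]);
            MPI_Wait(&reqs[0], MPI_STATUS_IGNORE);
            MPI_Start(&reqs[1]);
            MPI_Wait(&reqs[1], MPI_STATUS_IGNORE);
            printf("[%d] iteration %d done\n", rank, i);
        }
        MPI_Request_free(&reqs[0]);
        MPI_Request_free(&reqs[1]);
    }

    MPI_Finalize();
    return 0;
}

This gives the strict send(0), receive(1), send(1), receive(0) ordering per iteration, and the persistent requests avoid rebuilding the message envelope every time, which is the communication-cost angle mentioned above.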