Proper way to create ghost zones (halos) in MPI - C

Good evening,
I'm attending a parallel programming course. The teacher gave us an assignment that involves domain partitioning for stencil calculations. For this type of calculation (finite differences), the most common way to parallelize the code is to partition the domain and create ghost zones (halos).
To better understand the creation of ghost zones in MPI, I programmed this simple example that initializes some arrays with inner values of 123 and boundary (ghost) values of 8. At the end of all the communication, all ghost values should still be 8, but on one of the nodes I'm getting 123 instead.
Serial (no ghosts):
123 - 123 - ... - 123 - 123
Two partitions:
123 - 123 - ... - 8 ||| 8 - ... - 123 - 123
Three partitions:
123 - 123 - ... - 8 ||| 8 - ... - 123 - 123 - 8 ||| 8 - ... - 123 - 123
Aside from this bug, the main question here is about the correct approach to creating and keeping ghost zones updated. Is there a cleaner solution than my messy if (myid == ...) ... else if (myid == ...) ... else type of implementation? How do people usually implement this kind of parallelism?
#include<mpi.h>
#include<stdio.h>
#include<stdlib.h>
int WhichSize(int mpiId, int numProc, int tam);
int main(int argc, char *argv[]){
int i;
int localSize;
int numProc;
int myid;
int leftProc;
int rightProc;
int * myArray;
int fullDomainSize = 16;
MPI_Request request;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numProc);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
// Let's get each partition size.
localSize = WhichSize(myid, numProc, fullDomainSize);
// Allocate arrays according to the number of processes.
if(numProc == 1){
//printf("Allocating Array for serial usage\n");
myArray = (int*)malloc(localSize*sizeof(int));
} else if(numProc == 2) {
//printf("Allocating Array for 2 proc usage\n");
myArray = (int*)malloc((localSize+ 1)*sizeof(int));
} else if(numProc > 2) {
if (myid == 0 || myid == numProc - 1){
//printf("Allocating array for boundary nodes usage\n");
myArray = (int*)malloc((localSize+ 1)*sizeof(int));
} else {
//printf("Allocating array for inner nodes usage\n");
myArray = (int*)malloc((localSize+ 2)*sizeof(int));
}
}
// Now we will fill the arrays with a dummy value 123. The
// boundaries (ghosts) will be filled with 8, just to differentiate.
if(numProc == 1){
//printf("----------------------------------------\n");
//printf("Filling the serial array with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
} else if(numProc == 2) {
////printf("------------------------------------------------\n");
//printf("Filling array for two proc usage with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
// ghost.
myArray[localSize+1] = 8;
} else if(numProc > 2) {
if (myid == 0 || myid == numProc - 1){
//printf("--------------------------------------------------\n");
//printf("Filling boundary node arrays usage with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
// ghosts.
myArray[localSize+1] = 8;
} else {
//printf("--------------------------------------------------\n");
//printf("Filling inner node arrays usage with values... \n");
for (i = 0; i<localSize; i++){
myArray[i] = 123;
}
// ghosts.
myArray[localSize+1] = 8;
myArray[0] = 8;
}
}
// Now let's communicate the ghosts with MPI_Sendrecv().
if(numProc == 1){
//printf("Serial usage, no ghosts to communicate\n");
} else if(numProc == 2) {
if (myid == 0){
//printf("Sending ghost value from proc %d to %d\n", myid, myid + 1);
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
1,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == 1) {
//printf("Receiving ghost value from proc %d to %d\n", myid-1, myid);
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
0,
12345,
MPI_COMM_WORLD,
&request);
}
} else if(numProc > 2) {
if (myid == 0){
rightProc = myid + 1;
if (myid == 0){
//printf("-------------------------------\n");
//printf("Communicating Boundary ghosts !\n");
//printf("-------------------------------\n");
//printf("Sending ghost value from proc %d to %d\n", myid, myid + 1);
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == rightProc) {
//printf("Receiving ghost value from proc %d to %d\n", myid-1, myid);
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
0,
12345,
MPI_COMM_WORLD,
&request);
}
} else if (myid == numProc - 1) {
leftProc = myid - 1;
if (myid == numProc - 1){
//printf("-------------------------------\n");
//printf("Communicating Boundary ghosts !\n");
//printf("-------------------------------\n");
////printf("Sending ghost value from proc %d to %d\n", myid, myid + 1);
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
leftProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == leftProc) {
rightProc = myid + 1;
//printf("Receiving ghost value from proc %d to %d\n", myid-1, myid);
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
}
} else {
//printf("-------------------------------\n");
//printf("Communicating Inner ghosts baby\n");
//printf("-------------------------------\n");
leftProc = myid - 1;
rightProc = myid + 1;
// Communicate tail ghost.
if (myid == leftProc) {
MPI_Isend(&myArray[localSize+1],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == rightProc){
MPI_Irecv(&myArray[localSize+1],
1,
MPI_INT,
leftProc,
12345,
MPI_COMM_WORLD,
&request);
}
// Communicate head ghost.
if (myid == leftProc) {
MPI_Isend(&myArray[0],
1,
MPI_INT,
rightProc,
12345,
MPI_COMM_WORLD,
&request);
} else if (myid == rightProc){
MPI_Irecv(&myArray[0],
1,
MPI_INT,
leftProc,
12345,
MPI_COMM_WORLD,
&request);
}
}
}
// Now I Want to see if the ghosts are in place !.
if (myid == 0){
printf("The ghost value is: %d\n", myArray[localSize + 1]);
} else if (myid == numProc - 1){
printf("The ghost value is: %d\n", myArray[0]);
} else {
printf("The head ghost is: %d\n", myArray[0]);
printf("The tail ghost is: %d\n", myArray[localSize + 1]);
}
MPI_Finalize();
exit(0);
}
int WhichSize(int mpiId, int numProc, int tam){
double resto;
int tamLocal;
tamLocal = tam / numProc;
resto = tam - tamLocal*numProc;
if (mpiId < resto) tamLocal = tamLocal + 1;
return tamLocal;
}
Thank you guys!

Halos can be elegantly implemented in MPI using Cartesian virtual topologies and the send-receive operation.
First of all, having lots of rank-dependent logic in conditional statements makes the code hard to read and understand. It is much better when the code is symmetric, i.e. when all ranks execute the same code. Corner cases can be taken care of using the MPI_PROC_NULL null rank - a send to or a receive from that rank results in a no-op. It is therefore enough to do:
// Compute the rank of the left neighbour
leftProc = myid - 1;
if (leftProc < 0) leftProc = MPI_PROC_NULL;
// Compute the rank of the right neighbour
rightProc = myid + 1;
if (rightProc >= numProc) rightProc = MPI_PROC_NULL;
// Halo exchange in forward direction
MPI_Sendrecv(&myArray[localSize], 1, MPI_INT, rightProc, 0, // send last element to the right
             &myArray[0], 1, MPI_INT, leftProc, 0,          // receive into left halo
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Halo exchange in reverse direction
MPI_Sendrecv(&myArray[1], 1, MPI_INT, leftProc, 0,           // send first element to the left
             &myArray[localSize+1], 1, MPI_INT, rightProc, 0, // receive into right halo
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
That code works for any rank, even for those at both ends - there either the source or the destination is the null rank and no actual transfer occurs in the corresponding direction. It also works with any number of MPI processes, from one to many. It requires that all ranks have halos on both sides, including those that don't really need them (the two corner ranks). One can store useful things in those dummy halos, e.g. boundary values (when solving PDEs), or simply live with the memory waste, which is usually negligible.
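For instance, the two outermost ranks can store a fixed boundary value directly in their unused halo cell. A small sketch; boundary_value_left and boundary_value_right are hypothetical names, not part of the code above:
// Only the edge ranks have a null neighbour, so only they write a
// physical boundary value into the otherwise unused halo cell.
if (leftProc == MPI_PROC_NULL) myArray[0] = boundary_value_left;
if (rightProc == MPI_PROC_NULL) myArray[localSize + 1] = boundary_value_right;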
In your code, you use non-blocking operations incorrectly: every MPI_Isend/MPI_Irecv must eventually be completed with MPI_Wait (or one of its variants), otherwise you never know when the transfer has finished or when the buffer can be reused. Non-blocking operations are tricky and require care. MPI_Sendrecv could and should be used here instead. It performs both the send and the receive operation at the same time and thus prevents deadlocks (as long as there is a matching receive for each send).
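If you do want to keep the non-blocking calls, every rank has to post both sends and both receives and then complete all of them, e.g. with MPI_Waitall. A minimal sketch using the same variables as the halo exchange above:
MPI_Request reqs[4];
// Forward direction: last interior element goes right, left halo gets filled.
MPI_Isend(&myArray[localSize], 1, MPI_INT, rightProc, 0, MPI_COMM_WORLD, &reqs[0]);
MPI_Irecv(&myArray[0], 1, MPI_INT, leftProc, 0, MPI_COMM_WORLD, &reqs[1]);
// Reverse direction: first interior element goes left, right halo gets filled.
MPI_Isend(&myArray[1], 1, MPI_INT, leftProc, 1, MPI_COMM_WORLD, &reqs[2]);
MPI_Irecv(&myArray[localSize+1], 1, MPI_INT, rightProc, 1, MPI_COMM_WORLD, &reqs[3]);
// Complete all four requests; those involving MPI_PROC_NULL finish immediately.
MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);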
If the domain is periodic, then the rank computation logic becomes simply:
// Compute the rank of the left neighbour
leftProc = (myid - 1 + numProc) % numProc;
// Compute the rank of the right neighbour
rightProc = (myid + 1) % numProc;
Instead of doing the arithmetic, one could create a Cartesian virtual topology and then use MPI_Cart_shift to find the ranks of the two neighbours:
// Create a non-periodic 1-D Cartesian topology
int dims[1] = { numProc };
int periods[1] = { 0 }; // 0 - non-periodic, 1 - periodic
MPI_Comm cart_comm;
MPI_Cart_create(MPI_COMM_WORLD, 1, dims, periods, 1, &cart_comm);
// Find the two neighbours
MPI_Cart_shift(cart_comm, 0, 1, &leftProc, &rightProc);
The code for the halo exchange remains the same with the only difference that cart_comm should replace MPI_COMM_WORLD. MPI_Cart_shift automatically takes care of the corner cases and will return MPI_PROC_NULL when appropriate. The advantage of that method is that you can easily switch between non-periodic and periodic domains by simply flipping the values inside the periods[] array.
The halos have to be updated as often as necessary, which depends on the algorithm. With most iterative schemes, the update must happen at the beginning of each iteration. One can reduce the communication frequency by introducing multi-level halos and using the values in the outer levels to compute the values in the inner ones.
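As an illustration of multi-level halos, with a halo of width 2 the exchange only has to happen every second sweep. A rough sketch that assumes a slightly different layout than above (interior cells at indices 2 .. localSize+1, two halo cells on each side):
#define HALO 2
// Send the two rightmost interior cells to the right, receive into the left halo.
MPI_Sendrecv(&myArray[localSize], HALO, MPI_INT, rightProc, 0,
             &myArray[0], HALO, MPI_INT, leftProc, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Send the two leftmost interior cells to the left, receive into the right halo.
MPI_Sendrecv(&myArray[2], HALO, MPI_INT, leftProc, 0,
             &myArray[localSize+2], HALO, MPI_INT, rightProc, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Two stencil sweeps can now run before the next exchange; each sweep shrinks
// the region with valid neighbour data by one cell on each side.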
To conclude, your main function could be reduced to (without using a Cartesian topology):
int main(int argc, char *argv[]){
int i;
int localSize;
int numProc;
int myid;
int leftProc;
int rightProc;
int * myArray;
int fullDomainSize = 16;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numProc);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
// Compute neighbouring ranks
rightProc = myid + 1;
if (rightProc >= numProc) rightProc = MPI_PROC_NULL;
leftProc = myid - 1;
if (leftProc < 0) leftProc = MPI_PROC_NULL;
// Lets get each partition size.
localSize = WhichSize(myid, numProc, fullDomainSize);
// Allocate arrays.
myArray = (int*)malloc((localSize+ 2)*sizeof(int));
// Now we will fill the array with a dummy value 123. The
// boundaries (ghosts) will be filled with 8, just to differentiate.
//printf("--------------------------------------------------\n");
//printf("Filling node arrays usage with values... \n");
for (i = 1; i <= localSize; i++){
myArray[i] = 123;
}
// ghosts.
myArray[localSize+1] = 8;
myArray[0] = 8;
//printf("-------------------------------\n");
//printf("Communicating Boundary ghosts !\n");
//printf("-------------------------------\n");
//printf("Sending ghost value to the right\n");
MPI_Sendrecv(&myArray[localSize], 1, MPI_INT, rightProc, 12345,
             &myArray[0], 1, MPI_INT, leftProc, 12345,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
//printf("Sending ghost value to the left\n");
MPI_Sendrecv(&myArray[1], 1, MPI_INT, leftProc, 12345,
             &myArray[localSize+1], 1, MPI_INT, rightProc, 12345,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Now I Want to see if the ghosts are in place !.
printf("[%d] The head ghost is: %d\n", myid, myArray[0]);
printf("[%d] The tail ghost is: %d\n", myid, myArray[localSize + 1]);
MPI_Finalize();
return 0;
}

Related

Parallelizing the calculation of an integral

Here I have a piece of code that calculates the integral of a function. In the code, f() (called func() in the parallel version) is the function to integrate.
I'm learning about parallel programming and I need to make this code parallel. The original program is effectively sequential, because in every iteration the work is sent to a single processor and the result is received back before the next piece of work is dispatched. What I want to achieve is that in each loop iteration three send operations are executed, one to each of the three available worker processors. Imagine one processor that divides the tasks (rank = 0) and three other processors that do the actual calculation.
Beware, it is a large piece of code, but I have included comments to make it clearer:
The sequential code:
if (myRank == 0)
{
// I am the controller, distribute the work
for (step = 0; step < maxSteps; step++)
{
x[0] = x_start + stepSize*step;
x[1] = x_start + stepSize*(step+1);
nextRank = step % (numProcs-1) + 1;
// Send the work
MPI_Send(x, 2, MPI_DOUBLE, nextRank, TAG_WORK, MPI_COMM_WORLD);
// Receive the result
MPI_Recv(y, 2, MPI_DOUBLE, nextRank, TAG_WORK, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
sum += stepSize*0.5*(y[0]+y[1]);
}
// Signal workers to stop by sending empty messages with tag TAG_END
for (nextRank = 1; nextRank < numProcs; nextRank++)
MPI_Send(&nextRank, 0, MPI_INT, nextRank, TAG_END, MPI_COMM_WORLD);
}
else
{
while (1)
{
// I am a worker, wait for work
// Receive the left and right points of the trapezoid and compute
// the corresponding function values. If the tag is TAG_END, don't
// compute but exit.
MPI_Recv(x, 2, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD,
&status);
if (status.MPI_TAG == TAG_END) break;
y[0] = f(x[0]);
y[1] = f(x[1]);
// Send back the computed result
MPI_Send(y, 2, MPI_DOUBLE, 0, TAG_WORK, MPI_COMM_WORLD);
}
}
return sum;
}
To parallelize it, I really hard-coded things to make clear what I am doing: I made the loop increment in steps of 3 and added new arrays to store the x- and y-values. First I collect the x-values in separate arrays, then I send each array of x-values to a different processor, execute the function to obtain the y-values, and finally send those back to the controller (rank = 0), which adds up all the 'integration slices'.
The attempted parallel code:
if (myRank == 0)
{
// I am the controller, distribute the work
for (step = 0; step < maxSteps; step+3)
{
x1[0] = x_start + stepSize*step;
x1[1] = x_start + stepSize*(step+1);
x2[0] = x_start + stepSize*(step+1);
x2[1] = x_start + stepSize*((step+1)+1);
x3[0] = x_start + stepSize*(step+2);
x3[1] = x_start + stepSize*((step+1)+2);
nextRank = step % (numProcs-1) + 1;
// Send the work
MPI_Send(x1, 2, MPI_DOUBLE, 1, TAG_WORK, MPI_COMM_WORLD);
MPI_Send(x2, 2, MPI_DOUBLE, 2, TAG_WORK, MPI_COMM_WORLD);
MPI_Send(x3, 2, MPI_DOUBLE, 3, TAG_WORK, MPI_COMM_WORLD);
// Receive the result
MPI_Recv(y1, 2, MPI_DOUBLE, 1, TAG_WORK, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
sum += stepSize*0.5*(y1[0]+y1[1]);
MPI_Recv(y2, 2, MPI_DOUBLE, 2, TAG_WORK, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
sum += stepSize*0.5*(y2[0]+y2[1]);
MPI_Recv(y3, 2, MPI_DOUBLE, 3, TAG_WORK, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
sum += stepSize*0.5*(y3[0]+y3[1]);
}
// Signal workers to stop by sending empty messages with tag TAG_END
for (nextRank = 1; nextRank < numProcs; nextRank++)
MPI_Send(&nextRank, 0, MPI_INT, nextRank, TAG_END, MPI_COMM_WORLD);
}
else if (myRank = 1)
{
while (1)
{
MPI_Recv(x1, 2, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == TAG_END) break;
y1[0] = func(x1[0]);
y1[1] = func(x1[1]);
// Send back the computed result
MPI_Send(y1, 2, MPI_DOUBLE, 0, TAG_WORK, MPI_COMM_WORLD);
}
}
else if (myRank = 2)
{
while (1)
{
MPI_Recv(x2, 2, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == TAG_END) break;
y2[0] = func(x2[0]);
y2[1] = func(x2[1]);
// Send back the computed result
MPI_Send(y2, 2, MPI_DOUBLE, 0, TAG_WORK, MPI_COMM_WORLD);
}
}
else if (myRank = 3)
{
while (1)
{
MPI_Recv(x3, 2, MPI_DOUBLE, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
if (status.MPI_TAG == TAG_END) break;
y3[0] = func(x3[0]);
y3[1] = func(x3[1]);
// Send back the computed result
MPI_Send(y3, 2, MPI_DOUBLE, 0, TAG_WORK, MPI_COMM_WORLD);
}
}
return sum;
}
The problem is that I don't get any output anymore. I'm afraid I created a deadlock, but I can't figure out where. Could I get feedback on this approach?
source: https://doc.itc.rwth-aachen.de/display/VE/PPCES+2012
If you want to profit from having 8 cores (this is just an example), the best and simplest thing you can do is to divide your integration interval into eight parts (how you make the partition is up to you; ideally each part should represent the same amount of work) and then calculate each partial integral independently, in each process, with the same loop you had for the single-process case.
This approach doesn't change your original calculation and makes the partial calculations completely independent of each other, so there is no resource contention at all.
Finally, you only have to add the eight partial integrals to obtain the result you want.
If you are thinking of things like unrolling loops to expose more parallelism, you had better trust your compiler: its optimizer knows how to exploit the more than 32 registers a modern CPU has, and you will most probably not do better by hand.
The approach suggested here converts your integral into 8 separate integral calculations, each with different parameters, and the calculation in one process does not depend on the others, so even on a pipelined core it is easy to keep the pipeline full without reordering or complicating instructions. If you have 8 cores, running more than 8 workers brings no additional advantage.
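In MPI terms, that suggestion boils down to a static split of the steps plus a single reduction at the end. A minimal sketch reusing the question's f(), x_start, stepSize, maxSteps, myRank and numProcs (the partial-sum variables are illustrative):
// Each rank integrates its own contiguous block of trapezoids.
double part = 0.0, total = 0.0;
int first = myRank * maxSteps / numProcs;
int last = (myRank + 1) * maxSteps / numProcs;
for (step = first; step < last; step++)
{
    double a = x_start + stepSize*step;
    double b = x_start + stepSize*(step+1);
    part += stepSize*0.5*(f(a) + f(b));
}
// Combine the partial sums on rank 0; no controller/worker loop is needed.
MPI_Reduce(&part, &total, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);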

MPI_Isend/Irecv only executing in first iteration of for loop. What is preventing it from executing in subsequent iterations of the loop?

I am creating a program in which information from an array is passed to different processors. In the code below I am trying to send information to and from processors repeatedly using a for loop. When I run the program on 5 or 2 cores, all the print statements are performed in the 1st iteration as expected, but after this no more print statements are executed. The program doesn't exit with any error messages; it just hangs. Any thoughts?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char *argv[])
{
/*MPI Specific Variables*/
int my_size, my_rank, up, down;
MPI_Request reqU, reqD, sreqU, sreqD;
MPI_Status rUstatus, rDstatus, sUstatus, sDstatus;
/*Other Variables*/
int max_iter = 10;
int grid_size = 1000;
int slice;
int x,y,j;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/*Determining neighbours*/
if (my_rank != 0) /*if statemets used to stop highest and lowest rank neighbours arent outside 0 - my_size-1 range of ranks*/
{
up = my_rank-1;
}
else
{
up = 0;
}
if(my_rank != my_size-1)
{
down = my_rank+1;
}
else
{
down = my_size-1;
}
/*cross-check: presumed my_size is a factor of gridsize else there are odd sized slices and this is not coded for*/
if (grid_size%my_size != 0)
{
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/*Set Up Distributed Data Approach*/
slice = grid_size/my_size;
printf("slice = %d\n", slice);
double phi[slice+2][grid_size]; /*extra 2 rows to allow for halo data*/
for (y=0; y < slice+2; y++)
{
for (x=0; x < grid_size; x++)
{
phi[y][x] = 0.0;
}
}
for (j=0; j<max_iter +1; j++)
{
if (my_rank > 0)
{
printf("1. myrank =%d\n",my_rank);
/*send top most strip up one node to be recieved as bottom halo*/
MPI_Isend(&phi[1][0], grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &sreqU);
printf("2. myrank =%d\n",my_rank);
/*recv top halo from up one node*/
MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqU);
printf("3. myrank =%d\n",my_rank);
}
if (my_rank < my_size -1)
{
printf("4. myrank =%d\n",my_rank);
/*recv top halo from down one node*/
MPI_Irecv(&phi[0][0], grid_size, MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &reqD);
printf("5. myrank =%d\n",my_rank);
/*send bottom most strip down one node to be recieved as top halo*/
MPI_Isend(&phi[slice][0], grid_size, MPI_DOUBLE, up, 2, MPI_COMM_WORLD, &sreqD);
printf("6. myrank =%d\n",my_rank);
}
if (my_rank>0)
{
printf("7. myrank =%d\n",my_rank);
/*Wait for send to down one rank to complete*/
MPI_Wait(&sreqU, &sUstatus);
printf("8. myrank =%d\n",my_rank);
/*Wait for recieve from up one rank to complete*/
MPI_Wait(&reqU, &rUstatus);
printf("9. myrank =%d\n",my_rank);
}
if (my_rank < my_size-1)
{
printf("10. myrank =%d\n",my_rank);
/*Wait for send to up down one rank to complete*/
MPI_Wait(&sreqD, &sDstatus);;
printf("11. myrank =%d\n",my_rank);
/*Wait for recieve from down one rank to complete*/
MPI_Wait(&reqD, &rDstatus);
printf("12. myrank =%d\n",my_rank);
}
}
printf("l\n");
MPI_Finalize();
return 0;
}
This has nothing to do with the iterations; the remaining issue is the computation of up/down. It is reversed in the code: up is used where down is needed, and the other way around. This didn't show up in your previous code because MPI_PROC_NULL would simply skip those communications.
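In other words, with MPI_PROC_NULL at the edges the neighbour computation and the exchange can be written without special cases. A sketch of the idea using the question's variables (reqs is new, and this is not a drop-in fix):
// Edge ranks get the null process, so their sends/receives become no-ops.
up = (my_rank != 0) ? my_rank - 1 : MPI_PROC_NULL;
down = (my_rank != my_size - 1) ? my_rank + 1 : MPI_PROC_NULL;
MPI_Request reqs[4];
// Top interior row goes up, bottom interior row goes down; each send's tag
// matches the corresponding receive posted on the neighbouring rank.
MPI_Isend(&phi[1][0], grid_size, MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &reqs[0]);
MPI_Irecv(&phi[slice+1][0], grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &reqs[1]);
MPI_Isend(&phi[slice][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqs[2]);
MPI_Irecv(&phi[0][0], grid_size, MPI_DOUBLE, up, 2, MPI_COMM_WORLD, &reqs[3]);
// Every rank posted all four requests, so every rank completes all four.
MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE);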

Segmentation Fault when using MPI_Isend

The aim of my program is to calculate the electrostatic potential between an inner conductor and an outer conductor by splitting the domain into a grid and then into grid slices. Each processor gets a slice and runs the calculations on it. I send data between processors using MPI_Isend and MPI_Irecv. When testing the code I get a segmentation fault:
[physnode5:81440] *** Process received signal ***
[physnode5:81440] Signal: Segmentation fault (11)
[physnode5:81440] Signal code: Address not mapped (1)
[physnode5:81440] Failing at address: 0x58
[physnode5:81440] [ 0] /lib64/libpthread.so.0(+0xf5d0)[0x2ab8069df5d0]
[physnode5:81440] [ 1] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(ompi_request_default_wait+0xd)[0x2ab8066495ed]
[physnode5:81440] [ 2] /opt/yarcc/libraries/openmpi/2.1.0/1/default/lib/libmpi.so.20(MPI_Wait+0x5d)[0x2ab80667a00d]
[physnode5:81440] [ 3] ./mpi_tezt.exe[0x400ffc]
[physnode5:81440] [ 4] /lib64/libc.so.6(__libc_start_main+0xf5)[0x2ab806c0e3d5]
[physnode5:81440] [ 5] ./mpi_tezt.exe[0x4009b9]
[physnode5:81440] *** End of error message ***
when this bit of code is executed. Please note I have SSH'ed into a cluster. The file name is mpi_tezt.exe (yes, I misspelled it).
I have checked that the arrays I want to send are correctly allocated and that the send and recv do not touch data that isn't there (i.e. sending data outside the range of the array).
My code for the MPI_Isend and MPI_Irecv is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char *argv[])
{
/*MPI Specific Variables*/
int my_size, my_rank, up, down;
MPI_Request reqU, reqD, sreqU, sreqD;
MPI_Status rUstatus, rDstatus, sUstatus, sDstatus;
/*Physical Dimensions*/
double Linner = 5.0;/*mm*/
double Rinner = 1.0;/*mm*/
double phi_0 = 1000.0;/*V*/
/*Other Variables*/
int grid_size = 100;
int slice;
int x,y;
double grid_res_y = 0.2;
double grid_res_x = 0.1;
int xboundary, yboundary;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/*Determining neighbours*/
if (my_rank != 0) /*if statemets used to stop highest and lowest rank neighbours arent outside 0 - my_size-1 range of ranks*/
{
up = my_rank-1;
}
else
{
up = 0;
}
if(my_rank != my_size-1)
{
down = my_rank+1;
}
else
{
down = my_size-1;
}
/*cross-check: presumed my_size is a factor of gridsize else there are odd sized slices and this is not coded for*/
if (grid_size%my_size != 0)
{
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/*Set Up Distributed Data Approach*/
slice = grid_size/my_size;
yboundary = Linner/grid_res_y; /*y grid index of inner conductor wall*/
xboundary = Rinner/grid_res_x; /*x grid and individual array index of inner conductor wall*/
double phi[slice+2][grid_size]; /*extra 2 rows to allow for halo data*/
for (y=0; y < slice+2; y++)
{
for (x=0; x < grid_size; x++)
{
phi[y][x] = 0.0;
}
}
if(my_rank == 0) /*Boundary Containing rank does 2 loops. One over part with inner conductor and one over part without inner conductor*/
{
for(y=0; y < slice+1; y++)
{
for(x=xboundary; x < grid_size; x++)
{
phi[y][x] = phi_0;
}
}
}
if (my_rank < my_size-1)
{
/*send top most strip up one node to be recieved as bottom halo*/
MPI_Isend(&phi[1][0], grid_size , MPI_DOUBLE, down, 1, MPI_COMM_WORLD, &sreqU);
/*recv top halo from up one node*/
MPI_Irecv(&phi[slice+1][0], grid_size, MPI_DOUBLE, down, 2, MPI_COMM_WORLD, &reqU);
}
if (my_rank > 0)
{
/*recv top halo from down one node*/
MPI_Irecv(&phi[0][0], grid_size , MPI_DOUBLE, up, 2, MPI_COMM_WORLD, &reqD);
/*send bottom most strip down one node to be recieved as top halo*/
MPI_Isend(&phi[slice][0], grid_size , MPI_DOUBLE, up, 1, MPI_COMM_WORLD, &sreqD);
}
if (my_rank<my_size-1)
{
/*Wait for send to down one rank to complete*/
MPI_Wait(&sreqD, &sDstatus);
/*Wait for recieve from up one rank to complete*/
MPI_Wait(&reqD, &rDstatus);
}
if (my_rank>0)
{
/*Wait for send to up down one rank to complete*/
MPI_Wait(&sreqU, &sUstatus);
/*Wait for recieve from down one rank to complete*/
MPI_Wait(&reqU, &rUstatus);
}
MPI_Finalize();
return 0;
}
I have been testing on 2 processors (ranks 0 and 1) with the hope of extending it to more.
Any ideas where the fault may lie?
You're faulting in the first MPI_Wait (for rank 0). This is step 7 in the example code below.
Using mpirun -np 2 ./whatever:
It appears that sreqD is not being set correctly. It is set at step 5, by rank 1.
But step 7 is being executed by rank 0, which does not set sreqD.
So, you need to adjust your if statements to match up correctly for which rank does which MPI_Wait, etc.
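A sketch of wait blocks that line up with the requests that are actually started (same variables as in the question; each rank waits only on the requests it posted):
if (my_rank < my_size - 1)
{
    /* this branch posted sreqU and reqU (both involve the down neighbour) */
    MPI_Wait(&sreqU, &sUstatus);
    MPI_Wait(&reqU, &rUstatus);
}
if (my_rank > 0)
{
    /* this branch posted reqD and sreqD (both involve the up neighbour) */
    MPI_Wait(&sreqD, &sDstatus);
    MPI_Wait(&reqD, &rDstatus);
}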
Here is your code with some debug printf statements:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
int
main(int argc, char *argv[])
{
/* MPI Specific Variables */
int my_size,
my_rank,
up,
down;
MPI_Request reqU,
reqD,
sreqU,
sreqD;
MPI_Status rUstatus,
rDstatus,
sUstatus,
sDstatus;
/* Physical Dimensions */
double Linner = 5.0; /* mm */
double Rinner = 1.0; /* mm */
double phi_0 = 1000.0;
/*V*/
/* Other Variables */
int grid_size = 100;
int slice;
int x,
y;
double grid_res_y = 0.2;
double grid_res_x = 0.1;
int xboundary,
yboundary;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &my_size);
/* Determining neighbours */
/* if statemets used to stop highest and lowest rank neighbours arent
outside 0 - my_size-1 range of ranks */
if (my_rank != 0) {
up = my_rank - 1;
}
else {
up = 0;
}
if (my_rank != my_size - 1) {
down = my_rank + 1;
}
else {
down = my_size - 1;
}
printf("my_rank=%d my_size=%d up=%d down=%d\n",my_rank,my_size,up,down);
/* cross-check: presumed my_size is a factor of gridsize else there are
odd sized slices and this is not coded for */
if (grid_size % my_size != 0) {
printf("ERROR - number of procs = %d, this is not a factor of grid_size %d\n", my_size, grid_size);
exit(0);
}
/* Set Up Distributed Data Approach */
slice = grid_size / my_size;
/* y grid index of inner conductor wall */
yboundary = Linner / grid_res_y;
/* x grid and individual array index of inner conductor wall */
xboundary = Rinner / grid_res_x;
if (my_rank == 0) {
printf("Linner=%g grid_res_y=%g yboundary=%d\n",
Linner,grid_res_y,yboundary);
printf("Rinner=%g grid_res_x=%g xboundary=%d\n",
Rinner,grid_res_x,xboundary);
printf("slice=%d grid_size=%d phi=%ld\n",
slice,grid_size,sizeof(double) * (slice + 2) * grid_size);
}
/* extra 2 rows to allow for halo data */
double phi[slice + 2][grid_size];
for (y = 0; y < slice + 2; y++) {
for (x = 0; x < grid_size; x++) {
phi[y][x] = 0.0;
}
}
/* Boundary Containing rank does 2 loops. One over part with inner
conductor and one over part without inner conductor */
if (my_rank == 0) {
for (y = 0; y < slice + 1; y++) {
for (x = xboundary; x < grid_size; x++) {
phi[y][x] = phi_0;
}
}
}
if (my_rank < my_size - 1) {
/* send top most strip up one node to be recieved as bottom halo */
printf("1: my_rank=%d MPI_Isend\n",my_rank);
MPI_Isend(&phi[1][0], grid_size, MPI_DOUBLE, down, 1, MPI_COMM_WORLD,
&sreqU);
/* recv top halo from up one node */
printf("2: my_rank=%d MPI_Irecv\n",my_rank);
MPI_Irecv(&phi[slice + 1][0], grid_size, MPI_DOUBLE, down, 2,
MPI_COMM_WORLD, &reqU);
printf("3: my_rank=%d\n",my_rank);
}
if (my_rank > 0) {
/* recv top halo from down one node */
printf("4: my_rank=%d MPI_Irecv\n",my_rank);
MPI_Irecv(&phi[0][0], grid_size, MPI_DOUBLE, up, 2, MPI_COMM_WORLD,
&reqD);
/* send bottom most strip down one node to be recieved as top halo */
printf("5: my_rank=%d MPI_Isend\n",my_rank);
MPI_Isend(&phi[slice][0], grid_size, MPI_DOUBLE, up, 1, MPI_COMM_WORLD,
&sreqD);
printf("6: my_rank=%d\n",my_rank);
}
if (my_rank < my_size - 1) {
/* Wait for send to down one rank to complete */
printf("7: my_rank=%d\n",my_rank);
MPI_Wait(&sreqD, &sDstatus);
printf("8: my_rank=%d\n",my_rank);
/* Wait for recieve from up one rank to complete */
printf("9: my_rank=%d\n",my_rank);
MPI_Wait(&reqD, &rDstatus);
printf("10: my_rank=%d\n",my_rank);
}
if (my_rank > 0) {
/* Wait for send to up down one rank to complete */
printf("11: my_rank=%d\n",my_rank);
MPI_Wait(&sreqU, &sUstatus);
printf("12: my_rank=%d\n",my_rank);
/* Wait for recieve from down one rank to complete */
printf("12: my_rank=%d\n",my_rank);
MPI_Wait(&reqU, &rUstatus);
printf("13: my_rank=%d\n",my_rank);
}
MPI_Finalize();
return 0;
}
Here is the output. Notice that step 7 prints (which is before the first MPI_Wait for rank 0), but rank 0 never gets to step 8 (the printf after that call):
my_rank=0 my_size=2 up=0 down=1
Linner=5 grid_res_y=0.2 yboundary=25
Rinner=1 grid_res_x=0.1 xboundary=10
slice=50 grid_size=100 phi=41600
1: my_rank=0 MPI_Isend
2: my_rank=0 MPI_Irecv
3: my_rank=0
7: my_rank=0
my_rank=1 my_size=2 up=0 down=1
4: my_rank=1 MPI_Irecv
5: my_rank=1 MPI_Isend
6: my_rank=1
11: my_rank=1
[manderly:230404] *** Process received signal ***
[manderly:230403] *** Process received signal ***
[manderly:230403] Signal: Segmentation fault (11)
[manderly:230403] Signal code: Address not mapped (1)
[manderly:230403] Failing at address: 0x58
[manderly:230404] Signal: Segmentation fault (11)
[manderly:230404] Signal code: Address not mapped (1)
[manderly:230404] Failing at address: 0x58
[manderly:230403] [ 0] [manderly:230404] [ 0] /lib64/libpthread.so.0(+0x121c0)/lib64/libpthread.so.0(+0x121c0)[0x7fa5478341c0]
[0x7fa0ebe951c0]
[manderly:230404] [ 1] [manderly:230403] [ 1] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa0ec0e9a81]
[manderly:230404] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(ompi_request_default_wait+0x31)[0x7fa547a88a81]
[manderly:230403] [ 2] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa0ec12c350]
[manderly:230404] [ 3] ./fix2[0x400f93]
[manderly:230404] [ 4] /usr/lib64/openmpi/lib/libmpi.so.20(PMPI_Wait+0x60)[0x7fa547acb350]
[manderly:230403] [ 3] ./fix2[0x400ef7]
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa0ebaedfea]
[manderly:230404] [ 5] ./fix2[0x40081a[manderly:230403] [ 4] ]
[manderly:230404] *** End of error message ***
/lib64/libc.so.6(__libc_start_main+0xea)[0x7fa54748cfea]
[manderly:230403] [ 5] ./fix2[0x40081a]
[manderly:230403] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node manderly exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

Program does not take input from the user to loop again

I have this code down here and the calculation works, but it does not take the user's input to either loop again and calculate again or to cancel. The relevant part is near the end, the if (rank == 0) { ... } blocks. I need help figuring out what I am missing.
#include <math.h> //include files
#include <stdio.h>
#include "mpi.h"
void printit()
{
printf("\n*********************************\n");
printf("Welcome to the pi calculator!\n");
printf("Programmer: K. Spry\n");
printf("You set the number of divisions\n");
printf("for estimating the integral: \n\tf(x)=4/(1+x^2)");
printf("\n");
printf("*********************************\n");
} //end printit
//function prototypes
int main(int argc, char* argv[])
{
double actual_pi = 3.141592653589793238462643;
//for comparison later
int n, rank, num_proc, i;
double temp_pi, calc_pi, int_size, part_sum, x;
char response = 'y';
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_proc);
if (rank == 0)
{
printit();
} /* I am root node, print out welcome */
while (response == 'y')
{
if (rank == 0)
{ /*I am root node*/
printf("__________________________________\n");
printf("\nEnter the number of intervals: (0 will exit)\n");
n = fgetc(stdin);
}
else
{
int_size = 1.0 / (double) n; //calcs interval size
part_sum = 0.0;
for (i = rank * n / num_proc; i <= (rank + 1)* n / num_proc; i += 1)
{ //calcs partial sums
x = int_size * ((double)i - 0.5);
part_sum += (4.0 / (1.0 + x*x));
}
temp_pi = int_size * part_sum;
//collects all partial sums computes pi
MPI_Reduce(&temp_pi,&calc_pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0)
{ /*I am server*/
printf("pi is approximately %f", calc_pi);
printf(". Error is %f", fabs(calc_pi - actual_pi));
printf("\n");
printf("_______________________________________");
printf("\n");
}
} //end else
if (rank == 0)
{ /*I am root node*/
printf("\nCompute with new intervals? (y/n)\n");
response = fgetc(stdin);
}
} //end while
MPI_Finalize(); //terminate MPI
return 0;
}
The problem I have is with the if inside the while loop that asks the user whether to compute with new intervals; the user inputs y or n as the response. The problem is that once the user does give input, it stops working and never loops.
It behaves as it should, given what you coded :-).
The comment from ptb is the answer to your question... but let's work it out :-).
while (response == 'y')
{
if (rank == 0)
{ /*I am root node*/
printf("__________________________________\n");
printf("\nEnter the number of intervals: (0 will exit)\n");
n = fgetc(stdin);
}
/* Here we have to broadcast the value of n to all processes: every process
   has to wait until root has read the new value and then obtain it from root.
   MPI_Bcast is a blocking operation, so no MPI_Barrier is needed... */
MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD);
/* Then the computation is done by every process (note that with your interval
   boundaries process 0 also gets the first interval; I am not sure your math
   is correct, I did not check it). */
int_size = 1.0 / (double) n; //calcs interval size
part_sum = 0.0;
for (i = rank * n / num_proc; i <= (rank + 1)* n / num_proc; i += 1)
{ //calcs partial sums
x = int_size * ((double)i - 0.5);
part_sum += (4.0 / (1.0 + x*x));
}
temp_pi = int_size * part_sum;
//collects all partial sums computes pi
MPI_Reduce(&temp_pi,&calc_pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0)
{ /*I am server*/
printf("pi is approximately %f", calc_pi);
printf(". Error is %f", fabs(calc_pi - actual_pi));
printf("\n");
printf("_______________________________________");
printf("\n");
}
if (rank == 0)
{ /*I am root node*/
printf("\nCompute with new intervals? (y/n)\n");
response = fgetc(stdin);
}
/* Here we have to broadcast the value of response to all processes: every
   process has to wait until root has read the new value and then obtain it
   from root. Again, MPI_Bcast is blocking, so no MPI_Barrier is needed -
   same as with n... */
MPI_Bcast(&response,1,MPI_CHAR,0,MPI_COMM_WORLD);
} //end while
I did not try to compile this, so there may be some typos...

MPI: Efficient exchange of different amounts of data between all nodes

I'm totally new to MPI and have absolutely no idea as to what is efficient.
I have x (x = 2, 4, 8, 16, ...) nodes, each of them with large text files of more than 4 million lines.
I want to sort these lines using bucket sort, and for that, I want to send the correct lines to each of the nodes (= buckets), meaning I have to send (and receive) from every node to every other node.
Currently my idea is like this:
With my root, I calculate bucket limits that every node has to know so they can send the right lines to the right nodes. I broadcast these limits using MPI_Bcast.
After that, since the count of every set of lines to be sent is different, I send the count of lines to be received from every node to every other node using MPI_Isend, MPI_Irecv and MPI_Waitall.
When every node knows how much data it's going to receive from each node, I do the same for the data itself.
The sends and receives are done alternately for each node (process[i] wants to send to process[i+1] first and then receive from process[i-1]).
My code for the send and receive part looks like this:
// fill the buffer for the current rank (because data doesn't have to be sent)
buffers[rank] = buckets[rank];
int i, amount, sendIndex, receiveIndex;
// create MPI_Requests
int reqsIndex = 0;
MPI_Request dummyRequest;
MPI_Request reqs[numtasks - 1];
for (i = 0; i < numtasks - 1; i++)
reqs[i] = MPI_REQUEST_NULL;
// amounts[i]: Which process receives how much data from the current rank
int* amounts = malloc(sizeof(int) * (numtasks-1));
sendIndex = (rank + 1) % numtasks; // always contains the index of the next process to send data to (starting with [rank + 1])
receiveIndex = (rank + numtasks - 1) % numtasks; // always contains the index of the next process to receive data from (starting with [rank - 1])
int isSending = 1; // determines if the current process sends or receives data (always sends first)
// alternating send and receive for each process
for (i = 0; i < (numtasks - 1) * 2; i++)
{
if (isSending) {
MPI_Isend(&buckets[sendIndex]->n, 1, MPI_INT, sendIndex, TAG_NUMBER_OF_SENDED_LINES, MPI_COMM_WORLD, &dummyRequest);
sendIndex = (sendIndex + 1) % numtasks;
if (sendIndex == rank)
sendIndex = (sendIndex + 1) % numtasks;
}
else {
MPI_Irecv(&amounts[receiveIndex], 1, MPI_INT, receiveIndex, TAG_NUMBER_OF_SENDED_LINES, MPI_COMM_WORLD, &reqs[reqsIndex]);
receiveIndex = (receiveIndex + 1) % numtasks;
if (receiveIndex == rank)
receiveIndex = (receiveIndex + 1) % numtasks;
reqsIndex++;
}
isSending = (isSending + 1) % 2; // switch isSending from 1 to 0 or 0 to 1
}
MPI_Waitall(numtasks - 1, reqs, MPI_STATUSES_IGNORE); // waits for all receives to be finished
// requests reset
reqsIndex = 0;
for (i = 0; i < numtasks - 1; i++)
reqs[i] = MPI_REQUEST_NULL;
sendIndex = (rank + 1) % numtasks; // always contains the index of the next process to send data to (starting with [rank + 1])
receiveIndex = (rank + numtasks - 1) % numtasks; // always contains the index of the next process to receive data from (starting with [rank - 1])
isSending = 1; // determines if the current process sends or receives data (always sends first)
for (i = 0; i < (numtasks - 1) * 2; i++)
{
if (isSending) {
MPI_Isend(buckets[sendIndex]->data, buckets[sendIndex]->n * LINE_LENGTH, MPI_BYTE, sendIndex, TAG_LINES, MPI_COMM_WORLD, &dummyRequest);
sendIndex = (sendIndex + 1) % numtasks;
if (sendIndex == rank)
sendIndex = (sendIndex + 1) % numtasks;
}
else {
lineBuffer* lines = allocLines(amounts[receiveIndex]);
MPI_Irecv(lines->data, amounts[receiveIndex] * LINE_LENGTH, MPI_BYTE, receiveIndex, TAG_LINES, MPI_COMM_WORLD, &reqs[reqsIndex]);
buffers[receiveIndex] = lines;
receiveIndex = (receiveIndex + 1) % numtasks;
if (receiveIndex == rank)
receiveIndex = (receiveIndex + 1) % numtasks;
reqsIndex++;
}
isSending = (isSending + 1) % 2; // switch isSending from 1 to 0 or 0 to 1
}
MPI_Waitall(numtasks - 1, reqs, MPI_STATUSES_IGNORE); // waits for all receives to be finished
Currently, the send/receive times for each node are very different, with some processes taking much longer to get through this code than others. Is there something better to use than Isend/Irecv/Waitall, especially considering that everything being sent is of a different size?
Cheers. :)
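For what it's worth, the pattern described above (first exchange per-pair counts, then exchange variable-sized data) maps directly onto the collectives MPI_Alltoall and MPI_Alltoallv. A rough sketch, assuming the outgoing lines have been packed into one contiguous sendBuffer (sendBuffer, recvBuffer and the count/displacement arrays are illustrative, not taken from the code above):
int sendCounts[numtasks], recvCounts[numtasks];
int sdispls[numtasks], rdispls[numtasks];
for (i = 0; i < numtasks; i++)
    sendCounts[i] = buckets[i]->n * LINE_LENGTH; // bytes destined for rank i
// Every rank learns how many bytes it will receive from every other rank.
MPI_Alltoall(sendCounts, 1, MPI_INT, recvCounts, 1, MPI_INT, MPI_COMM_WORLD);
// Displacements are prefix sums of the counts.
sdispls[0] = rdispls[0] = 0;
for (i = 1; i < numtasks; i++) {
    sdispls[i] = sdispls[i-1] + sendCounts[i-1];
    rdispls[i] = rdispls[i-1] + recvCounts[i-1];
}
// One call moves all the variable-sized pieces at once (sendBuffer/recvBuffer
// are hypothetical contiguous byte buffers).
MPI_Alltoallv(sendBuffer, sendCounts, sdispls, MPI_BYTE,
              recvBuffer, recvCounts, rdispls, MPI_BYTE, MPI_COMM_WORLD);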
