Program stops at MPI_Send - c

The program stops working when I execute it with more than one process.
It hangs at the first MPI_Send.
What am I doing wrong?
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE 200000
#define SIZE2 256
#define VYVOD 1
int main(int argc, char *argv[])
{
int NX, NT;
double TK, UM, DX, DY, DT;
double starttime, endtime;
int numnode, rank, delta=0, ierr, NXnode;
double **U;
double **U1;
double *sosed1;
double *sosed2;
int i, j, k;
MPI_Status stats;
NX = 1*(SIZE2+1);
TK = 20.00;
UM = 10.0;
DX = 0.1;
DY = DX;
DT = 0.1;
NT = (TK/DT);
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numnode);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
if(rank == 0)
printf("\nTotal nodes: %d\n", numnode);
NX = NX - 2;
NXnode = (NX-(NX%numnode))/numnode;
if (rank < (NX%numnode))
{
delta = rank * NXnode + rank + 1;
NXnode++;
}
else
{
delta = rank * NXnode + (NX%numnode) + 1;
}
if(rank == 0){
printf("Order counting complete, NXnode = %d\n", NXnode);
}
U = (double**)malloc(NXnode*sizeof(double*));
U1 = (double**)malloc(NXnode*sizeof(double*));
sosed1 = (double*)malloc(SIZE*sizeof(double));
sosed2 = (double*)malloc(SIZE*sizeof(double));
for (i=0; i < NXnode; i++)
{
U[i] = (double*)malloc(SIZE*sizeof(double));
U[i][0]=0;
U[i][SIZE-1]=0;
U1[i] = (double*)malloc(SIZE*sizeof(double));
U1[i][0]=0;
U1[i][SIZE-1]=0;
if (U[i]==NULL || U1[i]==NULL)
{
printf("Error at memory allocation!");
return 1;
}
}
MPI_Barrier(MPI_COMM_WORLD);
if(rank == 0){
starttime = MPI_Wtime();
printf("Array allocation complete\n");
}
for (i = 0; i < NXnode; i++)
{
for (j = 1; j < SIZE-1; j++)
{
if ((delta)<=(NXnode/2))
{
U1[i][j]=2*(UM/NXnode)*(delta+i);
}
else
{
U1[i][j]=-2*(UM/NXnode) + 2*UM;
}
}
}
printf("Array init 1 complete, rank %d\n", rank);
MPI_Barrier(MPI_COMM_WORLD);
if (rank > 0)
{
MPI_Send(&(U1[0][0]), SIZE, MPI_DOUBLE , rank-1, 0, MPI_COMM_WORLD);
MPI_Recv(&(sosed1[0]), SIZE, MPI_DOUBLE , rank-1, 1, MPI_COMM_WORLD, &stats);
}
else
{
int initInd = 0;
for (initInd = 0; initInd < SIZE; initInd++)
{
sosed1[initInd]=0;
}
}
if (rank < (numnode-1))
{
MPI_Send(&(U1[NXnode-1][0]), SIZE, MPI_DOUBLE , rank+1, 1, MPI_COMM_WORLD);
MPI_Recv(&(sosed2[0]), SIZE, MPI_DOUBLE , rank+1, 0, MPI_COMM_WORLD, &stats);
}
else
{
int initInd = 0;
for (initInd = 0; initInd < SIZE; initInd++)
{
sosed2[initInd]=0;
}
}
printf("Send complete, rank %d\n", rank);
MPI_Barrier(MPI_COMM_WORLD);
printf("Array init complete, rank %d\n", rank);
for (k = 1; k <= NT; k++)
{
int cycle = 0;
for (cycle=1; cycle < SIZE-1; cycle++)
{
U[0][cycle] = U1[0][cycle] + DT/(DX*DX)*(U1[1][cycle]-2*U1[0][cycle]+sosed1[cycle])+DT/(DY*DY)*(U1[0][cycle+1]+U1[0][cycle-1]-(U1[0][cycle]*2));
}
for (i=1; i<NXnode-1; i++)
{
for(j=1; j<SIZE-1; j++)
{
U[i][j] = U1[i][j] + DT/(DX*DX)*(U1[i+1][j]-2*U1[i][j]+U[i-1][j])+DT/(DY*DY)*(U1[i][j+1]+U1[i][j-1]-(U1[i][j]*2));
}
}
for (cycle=1; cycle < SIZE-1; cycle++)
{
U[NXnode-1][cycle]=U1[NXnode-1][cycle]+DT/(DX*DX)*(sosed2[cycle]-2*U1[NXnode-1][cycle]+U1[NXnode-2][cycle])+DT/(DY*DY)*(U1[NXnode-1][cycle+1]+U1[NXnode-1][cycle-1]-(U1[NXnode-1][cycle]*2));
}
/*U[0] = U1[0]+DT/(DX*DX)*(U1[0+1]-2*U1[0]+sosed1);
for (j = 0; j<NXnode; j++)
{
U[j]=U1[j]+DT/(DX*DX)*(U1[j+1]-2*U1[j]+U1[j-1]);
}
U[NXnode-1]=U1[NXnode-1]+DT/(DX*DX)*(sosed2-2*U1[NXnode-1]+U1[(NXnode-1)-1]);*/
if (rank > 0)
{
MPI_Send(&(U[0][0]), SIZE, MPI_DOUBLE , rank-1, 0, MPI_COMM_WORLD);
}
if (rank < (numnode-1))
{
MPI_Send(&(U[NXnode-1][0]), SIZE, MPI_DOUBLE , rank+1, 0, MPI_COMM_WORLD);
}
if (rank > 0)
{
MPI_Recv(&(sosed1[0]), SIZE, MPI_DOUBLE , rank-1, 0, MPI_COMM_WORLD, &stats);
}
if (rank < (numnode-1))
{
MPI_Recv(&(sosed2[0]), SIZE, MPI_DOUBLE , rank+1, 0, MPI_COMM_WORLD, &stats);
}
for (i = 0; i<NXnode; i++)
{
for (j=0; j<SIZE; j++)
{
U1[i][j]=U[i][j];
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
printf("Array count complete, rank %d\n", rank);
if (rank == 0)
{
endtime=MPI_Wtime();
printf("\n## TIME: %f\n", endtime-starttime);
}
MPI_Finalize();
}
UPDATE#1
I tried it like this, so that rank 0 would go first, but it still doesn't work:
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0 && numnode > 1)
{
MPI_Recv(&(sosed2[0]), SIZE, MPI_DOUBLE , rank+1, 0, MPI_COMM_WORLD, &stats);
MPI_Send(&(U1[NXnode-1][0]), SIZE, MPI_DOUBLE , rank+1, 1, MPI_COMM_WORLD);
int initInd = 0;
for (initInd = 0; initInd < SIZE; initInd++)
{
sosed1[initInd]=0;
}
}
else if (rank == 0)
{
int initInd = 0;
for (initInd = 0; initInd < SIZE; initInd++)
{
sosed2[initInd]=0;
sosed1[initInd]=0;
}
}
else if (rank < (numnode-1))
{
MPI_Send(&(U1[0][0]), SIZE, MPI_DOUBLE , rank-1, 1, MPI_COMM_WORLD);
MPI_Recv(&(sosed1[0]), SIZE, MPI_DOUBLE , rank-1, 0, MPI_COMM_WORLD, &stats);
MPI_Recv(&(sosed2[0]), SIZE, MPI_DOUBLE , rank+1, 0, MPI_COMM_WORLD, &stats);
MPI_Send(&(U1[NXnode-1][0]), SIZE, MPI_DOUBLE , rank+1, 1, MPI_COMM_WORLD);
}
else if (rank == (numnode - 1))
{
MPI_Send(&(U1[0][0]), SIZE, MPI_DOUBLE , rank-1, 1, MPI_COMM_WORLD);
MPI_Recv(&(sosed1[0]), SIZE, MPI_DOUBLE , rank-1, 0, MPI_COMM_WORLD, &stats);
int initInd = 0;
for (initInd = 0; initInd < SIZE; initInd++)
{
sosed2[initInd]=0;
}
}
UPDATE#2
Solved: I used the same tag for all Send/Recv calls.

MPI_Send blocks execution until the corresponding MPI_Recv is invoked (presumably in another process).
In your program, every process except rank 0 calls MPI_Send immediately after the first barrier, and nobody is ready to receive the message, so MPI_Send blocks indefinitely. Essentially, each process is waiting for its message to be accepted by the process with the lower rank (rank 2 waits for rank 1, rank 1 waits for rank 0), while rank 0 does not accept any messages at all (it moves on to the next block of code and in turn calls MPI_Send too), so everything just hangs.
It looks like you are missing the communication part for the process with rank 0; it should do something like MPI_Recv(from rank 1); ...; MPI_Send(to rank 1);.
Another thing: you call MPI_Send with tag 1 but MPI_Recv with tag 0. Those will never match. Use the same tag on both sides, or specify MPI_ANY_TAG in the receive operation.
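For the initial neighbour exchange, a minimal sketch of a deadlock-free version, using only the variables already defined in the question, one tag everywhere, and MPI_Sendrecv so each send is paired with its matching receive (the zero-filling of sosed1 on rank 0 and of sosed2 on the last rank stays as in the original):
/* Sketch, not a drop-in patch: pair each send with the matching receive via
   MPI_Sendrecv so no rank can block waiting for a partner that has not
   posted its receive yet. */
if (rank > 0)            /* exchange with the lower neighbour */
    MPI_Sendrecv(&(U1[0][0]),        SIZE, MPI_DOUBLE, rank-1, 0,
                 &(sosed1[0]),       SIZE, MPI_DOUBLE, rank-1, 0,
                 MPI_COMM_WORLD, &stats);
if (rank < numnode-1)    /* exchange with the upper neighbour */
    MPI_Sendrecv(&(U1[NXnode-1][0]), SIZE, MPI_DOUBLE, rank+1, 0,
                 &(sosed2[0]),       SIZE, MPI_DOUBLE, rank+1, 0,
                 MPI_COMM_WORLD, &stats);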

Related

MPI_Gather doesn't receive data

The program takes two command-line arguments: N, the number of items each worker should generate, and H, the highest value in the range of random numbers generated by each worker. Each worker builds a list of those random values, and bigList is where I try to gather them all back, but nothing shows up in it. For example:
Running mpirun -np 3 a.out 4 20 gets:
RANK: 1 --- NUM: 18
RANK: 1 --- NUM: 6
RANK: 1 --- NUM: 12
RANK: 1 --- NUM: 10
RANK: 2 --- NUM: 9
RANK: 2 --- NUM: 3
RANK: 2 --- NUM: 6
RANK: 2 --- NUM: 5
and bigList is empty, when I would expect it to contain every number listed above.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char* argv[]){
double t1, t2;
MPI_Init(&argc, &argv);
int rank;
int wsize;
int N = 10, H = 5;
int num, k, i;
int locarr[25];
int bigList[300];
srand(time(NULL));
if(argc > 1){
N = atoi(argv[1]);
H = atoi(argv[2]);
}
t1 = MPI_Wtime();
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &wsize);
if( rank == 0){
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&H, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else{
for(i = 0; i < N; i++){
locarr[i] = (((rand() % H) + 1) / rank);
printf("RANK: %d --- NUM: %d\n", rank, locarr[i]);
}
}
MPI_Gather(&locarr, N, MPI_INT, bigList, N, MPI_INT, 0, MPI_COMM_WORLD);
if( rank == 0){
printf("BigList: ");
for(k = 0; k < (rank * N); k++){
printf(" %d", bigList[k]);
}
printf("\n");
}
t2 = MPI_Wtime();
// printf("\nMPI_Wtime(): %f\n", t2 - t1);
MPI_Finalize();
return 0;
}
Let me expand on the comment by Gilles Gouaillardet.
The MPI_Gather call is written correctly. To get the expected results, two changes need to be made.
MPI_Bcast is a collective operation: all processes must call it. So the code should be:
if( rank == 0){
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&H, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else{
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&H, 1, MPI_INT, 0, MPI_COMM_WORLD);
for(i = 0; i < N; i++){
locarr[i] = (((rand() % H) + 1) / rank);
printf("RANK: %d --- NUM: %d\n", rank, locarr[i]);
}
}
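Equivalently, and perhaps cleaner, the two broadcasts can be hoisted out of the if/else entirely, since every rank executes them identically either way; a minimal sketch:
/* Same effect as duplicating the broadcasts in both branches. */
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&H, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (rank != 0) {
    for (i = 0; i < N; i++) {
        locarr[i] = (((rand() % H) + 1) / rank);
        printf("RANK: %d --- NUM: %d\n", rank, locarr[i]);
    }
}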
Also, rank 0 prints the contents of bigList, but the loop condition is k < rank * N, which for rank 0 is always false (k < 0), so the loop never executes and nothing is printed. It should use the world size (wsize) instead of rank:
if( rank == 0){
printf("BigList: ");
for(k = 0; k < wsize*N; k++){
printf(" %d", bigList[k]);
}
printf("\n");
}
That loop, however, still prints garbage for the first N entries of bigList, because they correspond to rank 0, whose locarr is never filled. Starting the loop at k = N skips them:
if (rank == 0)
{
printf("BigList: ");
for (k = N; k < wsize * N; k++)
{
printf(" %d", bigList[k]);
}
printf("\n");
}
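A hedged alternative, if the whole bigList should be meaningful: let rank 0 fill its own locarr as well (it cannot reuse the / rank formula, which would divide by zero), and then the print loop can start at k = 0 again. A sketch:
/* Sketch: rank 0 generates its own block so bigList[0..N-1] is valid too. */
if (rank == 0) {
    for (i = 0; i < N; i++)
        locarr[i] = (rand() % H) + 1;   /* no division by rank here */
}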
The full corrected code is:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char *argv[])
{
double t1, t2;
MPI_Init(&argc, &argv);
int rank;
int wsize;
int N = 10, H = 5;
int num, k, i;
int locarr[25];
int bigList[300];
srand(time(NULL));
if (argc > 1)
{
N = atoi(argv[1]);
H = atoi(argv[2]);
}
t1 = MPI_Wtime();
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &wsize);
if (rank == 0)
{
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&H, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
else
{
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&H, 1, MPI_INT, 0, MPI_COMM_WORLD);
for (i = 0; i < N; i++)
{
locarr[i] = (((rand() % H) + 1) / rank);
printf("RANK: %d --- NUM: %d\n", rank, locarr[i]);
}
}
MPI_Gather(&locarr, N, MPI_INT, bigList, N, MPI_INT, 0, MPI_COMM_WORLD);
if (rank == 0)
{
printf("BigList: ");
for (k = N; k < wsize * N; k++)
{
printf(" %d", bigList[k]);
}
printf("\n");
}
MPI_Finalize();
return 0;
}

parallel Bitonic sort mpi

I'm trying to write a parallel bitonic sort using MPI and C, but I get a deadlock (or some other blocked state) when sending and receiving the array in comp_exchange_max or comp_exchange_min. Can you help me resolve this problem? Thanks.
void comp_exchange_max(int j, int rank, int *local_numbers, int dim_array, int *ordered_array)
{
int message_receive[dim_array];
int i, k, q;
if (rank > 0)
{
MPI_Recv(&message_receive, dim_array, MPI_INT, rank ^ (1 << j), 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Send(local_numbers, dim_array, MPI_INT, rank ^ (1 << j), 0, MPI_COMM_WORLD);
}
/* comparison */
k = dim_array - 1;
q = dim_array - 1;
for (i = dim_array - 1; i >= 0; --i)
{
if (local_numbers[k] > message_receive[q])
{
ordered_array[i] = local_numbers[k];
--k;
}
else
{
ordered_array[i] = message_receive[q];
--q;
}
}
}
void comp_exchange_min(int j, int rank, int *local_numbers, int dim_array, int *ordered_array)
{
int message_receive[dim_array];
int i, k, q;
if (rank > 0)
{
MPI_Send(local_numbers, dim_array, MPI_INT, rank ^ (1 << j), 0, MPI_COMM_WORLD);
MPI_Recv(&message_receive, dim_array, MPI_INT, rank ^ (1 << j), 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
/* comparison */
k = 0;
q = 0;
for (i = 0; i < dim_array; ++i)
{
if (local_numbers[k] < message_receive[q])
{
ordered_array[i] = local_numbers[k];
++k;
}
else
{
ordered_array[i] = message_receive[q];
++q;
}
}
}
All of the code is here: https://pastebin.com/DrR5SGxv
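In the functions shown, only ranks greater than 0 enter the if (rank > 0) block, so rank 0 never posts a send or a receive and its partner blocks; rank 0 also reads message_receive uninitialized. A hedged sketch (not a verified fix for the full pastebin code) of a symmetric exchange with the partner rank ^ (1 << j), along the lines of the MPI_Sendrecv pattern suggested for the first question above:
/* Sketch only: every rank exchanges its whole block with its partner in one
   call, so neither side has to guess whether to send or to receive first. */
int partner = rank ^ (1 << j);
MPI_Sendrecv(local_numbers,   dim_array, MPI_INT, partner, 0,
             message_receive, dim_array, MPI_INT, partner, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);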

MPI master process convergence loop

I am trying to write an MPI program that simulates temperature flow throughout a grid to reach equilibrium. I have already written a serial version as well as parallel versions using OpenMP, pthreads, and CUDA.
My goal is to parallelize a for loop that calculates updated temperature values for a one-dimensional array. The code I have for the parallel part is here (all other variables are initialized above):
int nproc, rank,chunksize,leftover,offset,source, tag1=3,tag2=2,tag3=1;
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&nproc);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
chunksize = (boxes / (nproc-1));
leftover = (boxes % (nproc-1));
if(rank == 0){
//init dsv
for(int idx = 0; idx < boxes; idx++){
temps[idx] = newtemps[idx];
}
int stop = 0;
int iter = 0;
float max_tmp;
float min_tmp;
while(stop != 1){
offset = 0;
for (int dest=1; dest<nproc; dest++) {
int chunk = (dest <= leftover ? chunksize + 1 : chunksize);
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
MPI_Send(&temps[offset], chunk, MPI_FLOAT, dest, tag2, MPI_COMM_WORLD);
MPI_Send(&newtemps[offset], chunk, MPI_FLOAT, dest, tag3, MPI_COMM_WORLD);
printf("sent %d temps to process: %d\n",chunk, dest);
offset = offset + chunk;
}
for (int dest=1; dest<nproc; dest++) {
int chunk = (dest <= leftover ? chunksize + 1 : chunksize);
MPI_Recv(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&temps[offset], chunk, MPI_FLOAT, dest, tag2, MPI_COMM_WORLD,&status);
MPI_Recv(&newtemps[offset], chunk, MPI_FLOAT, dest, tag3, MPI_COMM_WORLD,&status);
printf("received %d temps from process: %d\n",chunk, dest);
printf("status: %d\n",status.MPI_TAG);
}
max_tmp = -10000;
min_tmp = 10000;
for(idx = 0; idx < boxes; idx++){
temps[idx] = newtemps[idx];
if(newtemps[idx] > max_tmp){
max_tmp = newtemps[idx];
}
if(newtemps[idx] < min_tmp){
min_tmp = newtemps[idx];
}
}
stop = (max_tmp - min_tmp) <= (max_tmp * epsilon);
iter += 1;
}
}
if (rank > 0){
int chunk = (rank <= leftover ? chunksize + 1 : chunksize);
MPI_Recv(&offset, 1, MPI_INT, 0, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&temps[offset], chunk, MPI_FLOAT, 0, tag2, MPI_COMM_WORLD,&status);
MPI_Recv(&newtemps[offset], chunk, MPI_FLOAT, 0, tag3, MPI_COMM_WORLD,&status);
printf("received %d temps from process: 0\n",chunk);
printf("status: %d\n",status.MPI_TAG);
for(int j = offset; j < offset+chunk; j++){
float weightedtmp = 0;
int perimeter = 0;
int num_iters = neighbors[j][0];
for(int i = 1; i <= num_iters; i++){
weightedtmp += temps[neighbors[j][i]] * mults[j][i];
perimeter += mults[j][i];
}
weightedtmp /= perimeter;
newtemps[j] = temps[j] + (weightedtmp - temps[j] ) * affect_rate;
}
printf("sent %d temps to process: 0\n",chunk);
MPI_Send(&offset, 1, MPI_INT, 0, tag1, MPI_COMM_WORLD);
MPI_Send(&temps[offset], chunk, MPI_FLOAT, 0, tag2, MPI_COMM_WORLD);
MPI_Send(&newtemps[offset], chunk, MPI_FLOAT, 0, tag3, MPI_COMM_WORLD);
}
MPI_Finalize();
My program, however, successfully gets through the first iteration of the while loop and finds the max value (matching my serial version), then sends the temps, newtemps, and offset variables to each process. At that point it stalls, and the processes never print that they received the message. The console output looks like this:
[radeymichael#owens-login04 ~]$ mpicc -o ci changeInput.c
[radeymichael#owens-login04 ~]$ mpirun -np 3 ./ci .1 .1
sent 101 temps to process: 1
sent 100 temps to process: 2
received 101 temps from process: 1
status: 1
received 101 temps from process: 0
status: 1
sent 101 temps to process: 0
received 100 temps from process: 0
status: 1
sent 100 temps to process: 0
received 100 temps from process: 2
status: 1
max: 900.000000
sent 101 temps to process: 1
sent 100 temps to process: 2
I have spent a lot of time trying to find the mistake, but I think I am missing some fundamental knowledge of MPI. If someone can help me find where my misunderstanding is, I would greatly appreciate it.
The problem is that rank 0 sits inside a while loop and keeps sending data until stop == 1, while all the other processes reach MPI_Finalize after the last MPI_Send in their branch. One solution (as seen in the comment by Gilles) is to put the other ranks in a while loop driven by stop as well, and have the root broadcast stop to all processes:
MPI_Bcast(&stop,1, MPI_INT, 0, MPI_COMM_WORLD);
See the below code.
int nproc, rank,chunksize,leftover,offset,source, tag1=3,tag2=2,tag3=1;
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&nproc);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
chunksize = (boxes / (nproc-1));
leftover = (boxes % (nproc-1));
int stop = 0;
if(rank == 0){
//init dsv
for(int idx = 0; idx < boxes; idx++){
temps[idx] = newtemps[idx];
}
int iter = 0;
float max_tmp;
float min_tmp;
while(stop != 1){
offset = 0;
for (int dest=1; dest<nproc; dest++) {
int chunk = (dest <= leftover ? chunksize + 1 : chunksize);
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD);
MPI_Send(&temps[offset], chunk, MPI_FLOAT, dest, tag2, MPI_COMM_WORLD);
MPI_Send(&newtemps[offset], chunk, MPI_FLOAT, dest, tag3, MPI_COMM_WORLD);
printf("sent %d temps to process: %d\n",chunk, dest);
offset = offset + chunk;
}
for (int dest=1; dest<nproc; dest++) {
int chunk = (dest <= leftover ? chunksize + 1 : chunksize);
MPI_Recv(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&temps[offset], chunk, MPI_FLOAT, dest, tag2, MPI_COMM_WORLD,&status);
MPI_Recv(&newtemps[offset], chunk, MPI_FLOAT, dest, tag3, MPI_COMM_WORLD,&status);
printf("received %d temps from process: %d\n",chunk, dest);
printf("status: %d\n",status.MPI_TAG);
}
max_tmp = -10000;
min_tmp = 10000;
for(idx = 0; idx < boxes; idx++){
temps[idx] = newtemps[idx];
if(newtemps[idx] > max_tmp){
max_tmp = newtemps[idx];
}
if(newtemps[idx] < min_tmp){
min_tmp = newtemps[idx];
}
}
stop = (max_tmp - min_tmp) <= (max_tmp * epsilon);
iter += 1;
MPI_Bcast(&stop,1, MPI_INT, 0, MPI_COMM_WORLD);
}
}
if (rank > 0){
while(stop != 1){
int chunk = (rank <= leftover ? chunksize + 1 : chunksize);
MPI_Recv(&offset, 1, MPI_INT, 0, tag1, MPI_COMM_WORLD, &status);
MPI_Recv(&temps[offset], chunk, MPI_FLOAT, 0, tag2, MPI_COMM_WORLD,&status);
MPI_Recv(&newtemps[offset], chunk, MPI_FLOAT, 0, tag3, MPI_COMM_WORLD,&status);
printf("received %d temps from process: 0\n",chunk);
printf("status: %d\n",status.MPI_TAG);
for(int j = offset; j < offset+chunk; j++){
float weightedtmp = 0;
int perimeter = 0;
int num_iters = neighbors[j][0];
for(int i = 1; i <= num_iters; i++){
weightedtmp += temps[neighbors[j][i]] * mults[j][i];
perimeter += mults[j][i];
}
weightedtmp /= perimeter;
newtemps[j] = temps[j] + (weightedtmp - temps[j] ) * affect_rate;
}
printf("sent %d temps to process: 0\n",chunk);
MPI_Send(&offset, 1, MPI_INT, 0, tag1, MPI_COMM_WORLD);
MPI_Send(&temps[offset], chunk, MPI_FLOAT, 0, tag2, MPI_COMM_WORLD);
MPI_Send(&newtemps[offset], chunk, MPI_FLOAT, 0, tag3, MPI_COMM_WORLD);
MPI_Bcast(&stop,1, MPI_INT, 0, MPI_COMM_WORLD);
}
}
MPI_Finalize();
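If it helps to see the termination handshake in isolation, here is a small self-contained toy (an iteration counter stands in for the temperature criterion; nothing else from the original program is used). The point is simply that every rank calls MPI_Bcast once per iteration, so all ranks agree on when to leave the loop:
/* Hedged, self-contained illustration of the stop-flag pattern only. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, stop = 0, iter = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    while (stop != 1) {
        if (rank == 0) {
            iter++;
            stop = (iter >= 5);                  /* root alone decides */
        }
        /* workers would receive work, compute, and send results here */
        MPI_Bcast(&stop, 1, MPI_INT, 0, MPI_COMM_WORLD);   /* everyone, every iteration */
    }
    printf("rank %d done\n", rank);
    MPI_Finalize();
    return 0;
}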

MPI Vector multiplication

#include<stdio.h>
#include<mpi.h>
int main()
{
int a_r = 0, a_c = 0, v_s = 0, i = 0, rank = 0, size = 0;
int local_row = 0, partial_sum = 0, sum = 0, j = 0;
int my_first_ele = 0, my_last_ele = 0;
int a[10][10], v[10], partial_mul[10] = {0}, mul[10] = {0};
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if(rank == 0)
{
printf("Enter the row of array A: ");
scanf("%d", &a_r);
printf("Enter the column of array A: ");
scanf("%d", &a_c);
printf("Enter the array A: ");
for(i = 0; i < a_r; i++)
{
for(j = 0; j < a_c; j++)
scanf("%d", &a[i][j]);
}
printf("Enter the size of vector array: ");
scanf("%d", &v_s);
printf("Enter the vector array: ");
for(i = 0; i < v_s; i++)
{
scanf("%d", &v[i]);
}
MPI_Bcast(&a_r, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&a_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&v_s, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, a_r*a_c, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(v, v_s, MPI_INT, 0, MPI_COMM_WORLD);
local_row = a_r / size;
my_first_ele = rank * local_row;
my_last_ele = (rank+1) * local_row;
if(a_c == v_s)
{
for(i = my_first_ele; i < my_last_ele; i++)
{
for(j = 0; j < a_c; j++)
{
partial_mul[i] = partial_mul[i] + (a[i][j]*v[j]);
}
}
printf("\nPartial multiplication in Rank 0: \n");
for(i = my_first_ele; i < my_last_ele; i++)
printf("%d \n", partial_mul[i]);
MPI_Gather(partial_mul, local_row, MPI_INT, mul, local_row, MPI_INT, 0, MPI_COMM_WORLD);
printf("\n \nGlobal Multiplication: \n");
for(i = 0; i < a_r; i++)
{
printf("%d \n", mul[i]);
}
}
else
printf("\nCan't multiply. \n");
}
else
{
MPI_Bcast(&a_r, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&a_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&v_s, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(a, a_r*a_c, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(v, v_s, MPI_INT, 0, MPI_COMM_WORLD);
local_row = a_r / size;
my_first_ele = rank * local_row;
my_last_ele = (rank+1) * local_row;
if(a_c == v_s)
{
for(i = my_first_ele; i < my_last_ele; i++)
{
for(j = 0; j < a_c; j++)
{
partial_mul[i] = partial_mul[i] + (a[i][j]*v[j]);
}
}
printf("\nPartial multiplication in Rank %d: \n", rank);
for(i = my_first_ele; i < my_last_ele; i++)
printf("%d \n", partial_mul[i]);
MPI_Gather(partial_mul, local_row, MPI_INT, mul, local_row, MPI_INT, 0, MPI_COMM_WORLD);
}
else
printf("\nCan't multiply. \n");
}
MPI_Finalize();
}
I have a problem with the code above. My partial multiplication values are correct, but in the overall multiplication I only gather rank 0's elements; the rest of the values are printed as 0. Can anyone explain what the problem is?
Looking at your data layout, I think you misunderstand data structures in MPI: all data is kept separate in each rank; there is no implicit sharing or distribution. Your array partial_mul is separate on each rank, each with the full 10 elements. So assuming size = 2, a_r = 10, and zero initialization, after the computation the contents will look like this:
rank 0: {x0,x1,x2,x3,x4,0,0,0,0,0}
rank 1: {0,0,0,0,0,x5,x6,x7,x8,x9}
Where x is the correct computed value. Gather will then collect the first local_row=5 elements from each rank, resulting in {x0,x1,x2,x3,x4,0,0,0,0,0}.
You could just fix this by adding the correct offset:
MPI_Gather(&partial_mul[my_first_ele], local_row, MPI_INT, mul, local_row, MPI_INT, 0, MPI_COMM_WORLD);
But please don't do that. Instead, reconsider your data structures so the data is really distributed, reserving the correct size for each part of the vector / array on each rank. To send parts of the data to each rank, use MPI_Scatter (the opposite of MPI_Gather). The most difficult part is getting the matrix layout right; that is explained in detail in this excellent answer.
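To make the recommendation concrete, here is a hedged sketch (not a drop-in patch) of a layout where the matrix is stored as one contiguous row-major block, so MPI_Scatter can hand each rank exactly its rows. It assumes a_r is divisible by size, that <stdlib.h> is included for malloc, and that a_flat, a_local, mul_local, and rows_local are new, illustrative names, not variables from the original program:
/* Sketch of a truly distributed matrix-vector multiply. */
int *a_flat    = malloc(a_r * a_c * sizeof(int));   /* rank 0 fills this row-major */
int rows_local = a_r / size;
int *a_local   = malloc(rows_local * a_c * sizeof(int));
int *mul_local = malloc(rows_local * sizeof(int));

MPI_Scatter(a_flat,  rows_local * a_c, MPI_INT,
            a_local, rows_local * a_c, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(v, v_s, MPI_INT, 0, MPI_COMM_WORLD);

for (i = 0; i < rows_local; i++) {
    mul_local[i] = 0;
    for (j = 0; j < a_c; j++)
        mul_local[i] += a_local[i * a_c + j] * v[j];
}

MPI_Gather(mul_local, rows_local, MPI_INT,
           mul,       rows_local, MPI_INT, 0, MPI_COMM_WORLD);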

MPI debugging Segmentation fault

I'm trying to sort an array of random numbers using odd-even transposition, but I keep getting a segmentation fault when running my code:
[islb:48966] *** Process received signal ***
[islb:48966] Signal: Segmentation fault (11)
[islb:48966] Signal code: Address not mapped (1)
[islb:48966] Failing at address: 0x28
[islb:48966] [ 0] /lib64/libpthread.so.0(+0xf810)[0x7fc3da4cb810]
[islb:48966] [ 1] /lib64/libc.so.6(memcpy+0xa3)[0x7fc3da1c7cf3]
[islb:48966] [ 2] /usr/local/lib/libopen-pal.so.6(opal_convertor_unpack+0x10b)[0x7fc3d9c372db]
[islb:48966] [ 3] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_recv_request_progress_match+0x138)[0x7fc3d58507a8]
[islb:48966] [ 4] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_recv_req_start+0x1b1)[0x7fc3d5850d11]
[islb:48966] [ 5] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_recv+0x139)[0x7fc3d5849489]
[islb:48966] [ 6] /usr/local/lib/libmpi.so.1(MPI_Recv+0xc0)[0x7fc3da742f40]
[islb:48966] [ 7] oddEven[0x40115a]
[islb:48966] [ 8] /lib64/libc.so.6(__libc_start_main+0xe6)[0x7fc3da161c36]
[islb:48966] [ 9] oddEven[0x400c19]
[islb:48966] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 48966 on node islb exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
The program allocates the array; the error seems to occur when it comes to scattering it amongst the processes, as the print statement directly after the scatter call only prints for process 0 and then the error message appears.
Here's my code:
#include <stdio.h>
#include <math.h>
#include <malloc.h>
#include <time.h>
#include <string.h>
#include "mpi.h"
const int MAX = 10000;
int myid, numprocs;
int i, n, j, k, arrayChunk, minindex;
int A, B;
int temp;
int swap(int *x, int *y) {
temp = *x;
*x = *y;
*y = temp;
return 0;
}
int main(int argc, char **argv) {
int* arr = NULL;
int* value = NULL;
MPI_Status status;
//int arr[] = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
srand(time(0));
time_t t1, t2;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if (myid == 0) {
printf("Enter the number of elements you would like in the array \n");
scanf("%d", &n);
arrayChunk = n/numprocs;
//printf("cpus: %d, #s per cpu: %d\n", numprocs, arrayChunk);
//Allocate memory for the array
arr = malloc(n * sizeof(int));
value = malloc(n * sizeof(int));
// Generate an array of size n random numbers and prints them
printf("Elements in the array: ");
for (i = 0; i < n; i++) {
arr[i] = (rand() % 100) + 1;
printf("%d ", arr[i]);
}
printf("\n");
time(&t1);
}
if ((n % numprocs) != 0) {
if (myid == 0)
printf("Number of Elements are not divisible by numprocs \n");
MPI_Finalize();
return(0);
}
// Broadcast the size of each chunk
MPI_Bcast(&arrayChunk, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Scatter(&arr, arrayChunk, MPI_INT, &value, arrayChunk, MPI_INT, 0, MPI_COMM_WORLD);
printf("Processor %d receives %d\n", myid, value[0]);
for (i = 0; i < numprocs; i++) {
if (i % 2 == 0) {
if (myid%2 == 0) {
MPI_Send(&value[0], arrayChunk, MPI_INT, myid + 1, 0, MPI_COMM_WORLD);
MPI_Recv(&value[arrayChunk], arrayChunk, MPI_INT, myid + 1, 0, MPI_COMM_WORLD, &status);
for (j = 0; j < (arrayChunk * 2 - 1); j++) {
minindex = j;
for (k = j + 1; k < arrayChunk * 2; k++) {
if (value[k] < value[minindex]) {
minindex = k;
}
}
if (minindex > j) {
swap(&value[j], &value[minindex]);
}
}
//printf("myid %d i: %d, %d\n", myid, i, value[0]);
} else {
MPI_Recv(&value[arrayChunk], arrayChunk, MPI_INT, myid - 1, 0, MPI_COMM_WORLD, &status);
MPI_Send(&value[0], arrayChunk, MPI_INT, myid - 1, 0, MPI_COMM_WORLD);
for (j = 0; j < (arrayChunk * 2 - 1); j++) {
minindex = j;
for (k = j + 1; k < arrayChunk * 2; k++) {
if (value[k] < value[minindex]) {
minindex = k;
}
}
if (minindex > j) {
swap(&value[j], &value[minindex]);
}
}
for (j = 0; j < arrayChunk; j++) {
swap(&value[j], &value[j + arrayChunk]);
}
//printf("myid %d i: %d, %d\n", myid, i, value[0]);
}
} else {
if ((myid%2 == 1) && (myid != (numprocs-1))) {
MPI_Send(&value[0], arrayChunk, MPI_INT, myid + 1, 0, MPI_COMM_WORLD);
MPI_Recv(&value[arrayChunk], arrayChunk, MPI_INT, myid + 1, 0, MPI_COMM_WORLD, &status);
for (j = 0; j < (arrayChunk * 2 - 1); j++) {
minindex = j;
for (k = j + 1; k < arrayChunk * 2; k++) {
if (value[k] < value[minindex]) {
minindex = k;
}
}
if (minindex > j) {
swap(&value[j], &value[minindex]);
}
}
//printf("myid %d i: %d, %d\n", myid, i, value[0]);
} else if (myid != 0 && myid != (numprocs-1)) {
MPI_Recv(&value[arrayChunk], arrayChunk, MPI_INT, myid - 1, 0, MPI_COMM_WORLD, &status);
MPI_Send(&value[0], 1, MPI_INT, myid - 1, 0, MPI_COMM_WORLD);
for (j = 0; j < (arrayChunk * 2 - 1); j++) {
minindex = j;
for (k = j + 1; k < arrayChunk * 2; k++) {
if (value[k] < value[minindex]) {
minindex = k;
}
}
if (minindex > j) {
swap(&value[j], &value[minindex]);
}
}
for (j = 0; j < arrayChunk; j++) {
swap(&value[j], &value[j + arrayChunk]);
}
//printf("myid %d i: %d, %d\n", myid, i, value[0]);
}
}
}
MPI_Gather(&value[0], arrayChunk, MPI_INT, &arr[0], arrayChunk, MPI_INT, 0, MPI_COMM_WORLD);
if (myid == 0) {
time(&t2);
printf("Sorted array: ");
for (i = 0; i < n; i++) {
printf("%d ", arr[i]);
}
printf("\n");
printf("Time in sec. %f\n", difftime(t2, t1));
}
// Free allocated memory
if (arr != NULL) {
free(arr);
arr = NULL;
free(value);
value = NULL;
}
MPI_Finalize();
return 0;
}
I'm not very familiar with C, and it could well be that I've used malloc and/or addresses and pointers incorrectly, so it's probably something simple.
Sorry for the amount of code but I thought it would be better to supply all of it to allow for proper debugging.
The problem is in your MPI_Scatter call. You try to scatter the information and store it in value, but if you look above that code, only rank 0 has allocated any memory for value. When any other rank tries to store data into value, you get a segmentation fault (and indeed you do). Instead, remove the value = malloc(...); line from inside the if block and put it after the MPI_Bcast, as value = malloc(arrayChunk * sizeof(int));. I've not looked through the rest of the code to see if there are issues elsewhere as well, but that is likely the cause of the initial segfault.
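A sketch of that rearrangement, with one extra tweak that is not claimed by the answer above: the later pairwise exchange receives the partner's block into value[arrayChunk], so allocating room for two chunks avoids overflowing the buffer right after the scatter. Note also that the buffers themselves (arr, value) are passed, not the addresses of the pointer variables (&arr, &value):
/* Sketch: every rank learns the chunk size, allocates its receive buffer,
   and only then takes part in the scatter. */
MPI_Bcast(&arrayChunk, 1, MPI_INT, 0, MPI_COMM_WORLD);
value = malloc(2 * arrayChunk * sizeof(int));    /* on every rank */
MPI_Scatter(arr,   arrayChunk, MPI_INT,
            value, arrayChunk, MPI_INT, 0, MPI_COMM_WORLD);
printf("Processor %d receives %d\n", myid, value[0]);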
I would build the program with debugging info (most likely the -g compile flag), get a core dump, and use the gdb debugger to locate the bug. A core file is created when a process crashes and holds an image of the process's memory at the moment of the crash.
If no core dump is created after the program crashes, you'll need to figure out how to enable it on your system. You may create a simple buggy program (for example with a = x / 0; or a similar error) and experiment a bit. The core dump may be called core, PID.core (PID being the number of the crashed process), or something similar. Sometimes it is enough to set the core file size to unlimited using ulimit. Also check the kernel.core_* sysctls on Linux.
Once you have a core dump, you can use it with gdb or a similar debugger (e.g. ddd):
gdb executable_file core
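If you first want to confirm that core dumps are enabled at all, a deliberately crashing toy program (as suggested above) is enough; this one dereferences a null pointer:
/* Tiny crash test: compile with -g, run it, and a core file should appear
   if core dumps are enabled (e.g. after raising the limit with ulimit). */
#include <stdio.h>

int main(void)
{
    int *p = NULL;
    printf("about to crash...\n");
    return *p;        /* null-pointer dereference -> SIGSEGV */
}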
