My MPI Jacobi iteration program gives the wrong result - C

I get the wrong result when running the following program in C.
The program first generates a 3x3 array and then executes a Jacobi iteration using the MPI library. I don't know what parts of the code are wrong:
#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <math.h> // l2-norm //
#include <time.h>

int main(int argc, char **argv)
{
    int numprocs, myid;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    double a[3][3];
    double b[3];
    double x[3] = {0};
    double xa[3] = {0};
    double xnew[3] = {0};
    double y[3] = {0};
    float sigancha;
    time_t startTime = 0, endTime = 0;
    int n = 3;
    int i, j = 0;
    int k = 0;
    int o;
    int hoessu = 300;
    int minhoessu = 300;
    double sum = 1;
    int numsent = 0;
    int ans;
    int row;
    MPI_Status status;
    int sender;
    int po;
    double *buffer;
    /* synchronization */
    MPI_Barrier(MPI_COMM_WORLD);
    for (i = 0; i < n; i++) {
        b[i] = i * 100;
        for (j = 0; j < n; j++) {
            a[i][j] = ((i + j) % 10);
            if (i == j) { a[i][j] += 5000; }
        }
        x[i] = b[i] / a[i][i];
    }
    /* run if sum is greater than 0.0002 */
    for (k = 0; k < hoessu && sum > 0.0002 || k < minhoessu; k++) {
        numsent = 0;
        for (o = myid + 1; o < n + 1; o += numprocs) {
            i = o - 1;
            xa[i] = b[i] + a[i][i] * x[i];
            for (j = 0; j < n; j++) {
                xa[i] -= a[i][j] * x[j];
            }
            xnew[i] = xa[i] / a[i][i];
            /* send xnew[i] to master */
            MPI_Send(&xnew[i], 1, MPI_DOUBLE, 0, i, MPI_COMM_WORLD);
        }
        if (myid == 0) {
            /* get xnew[i] */
            for (i = 0; i < n; i++) {
                MPI_Recv(&ans, 1, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
                sender = status.MPI_SOURCE;
                row = status.MPI_TAG;
                xnew[row] = ans;
            }
            /* calculate sum at master */
            for (j = 0; j < n; j++) {
                sum = 0.0;
                sum += (xnew[j] - x[j]) * (xnew[j] - x[j]);
                x[j] = xnew[j];
            }
            sum = pow(sum, 0.5);
            MPI_Bcast(&x[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        }
    }
    if (myid == 0) {
        endTime = clock();
        sigancha = (float)(endTime - startTime) / (CLOCKS_PER_SEC);
        printf("finished\n");
        for (j = 0; j < n; j++) {
            printf("x[%d]=%f\n", j + 1, xnew[j]);
        }
        printf("iteration: %d iterations are done.\n l2-norm error is %f.\n %f seconds are used.\n", k, sum, sigancha);
    }
    MPI_Finalize();
}
I compile with mpicc:
mpicc mpijacobi2.c -o taskingyeje
./taskingyeje
Result:
finished
x[1]=-1736884775.000000
x[2]=-370936800.000000
x[3]=2118301216.000000
iteration: 300 iterations are done.
 l2-norm error is 34332272.000000.
 0.020000 seconds are used.
However, this is not the intended result. If this program worked correctly, it should give the same result as a serial Jacobi iteration:
x[1]=-0.000020
x[2]=-0.019968
x[3]=0.399956
I don't know why this program generates the wrong result. Below is a second, larger version (n=700) that uses MPI_Allgather instead of explicit sends and receives:

#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <math.h> // l2-norm //
#include <time.h>

int main(int argc, char **argv)
{
    int numprocs, myid;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    double a[700][700];
    double b[700];
    double x[700] = {0};
    double xa[700] = {0};
    double xnew[700] = {0};
    double y[700] = {0};
    float sigancha;
    time_t startTime = 0, endTime = 0;
    int n = 700;
    int i, j = 0;
    int k = 0;
    int o;
    int hoessu = 300;
    int minhoessu = 300;
    double sum = 1;
    int numsent = 0;
    int ans;
    int row;
    MPI_Status status;
    int sender;
    int po;
    double *buffer;
    /* synchronization */
    MPI_Barrier(MPI_COMM_WORLD);
    for (i = 0; i < n; i++) {
        b[i] = i * 100;
        for (j = 0; j < n; j++) {
            a[i][j] = ((i + j) % 10);
            if (i == j) { a[i][j] += 10000; }
        }
        x[i] = b[i] / a[i][i];
    }
    /* run if sum is greater than 0.0002 */
    for (k = 0; k < hoessu && sum > 0.0002 || k < minhoessu; k++) {
        numsent = 0;
        for (o = myid + 1; o < n + 1; o += numprocs) {
            i = o - 1;
            xa[i] = b[i] + a[i][i] * x[i];
            for (j = 0; j < n; j++) {
                xa[i] -= a[i][j] * x[j];
            }
            xnew[i] = xa[i] / a[i][i];
            /* send xnew[i] to master */
            ans = xnew[i];
            MPI_Allgather(&xnew[i], 1, MPI_DOUBLE, &xnew[i], 1, MPI_DOUBLE, MPI_COMM_WORLD);
        }
        if (myid == 0) {
            /* calculate sum at master */
            for (j = 0; j < n; j++) {
                sum = 0.0;
                sum += (xnew[j] - x[j]) * (xnew[j] - x[j]);
                x[j] = xnew[j];
            }
            sum = pow(sum, 0.5);
            MPI_Bcast(&x[0], n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        }
    }
    if (myid == 0) {
        endTime = clock();
        sigancha = (float)(endTime - startTime) / (CLOCKS_PER_SEC);
        printf("finished\n");
        for (j = 0; j < n; j++) {
            printf("x[%d]=%f\n", j + 1, xnew[j]);
        }
        printf("iteration: %d iterations are done.\n l2-norm error is %f.\n %f seconds are used.\n", k, sum, sigancha);
    }
    MPI_Finalize();
}
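Editor's note, since no answer is included above: three things stand out in the code as posted. MPI_Recv reads an MPI_DOUBLE into the int variable ans; MPI_Bcast is called only inside the if (myid == 0) branch, even though collectives must be called by every rank; and sum is reset to 0.0 inside the loop, so only the last component survives before the square root. Below is a minimal sketch (mine, not the poster's) of a sweep that avoids these pitfalls. It assumes a block distribution with n divisible by numprocs, and the function name is illustrative.
#include <math.h>
#include <mpi.h>

/* Editor's sketch, not the poster's code: one conventional shape for a
 * parallel Jacobi sweep.  Assumes n % numprocs == 0 and that a, b, x,
 * and xnew hold identical data on every rank on entry. */
double jacobi_sweep(int n, int myid, int numprocs,
                    double a[n][n], const double b[],
                    double x[], double xnew[])
{
    int chunk = n / numprocs;
    int lo = myid * chunk;

    for (int i = lo; i < lo + chunk; i++) {
        double s = b[i];
        for (int j = 0; j < n; j++)
            if (j != i)
                s -= a[i][j] * x[j];
        xnew[i] = s / a[i][i];
    }

    /* Collectives must be called by EVERY rank, never inside
     * an "if (myid == 0)" branch. */
    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                  xnew, chunk, MPI_DOUBLE, MPI_COMM_WORLD);

    double sum = 0.0;                 /* reset once, outside the loop */
    for (int j = 0; j < n; j++) {
        double d = xnew[j] - x[j];
        sum += d * d;
        x[j] = xnew[j];
    }
    return sqrt(sum);                 /* l2-norm of the update */
}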

Related

Matrix traversal optimization

Given an n x n matrix of ints, I have an algorithm that, at each step of a for loop of range n, traverses and modifies the matrix. Here is the code:
typedef int **Matrix;

void floyd_slow(Matrix dist, int n)
{
    int d;
    for (int k = 0; k < n; k++)
    {
        for (int i = 0; i < n; i++)
        {
            for (int j = 0; j < n; j++)
                if ((d = dist[k][j] + dist[i][k]) < dist[i][j])
                    dist[i][j] = d;
        }
    }
    for (int i = 0; i < n; i++)
        dist[i][i] = 0;
}
The matrix is built as an array of n*n ints, and for each row index i, dist[i] is the address of row i. (The above code is the standard way to write the Floyd-Warshall algorithm, but my question is not about the algorithm itself.) At each step of the loop over k, the underlying matrix is traversed row by row.
Now, consider the following transformation of the previous code:
void relax(Matrix dist, int n, int *rowk, int *colk)
{
    int d;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if ((d = rowk[j] + colk[i]) < dist[i][j])
                dist[i][j] = d;
}

void floyd_fast(Matrix dist, int n)
{
    int i, k;
    int *colk = malloc(n * sizeof(int));
    if (!colk)
        exit(EXIT_FAILURE);
    for (k = 0; k < n; k++)
    {
        int *rowk = dist[k];
        for (i = 0; i < n; i++)
            colk[i] = dist[i][k];
        relax(dist, n, rowk, colk);
    }
    free(colk);
    for (i = 0; i < n; i++)
        dist[i][i] = 0;
}
At every step, the elements of the matrix are accessed in the same order as in the previous algorithm. The only difference is that at each step k of the outer loop, the column of index k is copied into a temporary array (the colk malloc above), so the element at position (i, k) is read from this array instead of being accessed directly from the matrix.
This innocuous change leads to a significant speedup: you gain a factor of 4 for n=1000.
I know that in C it is faster to traverse an array in row-major order, but both versions do that here. So I was wondering why the speedup is so large. Is it related to cache optimisation?
Complete code
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

typedef int **Matrix;

void floyd_slow(Matrix dist, int n)
{
    int d;
    for (int k = 0; k < n; k++)
    {
        for (int i = 0; i < n; i++)
        {
            for (int j = 0; j < n; j++)
                if ((d = dist[k][j] + dist[i][k]) < dist[i][j])
                    dist[i][j] = d;
        }
    }
    for (int i = 0; i < n; i++)
        dist[i][i] = 0;
}

void relax(Matrix dist, int n, int *rowk, int *colk)
{
    int d;
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if ((d = rowk[j] + colk[i]) < dist[i][j])
                dist[i][j] = d;
}

void floyd_fast(Matrix dist, int n)
{
    int i, k;
    int *colk = malloc(n * sizeof(int));
    if (!colk)
        exit(EXIT_FAILURE);
    for (k = 0; k < n; k++)
    {
        int *rowk = dist[k];
        for (i = 0; i < n; i++)
            colk[i] = dist[i][k];
        relax(dist, n, rowk, colk);
    }
    free(colk);
    for (i = 0; i < n; i++)
        dist[i][i] = 0;
}

void print(Matrix dist, int n)
{
    int i, j;
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
            printf("%d ", dist[i][j]);
        printf("\n");
    }
}

void test_slow(Matrix dist, int n)
{
    clock_t now = clock();
    floyd_slow(dist, n);
    // print(dist, n);
    int *p = dist[0];
    free(dist);
    free(p);
    fprintf(stderr, "Elapsed slow: %.2f s\n",
            (double)(clock() - now) / CLOCKS_PER_SEC);
}

void test_fast(Matrix dist, int n)
{
    clock_t now = clock();
    floyd_fast(dist, n);
    // print(dist, n);
    int *p = dist[0];
    free(dist);
    free(p);
    fprintf(stderr, "Elapsed fast: %.2f s\n",
            (double)(clock() - now) / CLOCKS_PER_SEC);
}

int *data(int n)
{
    int N = n * n;
    int *t = malloc(N * sizeof(int));
    if (!t)
        exit(EXIT_FAILURE);
    srand(time(NULL));
    for (int i = 0; i < N; i++)
        t[i] = (1 + rand()) % 10;
    return t;
}

Matrix getMatrix(int *t, int n)
{
    int N = n * n;
    int *tt = malloc(N * sizeof(int));
    Matrix mat = malloc(n * sizeof(int *));
    if (!tt || !mat)
        exit(EXIT_FAILURE);
    memcpy(tt, t, N * sizeof(int));
    for (int i = 0; i < n; i++)
        mat[i] = &tt[i * n];
    return mat;
}

int main(void)
{
    int n = 1000;
    int *t = data(n);
    Matrix mat_slow = getMatrix(data(n), n);
    Matrix mat_fast = getMatrix(data(n), n);
    test_slow(mat_slow, n);
    test_fast(mat_fast, n);
    return 0;
}
Output:
Elapsed slow: 0.58 s
Elapsed fast: 0.14 s
Compilation options:
rm floyd
gcc -Wall -O3 -march=native -ffast-math -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.c -o floyd -lm
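No answer is included above, but one hypothesis worth testing (an editor's sketch, not from the post) is pointer aliasing rather than the cache alone: in floyd_slow the compiler cannot prove that the store to dist[i][j] leaves dist[k][j] and dist[i][k] unchanged, so it must reload them on every iteration and cannot vectorize the inner loop. Copying row k and column k into separate buffers removes that dependency; the restrict qualifiers below express the same guarantee explicitly.
/* Editor's sketch: an aliasing-free inner loop.  The restrict
 * qualifiers promise the compiler that rowk and colk never overlap
 * the rows of dist, so colk[i] can be hoisted out of the j-loop and
 * the j-loop vectorized. */
void relax_restrict(int **dist, int n,
                    const int *restrict rowk,
                    const int *restrict colk)
{
    for (int i = 0; i < n; i++) {
        int *row = dist[i];
        const int cik = colk[i];   /* loaded once per row */
        for (int j = 0; j < n; j++) {
            int d = rowk[j] + cik;
            if (d < row[j])
                row[j] = d;
        }
    }
}
If this variant times like floyd_fast, aliasing, not cache locality, is the dominant effect.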

Send chunks of 3D array using MPI subarray

I'm a newbie in MPI and I'm trying to learn how to use MPI_Type_create_subarray in order to apply it in my projects.
I've spent a lot of time searching for a tutorial that fits my needs, but without success.
So I've tried to generalize the concept in How to use MPI_Type_create_subarray to 3D arrays, but something is still missing.
In particular, my code returns a segmentation fault or shows wrong data when I try to inspect the results.
I can't understand where I made a mistake.
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

void printarr(int ***data, int nx, int ny, int nz, char *str);
int ***allocarray(int nx, int ny, int nz);

int main(int argc, char **argv) {
    /* array sizes */
    const int bigsize = 10;
    const int subsize_x = 2; const int subsize_y = 2; const int subsize_z = 2;
    /* communication parameters */
    const int sender = 0;
    const int receiver = 1;
    const int ourtag = 2;
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size < receiver + 1) {
        if (rank == 0)
            fprintf(stderr, "%s: Needs at least %d processors.\n", argv[0], receiver + 1);
        MPI_Finalize();
        return 1;
    }
    MPI_Datatype mysubarray;
    int starts[3] = {0, 0, 0};
    int subsizes[3] = {subsize_x, subsize_y, subsize_z};
    int bigsizes[3] = {bigsize, bigsize, 3};
    MPI_Type_create_subarray(3, bigsizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &mysubarray);
    MPI_Type_commit(&mysubarray);
    if (rank == sender) {
        int ***bigarray = allocarray(bigsize, bigsize, 3);
        for (int k = 0; k < 3; k++)
            for (int j = 0; j < bigsize; j++)
                for (int i = 0; i < bigsize; i++) {
                    bigarray[k][j][i] = k * (bigsize * bigsize) + j * bigsize + i;
                }
        printarr(bigarray, bigsize, bigsize, 3, " Sender: Big array ");
        MPI_Send(&(bigarray[0][0][0]), 1, mysubarray, receiver, ourtag, MPI_COMM_WORLD);
        MPI_Type_free(&mysubarray);
        free(bigarray);
    } else if (rank == receiver) {
        int ***subarray = allocarray(subsize_x, subsize_y, subsize_z);
        MPI_Recv(&(subarray[0][0][0]), subsizes[0] * subsizes[1] * subsizes[2], MPI_INT, sender, ourtag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printarr(subarray, subsize_x, subsize_y, subsize_z, " Receiver: Subarray -- after receive");
        free(subarray);
    }
    MPI_Finalize();
    return 0;
}

void printarr(int ***data, int nx, int ny, int nz, char *str) {
    printf("-- %s --\n", str);
    for (int k = 0; k < nz; k++) {
        printf("\n\n-----%d------\n", k);
        for (int j = 0; j < ny; j++) {
            for (int i = 0; i < nx; i++) {
                printf("%3d ", data[k][j][i]);
            }
            printf("\n");
        }
    }
}

int ***allocarray(int nx, int ny, int nz) {
    int ***arr = (int ***)malloc(sizeof(int **) * nz);
    for (int k = 0; k < nz; k++) {
        arr[k] = (int **)malloc(sizeof(int *) * ny);
        for (int j = 0; j < ny; j++) {
            arr[k][j] = (int *)malloc(sizeof(int) * nx);
            for (int i = 0; i < nx; i++) {
                arr[k][j][i] = 0;
            }
        }
    }
    return arr;
}
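No answer is included above, but the likely culprit (an editor's note, not from the post) is that allocarray does one malloc per row, so the data is not a single contiguous block, while MPI_Type_create_subarray computes byte offsets from one base address. Note also that bigsizes is {bigsize, bigsize, 3} while the array is indexed bigarray[k][j][i] with k < 3, so under MPI_ORDER_C the slowest-varying dimension should come first. A hedged sketch of a contiguous allocation that still supports arr[k][j][i] indexing:
#include <stdlib.h>

/* Editor's sketch: a 3D array in ONE contiguous block, which is what
 * MPI_Type_create_subarray assumes.  arr[k][j][i] indexing still works
 * because the pointer tables point into the block. */
int ***alloc3d_contiguous(int nx, int ny, int nz)
{
    int *block = malloc((size_t)nz * ny * nx * sizeof *block);
    int **rows = malloc((size_t)nz * ny * sizeof *rows);
    int ***arr = malloc((size_t)nz * sizeof *arr);
    if (!block || !rows || !arr) {
        free(block); free(rows); free(arr);
        return NULL;
    }
    for (int k = 0; k < nz; k++) {
        arr[k] = &rows[k * ny];
        for (int j = 0; j < ny; j++)
            arr[k][j] = &block[(k * ny + j) * nx];
    }
    return arr;   /* free with: free(arr[0][0]); free(arr[0]); free(arr); */
}
With this layout, &(bigarray[0][0][0]) really is the start of all the data; bigsizes should then list the slowest-varying dimension first, e.g. {3, bigsize, bigsize} for arr[k][j][i] under MPI_ORDER_C, and the receiver's buffer must be allocated the same way.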

mpi_allreduce sum over a derived datatype vector

I'm trying to reduce (sum) a derived datatype created by MPI_Type_vector. When I run the code, it crashes and complains that the reduction MPI_SUM is not defined for non-intrinsic datatypes.
I wrote a simple piece of code to show my problem. It tries to reduce the diagonal elements of a 3x3 matrix:
#include "mpi.h"
#include <stdio.h>
int main(int argc, char *argv[]) {
int rank, size, i, j;
double a[3][3] ;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Datatype diag3;
MPI_Type_vector(3,1,4,MPI_DOUBLE,&diag3);
MPI_Type_commit(&diag3);
if(rank==0)
for(i=0; i < 3 ; i++)
for(j=0; j < 3 ; j++)
a[i][j]=1;
if(rank==1)
for(i=0; i < 3 ; i++)
for(j=0; j < 3 ; j++)
a[i][j]=-1;
MPI_Allreduce( MPI_IN_PLACE, &a[0][0], 1, diag3, MPI_SUM, MPI_COMM_WORLD );
for(i=0; i < 3 ; i++)
for(j=0; j < 3 ; j++)
printf("rank=%d\ta[%d][%d]=%f\n",rank,i,j,a[i][j]);
MPI_Finalize();
}
The error after running is something like this:
*** An error occurred in MPI_Allreduce: the reduction operation MPI_SUM is not defined for non-intrinsic datatypes
*** reported by process [140130307538945,1]
*** on communicator MPI_COMM_WORLD
*** MPI_ERR_OP: invalid reduce operation
I thought reductions with MPI_SUM could be performed on a derived datatype, as the MPI documentation suggests. So what's the problem in the code?
Ed Smith is right: you'll need to define your own operation, but it needs to be a little more complicated for non-contiguous types than the version he listed, because MPI hands the user function buffers laid out according to the derived type, so a loop over *len contiguous elements would add the wrong slots. Below is an add_double_vector function that decodes any double vector type and operates on it; it extends relatively straightforwardly to len > 1.
#include "mpi.h"
#include <stdio.h>
void add_double_vector(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
double *invec = in;
double *inoutvec = inout;
int nints, naddresses, ntypes;
int combiner;
if (*len != 1) {
fprintf(stderr,"my_add: len>1 not implemented.\n");
return;
}
MPI_Type_get_envelope(*dtype, &nints, &naddresses, &ntypes, &combiner);
if (combiner != MPI_COMBINER_VECTOR) {
fprintf(stderr,"my_add: do not understand composite datatype.\n");
return;
}
int vecargs [nints];
MPI_Aint vecaddrs[naddresses];
MPI_Datatype vectypes[ntypes];
MPI_Type_get_contents(*dtype, nints, naddresses, ntypes,
vecargs, vecaddrs, vectypes);
if (vectypes[0] != MPI_DOUBLE) {
fprintf(stderr,"my_add: not a vector of DOUBLEs.\n");
}
int count = vecargs[0];
int blocklen = vecargs[1];
int stride = vecargs[2];
for ( int i=0; i<count; i++ ) {
for ( int j=0; j<blocklen; j++) {
inoutvec[i*stride+j] += invec[i*stride+j];
}
}
}
int main(int argc, char *argv[]) {
int rank, size, i, j;
const int n=3;
double a[n][n] ;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Datatype diag3;
MPI_Type_vector(n,1,n+1,MPI_DOUBLE,&diag3);
MPI_Type_commit(&diag3);
if(rank==0)
for(i=0; i < n ; i++)
for(j=0; j < n ; j++)
a[i][j]=1;
if(rank==1)
for(i=0; i < n ; i++)
for(j=0; j < n ; j++)
a[i][j]=-1;
MPI_Op vector_add;
MPI_Op_create( add_double_vector, 1, &vector_add );
MPI_Allreduce( MPI_IN_PLACE, &a[0][0], 1, diag3, vector_add, MPI_COMM_WORLD );
MPI_Op_free( &vector_add );
for(i=0; i < n ; i++)
for(j=0; j < n ; j++)
printf("rank=%d\ta[%d][%d]=%f\n",rank,i,j,a[i][j]);
MPI_Finalize();
}
Compiling and running gives the correct answer:
$ mpicc -o foo foo.c -std=c99
$ mpirun -np 2 ./foo
rank=1 a[0][0]=0.000000
rank=1 a[0][1]=-1.000000
rank=1 a[0][2]=-1.000000
rank=1 a[1][0]=-1.000000
rank=1 a[1][1]=0.000000
rank=1 a[1][2]=-1.000000
rank=1 a[2][0]=-1.000000
rank=1 a[2][1]=-1.000000
rank=1 a[2][2]=0.000000
rank=0 a[0][0]=0.000000
rank=0 a[0][1]=1.000000
rank=0 a[0][2]=1.000000
rank=0 a[1][0]=1.000000
rank=0 a[1][1]=0.000000
rank=0 a[1][2]=1.000000
rank=0 a[2][0]=1.000000
rank=0 a[2][1]=1.000000
rank=0 a[2][2]=0.000000
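As an editor's aside (a sketch under stated assumptions, not part of the answer): the len > 1 extension mentioned above needs only the datatype's extent, since MPI places successive items one extent apart in the reduction buffers. Error messages are dropped for brevity:
#include <mpi.h>

/* Editor's sketch: add_double_vector generalized to *len > 1.
 * Successive items in the buffer are spaced by the datatype's extent. */
void add_double_vector_n(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
    int nints, naddresses, ntypes, combiner;
    MPI_Type_get_envelope(*dtype, &nints, &naddresses, &ntypes, &combiner);
    if (combiner != MPI_COMBINER_VECTOR)
        return;                            /* only vector types handled */

    int vecargs[3];                        /* count, blocklength, stride */
    MPI_Aint vecaddrs[1];                  /* unused: naddresses == 0    */
    MPI_Datatype vectypes[1];
    MPI_Type_get_contents(*dtype, nints, naddresses, ntypes,
                          vecargs, vecaddrs, vectypes);
    if (vectypes[0] != MPI_DOUBLE)
        return;

    int count = vecargs[0], blocklen = vecargs[1], stride = vecargs[2];
    MPI_Aint lb, extent;
    MPI_Type_get_extent(*dtype, &lb, &extent);

    for (int item = 0; item < *len; item++) {
        double *invec    = (double *)((char *)in    + item * extent);
        double *inoutvec = (double *)((char *)inout + item * extent);
        for (int i = 0; i < count; i++)
            for (int j = 0; j < blocklen; j++)
                inoutvec[i * stride + j] += invec[i * stride + j];
    }
}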
I think the error is because there is no defined way to add the vectors you have created. If you define your own sum operation:
#include "mpi.h"
#include <stdio.h>
void mySum ( int *, int *, int *, MPI_Datatype * );
void mySum(int *invec, int *inoutvec, int *len, MPI_Datatype *dtype)
{
int i;
for ( i=0; i<*len; i++ )
inoutvec[i] += invec[i];
}
int main(int argc, char *argv[]) {
int rank, size, i, j;
double a[3][3] ;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Datatype diag3;
MPI_Type_vector(3,1,4,MPI_DOUBLE,&diag3);
MPI_Type_commit(&diag3);
MPI_Op diagSum;
MPI_Op_create( (MPI_User_function *)mySum, 1, &diagSum );
if(rank==0)
for(i=0; i < 3 ; i++)
for(j=0; j < 3 ; j++)
a[i][j]=i+j;
if(rank==1)
for(i=0; i < 3 ; i++)
for(j=0; j < 3 ; j++)
a[i][j]=-1;
MPI_Allreduce( MPI_IN_PLACE, &a[0][0], 1, diag3, diagSum, MPI_COMM_WORLD );
for(i=0; i < 3 ; i++)
for(j=0; j < 3 ; j++)
printf("rank=%d\ta[%d][%d]=%f\n",rank,i,j,a[i][j]);
MPI_Op_free( &diagSum );
MPI_Finalize();
}

C parallel implementation of Gauss elimination with MPI [closed]

I'm very new to MPI and I was asked to write a C parallel implementation of Gauss elimination (without pivoting).
I gave it a try (I used a row-wise decomposition) but my code doesn't work. I'm hoping someone can give me some pointers. I've been looking for what's wrong for a few days already, without success :(
Thank you in advance!
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int i, j, k;
    int map[500];
    float A[500][500], b[500], c[500], x[500], sum = 0.0;
    double range = 1.0;
    int n = 3;
    int rank, nprocs;
    clock_t begin1, end1, begin2, end2;
    MPI_Status status;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   /* get current process id  */
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* get number of processes */
    //////////////////////////////////////////////////////////////////////////////////
    if (rank == 0)
    {
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
                A[i][j] = range * (1.0 - 2.0 * (double)rand() / RAND_MAX);
            b[i] = range * (1.0 - 2.0 * (double)rand() / RAND_MAX);
        }
        printf("\n Matrix A (generated randomly):\n");
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
                printf("%9.6lf ", A[i][j]);
            printf("\n");
        }
        printf("\n Vector b (generated randomly):\n");
        for (i = 0; i < n; i++)
            printf("%9.6lf ", b[i]);
        printf("\n\n");
    }
    //////////////////////////////////////////////////////////////////////////////////
    begin1 = clock();
    MPI_Bcast(A, n * n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    for (i = 0; i < n; i++)
    {
        map[i] = i % nprocs;
    }
    for (k = 0; k < n; k++)
    {
        MPI_Bcast(&A[k][k], n - k, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
        MPI_Bcast(&b[k], 1, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
        for (i = k + 1; i < n; i++)
        {
            if (map[i] == rank)
            {
                c[i] = A[i][k] / A[k][k];
            }
        }
        for (i = k + 1; i < n; i++)
        {
            if (map[i] == rank)
            {
                for (j = 0; j < n; j++)
                {
                    A[i][j] = A[i][j] - (c[i] * A[k][j]);
                }
                b[i] = b[i] - (c[i] * b[k]);
            }
        }
    }
    end1 = clock();
    //////////////////////////////////////////////////////////////////////////////////
    begin2 = clock();
    if (rank == 0)
    {
        x[n - 1] = b[n - 1] / A[n - 1][n - 1];
        for (i = n - 2; i >= 0; i--)
        {
            sum = 0;
            for (j = i + 1; j < n; j++)
            {
                sum = sum + A[i][j] * x[j];
            }
            x[i] = (b[i] - sum) / A[i][i];
        }
        end2 = clock();
    }
    //////////////////////////////////////////////////////////////////////////////////
    if (rank == 0)
    {
        printf("\nThe solution is:");
        for (i = 0; i < n; i++)
        {
            printf("\nx%d=%f\t", i, x[i]);
        }
        printf("\n\nLU decomposition time: %f", (double)(end1 - begin1) / CLOCKS_PER_SEC);
        printf("\nBack substitution time: %f\n", (double)(end2 - begin2) / CLOCKS_PER_SEC);
    }
    return (0);
    MPI_Finalize();
}
And this is the error I'm getting:
mpirun has exited due to process rank 1 with PID XXXX on node XXXX exiting without calling "finalize". This may have caused other processes in the application to be terminated by signals sent by mpirun (as reported here).
As noticed by High Performance Mark, add MPI_Finalize() before return(0). The code will then run without reporting any problem... but the result will still be incorrect: in parallel, it prints nan as the result, which is wrong.
The problem comes from MPI_Bcast(A, n*n, MPI_DOUBLE, ...). A is defined as float A[500][500]; pass &A[0][0], the address of the first element. Moreover, if you send n*n elements (n=3), you send A[0][0], ..., A[0][8], and elements such as A[1][1] are left uninitialized on the receiving ranks. This can cause wrong results, such as nan. For the sake of simplicity (laziness...), you may change the count to 500*500.
MPI_DOUBLE corresponds to double precision, so the solution is either to declare double A[500][500] or to call MPI_Bcast(&A[0][0], 500*500, MPI_FLOAT, ...). Do the same for b.
This deterministic use of rand() is really useful for debugging purposes... but do not forget to use srand() to seed your random generator!
EDIT: here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int i, j, k;
    int map[500];
    double A[500][500], b[500], c[500], x[500], sum = 0.0;
    double range = 1.0;
    int n = 3;
    int rank, nprocs;
    clock_t begin1, end1, begin2, end2;
    MPI_Status status;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   /* get current process id  */
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs); /* get number of processes */
    //////////////////////////////////////////////////////////////////////////////////
    if (rank == 0)
    {
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
                A[i][j] = range * (1.0 - 2.0 * (double)rand() / RAND_MAX);
            b[i] = range * (1.0 - 2.0 * (double)rand() / RAND_MAX);
        }
        printf("\n Matrix A (generated randomly):\n");
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
                printf("%9.6lf ", A[i][j]);
            printf("\n");
        }
        printf("\n Vector b (generated randomly):\n");
        for (i = 0; i < n; i++)
            printf("%9.6lf ", b[i]);
        printf("\n\n");
    }
    //////////////////////////////////////////////////////////////////////////////////
    begin1 = clock();
    MPI_Bcast(&A[0][0], 500 * 500, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    for (i = 0; i < n; i++)
    {
        map[i] = i % nprocs;
    }
    for (k = 0; k < n; k++)
    {
        MPI_Bcast(&A[k][k], n - k, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
        MPI_Bcast(&b[k], 1, MPI_DOUBLE, map[k], MPI_COMM_WORLD);
        for (i = k + 1; i < n; i++)
        {
            if (map[i] == rank)
            {
                c[i] = A[i][k] / A[k][k];
            }
        }
        for (i = k + 1; i < n; i++)
        {
            if (map[i] == rank)
            {
                for (j = 0; j < n; j++)
                {
                    A[i][j] = A[i][j] - (c[i] * A[k][j]);
                }
                b[i] = b[i] - (c[i] * b[k]);
            }
        }
    }
    end1 = clock();
    //////////////////////////////////////////////////////////////////////////////////
    begin2 = clock();
    if (rank == 0)
    {
        x[n - 1] = b[n - 1] / A[n - 1][n - 1];
        for (i = n - 2; i >= 0; i--)
        {
            sum = 0;
            for (j = i + 1; j < n; j++)
            {
                sum = sum + A[i][j] * x[j];
            }
            x[i] = (b[i] - sum) / A[i][i];
        }
        end2 = clock();
    }
    //////////////////////////////////////////////////////////////////////////////////
    if (rank == 0)
    {
        printf("\nThe solution is:");
        for (i = 0; i < n; i++)
        {
            printf("\nx%d=%f\t", i, x[i]);
        }
        printf("\n\nLU decomposition time: %f", (double)(end1 - begin1) / CLOCKS_PER_SEC);
        printf("\nBack substitution time: %f\n", (double)(end2 - begin2) / CLOCKS_PER_SEC);
    }
    MPI_Finalize();
    return (0);
}
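As a side note (an editor's sketch, not part of the answer): instead of broadcasting all 500*500 entries, a strided vector type can describe just the n x n corner that is actually used. This snippet would drop into the fixed code above in place of the first broadcast of A:
/* Editor's sketch: broadcast only the top-left n x n corner of
 * double A[500][500].  Each of the n rows contributes n doubles,
 * and consecutive rows are 500 doubles apart in memory. */
MPI_Datatype corner;
MPI_Type_vector(n, n, 500, MPI_DOUBLE, &corner);
MPI_Type_commit(&corner);
MPI_Bcast(&A[0][0], 1, corner, 0, MPI_COMM_WORLD);
MPI_Type_free(&corner);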
I'm not much of a C programmer but it looks to me as if you have probably called return prematurely. Specifically you have called it before MPI_Finalize(). Try swapping the order of the statements. Or even dropping the return altogether.

MPI partition array into blocks and Send

I am trying to find the maximum element of an array using MPI in C. I have to compare the time it takes to distribute the data and compute the maximum using MPI_Send versus MPI_Scatter. Here is the code for the MPI_Scatter version; it works great:
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define lim 20
//returns "a-b" in seconds
double timeval_diff(struct timeval *a, struct timeval *b)
{
return
(double)(a->tv_sec + (double)a->tv_usec/1000000) -
(double)(b->tv_sec + (double)b->tv_usec/1000000);
}
//Array to be divided among the processes
int buf[lim]=
{27,24,3,8,45,10,50,15,10,11,9,48,69,25,19,29,61,72,93,20};
int buf2[lim];
int buf3[lim];
int max;
int main(int argc, char *argv[])
{
struct timeval t_ini, t_fin;
double secs;
int n, myid, numprocs, i,j;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stderr,"Process %d in %s\n",myid, processor_name);
/*Check Border Conditions */
n=lim/numprocs;
gettimeofday(&t_ini, NULL); //take the time before sending the buffer with Scatter
MPI_Scatter(buf,n, MPI_INT,buf2,n,MPI_INT, 0, MPI_COMM_WORLD);
gettimeofday(&t_fin, NULL);//take the time to complete the send routine
secs = timeval_diff(&t_fin, &t_ini);
MPI_Reduce(buf2,buf3,n, MPI_INT, MPI_MAX, 0,MPI_COMM_WORLD);
if (myid == 0)
{ max = buf3[0];
for (i=1; i<n ; i++)
if (max < buf3[i]) max = buf3[i];
for (i=0; i<n ; i++)
printf("Buf3[%d]= %d \n", i, buf3[i]);
printf("Max number of the array is: %d \n", max);
}
for (i=0; i<n ; i++){
printf("%d,Buf2[%d]= %d \n",myid, i,buf2[i]);}
printf("%.16g milliseconds\n", secs * 1000.0);
MPI_Finalize();
return 0;
}
The problem comes when I try to do the same procedure with MPI_Send: the maximum it computes is wrong. What am I doing wrong?
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define lim 20
//returns "a-b" in seconds
double timeval_diff(struct timeval *a, struct timeval *b)
{
return
(double)(a->tv_sec + (double)a->tv_usec/1000000) -
(double)(b->tv_sec + (double)b->tv_usec/1000000);
}
//Array to be divided among the processes
int buf[lim]=
{27,24,3,8,45,10,50,15,10,11,9,48,69,25,19,29,61,72,93,20};
int buf2[lim];
int buf3[lim];
int max;
int main(int argc, char *argv[])
{
struct timeval t_ini, t_fin;
double secs;
int n, myid, numprocs, i,j;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stderr,"Process %d in %s\n",myid, processor_name);
/*Check Border Conditions */
n=lim/numprocs;
gettimeofday(&t_ini, NULL); //take the time before sending the buffer with Scatter
for (j=0;j<n;j++){
MPI_Send(buf, lim, MPI_INT, 1, 111, MPI_COMM_WORLD);
}
gettimeofday(&t_fin, NULL);//take the time to complete the send routine
secs = timeval_diff(&t_fin, &t_ini);
if (myid == 0)
{ max = buf3[0];
for (i=1; i<n ; i++)
if (max < buf3[i]) max = buf3[i];
for (i=0; i<n ; i++)
printf("Buf3[%d]= %d \n", i, buf3[i]);
printf("Max number of the array is: %d \n", max);
}
for (i=0; i<n ; i++){
printf("%d,Buf2[%d]= %d \n",myid, i,buf2[i]);}
printf("%.16g milliseconds\n", secs * 1000.0);
MPI_Finalize();
return 0;
}
I have wasted some hours looking for the fault but I cannot see it... Any help?
You are missing the MPI_Recv call on the other end of your MPI_Send call. These point-to-point functions are lower-level than the collective scatter, gather, reduce, and broadcast functions.
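A minimal sketch of the matching pair described above (an editor's illustration; the tag 111 and the buffers come from the question's code):
/* Editor's sketch: point-to-point needs both ends.  Without the
 * matching receive, the send has no destination and buf2 is never
 * filled.  The rank guard ensures only the intended ranks take part. */
if (myid == 0) {
    MPI_Send(buf, n, MPI_INT, 1, 111, MPI_COMM_WORLD);   /* to rank 1 */
} else if (myid == 1) {
    MPI_Recv(buf2, n, MPI_INT, 0, 111, MPI_COMM_WORLD,
             MPI_STATUS_IGNORE);                          /* from rank 0 */
}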
