I'm trying to get values into my spawned processes using collective MPI functions.
In this case I have an N*N matrix and I want to pass one row to each process, have each process receive its row, and sum its values.
I'm using this example:
MPI_Scatter of 2D array and malloc
main
int main(int argc, char *argv[]){
    int *n, range, i, j, dato, resultado;
    int *matriz;
    char *nombre_esclave="esclavo";

    //MPI Section
    int rank, size;
    MPI_Comm hijos;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    matriz = createMatrix(N, N);
    printArray(matriz, N * N);

    //Child process
    MPI_Comm_spawn("slave", MPI_ARGV_NULL, N, MPI_INFO_NULL, 0, MPI_COMM_SELF, &hijos, MPI_ERRCODES_IGNORE);

    // received row will contain N integers
    int *procRow = malloc(sizeof(int) * N);

    MPI_Scatter(matriz, N, MPI_INT,   // send one row, which contains N integers
                procRow, N, MPI_INT,  // receive one row, which contains N integers
                MPI_ROOT, hijos);

    MPI_Finalize();
    return 0;
}
and in slave
slave
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_get_parent(&parent);

if (parent != MPI_COMM_NULL) {
    printf("This is a child process\n");
}

//number of processes in the remote group of comm (integer)
MPI_Comm_remote_size(parent, &size);

int *procRow = malloc(sizeof(int) * N);

//UNABLE TO GET VALUES FROM THE PARENT
//I need to sum all the values in every portion of the matrix
//passed to every child process
MPI_Reduce(procRow, &resultado_global, N, MPI_INT, MPI_SUM, 0, parent);
UPDATE
With MPI_Comm_spawn I create 3 children. In every child I want to get one row of the matrix (I use scatter in the master). Later I use MPI_Reduce to sum each row in the child (that's what I mean by getting the values).
UPDATE 2
In the slave I have modified the code and now I get the rows in every process.
if (parent != MPI_COMM_NULL) {
    //number of processes in the remote group of comm (integer)
    MPI_Comm_remote_size(parent, &size_remote);

    int *matrix = malloc(sizeof(int) * size);
    int *procRow = malloc(sizeof(int) * size);

    MPI_Scatter(matrix, N, MPI_INT, procRow, N, MPI_INT, 0, parent);

    //procRow gets the values correctly from each row of the matrix
    if (procRow != NULL) {
        printf("Process %d; %d %d %d \n", pid, procRow[0], procRow[1], procRow[2]);
    }

    //Unable to sum each row
    MPI_Reduce(procRow, &resultado_global, size, MPI_INT, MPI_SUM, ROOT, parent);
    //MPI_Reduce(procRow, &resultado_global, size, MPI_INT, MPI_SUM, ROOT, MPI_COMM_WORLD);
}
UPDATE 3 (SOLVED)
IN SLAVE
if (parent != MPI_COMM_NULL) {
    //number of processes in the remote group of comm (integer)
    MPI_Comm_remote_size(parent, &size_remote);

    int *matrix = malloc(sizeof(int) * size);
    int *procRow = malloc(sizeof(int) * size);

    MPI_Scatter(matrix, N, MPI_INT, procRow, N, MPI_INT, 0, parent);

    if (procRow != NULL) {
        printf("Process %d; %d %d %d \n", pid, procRow[0], procRow[1], procRow[2]);
        sumaParcial = 0;
        for (int i = 0; i < N; i++)
            sumaParcial = sumaParcial + procRow[i];
    }

    MPI_Reduce(&sumaParcial, &resultado_global, 1, MPI_INT, MPI_SUM, ROOT, parent);
}
IN MASTER
// received row will contain N integers
int *procRow = malloc(sizeof(int) * N);

MPI_Scatter(matriz, N, MPI_INT,   // send one row, which contains N integers
            procRow, N, MPI_INT,  // receive one row, which contains N integers
            MPI_ROOT, hijos);

MPI_Reduce(&sumaParcial, &resultado_global, 1, MPI_INT, MPI_SUM, MPI_ROOT, hijos);

printf("\n GLOBAL RESULT :%d\n", resultado_global);
Any idea?
Thanks
From the edit I suppose that the scatter is working correctly.
Your main confusion seems to be about MPI_Reduce. It does not do any local reduction. According to your graphic, you want to have the values 6, 15, 24 at ranks 0, 1, 2 in the slaves. That is done entirely without MPI, just by iterating over the local row.
An MPI_Reduce on the rows would leave the root with [12, 15, 18]. If you just want the total sum 45 at the root of the slaves, you should first sum the values locally and then MPI_Reduce the single values from each rank to one global value.
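As a minimal sketch of that two-step pattern on the slave side (reusing sumaParcial, resultado_global and procRow from your code; I'm assuming ROOT is 0, the rank of the master in the parent intercommunicator):
// local reduction: a plain C loop, no MPI involved
int sumaParcial = 0;
for (int i = 0; i < N; i++)
    sumaParcial += procRow[i];

// global reduction: one int per slave, summed at the master through the parent intercommunicator
int resultado_global = 0;
MPI_Reduce(&sumaParcial, &resultado_global, 1, MPI_INT, MPI_SUM, 0, parent);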
Related
Below is my code. Currently it tries to distribute the work for a 1D representation of a matrix (2D array). I MPI_Scatter the portion of the array that needs work and store that portion into local_C, which should be the same size as the portion sent. I also broadcast M (columns), Q (used by the gather as the column size), ....
int main(int argc, char *argv[]) {
    int rank;
    int nproc;
    int n_local;
    int N; // rows
    int M; // rows/columns
    int Q; // columns

    MPI_Init (&argc, &argv); /* initialize MPI */
    MPI_Comm comm = MPI_COMM_WORLD;
    MPI_Comm_size(comm, &nproc);
    MPI_Comm_rank(comm, &rank);

    int *matrixA;
    int *matrixB;
    int *matrixC;
    int *local_C;

    // manager core constructs factors of matrix representation
    if (rank == 0) {
        N = atoi(argv[1]);
        M = atoi(argv[2]);
        Q = atoi(argv[3]);
        // check if correct number of input
        if (argc != 4) {
            printf("Enter <filename> <N> <M> <Q>\n");
            exit(1);
        }
        else if (N % nproc != 0) { // check if N is a multiple of the number of processors
            printf("Ensure N is divisible by number of processors: %i\n", nproc);
            exit(1);
        }
        // create matrices of size
        matrixA = malloc(N * M * sizeof(long));
        randomlyFillArray(matrixA, N * M);
        matrixB = malloc(M * Q * sizeof(long));
        randomlyFillArray(matrixB, M * Q);
        // create resulting product matrix of size
        matrixC = malloc(N * Q * sizeof(long));
        // sequential compute
        //computeMatrixProductSequentially(matrixA, matrixB, matrixC, M, N, Q);

        // parallel compute
        // block data
        n_local = N / nproc;
        local_C = malloc(n_local * M * sizeof(long));
        MPI_Bcast(&M, 1, MPI_INT, 0, comm);
        MPI_Bcast(&Q, 1, MPI_INT, 0, comm);
        MPI_Bcast(&n_local, 1, MPI_INT, 0, comm);
        // scatter matrixA for n_local row to cores
        MPI_Scatter(&matrixA, n_local * M, MPI_LONG, &local_C, n_local * M, MPI_LONG, 0, comm);
        // broadcast matrixB to all cores
        MPI_Bcast(&matrixB, 1, MPI_LONG, 0, comm);
    }
    else {
        MPI_Bcast(&M, 1, MPI_INT, 0, comm);
        MPI_Bcast(&Q, 1, MPI_INT, 0, comm);
        MPI_Bcast(&n_local, 1, MPI_INT, 0, comm);
        // scatter recv matrixA row
        MPI_Scatter(&matrixA, n_local * M, MPI_LONG, &local_C, n_local * M, MPI_LONG, 0, comm);
        // broadcast recv matrixB
        MPI_Bcast(&matrixB, 1, MPI_LONG, 0, comm);
        //MPI_Gather();
    }
    MPI_Finalize();
    return 0;
}
Here is the error when trying to compile and run the program.
The purpose, in case it matters, is to multiply two matrices in parallel using 1d arrays.
The problem with your code is that MPI calls take an int*, double*, or whatever-simple-type* argument as the buffer. Your matrixA is an int*, so using &matrixA makes the buffer an int**. Solution: pass matrixA directly as the buffer.
Also: you are coding as if the scatter operation on non-zero ranks creates the matrix. That is not the case. You need to allocate the array yourself, and MPI will write the values into it.
Another remark: scattering a full matrix is not a scalable solution and is bad MPI coding. It introduces both a memory bottleneck, because process zero needs to be able to store all the data, and a time bottleneck, because all other processes have to wait for process zero to construct the matrix. The right way to code this is to let each process construct its own part of the matrix. Always keep your data structures distributed from start to end!
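As a rough, self-contained sketch of those two fixes (buffers passed directly, and a receive buffer allocated on every rank) with toy sizes: the buffer name local_A is mine, and I use long* throughout since the original mixes int* declarations with sizeof(long) and MPI_LONG.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    int rank, nproc;
    MPI_Comm comm = MPI_COMM_WORLD;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(comm, &nproc);
    MPI_Comm_rank(comm, &rank);

    int N = 4, M = 4, n_local = 0;   // toy sizes; assumes N % nproc == 0
    long *matrixA = NULL;            // only allocated and filled on the root

    if (rank == 0) {
        matrixA = malloc(N * M * sizeof(long));
        for (int i = 0; i < N * M; i++)
            matrixA[i] = i;
        n_local = N / nproc;
    }

    // every rank takes part in the broadcasts, outside the rank-0 branch
    MPI_Bcast(&M, 1, MPI_INT, 0, comm);
    MPI_Bcast(&n_local, 1, MPI_INT, 0, comm);

    // every rank allocates its own chunk of rows
    long *local_A = malloc(n_local * M * sizeof(long));

    // pass the buffers themselves, not their addresses
    MPI_Scatter(matrixA, n_local * M, MPI_LONG,
                local_A, n_local * M, MPI_LONG, 0, comm);

    printf("rank %d received first element %ld\n", rank, local_A[0]);

    free(local_A);
    if (rank == 0) free(matrixA);
    MPI_Finalize();
    return 0;
}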
I have two MPI processes (rank 0 and 1), each holding a 3D matrix (x0, y, z) and (x1, y, z) with a different size along the x dimension. I would like to send a specific plane (x0 constant, y, z) from the first process to the second and replace one of its faces (x1 constant, y, z). The following code I made seems to work well when the two matrices have identical dimensions (even in x), but does not send the right face when x0 != x1:
double ***alloc2(int x, int y, int z){
    int i, j;
    double ***array = (double ***) malloc(sizeof(double ***)*x);
    for (i = 0; i < x; i++){
        array[i] = (double **) malloc(sizeof(double*)*y);
        for (j = 0; j < y; j++){
            array[i][j] = (double *) malloc(sizeof(double)*z);
        }
    }
    return array;
}
int main(int argc, char *argv[]){
    MPI_Status status;
    MPI_Comm_size(MPI_COMM_WORLD, &nbr);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Some long code I skipped */
    /* ... */

    MPI_Datatype sub;
    MPI_Type_vector(nL+1, nL+1, nL_thread, MPI_DOUBLE, &sub);
    MPI_Type_commit(&sub);

    if(rank == 0){
        MPI_Send(&c_new[3][0][0], 1, sub, rank+1, 01, MPI_COMM_WORLD);
        MPI_Recv(&c_new[4][0][0], 1, sub, rank+1, 10, MPI_COMM_WORLD, &status);
    }
    if(rank == 1){
        MPI_Recv(&c_new[0][0][0], 1, sub, rank-1, 01, MPI_COMM_WORLD, &status);
        MPI_Send(&c_new[1][0][0], 1, sub, rank-1, 10, MPI_COMM_WORLD);
    }
}
nL is the length in the y and z dimensions, the same for all processes; nL_thread is the x dimension (in this particular case, nL_thread = 3 for rank 1 and 4 for rank 0). Here I am trying to replace the face (0, y, z) of rank 1 with (3, y, z) of rank 0, and (4, y, z) of rank 0 with (1, y, z) of rank 1.
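I can't see the skipped allocation code, but as a self-contained illustration of what MPI_Type_vector(count, blocklength, stride, ...) actually describes, here is a sketch using a single contiguous malloc for the whole x*y*z block. That contiguity is my assumption; with the pointer-of-pointers alloc2 above, the individual z-rows come from separate mallocs, so a plane is not guaranteed to sit at one fixed stride in memory.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/* One contiguous block, index (i, j, k) -> i*(Y*Z) + j*Z + k */
#define X 4
#define Y 3
#define Z 3

int main(int argc, char *argv[]){
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    double *a = malloc(X * Y * Z * sizeof(double));
    for (int i = 0; i < X * Y * Z; i++)
        a[i] = rank * 1000 + i;

    /* A (y,z) face at fixed x is already contiguous: Y*Z doubles starting at
       &a[x*Y*Z], so it needs no derived type at all.
       An (x,z) slice at fixed y is X blocks of Z doubles, each Y*Z apart:
       that is exactly what MPI_Type_vector expresses. */
    MPI_Datatype xz_slice;
    MPI_Type_vector(X, Z, Y * Z, MPI_DOUBLE, &xz_slice);
    MPI_Type_commit(&xz_slice);

    if (size >= 2) {
        if (rank == 0)       /* send the slice y = 1 */
            MPI_Send(&a[1 * Z], 1, xz_slice, 1, 0, MPI_COMM_WORLD);
        else if (rank == 1)  /* overwrite the local slice y = 1 */
            MPI_Recv(&a[1 * Z], 1, xz_slice, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    MPI_Type_free(&xz_slice);
    free(a);
    MPI_Finalize();
    return 0;
}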
On every processor I have a list range of numbers, and I want to determine, row by row, the maximum over all of these lists.
The first four lists are the range lists of the processors P0-P3; the red list contains the maximum values of each row, which every processor should get after MPI_Allreduce.
Here is a working version of my code:
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <mpi.h>
//#define KEY_MAX 100
typedef struct{
    int myrank;
    int numprocs;
    int *range;
} SubDomainKeyTree;

void compRange(SubDomainKeyTree *s, int myrank, int numprocs){
    s->myrank = myrank;
    s->numprocs = numprocs;
    // Allocate memory for (numprocs+1) ranges
    s->range = malloc((numprocs+1) * sizeof(int));
    // Compute range values
    for(int p=0; p<=numprocs; p++){
        s->range[p] = rand()%100;
    }
    for(int p=0; p<s->numprocs; p++){
        if(s->myrank == p){
            for(int k=0; k<=s->numprocs; k++){
                printf("Processor %d: %d random number is %d\n", p, k, s->range[k]);
            }
            printf("\n");
        }
    }
}
void compDynRange(SubDomainKeyTree *s){
    int rangeForAll[s->numprocs+1];

    //////////////////////////////////
    // This is not really efficient //
    //////////////////////////////////
    for(int r=0; r<=s->numprocs; r++){
        MPI_Allreduce(&s->range[r], &rangeForAll[r], 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
    }

    for(int p=0; p<s->numprocs; p++){
        if(s->myrank == p){
            for(int k=0; k<=s->numprocs; k++){
                s->range[k] = rangeForAll[k];
                printf("Processor %d: %d random number after MPI_Allreduce is %d\n", p, k, s->range[k]);
            }
            printf("\n");
        }
    }
}
int main(int argc, char **argv){
    int nameLen;
    char processorName[MPI_MAX_PROCESSOR_NAME];
    int myrank;   // Rank of processor
    int numprocs; // Number of processes

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Get_processor_name(processorName, &nameLen);
    MPI_Status status;

    time_t t;
    srand((unsigned)time(NULL) + myrank*numprocs + nameLen);

    SubDomainKeyTree s;
    compRange(&s, myrank, numprocs);
    compDynRange(&s);

    MPI_Finalize();
    return 0;
}
I use a for-loop which seems highly inefficient to me. Here I compute the maximum value of every row of all lists one after the other.
But can I use MPI_Allreduce without that for-loop?
I already tried the following instead of the for-loop, but it does not work.
MPI_Allreduce(&s->range, &rangeForAll, s->numprocs+1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
Can someone give me a hint how I can do that?
As already hinted in a comment, the error you had in your code was that instead of passing the arrays containing your send and receive buffers, you were passing pointers to them. I imagine that this error simply came from the change from the single element used initially (like &s->range[r]), which was perfectly correct, to the full array by just removing the indexed access (i.e. &s->range), which was wrong.
So as explained, using:
MPI_Allreduce(s->range, rangeForAll, s->numprocs+1, MPI_INT, MPI_MAX, MPI_COMM_WORLD)
just does the trick. However, since you want to get the results into the s->range arrays rather than the temporary rangeForAll ones, you'd be better off not defining the latter at all and using the MPI_IN_PLACE keyword as the sending parameter and s->range as the receiving one. The call becomes:
MPI_Allreduce(MPI_IN_PLACE, s->range, s->numprocs+1, MPI_INT, MPI_MAX, MPI_COMM_WORLD)
and s->range acts both as the sending and the receiving buffer. The final results will all be in the s->range buffers after the call, sparing you the explicit copy.
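For completeness, here is a small standalone sketch of that MPI_IN_PLACE pattern outside of your SubDomainKeyTree structure (the sizes and the random fill are just placeholders):
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

/* Every rank fills its own range[]; after the in-place Allreduce the
   element-wise maximum ends up back in range[] on all ranks. */
int main(int argc, char **argv) {
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int n = nprocs + 1;
    int *range = malloc(n * sizeof(int));
    srand(rank + 1);
    for (int i = 0; i < n; i++)
        range[i] = rand() % 100;

    // send buffer == receive buffer, so pass MPI_IN_PLACE as sendbuf
    MPI_Allreduce(MPI_IN_PLACE, range, n, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    if (rank == 0)
        for (int i = 0; i < n; i++)
            printf("row %d max = %d\n", i, range[i]);

    free(range);
    MPI_Finalize();
    return 0;
}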
I am trying to parallelize the following code for the calculation of pi.
My approach is to use a scatter to parallelize the for loop, then use a reduce to compute the sum, and finally print pi.
My code is the following
#include <stdio.h>
#include <mpi.h>

long num_steps = 100000;
double step = 1.0/100000.0;

int main() {
    int i, myid, size;
    double x, pi, local_sum = 0.0, sum = 0.0;
    double send_vec[num_steps], recv_vect[num_steps];

    // Initialize the MPI environment
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    if (myid == 0){
        int i=0;
        for (i=0; i<num_steps; i++){
            send_vec[i]=i;
        }
    }

    MPI_Scatter(send_vec, num_steps/size, MPI_INT, recv_vect,
                num_steps, MPI_INT, 0, MPI_COMM_WORLD);

    for(i = 0; i < num_steps; ++i) {
        x = (recv_vect[i]-0.5)*step;
        local_sum += 4.0/(1.0+x*x);
    }

    MPI_Reduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myid == 0){
        pi = step*sum;
        printf("PI value = %f\n", pi);
    }

    // Finalize the MPI environment.
    MPI_Finalize();
}
The thing is, when I run the program with the option -np 1 or 2, I do get the desired result.
Yet when I run with 3, 4 or more, I get the following error:
PIC_Send(284).........: Negative count, value is -240000
Fatal error in PMPI_Scatter: Invalid count, error stack
The call to MPI_Scatter() is to be corrected:
MPI_Scatter(send_vec, num_steps/size, MPI_INT, recv_vect,
num_steps, MPI_INT, 0, MPI_COMM_WORLD);
To send doubles, use the datatype MPI_DOUBLE as you did in the MPI_Reduce().
Since the sendtype is the same as the recvtype, the number of items sent to each process (sendcount) must be equal to the number of items received by each process (recvcount). In the present case, it's num_steps/size.
Finally, the call to MPI_Scatter() will look like:
MPI_Scatter(send_vec, num_steps/size, MPI_DOUBLE, recv_vect,
num_steps/size, MPI_DOUBLE, 0, MPI_COMM_WORLD);
Lastly, dynamic memory allocation can be used to avoid using the stack for storing large arrays. Moreover, the allocated space can be decreased so as to reduce the memory footprint:
num_steps=(num_steps/size)*size;
double* send_vec=NULL;
double* recv_vec=NULL;
if(myid==0){
    // the full vector only lives on the root
    send_vec=malloc(num_steps*sizeof(double));
    if(send_vec==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
}
// every rank only needs room for its own chunk
recv_vec=malloc((num_steps/size)*sizeof(double));
if(recv_vec==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
...
if(myid==0){
    free(send_vec);
}
free(recv_vec);
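Putting these pieces together, a complete sketch might look like the following; note that it also shortens the local loop to num_steps/size iterations, since after the scatter each rank only holds that many indices (treat that extra change as my addition):
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    long num_steps = 100000;
    double step = 1.0 / 100000.0;
    int myid, size;
    double x, pi, local_sum = 0.0, sum = 0.0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    num_steps = (num_steps / size) * size;   // make it divisible by size
    long chunk = num_steps / size;

    double *send_vec = NULL;
    double *recv_vec = malloc(chunk * sizeof(double));

    if (myid == 0) {
        send_vec = malloc(num_steps * sizeof(double));
        for (long i = 0; i < num_steps; i++)
            send_vec[i] = (double)i;
    }

    MPI_Scatter(send_vec, chunk, MPI_DOUBLE,
                recv_vec, chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    for (long i = 0; i < chunk; ++i) {
        x = (recv_vec[i] - 0.5) * step;
        local_sum += 4.0 / (1.0 + x * x);
    }

    MPI_Reduce(&local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myid == 0) {
        pi = step * sum;
        printf("PI value = %f\n", pi);
        free(send_vec);
    }
    free(recv_vec);
    MPI_Finalize();
    return 0;
}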
I want to write a code in which:
Processor P0 reads an array from the keyboard and sends that array to processor P1.
Processor P1 prints all of the values to the screen. For example:
[P0]: Enter the size of array: 1
[P0]: Enter the elements of array: 3
[P1]: The array is: 3
[P0]: Enter the size of array: 3
[P0]: Enter the elements of array: 5 7 5
[P1]: The array is: 5 7 5
.
.
.
and here is my first attempt. Too many faults, I think, but I'm new and want to learn how to code.
#include <stdio.h>
#include <mpi.h>
#define n 100

int main(int argc, char *argv[]){
    int my_rank, size_cm;
    int value, i;
    int dizi[n];
    double senddata[n];
    double recvdata[n];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size_cm);

    value=0;
    if(my_rank == 0){
        printf("[%d]: Enter the size of array: ",&my_rank);
        scanf("%d",value);
        printf("[%d]: Enter the elements of array",&my_rank);
        for(i=0; i<n; i++){
            scanf("%d", &dizi[n]);
            senddata[0] = dizi[];
            MPI_Send(senddata, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        }
    }
    if(my_rank == 1){
        MPI_Recv(recvdata, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,
        printf("[%d]: The array is: %d ",&my_rank, dizi[n]);
    }
    MPI_Finalize();
    return 0;
}
To get a minimal example that compiles, I added the missing argument to MPI_Recv():
MPI_Status status;
MPI_Recv(recvdata, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
I also modified senddata[0] = dizi[]; to senddata[0] = dizi[i];
As I tried to compile the code you provided, I got a warning:
format ‘%d’ expects argument of type ‘int’, but argument 2 has type ‘int *’
The function scanf() needs a pointer to the data in order to modify it, so int a; scanf("%d", &a); is correct. But printf() just needs the data, since it will not modify it: int a; printf("%d", a); is the right way to go.
If you want the array to be populated, use scanf("%d", &dizi[i]);, not scanf("%d", &dizi[n]);. n is the length of the array dizi, so the index n is outside the array, since array indices start at 0. This can trigger undefined behavior (strange values, a segmentation fault, or even a correct result!).
Since MPI_Send() is called inside the for(i=0; i<n; i++) loop, process 0 tries to send n messages to process 1. But process 1 only receives one. Hence, process 0 will be blocked at i=1, waiting for process 1 to receive the second message. This is a deadlock.
I assume you are trying to send an array from process 0 to process 1. The following code, based on yours, should do the trick. The actual length of the array is n_arr:
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#define n 100

int main(int argc, char *argv[]){
    int my_rank, size_cm;
    int n_arr;
    int i;
    int dizi[n];
    // double senddata[n];
    // double recvdata[n];
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size_cm);

    if(my_rank == 0){
        // fflush(stdout); because the standard output is buffered...
        printf("[%d]: Enter the size of array: ",my_rank);fflush(stdout);
        if(scanf("%d",&n_arr)!=1){fprintf(stderr,"input error\n");exit(1);}
        if(n_arr>100){
            fprintf(stderr,"The size of the array is too large\n");exit(1);
        }
        printf("[%d]: Enter the elements of array",my_rank);fflush(stdout);
        for(i=0; i<n_arr; i++){
            if(scanf("%d", &dizi[i])!=1){fprintf(stderr,"input error\n");exit(1);}
        }
        //sending the length of the array
        MPI_Send(&n_arr, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        //sending the array
        MPI_Send(dizi, n_arr, MPI_INT, 1, 0, MPI_COMM_WORLD);
    }
    if(my_rank == 1){
        // receiving the length of the array
        MPI_Recv(&n_arr, 1, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
        //receiving the array
        MPI_Recv(dizi, n_arr, MPI_INT, 0, 0, MPI_COMM_WORLD,&status);
        printf("[%d]: The array of size %d is: ",my_rank,n_arr);
        for(i=0; i<n_arr; i++){
            printf("%d ",dizi[i]);
        }
        printf("\n");
    }
    MPI_Finalize();
    return 0;
}
It is compiled by running mpicc main.c -o main and run with mpirun -np 2 main
I added some checks on the input (always a good thing) and handled the case of n_arr being larger than n=100. The latter could be avoided by using malloc() to allocate memory for the array: this part is left to you!
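If you want to try the malloc() route yourself, here is one possible sketch (my own variant of the code above, not a unique solution): the receiver learns the size from the first message and only then allocates its buffer.
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    int my_rank, size_cm, n_arr, i;
    int *dizi = NULL;                 // allocated once the size is known
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size_cm);

    if (my_rank == 0) {
        printf("[%d]: Enter the size of array: ", my_rank); fflush(stdout);
        if (scanf("%d", &n_arr) != 1 || n_arr <= 0) { fprintf(stderr, "input error\n"); exit(1); }

        dizi = malloc(n_arr * sizeof(int));
        if (dizi == NULL) { fprintf(stderr, "malloc failed\n"); exit(1); }

        printf("[%d]: Enter the elements of array: ", my_rank); fflush(stdout);
        for (i = 0; i < n_arr; i++)
            if (scanf("%d", &dizi[i]) != 1) { fprintf(stderr, "input error\n"); exit(1); }

        MPI_Send(&n_arr, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        MPI_Send(dizi, n_arr, MPI_INT, 1, 0, MPI_COMM_WORLD);
    }

    if (my_rank == 1) {
        MPI_Recv(&n_arr, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);

        dizi = malloc(n_arr * sizeof(int));   // receiver sizes its buffer from the first message
        if (dizi == NULL) { fprintf(stderr, "malloc failed\n"); exit(1); }

        MPI_Recv(dizi, n_arr, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
        printf("[%d]: The array of size %d is: ", my_rank, n_arr);
        for (i = 0; i < n_arr; i++)
            printf("%d ", dizi[i]);
        printf("\n");
    }

    free(dizi);   // free(NULL) is a no-op on ranks that never allocated
    MPI_Finalize();
    return 0;
}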