c mpi (spawn scatter gather) for variable number of processes - c

I am building up an example with variable no. of processes and bind them to the sockets in a small network with different architecture and number of cpus.
I compile and run with:
mpiicpc avg_4.c -qopenmp -axSSE4.2,AVX,CORE-AVX2 -O3 -par-affinity=noverbose,granularity=core,compact -o b
mpiexec.hydra -machinefile f19 -genv I_MPI_PIN=1 -genv I_MPI_PIN_DOMAIN=socket -genv I_MPI_PIN_ORDER=compact -n 1 ./b
The network (master + slave19) f19 is:
s19:1
ma:1
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <sched.h>
#include <mpi.h>
int *create_mlu(int n_omp, int ws) {
int *mlu = (int *)calloc(n_omp * ws, sizeof(int));
for (int i = 0; i < ws; i++)
for (int j = 0; j < n_omp; j++)
mlu[j + i*n_omp] = j + 100 * i;
return mlu;
}
int *C4_Re(int *mal, int n_omp, int wr, int ws) {
int *rM8 = (int *)malloc(sizeof(int) * n_omp);
char nod[MPI_MAX_PROCESSOR_NAME];
int n_l; MPI_Get_processor_name(nod, &n_l);
#pragma omp parallel for
for (int i = 0; i < n_omp; i++) {
rM8[i] = mal[i] + 10 * omp_get_thread_num();
printf("ws%2d\t\tmpi%2d\t\tmaxTh%2d\t\tmaxPr%2d\t\tomp%2d\t\tcore%3d\t\trM8%4d\t\tnod %s\n", ws, wr, omp_get_num_threads(), omp_get_num_procs(), omp_get_thread_num(), sched_getcpu(), rM8[i], nod);
}
return rM8;
}
int main(void) {
MPI_Init(NULL, NULL);
int ts[2] = {7, 9}; //no of processes
for (int t = 0; t < 2; t++) {
int ws = ts[t];
int errcodes[ws];
MPI_Comm parentcomm, intercomm;
MPI_Comm_get_parent(&parentcomm);
if (parentcomm == MPI_COMM_NULL) {
MPI_Comm_spawn("./b", MPI_ARGV_NULL, ws, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &intercomm, errcodes);
//printf("I'm the parent.\n");
}
else {
int wr; MPI_Comm_rank(MPI_COMM_WORLD, &wr);// printf("wr %d\n", wr);
//int ps; MPI_Comm_size(parentcomm, &ps);// printf("ps %d\n", ps);
//int pr; MPI_Comm_rank(parentcomm, &pr);// printf("pr %d\n", pr);
int n_omp = 8, *mlu = NULL;
if (wr == 0) {
mlu = create_mlu(n_omp, ws);
//for (int i = 0; i < n_omp*ws; i++) printf("\tmlu[%2d] = %d\n", i, mlu[i]);
}
int *mal = (int *)malloc(n_omp * sizeof(int));
MPI_Scatter(mlu, n_omp, MPI_INT, mal, n_omp, MPI_INT, 0, MPI_COMM_WORLD);
//for (int i = 0; i < n_omp; i++) printf("\t\tmal[%2d] = %d\trank %d\n", i, mal[i], wr);
int *rM8 = NULL;
rM8 = C4_Re(mal, n_omp, wr, ws);
int *rS8 = NULL;
if (wr == 0)
rS8 = (int *)malloc(sizeof(int) * ws * n_omp);
MPI_Gather(rM8, n_omp, MPI_INT, rS8, n_omp, MPI_INT, 0, MPI_COMM_WORLD);
if (wr == 0) {
//for (int i = 0; i < n_omp * ws; i++) printf("\t\trS8[%2d] = %d\n", i, rS8[i]);
free(mlu);
free(rS8); }
free(mal);
free(rM8);
}
//fflush(stdout);
}
fflush(stdout);
MPI_Finalize();
return 0;
}
I have a memory corruption which I need help to find
Some results look like
ws 7 rM8-37253944 nod ma mpi 7 maxTh 6 maxPr 6 omp 4 core 4
but they must look like
ws 7 rM8 624 nod ma mpi 6 maxTh 6 maxPr 6 omp 2 core 2
addition questions
1 - why using parentcomm for Scatter and Gather is not correct? In my opinion parentcomm is the new communicator
2 - should I create different comunicators for 7 and 9?
3 - mpicc gives me wrong results I don't know why

Related

distributed algorithm in C

I am a beginner in C. I have to create a distributed architecture with the library MPI. The following code is:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char **argv)
{
int N, w = 1, L = 2, M = 50; // with N number of threads
int T= 2;
int myid;
int buff;
float mit[N][T]; // I initialize a 2d array
for(int i = 0; i < N; ++i){
mit[i][0]= M / (float) N;
for (int j = 1; j < T; ++j){
mit[i][j] = 0;
}
}
float tab[T]; // 1d array
MPI_Status stat;
/*********************************************
start
*********************************************/
MPI_Init(&argc,&argv); // Initialisation
MPI_Comm_size(MPI_COMM_WORLD, &N);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
for(int j = 0; j < T; j++) {
for(int i = 0; i < N; i++) { // I iterate for each slave
if (myid !=0) {
float y = ((float) rand()) / (float) RAND_MAX;
mit[i][j + 1] = mit[i][j]*(1 + w * L * y);
buff=mit[i][j+1];
MPI_Send(&buff, 128, MPI_INT, 0, 0, MPI_COMM_WORLD); // I send the variable buff to the master
buff=0;
}
if( myid == 0 ) { // Master
for(int i = 1; i < N; i++){
MPI_Recv(&buff, 128, MPI_INT, i, 0, MPI_COMM_WORLD, &stat);
tab[j] += buff; // I need to receive all the variables buff sent by the salves, sum them and stock into the tab at the index j
}
printf("\n%.20f\n",tab[j]); // I print the result of the sum at index j
}
}
}
MPI_Finalize();
return 0;
}
}
I use the command in the terminal: mpicc .c -o my_file to compile the program
Then mpirun -np 101 my_file_c to start the program with 101 threads
But the problem is I have the following error int the terminal:
It seems that [at least] one of the processes that was started with
> mpirun did not invoke MPI_INIT before quitting (it is possible that
> more than one process did not invoke MPI_INIT -- mpirun was only
> notified of the first one, which was on node n0).
>
> mpirun can *only* be used with MPI programs (i.e., programs that
> invoke MPI_INIT and MPI_FINALIZE). You can use the "lamexec" program
> to run non-MPI programs over the lambooted nodes.
It seems that I have a problem with the master but i don't know why...
Any idea ???
Thank you :)
This behavior is very likely the result of a memory corruption.
You cannot
int buff=mit[i][j+1];
MPI_Send(&buff, 128, MPI_INT, ...);
depending on what you want to achieve, you can try instead
int buff=mit[i][j+1];
MPI_Send(&buff, 1, MPI_INT, ...);
// ...
MPI_Recv(&buff, 1, MPI_INT, ...);
or
int *buff=&mit[i][j+1];
MPI_Send(buff, 128, MPI_INT, ...);
// fix MPI_Recv()

MPI_Scatterv segfault

I'm just starting out with MPI programming and decided to make a simple distributed qsort using OpenMPI. To distribute parts of the array I want to sort I'm trying to use MPI_Scatterv, however the following code segfaults on me:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define ARRAY_SIZE 26
#define BUFFER_SIZE 2048
int main(int argc, char** argv) {
int my_rank, nr_procs;
int* data_in, *data_out;
int* sizes;
int* offsets;
srand(time(0));
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nr_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// everybody generates the control tables
int nr_workers = nr_procs-1;
sizes = malloc(sizeof(int)*nr_workers);
offsets = malloc(sizeof(int)*nr_workers);
int nr_elems = ARRAY_SIZE/nr_workers;
// basic distribution
for (int i = 0; i < nr_workers; ++i) {
sizes[i] = nr_elems;
}
// distribute the remainder
int left = ARRAY_SIZE%nr_workers;
int curr_worker = 0;
while (left) {
++sizes[curr_worker];
curr_worker = (++curr_worker)%nr_workers;
--left;
}
// offsets
int curr_offset = 0;
for (int i = 0; i < nr_workers; ++i) {
offsets[i] = curr_offset;
curr_offset += sizes[i];
}
if (my_rank == 0) {
// root
data_in = malloc(sizeof(int)*ARRAY_SIZE);
data_out = malloc(sizeof(int)*ARRAY_SIZE);
for (int i = 0; i < ARRAY_SIZE; ++i) {
data_in[i] = rand();
}
for (int i = 0; i < nr_workers; ++i) {
printf("%d at %d\n", sizes[i], offsets[i]);
}
MPI_Scatterv (data_in, sizes, offsets, MPI_INT, data_out, ARRAY_SIZE, MPI_INT, 0, MPI_COMM_WORLD);
} else {
// worker
printf("%d has %d elements!\n",my_rank, sizes[my_rank-1]);
// alloc the input buffer
data_in = malloc(sizeof(int)*sizes[my_rank-1]);
MPI_Scatterv(NULL, NULL, NULL, MPI_INT, data_in, sizes[my_rank-1], MPI_INT, 0, MPI_COMM_WORLD);
printf("%d got:\n", my_rank);
for (int i = 0; i < sizes[my_rank-1]; ++i) {
printf("%d ", data_in[i]);
}
printf("\n");
}
MPI_Finalize();
return 0;
}
How would I go about using Scatterv? Am I doing something wrong with allocating my input buffer from inside the worker code?
I changed some part in your code to get something working.
MPI_Scatter() will send data to every processors, including himself. According to your program, processor 0 expects ARRAY_SIZE integers, but sizes[0] is much smaller.
There are other problems on other processus : MPI_Scatter will send sizes[my_rank] integers, but sizes[my_rank-1] will be expected...
Here is a code that scatters data_in from 0 to all processors, including 0. Therefore i added 1 to nr_workers :
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define ARRAY_SIZE 26
#define BUFFER_SIZE 2048
int main(int argc, char** argv) {
int my_rank, nr_procs;
int* data_in, *data_out;
int* sizes;
int* offsets;
srand(time(0));
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nr_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// everybody generates the control tables
int nr_workers = nr_procs;
sizes = malloc(sizeof(int)*nr_workers);
offsets = malloc(sizeof(int)*nr_workers);
int nr_elems = ARRAY_SIZE/nr_workers;
// basic distribution
for (int i = 0; i < nr_workers; ++i) {
sizes[i] = nr_elems;
}
// distribute the remainder
int left = ARRAY_SIZE%nr_workers;
int curr_worker = 0;
while (left) {
++sizes[curr_worker];
curr_worker = (++curr_worker)%nr_workers;
--left;
}
// offsets
int curr_offset = 0;
for (int i = 0; i < nr_workers; ++i) {
offsets[i] = curr_offset;
curr_offset += sizes[i];
}
if (my_rank == 0) {
// root
data_in = malloc(sizeof(int)*ARRAY_SIZE);
for (int i = 0; i < ARRAY_SIZE; ++i) {
data_in[i] = rand();
printf("%d %d \n",i,data_in[i]);
}
for (int i = 0; i < nr_workers; ++i) {
printf("%d at %d\n", sizes[i], offsets[i]);
}
} else {
printf("%d has %d elements!\n",my_rank, sizes[my_rank]);
}
data_out = malloc(sizeof(int)*sizes[my_rank]);
MPI_Scatterv (data_in, sizes, offsets, MPI_INT, data_out, sizes[my_rank], MPI_INT, 0, MPI_COMM_WORLD);
printf("%d got:\n", my_rank);
for (int i = 0; i < sizes[my_rank]; ++i) {
printf("%d ", data_out[i]);
}
printf("\n");
free(data_out);
if(my_rank==0){
free(data_in);
}
MPI_Finalize();
return 0;
}
Regarding memory managment, data_in and data_out should be freed at the end of the code.
Is it what you wanted to do ? Good luck with qsort ! I think you are not the first one to sort integers using MPI. See parallel sort using mpi. Your way to generate random numbers on the 0 processus and then scatter them is the right way to go. I think you will be interrested by his TD_Trier() function for communication. Even if you change tri_fusion(T, 0, size - 1); for qsort(...)...
Bye,
Francis

Troubles in MPI -> Failing at address: (nil)

I'm a beginner in C and MPI and i'm trying to do a program to multiply 2 matrix in MPI.
But I don't kwno what is wrong in my code.
I'm try do 'slice' the matrix M1 in n lines and send then to another process do multiply and broadcast de matrix M2 After I make a Gather to build the final matrix M3.
I make this:
mpirun -n 2 matrix
But I receive a error in terminal:
[adiel-VirtualBox:07921] *** Process received signal ***
[adiel-VirtualBox:07921] Signal: Segmentation fault (11)
[adiel-VirtualBox:07921] Signal code: (128)
[adiel-VirtualBox:07921] Failing at address: (nil)
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 7921 on node adiel-VirtualBox exited on signal 0 (Unknown signal 0).
--------------------------------------------------------------------------
2 total processes killed (some possibly by mpirun during cleanup)
mpirun: clean termination accomplished
Can anyone help me?
Here's my code:
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
//#include "mpe.h"
#include <math.h>
void printMatrix(double *M, int m, int n) {
int lin, col;
for (lin=0; lin<m; lin++) {
for (col=0; col<n; col++)
printf("%.2f \t", M[(lin*n+col)]);
printf("\n");
}
}
double* allocateMatrix(int m, int n){
double* M;
M = (double *)malloc(m*n*sizeof(double));
return M;
}
int main( int argc, char *argv[] )
{
int rank, size;
int m1,n1,m2,n2;
int row, col,ctrl,i,k,lines,proc;
double *M1, *M2, *M3, **vp, *v;
MPI_Init( &argc, &argv );
MPI_Comm_rank( MPI_COMM_WORLD, &rank );
MPI_Comm_size( MPI_COMM_WORLD, &size );
m1 = m2 = n1 = n2 = 3;
lines = (int)ceil(n1/size);
v = (double *)malloc(lines*n1*sizeof(double));
M2 = allocateMatrix(m2,n2);
M3 = allocateMatrix(m1,n2);
if(rank==0)
M1 = allocateMatrix(m1,n1);
//startin matrix
for (col = 0; col < n1; col++){
for (row = 0; row < m1; row++) {
if(rank==0)
M1[(row*m1+col)] = 0;
M2[(row*m2+col)] = 0;
M3[(row*m1+col)] = 0;
}
}
//startin pointers with 0
for(i=0;i<lines*n1;i++)
v[i] = 0;
//populate
if(rank == 0){
for (col = 0; col < n1; col++){
for (row = 0; row < m1; row++) {
M1[row*m1+col] = row*3+(col+1);
M2[(row*m2+col)] = 1;
}
}
}
//---------------------sharing and multiply---------------//
//slicing M1 and sending to other process
if(rank == 0){
proc = size-1;
//for each line
for(row = 0;row<m1;row++){
ctrl = floor(row/lines);
//on each column
for(col=0;col<n1;col++){
v[(ctrl*n1)+col] = M1[(row*n1)+col];
}
if(row%lines == (lines - 1)){
if(proc!=0){
MPI_Send(v,lines*n1,MPI_DOUBLE,proc,1, MPI_COMM_WORLD);
proc--;
//clearing pointers
for(i=0;i<lines*n1;i++)
v[i] = 0;
}
}
}
}
//MPI_Bcast(m1, m*n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
MPI_Bcast(M2, m2*n2, MPI_DOUBLE, 0, MPI_COMM_WORLD);
//receiving process
if(rank!=0)
MPI_Recv(v,lines*n1,MPI_DOUBLE,0,1,MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for(row=0;row<lines;row++){
if(v[row*n1]!=0){
for (col = 0; col < n1; col++){
double val = 0.0;
for(k=0;k<m1;k++){
val += v[(row*n1)+k] * M2[(k*n1)+col];
}
M3[((size-1-rank)*size*n1)+(row*n1)+col] = val;
}
}
}
if(rank!=0){
for(row = 0; row < lines; row++){
MPI_Gather(&M3[((size-1-rank)*size*n1)+(row*n1)], n1, MPI_DOUBLE, &M3[((size-1-rank)*size*n1)+(row*n1)], n1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
}
if(rank == 0){
printf("matrix 1------------------------\n");
printMatrix(M1,m1,n1);
printf("matrix 2------------------------\n");
printMatrix(M2,m2,n2);
printf("matrix 3------------------------\n");
printMatrix(M3,m1,n2);
}
MPI_Finalize();
return 0;
}
For one thing, doing all of your sends before the broadcast, and all of the receives after it is asking for trouble. I can easily see that leading to MPI resource exhaustion or deadlock failures. In such a small input that shouldn't arise, but you should fix it regardless. I'll take another look after that.

Bitonic Sorting with MPI

I'm working on a project about the Parallel Bitonic Sorting using MPI and C to implement it. The program I developed works but it's not efficient since a simple QuickSort (sigh) beats it in terms of execution time. Maybe the problem is about the cost of communication but I don't get how to improve that, so here's the code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include "bs-util.h"
#include "quicksort.h"
#define TAG 1
/* Run this program knowing that:
* 1) The number of cores must be a power of 2
* 2) The length of the array to order must be a power of 2
*
* Exec Example: mpirun -n 4 ./bs 1024 1024
* */
void exchange(FILE *log, int i, int partner, int up);
int countTransfer = 0;
int *myArray, *partnerArray;
int currentPartner = -1;
int rank, size;
MPI_Status status;
int verbose = 0; //this var toggles on(1) or off(0) some useful prints for debugging purpose
int amount=0;
int main(int argc, char *argv[])
{
int *array;
int i=0;
int carry=0;
int up=1;
int count=0;
struct timeval tim;
FILE *log;
char logName[15] = "log/";
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
/* Time meter */
srand((double) time(NULL));
gettimeofday(&tim, NULL);
double t1=tim.tv_sec+(tim.tv_usec/1000000.0);
snprintf(logName+4, 10, "%d",rank);
log = fopen(logName,"w");
printf("Hello world from process %d of %d.\n", rank, size);
MPI_Barrier(MPI_COMM_WORLD);
/* INPUT */
if (rank==0)
{
if (argc==2) /* by file */
{
FILE *input = fopen(argv[1],"r");
char line[20];
count = 0;
while(fgets(line,20,input) != NULL)
{
count++;
}
fclose(input);
array = (int *)malloc(count*sizeof(int));
input = fopen(argv[1],"r");
i = 0;
while(fgets(line,20,input) != NULL)
{
array[i] = atoi(line);
i++;
}
fclose(input);
}
else
if (argc==3) /* by command line */
{
count = atoi(argv[1]);
int max = atoi(argv[2]);
array = (int *)malloc(count*sizeof(int));
srand(time(NULL));
for (i=0; i<count; i++)
{
array[i] = rand()%max;
}
}
else
{
printf("\n\n ----------- ERRORE NEI PARAMETRI DI INPUT ----------- \n\n");
return 1;
}
/* END OF THE INPUT */
if (verbose){
printf("Initial array:\n");
for (i=0; i<count; i++)
{
printf("%d\t", array[i]);
}
printf("\n");
}
/* Everyone wait eachother */
MPI_Barrier(MPI_COMM_WORLD);
carry = count%size;
amount = count/size + carry;
printf("\nParametri: amount=%d carry=%d\n\n", amount, carry);
up=1;
int startIndex = amount;
myArray = (int *)malloc(amount*sizeof(int));
/* Buffer (partner) */
partnerArray = (int *)malloc(amount*sizeof(int));
for (i=0; i<amount; i++)
myArray[i] = array[i];
printf("Processo %d riceve amount=%d e up=%d\n", rank, amount, up);
if (verbose){
printf("Mia porzione ---> ");
for (i=0; i<amount; i++)
{
printf("%d\t", myArray[i]);
}
printf("\n");
}
/* Sending the big array's chunks */
for (i=1; i<size; i++)
{
up = (i+1) % 2;
MPI_Send(&up, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(&amount, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(&carry, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(array+startIndex, amount-carry, MPI_INT, i, TAG, MPI_COMM_WORLD);
startIndex += amount-carry;
}
MPI_Barrier(MPI_COMM_WORLD);
}
else
{
MPI_Barrier(MPI_COMM_WORLD);
MPI_Recv(&up, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
MPI_Recv(&amount, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
MPI_Recv(&carry, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
myArray = (int *)malloc(amount*sizeof(int));
partnerArray = (int *)malloc(amount*sizeof(int)); /* Buffer (partner) */
MPI_Recv(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
/* Experimental padding: every chunck has the same amount of items. */
for (i=amount-carry; i<amount; i++)
{
myArray[i] = 0;
}
printf("\n");
printf("Processo %d riceve amount=%d e up=%d\n", rank, amount-carry, up);
if (verbose){
printf("Mia porzione ---> ");
for (i=0; i<amount; i++)
{
printf("%d\t", myArray[i]);
}
printf("\n");
}
MPI_Barrier(MPI_COMM_WORLD);
}
/* CORE */
/* Local Quicksort */
int result = quickSort(&myArray[0], amount); //this function is written within src/quicksort.c
if (verbose){
if (result == 1)
printf("Quick Sort: FAIL \n");
else
{
printf("\nLa mia porzione ordinata (processo %d)\n", rank);
for(i=0; i<amount; i++)
{
printf("%d ",myArray[i]);
}
printf ("\n");
}
}
int j;
for (up=8;up<=amount*size;up=2*up)
{
for (j=up>>1;j>0;j=j>>1)
{
for (i=0;i<amount*size;i++)
{
int partner=i^j;
if ((partner)>i)
{
exchange(log,i,partner,i&up);
}
}
}
}
/* END OF THE CORE */
if (rank!=0)
{
MPI_Send(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD);
}
gettimeofday(&tim, NULL);
double t2=tim.tv_sec+(tim.tv_usec/1000000.0);
if (rank==0)
{
myArray = (int *)realloc(myArray,sizeof(int)*amount*size);
for (i=1; i<size; i++)
MPI_Recv(myArray+i*amount, amount, MPI_INT, i, TAG, MPI_COMM_WORLD, &status);
printf("\nTempo trascorso %6f\n", t2-t1);
fprintf(log,"\n\n----------> Array Iniziale <----------\n");
printArray(log,array,count);
fprintf(log,"\n\n----------> Array Finale <----------\n");
printArray(log,myArray+(carry*(size-1)),count);
/*printArray(log,myArray,newAmount*size);*/
}
fprintf(log,"Numero di chunk scambiati: %d\n",countTransfer);
fclose(log);
MPI_Finalize();
return 0;
}
void exchange(FILE *log, int i, int partner, int up)
{
int rank_i = i/amount;
int rank_partner = partner/amount;
int offset_i = i%amount;
int offset_partner = partner%amount;
/*if (verbose)
fprintf(log,"\nnewAmount = %d - Rank_i = %d - Rank_partner = %d - Offset_i = %d - Offset_partner = %d \n",amount,rank_i,rank_partner,offset_i,offset_partner);
*/
if ((rank_i != rank) && (rank_partner != rank))
return;
if ((rank_i == rank) && (rank_partner == rank))
{
if (((up==0) && (myArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < myArray[offset_partner])))
{
int temp = myArray[offset_i];
myArray[offset_i] = myArray[offset_partner];
myArray[offset_partner] = temp;
}
return;
}
if (rank_i == rank && rank_partner != rank)
{
if (currentPartner != rank_partner)
{
MPI_Send(myArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD);
MPI_Recv(partnerArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD, &status);
currentPartner = rank_partner;
countTransfer++;
}
if (((up==0) && (myArray[offset_i] > partnerArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < partnerArray[offset_partner])))
myArray[offset_i] = partnerArray[offset_partner];
return;
}
if (rank_i != rank && rank_partner == rank)
{
if (currentPartner != rank_i)
{
MPI_Recv(partnerArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD, &status);
MPI_Send(myArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD);
currentPartner = rank_i;
countTransfer++;
}
if (((up==0) && (partnerArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (partnerArray[offset_i] < myArray[offset_partner])))
myArray[offset_partner] = partnerArray[offset_i];
return;
}
}
And here's the Make file:
CC = mpicc
OPTIMIZE =
CFLAGS = $(DEFINES) $(OPTIMIZE)
LFLAGS = -lm
PROGS = ./bs
PROGS_SRC = src/bs-util.c src/bs.c src/quicksort.c
all:
$(CC) $(CFLAGS) $(LFLAGS) -o $(PROGS) $(PROGS_SRC)
Help would be very appreciated :)
References: http://goo.gl/nXt4p
Remember that bitonic sort has time complexity of something like N/P (log N)^2 compared to quicksort N log N (in serial version). This means that with log N > P (P ~ number of processors) should even the serial quicksort beat bitonic sort (I am not talking about multiplying with some factors depending on the implementation, neither the communication). Bitonic sort is for really parallel computers (it's pretty good on GPUs), not a grid of few PCs as you probably have.
Many sends/receives (as in exchange function) of small data chunks badly affect performance. More efficient is combining small chunks into one buffer and sending it.
Um. I don't see you doing any collective communication other than barriers...

Problem with MPI matrix-matrix multiply: Cluster slower than single computer

I code a small program using MPI to parallelize matrix-matrix multiplication. The problem is: When running the program on my computer, it takes about 10 seconds to complete, but about 75 seconds on a cluster. I think I have some synchronization problem, but I cannot figure it out (yet).
Here's my source code:
/*matrix.c
mpicc -o out matrix.c
mpirun -np 11 out
*/
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define N 1000
#define DATA_TAG 10
#define B_SENT_TAG 20
#define FINISH_TAG 30
int master(int);
int worker(int, int);
int main(int argc, char **argv) {
int myrank, p;
double s_time, f_time;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &p);
if (myrank == 0) {
s_time = MPI_Wtime();
master(p);
f_time = MPI_Wtime();
printf("Complete in %1.2f seconds\n", f_time - s_time);
fflush(stdout);
}
else {
worker(myrank, p);
}
MPI_Finalize();
return 0;
}
int *read_matrix_row();
int *read_matrix_col();
int send_row(int *, int);
int recv_row(int *, int, MPI_Status *);
int send_tag(int, int);
int write_matrix(int *);
int master(int p) {
MPI_Status status;
int *a; int *b;
int *c = (int *)malloc(N * sizeof(int));
int i, j; int num_of_finish_row = 0;
while (1) {
for (i = 1; i < p; i++) {
a = read_matrix_row();
b = read_matrix_col();
send_row(a, i);
send_row(b, i);
//printf("Master - Send data to worker %d\n", i);fflush(stdout);
}
wait();
for (i = 1; i < N / (p - 1); i++) {
for (j = 1; j < p; j++) {
//printf("Master - Send next row to worker[%d]\n", j);fflush(stdout);
b = read_matrix_col();
send_row(b, j);
}
}
for (i = 1; i < p; i++) {
//printf("Master - Announce all row of B sent to worker[%d]\n", i);fflush(stdout);
send_tag(i, B_SENT_TAG);
}
//MPI_Barrier(MPI_COMM_WORLD);
for (i = 1; i < p; i++) {
recv_row(c, MPI_ANY_SOURCE, &status);
//printf("Master - Receive result\n");fflush(stdout);
num_of_finish_row++;
}
//printf("Master - Finish %d rows\n", num_of_finish_row);fflush(stdout);
if (num_of_finish_row >= N)
break;
}
//printf("Master - Finish multiply two matrix\n");fflush(stdout);
for (i = 1; i < p; i++) {
send_tag(i, FINISH_TAG);
}
//write_matrix(c);
return 0;
}
int worker(int myrank, int p) {
int *a = (int *)malloc(N * sizeof(int));
int *b = (int *)malloc(N * sizeof(int));
int *c = (int *)malloc(N * sizeof(int));
int i;
for (i = 0; i < N; i++) {
c[i] = 0;
}
MPI_Status status;
int next = (myrank == (p - 1)) ? 1 : myrank + 1;
int prev = (myrank == 1) ? p - 1 : myrank - 1;
while (1) {
recv_row(a, 0, &status);
if (status.MPI_TAG == FINISH_TAG)
break;
recv_row(b, 0, &status);
wait();
//printf("Worker[%d] - Receive data from master\n", myrank);fflush(stdout);
while (1) {
for (i = 1; i < p; i++) {
//printf("Worker[%d] - Start calculation\n", myrank);fflush(stdout);
calc(c, a, b);
//printf("Worker[%d] - Exchange data with %d, %d\n", myrank, next, prev);fflush(stdout);
exchange(b, next, prev);
}
//printf("Worker %d- Request for more B's row\n", myrank);fflush(stdout);
recv_row(b, 0, &status);
//printf("Worker %d - Receive tag %d\n", myrank, status.MPI_TAG);fflush(stdout);
if (status.MPI_TAG == B_SENT_TAG) {
break;
//printf("Worker[%d] - Finish calc one row\n", myrank);fflush(stdout);
}
}
//wait();
//printf("Worker %d - Send result\n", myrank);fflush(stdout);
send_row(c, 0);
for (i = 0; i < N; i++) {
c[i] = 0;
}
}
return 0;
}
int *read_matrix_row() {
int *row = (int *)malloc(N * sizeof(int));
int i;
for (i = 0; i < N; i++) {
row[i] = 1;
}
return row;
}
int *read_matrix_col() {
int *col = (int *)malloc(N * sizeof(int));
int i;
for (i = 0; i < N; i++) {
col[i] = 1;
}
return col;
}
int send_row(int *row, int dest) {
MPI_Send(row, N, MPI_INT, dest, DATA_TAG, MPI_COMM_WORLD);
return 0;
}
int recv_row(int *row, int src, MPI_Status *status) {
MPI_Recv(row, N, MPI_INT, src, MPI_ANY_TAG, MPI_COMM_WORLD, status);
return 0;
}
int wait() {
MPI_Barrier(MPI_COMM_WORLD);
return 0;
}
int calc(int *c_row, int *a_row, int *b_row) {
int i;
for (i = 0; i < N; i++) {
c_row[i] = c_row[i] + a_row[i] * b_row[i];
//printf("%d ", c_row[i]);
}
//printf("\n");fflush(stdout);
return 0;
}
int exchange(int *row, int next, int prev) {
MPI_Request request; MPI_Status status;
MPI_Isend(row, N, MPI_INT, next, DATA_TAG, MPI_COMM_WORLD, &request);
MPI_Irecv(row, N, MPI_INT, prev, MPI_ANY_TAG, MPI_COMM_WORLD, &request);
MPI_Wait(&request, &status);
return 0;
}
int send_tag(int worker, int tag) {
MPI_Send(0, 0, MPI_INT, worker, tag, MPI_COMM_WORLD);
return 0;
}
int write_matrix(int *matrix) {
int i;
for (i = 0; i < N; i++) {
printf("%d ", matrix[i]);
}
printf("\n");
fflush(stdout);
return 0;
}
Well, you have a fairly small matrix (N=1000), and secondly you distribute your algorithm on a row/column basis rather than blocked.
For a more realistic version using better algorithms, you might want to acquire an optimized BLAS library (e.g. GOTO is free), test single-thread performance with that one, then get PBLAS and link it against your optimized BLAS, and compare MPI parallel performance using the PBLAS version.
I see some errors in your program:
First, why are you calling the wait function since its implementation is simply calling MPI_Barrier. MPI_Barrier is a primitive synchronization that blocks all threads until they reach the "barrier" by calling MPI_Barrier. My question is: do you want the master to be synchronized with the workers? In this context, that would not be optimal because a worker doesn't need to wait for the master to begin its calculation.
Second, there are 2 unnecessary for loops.
for (i = 1; i < N / (p - 1); i++) {
for (j = 1; j < p; j++) {
b = read_matrix_col();
send_row(b, j);
}
}
for (i = 1; i < p; i++) {
send_tag(i, B_SENT_TAG);
}
In the first i-loop, you don't use the variable in your statement. Since the j-loop and the second i-loop are the same, you could do:
for (i = 0; i < p; i++) {
b = read_matrix_col();
send_row(b, j);
send_tag(i, B_SENT_TAG);
}
In terms of data transfer, your program is not optimized because you are sending an array of 1000 integers of data for each data transfer. There should be a better way to optimise the data transfer, but I will let you look at it. So make the corrections I told you and tell us what is your new performance.
And as #janneb said, you can use the BLAS library for better performance for matrix multiplication. Good luck!
I did not look over your code, but I can provide some hints about why your result may not unexpected:
As already mentioned, N=1000 may be too small. You should make more tests to see the scalability of your program (try setting N=100, 500, 1000, 5000, 10000, etc.) and compare results on both your system and the cluster.
Compare results between your system (one processor I presume) and a single processor on the cluster. Usually in production environments like servers or clusters a single processor is less powerful than the best processors designed for desktop use, but they provide stability, reliability and other features useful for environments which run 24h/day at full capacity.
If your processor has multiple cores, more than one MPI processes may run at the same time and synchronization between them is negligible compared to the synchronization between nodes in a cluster.
Are the nodes from the cluster statically assigned to you? Maybe other users' programs can be scheduled on the nodes you are running at the same time as you.
Read documentation about the cluster's architecture. Some architectures may be more suitable for particular classes of problems.
Assess latency of the network of the cluster. Ping-ing from each node to another many times and computing the mean value may give a rough estimate.
Last but perhaps the most important, your algorithm may not be optimal. Read a/some books on matrix multiplication (I can recommend "Matrix Computations", Golub and Van Loan).

Resources