I'm trying to write an MPI matrix multiplication program, but the scatter call doesn't seem to be working for me. Only one row gets scattered and the rest of the cores receive garbage values.
Also, when I call the display_matrix() function before MPI_Init(), it seems to be run by 4 threads instead of 1 (I have a quad-core CPU). Why is this happening even before initialisation?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int **matrix_generator(int row,int col);
int **multiply_matrices(int **matrix_A,int **matrix_B,int rowsA, int colsA,int rowsB,int colsB);
void display_matrix(int **matrixA,int rows,int cols);
void main(int argc,char *argv[])
int **matrix_A,**matrix_B,**matrix_result,*scattered_matrix,*gathered_matrix, rowsA,colsA,rowsB,colsB,world_rank,world_size,i,j;
rowsA = atoi(argv[1]);
colsA = atoi(argv[2]);
rowsB = atoi(argv[3]);
colsB = atoi(argv[4]);
scattered_matrix = (int *)malloc(sizeof(int) * rowsA*colsA/4);
if (argc != 5)
fprintf(stderr,"Usage: mpirun -np <No. of processors> ./a.out <Rows A> <Columns A> <Rows B> <Columns B>\n");
else if(colsA != rowsB)
printf("Check the dimensions of the matrices!\n\n");
matrix_A = matrix_generator(rowsA,colsA);
matrix_B = matrix_generator(rowsB,colsB);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Scatter(matrix_A, rowsA*colsA/4, MPI_INT, scattered_matrix, rowsA*colsA/4, MPI_INT, 0, MPI_COMM_WORLD);
printf("Scattering data %d from root to: %d \n",scattered_matrix[i],world_rank);
int **matrix_generator(int row, int col)
int i, j, **intMatrix;
intMatrix = (int **)malloc(sizeof(int *) * row);
for (i = 0; i < row; i++)
intMatrix[i] = (int *)malloc(sizeof(int *) * col);
for (j = 0;j<col;j++)
return intMatrix;
void display_matrix(int **matrix, int rows,int cols)
int i,j;
for (i = 0; i < rows; i = i + 1)
for (j = 0; j < cols; j = j + 1)
printf("%d ",matrix[i][j]);
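The main issue is that your matrices are not allocated in contiguous memory (see the comment section for a link): MPI_Scatter() reads the send buffer as one contiguous run of ints, whereas matrix_A is an int ** whose rows are separately malloc'ed and not adjacent in memory.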
The MPI standard does not specify what happens before an app invokes MPI_Init().
The two main MPI implementations choose to spawn all the tasks when mpirun is invoked (that means there are 4 independent processes first, and they "join" into a single MPI job when they all call MPI_Init()).
That being said, once upon a time, a vendor chose to have mpirun start a single MPI task, and they use their own remote-fork when MPI_Init() is called.
Bottom line: if you want to write portable code, do as little as possible (and never print anything) before MPI_Init() is called.
I have an MPI program that is solving the "metric traveling salesman problem".
When I run it on windows, it works as expected, and prints the shortest possible path.
When I run it on Linux, I get a message saying that mpirun noticed that process rank 0 exited on signal 11.
When I searched for this problem on StackOverflow, I saw that it often occurs when wrong arguments are passed to MPI's send/receive functions, but I went over my code and the arguments seem fine.
How else can I check my error?
If it helps, here's the two code files:
main.c :
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// forward declaration of tsp_main
int tsp_main(int citiesNum, int xCoord[], int yCoord[], int shortestPath[]);
int main(int argc, char** argv)
//int citiesNum = 18; //set a lower number for testing
int citiesNum = 10;
int xCoord[] = {1, 12, 13, 5, 5, 10, 5, 6, 7, 8, 9, 4, 11, 14, 4,8,4,6};
int yCoord[] = {7, 2, 3, 3, 5, 6, 7, 8, 9, 4, 11, 12, 13, 14, 5,1,7,33};
int* shortestPath = (int*)malloc(citiesNum * sizeof(int));
int i, myRank, minPathLen;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
clock_t begin = clock();
minPathLen = tsp_main(citiesNum, xCoord, yCoord, shortestPath);
clock_t end = clock();
if (myRank == 0)
printf("Execution time: %g seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);
printf("The shortest path, %d long, is:\n", minPathLen);
for (i = 0; i < citiesNum; i++)
// print the city (and its distance from the next city in the path)
printf("%d (%d) ", shortestPath[i],
abs(xCoord[shortestPath[i]] - xCoord[shortestPath[(i + 1) % citiesNum]]) +
abs(yCoord[shortestPath[i]] - yCoord[shortestPath[(i + 1) % citiesNum]]) );
printf("%d\n", shortestPath[0]);
MPI_Finalize ();
return 0;
and tsp.c :
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <assert.h>
#include <string.h>
#include <time.h>
//play with this value to change the workload.
#define WORK_LOAD (10)
#define getDistance(a,b) (DistanceArray[a][b])
enum defines {
MASTER_ID = 0, // master ID. set to 0 (better safe than sorry)
DISTRIBUTE_NEW_TASK, // when the master send a new task to a worker
ASK_FOR_TASK, // when a worker asks from the master a new task
KILL, // when the master notifies the worker to die
SEND_MINIMUM, // when a process sends its current minimum
SEND_PATH, // when a worker updates the master of his best path
/*
 * Fills factorials[0 .. citiesNum-1] so that entry i holds i!
 * (for example, factorials[4] becomes 24).
 * factorials[0] is always written; the entry at index citiesNum is not touched.
 */
void initializeFactorials(long long int* factorials, int citiesNum) {
    int n = 1;

    factorials[0] = 1;
    while (n < citiesNum) {
        factorials[n] = n * factorials[n - 1];
        ++n;
    }
}
/*
 * Builds the two dimensional distance table.
 *
 * For every city k a row of citiesNum ints is allocated with malloc and stored
 * in DistanceArray[k]; element [k][l] is then set to
 * |xCoord[k] - xCoord[l]| + |yCoord[k] - yCoord[l]|.
 * The rows are not freed by this function.
 */
void initializeDistanceArray(int** DistanceArray, int citiesNum, int xCoord[], int yCoord[]) {
    int from, to;

    for (from = 0; from < citiesNum; ++from) {
        int* row = (int*)malloc(sizeof(int)*citiesNum);

        DistanceArray[from] = row;
        for (to = 0; to < citiesNum; ++to) {
            int dx = abs(xCoord[from] - xCoord[to]);
            int dy = abs(yCoord[from] - yCoord[to]);

            row[to] = dx + dy;
        }
    }
}
/*
 * Builds the edgeMinimum table in three passes:
 *   1. entry k receives the cheapest distance from city k to any other city
 *      (INT_MAX if there is no other city);
 *   2. the entries are sorted in descending order;
 *   3. each entry is replaced by the running sum taken from the end of the
 *      array, so entry k ends up as the sum of sorted entries k..citiesNum-1.
 * After the call the last entry is the single smallest per-city minimum and
 * entry 0 is the total of all of them.
 */
void initializeEdgeMinimum(int* edgeMinimum, int** DistanceArray, int citiesNum) {
    int city, other, runningTotal = 0;

    /* pass 1: cheapest outgoing edge of every city */
    for (city = 0; city < citiesNum; ++city) {
        int cheapest = INT_MAX;

        for (other = 0; other < citiesNum; ++other) {
            if (other != city && DistanceArray[city][other] < cheapest) {
                cheapest = DistanceArray[city][other];
            }
        }
        edgeMinimum[city] = cheapest;
    }

    /* pass 2: exchange sort, largest value first */
    for (city = 0; city + 1 < citiesNum; ++city) {
        for (other = city + 1; other < citiesNum; ++other) {
            if (edgeMinimum[city] < edgeMinimum[other]) {
                int held = edgeMinimum[other];

                edgeMinimum[other] = edgeMinimum[city];
                edgeMinimum[city] = held;
            }
        }
    }

    /* pass 3: suffix sums, walking from the last entry to the first */
    for (city = citiesNum; city-- > 0; ) {
        runningTotal += edgeMinimum[city];
        edgeMinimum[city] = runningTotal;
    }
}
/*
 * Converts the index of a path into the path (circle) between cities.
 *
 * The index is first decomposed into decisionTree: entry i is the quotient of
 * the remaining index by factorials[citiesNum-(i+1)], and the remainder is
 * carried to the next entry. The path always starts at city 0; for every later
 * position, decisionTree[i] selects an entry of the options list (initialised
 * to 1, 2, ..., citiesNum), after which the list is closed up over the chosen
 * slot.
 *
 * decisionTree, options and path are caller-provided scratch/output arrays of
 * at least citiesNum ints; factorials must hold at least citiesNum entries.
 */
void convertIndexToPath(long long int index, int citiesNum, int* decisionTree, int* options, int* path, long long int* factorials) {
    int pos, slot;

    /* decompose the index, most significant decision first */
    for (pos = 0; pos < citiesNum; ++pos) {
        long long int weight = factorials[citiesNum - (pos + 1)];
        int digit = (int)(index / weight);

        decisionTree[pos] = digit;
        index -= weight * ((long long int)digit);
    }

    for (slot = 0; slot < citiesNum; ++slot) {
        options[slot] = slot + 1;
    }

    /* walk the decisions, removing each chosen city from the options list */
    path[0] = 0;
    for (pos = 1; pos < citiesNum; ++pos) {
        int pick = decisionTree[pos];

        path[pos] = options[pick];
        for (slot = pick; slot < citiesNum - pos - 1; ++slot) {
            options[slot] = options[slot + 1];
        }
    }
}
/*
 * Advances decisionTree (left over from the previous path) by one step and
 * rebuilds the path from it.
 * ASSUMPTION: can be used ONLY if the last path was NOT pruned.
 *
 * The step works like an odometer: starting at position citiesNum-2 and moving
 * towards position 1, each entry is incremented modulo (citiesNum - position);
 * the walk stops at the first entry that does not wrap to 0.
 * The index and factorials parameters are accepted but not read.
 */
void convertdDecisionToPath(long long int index, int citiesNum, int* decisionTree, int* options, int* path, long long int* factorials) {
    int pos, slot;

    (void)index;
    (void)factorials;

    /* odometer-style increment of the decision vector */
    for (pos = citiesNum - 2; pos > 0; --pos) {
        int bumped = (decisionTree[pos] + 1) % (citiesNum - pos);

        decisionTree[pos] = bumped;
        if (bumped != 0) {
            break;
        }
    }

    for (slot = 0; slot < citiesNum; ++slot) {
        options[slot] = slot + 1;
    }

    /* translate the decisions into cities, removing each chosen option */
    path[0] = 0;
    for (pos = 1; pos < citiesNum; ++pos) {
        int pick = decisionTree[pos];

        path[pos] = options[pick];
        for (slot = pick; slot < citiesNum - pos - 1; ++slot) {
            options[slot] = options[slot + 1];
        }
    }
}
/*
 * Returns one index before the index of the path that has to be explored right
 * after pruning (the caller's loop increment then lands on it).
 *
 * The index is decomposed into decisionTree exactly as convertIndexToPath does,
 * every entry after position citiesVisitedInPath is reset to 0, the entries are
 * folded back into an index, factorials[citiesNum - citiesVisitedInPath] is
 * added, and that value minus one is returned. decisionTree is left holding
 * the truncated decisions.
 *
 * NOTE(review): the added weight is the one belonging to position
 * citiesVisitedInPath-1, not to position citiesVisitedInPath - confirm that
 * this is the intended size of the jump.
 */
long long int getIndexAfterPrune(long long int index, int citiesVisitedInPath, int citiesNum, int* decisionTree, long long int* factorials) {
    long long int resumeIndex = 0;
    int pos;

    /* decompose the index into one decision per position */
    for (pos = 0; pos < citiesNum; ++pos) {
        long long int weight = factorials[citiesNum - (pos + 1)];
        int digit = (int)(index / weight);

        decisionTree[pos] = digit;
        index -= weight * ((long long int)digit);
    }

    /* forget every decision taken after the pruned prefix */
    for (pos = citiesVisitedInPath + 1; pos < citiesNum; ++pos) {
        decisionTree[pos] = 0;
    }

    /* fold the truncated decisions back into an index */
    for (pos = 0; pos < citiesNum; ++pos) {
        resumeIndex += (decisionTree[pos] * factorials[citiesNum - 1 - pos]);
    }

    return resumeIndex + factorials[citiesNum - citiesVisitedInPath] - 1;
}
/*
 * Returns how many possibilities (paths) there are in a single chunk.
 *
 * allPossibilities is read from factorials[citiesNum], so the caller must
 * provide a table with at least citiesNum+1 valid entries.
 * The whole job is returned as one chunk when the problem is tiny
 * (citiesNum <= 3), when the empirical formula rounds down to 0, or when there
 * is no worker to divide the work between (workersNum <= 0). The last case
 * used to divide by zero, e.g. when the program is started with one process.
 */
long long int getChunkSize(int citiesNum, int workersNum, long long int* factorials) {
    long long int allPossibilities = factorials[citiesNum];
    long long int chunkSize;

    if (citiesNum <= 3 || workersNum <= 0) { // the job is small, or nobody can share it
        return allPossibilities;
    }

    // empirically setting the chunk size
    chunkSize = WORK_LOAD*(allPossibilities/factorials[citiesNum/2]) / (workersNum);
    if (chunkSize == 0) { // the job is small, and one worker can handle it
        return allPossibilities;
    }
    return chunkSize;
}
/*
 * Returns the number of chunks that have to be handled: factorials[citiesNum]
 * divided by chunkSize, rounded up so that a partial last chunk is counted.
 * chunkSize must not be 0.
 */
long long int getNumberOfChunks(int citiesNum, long long int chunkSize, long long int* factorials) {
    long long int total = factorials[citiesNum];
    long long int wholeChunks = total / chunkSize;

    return (total % chunkSize != 0) ? wholeChunks + 1 : wholeChunks;
}
/*
 * Returns how many workers should work on the task: the smaller of the number
 * of chunks and the number of available workers.
 */
int getNeededWorkers(long long int numberOfChunks, int workersNum) {
    if (numberOfChunks > workersNum) {
        return workersNum;
    }
    return (int)numberOfChunks;
}
Splits the problem into many sub problems and sends tasks to the workers.
each task contains the start index (the stop index is simply calculated from the chunk size) and the optimal price known so far.
The master also listens for updates of the optimal price.
when all the workers finish, the masters send them a request to update him with their optimal solution, and then decides what's the global optimum.
conventions: variables are in camelCase, consts are in ALL_CAPS, and two dimensional arrays are in PascalCasing
int runMaster(int citiesNum, int xCoord[], int yCoord[], int shortestPath[], int processesNum) {
// Variables
int doneWorkers = 0, neededWorkers, gotAnswer, junk, currentMinimum = INT_MAX;
long long int chunkSize, indexToCheck = 0, bestPathIndex, numberOfChunks;
MPI_Status status1, status2, status3;
MPI_Request request1 = MPI_REQUEST_NULL, junkRequest = MPI_REQUEST_NULL;
// Arrays
int *decisionTree, *options;
long long int *factorials, *recieveBuffer, *sendBuffer;
// Dynamic Allocations
decisionTree = (int*)malloc(citiesNum * sizeof(int));
options = (int*)malloc(citiesNum * sizeof(int));
factorials = (long long int*)malloc(citiesNum * sizeof(long long int));
recieveBuffer = (long long int*)malloc(2 * sizeof(long long int));
sendBuffer = (long long int*)malloc(2 * sizeof(long long int));
initializeFactorials(factorials, citiesNum);
long long int lastIndex = factorials[citiesNum-1]-1;
chunkSize = getChunkSize(citiesNum, processesNum-1, factorials);
numberOfChunks = getNumberOfChunks(citiesNum, chunkSize, factorials);
neededWorkers = getNeededWorkers(numberOfChunks, processesNum-1);
while (doneWorkers < neededWorkers) {
//check if a worker wants a new task
gotAnswer = 1;
MPI_Iprobe(MPI_ANY_SOURCE, ASK_FOR_TASK, MPI_COMM_WORLD, &gotAnswer, &status1);
if (gotAnswer) {
MPI_Recv(&junk, 0, MPI_INT, MPI_ANY_SOURCE, ASK_FOR_TASK, MPI_COMM_WORLD, &status1); //blocking recieve since we need the request to complete so we'd know who should get the task
if (indexToCheck <= lastIndex) {
// the master sends the current minimum, and a new job to the worker
sendBuffer[0] = currentMinimum;
sendBuffer[1] = indexToCheck;
indexToCheck += chunkSize;
MPI_Irsend(sendBuffer, 2, MPI_LONG_LONG_INT, status1.MPI_SOURCE, DISTRIBUTE_NEW_TASK, MPI_COMM_WORLD, &request1); // we're guaranteed that the worker called IRecv and ready to get a task. no need to block since we can continue doing are own calculations.
} else { // the master kills the worker
MPI_Irsend(&junk, 0, MPI_INT, status1.MPI_SOURCE, KILL, MPI_COMM_WORLD, &junkRequest); // we're guaranteed that the worker called IRecv and ready to get a task. no need to block since we can continue doing are own calculations.
gotAnswer = 1;
if(gotAnswer) { // the master recieves a miminal price from one of the workers and decides if it's the global minimum
MPI_Recv(recieveBuffer, 1, MPI_LONG_LONG_INT, MPI_ANY_SOURCE, SEND_MINIMUM, MPI_COMM_WORLD, &status2); // blocking, since we're going to use the recieve buffer
currentMinimum = (currentMinimum <= recieveBuffer[0]) ? currentMinimum : (int)recieveBuffer[0];
gotAnswer = 1;
MPI_Iprobe(MPI_ANY_SOURCE, SEND_PATH, MPI_COMM_WORLD, &gotAnswer, &status3);
if(gotAnswer) {
// the master recieves a miminal path and price from one of the workers and decides if it's the global minimum
MPI_Recv(recieveBuffer, 2, MPI_LONG_LONG_INT, MPI_ANY_SOURCE, SEND_PATH, MPI_COMM_WORLD, &status3); // blocking, since we're going to use the recieve buffer
if(recieveBuffer[0] <= currentMinimum){
currentMinimum = (int)recieveBuffer[0];
bestPathIndex = recieveBuffer[1];
} //while
free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(sendBuffer);
convertIndexToPath(bestPathIndex, citiesNum, decisionTree, options, shortestPath, factorials);
return currentMinimum;
gets tasks from the master and process them until there are no more tasks to handle.
in each task, we go through all the possibilities in the current chunk, but skip paths that are heavier from the current known optimal path.
when we discover a new optimal path, we update the rest of the threads if necesssary (first, we check if we got a new optimal weight from them).
conventions: variables are in camelCase, consts are in ALL_CAPS, and two dimensional arrays are in PascalCasing
void runWorker(int citiesNum, int xCoord[], int yCoord[], int shortestPath[], int processesNum) {
int sum = 0, gotAnswer = 1, PRUNE_FACTOR = citiesNum - 3, LAST_CITY = citiesNum-1, indexReachedInPath, pruned, sumUntilPruned = 0, IndexUntilPruned = 0, doneWorkers = 0, neededWorkers, junk, myCurrentMinimum = INT_MAX, othersCurrentMinimum = INT_MAX, pid, k;
long long int numberOfChunks, chunkSize, indexToCheck = 0, startIndex, stopIndex, i, bestPathIndex = -1, lastIndex;
MPI_Status status1, status2;
MPI_Request request1 = MPI_REQUEST_NULL, junkRequest = MPI_REQUEST_NULL;
int *decisionTree, *edgeMinimum, *myCurrentPath, *options, **DistanceArray;
long long int *factorials, *recieveBuffer, *sendBuffer;
// Dynamic Allocations
decisionTree = (int*)malloc(citiesNum * sizeof(int));
edgeMinimum = (int*)malloc(sizeof(int)*citiesNum);
myCurrentPath = (int*)malloc(citiesNum * sizeof(int));
options = (int*)malloc(citiesNum * sizeof(int));
factorials = (long long int*)malloc(citiesNum * sizeof(long long int));
recieveBuffer = (long long int*)malloc(2 * sizeof(long long int));
sendBuffer = (long long int*)malloc(2 * sizeof(long long int));
DistanceArray = (int**)malloc(sizeof(int*)*citiesNum);
initializeFactorials(factorials, citiesNum);
initializeDistanceArray(DistanceArray, citiesNum, xCoord, yCoord);
initializeEdgeMinimum(edgeMinimum, DistanceArray, citiesNum);
MPI_Comm_rank(MPI_COMM_WORLD, &pid);
lastIndex = factorials[citiesNum-1]-1;
chunkSize = getChunkSize(citiesNum, processesNum-1, factorials);
numberOfChunks = getNumberOfChunks(citiesNum, chunkSize, factorials);
neededWorkers = getNeededWorkers(numberOfChunks, processesNum-1);
if (pid > neededWorkers){ //free memory and exit
for (k=0; k < citiesNum; ++k) {
free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(sendBuffer); free(edgeMinimum); free(DistanceArray); free(myCurrentPath);
MPI_Irecv(recieveBuffer, 2, MPI_LONG_LONG_INT, MASTER_ID, MPI_ANY_TAG, MPI_COMM_WORLD, &request1); // getting ready to recieve a new task from the master. no need to block
MPI_Ssend(&junk, 0, MPI_INT, MASTER_ID, ASK_FOR_TASK, MPI_COMM_WORLD); // asking the master for a new task. synced & blocking, since we don't have anything else to do until we get a new task
while(1) {
MPI_Wait(&request1, &status1); //avoid busy-wait
// the worker got a new job from the master. recieveBuffer[0] contains the server's currentMinimum. recieveBuffer[1] contains indexToCheck
othersCurrentMinimum = (recieveBuffer[0] < othersCurrentMinimum) ? (int)recieveBuffer[0] : othersCurrentMinimum;
startIndex = recieveBuffer[1];
stopIndex = (startIndex + chunkSize >= lastIndex) ? lastIndex + 1 : startIndex + chunkSize;
pruned = 1;
indexReachedInPath = 0;
sum = 0;
for(i = startIndex; i < stopIndex; ++i) {
if (pruned) { // calculate the current path from the index
convertIndexToPath(i, citiesNum, decisionTree, options, myCurrentPath, factorials);
} else { // calculate the current path from the last path (decision tree)
convertdDecisionToPath(i, citiesNum, decisionTree, options, myCurrentPath, factorials);
sum = 0;
indexReachedInPath = 0;
pruned = 0;
for(; indexReachedInPath < LAST_CITY; ++indexReachedInPath) {
sum += getDistance(myCurrentPath[indexReachedInPath], myCurrentPath[indexReachedInPath+1]);
if (indexReachedInPath < PRUNE_FACTOR && sum + edgeMinimum[indexReachedInPath] >= othersCurrentMinimum) {
pruned = 1;
sum -= getDistance(myCurrentPath[indexReachedInPath], myCurrentPath[indexReachedInPath+1]);
if (indexReachedInPath == 0) {
i = getIndexAfterPrune(i,1,citiesNum, decisionTree, factorials);
} else {
i = getIndexAfterPrune(i,indexReachedInPath,citiesNum, decisionTree, factorials);
if(pruned) continue;
sum += getDistance(myCurrentPath[LAST_CITY], myCurrentPath[0]); //return from the last city to the first
if(sum < othersCurrentMinimum) {
myCurrentMinimum = sum;
bestPathIndex = i;
//check for a new global minimum
gotAnswer = 1;
if(gotAnswer) {
MPI_Recv(recieveBuffer, 1, MPI_INT, MPI_ANY_TAG, SEND_MINIMUM, MPI_COMM_WORLD, &status2); // blocking, since we're going to use the recieve buffer
othersCurrentMinimum = (recieveBuffer[0] < othersCurrentMinimum) ? (int)recieveBuffer[0] : othersCurrentMinimum;
if (myCurrentMinimum < othersCurrentMinimum) {
othersCurrentMinimum = sum;
for (k = 0; k < processesNum; ++k) {
if (junk == pid) continue;
MPI_Issend(&myCurrentMinimum, 1, MPI_INT, k, SEND_MINIMUM, MPI_COMM_WORLD, &junkRequest); // sending everyone our minimum, copying it to their memory. obviously, no need to block.
//send my minimum to the master if it's the global minimum
//if (myCurrentMinimum <= othersCurrentMinimum) {
// MPI_Issend(&myCurrentMinimum, 1, MPI_INT, MASTER_ID, SEND_MINIMUM, MPI_COMM_WORLD, &junkRequest); // sending the master our minimum.
// get a new task from the master
MPI_Irecv(recieveBuffer, 2, MPI_LONG_LONG_INT, MASTER_ID, MPI_ANY_TAG, MPI_COMM_WORLD, &request1); // getting ready to recieve a new task from the master. no need to block
MPI_Ssend(&junk, 0, MPI_INT, MASTER_ID, ASK_FOR_TASK, MPI_COMM_WORLD); // asking the master for a new task. blocking, since we don't have anything else to do until we get a new task
if(status1.MPI_TAG == KILL) { // free resources, send the master the optimal path and price, and die.
for (k=0; k < citiesNum; ++k) {
free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(edgeMinimum); free(DistanceArray); free(myCurrentPath);
sendBuffer[0] = myCurrentMinimum;
sendBuffer[1] = bestPathIndex;
MPI_Ssend(sendBuffer, 2, MPI_LONG_LONG_INT, MASTER_ID, SEND_PATH, MPI_COMM_WORLD); // synced & blocking, since we don't have anything else to do until the master gets the information
} // while
// The static parellel algorithm main function. runs the master and the workers.
int tsp_main(int citiesNum, int xCoord[], int yCoord[], int shortestPath[])
int rank, processesNum, result = 0;
MPI_Comm_size(MPI_COMM_WORLD, &processesNum);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if (rank == 0) {
result = runMaster(citiesNum, xCoord, yCoord, shortestPath, processesNum);
} else {
runWorker(citiesNum, xCoord, yCoord, shortestPath, processesNum);
return result;
I tried your code and i managed to get rid of the error.
I received the signal "floating point exception: integer divide by zero". I looked for where the exception occurred and found that it came from the first division by fact (index/fact). It was raised by process 0, so I looked at runMaster(). There, convertIndexToPath() is called right after free(factorials), so it reads the factorials array after it has been freed. I swapped these two lines and the error disappeared.
This way may be the right one :
convertIndexToPath(bestPathIndex, citiesNum, decisionTree, options, shortestPath, factorials);
free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(sendBuffer);
However, I tried the program with 2 and with 3 processes and the outputs were different... I am surprised that it worked before!
Bye, Francis
I'm just starting out with MPI programming and decided to make a simple distributed qsort using OpenMPI. To distribute parts of the array I want to sort I'm trying to use MPI_Scatterv, however the following code segfaults on me:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define ARRAY_SIZE 26
#define BUFFER_SIZE 2048
int main(int argc, char** argv) {
int my_rank, nr_procs;
int* data_in, *data_out;
int* sizes;
int* offsets;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nr_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// everybody generates the control tables
int nr_workers = nr_procs-1;
sizes = malloc(sizeof(int)*nr_workers);
offsets = malloc(sizeof(int)*nr_workers);
int nr_elems = ARRAY_SIZE/nr_workers;
// basic distribution
for (int i = 0; i < nr_workers; ++i) {
sizes[i] = nr_elems;
// distribute the remainder
int left = ARRAY_SIZE%nr_workers;
int curr_worker = 0;
while (left) {
curr_worker = (++curr_worker)%nr_workers;
// offsets
int curr_offset = 0;
for (int i = 0; i < nr_workers; ++i) {
offsets[i] = curr_offset;
curr_offset += sizes[i];
if (my_rank == 0) {
// root
data_in = malloc(sizeof(int)*ARRAY_SIZE);
data_out = malloc(sizeof(int)*ARRAY_SIZE);
for (int i = 0; i < ARRAY_SIZE; ++i) {
data_in[i] = rand();
for (int i = 0; i < nr_workers; ++i) {
printf("%d at %d\n", sizes[i], offsets[i]);
MPI_Scatterv (data_in, sizes, offsets, MPI_INT, data_out, ARRAY_SIZE, MPI_INT, 0, MPI_COMM_WORLD);
} else {
// worker
printf("%d has %d elements!\n",my_rank, sizes[my_rank-1]);
// alloc the input buffer
data_in = malloc(sizeof(int)*sizes[my_rank-1]);
MPI_Scatterv(NULL, NULL, NULL, MPI_INT, data_in, sizes[my_rank-1], MPI_INT, 0, MPI_COMM_WORLD);
printf("%d got:\n", my_rank);
for (int i = 0; i < sizes[my_rank-1]; ++i) {
printf("%d ", data_in[i]);
return 0;
How would I go about using Scatterv? Am I doing something wrong with allocating my input buffer from inside the worker code?
I changed some part in your code to get something working.
MPI_Scatterv() sends data to every process in the communicator, including the root itself. In your program, process 0 tells MPI_Scatterv() that it expects ARRAY_SIZE integers, but sizes[0] is much smaller.
There is a similar problem on the other processes: MPI_Scatterv() sends sizes[my_rank] integers to each rank, but your workers expect sizes[my_rank-1]...
Here is a code that scatters data_in from 0 to all processors, including 0. Therefore i added 1 to nr_workers :
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define ARRAY_SIZE 26
#define BUFFER_SIZE 2048
int main(int argc, char** argv) {
int my_rank, nr_procs;
int* data_in, *data_out;
int* sizes;
int* offsets;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nr_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// everybody generates the control tables
int nr_workers = nr_procs;
sizes = malloc(sizeof(int)*nr_workers);
offsets = malloc(sizeof(int)*nr_workers);
int nr_elems = ARRAY_SIZE/nr_workers;
// basic distribution
for (int i = 0; i < nr_workers; ++i) {
sizes[i] = nr_elems;
// distribute the remainder
int left = ARRAY_SIZE%nr_workers;
int curr_worker = 0;
while (left) {
curr_worker = (++curr_worker)%nr_workers;
// offsets
int curr_offset = 0;
for (int i = 0; i < nr_workers; ++i) {
offsets[i] = curr_offset;
curr_offset += sizes[i];
if (my_rank == 0) {
// root
data_in = malloc(sizeof(int)*ARRAY_SIZE);
for (int i = 0; i < ARRAY_SIZE; ++i) {
data_in[i] = rand();
printf("%d %d \n",i,data_in[i]);
for (int i = 0; i < nr_workers; ++i) {
printf("%d at %d\n", sizes[i], offsets[i]);
} else {
printf("%d has %d elements!\n",my_rank, sizes[my_rank]);
data_out = malloc(sizeof(int)*sizes[my_rank]);
MPI_Scatterv (data_in, sizes, offsets, MPI_INT, data_out, sizes[my_rank], MPI_INT, 0, MPI_COMM_WORLD);
printf("%d got:\n", my_rank);
for (int i = 0; i < sizes[my_rank]; ++i) {
printf("%d ", data_out[i]);
return 0;
Regarding memory management, data_in and data_out should be freed at the end of the code.
Is that what you wanted to do? Good luck with qsort! I think you are not the first to sort integers using MPI; see "parallel sort using mpi". Generating the random numbers on process 0 and then scattering them is the right way to go. I think you will be interested in its TD_Trier() function for the communication part, even if you replace tri_fusion(T, 0, size - 1); with qsort(...)...
I'm working on a project about the Parallel Bitonic Sorting using MPI and C to implement it. The program I developed works but it's not efficient since a simple QuickSort (sigh) beats it in terms of execution time. Maybe the problem is about the cost of communication but I don't get how to improve that, so here's the code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include "bs-util.h"
#include "quicksort.h"
#define TAG 1
/* Run this program knowing that:
* 1) The number of cores must be a power of 2
* 2) The length of the array to order must be a power of 2
* Exec Example: mpirun -n 4 ./bs 1024 1024
* */
void exchange(FILE *log, int i, int partner, int up);
int countTransfer = 0;
int *myArray, *partnerArray;
int currentPartner = -1;
int rank, size;
MPI_Status status;
int verbose = 0; //this var toggles on(1) or off(0) some useful prints for debugging purpose
int amount=0;
int main(int argc, char *argv[])
int *array;
int i=0;
int carry=0;
int up=1;
int count=0;
struct timeval tim;
FILE *log;
char logName[15] = "log/";
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
/* Time meter */
srand((double) time(NULL));
gettimeofday(&tim, NULL);
double t1=tim.tv_sec+(tim.tv_usec/1000000.0);
snprintf(logName+4, 10, "%d",rank);
log = fopen(logName,"w");
printf("Hello world from process %d of %d.\n", rank, size);
/* INPUT */
if (rank==0)
if (argc==2) /* by file */
FILE *input = fopen(argv[1],"r");
char line[20];
count = 0;
while(fgets(line,20,input) != NULL)
array = (int *)malloc(count*sizeof(int));
input = fopen(argv[1],"r");
i = 0;
while(fgets(line,20,input) != NULL)
array[i] = atoi(line);
if (argc==3) /* by command line */
count = atoi(argv[1]);
int max = atoi(argv[2]);
array = (int *)malloc(count*sizeof(int));
for (i=0; i<count; i++)
array[i] = rand()%max;
printf("\n\n ----------- ERRORE NEI PARAMETRI DI INPUT ----------- \n\n");
return 1;
if (verbose){
printf("Initial array:\n");
for (i=0; i<count; i++)
printf("%d\t", array[i]);
/* Everyone wait eachother */
carry = count%size;
amount = count/size + carry;
printf("\nParametri: amount=%d carry=%d\n\n", amount, carry);
int startIndex = amount;
myArray = (int *)malloc(amount*sizeof(int));
/* Buffer (partner) */
partnerArray = (int *)malloc(amount*sizeof(int));
for (i=0; i<amount; i++)
myArray[i] = array[i];
printf("Processo %d riceve amount=%d e up=%d\n", rank, amount, up);
if (verbose){
printf("Mia porzione ---> ");
for (i=0; i<amount; i++)
printf("%d\t", myArray[i]);
/* Sending the big array's chunks */
for (i=1; i<size; i++)
up = (i+1) % 2;
MPI_Send(&up, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(&amount, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(&carry, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(array+startIndex, amount-carry, MPI_INT, i, TAG, MPI_COMM_WORLD);
startIndex += amount-carry;
MPI_Recv(&up, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
MPI_Recv(&amount, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
MPI_Recv(&carry, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
myArray = (int *)malloc(amount*sizeof(int));
partnerArray = (int *)malloc(amount*sizeof(int)); /* Buffer (partner) */
MPI_Recv(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
/* Experimental padding: every chunck has the same amount of items. */
for (i=amount-carry; i<amount; i++)
myArray[i] = 0;
printf("Processo %d riceve amount=%d e up=%d\n", rank, amount-carry, up);
if (verbose){
printf("Mia porzione ---> ");
for (i=0; i<amount; i++)
printf("%d\t", myArray[i]);
/* CORE */
/* Local Quicksort */
int result = quickSort(&myArray[0], amount); //this function is written within src/quicksort.c
if (verbose){
if (result == 1)
printf("Quick Sort: FAIL \n");
printf("\nLa mia porzione ordinata (processo %d)\n", rank);
for(i=0; i<amount; i++)
printf("%d ",myArray[i]);
printf ("\n");
int j;
for (up=8;up<=amount*size;up=2*up)
for (j=up>>1;j>0;j=j>>1)
for (i=0;i<amount*size;i++)
int partner=i^j;
if ((partner)>i)
if (rank!=0)
MPI_Send(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD);
gettimeofday(&tim, NULL);
double t2=tim.tv_sec+(tim.tv_usec/1000000.0);
if (rank==0)
myArray = (int *)realloc(myArray,sizeof(int)*amount*size);
for (i=1; i<size; i++)
MPI_Recv(myArray+i*amount, amount, MPI_INT, i, TAG, MPI_COMM_WORLD, &status);
printf("\nTempo trascorso %6f\n", t2-t1);
fprintf(log,"\n\n----------> Array Iniziale <----------\n");
fprintf(log,"\n\n----------> Array Finale <----------\n");
fprintf(log,"Numero di chunk scambiati: %d\n",countTransfer);
return 0;
void exchange(FILE *log, int i, int partner, int up)
int rank_i = i/amount;
int rank_partner = partner/amount;
int offset_i = i%amount;
int offset_partner = partner%amount;
/*if (verbose)
fprintf(log,"\nnewAmount = %d - Rank_i = %d - Rank_partner = %d - Offset_i = %d - Offset_partner = %d \n",amount,rank_i,rank_partner,offset_i,offset_partner);
if ((rank_i != rank) && (rank_partner != rank))
if ((rank_i == rank) && (rank_partner == rank))
if (((up==0) && (myArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < myArray[offset_partner])))
int temp = myArray[offset_i];
myArray[offset_i] = myArray[offset_partner];
myArray[offset_partner] = temp;
if (rank_i == rank && rank_partner != rank)
if (currentPartner != rank_partner)
MPI_Send(myArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD);
MPI_Recv(partnerArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD, &status);
currentPartner = rank_partner;
if (((up==0) && (myArray[offset_i] > partnerArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < partnerArray[offset_partner])))
myArray[offset_i] = partnerArray[offset_partner];
if (rank_i != rank && rank_partner == rank)
if (currentPartner != rank_i)
MPI_Recv(partnerArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD, &status);
MPI_Send(myArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD);
currentPartner = rank_i;
if (((up==0) && (partnerArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (partnerArray[offset_i] < myArray[offset_partner])))
myArray[offset_partner] = partnerArray[offset_i];
And here's the Make file:
CC = mpicc
LFLAGS = -lm
PROGS = ./bs
PROGS_SRC = src/bs-util.c src/bs.c src/quicksort.c
Help would be very appreciated :)
References: http://goo.gl/nXt4p
Remember that bitonic sort has a parallel time complexity of roughly (N/P)(log N)^2, compared with N log N for a serial quicksort. This means that when log N > P (P being the number of processors), even the serial quicksort should beat bitonic sort (and that is before accounting for implementation-dependent constant factors or for communication). Bitonic sort is meant for truly parallel machines (it is pretty good on GPUs), not for a grid of a few PCs, which is probably what you have.
Many sends/receives (as in exchange function) of small data chunks badly affect performance. More efficient is combining small chunks into one buffer and sending it.
Um. I don't see you doing any collective communication other than barriers...