I'm currently developing a piece of code that runs on 10 processes. Process 0 reads a total of 10000 integers from lab7.csv and then distributes the array to all of the processes. To do this, I declared an array intArray[10000] that exists in every process. The correct sum is 49893236.
The following code uses the 10 processes to compute the sum of these numbers: each process computes the partial sum of its 1000 values, and the result is printed to the screen.
However, the following error is shown.
I can't figure out what the problem is. Please help me with this.
#include <mpi.h>
#include <stdio.h>
#include <string.h>
int main()
{
int rank, nodes;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nodes);
MPI_Status status;
int intArray[10000];
int subIntArray[1000];
if(rank == 0) {
// Substitute the full file path for the string file_path
FILE *fp = fopen("./lab7.csv", "r");
int i = 0;
int num;
if (!fp) {
printf("Can't open file\n");
} else {
while (fscanf(fp, "%d", &num) > 0)
{
intArray[i] = num;
i++;
}
// Close the file
fclose(fp);
}
}
MPI_Scatter(intArray, 10000, MPI_INT, subIntArray, 1000, MPI_INT, 0, MPI_COMM_WORLD);
int ans = 0;
int total = 0;
int start = rank * 1000;
int end = start + 999;
for(int i = start; i <= end; i++) {
ans = ans + subIntArray[i];
}
if(rank != 0) {
MPI_Ssend(&ans, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
} else {
total = ans;
for(int j = 1; j < 10; j++) {
MPI_Recv(&ans, 1, MPI_INT, j, 0, MPI_COMM_WORLD, &status);
total += ans;
}
printf("Total is %d\n", total);
}
MPI_Finalize();
return 0;
}
The PBS job file is as follows:
#PBS -l nodes=2
#PBS -l walltime=00:02:00
#PBS -l select=5
cat $PBS_NODEFILE
NPROC=10
cd $PBS_O_WORKDIR
MPISIZE=$NPROC
MPIPROG=`basename $PBS_JOBNAME .pbs`
echo 'Running MPI program' $MPIPROG 'on' $MPISIZE 'processes'
echo 'Started at' `date`
echo '--------------------------------------------------------------------------------'
(time mpirun -n $MPISIZE ./$MPIPROG) 2>&1
echo '--------------------------------------------------------------------------------'
echo 'Finished at' `date`
This is the error message that is shown on the terminal.
The orientation of MPI_Scatter is a bit different from the one you have.
You say: I have NTOT data elements and I want to send them to NODECOUNT nodes, so each node should process NTOT / NODECOUNT elements. (This doesn't work if NTOT is not an exact multiple of NODECOUNT.)
But MPI_Scatter is oriented the other way: I have NPER elements that each node should process, and NODECOUNT nodes, so the total number of elements is NTOT = NPER * NODECOUNT. This is how the manpage example shows it.
So you want to give MPI_Scatter a count of NPER, not NTOT. And you want the send and receive counts to match.
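In other words, the send count is the per-rank count. A minimal sketch of the corrected call, using the hard-coded sizes from your code:

MPI_Scatter(intArray, 1000, MPI_INT,      /* root sends 1000 ints to EACH rank */
            subIntArray, 1000, MPI_INT,   /* each rank receives 1000 ints */
            0, MPI_COMM_WORLD);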
Also, because MPI_Scatter does the split for you, the slave nodes should not use start/end as you calculated, but always do:
start = 0;
end = NPER - 1;
Also, in your code ...
You were indexing into subIntArray as if you could access 0-9999 instead of 0-999, so you were going beyond the end of the array and had UB (undefined behavior)
It's a bit shaky to hardwire 10, 1000, and 10000 everywhere. Better to use a #define and the actual node count from nodes.
And, you assume you have 10000 valid input data elements rather than calculating this from your i index variable when you do the fscanf loop.
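A minimal sketch of that last point, reusing the variables from your code (fp, intArray, rank, nodes); it is not in the corrected program below because I synthesized the input data there:

int nread = 0;
if (rank == 0) {
    while (fscanf(fp, "%d", &intArray[nread]) == 1)   /* count while reading */
        nread++;
    fclose(fp);
}
MPI_Bcast(&nread, 1, MPI_INT, 0, MPI_COMM_WORLD);     /* every rank learns the real count */
int nper = nread / nodes;                             /* assumes nread is a multiple of nodes */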
Here is the corrected code, with some extra debug code I used.
I used preprocessor conditionals to denote old vs new code (e.g.):
#if 0
// old code
#else
// new code
#endif
#if 1
// new code
#endif
Also, I didn't have your data files, so I had to synthesize the input data.
Anyway, here it is:
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
int rank;
int nodes;
FILE *xfdbg;
#define dbgprtattr(_lvl) \
__attribute__((__format__(__printf__,_lvl,_lvl + 1)))
#if DEBUG || _USE_ZPRT_
#define dbgprt(_fmt...) \
_dbgprt(_fmt)
#else
#define dbgprt(_fmt...) \
do { } while (0)
#endif
void dbgprtattr(1)
_dbgprt(const char *fmt,...)
{
va_list ap;
char buf[10000];
char *bp = buf;
bp += sprintf(bp,"[%d] ",rank);
va_start(ap,fmt);
bp += vsprintf(bp,fmt,ap);
va_end(ap);
fputs(buf,xfdbg);
fflush(xfdbg);
}
//#define NTOT 10000
//#define NPER (NTOT / nodes)
int
main()
{
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &nodes);
MPI_Status status;
char logf[100];
sprintf(logf,"log_%2.2d",rank);
xfdbg = fopen(logf,"w");
#if 0
int BIGSIZE = nodes * 1000;
int NPER = NTOT / nodes;
#else
//int NPER = 1000;
int NPER = 50;
//int NTOT = NPER * nodes;
#endif
int val;
int truetotal = 0;
#if 0
int intArray[NTOT];
#else
int intArray[nodes][NPER];
#endif
#if 1
int subIntArray[NPER];
#else
int subIntArray[NTOT];
#endif
if (rank == 0) {
#if 0
// Substitute the full file path for the string file_path
FILE *fp = fopen("./lab7.csv", "r");
int i = 0;
int num;
if (!fp) {
printf("Can't open file\n");
}
else {
while (fscanf(fp, "%d", &num) > 0) {
intArray[i] = num;
i++;
}
// Close the file
fclose(fp);
}
#endif
for (int nd = 0; nd < nodes; ++nd) {
for (int i = 0; i < NPER; ++i) {
val = (nd << 16) | i;
intArray[nd][i] = val;
truetotal += val;
}
}
}
dbgprt("main: hello\n");
#if 0
MPI_Scatter(intArray, NTOT, MPI_INT,
subIntArray, NPER, MPI_INT,
0, MPI_COMM_WORLD);
dbgprt("main: post\n");
#endif
#if 1
MPI_Scatter(intArray, NPER, MPI_INT,
subIntArray, NPER, MPI_INT,
0, MPI_COMM_WORLD);
dbgprt("main: post\n");
#endif
#if 0
MPI_Scatter(intArray, NTOT, MPI_INT,
subIntArray, NTOT, MPI_INT,
0, MPI_COMM_WORLD);
dbgprt("main: post\n");
#endif
//sleep(10);
int ans = 0;
int total = 0;
#if 0
int start = rank * NPER;
int end = start + NPER - 1;
#else
int start = 0;
int end = NPER - 1;
#endif
dbgprt("main: START start=%d end=%d\n",start,end);
for (int i = start; i <= end; i++) {
dbgprt("main: DATA i=%d sub=%8.8X\n",i,subIntArray[i]);
ans = ans + subIntArray[i];
}
dbgprt("main: loopdone ans=%d\n",ans);
if (rank != 0) {
MPI_Ssend(&ans, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
else {
total = ans;
for (int j = 1; j < nodes; j++) {
MPI_Recv(&ans, 1, MPI_INT, j, 0, MPI_COMM_WORLD, &status);
total += ans;
}
printf("Total is %d\n", total);
printf("Total is %d (TRUE)\n", truetotal);
}
fclose(xfdbg);
MPI_Finalize();
return 0;
}
Related
I am trying to read a text file and count the occurrences of each letter from a to z, ignoring case, other characters, and spaces. However, I cannot get the correct counts when I change the process count. The number of processes must be between 1 and 100. With 2 processes it shows the correct counts, but when I increase the number of processes it shows wrong counts.
//FILE READING USING MPI FUNCTION
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char* argv[]) {
int size;
int rank;
int tag = 0;
int start;
int letterCounts[26];
MPI_Status status;
int chunksize;
MPI_Offset file_size;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
if(size >= 101){
printf(" Process failed");
MPI_Abort(MPI_COMM_WORLD, 1);
}
MPI_File file;
MPI_File_open(MPI_COMM_WORLD, "warandpeace.txt", MPI_MODE_RDONLY, MPI_INFO_NULL, &file);
if (file == MPI_FILE_NULL) {
printf("Error opening file!\n");
MPI_Abort(MPI_COMM_WORLD, 1);
}
// find the file size
MPI_File_get_size(file, &file_size);
//printf("Process %d: filesize %d\n", rank,file_size);
// Allocate a buffer to hold the file contents
char* buffer = (char*)malloc(file_size * sizeof(char));
if(rank==0){
// Read the file into the buffer
MPI_File_read(file, buffer, file_size, MPI_CHAR, MPI_STATUS_IGNORE);
// Close the file
//MPI_File_close(&file);
//free(buffer);
//deviding the file
chunksize = file_size/(size - 1);
//printf("chunksize is %d\n",chunksize);
int end = chunksize;
for (int i = 1; i < size; i++) {
int start = 0;
MPI_Send(&buffer[start], end - start, MPI_CHAR, i, 0, MPI_COMM_WORLD);
start = end;
end += chunksize;
if (i == size - 2) {
end = file_size;
}
printf("destination rank %d: filesize%d: chunksize = %d\n", i,file_size, chunksize);
}
}
else{
// Receive the file size from process 0
//long file_size;
// Allocate memory for the chunks
//int chunksize = file_size / size;
char* buffer = (char*)malloc(file_size * sizeof(char));
// Receive the chunk of the file and count the letters
MPI_Recv(buffer, file_size, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);
int count;
MPI_Get_count(&status, MPI_CHAR, &count);
printf("count%d: status = %d\n", count, status);
for (int j = 0; j < count; j++) {
char c = buffer[j];
if (c >= 'a' && c <= 'z') {
letterCounts[c - 'a']++;
}
else if (c >= 'A' && c <= 'Z') {
letterCounts[c - 'A']++;
}
}
}
// Reduce the counts from each process to get the total count
int totalCounts[26];
MPI_Reduce(letterCounts, totalCounts, 26, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (rank == 0) {
// Print the total counts
for (int i = 0; i < 26; i++) {
printf("%c: %d\n", 'a' + i, totalCounts[i]);
}
}
MPI_Finalize();
return 0;
}
I am building an example with a variable number of processes and binding them to the sockets in a small network whose machines have different architectures and numbers of CPUs.
I compile and run with:
mpiicpc avg_4.c -qopenmp -axSSE4.2,AVX,CORE-AVX2 -O3 -par-affinity=noverbose,granularity=core,compact -o b
mpiexec.hydra -machinefile f19 -genv I_MPI_PIN=1 -genv I_MPI_PIN_DOMAIN=socket -genv I_MPI_PIN_ORDER=compact -n 1 ./b
The network (master + slave19) f19 is:
s19:1
ma:1
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <sched.h>
#include <mpi.h>
int *create_mlu(int n_omp, int ws) {
int *mlu = (int *)calloc(n_omp * ws, sizeof(int));
for (int i = 0; i < ws; i++)
for (int j = 0; j < n_omp; j++)
mlu[j + i*n_omp] = j + 100 * i;
return mlu;
}
int *C4_Re(int *mal, int n_omp, int wr, int ws) {
int *rM8 = (int *)malloc(sizeof(int) * n_omp);
char nod[MPI_MAX_PROCESSOR_NAME];
int n_l; MPI_Get_processor_name(nod, &n_l);
#pragma omp parallel for
for (int i = 0; i < n_omp; i++) {
rM8[i] = mal[i] + 10 * omp_get_thread_num();
printf("ws%2d\t\tmpi%2d\t\tmaxTh%2d\t\tmaxPr%2d\t\tomp%2d\t\tcore%3d\t\trM8%4d\t\tnod %s\n", ws, wr, omp_get_num_threads(), omp_get_num_procs(), omp_get_thread_num(), sched_getcpu(), rM8[i], nod);
}
return rM8;
}
int main(void) {
MPI_Init(NULL, NULL);
int ts[2] = {7, 9}; //no of processes
for (int t = 0; t < 2; t++) {
int ws = ts[t];
int errcodes[ws];
MPI_Comm parentcomm, intercomm;
MPI_Comm_get_parent(&parentcomm);
if (parentcomm == MPI_COMM_NULL) {
MPI_Comm_spawn("./b", MPI_ARGV_NULL, ws, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &intercomm, errcodes);
//printf("I'm the parent.\n");
}
else {
int wr; MPI_Comm_rank(MPI_COMM_WORLD, &wr);// printf("wr %d\n", wr);
//int ps; MPI_Comm_size(parentcomm, &ps);// printf("ps %d\n", ps);
//int pr; MPI_Comm_rank(parentcomm, &pr);// printf("pr %d\n", pr);
int n_omp = 8, *mlu = NULL;
if (wr == 0) {
mlu = create_mlu(n_omp, ws);
//for (int i = 0; i < n_omp*ws; i++) printf("\tmlu[%2d] = %d\n", i, mlu[i]);
}
int *mal = (int *)malloc(n_omp * sizeof(int));
MPI_Scatter(mlu, n_omp, MPI_INT, mal, n_omp, MPI_INT, 0, MPI_COMM_WORLD);
//for (int i = 0; i < n_omp; i++) printf("\t\tmal[%2d] = %d\trank %d\n", i, mal[i], wr);
int *rM8 = NULL;
rM8 = C4_Re(mal, n_omp, wr, ws);
int *rS8 = NULL;
if (wr == 0)
rS8 = (int *)malloc(sizeof(int) * ws * n_omp);
MPI_Gather(rM8, n_omp, MPI_INT, rS8, n_omp, MPI_INT, 0, MPI_COMM_WORLD);
if (wr == 0) {
//for (int i = 0; i < n_omp * ws; i++) printf("\t\trS8[%2d] = %d\n", i, rS8[i]);
free(mlu);
free(rS8); }
free(mal);
free(rM8);
}
//fflush(stdout);
}
fflush(stdout);
MPI_Finalize();
return 0;
}
I have a memory corruption that I need help finding.
Some results look like:
ws 7 rM8-37253944 nod ma mpi 7 maxTh 6 maxPr 6 omp 4 core 4
but they should look like:
ws 7 rM8 624 nod ma mpi 6 maxTh 6 maxPr 6 omp 2 core 2
Additional questions:
1 - Why is using parentcomm for Scatter and Gather not correct? In my opinion, parentcomm is the new communicator.
2 - Should I create different communicators for 7 and 9?
3 - mpicc gives me wrong results and I don't know why.
I am a beginner in C. I have to create a distributed architecture with the MPI library. Here is the code:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
int main(int argc, char **argv)
{
int N, w = 1, L = 2, M = 50; // with N number of threads
int T= 2;
int myid;
int buff;
float mit[N][T]; // I initialize a 2d array
for(int i = 0; i < N; ++i){
mit[i][0]= M / (float) N;
for (int j = 1; j < T; ++j){
mit[i][j] = 0;
}
}
float tab[T]; // 1d array
MPI_Status stat;
/*********************************************
start
*********************************************/
MPI_Init(&argc,&argv); // Initialisation
MPI_Comm_size(MPI_COMM_WORLD, &N);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
for(int j = 0; j < T; j++) {
for(int i = 0; i < N; i++) { // I iterate for each slave
if (myid !=0) {
float y = ((float) rand()) / (float) RAND_MAX;
mit[i][j + 1] = mit[i][j]*(1 + w * L * y);
buff=mit[i][j+1];
MPI_Send(&buff, 128, MPI_INT, 0, 0, MPI_COMM_WORLD); // I send the variable buff to the master
buff=0;
}
if( myid == 0 ) { // Master
for(int i = 1; i < N; i++){
MPI_Recv(&buff, 128, MPI_INT, i, 0, MPI_COMM_WORLD, &stat);
tab[j] += buff; // I need to receive all the variables buff sent by the salves, sum them and stock into the tab at the index j
}
printf("\n%.20f\n",tab[j]); // I print the result of the sum at index j
}
}
}
MPI_Finalize();
return 0;
}
I compile the program with the terminal command mpicc .c -o my_file.
Then I run mpirun -np 101 my_file_c to start the program with 101 processes.
But the problem is that I get the following error in the terminal:
It seems that [at least] one of the processes that was started with
> mpirun did not invoke MPI_INIT before quitting (it is possible that
> more than one process did not invoke MPI_INIT -- mpirun was only
> notified of the first one, which was on node n0).
>
> mpirun can *only* be used with MPI programs (i.e., programs that
> invoke MPI_INIT and MPI_FINALIZE). You can use the "lamexec" program
> to run non-MPI programs over the lambooted nodes.
It seems that I have a problem with the master but I don't know why...
Any idea ???
Thank you :)
This behavior is very likely the result of a memory corruption.
You cannot do
int buff=mit[i][j+1];
MPI_Send(&buff, 128, MPI_INT, ...);
because buff is a single int: sending 128 ints starting at &buff reads far past it, and the matching MPI_Recv of 128 ints into &buff writes far past it and corrupts the stack. Depending on what you want to achieve, you can try instead
int buff=mit[i][j+1];
MPI_Send(&buff, 1, MPI_INT, ...);
// ...
MPI_Recv(&buff, 1, MPI_INT, ...);
or
int *buff=&mit[i][j+1];
MPI_Send(buff, 128, MPI_INT, ...);
// fix MPI_Recv()
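For reference, here is a minimal, self-contained sketch of that pattern (with made-up values rather than your mit array): every non-root rank sends a single int, and rank 0 receives and sums them.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank != 0) {
        int buff = rank;                 /* one value per worker (placeholder) */
        MPI_Send(&buff, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
    } else {
        int sum = 0, buff;
        for (int i = 1; i < size; i++) { /* collect one int from each worker */
            MPI_Recv(&buff, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            sum += buff;
        }
        printf("sum = %d\n", sum);
    }
    MPI_Finalize();
    return 0;
}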
I'm just starting out with MPI programming and decided to make a simple distributed qsort using OpenMPI. To distribute parts of the array I want to sort I'm trying to use MPI_Scatterv, however the following code segfaults on me:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define ARRAY_SIZE 26
#define BUFFER_SIZE 2048
int main(int argc, char** argv) {
int my_rank, nr_procs;
int* data_in, *data_out;
int* sizes;
int* offsets;
srand(time(0));
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nr_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// everybody generates the control tables
int nr_workers = nr_procs-1;
sizes = malloc(sizeof(int)*nr_workers);
offsets = malloc(sizeof(int)*nr_workers);
int nr_elems = ARRAY_SIZE/nr_workers;
// basic distribution
for (int i = 0; i < nr_workers; ++i) {
sizes[i] = nr_elems;
}
// distribute the remainder
int left = ARRAY_SIZE%nr_workers;
int curr_worker = 0;
while (left) {
++sizes[curr_worker];
curr_worker = (++curr_worker)%nr_workers;
--left;
}
// offsets
int curr_offset = 0;
for (int i = 0; i < nr_workers; ++i) {
offsets[i] = curr_offset;
curr_offset += sizes[i];
}
if (my_rank == 0) {
// root
data_in = malloc(sizeof(int)*ARRAY_SIZE);
data_out = malloc(sizeof(int)*ARRAY_SIZE);
for (int i = 0; i < ARRAY_SIZE; ++i) {
data_in[i] = rand();
}
for (int i = 0; i < nr_workers; ++i) {
printf("%d at %d\n", sizes[i], offsets[i]);
}
MPI_Scatterv (data_in, sizes, offsets, MPI_INT, data_out, ARRAY_SIZE, MPI_INT, 0, MPI_COMM_WORLD);
} else {
// worker
printf("%d has %d elements!\n",my_rank, sizes[my_rank-1]);
// alloc the input buffer
data_in = malloc(sizeof(int)*sizes[my_rank-1]);
MPI_Scatterv(NULL, NULL, NULL, MPI_INT, data_in, sizes[my_rank-1], MPI_INT, 0, MPI_COMM_WORLD);
printf("%d got:\n", my_rank);
for (int i = 0; i < sizes[my_rank-1]; ++i) {
printf("%d ", data_in[i]);
}
printf("\n");
}
MPI_Finalize();
return 0;
}
How would I go about using Scatterv? Am I doing something wrong with allocating my input buffer from inside the worker code?
I changed some parts of your code to get something working.
MPI_Scatterv() sends data to every processor, including the root itself. According to your program, processor 0 expects ARRAY_SIZE integers, but sizes[0] is much smaller.
There are other problems on the other processes: MPI_Scatterv will send sizes[my_rank] integers, but sizes[my_rank-1] integers will be expected...
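In short, the receive count on each rank has to match what the root actually sends to that rank, i.e. sizes[my_rank]:

MPI_Scatterv(data_in, sizes, offsets, MPI_INT,
             data_out, sizes[my_rank], MPI_INT,
             0, MPI_COMM_WORLD);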
Here is a code that scatters data_in from processor 0 to all processors, including 0. Therefore I added 1 to nr_workers:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#define ARRAY_SIZE 26
#define BUFFER_SIZE 2048
int main(int argc, char** argv) {
int my_rank, nr_procs;
int* data_in, *data_out;
int* sizes;
int* offsets;
srand(time(0));
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &nr_procs);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
// everybody generates the control tables
int nr_workers = nr_procs;
sizes = malloc(sizeof(int)*nr_workers);
offsets = malloc(sizeof(int)*nr_workers);
int nr_elems = ARRAY_SIZE/nr_workers;
// basic distribution
for (int i = 0; i < nr_workers; ++i) {
sizes[i] = nr_elems;
}
// distribute the remainder
int left = ARRAY_SIZE%nr_workers;
int curr_worker = 0;
while (left) {
++sizes[curr_worker];
curr_worker = (curr_worker + 1) % nr_workers;
--left;
}
// offsets
int curr_offset = 0;
for (int i = 0; i < nr_workers; ++i) {
offsets[i] = curr_offset;
curr_offset += sizes[i];
}
if (my_rank == 0) {
// root
data_in = malloc(sizeof(int)*ARRAY_SIZE);
for (int i = 0; i < ARRAY_SIZE; ++i) {
data_in[i] = rand();
printf("%d %d \n",i,data_in[i]);
}
for (int i = 0; i < nr_workers; ++i) {
printf("%d at %d\n", sizes[i], offsets[i]);
}
} else {
printf("%d has %d elements!\n",my_rank, sizes[my_rank]);
}
data_out = malloc(sizeof(int)*sizes[my_rank]);
MPI_Scatterv (data_in, sizes, offsets, MPI_INT, data_out, sizes[my_rank], MPI_INT, 0, MPI_COMM_WORLD);
printf("%d got:\n", my_rank);
for (int i = 0; i < sizes[my_rank]; ++i) {
printf("%d ", data_out[i]);
}
printf("\n");
free(data_out);
if(my_rank==0){
free(data_in);
}
MPI_Finalize();
return 0;
}
Regarding memory management, data_in and data_out should be freed at the end of the code.
Is this what you wanted to do? Good luck with qsort! I think you are not the first one to sort integers using MPI; see parallel sort using mpi. Your way of generating the random numbers on process 0 and then scattering them is the right way to go. I think you will be interested in his TD_Trier() function for the communication, even if you change tri_fusion(T, 0, size - 1); for qsort(...).
Bye,
Francis
I'm working on a project about parallel bitonic sorting, using MPI and C to implement it. The program I developed works, but it's not efficient, since a simple quicksort (sigh) beats it in execution time. Maybe the problem is the cost of communication, but I don't see how to improve that, so here's the code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <string.h>
#include "bs-util.h"
#include "quicksort.h"
#define TAG 1
/* Run this program knowing that:
* 1) The number of cores must be a power of 2
* 2) The length of the array to order must be a power of 2
*
* Exec Example: mpirun -n 4 ./bs 1024 1024
* */
void exchange(FILE *log, int i, int partner, int up);
int countTransfer = 0;
int *myArray, *partnerArray;
int currentPartner = -1;
int rank, size;
MPI_Status status;
int verbose = 0; //this var toggles on(1) or off(0) some useful prints for debugging purpose
int amount=0;
int main(int argc, char *argv[])
{
int *array;
int i=0;
int carry=0;
int up=1;
int count=0;
struct timeval tim;
FILE *log;
char logName[15] = "log/";
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
/* Time meter */
srand((double) time(NULL));
gettimeofday(&tim, NULL);
double t1=tim.tv_sec+(tim.tv_usec/1000000.0);
snprintf(logName+4, 10, "%d",rank);
log = fopen(logName,"w");
printf("Hello world from process %d of %d.\n", rank, size);
MPI_Barrier(MPI_COMM_WORLD);
/* INPUT */
if (rank==0)
{
if (argc==2) /* by file */
{
FILE *input = fopen(argv[1],"r");
char line[20];
count = 0;
while(fgets(line,20,input) != NULL)
{
count++;
}
fclose(input);
array = (int *)malloc(count*sizeof(int));
input = fopen(argv[1],"r");
i = 0;
while(fgets(line,20,input) != NULL)
{
array[i] = atoi(line);
i++;
}
fclose(input);
}
else
if (argc==3) /* by command line */
{
count = atoi(argv[1]);
int max = atoi(argv[2]);
array = (int *)malloc(count*sizeof(int));
srand(time(NULL));
for (i=0; i<count; i++)
{
array[i] = rand()%max;
}
}
else
{
printf("\n\n ----------- ERRORE NEI PARAMETRI DI INPUT ----------- \n\n");
return 1;
}
/* END OF THE INPUT */
if (verbose){
printf("Initial array:\n");
for (i=0; i<count; i++)
{
printf("%d\t", array[i]);
}
printf("\n");
}
/* Everyone wait eachother */
MPI_Barrier(MPI_COMM_WORLD);
carry = count%size;
amount = count/size + carry;
printf("\nParametri: amount=%d carry=%d\n\n", amount, carry);
up=1;
int startIndex = amount;
myArray = (int *)malloc(amount*sizeof(int));
/* Buffer (partner) */
partnerArray = (int *)malloc(amount*sizeof(int));
for (i=0; i<amount; i++)
myArray[i] = array[i];
printf("Processo %d riceve amount=%d e up=%d\n", rank, amount, up);
if (verbose){
printf("Mia porzione ---> ");
for (i=0; i<amount; i++)
{
printf("%d\t", myArray[i]);
}
printf("\n");
}
/* Sending the big array's chunks */
for (i=1; i<size; i++)
{
up = (i+1) % 2;
MPI_Send(&up, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(&amount, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(&carry, 1, MPI_INT, i, TAG, MPI_COMM_WORLD);
MPI_Send(array+startIndex, amount-carry, MPI_INT, i, TAG, MPI_COMM_WORLD);
startIndex += amount-carry;
}
MPI_Barrier(MPI_COMM_WORLD);
}
else
{
MPI_Barrier(MPI_COMM_WORLD);
MPI_Recv(&up, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
MPI_Recv(&amount, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
MPI_Recv(&carry, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
myArray = (int *)malloc(amount*sizeof(int));
partnerArray = (int *)malloc(amount*sizeof(int)); /* Buffer (partner) */
MPI_Recv(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD, &status);
/* Experimental padding: every chunck has the same amount of items. */
for (i=amount-carry; i<amount; i++)
{
myArray[i] = 0;
}
printf("\n");
printf("Processo %d riceve amount=%d e up=%d\n", rank, amount-carry, up);
if (verbose){
printf("Mia porzione ---> ");
for (i=0; i<amount; i++)
{
printf("%d\t", myArray[i]);
}
printf("\n");
}
MPI_Barrier(MPI_COMM_WORLD);
}
/* CORE */
/* Local Quicksort */
int result = quickSort(&myArray[0], amount); //this function is written within src/quicksort.c
if (verbose){
if (result == 1)
printf("Quick Sort: FAIL \n");
else
{
printf("\nLa mia porzione ordinata (processo %d)\n", rank);
for(i=0; i<amount; i++)
{
printf("%d ",myArray[i]);
}
printf ("\n");
}
}
int j;
for (up=8;up<=amount*size;up=2*up)
{
for (j=up>>1;j>0;j=j>>1)
{
for (i=0;i<amount*size;i++)
{
int partner=i^j;
if ((partner)>i)
{
exchange(log,i,partner,i&up);
}
}
}
}
/* END OF THE CORE */
if (rank!=0)
{
MPI_Send(myArray, amount, MPI_INT, 0, TAG, MPI_COMM_WORLD);
}
gettimeofday(&tim, NULL);
double t2=tim.tv_sec+(tim.tv_usec/1000000.0);
if (rank==0)
{
myArray = (int *)realloc(myArray,sizeof(int)*amount*size);
for (i=1; i<size; i++)
MPI_Recv(myArray+i*amount, amount, MPI_INT, i, TAG, MPI_COMM_WORLD, &status);
printf("\nTempo trascorso %6f\n", t2-t1);
fprintf(log,"\n\n----------> Array Iniziale <----------\n");
printArray(log,array,count);
fprintf(log,"\n\n----------> Array Finale <----------\n");
printArray(log,myArray+(carry*(size-1)),count);
/*printArray(log,myArray,newAmount*size);*/
}
fprintf(log,"Numero di chunk scambiati: %d\n",countTransfer);
fclose(log);
MPI_Finalize();
return 0;
}
void exchange(FILE *log, int i, int partner, int up)
{
int rank_i = i/amount;
int rank_partner = partner/amount;
int offset_i = i%amount;
int offset_partner = partner%amount;
/*if (verbose)
fprintf(log,"\nnewAmount = %d - Rank_i = %d - Rank_partner = %d - Offset_i = %d - Offset_partner = %d \n",amount,rank_i,rank_partner,offset_i,offset_partner);
*/
if ((rank_i != rank) && (rank_partner != rank))
return;
if ((rank_i == rank) && (rank_partner == rank))
{
if (((up==0) && (myArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < myArray[offset_partner])))
{
int temp = myArray[offset_i];
myArray[offset_i] = myArray[offset_partner];
myArray[offset_partner] = temp;
}
return;
}
if (rank_i == rank && rank_partner != rank)
{
if (currentPartner != rank_partner)
{
MPI_Send(myArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD);
MPI_Recv(partnerArray, amount, MPI_INT, rank_partner, TAG, MPI_COMM_WORLD, &status);
currentPartner = rank_partner;
countTransfer++;
}
if (((up==0) && (myArray[offset_i] > partnerArray[offset_partner])) || ((up!=0) && (myArray[offset_i] < partnerArray[offset_partner])))
myArray[offset_i] = partnerArray[offset_partner];
return;
}
if (rank_i != rank && rank_partner == rank)
{
if (currentPartner != rank_i)
{
MPI_Recv(partnerArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD, &status);
MPI_Send(myArray, amount, MPI_INT, rank_i, TAG, MPI_COMM_WORLD);
currentPartner = rank_i;
countTransfer++;
}
if (((up==0) && (partnerArray[offset_i] > myArray[offset_partner])) || ((up!=0) && (partnerArray[offset_i] < myArray[offset_partner])))
myArray[offset_partner] = partnerArray[offset_i];
return;
}
}
And here's the Makefile:
CC = mpicc
OPTIMIZE =
CFLAGS = $(DEFINES) $(OPTIMIZE)
LFLAGS = -lm
PROGS = ./bs
PROGS_SRC = src/bs-util.c src/bs.c src/quicksort.c
all:
$(CC) $(CFLAGS) $(LFLAGS) -o $(PROGS) $(PROGS_SRC)
Help would be very appreciated :)
References: http://goo.gl/nXt4p
Remember that bitonic sort has a time complexity of roughly (N/P) (log N)^2, compared to quicksort's N log N (in the serial version). This means that with log N > P (P ~ number of processors), even the serial quicksort should beat bitonic sort (and I am not talking about multiplying by implementation-dependent factors, nor about communication). Bitonic sort is for really parallel computers (it's pretty good on GPUs), not a grid of a few PCs as you probably have.
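Spelling out the comparison behind that claim (ignoring constants and communication):

    N log N < (N/P) (log N)^2   <=>   P < log N

so with P = 4 processes the asymptotic break-even point is already at log2 N = 4, i.e. N = 16; for anything larger the serial quicksort is ahead on paper, and the constants and communication costs only push the real break-even point further out.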
Many sends/receives of small data chunks (as in the exchange function) hurt performance badly. It is more efficient to combine the small chunks into one buffer and send it in a single message.
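A rough sketch of that idea (CHUNK, the buffer contents, and dest are hypothetical; TAG is the tag defined in the code above):

enum { CHUNK = 1024 };                  /* hypothetical batch size */
int buf[CHUNK];
for (int i = 0; i < CHUNK; i++)
    buf[i] = i;                         /* stand-in for the values you would otherwise send one by one */
/* one message of CHUNK ints instead of CHUNK messages of one int each */
MPI_Send(buf, CHUNK, MPI_INT, dest, TAG, MPI_COMM_WORLD);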
Um. I don't see you doing any collective communication other than barriers...