Open MPI starts very fast but slows down massively soon after - C

I have a C program that takes a very large file (anywhere from 5 GB to 65 GB), transposes the data in the file, and then writes the transposed data out to other files. In total, the result files are roughly 30 times larger due to the transformation. I am using Open MPI, so each process writes to its own file.
Each process writes the first ~18 GB of data to its own results file very quickly. At that point, however, the program slows to a crawl and the %CPU reported by top drops drastically, from ~100% to 0.3%.
Can anyone suggest a reason for this? Am I reaching some system limit?
Code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

unsigned long long impute_len = 0;
FILE *impute_fp;

void write_results(unsigned long long, unsigned long long, int);

int main(int argc, char **argv){
    // the impute output
    impute_fp = fopen("infile.txt", "r");

    // find input file length
    fseek(impute_fp, 0, SEEK_END);
    impute_len = ftell(impute_fp);

    // mpi magic - hopefully!
    MPI_Status status;
    unsigned long long proc_id, ierr, num_procs, tot_recs, recs_per_proc,
                       root_recs, start_byte, end_byte, start_recv, end_recv;

    // Now replicate this process to create parallel processes.
    ierr = MPI_Init(&argc, &argv);

    // find out process ID, and how many processes were started.
    ierr = MPI_Comm_rank(MPI_COMM_WORLD, &proc_id);
    ierr = MPI_Comm_size(MPI_COMM_WORLD, &num_procs);

    if(proc_id == 0){
        tot_recs = impute_len/54577;        // 54577 is the length of each line
        recs_per_proc = tot_recs/num_procs;
        if(tot_recs % num_procs != 0){
            recs_per_proc = recs_per_proc + 1;
            root_recs = tot_recs - (recs_per_proc*(num_procs-1));
        }else{
            root_recs = recs_per_proc;
        }

        // distribute a portion to each child process
        int z = 0;
        for(int x = 1; x < num_procs; x++){
            start_byte = (root_recs*54577) + (z*(recs_per_proc*54577));
            end_byte   = (root_recs*54577) + ((z+1)*(recs_per_proc*54577));
            ierr = MPI_Send(&start_byte, 1, MPI_UNSIGNED_LONG_LONG, x, 0, MPI_COMM_WORLD);
            ierr = MPI_Send(&end_byte,   1, MPI_UNSIGNED_LONG_LONG, x, 0, MPI_COMM_WORLD);
            z++;
        }

        // root proc's share of the work
        write_results(0, (root_recs*54577), proc_id);
    }else{
        // must be a slave process
        ierr = MPI_Recv(&start_recv, 1, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD, &status);
        ierr = MPI_Recv(&end_recv,   1, MPI_UNSIGNED_LONG_LONG, 0, 0, MPI_COMM_WORLD, &status);

        // write my portion of the file
        write_results(start_recv, end_recv, proc_id);
    }

    ierr = MPI_Finalize();
    fclose(impute_fp);
    return 0;
}

void write_results(unsigned long long start, unsigned long long end, int proc_id){
    // logic to write out transposed data here (elided in the question)
    // ...
    fclose(results_fp);
}

Related

MPI: Distributing segments of large file does not speedup execution

I have a large bioinformatics file (FASTA), and I am using MPI to open the file at specific regions depending on the current process's ID. Then I translate the nucleotide sequences into their corresponding proteins.
#include <mpi.h>

int main(int argc, char* argv[]){
    MPI_File in;
    int id;
    int p;
    long buffersize = 3000000000/p;
    MPI_Offset fileStart = buffersize * id;
    char* nucleotides = (char*)malloc(buffersize);

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    MPI_File_open(MPI_COMM_WORLD, argv[1], MPI_MODE_RDONLY, MPI_INFO_NULL, &in);
    MPI_File_read_at(in, fileStart, nucleotides, buffersize, MPI_CHAR, MPI_STATUS_IGNORE);

    /* Calculations */
    /* Write result */

    MPI_File_close(&in);
    free(nucleotides);
    MPI_Finalize();
    return 0;
}
I expect a speedup correlated with the number of machines running the algorithm. However, I observe that running my application across multiple machines does not change the execution time; it appears to be independent of the number of machines listed in my hostfile.
Any ideas on how to get the expected behavior of more machines decreasing the read time?
To turn the comments into an answer:
Turn this
#include <mpi.h>

int main(int argc, char* argv[]){
    MPI_File in;
    int id;
    int p;

    // you are using p uninitialized here!!!
    long buffersize = 3000000000/p;
    // same applies to id
    MPI_Offset fileStart = buffersize * id;
    char* nucleotides = (char*)malloc(buffersize);

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    MPI_File_open(MPI_COMM_WORLD, argv[1], MPI_MODE_RDONLY, MPI_INFO_NULL, &in);
    MPI_File_read_at(in, fileStart, nucleotides, buffersize, MPI_CHAR, MPI_STATUS_IGNORE);

    /* Calculations */
    /* Write result */

    MPI_File_close(&in);
    free(nucleotides);
    MPI_Finalize();
    return 0;
}
into this:
#include <mpi.h>

int main(int argc, char* argv[]){
    MPI_File in;

    MPI_Init(&argc, &argv);

    int id;
    int p;
    MPI_Comm_rank(MPI_COMM_WORLD, &id);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    long buffersize = 3000000000/p;
    MPI_Offset fileStart = buffersize * id;
    char* nucleotides = (char*)malloc(buffersize);

    MPI_File_open(MPI_COMM_WORLD, argv[1], MPI_MODE_RDONLY, MPI_INFO_NULL, &in);
    MPI_File_read_at(in, fileStart, nucleotides, buffersize, MPI_CHAR, MPI_STATUS_IGNORE);

    /* Calculations */
    /* Write result */

    MPI_File_close(&in);
    free(nucleotides);
    MPI_Finalize();
    return 0;
}
No guarantee that the rest will work, though.
I would highly recommend getting comfortable with C programming before starting to write MPI code, because otherwise you might end up lost. MPI issues are difficult to debug.

MPI_ERR_BUFFER from MPI_Bsend after removing print statements?

I have the following code which works:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
    int world_rank, world_size;
    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int n = 10000;
    int ni, i;
    double t[n];
    int x[n];

    int buf[n];
    int buf_size = n*sizeof(int);
    MPI_Buffer_attach(buf, buf_size);

    if (world_rank == 0) {
        for (ni = 0; ni < n; ++ni) {
            int msg_size = ni;
            int msg[msg_size];
            for (i = 0; i < msg_size; ++i) {
                msg[i] = rand();
            }

            double time0 = MPI_Wtime();
            MPI_Bsend(&msg, msg_size, MPI_INT, 1, 0, MPI_COMM_WORLD);
            t[ni] = MPI_Wtime() - time0;
            x[ni] = msg_size;

            MPI_Barrier(MPI_COMM_WORLD);
            printf("P0 sent msg with size %d\n", msg_size);
        }
    }
    else if (world_rank == 1) {
        for (ni = 0; ni < n; ++ni) {
            int msg_size = ni;
            int msg[msg_size];
            MPI_Request request;

            MPI_Barrier(MPI_COMM_WORLD);

            MPI_Irecv(&msg, msg_size, MPI_INT, 0, 0, MPI_COMM_WORLD, &request);
            MPI_Wait(&request, MPI_STATUS_IGNORE);
            printf("P1 received msg with size %d\n", msg_size);
        }
    }

    MPI_Buffer_detach(&buf, &buf_size);
    MPI_Finalize();
}
As soon as I remove the print statements, the program crashes, telling me there is an MPI_ERR_BUFFER: invalid buffer pointer. If I remove only one of the print statements, the other print statements are still executed, so I believe it crashes at the end of the program. I don't see why it crashes, and the fact that it does not crash when the print statements are in place goes beyond my logic...
Would anybody have a clue what is going on here?
You are simply not providing enough buffer space to MPI. In buffered mode, all ongoing messages are stored in the buffer space, which is used as a ring buffer. In your code, multiple messages may need to be buffered at the same time, regardless of the printf. Note that not even 2*n*sizeof(int) would be enough buffer space: the barriers do not guarantee that the buffer is locally freed even though the corresponding receive has completed. You would have to provide (n*(n-1)/2)*sizeof(int) bytes to be sure, or something in between and hope.
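If you did insist on buffered mode, a conservative attach size could be computed with MPI_Pack_size plus the per-message MPI_BSEND_OVERHEAD, along the lines of the minimal sketch below. The loop bound mirrors the question's n; this only illustrates the sizing argument above, it is not a recommendation.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char **argv) {
    int n = 10000;                   /* same n as in the question */
    size_t needed = 0;

    MPI_Init(&argc, &argv);

    /* Worst case per the reasoning above: messages of size 0..n-1 ints
       may all still sit in the buffer, each with its own overhead. */
    for (int i = 0; i < n; ++i) {
        int pack_size;
        MPI_Pack_size(i, MPI_INT, MPI_COMM_WORLD, &pack_size);
        needed += (size_t)pack_size + MPI_BSEND_OVERHEAD;
    }

    char *bsend_buf = malloc(needed);
    MPI_Buffer_attach(bsend_buf, (int)needed);
    printf("attached %zu bytes of Bsend buffer\n", needed);

    /* ... the Bsend/Irecv loops from the question would go here ... */

    int detach_size;
    MPI_Buffer_detach(&bsend_buf, &detach_size);
    free(bsend_buf);
    MPI_Finalize();
    return 0;
}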
Bottom line: Don't use buffered mode.
Generally, use standard blocking send calls and write the application such that it doesn't deadlock. Tune the MPI implementation so that small messages are sent eagerly regardless of the receiver, to avoid wait times on late receivers.
If you want to overlap communication and computation, use nonblocking messages, providing proper memory for each communication, as in the sketch below.
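Here is a minimal sketch of that nonblocking approach for the question's two-rank pattern. The per-message malloc and the smaller n are assumptions for illustration, not a drop-in replacement for your code.

#include <stdlib.h>
#include <mpi.h>

int main(int argc, char **argv) {
    int world_rank, n = 1000;                  /* smaller n than the question, just for illustration */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_rank == 0) {
        for (int ni = 1; ni < n; ++ni) {
            int *msg = malloc(ni * sizeof(int));   /* dedicated buffer per outstanding message */
            for (int i = 0; i < ni; ++i) msg[i] = rand();

            MPI_Request req;
            MPI_Isend(msg, ni, MPI_INT, 1, 0, MPI_COMM_WORLD, &req);
            /* ... overlap useful computation here ... */
            MPI_Wait(&req, MPI_STATUS_IGNORE);     /* only reuse/free the buffer after completion */
            free(msg);
        }
    } else if (world_rank == 1) {
        for (int ni = 1; ni < n; ++ni) {
            int *msg = malloc(ni * sizeof(int));
            MPI_Recv(msg, ni, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            free(msg);
        }
    }

    MPI_Finalize();
    return 0;
}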

Timing a broadcasted message to all processors using MPI_Isend and MPI_Irecv instead of MPI_Bcast

I have a project where I need to time a bad implementation of MPI_Bcast using MPI_Isend and MPI_Irecv, and compare it against MPI_Bcast. Because the time on these programs is 0.000000 seconds, I need to use a large array (as I have done). What is not yet in my code below is that the for loop and the MPI_Irecv/MPI_Isend calls should themselves be inside a loop, to make the program take a useful amount of time to finish.
Here is my code, and I'll discuss the problem I am having below it:
#include <stdio.h>
#include <string.h>
#include <mpi.h>

int main(int argc, char **argv) {
    int a = 1000000000;
    int i, N;
    int Start_time, End_time, Elapse_Time;
    int proc_rank, partner, world_size;
    MPI_Status stat;
    float mydata[a];
    MPI_Request request;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    Start_time = MPI_Wtime();
    for (i = 0; i < a; i++) {
        mydata[i] = 0.2567*i;
    }

    MPI_Irecv(mydata, a, MPI_BYTE, 0, 1, MPI_COMM_WORLD, &request);
    MPI_Isend(mydata, a, MPI_BYTE, 0, 1, MPI_COMM_WORLD, &request);

    End_time = MPI_Wtime();
    Elapse_Time = End_time - Start_time;
    printf("Time on process %d is %f Seconds.\n", proc_rank, Elapse_Time);

    MPI_Finalize;
    return 0;
}
When I run this using the command mpirun -np 4 ./a.out, I only get the time for one processor, but I'm not really sure why. I guess I'm just not understanding how these functions work, or how I should be using them.
Thank you for the help!
There are a few different issues in your code, all likely to lead it to crash or behave strangely:
As already mentioned by @Olaf, allocating the array mydata on the stack is a very bad idea. For arrays this large, you should definitely go for an allocation on the heap with an explicit call to malloc(). Even so, you are playing with some serious chunks of memory here, so be careful not to exhaust what's available on your machine. Moreover, some MPI libraries have difficulties dealing with messages larger than 2 GB, which is the case for yours. So again, be careful with that.
You use mydata for both sending and receiving. However, once you have posted a non-blocking communication, you cannot reuse the corresponding buffer until the communication has finished. So in your case, you need two arrays: one for sending and one for receiving.
The type of the data you pass to your MPI calls, namely MPI_BYTE, isn't consistent with the actual type of the data you transfer, namely float. You should use MPI_FLOAT instead.
You call MPI_Irecv() and MPI_Isend() without ever calling MPI_Wait() or MPI_Test(). This is wrong, since it means the communications might never actually occur.
MPI_Wtime() returns a double, not an int. This isn't an error per se, but it might lead to unexpected results. Moreover, the format requested in your call to printf() corresponds to floating-point data, not an integer, so you have to make them consistent.
(Minor - typo) You missed the () for MPI_Finalize().
(Minor - I guess) You only communicate with process #0...
So here is a possible version of working code:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <mpi.h>

int main(int argc, char **argv) {
    int a = 1000000000;
    int i, from, to;
    double Start_time, End_time, Elapse_Time;
    int proc_rank, world_size;
    float *mysenddata, *myrecvdata;
    MPI_Request requests[2];

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &proc_rank );
    MPI_Comm_size( MPI_COMM_WORLD, &world_size );

    Start_time = MPI_Wtime();

    mysenddata = (float*) malloc( a * sizeof( float ) );
    myrecvdata = (float*) malloc( a * sizeof( float ) );
    assert( mysenddata != NULL ); /* very crude sanity check */
    assert( myrecvdata != NULL ); /* very crude sanity check */

    for ( i = 0; i < a; i++ ) {
        mysenddata[i] = 0.2567 * i;
    }

    from = ( proc_rank + world_size - 1 ) % world_size;
    to   = ( proc_rank + 1 ) % world_size;

    MPI_Irecv( myrecvdata, a, MPI_FLOAT, from, 1, MPI_COMM_WORLD, &requests[0] );
    MPI_Isend( mysenddata, a, MPI_FLOAT, to, 1, MPI_COMM_WORLD, &requests[1] );
    MPI_Waitall( 2, requests, MPI_STATUSES_IGNORE );

    End_time = MPI_Wtime();
    Elapse_Time = End_time - Start_time;
    printf( "Time on process %d is %f Seconds.\n", proc_rank, Elapse_Time );

    free( mysenddata );
    free( myrecvdata );

    MPI_Finalize();
    return 0;
}
NB: for the sake of having code that works in all circumstances, I implemented a communication ring here, where process 0 sends to process 1 and receives from process size-1... However, in the context of your re-implementation of a broadcast, you can just ignore this (i.e. the from and to parameters).
The only explanation I see is that your other process is crashing before the print. Try commenting out parts of your code and re-running it.
Try it this way and see if you see a difference:
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &proc_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);

/*Start_time = MPI_Wtime();
for (i = 0; i < a; i++) {
    mydata[i] = 0.2567*i;
}

MPI_Irecv(mydata, a, MPI_BYTE, 0, 1, MPI_COMM_WORLD, &request);
MPI_Isend(mydata, a, MPI_BYTE, 0, 1, MPI_COMM_WORLD, &request);

End_time = MPI_Wtime();
Elapse_Time = End_time - Start_time;*/

printf("I'm process %d.\n", proc_rank);

MPI_Finalize;

MPI_Gather of indexed type to raw data

I have encountered a problem using MPI_Gather to gather indexed integers into a vector of integers. When I try to gather the integers without creating a new receive type, I get an MPI_ERR_TRUNCATE error.
*** An error occurred in MPI_Gather
*** on communicator MPI_COMM_WORLD
*** MPI_ERR_TRUNCATE: message truncated
*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort
The minimal example replicating the issue is
#include <stdlib.h>
#include "mpi.h"

int i, comm_rank, comm_size, err;
int *send_data, *recv_data;
int *blocklengths, *displacements;
MPI_Datatype send_type;

int main ( int argc, char *argv[] ){
    MPI_Init ( &argc, &argv );
    MPI_Comm_rank(MPI_COMM_WORLD, &comm_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);

    unsigned int block = 1000;
    unsigned int count = 1000;

    send_data = malloc(sizeof(int)*block*count);
    for (i=0; i<block*count; ++i) send_data[i] = i;

    recv_data = 0;
    if(comm_rank==0) recv_data = malloc(sizeof(int)*block*count*comm_size);

    blocklengths  = malloc(sizeof(int)*count);
    displacements = malloc(sizeof(int)*count);
    for (i=0; i<count; ++i) {
        blocklengths[i] = block;
        displacements[i] = i*block;
    }

    MPI_Type_indexed(count, blocklengths, displacements, MPI_INT, &send_type);
    MPI_Type_commit(&send_type);

    err = MPI_Gather((void*)send_data, 1, send_type, (void*)recv_data, block*count, MPI_INT, 0, MPI_COMM_WORLD);
    if (err) MPI_Abort(MPI_COMM_WORLD, err);

    free(send_data);
    free(recv_data);
    free(blocklengths);
    free(displacements);

    MPI_Finalize ( );
    return 0;
}
I noticed that this error does not occur when the data transfer size is less than 6 KB.
I found a workaround using MPI_Type_contiguous, although it seems to add extra overhead to my code.
MPI_Type_contiguous(block*count, MPI_INT, &recv_type);
MPI_Type_commit(&recv_type);
err = MPI_Gather((void*)send_data, 1, send_type, (void*)recv_data, 1, recv_type, 0, MPI_COMM_WORLD);
I have verified that the error occurs in Open MPI v1.6 and v1.8.
Could anyone explain the source of this issue?

MPI - indefinite send and recv

If I am using blocking send and recv (MPI_Send(), MPI_Recv()), how do I make these two operations repeat indefinitely, as if they were looping over and over again?
Sample code:
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);

if(rank==0){
    rc = MPI_Send(msg, 1, MPI_CHAR, 1, 1, MPI_COMM_WORLD);
    rc = MPI_Recv(msg, 1, MPI_CHAR, 1, 1, MPI_COMM_WORLD, &status);
}else{
    rc = MPI_Recv(msg, 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &status);
    rc = MPI_Send(msg, 1, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
}
I have tried putting a while(1) before if(rank==0) and it did the job, but I see several sends, then several receives, and I want it like: send(0), receive(1), send(1), receive(0).
You can code a ring of send-receive operations easily by using MPI_Sendrecv:
int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
int dest, int sendtag, void *recvbuf, int recvcount,
MPI_Datatype recvtype, int source, int recvtag,
MPI_Comm comm, MPI_Status *status)
As you can see, it's only a condensed version of an MPI_Send and an MPI_Recv, but it comes in handy when every process needs both to send and to receive something.
The following code works for any number of processes (you can adapt it to your needs):
CODE UPDATE #1 (Using MPI_Sendrecv)
#include <stdio.h>
#include <unistd.h>
#include <mpi.h>

int main (int argc, char *argv[])
{
    int size, rank, value, next, prev, sendval, recval;
    double t0, t;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    value = 5;

    if (size > 1)
    {
        next = (rank + 1) % size;
        prev = (size + rank - 1) % size;
        sendval = value + rank;

        for (;;)
        {
            t0 = MPI_Wtime();
            MPI_Sendrecv(&sendval, 1, MPI_INT, next, 1, &recval, 1, MPI_INT, prev, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            t = MPI_Wtime();
            fprintf(stdout, "[%d of %d]: Sended %d to process %d, Received %d from process %d (MPI_SendRecv Time: %f)\n",
                    rank, size-1, sendval, next, recval, prev, (t - t0));
        }
    }

    MPI_Finalize();
    return 0;
}
CODE UPDATE #2 (Using separate MPI_Send/MPI_Recv)
#include <stdio.h>
#include <unistd.h>
#include <mpi.h>

int main (int argc, char *argv[])
{
    int size, rank, value, next, prev, sendval, recval;
    double s0, s, r, r0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    value = 5;

    if (size > 1)
    {
        next = (rank + 1) % size;
        prev = (size + rank - 1) % size;
        sendval = value + rank;

        for (;;)
        {
            s0 = MPI_Wtime();
            MPI_Send(&sendval, 1, MPI_INT, next, 1, MPI_COMM_WORLD);
            s = MPI_Wtime();
            fprintf(stdout, "[%d of %d]: Sended %d to process %d (MPI_Send Time: %f)\n", rank, size-1, sendval, next, s-s0);

            r0 = MPI_Wtime();
            MPI_Recv(&recval, 1, MPI_INT, prev, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            r = MPI_Wtime();
            fprintf(stdout, "[%d of %d]: Received %d from process %d (MPI_Recv Time: %f)\n", rank, size-1, recval, prev, r-r0);
        }
    }

    MPI_Finalize();
    return 0;
}
Running Example
mpicc -o sendrecv sendrecv.c
mpirun -n 2 sendrecv
[0 of 1]: Sended 5 to process 1, Received 6 from process 1 (MPI_SendRecv Time: 0.000121)
[1 of 1]: Sended 6 to process 0, Received 5 from process 0 (MPI_SendRecv Time: 0.000068)
...
It is impossible to give an accurate answer to that without seeing at least the basic layout of your code. Generally, you would place the send and receive operations inside an infinite loop. Or, if you're hard pressed for optimal communication costs (or simply feeling adventurous), you could use persistent send and receive operations, sketched below.
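For completeness, here is a minimal sketch of that persistent-request variant, reusing the ring pattern from the code updates above; the fixed iteration count is an assumption for illustration, not part of the original answer.

#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank, size, iters = 10;     /* fixed iteration count, for illustration */
    int sendval, recval;
    MPI_Request reqs[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int next = (rank + 1) % size;
    int prev = (rank + size - 1) % size;
    sendval = rank;

    /* Set up the communication pattern once... */
    MPI_Send_init(&sendval, 1, MPI_INT, next, 1, MPI_COMM_WORLD, &reqs[0]);
    MPI_Recv_init(&recval, 1, MPI_INT, prev, 1, MPI_COMM_WORLD, &reqs[1]);

    /* ...then restart the same requests every iteration. */
    for (int i = 0; i < iters; ++i) {
        MPI_Startall(2, reqs);
        MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
        printf("[%d] iteration %d: received %d from process %d\n", rank, i, recval, prev);
    }

    MPI_Request_free(&reqs[0]);
    MPI_Request_free(&reqs[1]);
    MPI_Finalize();
    return 0;
}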
