Force MPI_Send to use eager or rendezvous protocol - C

I'm writing a small MPI (Open MPI) program in C for a workshop at college. Our objective is to observe the time difference between the two main MPI protocols, eager and rendezvous, as a function of message size.
We haven't worked with MPI before, and we thought there might be a way to select between the two protocols. Searching Google for information on how to do it, I found (somewhere I don't remember) that there is an eager limit. I read that it is set by the MPI implementation, and also that you can change it somehow.
Any advice on how to choose between the protocols?
Is there any relation between the protocols and MPI_Send/MPI_Isend?
I thought that changing the receive buffer size would make it switch from eager to rendezvous, but that's just a hunch.
Here is my code for now:
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include "mpi.h"

#define KBdata 32000      //openmpi default buffer size
#define ndata KBdata/4    //number of ints that fits in buffer

int main(int argc, char *argv[]) {
    int myid, numprocs;
    int tag, source, destination, count;
    int buffer[ndata];
    MPI_Status status;
    MPI_Request request;
    int iter = 20;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    if (myid == 0 && numprocs == 2) { //to
        int recvID = 1;
        double acum = 0;
        int i;
        double startT;
        for (i = 0; i < iter; ++i)
        {
            double startTime = MPI_Wtime();
            MPI_Send(&buffer, ndata, MPI_INT, recvID, 0, MPI_COMM_WORLD);
            double endTime = MPI_Wtime();
            double elapsed = endTime - startTime;
            acum += elapsed;
            printf("%d, %f, elapsed: %f\n", i, acum, elapsed); fflush(stdout);
            usleep(500000);
        }
        printf("total: %f\nmean: %f\n", acum, acum/iter);
    }
    else if (numprocs == 2) {
        int i;
        for (i = 0; i < iter; ++i)
        {
            printf("Waiting for receive\n"); fflush(stdout);
            MPI_Recv(&buffer, ndata, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
            printf("Received %d\n", i); fflush(stdout);
        }
    }
    else {
        printf("Need only 2 threads\n");
    }
    MPI_Finalize();
    return 0;
}
Thank you in advance.

There is no direct connection between eager/rendezvous and MPI_Send/MPI_Isend. However, if the message is under the eager limit, MPI_Send effectively does not block: it can return as soon as the data has been buffered, without waiting for a matching receive. If you want it to block until the matching receive has started regardless of message size, use MPI_Ssend.
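If you just want to see rendezvous-like behaviour in your timing loop without touching any implementation settings, one option (a minimal sketch reusing the variables from your own program) is to time MPI_Ssend alongside MPI_Send; MPI_Ssend has the same signature as MPI_Send but only completes once the receiver has started receiving, so it behaves like a rendezvous send even for tiny messages:

// Sketch: same loop as in the question, but with a synchronous send.
// MPI_Ssend waits for the matching receive to have started before it completes.
for (i = 0; i < iter; ++i)
{
    double startTime = MPI_Wtime();
    MPI_Ssend(&buffer, ndata, MPI_INT, recvID, 0, MPI_COMM_WORLD);
    double endTime = MPI_Wtime();
    printf("ssend %d took %f s\n", i, endTime - startTime);
    fflush(stdout);
    usleep(500000);
}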
Regarding eager limits:
MVAPICH2:
MV2_IBA_EAGER_THRESHOLD=<nbytes>
Intel MPI (depending on version):
I_MPI_EAGER_THRESHOLD=<nbytes>
I_MPI_SHM_EAGER_THRESHOLD=<nbytes>
Open MPI:
--mca btl_openib_eager_limit <nbytes>
--mca btl_openib_rndv_eager_limit <nbytes>
Cray MPICH:
MPICH_GNI_MAX_EAGER_MSG_SIZE=<nbytes>
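With Open MPI these are MCA parameters, so they are normally passed on the mpirun command line (or via OMPI_MCA_* environment variables). A hedged example, assuming the openib BTL is the transport actually in use:

mpirun --mca btl_openib_eager_limit 4096 -np 2 ./a.out

Messages at or below the limit should then go out eagerly and larger ones should fall back to rendezvous. If both ranks run on the same node, the shared-memory transport is what matters instead, and its limit is a different parameter (btl_sm_eager_limit, or btl_vader_eager_limit in newer Open MPI versions).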

Related

multithreading program to perform word count frequency - Segmentation fault

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <time.h>

pthread_mutex_t lock;

//typedef struct for a word
typedef struct {
    char word[101];
    int frequency;
} Word;

//struct for thread
struct ft {
    char* fileName;
    int start;
    int stop;
};

//compare frequency of 2 words
int compareWords(const void *f1, const void *f2){
    Word *a = (Word *)f1;
    Word *b = (Word *)f2;
    return (b->frequency - a->frequency);
}

//count frequency of a word
void countFrequency(void *arg){
    pthread_mutex_lock(&lock);
    int i, c;
    struct ft* fi = (struct ft*)arg;
    FILE *file = fopen(fi->fileName,"r");
    fseek(file,fi->start,SEEK_SET);
    for(i = 0; i < fi->stop - fi->start; i++){
        c = getc(file);
        //printf("%d\n", c);
        //frequency count
    }
    fclose(file);
    pthread_mutex_unlock(&lock);
}

int main (int argc, char **argv){
    //variables for <time.h>
    struct timespec startTime;
    struct timespec endTime;
    clock_gettime(CLOCK_REALTIME, &startTime);
    /*------------main------------------*/
    //variables
    int nthreads;   //number of threads
    int chunkSize;  //each thread's processing size
    //if user input is not correct, inform
    if(argc < 3){
        printf("./a.out text_file #ofthreads \n");
        exit(-1);
    }
    nthreads = atoi(argv[2]);
    chunkSize = sizeof(argv[1])/nthreads;
    //declare threads and default attributes
    pthread_t threads[nthreads];
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    //run threads in parallel
    int i;
    for (i = 0; i < nthreads; i++){
        struct ft data[nthreads];
        data[i].start = i*chunkSize;
        data[i].stop = data[i].start+chunkSize;
        data[i].fileName = argv[1];
        // Create a new thread for every segment, and count word frequency for each
        pthread_create(&threads[i], &attr, (void*) countFrequency, (void*) &data[i]);
    }
    //wait for results (all threads)
    for (i = 0; i < nthreads; i++){
        pthread_join(threads[i], NULL);
    }
    //func of <time.h>
    clock_gettime(CLOCK_REALTIME, &endTime);
    time_t sec = endTime.tv_sec - startTime.tv_sec;
    long n_sec = endTime.tv_nsec - startTime.tv_nsec;
    if (endTime.tv_nsec < startTime.tv_nsec)
    {
        --sec;
        n_sec = n_sec + 1000000000L;
    }
    printf("Total Time was %ld.%09ld seconds\n", sec, n_sec);
}
I'm working on this program to use multiple threads to read and process a large text file and report the 10 most frequent words in the text that are longer than 6 characters. But I keep getting a segmentation fault and I'm not sure why. Does anybody have any idea?
This code:
for (i = 0; i < nthreads; i++){
    struct ft data[nthreads];
declares data that is live (legal to use) only for the duration of this for loop. This code:
    pthread_create(&threads[i], &attr, (void*) countFrequency, (void*) &data[i]);
}
passes the address of data into the threads and then exits the loop. Once the loop is done, data is no longer live, and any access to it leads to undefined behavior.
The compiler is free to reuse the memory where data used to be for anything else.
The immediate cause of the crash is likely this: if one of the threads doesn't reach its fopen call before data is overwritten, it reads a garbage fileName, fopen may then fail and return NULL, and you never check fopen's result before using the file.
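A minimal sketch of one way to fix the lifetime problem, keeping everything else as in the posted code: declare the data array before the loop, so it stays live until after pthread_join.

struct ft data[nthreads];     /* now lives until main returns, well past pthread_join */
for (i = 0; i < nthreads; i++){
    data[i].start = i*chunkSize;
    data[i].stop = data[i].start+chunkSize;
    data[i].fileName = argv[1];
    pthread_create(&threads[i], &attr, (void*) countFrequency, (void*) &data[i]);
}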
P.S.
As Eraklon noted, this code: chunkSize = sizeof(argv[1])/nthreads; divides sizeof(char*) (either 4 or 8, depending on whether you build for 32-bit or 64-bit) by the number of threads. That is unlikely to be what you want, and it will yield chunkSize == 0 for nthreads > 4 on 32-bit and nthreads > 8 on 64-bit machines.
P.P.S.
There is a concurrency bug in your program as well: since each countFrequency invocation holds the same lock for its entire duration, the threads all run in sequence (one after another), never in parallel. Thus your program will be slower than if you just did all the work in your main thread.
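A minimal sketch of the usual pattern, assuming a hypothetical shared table of Word entries (the posted code does not build one yet): do the file reading without the lock and hold it only around updates to shared data.

void countFrequency(void *arg){
    struct ft *fi = (struct ft *)arg;
    FILE *file = fopen(fi->fileName, "r");
    if (file == NULL)                 /* always check fopen */
        return;
    fseek(file, fi->start, SEEK_SET);
    for (int i = 0; i < fi->stop - fi->start; i++){
        int c = getc(file);
        /* ... build up a word from c, thread-locally ... */
        pthread_mutex_lock(&lock);    /* lock only the shared update */
        /* table[k].frequency++;         hypothetical shared Word table */
        pthread_mutex_unlock(&lock);
    }
    fclose(file);
}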

MPI_ERR_BUFFER from MPI_Bsend after removing a print statement?

I have the following code which works:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>   /* for rand */

int main(int argc, char** argv) {
    int world_rank, world_size;
    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int n = 10000;
    int ni, i;
    double t[n];
    int x[n];
    int buf[n];
    int buf_size = n*sizeof(int);
    MPI_Buffer_attach(buf, buf_size);

    if (world_rank == 0) {
        for (ni = 0; ni < n; ++ni) {
            int msg_size = ni;
            int msg[msg_size];
            for (i = 0; i < msg_size; ++i) {
                msg[i] = rand();
            }
            double time0 = MPI_Wtime();
            MPI_Bsend(&msg, msg_size, MPI_INT, 1, 0, MPI_COMM_WORLD);
            t[ni] = MPI_Wtime() - time0;
            x[ni] = msg_size;
            MPI_Barrier(MPI_COMM_WORLD);
            printf("P0 sent msg with size %d\n", msg_size);
        }
    }
    else if (world_rank == 1) {
        for (ni = 0; ni < n; ++ni) {
            int msg_size = ni;
            int msg[msg_size];
            MPI_Request request;
            MPI_Barrier(MPI_COMM_WORLD);
            MPI_Irecv(&msg, msg_size, MPI_INT, 0, 0, MPI_COMM_WORLD, &request);
            MPI_Wait(&request, MPI_STATUS_IGNORE);
            printf("P1 received msg with size %d\n", msg_size);
        }
    }
    MPI_Buffer_detach(&buf, &buf_size);
    MPI_Finalize();
}
As soon as I remove the print statements, the program crashes with an MPI_ERR_BUFFER: invalid buffer pointer error. If I remove only one of the print statements, the other print statements are still executed, so I believe it crashes at the end of the program. I don't see why it crashes, and the fact that it does not crash when the print statements are present is beyond me...
Would anybody have a clue what is going on here?
You are simply not providing enough buffer space to MPI. In buffered mode, every message that is still in flight is stored in the attached buffer, which is used as a ring buffer. In your code there can be multiple messages that need to be buffered at the same time, regardless of the printf. Note that not even 2*n*sizeof(int) would be enough buffer space: the barriers do not guarantee that the buffer is locally freed even though the corresponding receive has completed. To be sure, you would have to provide (n*(n-1)/2)*sizeof(int) bytes (plus MPI_BSEND_OVERHEAD per message), or something in between and hope.
Bottom line: Don't use buffered mode.
Generally, use standard blocking send calls and write the application such that it doesn't deadlock. Tune the MPI implementation so that small messages are sent eagerly regardless of the receiver, to avoid wait times on late receivers.
If you want to overlap communication and computation, use nonblocking messages - providing proper memory for each communication.
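For the first recommendation, a minimal sketch of how the sender loop from the question would look with a standard-mode send (everything else kept as posted; the buffer attach/detach calls are then no longer needed):

for (ni = 0; ni < n; ++ni) {
    int msg_size = ni;
    int msg[msg_size];
    for (i = 0; i < msg_size; ++i) {
        msg[i] = rand();
    }
    double time0 = MPI_Wtime();
    MPI_Send(msg, msg_size, MPI_INT, 1, 0, MPI_COMM_WORLD);  /* standard blocking send */
    t[ni] = MPI_Wtime() - time0;
    x[ni] = msg_size;
    MPI_Barrier(MPI_COMM_WORLD);
}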

MPI_Send and MPI_Recv, measuring transfer time of a 1Mb message

I am attempting to send a message of size 1 MB using MPI_Send and MPI_Recv and to measure how long it takes to send that message. Here is my C code.
#include <stdio.h>
#include <mpi.h>
#include <assert.h>
#include <sys/time.h>

int main(int argc, char *argv[])
{
    int rank, p;
    struct timeval t1, t2;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    printf("my rank=%d\n", rank);
    printf("Rank=%d: number of processes =%d\n", rank, p);
    assert(p >= 2);
    if (rank == 0) {
        int x[255] = { 0 };
        int dest = 7;
        int i = 0;
        while (i < 254)
        {
            x[i] = 255;
            i++;
        }
        gettimeofday(&t1, NULL);
        MPI_Send(&x[0], 255, MPI_INT, dest, 1, MPI_COMM_WORLD);
        gettimeofday(&t2, NULL);
        int tSend = (t2.tv_sec-t1.tv_sec)*1000 + (t2.tv_usec-t1.tv_usec)/1000;
        printf("Rank=%d: sent message %d to rank %d; Send time %d millisec\n", rank, *x, dest, tSend);
    } else if (rank == 7) {
        int y[255] = {0};
        MPI_Status status;
        gettimeofday(&t1, NULL);
        MPI_Recv(&y[0], 255, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
        gettimeofday(&t2, NULL);
        int tRecv = (t2.tv_sec-t1.tv_sec)*1000 + (t2.tv_usec-t1.tv_usec)/1000;
        printf("Rank=%d: received message %d from rank %d; Recv time %d millisec\n", rank, *y, status.MPI_SOURCE, tRecv);
    }
    MPI_Finalize();
}
This code compiles and runs just fine, but it always says that it completes the send and receive in 0 milliseconds, which can't be right. I'm guessing that my syntax for sending the array is wrong, so I'm only sending 4 bytes or something, but I can't figure it out.
Any help would be appreciated!
Maybe a better way to measure the time is to measure it in microseconds
(t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec
and see if you get any values.
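Applied to the sender side of the posted code, that would look roughly like this (tSendUs is just a new variable name for the microsecond result):

gettimeofday(&t1, NULL);
MPI_Send(&x[0], 255, MPI_INT, dest, 1, MPI_COMM_WORLD);
gettimeofday(&t2, NULL);
long tSendUs = (t2.tv_sec - t1.tv_sec) * 1000000L + (t2.tv_usec - t1.tv_usec);
printf("Rank=%d: sent message to rank %d; send time %ld microsec\n", rank, dest, tSendUs);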

Squaring numbers with multiple threads

I am trying to square the numbers 1 - 10,000 with 8 threads. To clarify, I want the 1st thread to do 1^2, the 2nd thread to do 2^2, ..., the 8th thread to do 8^2, then the first thread to do 9^2, etc. The problem I am having is that instead of this happening, each thread computes the squares of all the numbers from 1 to 10,000.
My code is below. I have marked sections that I'd rather those answering do not modify. Thanks in advance for any tips!
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <pthread.h>

#define NUMBER_OF_THREADS 8
#define START_NUMBER 1
#define END_NUMBER 10000

FILE *f;

void *sqrtfunc(void *tid) { //function for computing squares
    int i;
    for (i = START_NUMBER; i <= END_NUMBER; i++){
        fprintf(f, "%lu squared = %lu\n", i, i*i);
    }
}

int main(){
    //Do not modify starting here
    struct timeval start_time, end_time;
    gettimeofday(&start_time, 0);
    long unsigned i;
    f = fopen("./squared_numbers.txt", "w");
    //Do not modify ending here

    pthread_t mythreads[NUMBER_OF_THREADS]; //thread variable
    long mystatus;
    for (i = 0; i < NUMBER_OF_THREADS; i++){ //loop to create 8 threads
        mystatus = pthread_create(&mythreads[i], NULL, sqrtfunc, (void *)i);
        if (mystatus != 0){ //check if pthread_create worked
            printf("pthread_create failed\n");
            exit(-1);
        }
    }
    for (i = 0; i < NUMBER_OF_THREADS; i++){
        if (pthread_join(mythreads[i], NULL)){
            printf("Thread failed\n");
        }
    }
    exit(1);

    //Do not modify starting here
    fclose(f);
    gettimeofday(&end_time, 0);
    float elapsed = (end_time.tv_sec-start_time.tv_sec) * 1000.0f + \
                    (end_time.tv_usec-start_time.tv_usec) / 1000.0f;
    printf("took %0.2f milliseconds\n", elapsed);
    //Do not modify ending here
}
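For reference, the interleaving described in the question is usually achieved by having each thread start at its own index and stride by the number of threads. A minimal sketch of what sqrtfunc could look like under that scheme (the cast of tid back to an integer mirrors the (void *)i cast already used in main; this is one possible approach, not the only one):

void *sqrtfunc(void *tid) {
    long id = (long) tid;                     /* thread index 0..NUMBER_OF_THREADS-1 */
    long i;
    /* thread 0 handles 1, 9, 17, ...; thread 1 handles 2, 10, 18, ...; and so on */
    for (i = START_NUMBER + id; i <= END_NUMBER; i += NUMBER_OF_THREADS){
        fprintf(f, "%ld squared = %ld\n", i, i*i);
    }
    return NULL;
}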

Poor performance when multiple processes write one msg on Linux

I wrote a test program as follows:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/msg.h>
#include <sys/time.h>   /* for gettimeofday */
#include <unistd.h>     /* for usleep */
#include <time.h>

#define PACKET_SIZE 500
#define LOOP_COUNT 30000

int g_sndsucc = 0;
int g_sndfail = 0;
const int C_IPC_KEY = 0x00231a95;
const int COUNT_SIZE = 10000;
unsigned long g_count = 0;
unsigned long g_t1 = 0;
struct timeval s1, s2, s3, s4;

int main(int argc, char* argv[])
{
    int ipckey = C_IPC_KEY;
    if (argc > 1)
    {
        ipckey = atoi(argv[1]);
        printf("ipckey is %d\n", ipckey);
    }
    int qid = msgget(ipckey, IPC_CREAT | 0666);
    if (qid <= 0)
    {
        printf("msgget err: %d \n", errno);
        return 0;
    }
    char data[PACKET_SIZE];
    memset(data, 'a', PACKET_SIZE-1);
    data[PACKET_SIZE-1] = '\0';
    *((long *)data) = 0;
    int ret = 0;
    struct timeval start;
    gettimeofday(&start, NULL);
    while (1)
    {
        *((long *)data) += 1;
        gettimeofday(&s1, NULL);
        ret = msgsnd(qid, data, PACKET_SIZE, 0);
        gettimeofday(&s2, NULL);
        if (ret != 0)
        {
            g_sndfail++;
        }
        else
        {
            g_sndsucc++;
        }
        g_count++;
        g_t1 += (s2.tv_sec-s1.tv_sec)*1000000 + (s2.tv_usec-s1.tv_usec);
        if (g_count >= 10000)
        {
            printf("STAT1: t1 : %f\n", 10000000000.0 / g_t1);
            g_count = 0;
            g_t1 = 0;
        }
        usleep(1000);
    }
    return 0;
}
I start 100 identical processes that call msgsnd, and on SUSE each process's msgsnd tps only reaches 50/s.
But on AIX 5 the msgsnd tps can reach 10000/s.
Does anyone know why the performance of System V IPC on Linux is so poor with many processes?
And how can the performance on Linux be increased?
BTW, the kernel version of the SUSE system is Linux 3.0.13.
I checked the source code of the System V message queue implementation in Linux 3.8.
When a process does not get the message-queue lock, it does not release the CPU and sleep for a while.
Instead it calls ipc_lock_by_ptr(&msq->q_perm) again and again.
So the CPU usage becomes very high, and the collision rate grows rapidly as the number of processes increases.
