Parallel MPI version - C

I have to write a parallel MPI version of my dot product (code below):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define SIZE 10000000
volatile float a[SIZE];
volatile float b[SIZE];
int main(int argc, char **argv)
{
long int i;
double sum;
struct timeval time1, time2;
srand(time(0));
for (i = 0; i < SIZE; i++)
{
a[i] = rand();
b[i] = rand();
}
gettimeofday(&time1, 0);
sum = 0.0;
for (i = 0; i < SIZE; i++)
{
sum = sum + a[i]*b[i];
}
gettimeofday(&time2, 0);
MPI_Scatter(a, 1, MPI_INT, &b, 1, MPI_INT, 0, MPI_COMM_WORLD);
printf("Elapsed time (us) = %d\n", (time2.tv_sec-time1.tv_sec)*1000000 + time2.tv_usec - time1.tv_usec);
return 0;
}
My question is: what code do I need to add to the program, and which MPI primitives are useful?
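A minimal sketch of one possible structure (my example under stated assumptions, not a definitive solution): rank 0 fills the arrays, MPI_Scatter hands every rank a contiguous block of a and b (using MPI_FLOAT, since the arrays are float, rather than MPI_INT), each rank computes a partial dot product, and MPI_Reduce with MPI_SUM combines the partial sums on rank 0. MPI_Wtime replaces gettimeofday. It assumes SIZE is divisible by the number of processes.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define SIZE 10000000
float a[SIZE];
float b[SIZE];
int main(int argc, char **argv)
{
    int rank, nprocs, chunk;
    long int i;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    chunk = SIZE / nprocs; /* assumes SIZE % nprocs == 0 */
    float *la = malloc(chunk * sizeof(float));
    float *lb = malloc(chunk * sizeof(float));
    if (rank == 0) { /* only the root initializes the full arrays */
        srand(time(0));
        for (i = 0; i < SIZE; i++) {
            a[i] = rand();
            b[i] = rand();
        }
    }
    double t0 = MPI_Wtime();
    /* distribute one block of each array to every rank */
    MPI_Scatter(a, chunk, MPI_FLOAT, la, chunk, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Scatter(b, chunk, MPI_FLOAT, lb, chunk, MPI_FLOAT, 0, MPI_COMM_WORLD);
    double local = 0.0; /* this rank's partial dot product */
    for (i = 0; i < chunk; i++)
        local += la[i] * lb[i];
    double sum = 0.0;
    /* combine the partial sums on rank 0 */
    MPI_Reduce(&local, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    double t1 = MPI_Wtime();
    if (rank == 0)
        printf("sum = %f, elapsed (s) = %f\n", sum, t1 - t0);
    free(la);
    free(lb);
    MPI_Finalize();
    return 0;
}
Run with, e.g., mpicc dot.c && mpirun -np 4 ./a.out (the file name is a placeholder).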

Related

Why is my multi-threading program in VirtualBox not faster than my single-thread program?

Hi everybody!
I have two programs estimating pi using the Monte Carlo technique: one single-threaded and one multi-threaded.
The single-threaded one:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define BILLION 1000000000.0
int main(int argc, char *argv[])
{
struct timespec start, end;
///////////////////////
clock_gettime(CLOCK_REALTIME, &start);
///////////////////////
if(argc != 2)
{
fprintf(stderr, "usage: a.out <integer value>\n");
return -1;
}
if(atoi(argv[1]) < 0)
{
fprintf(stderr, "%d must be >= 0\n", atoi(argv[1]));
return -1;
}
time_t t;
srand((unsigned) time(&t));
int total = atoi(argv[1]);
int inside = 0;
unsigned int seed = rand()%30000;
for(int i = 0; i < total; ++i)
{
double rand_x = (double)rand_r(&seed)/(double)RAND_MAX;
double rand_y = (double)rand_r(&seed)/(double)RAND_MAX;
double dist = rand_x*rand_x + rand_y*rand_y;
if(dist < 1.0) ++inside;
}
double pi = (double)(4 * inside)/total;
clock_gettime(CLOCK_REALTIME, &end);
double time_spent = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / BILLION;
printf("pi = %lf\n", pi);
printf("time = %f\n", time_spent);
return 0;
}
The multi-thread one:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#include <math.h>
#include <stdint.h> /* for intptr_t, used to pass the per-thread count through void* */
#define N 5
#define BILLION 1000000000.0
int inside = 0;
pthread_mutex_t mutex;
void* countInside(void * n)
{
int total = (int)(intptr_t)n; /* per-thread iteration count, passed through the void* argument */
int hit_count = 0;
unsigned int seed = rand()%30000;
for(int i = 0; i < total; ++i)
{
double rand_x = (double)rand_r(&seed)/(double)RAND_MAX;
double rand_y = (double)rand_r(&seed)/(double)RAND_MAX;
double dist = rand_x*rand_x + rand_y*rand_y;
if(dist < 1.0) ++hit_count;
}
pthread_mutex_lock(&mutex);
inside += hit_count;
pthread_mutex_unlock(&mutex);
pthread_exit(0);
}
int main(int argc, char *argv[])
{
struct timespec start, end;
///////////////////////
clock_gettime(CLOCK_REALTIME, &start);
///////////////////////
if(argc != 2)
{
fprintf(stderr, "usage: a.out <integer value>\n");
return -1;
}
if(atoi(argv[1]) < 0)
{
fprintf(stderr, "%d must be >= 0\n", atoi(argv[1]));
return -1;
}
int total = atoi(argv[1]);
srand((unsigned) time(NULL));
//int N;
//printf("Input the number of thread you desire : ");
//scanf("%d", &N);
int n = total/N;
//pthread_t* tid = malloc(sizeof(pthread_t) * (N));
pthread_t tid[N];
pthread_mutex_init(&mutex, NULL);
for(int i = 0; i < N; ++i)
{
pthread_create(&tid[i], 0, countInside, (void*)(intptr_t)n);
}
for(int i = 0; i < N; ++i)
{
pthread_join(tid[i], NULL);
}
double pi = 4.0 * inside / total;
clock_gettime(CLOCK_REALTIME, &end);
double time_spent = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / BILLION;
printf("pi = %lf\n", pi);
printf("time = %lf\n", time_spent);
return 0;
}
When I execute both programs with 100000000 points, I get the output:
Output of the single-threaded one:
quan@quan-VirtualBox:~/Documents/lab5$ ./pi_serial 100000000
pi = 3.141583
time = 1.576207
Output of the multi-threaded one:
quan@quan-VirtualBox:~/Documents/lab5$ ./pi_multi-thread 100000000
pi = 3.141532
time = 1.446410
Note: sometimes the multi-threaded one is even slower than the single-threaded one.
What's the problem? I thought the multi-threaded version should show some speedup over the single-threaded one. Is my multi-threaded code wrong? Please give me some advice. Thank you!
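One cause worth ruling out first (a hypothesis on my part, since the post doesn't say how many vCPUs the VM was given): a VirtualBox guest only sees the processors assigned to it in the VM settings, and with a single vCPU the five threads are time-sliced on one core, so no speedup is possible. A quick check from inside the guest, assuming Linux:
#include <stdio.h>
#include <unistd.h>
int main(void)
{
    /* number of processors the guest OS actually has online */
    long n = sysconf(_SC_NPROCESSORS_ONLN);
    printf("online processors: %ld\n", n);
    return 0;
}
If this prints 1, the pthread version cannot be faster, no matter how the code is written.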

C - how to know how much memory was used in the execution

For example, in the following code, how can I find out the memory used, leaving aside the struct timeval and int microseg parts?
It is Problem 1 of Project Euler.
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/resource.h>
int main(){
struct timeval t, t2;
struct rusage uso;
int microseg;
gettimeofday(&t, NULL);
int sum = 0;
for (int k = 2; k < 1000; k++){
if(k%3 == 0 || k%5 == 0)
sum +=k;
}
printf("%d \n",sum);
gettimeofday(&t2, NULL);
microseg = (t2.tv_usec - t.tv_usec) + (t2.tv_sec - t.tv_sec) * 1000000;
printf("Elapsed time (us): %d\n", microseg);
getrusage(RUSAGE_SELF, &uso);
printf("Memory: %ld KB\n", (long)uso.ru_maxrss);
return 0;
}
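The getrusage call above already answers the question: ru_maxrss is the peak resident set size. As an alternative sketch (Linux-specific, my addition rather than part of the original post), the peak memory figures can also be read from /proc/self/status:
#include <stdio.h>
#include <string.h>
int main(void)
{
    char line[256];
    FILE *st = fopen("/proc/self/status", "r"); /* Linux-only pseudo-file */
    if (st == NULL)
        return 1;
    while (fgets(line, sizeof line, st)) {
        /* VmPeak = peak virtual size, VmHWM = peak resident set ("high water mark") */
        if (strncmp(line, "VmPeak", 6) == 0 || strncmp(line, "VmHWM", 5) == 0)
            fputs(line, stdout);
    }
    fclose(st);
    return 0;
}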

A sample OpenMP program with speedup

Could someone provide an OpenMP program where the speedup is clearly visible compared to running it without OpenMP? I'm finding it extremely difficult to achieve speedup; even this simple program runs slower with OpenMP. My processor is an Intel® Core™ i3-2370M CPU @ 2.40GHz × 4, running Linux (Ubuntu 14.10).
#include <cmath>
#include <stdio.h>
#include <time.h>
int main() {
clock_t t;
t = clock();
const int size = 4;
long long int k;
#pragma omp parallel for num_threads(4)
for(int n=0; n<size; ++n) {
for(int j=0;j<100000000;j++){
}
printf("\n");
}
t = clock() - t;
printf ("It took me %d clicks (%f seconds).\n",t,((float)t)/CLOCKS_PER_SEC);
return 0;
}
I had a problem related to this, where I wanted to find the max value of an array. I made the same mistake as you: I used clock() for measuring the elapsed time. But clock() measures CPU time summed across all threads rather than wall-clock time, so a parallel region can never appear faster with it. To fix this, I used clock_gettime() instead, and now it works.
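As a side note (my suggestion, not something the original answer used), OpenMP also ships its own wall-clock timer, omp_get_wtime(), which avoids the clock() pitfall entirely:
#include <omp.h>
#include <stdio.h>
int main(void)
{
    double t0 = omp_get_wtime(); /* wall-clock time in seconds */
    double s = 0.0;
    #pragma omp parallel for reduction(+:s)
    for (int i = 0; i < 100000000; i++)
        s += i * 0.5;
    double t1 = omp_get_wtime();
    printf("s = %f, elapsed = %f s\n", s, t1 - t0);
    return 0;
}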
As for example code where the speedup is measurable (note: you might want to change the value of N):
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
/* elapsed time between start and end; note only tv_nsec is filled in (total nanoseconds) */
struct timespec diff(struct timespec start, struct timespec end)
{
struct timespec temp;
if(end.tv_sec - start.tv_sec == 0)
{
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
else
{
temp.tv_nsec = ((end.tv_sec - start.tv_sec)*1000000000) + end.tv_nsec - start.tv_nsec;
}
return temp;
}
int main()
{
unsigned int N;
struct timespec t_start, t_end;
clock_t start, end;
srand(time(NULL));
FILE *f = fopen("out.txt", "w");
if(f == NULL)
{
printf("Could not open output\n");
return -1;
}
for(N = 1000000; N < 100000000; N += 1000000)
{
fprintf(f, "%d\t", N);
int* array = (int*)malloc(sizeof(int)*N);
if(array == NULL)
{
printf("Not enough space\n");
return -1;
}
for(unsigned int i = 0; i<N; i++) array[i] = rand();
int max_val = 0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
#pragma omp parallel for reduction(max:max_val)
for(unsigned int i=0; i<N; i++)
{
if(array[i] > max_val) max_val = array[i];
}
clock_gettime(CLOCK_MONOTONIC, &t_end);
fprintf(f, "%lf\t", (double)(diff(t_start, t_end).tv_nsec / 1000000000.0));
max_val = 0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
for(unsigned int i = 0; i<N; i++)
{
if(array[i] > max_val) max_val = array[i];
}
clock_gettime(CLOCK_MONOTONIC, &t_end);
fprintf(f, "%lf\n", (double)(diff(t_start, t_end).tv_nsec / 1000000000.0));
free(array);
}
fclose(f);
return 0;
}
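For reference (my note, not part of the original answer): the pragma only takes effect when the program is compiled with OpenMP enabled, e.g.
gcc -O2 -fopenmp max.c -o max
where the file name is a placeholder; the second, serial loop in the same program then serves as the baseline.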
Calculating an integral is a classic example. Adjust the parts constant to increase the execution time and see the difference in runtime more clearly: more parts, more execution time. It takes 21.3 seconds with OpenMP enabled and 26.7 seconds without, on a SINGLE core, DUAL thread Intel Pentium 4:
#include <math.h>
#include <stdio.h>
#include <omp.h>
#define from 0.0f
#define to 2.0f
#define parts 999999999
#define step ((to - from) / parts)
#define x (from + (step / 2.0f))
int main()
{
double integralSum = 0;
int i;
#pragma omp parallel for reduction(+:integralSum)
for (i = 1; i < (parts+1); ++i)
{
integralSum = integralSum + (step * fabs(pow((x + (step * i)),2) + 4));
}
printf("%f\n", integralSum);
return 0;
}
It calculates the definite integral of x^2 + 4 from 0 to 2.
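As a quick sanity check of the printed result (my addition), the exact value is:
\int_0^2 (x^2 + 4)\,dx = \left[\frac{x^3}{3} + 4x\right]_0^2 = \frac{8}{3} + 8 = \frac{32}{3} \approx 10.6667
so the midpoint sum should print approximately 10.666667.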

Parallel and sequential dot product programs give different results

I've written these two versions of code for computing a dot product of two arrays, each of length 256. Here is the very simple sequential code:
#include <stdlib.h>
#include <stdio.h>
int main(int argc, char* argv[]){
double sum;
double a[256], b[256];
int n = 256, i;
for (i=0; i<n; i++){
a[i] = i * 0.5;
b[i] = i * 2.0;
}
sum = 0;
for (i=1; i<=n; i++){
sum = sum + a[i]*b[i];
}
printf ("sum = %f\n", sum);
}//main
The answer is 5559680.
But the parallel code:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <stdint.h> /* for intptr_t, used to pass the thread id through void* */
#define NUMTHRDS 4
double sum;
double a[256], b[256];
int status;
int n=256;
pthread_t thds[NUMTHRDS];
pthread_mutex_t mutexsum;
void* dotprod(void *arg){
int myid, i, my_first, my_last;
double sum_local;
myid = (int)(intptr_t)arg; /* thread id passed through the void* argument */
my_first = myid * n/NUMTHRDS;
my_last = (myid + 1) * n/NUMTHRDS;
sum_local = 0;
for (i=my_first; i<=my_last; i++){
sum_local = sum_local + a[i]*b[i];
}
pthread_mutex_lock(&mutexsum);
sum = sum + sum_local;
pthread_mutex_unlock(&mutexsum);
pthread_exit((void*)0);
}//dotprod
int main(int argc, char* argv[]){
int i;
pthread_attr_t attr;
for (i=0; i<n; i++){
a[i] = i * 0.5;
b[i] = i * 2.0;
}
pthread_mutex_init(&mutexsum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
for (i=0; i<NUMTHRDS; i++){
pthread_create(&thds[i], &attr, dotprod, (void*)(intptr_t)i);
}
pthread_attr_destroy(&attr);
for(i=0; i<NUMTHRDS; i++){
pthread_join(thds[i], NULL); /* (void**)&status wrote 8 bytes into a 4-byte int on 64-bit systems */
}
printf("sum = %f \n", sum);
pthread_mutex_destroy(&mutexsum);
pthread_exit(NULL);
return 0;
}//main
The answer is 5617024.
I'm totally confused; what causes this difference?
Off-by-one errors. In the sequential program, change
for (i=1; i<=n; i++){
to
for (i=0; i<n; i++){
and in the parallel program, change
for (i=my_first; i<=my_last; i++){
to
for (i=my_first; i<my_last; i++){
In the first program, you are adding in a[256] and b[256], which is off the end of the array; most likely those values were 0, so you got the right answer. In the second program, you are counting some elements twice (at indices 64, 128, and 192, where the threads' ranges overlap) and still reading index 256.
Always check the boundary conditions of your loops, especially with array accesses.

MPI partition array into blocks and Send

I am trying to find the maximum element of an array using MPI in C. I have to compare the time it takes to distribute the data and calculate the maximum using MPI_Scatter versus MPI_Send. Here is the code for the MPI_Scatter version; it works great:
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define lim 20
//returns "a-b" in seconds
double timeval_diff(struct timeval *a, struct timeval *b)
{
return
(double)(a->tv_sec + (double)a->tv_usec/1000000) -
(double)(b->tv_sec + (double)b->tv_usec/1000000);
}
//Array to be divided among the processes
int buf[lim]=
{27,24,3,8,45,10,50,15,10,11,9,48,69,25,19,29,61,72,93,20};
int buf2[lim];
int buf3[lim];
int max;
int main(int argc, char *argv[])
{
struct timeval t_ini, t_fin;
double secs;
int n, myid, numprocs, i,j;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stderr,"Process %d in %s\n",myid, processor_name);
/*Check Border Conditions */
n=lim/numprocs;
gettimeofday(&t_ini, NULL); //take the time before sending the buffer with Scatter
MPI_Scatter(buf,n, MPI_INT,buf2,n,MPI_INT, 0, MPI_COMM_WORLD);
gettimeofday(&t_fin, NULL);//take the time to complete the send routine
secs = timeval_diff(&t_fin, &t_ini);
MPI_Reduce(buf2,buf3,n, MPI_INT, MPI_MAX, 0,MPI_COMM_WORLD);
if (myid == 0)
{ max = buf3[0];
for (i=1; i<n ; i++)
if (max < buf3[i]) max = buf3[i];
for (i=0; i<n ; i++)
printf("Buf3[%d]= %d \n", i, buf3[i]);
printf("Max number of the array is: %d \n", max);
}
for (i=0; i<n ; i++){
printf("%d,Buf2[%d]= %d \n",myid, i,buf2[i]);}
printf("%.16g milliseconds\n", secs * 1000.0);
MPI_Finalize();
return 0;
}
The problem comes when I try to do the same procedure with the MPI_Send function: the maximum of the array elements is calculated incorrectly. What am I doing wrong?
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define lim 20
//returns "a-b" in seconds
double timeval_diff(struct timeval *a, struct timeval *b)
{
return
(double)(a->tv_sec + (double)a->tv_usec/1000000) -
(double)(b->tv_sec + (double)b->tv_usec/1000000);
}
//Array to be divided among the processes
int buf[lim]=
{27,24,3,8,45,10,50,15,10,11,9,48,69,25,19,29,61,72,93,20};
int buf2[lim];
int buf3[lim];
int max;
int main(int argc, char *argv[])
{
struct timeval t_ini, t_fin;
double secs;
int n, myid, numprocs, i,j;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stderr,"Process %d in %s\n",myid, processor_name);
/*Check Border Conditions */
n=lim/numprocs;
gettimeofday(&t_ini, NULL); //take the time before sending the buffer with Scatter
for (j=0;j<n;j++){
MPI_Send(buf, lim, MPI_INT, 1, 111, MPI_COMM_WORLD);
}
gettimeofday(&t_fin, NULL);//take the time to complete the send routine
secs = timeval_diff(&t_fin, &t_ini);
if (myid == 0)
{ max = buf3[0];
for (i=1; i<n ; i++)
if (max < buf3[i]) max = buf3[i];
for (i=0; i<n ; i++)
printf("Buf3[%d]= %d \n", i, buf3[i]);
printf("Max number of the array is: %d \n", max);
}
for (i=0; i<n ; i++){
printf("%d,Buf2[%d]= %d \n",myid, i,buf2[i]);}
printf("%.16g milliseconds\n", secs * 1000.0);
MPI_Finalize();
return 0;
}
I have wasted some hours looking for where the fault is, but I cannot see it... Any help?
You are missing the MPI_Recv call on the other end of your MPI_Send call. These point-to-point functions are lower level than the collective scatter, gather, reduce, and broadcast functions, so you must post the matching receive yourself.
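A minimal sketch of what a matched send/receive version could look like (my sketch, assuming lim is divisible by numprocs; the tag 111 follows the original code):
#include <mpi.h>
#include <stdio.h>
#define lim 20
int buf[lim] = {27,24,3,8,45,10,50,15,10,11,9,48,69,25,19,29,61,72,93,20};
int buf2[lim];
int main(int argc, char *argv[])
{
    int myid, numprocs, i, n, local_max, max;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    n = lim / numprocs;
    if (myid == 0) {
        /* root keeps block 0 and sends one block to each other rank */
        for (i = 1; i < numprocs; i++)
            MPI_Send(buf + i * n, n, MPI_INT, i, 111, MPI_COMM_WORLD);
        for (i = 0; i < n; i++)
            buf2[i] = buf[i];
    } else {
        /* every other rank posts the receive that matches the send */
        MPI_Recv(buf2, n, MPI_INT, 0, 111, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    local_max = buf2[0]; /* maximum of this rank's block */
    for (i = 1; i < n; i++)
        if (buf2[i] > local_max)
            local_max = buf2[i];
    MPI_Reduce(&local_max, &max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);
    if (myid == 0)
        printf("Max number of the array is: %d\n", max);
    MPI_Finalize();
    return 0;
}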
