A sample openmp program with speedup - c

Could someone provide an OpenMP program where the speedup is visible compared to without it. I'm finding it extremely difficult to achieve speedup. Even this simple program runs slower with OpenMP. My processor is Intel® Core™ i3-2370M CPU # 2.40GHz × 4 running on Linux (Ubuntu 14.10)
#include <cmath>
#include <stdio.h>
#include <time.h>
int main() {
clock_t t;
t = clock();
const int size = 4;
long long int k;
#pragma omp parallel for num_threads(4)
for(int n=0; n<size; ++n) {
for(int j=0;j<100000000;j++){
}
printf("\n");
}
t = clock() - t;
printf ("It took me %d clicks (%f seconds).\n",t,((float)t)/CLOCKS_PER_SEC);
return 0;
}

I had a problem related to this, where I wanted to find the max value of an array. I made the same mistake as you, I used clock for measuring the elapsed time. To fix this, I used clock_gettime() instead, and now it works.
As for an example code where the speedup is measurable (Note you migth want to change the value of N):
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
struct timespec diff(struct timespec start, struct timespec end)
{
struct timespec temp;
if(end.tv_sec - start.tv_sec == 0)
{
temp.tv_nsec = end.tv_nsec - start.tv_nsec;
}
else
{
temp.tv_nsec = ((end.tv_sec - start.tv_sec)*1000000000) + end.tv_nsec - start.tv_nsec;
}
return temp;
}
int main()
{
unsigned int N;
struct timespec t_start, t_end;
clock_t start, end;
srand(time(NULL));
FILE *f = fopen("out.txt", "w");
if(f == NULL)
{
printf("Could not open output\n");
return -1;
}
for(N = 1000000; N < 100000000; N += 1000000)
{
fprintf(f, "%d\t", N);
int* array = (int*)malloc(sizeof(int)*N);
if(array == NULL)
{
printf("Not enough space\n");
return -1;
}
for(unsigned int i = 0; i<N; i++) array[i] = rand();
int max_val = 0.0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
#pragma omp parallel for reduction(max:max_val)
for(unsigned int i=0; i<N; i++)
{
if(array[i] > max_val) max_val = array[i];
}
clock_gettime(CLOCK_MONOTONIC, &t_end);
fprintf(f, "%lf\t", (double)(diff(t_start, t_end).tv_nsec / 1000000000.0));
max_val = 0.0;
clock_gettime(CLOCK_MONOTONIC, &t_start);
for(unsigned int i = 0; i<N; i++)
{
if(array[i] > max_val) max_val = array[i];
}
clock_gettime(CLOCK_MONOTONIC, &t_end);
fprintf(f, "%lf\n", (double)(diff(t_start, t_end).tv_nsec / 1000000000.0));
free(array);
}
fclose(f);
return 0;
}

Calculating a integral is a classical one, adjust the parts constant to increase the execution time and see more clearly the runtime, more parts, more execution time. It's getting 21.3 seconds with OpenMP enabled and 26.7 seconds, on a SINGLE core, DUAL thread Intel pentium 4:
#include <math.h>
#include <stdio.h>
#include <omp.h>
#define from 0.0f
#define to 2.0f
#define parts 999999999
#define step ((to - from) / parts)
#define x (from + (step / 2.0f))
int main()
{
double integralSum = 0;
int i;
#pragma omp parallel for reduction(+:integralSum)
for (i = 1; i < (parts+1); ++i)
{
integralSum = integralSum + (step * fabs(pow((x + (step * i)),2) + 4));
}
printf("%f\n", integralSum);
return 0;
}
It calculates the definite integral from 0 to 2 of x^2 + 4

Related

Why is my multi-threading program in VirtualBox not faster than my single-thread program?

everybody!
I have two program estimating PI using Monte-Carlo technique : one using single-thread and one using multi-thread.
The single-thread one :
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define BILLION 1000000000.0
int main(int argc, char *argv[])
{
struct timespec start, end;
///////////////////////
clock_gettime(CLOCK_REALTIME, &start);
///////////////////////
if(argc != 2)
{
fprintf(stderr, "usage: a.out <integer value>\n");
return -1;
}
if(atoi(argv[1]) < 0)
{
fprintf(stderr, "%d must be >= 0\n", atoi(argv[1]));
return -1;
}
time_t t;
srand((unsigned) time(&t));
int total = atoi(argv[1]);
int inside = 0;
unsigned int seed = rand()%30000;
for(int i = 0; i < total; ++i)
{
double rand_x = (double)rand_r(&seed)/(double)RAND_MAX;
double rand_y = (double)rand_r(&seed)/(double)RAND_MAX;
double dist = rand_x*rand_x + rand_y*rand_y;
if(dist < 1.0) ++inside;
}
double pi = (double)(4 * inside)/total;
clock_gettime(CLOCK_REALTIME, &end);
double time_spent = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / BILLION;
printf("pi = %lf\n", pi);
printf("time = %f\n", time_spent);
return 0;
}
The multi-thread one:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#include <math.h>
#define N 5
#define BILLION 1000000000.0
int inside = 0;
pthread_mutex_t mutex;
void* countInside(void * n)
{
int total = (int)n;
int hit_count = 0;
unsigned int seed = rand()%30000;
for(int i = 0; i < total; ++i)
{
double rand_x = (double)rand_r(&seed)/(double)RAND_MAX;
double rand_y = (double)rand_r(&seed)/(double)RAND_MAX;
double dist = rand_x*rand_x + rand_y*rand_y;
if(dist < 1.0) ++hit_count;
}
pthread_mutex_lock(&mutex);
inside += hit_count;
pthread_mutex_unlock(&mutex);
pthread_exit(0);
}
int main(int argc, char *argv[])
{
struct timespec start, end;
///////////////////////
clock_gettime(CLOCK_REALTIME, &start);
///////////////////////
if(argc != 2)
{
fprintf(stderr, "usage: a.out <integer value>\n");
return -1;
}
if(atoi(argv[1]) < 0)
{
fprintf(stderr, "%d must be >= 0\n", atoi(argv[1]));
return -1;
}
int total = atoi(argv[1]);
srand((unsigned) time(NULL));
//int N;
//printf("Input the number of thread you desire : ");
//scanf("%d", &N);
int n = total/N;
//pthread_t* tid = malloc(sizeof(pthread_t) * (N));
pthread_t tid[N];
pthread_mutex_init(&mutex, NULL);
for(int i = 0; i < N; ++i)
{
pthread_create(&tid[i], 0, countInside, (void*)n);
}
for(int i = 0; i < N; ++i)
{
pthread_join(tid[i], NULL);
}
double pi = 4.0 * inside / total;
clock_gettime(CLOCK_REALTIME, &end);
double time_spent = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / BILLION;
printf("pi = %lf\n", pi);
printf("time = %lf\n", time_spent);
return 0;
}
When i execute both program with 100000000 points, i get the ouput:
Ouput of single-thread:
quan#quan-VirtualBox:~/Documents/lab5$ ./pi_serial 100000000
pi = 3.141583
time = 1.576207
Output of multi-thread:
quan#quan-VirtualBox:~/Documents/lab5$ ./pi_multi-thread 100000000
pi = 3.141532
time = 1.446410
Note : There are sometimes multi-thread one is even slower than single-thread one.
What's the problem ? I thought multi-thread must have some speed-up compared to single-thread one. Is my multi-thread code wrong ? Please give me some advice? Thank you!

Parallel MPI Version

I have to write a parallel MPI version of my dot product (code below):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define SIZE 10000000
volatile float a[SIZE];
volatile float b[SIZE];
int main(int argc, char **argv)
{
long int i;
double sum;
struct timeval time1, time2;
srand(time(0));
for (i = 0; i < SIZE; i++)
{
a[i] = rand();
b[i] = rand();
}
gettimeofday(&time1, 0);
sum = 0.0;
for (i = 0; i < SIZE; i++)
{
sum = sum + a[i]*b[i];
}
gettimeofday(&time2, 0);
MPI_Scatter(a, 1, MPI_INT, &b, 1, MPI_INT, 0, MPI_COMM_WORLD);
printf("Elapsed time (us) = %d\n", (time2.tv_sec-time1.tv_sec)*1000000 + time2.tv_usec - time1.tv_usec);
return 0;
}
My question is, what code do I need to add to the program, what MPI primitives are useful?

c know how much memory was used in the execution

For example, the next code, how to know the memory without the struct timeval and int microseg?.
Is the problem 1 of project Euler.
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/resource.h>
int main(){
struct timeval t, t2;
struct rusage uso;
int microseg;
gettimeofday(&t, NULL);
int sum = 0;
for (int k = 2; k < 1000; k++){
if(k%3 == 0 || k%5 == 0)
sum +=k;
}
printf("%d \n",sum);
gettimeofday(&t2, NULL);
microsegundos = ((t2.tv_usec - t.tv_usec) + ((t2.tv_sec - t.tv_sec) * 1000000.0f));
printf("CPU time: %d\n",microseg);
getrusage(RUSAGE_SELF, &uso);
printf("Memory: %ld KB\n", (long)uso.ru_maxrss);
return 0;
}

How to parallelize the below for loop

I am trying to integrate a function of curve, and convert the serial code to parallel program, I am using openMP for the same.
I have parallelized the for loop using openMP parallel for and have achieved lesser program time, but the problem is the result is not the expected one, there is something which get messed up in the threads, I want to know how to parallelize the for loop for N number of threads.
#include <stdio.h>
#include <omp.h>
#include <math.h>
double f(double x){
return sin(x)+0.5*x;
}
int main(){
int n=134217728,i;
double a=0,b=9,h,x,sum=0,integral;
double start = omp_get_wtime();
h=fabs(b-a)/n;
omp_set_dynamic(0);
omp_set_num_threads(64);
#pragma omp parallel for reduction (+:sum) shared(x)
for(i=1;i<n;i++){
x=a+i*h;
sum=sum+f(x);
}
integral=(h/2)*(f(a)+f(b)+2*sum);
double end = omp_get_wtime();
double time = end - start;
printf("Execution time: %2.3f seconds\n",time);
printf("\nThe integral is: %lf\n",integral);
}
The expected output is 22.161130 but it is getting varied each time the program is ran.
The loop you are trying to parallelise modifies the same variables x and sum in each iteration, this is very cumbersome to parallelize.
You could rewrite the code to make the path to parallelisation more obvious:
#include <stdio.h>
#include <omp.h>
#include <math.h>
double f(double x) {
return sin(x) + 0.5 * x;
}
int main() {
int n = 1 << 27, i, j;
double a = 0, b = 9, h, x, sum, integral;
double sums[64] = { 0 };
double start = omp_get_wtime();
h = fabs(b - a) / n;
omp_set_dynamic(0);
omp_set_num_threads(64);
#pragma omp parallel for
for (j = 0; j < 64; j++) {
for (i = 0; i < n; i += 64) {
sums[j] += f(a + i * h + j * h);
}
}
sum = 0;
for (j = 0; j < 64; j++) {
sum += sums[i];
}
integral = (h / 2) * (f(a) + f(b) + 2 * sum);
double end = omp_get_wtime();
double time = end - start;
printf("Execution time: %2.3f seconds\n", time);
printf("\nThe integral is: %lf\n", integral);
return 0;
}

How to measure scanf time in a C program?

In the following program, I want to measure the input time i.e. the time taken by user to enter the variables of the array :
#include <stdio.h>
#include <time.h>
int main()
{
int i, array[10];
double user_input_time;
clock_t input_start, input_end;
input_start = clock();
for (i = 0; i < 10; i++)
{
scanf("%d", &array[i]);
}
input_end = clock();
user_input_time = ((double)(input_end - input_start)) / CLOCKS_PER_SEC;
printf("User Input Time : %f\n", user_input_time);
return 0;
}
Above, what I'm getting is the processor time taken not the input time taken by user to enter all the 10 variable of the array.
Please, can someone help me in doing so.
Include: time.h
Use:
int main()
{
time_t start = time(NULL);
//Do your operations here
printf("%.2f\n", (double)(time(NULL) - start));
return 0;
}
Note - We can use clock_gettime for more precise results - link
Using clock_gettime
int main () {
struct timespec start, finish;
clock_gettime(CLOCK_REALTIME, &start);
// do your operations here
clock_gettime(CLOCK_REALTIME, &finish);
long seconds = finish.tv_sec - start.tv_sec;
long ns = finish.tv_nsec - start.tv_nsec;
if (start.tv_nsec > finish.tv_nsec) { // clock underflow
--seconds;
ns += 1000000000;
}
printf("seconds without ns: %ld\n", seconds);
printf("nanoseconds: %ld\n", ns);
printf("total seconds: %e\n", (double)seconds + (double)ns/(double)1000000000);
}
Which precision do you want ?
Because you can use simply time(NULL) if you only want to know the time at the second.
#include <stdio.h>
#include <time.h>
int main(void)
{
int i, array[10];
time_t user_input_time, input_start, input_end;
input_start = time(NULL);
for (i = 0; i < 10; i++)
{
scanf("%d", &array[i]);
}
input_end = time(NULL);
user_input_time = input_end - input_start;
printf("User Input Time : %d second\n", (int)user_input_time);
return 0;
}
You can use gettimeofday() from sys/time.h
#include <stdio.h>
#include <sys/time.h>
int main(void)
{
int i, array[10];
struct timeval input_start, input_end;
gettimeofday(&input_start, NULL);
for (i = 0; i < 10; i++)
{
scanf("%d", &array[i]);
}
gettimeofday(&input_end, NULL);
printf("User Input Time : %d second\n", input_end.tv_sec - input_start.tv_sec);
return 0;
}

Resources