I was trying to solve this openmp tutorial exercise in which I was required to parallelize the following serial code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
I came up with the following solution for this exercise. First I parallelizes both the producer and the consumer using #pragma omp parallel for and then I added a barrier and a flush between the calls to the functions. Here is my code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 1000000000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
#pragma omp barrier
#pragma omp flush
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
However there is a race condition in this code as when I run it multiple times, I get slightly different values. The solution to this problem, as per the tutorial, uses the producer-consumer pattern. It first runs the fill_rand function. Then after that function finishes its execution, the code sets up a flag variable to instruct the consumer to start executing. Of course, it also adds flushes between the two sections of producer and consumer. To me this code looks similar to my solution. As far as I understand it, both pieces of code first run the producer, the flush the array to the memory, and then run the consumer to get the result. My code, however runs twice as fast, probably due to less cache flushes. But, my code seems to have some race conditions. Here is the provided solution which does not have any race conditions:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 1000000000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
#pragma omp parallel sections
{
#pragma omp section
{
fill_rand(N, A); // Producer: fill an array of data
#pragma omp flush
#pragma omp atomic write
flag = 1;
#pragma omp flush(flag)
}
#pragma omp section
{
#pragma omp flush(flag)
while(1)
{
#pragma omp flush(flag)
#pragma omp atomic read
__flag = flag;
if(__flag == 1) break;
}
#pragma omp flush
sum = Sum_array(N, A); // Consumer: sum the array
}
runtime = omp_get_wtime() - runtime;
}
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
I can't figure out what I am doing wrong. Can someone please help me out.
I also tried running the following code which adds the barrier and the flush in the parallel region, but even this version has a race condition somewhere.
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel
{
#pragma omp for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
#pragma omp barrier
#pragma omp flush
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
Update:
As suggested by some comments, the problem was with the global variable randy which was being treated as a shared variable by default by openmp. So I changed that variable to firstprivate and now the run to run variation is no longer there. I suppose that the randy variable was the source of the race condition. However, after making this change in both the provided solution and my own solution, I am getting different answers from both the programs. Note that there is no run to run variation, but the answers that I am getting from both the versions is different. Moreover, the answer that I get when I run my version changes depending on the number of threads that I use. Again, I am not sure why this is happening.
Here is the solution provided by the tutorial which results in the same answer regardless of the number of threads:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel for schedule(static) firstprivate(randy)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
#pragma omp parallel sections
{
#pragma omp section
{
fill_rand(N, A); // Producer: fill an array of data
#pragma omp flush
#pragma omp atomic write
flag = 1;
#pragma omp flush(flag)
}
#pragma omp section
{
#pragma omp flush(flag)
while(1)
{
#pragma omp flush(flag)
#pragma omp atomic read
__flag = flag;
if(__flag == 1) break;
}
#pragma omp flush
sum = Sum_array(N, A); // Consumer: sum the array
}
runtime = omp_get_wtime() - runtime;
}
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
And here is my solution which results in the different answers depending on the number of threads:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp parallel
{
#pragma omp for schedule(static) firstprivate(randy)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
#pragma omp barrier
#pragma omp flush
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
Update #2:
The problem was indeed with the variable randy. Refer to my answer below for more information. Thanks!
As suggested by the comments, the problem was with the variable randy. Declaring it as threadprivate solved the issue. Thanks for the help everyone! For the sake of completion, here is the working code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second functions consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>
#define N 10000
/* Some random number constants from numerical recipies */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025
int randy = SEED;
#pragma omp threadprivate(randy)
/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
int i;
#pragma omp for schedule(static)
for (i=0;i<length;i++) {
randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
*(a+i) = ((double) randy)/((double) RAND_MOD);
}
}
/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
int i; double sum = 0.0;
#pragma omp parallel for reduction(+:sum) schedule(static)
for (i=0;i<length;i++) sum += *(a+i);
return sum;
}
int main()
{
double *A, sum, runtime;
int flag = 0;
int __flag;
A = (double *)malloc(N*sizeof(double));
runtime = omp_get_wtime();
fill_rand(N, A); // Producer: fill an array of data
sum = Sum_array(N, A); // Consumer: sum the array
runtime = omp_get_wtime() - runtime;
printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
I am trying to write a parallel program which takes an error rate(i.e 0.01) and returns a PI value which is closer to PI than the error with montecarlo simulation.
I wrote a simple function however it does not terminate as error rate is always around 11.
I appreciate your comments.
#include "stdio.h"
#include "omp.h"
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
double drand48(void);
double monte_carlo(double epsilon){
double x,y, pi_estimate = 0.0;
double drand48(void);
double error = 10000.0;
int n = 0; // total number of points
int i = 0; // total numbers of points inside circle
int p = omp_get_num_threads();
while(error>=epsilon){
#pragma omp parallel private(x, y) reduction(+:i)//OMP parallel directive
{
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){i+=1;}
}
n+=p;
printf("%lf\n", error);
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
}
return pi_estimate;
}
int main(int argc, char* argv[]) {
double epsilon = 0.01;
printf("PI estimate: %lf",monte_carlo(epsilon));
return 0;
}
Calling omp_get_num_threads() outside a parallel section will always return 1, as there is only one active thread at the moment the function is called. The following code should give a correct result, but will be much slower than the serial version due to the large parallelization & synchronization overhead spend for doing a very simple operation.
#pragma omp parallel private(x, y) reduction(+:i)//OMP parallel directive
{
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){i+=1;}
#pragma omp master
n+=omp_get_num_threads();
}
The following avoids repeatedly spawning threads and may be more efficient, but still probably slower.
#pragma omp parallel private(x, y)
while(error>=epsilon){
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){
#pragma omp atomic
i++;
}
#pragma omp barrier
#pragma omp single
{
n+=omp_get_num_threads();
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
printf("%lf\n", error);
} // implicit barrier here
}
In order to really go faster, a minimum number of iterations should be given such as:
#define ITER 1000
#pragma omp parallel private(x, y)
while(error>=epsilon){
#pragma omp for reduction(+:i)
for (int j=1;j<ITER;j++){
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0) i+=1;
}
/* implicit barrier + implicit atomic addition
* of thread-private accumulator to shared variable i
*/
#pragma omp single
{
n+=ITER;
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
printf("%lf\n", error);
} // implicit barrier
}
I installed both gcc-7, gcc-8, gcc-7-offload-nvptx and gcc-8-offload-nvptx
I tried with both to compile a simple OpenMP code with offloading:
#include <omp.h>
#include <stdio.h>
int main(){
#pragma omp target
#pragma omp teams distribute parallel for
for (int i=0; i<omp_get_num_threads(); i++)
printf("%d in %d of %d\n",i,omp_get_thread_num(), omp_get_num_threads());
}
With the following line (with gcc-7 too):
gcc-8 code.c -fopenmp -foffload=nvptx-none
But it doesn't compile, giving the following error:
/tmp/ccKESWcF.o: In function "main":
teste.c:(.text+0x50): undefined reference to "GOMP_target_ext"
/tmp/cc0iOH1Y.target.o: In function "init":
ccPXyu6Y.c:(.text+0x1d): undefined reference to "GOMP_offload_register_ver"
/tmp/cc0iOH1Y.target.o: In function "fini":
ccPXyu6Y.c:(.text+0x41): undefined reference to "GOMP_offload_unregister_ver"
collect2: error: ld returned 1 exit status
some clues?
You code compiles and runs for me using -foffload=disable -fno-stack-protector with gcc7 and gcc-7-offload-nvptx and Ubuntu 17.10.
But on the GPU (without -foffload=disable) it fails to compile. You can't call printf from the GPU. Instead you can do this:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int main(){
int nthreads;
#pragma omp target teams map(tofrom:nthreads)
#pragma omp parallel
#pragma omp single
nthreads = omp_get_num_threads();
int *ithreads = malloc(sizeof *ithreads *nthreads);
#pragma omp target teams distribute parallel for map(tofrom:ithreads[0:nthreads])
for (int i=0; i<nthreads; i++) ithreads[i] = omp_get_thread_num();
for (int i=0; i<nthreads; i++)
printf("%d in %d of %d\n", i, ithreads[i], nthreads);
free(ithreads);
}
For me this outputs
0 in 0 of 8
1 in 0 of 8
2 in 0 of 8
3 in 0 of 8
4 in 0 of 8
5 in 0 of 8
6 in 0 of 8
7 in 0 of 8
I'm writing some simple example to understand how the things work with OpenMP programs.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <omp.h>
int main (int argc ,char* argv[]){
omp_set_num_threads(4);
int j =0;
#pragma omp parallel private (j)
{
int i;
for(i=1;i<2;i++){
printf("from thread %d : i is equel to %d and j is equal to %d\n ",omp_get_thread_num(),i,j);
}
}
}
So in this example I should get j=0 each time,
unfortunately the result is j == 0 3 times , and j == 32707 one time.
What is wrong with my example?
Use firstprivate(j) rather than private(j) if you want that each thread has a private copy of j with the initial value being the value before entering the parallel region.
What I am looking for is what is the best way to gather all the data from the parallel for loops into one variable. OpenMP seems to have a different routine then I am used to seeing as I started learning OpenMPI first which has scatter and gather routines.
Calculating PI (embarrassingly parallel routine)
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_STEPS 100
#define CHUNKSIZE 20
int main(int argc, char *argv[])
{
double step, x, pi, sum=0.0;
int i, chunk;
chunk = CHUNKSIZE;
step = 1.0/(double)NUM_STEPS;
#pragma omp parallel shared(chunk) private(i,x,sum,step)
{
#pragma omp for schedule(dynamic,chunk)
for(i = 0; i < NUM_STEPS; i++)
{
x = (i+0.5)*step;
sum = sum + 4.0/(1.0+x*x);
printf("Thread %d: i = %i sum = %f \n",tid,i,sum);
}
pi = step * sum;
}
EDIT: It seems that I could use an array sum[*NUM_STEPS / CHUNKSIZE*] and sum the array into one value, or would it be better to use some sort of blocking routine to sum the product of each iteration
Add this clause to your #pragma omp parallel ... statement:
reduction(+ : pi)
Then just do pi += step * sum; at the end of the parallel region. (Notice the plus!) OpenMP will then automagically sum up the partial sums for you.
Lets see, I am not quite sure what happens, because I havn't got deterministic behaviour on the finished application, but I have something looks like it resembles π. I removed the #pragma omp parallel shared(chunk) and changed the #pragma omp for schedule(dynamic,chunk) to #pragma omp parallel for schedule(dynamic) reduction(+:sum).
#pragma omp parallel for schedule(dynamic) reduction(+:sum)
This requires some explanation, I removed the schedules chunk just to make it all simpler (for me). The part that you are interested in is the reduction(+:sum) which is a normal reduce opeartion with the operator + and using the variable sum.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_STEPS 100
int main(int argc, char *argv[])
{
double step, x, pi, sum=0.0;
int i;
step = 1.0/(double)NUM_STEPS;
#pragma omp parallel for schedule(dynamic) reduction(+:sum)
for(i = 0; i < NUM_STEPS; i++)
{
x = (i+0.5)*step;
sum +=4.0/(1.0+x*x);
printf("Thread %%d: i = %i sum = %f \n",i,sum);
}
pi = step * sum;
printf("pi=%lf\n", pi);
}