I was trying to solve an OpenMP tutorial exercise in which I was required to parallelize the following serial code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 10000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    for (i = 0; i < length; i++) {
        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
        *(a + i) = ((double) randy) / ((double) RAND_MOD);
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    fill_rand(N, A);       // Producer: fill an array of data
    sum = Sum_array(N, A); // Consumer: sum the array

    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
I came up with the following solution for this exercise. First I parallelized both the producer and the consumer using #pragma omp parallel for, and then I added a barrier and a flush between the calls to the two functions. Here is my code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 1000000000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    #pragma omp parallel for schedule(static)
    for (i = 0; i < length; i++) {
        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
        *(a + i) = ((double) randy) / ((double) RAND_MOD);
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum) schedule(static)
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;
    int __flag;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    fill_rand(N, A);       // Producer: fill an array of data

    #pragma omp barrier
    #pragma omp flush

    sum = Sum_array(N, A); // Consumer: sum the array

    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
However, there is a race condition in this code: when I run it multiple times, I get slightly different values. The solution to this problem, as per the tutorial, uses the producer-consumer pattern. It first runs the fill_rand function. Then, after that function finishes executing, it sets a flag variable to instruct the consumer to start, and of course it also adds flushes between the producer and consumer sections. To me this code looks similar to my solution: as far as I understand it, both pieces of code first run the producer, then flush the array to memory, and then run the consumer to get the result. My code, however, runs twice as fast, probably due to fewer cache flushes, but it seems to have some race condition. Here is the provided solution, which does not have any race conditions:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 1000000000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    #pragma omp parallel for schedule(static)
    for (i = 0; i < length; i++) {
        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
        *(a + i) = ((double) randy) / ((double) RAND_MOD);
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum) schedule(static)
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;
    int __flag;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            fill_rand(N, A); // Producer: fill an array of data
            #pragma omp flush
            #pragma omp atomic write
            flag = 1;
            #pragma omp flush(flag)
        }
        #pragma omp section
        {
            #pragma omp flush(flag)
            while (1)
            {
                #pragma omp flush(flag)
                #pragma omp atomic read
                __flag = flag;
                if (__flag == 1) break;
            }
            #pragma omp flush
            sum = Sum_array(N, A); // Consumer: sum the array
        }
    }
    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
I can't figure out what I am doing wrong. Can someone please help me out?
I also tried running the following code, which adds the barrier and the flush inside the parallel region, but even this version has a race condition somewhere.
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 10000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    #pragma omp parallel
    {
        #pragma omp for schedule(static)
        for (i = 0; i < length; i++) {
            randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
            *(a + i) = ((double) randy) / ((double) RAND_MOD);
        }
        #pragma omp barrier
        #pragma omp flush
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum) schedule(static)
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;
    int __flag;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    fill_rand(N, A);       // Producer: fill an array of data
    sum = Sum_array(N, A); // Consumer: sum the array

    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
Update:
As suggested by some comments, the problem was with the global variable randy, which OpenMP treats as shared by default. I changed that variable to firstprivate, and the run-to-run variation is now gone, so I suppose randy was indeed the source of the race condition. However, after making this change in both the provided solution and my own solution, I am getting different answers from the two programs. There is no run-to-run variation anymore, but the two versions disagree with each other. Moreover, the answer from my version changes depending on the number of threads I use. Again, I am not sure why this is happening.
Here is the solution provided by the tutorial, which gives the same answer regardless of the number of threads:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 10000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    #pragma omp parallel for schedule(static) firstprivate(randy)
    for (i = 0; i < length; i++) {
        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
        *(a + i) = ((double) randy) / ((double) RAND_MOD);
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum) schedule(static)
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;
    int __flag;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            fill_rand(N, A); // Producer: fill an array of data
            #pragma omp flush
            #pragma omp atomic write
            flag = 1;
            #pragma omp flush(flag)
        }
        #pragma omp section
        {
            #pragma omp flush(flag)
            while (1)
            {
                #pragma omp flush(flag)
                #pragma omp atomic read
                __flag = flag;
                if (__flag == 1) break;
            }
            #pragma omp flush
            sum = Sum_array(N, A); // Consumer: sum the array
        }
    }
    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
And here is my solution, which gives different answers depending on the number of threads:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 10000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    #pragma omp parallel
    {
        #pragma omp for schedule(static) firstprivate(randy)
        for (i = 0; i < length; i++) {
            randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
            *(a + i) = ((double) randy) / ((double) RAND_MOD);
        }
        #pragma omp barrier
        #pragma omp flush
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum) schedule(static)
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;
    int __flag;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    fill_rand(N, A);       // Producer: fill an array of data
    sum = Sum_array(N, A); // Consumer: sum the array

    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
Update #2:
The problem was indeed with the variable randy. Refer to my answer below for more information. Thanks!
As suggested by the comments, the problem was with the variable randy; declaring it threadprivate solved the issue. Thanks for the help, everyone! For the sake of completeness, here is the working code:
/*
** PROGRAM: A simple serial producer/consumer program
**
** One function generates (i.e. produces) an array of random values.
** A second function consumes that array and sums it.
**
** HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N 10000

/* Some random number constants from Numerical Recipes */
#define SEED 2531
#define RAND_MULT 1366
#define RAND_ADD 150889
#define RAND_MOD 714025

int randy = SEED;
#pragma omp threadprivate(randy)

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
    int i;
    #pragma omp for schedule(static)
    for (i = 0; i < length; i++) {
        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
        *(a + i) = ((double) randy) / ((double) RAND_MOD);
    }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
    int i;
    double sum = 0.0;
    #pragma omp parallel for reduction(+:sum) schedule(static)
    for (i = 0; i < length; i++) sum += *(a + i);
    return sum;
}

int main()
{
    double *A, sum, runtime;
    int flag = 0;
    int __flag;

    A = (double *)malloc(N * sizeof(double));

    runtime = omp_get_wtime();

    fill_rand(N, A);       // Producer: fill an array of data
    sum = Sum_array(N, A); // Consumer: sum the array

    runtime = omp_get_wtime() - runtime;

    printf(" In %f seconds, The sum is %f \n", runtime, sum);
}
Related
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define pow(x) ((x) * (x))
#define NUM_THREADS 8
#define wmax 1000
#define Nv 2
#define N 5

int b = 0;
float Points[N][Nv] = { {0,1}, {3,4}, {1,2}, {5,1}, {8,9} };
float length[wmax+1] = {0};

float EuclDist(float* Ne, float* Pe) {
    int i;
    float s = 0;
    for (i = 0; i < Nv; i++) {
        s += pow(Ne[i] - Pe[i]);
    }
    return s;
}

void DistanceFinder(float* a[]) {
    int i;
    #pragma omp simd
    for (i = 1; i < N + 1; i++) {
        length[b] += EuclDist(&a[i], &a[i-1]);
    }
    //printf(" %f\n", length[b]);
}

void NewRoute() {
    // some irrelevant things
    DistanceFinder(Points);
}

int main() {
    omp_set_num_threads(NUM_THREADS);
    do {
        b += 1;
        NewRoute();
    } while (b < wmax);
}
Trying to parallelize this loop, I tried different things and ended up with this one.
Seems to be the fastest, however is it correct to use SIMD like that? Because I'm using a previous iteration (i and i - 1). The results I see though are correct weirdly or not.
Seems to be the fastest, however is it correct to use SIMD like that?
First, there is a race condition that needs to be fixed, namely the updates of length[b]. Moreover, you are accessing memory outside the array a (iterating from 1 to N + 1), and you are passing &a[i] rather than a[i]. You can fix the race condition by using the OpenMP reduction clause:
void DistanceFinder(float* a[]) {
    int i;
    float sum = 0;
    float tmp;

    #pragma omp simd private(tmp) reduction(+:sum)
    for (i = 1; i < N; i++) {
        tmp = EuclDist(a[i], a[i-1]);
        sum += tmp;
    }
    length[b] += sum;
}
Furthermore, you need to provide a version of EuclDist as follows:
#pragma omp declare simd uniform(Ne, Pe)
float EuclDist(float* Ne, float* Pe) {
    int i;
    float s = 0;

    for (i = 0; i < Nv; i++)
        s += pow(Ne[i] - Pe[i]);
    return s;
}
Because I'm using a previous iteration (i and i - 1).
In your case, it is okay, since the array a is just being read.
The results I see though are correct weirdly or not.
Very likely, no vectorization was taking place. Regardless, it would still be undefined behavior due to the aforementioned race condition.
You can simplify your code to increase the likelihood that vectorization actually happens, for instance:
void DistanceFinder(float* a[]) {
    int i;
    float sum = 0;
    float tmp;

    #pragma omp simd private(tmp) reduction(+:sum)
    for (i = 1; i < N; i++) {
        tmp = pow(a[i][0] - a[i-1][0]) + pow(a[i][1] - a[i-1][1]);
        sum += tmp;
    }
    length[b] += sum;
}
A further change you can make to improve performance is to allocate the matrix passed to DistanceFinder so that iterating over its rows (i.e., a[i]) walks contiguous memory addresses.
For instance, you could pass two arrays a1 and a2 representing the first and second columns of the matrix a:
void DistanceFinder(float a1[], float a2[]) {
    int i;
    float sum = 0;
    float tmp;

    #pragma omp simd private(tmp) reduction(+:sum)
    for (i = 1; i < N; i++) {
        tmp = pow(a1[i] - a1[i-1]) + pow(a2[i] - a2[i-1]);
        sum += tmp;
    }
    length[b] += sum;
}
I am trying to wrap my head around combining OpenACC with pointers to structs containing dynamically allocated members. The code below fails with
Failing in Thread:1
call to cuStreamSynchronize returned error 700: Illegal address during kernel execution
when compiled using nvc ("nvc 20.9-0 LLVM 64-bit target on x86-64 Linux -tp haswell"). As far as I can tell, I am following the approach suggested, e.g., in the OpenACC 'getting started' guide. But somehow the pointers presumably don't stick (?) on the device. Does anyone know what goes wrong here?
#include <stdlib.h>
#include <stdio.h>

typedef struct grid
{
    int N;
    double *X;
} grid;

void allocate(grid* g, int N)
{
    g->N = N;
    g->X = (double*) malloc(sizeof(double) * g->N);
    #pragma acc enter data create(g[0:1])
    #pragma acc enter data create(g->X[0:N])
}

void release(grid* g)
{
    #pragma acc exit data delete(g->X[0:g->N])
    #pragma acc exit data delete(g[0:1])
    free(g->X);
}

void fill(grid * g)
{
    int i;
    #pragma acc parallel loop
    for (i = 0; i < g->N; i++)
    {
        g->X[i] = 42; // the culprit; commenting this out removes the error too
    }
}

int main()
{
    grid g;
    allocate(&g, 10);
    fill(&g);
    release(&g);
    return 0;
}
From the compiler feedback messages you'll see something like:
fill:
32, Accelerator restriction: size of the GPU copy of g is unknown
Generating Tesla code
32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
32, Generating implicit copyin(g) [if not already present]
37, Generating update self(g->X[:g->N])
The problem is that the compiler can't implicitly copy aggregate types with dynamic data members, so you need to add present(g) to indicate that g is already present on the device.
Also, you'll want to copyin g in order to get the value of N on the device; there is no need to include the array shape in the exit data delete directive. For example:
% cat test.c
#include <stdlib.h>
#include <stdio.h>

typedef struct grid
{
    int N;
    double *X;
} grid;

void allocate(grid* g, int N)
{
    g->N = N;
    g->X = (double*) malloc(sizeof(double) * g->N);
    #pragma acc enter data copyin(g[0:1])
    #pragma acc enter data create(g->X[0:N])
}

void release(grid* g)
{
    #pragma acc exit data delete(g->X)
    #pragma acc exit data delete(g)
    free(g->X);
}

void fill(grid * g)
{
    int i;
    #pragma acc parallel loop present(g)
    for (i = 0; i < g->N; i++)
    {
        g->X[i] = 42; // the culprit; commenting this out removes the error too
    }
    #pragma acc update self(g->X[:g->N])
    for (i = 0; i < 4; i++)
    {
        printf("%d : %f \n", i, g->X[i]);
    }
}

int main()
{
    grid g;
    allocate(&g, 10);
    fill(&g);
    release(&g);
    return 0;
}
% nvc -acc test.c -Minfo=accel -V20.9 ; a.out
allocate:
17, Generating enter data copyin(g[:1])
Generating enter data create(g->X[:N])
release:
24, Generating exit data delete(g[:1],g->X[:1])
fill:
32, Generating present(g[:1])
Generating Tesla code
32, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
37, Generating update self(g->X[:g->N])
0 : 42.000000
1 : 42.000000
2 : 42.000000
3 : 42.000000
I am trying to write a parallel program which takes an error rate (e.g. 0.01) and returns, via Monte Carlo simulation, a value of PI that is closer to PI than that error.
I wrote a simple function; however, it does not terminate, as the error rate is always around 11.
I appreciate your comments.
#include "stdio.h"
#include "omp.h"
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
double drand48(void);
double monte_carlo(double epsilon){
double x,y, pi_estimate = 0.0;
double drand48(void);
double error = 10000.0;
int n = 0; // total number of points
int i = 0; // total numbers of points inside circle
int p = omp_get_num_threads();
while(error>=epsilon){
#pragma omp parallel private(x, y) reduction(+:i)//OMP parallel directive
{
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){i+=1;}
}
n+=p;
printf("%lf\n", error);
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
}
return pi_estimate;
}
int main(int argc, char* argv[]) {
double epsilon = 0.01;
printf("PI estimate: %lf",monte_carlo(epsilon));
return 0;
}
Calling omp_get_num_threads() outside a parallel section will always return 1, as there is only one active thread at the moment the function is called. The following code should give a correct result, but will be much slower than the serial version due to the large parallelization and synchronization overhead spent doing a very simple operation.
#pragma omp parallel private(x, y) reduction(+:i) // OMP parallel directive
{
    x = drand48();
    y = drand48();
    if ((x*x + y*y) <= 1.0) { i += 1; }

    #pragma omp master
    n += omp_get_num_threads();
}
The following avoids repeatedly spawning threads and may be more efficient, but still probably slower.
#pragma omp parallel private(x, y)
while (error >= epsilon) {
    x = drand48();
    y = drand48();
    if ((x*x + y*y) <= 1.0) {
        #pragma omp atomic
        i++;
    }

    #pragma omp barrier
    #pragma omp single
    {
        n += omp_get_num_threads();
        pi_estimate = 4.0 * (double)i / (double)n;
        error = fabs(M_PI - pi_estimate) / M_PI;
        printf("%lf\n", error);
    } // implicit barrier here
}
In order to really go faster, a minimum number of iterations per round should be given, such as:
#define ITER 1000

#pragma omp parallel private(x, y)
while (error >= epsilon) {
    #pragma omp for reduction(+:i)
    for (int j = 0; j < ITER; j++) {
        x = drand48();
        y = drand48();
        if ((x*x + y*y) <= 1.0) i += 1;
    }
    /* implicit barrier + implicit atomic addition
     * of thread-private accumulator to shared variable i
     */
    #pragma omp single
    {
        n += ITER;
        pi_estimate = 4.0 * (double)i / (double)n;
        error = fabs(M_PI - pi_estimate) / M_PI;
        printf("%lf\n", error);
    } // implicit barrier
}
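One caveat worth noting: drand48() keeps hidden global state, so calling it concurrently from several threads is itself a data race. POSIX also provides erand48(), which takes caller-owned state, so each thread can run a private generator. Below is a minimal sketch of that idea; the function name count_hits and the seeding scheme are mine, purely for illustration:

#include <stdlib.h>
#include <omp.h>

/* Count points falling inside the unit circle, using a per-thread
 * erand48() state instead of the shared drand48() state. */
long count_hits(long iters)
{
    long hits = 0;
    #pragma omp parallel reduction(+:hits)
    {
        /* each thread owns its RNG state; mix in the thread number */
        unsigned short state[3] = { 0x330E, (unsigned short)omp_get_thread_num(), 42 };
        #pragma omp for
        for (long j = 0; j < iters; j++) {
            double x = erand48(state);
            double y = erand48(state);
            if (x * x + y * y <= 1.0) hits += 1;
        }
    }
    return hits;
}

With per-thread streams, the reduction makes the final count deterministic for a fixed thread count and seed choice.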
What I am looking for is the best way to gather all the data from the parallel for loops into one variable. OpenMP seems to have different routines than I am used to seeing; I started by learning Open MPI first, which has scatter and gather routines.
Calculating PI (an embarrassingly parallel routine):
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_STEPS 100
#define CHUNKSIZE 20

int main(int argc, char *argv[])
{
    double step, x, pi, sum = 0.0;
    int i, chunk;

    chunk = CHUNKSIZE;
    step = 1.0 / (double)NUM_STEPS;

    #pragma omp parallel shared(chunk) private(i,x,sum,step)
    {
        #pragma omp for schedule(dynamic,chunk)
        for (i = 0; i < NUM_STEPS; i++)
        {
            x = (i + 0.5) * step;
            sum = sum + 4.0 / (1.0 + x*x);
            printf("Thread %d: i = %i sum = %f \n", omp_get_thread_num(), i, sum);
        }
        pi = step * sum;
    }
}
EDIT: It seems that I could use an array sum[NUM_STEPS / CHUNKSIZE] and sum the array into one value, or would it be better to use some sort of blocking routine to sum the product of each iteration?
Add this clause to your #pragma omp parallel ... statement:
reduction(+ : pi)
Then just do pi += step * sum; at the end of the parallel region. (Notice the plus!) OpenMP will then automagically sum up the partial sums for you.
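Put together with the loop above, the region might look like the following sketch. It assumes pi is initialized to 0.0 beforehand, and step is left shared (unlike in the original pragma) so that every thread sees its computed value:

#pragma omp parallel shared(chunk) private(i, x, sum) reduction(+ : pi)
{
    sum = 0.0; /* private running sum for this thread */
    #pragma omp for schedule(dynamic, chunk)
    for (i = 0; i < NUM_STEPS; i++)
    {
        x = (i + 0.5) * step;
        sum = sum + 4.0 / (1.0 + x * x);
    }
    pi += step * sum; /* OpenMP sums the per-thread contributions */
}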
Let's see. I am not quite sure what happens, because I haven't got deterministic behaviour on the finished application, but I have something that resembles π. I removed the #pragma omp parallel shared(chunk) and changed the #pragma omp for schedule(dynamic,chunk) to #pragma omp parallel for schedule(dynamic) reduction(+:sum).
#pragma omp parallel for schedule(dynamic) reduction(+:sum)
This requires some explanation: I removed the schedule's chunk just to make it all simpler (for me). The part you are interested in is reduction(+:sum), which is a normal reduce operation with the operator + on the variable sum.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define NUM_STEPS 100

int main(int argc, char *argv[])
{
    double step, pi, sum = 0.0;
    int i;

    step = 1.0 / (double)NUM_STEPS;

    #pragma omp parallel for schedule(dynamic) reduction(+:sum)
    for (i = 0; i < NUM_STEPS; i++)
    {
        double x = (i + 0.5) * step; // declared in the loop, so private to each iteration
        sum += 4.0 / (1.0 + x*x);
        printf("Thread %d: i = %i sum = %f \n", omp_get_thread_num(), i, sum);
    }

    pi = step * sum;
    printf("pi=%lf\n", pi);
}
Is there a simple library to benchmark the time it takes to execute a portion of C code? What I want is something like:
int main() {
    benchmarkBegin(0);
    // Do work
    double elapsedMS = benchmarkEnd(0);

    benchmarkBegin(1);
    // Do some more work
    double elapsedMS2 = benchmarkEnd(1);

    double speedup = benchmarkSpeedup(elapsedMS, elapsedMS2); // Calculates relative speedup
}
It would also be great if the library let you do many runs, averaging them and calculating the variance in timing!
Use the function clock() defined in time.h:
#include <time.h>

float startTime = (float)clock() / CLOCKS_PER_SEC;
/* Do work */
float endTime = (float)clock() / CLOCKS_PER_SEC;
float timeElapsed = endTime - startTime;
Basically, all you want is a high-resolution timer. The elapsed time is of course just a difference in times, and the speedup is calculated by dividing the times for each task. I have included the code for a high-resolution timer that should work on at least Windows and Unix.
#ifdef WIN32

#include <windows.h>

double get_time()
{
    LARGE_INTEGER t, f;
    QueryPerformanceCounter(&t);
    QueryPerformanceFrequency(&f);
    return (double)t.QuadPart / (double)f.QuadPart;
}

#else

#include <sys/time.h>
#include <sys/resource.h>

double get_time()
{
    struct timeval t;
    struct timezone tzp;
    gettimeofday(&t, &tzp);
    return t.tv_sec + t.tv_usec * 1e-6;
}

#endif
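Usage is then just a matter of differencing two samples, for example:

double t0 = get_time();
/* Do work */
double elapsed = get_time() - t0; /* elapsed wall-clock seconds */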
Benchmark C code easily
#include <stdio.h>
#include <time.h>

int main(void) {
    clock_t start_time = clock();

    // code or function to benchmark

    double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    printf("Done in %f seconds\n", elapsed_time);
}
Easy benchmark of multi-threaded C code
If you want to benchmark a multithreaded program, you first need to take a closer look at clock():
Description

The clock() function returns an approximation of processor time used by the program.

Return value

The value returned is the CPU time used so far as a clock_t; to get the number of seconds used, divide by CLOCKS_PER_SEC. If the processor time used is not available or its value cannot be represented, the function returns the value (clock_t)(-1).
Hence it is very important to divide your elapsed_time by the number of threads in order to get the execution time of your function:
#include <time.h>
#include <omp.h>

#define THREADS_NB omp_get_max_threads()

clock_t start_time = clock();
/* code or function to benchmark, e.g. a loop parallelized with
   #pragma omp parallel for private(i) num_threads(THREADS_NB) */
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time / THREADS_NB); // divide by THREADS_NB!
Example
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>

#define N 20000
#define THREADS_NB omp_get_max_threads()

void init_arrays(double *a, double *b) {
    memset(a, 0, N * sizeof(*a));
    memset(b, 0, N * sizeof(*b));
    for (int i = 0; i < N; i++) {
        a[i] += 1.0;
        b[i] += 1.0;
    }
}

double func2(double i, double j) {
    double res = 0.0;

    while (i / j > 0.0) {
        res += i / j;
        i -= 0.1;
        j -= 0.000003;
    }
    return res;
}

double single_thread(double *a, double *b) {
    double res = 0;
    int i, j;

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (i == j) continue;
            res += func2(a[i], b[j]);
        }
    }
    return res;
}

double multi_threads(double *a, double *b) {
    double res = 0;
    int i, j;

    #pragma omp parallel for private(j) num_threads(THREADS_NB) reduction(+:res)
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (i == j) continue;
            res += func2(a[i], b[j]);
        }
    }
    return res;
}

int main(void) {
    double *a, *b;

    a = (double *)calloc(N, sizeof(double));
    b = (double *)calloc(N, sizeof(double));
    init_arrays(a, b);

    clock_t start_time = clock();
    double res = single_thread(a, b);
    double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    printf("Default: Done with %f in %f sd\n", res, elapsed_time);

    start_time = clock();
    res = multi_threads(a, b);
    elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    printf("With OMP: Done with %f in %f sd\n", res, elapsed_time / THREADS_NB);
}
Compile with:
gcc -O3 multithread_benchmark.c -fopenmp && time ./a.out
Output:
Default: Done with 2199909813.614555 in 4.909633 sd
With OMP: Done with 2199909799.377532 in 1.708831 sd
real 0m6.703s (from the time command)
In POSIX, try getrusage. The relevant argument is RUSAGE_SELF and the relevant fields are ru_utime.tv_sec and ru_utime.tv_usec.
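For example, a small wrapper along these lines; the helper name user_seconds is made up here, and the struct fields are as POSIX defines them:

#include <sys/time.h>
#include <sys/resource.h>

/* User CPU time consumed by this process so far, in seconds. */
double user_seconds(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_utime.tv_sec + ru.ru_utime.tv_usec * 1e-6;
}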
There may be existing utilities that help with this, but I suspect most will use some kind of sampling or possibly injection. But to get specific sections of code timed, you will probably have to add in calls to a timer like you show in your example. If you are using Windows, then the high performance timer works. I answered a similar question and showed example code that will do that. There are similar methods for Linux.