I am attempting to make a program that will find the value of pi using MPI.
Currently I can find the sum this way:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#define NUMSTEPS 1000000
int main() {
    int i;
    double x, pi, sum = 0.0;
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    double step = 1.0 / (double)NUMSTEPS;
    x = 0.5 * step; // midpoint of the first interval
    for (i = 0; i < NUMSTEPS; i++) {
        sum += 4.0 / (1.0 + x * x);
        x += step;
    }
    pi = step * sum;
    clock_gettime(CLOCK_MONOTONIC, &end);
    uint64_t diff = 1000000000L * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec;
    printf("PI is %.20f\n", pi);
    printf("elapsed time = %llu nanoseconds\n", (long long unsigned int) diff);
    return 0;
}
But this does not use MPI.
So I have tried to make my own in MPI. My logic is:
Split the 1000000 into equal parts based on how many processors I have
Calculate the values for each range
Send the calculated value back to the master and then divide by the number of processors. I would like to keep the master process free and not have it do any of the work, similar to a master-slave system.
Here's what I have currently. This doesn't seem to be working, and the send/receive calls give errors about incompatible variable types.
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#define NUMSTEPS 1000000

int main(int argc, char** argv) {
    int comm_sz; // number of processes
    int my_rank; // my process rank

    // Initialize the MPI environment
    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    // Get the name of the processor
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);

    // Slaves
    if (my_rank != 0) {
        // Process math then send
        int i;
        double x, pi, sum = 0.0;
        double step = 1.0 / (double)NUMSTEPS;
        x = 0.5 * step;

        // Find the start and end for the number
        int processors = comm_sz - 1;
        int thread_multi = NUMSTEPS / processors;
        int start = my_rank * thread_multi;
        if ((my_rank - 1) != 0) {
            start += 1;
        }
        int end = start + thread_multi;

        for (i = start; i <= end; i++) {
            x += step;
            sum += 4.0 / (1.0 + x * x);
        }
        pi = step * sum;

        MPI_Send(pi, 1.0, MPI_DOUBLE 1, 0, MPI_COMM_WORLD);

    // Master
    } else {
        // Things in here only get called once.
        double pi = 0.0;
        double total = 0.0;
        for (int q = 1; q < comm_sz; q++) {
            MPI_Recv(pi, 1, MPI_DOUBLE, q, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            total += pi;
            pi = 0.0;
        }
        // Take the added totals and divide by amount of processors that processed, to get the average
        double finished = total / (comm_sz - 1);
        // Print sum here
        printf("Pi Is: %d", finished);
    }

    // Finalize the MPI environment.
    MPI_Finalize();
}
I've currently spent around 3 hours working on this. Never used MPI. Any help would be greatly appreciated.
Try compiling with more compiler warnings enabled and fix them; for instance, -Wall -Wextra should give you excellent clues about what the issues are.
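For example (assuming the source file is called pi_mpi.c):
mpicc -Wall -Wextra pi_mpi.c -o pi_mpi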
According to the MPI_Send documentation, the first argument is a pointer, so you seem to be ignoring an automatic "conversion to pointer" error. You have the same issue in the MPI_Recv() call.
You can try passing pi as &pi in MPI_Recv and MPI_Send and check whether that fixes the error.
As a side note, you can declare dummy variables such as pi as local variables inside the master loop to avoid side effects:
for (int q = 1; q < comm_sz; q++) {
    double pi = 0;
    MPI_Recv(&pi, 1, MPI_DOUBLE, q, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    total += pi;
}
I'm new to MPI and I'm trying to develop a non-blocking program (with Isend and Irecv). The functionality is very basic (it's educational):
There is one process (rank 0) that is the master and receives messages from the slaves (ranks 1 to P). The master only receives results.
The slaves generate an array of N random numbers between 0 and R and then do some operations with those numbers (again, just for educational purposes; the operations don't make any sense).
This whole process (operations + sending data) is done M times (this is just for comparing different implementations: blocking and non-blocking).
I get a segmentation fault in the master process when calling the MPI_Waitall() function.
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include <math.h>
#include <time.h>
#define M 1000 // Number of times
#define N 2000 // Quantity of random numbers
#define R 1000 // Max value of random numbers

double SumaDeRaices(double*);

int main(int argc, char* argv[]) {
    int yo;   /* rank of process */
    int p;    /* number of processes */
    int dest; /* rank of receiver */

    /* Start up MPI */
    MPI_Init(&argc, &argv);
    /* Find out process rank */
    MPI_Comm_rank(MPI_COMM_WORLD, &yo);
    /* Find out number of processes */
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    MPI_Request reqs[p-1];
    MPI_Status stats[p-1];

    if (yo == 0) {
        int i, j;
        double result;
        clock_t inicio, fin;
        inicio = clock();
        for (i = 0; i < M; i++) {          // M times
            for (j = 1; j < p; j++) {      // for every slave
                MPI_Irecv(&result, sizeof(double), MPI_DOUBLE, j, i, MPI_COMM_WORLD, &reqs[j-1]);
            }
            MPI_Waitall(p-1, reqs, stats); // wait for all slaves (SEG_FAULT)
        }
        fin = clock() - inicio;
        printf("Tiempo total de ejecucion %f segundos \n", ((double)fin)/CLOCKS_PER_SEC);
    }
    else {
        double* numAleatorios = (double*) malloc(sizeof(double) * N); // array with numbers
        int i, j;
        double resultado;
        dest = 0;
        for (i = 0; i < M; i++) {          // again, M times
            for (j = 0; j < N; j++) {
                numAleatorios[j] = rand() % R;
            }
            resultado = SumaDeRaices(numAleatorios);
            MPI_Isend(&resultado, sizeof(double), MPI_DOUBLE, dest, i, MPI_COMM_WORLD, &reqs[p-1]); // send result to master
        }
    }

    /* Shut down MPI */
    MPI_Finalize();
    exit(0);
} /* main */

double SumaDeRaices(double* valores) {
    int i;
    double sumaTotal = 0.0;
    // Square roots of the values and their sum
    for (i = 0; i < N; i++) {
        sumaTotal = sqrt(valores[i]) + sumaTotal;
    }
    return sumaTotal;
}
There are several issues with your code. First and foremost, in your Isend you pass &resultado several times without waiting until the previous non-blocking operation finishes. You are not allowed to reuse the buffer you pass to Isend before you make sure the operation has finished.
Instead, I recommend using a normal Send, because in contrast to a synchronous send (SSend), a normal blocking send returns as soon as the buffer can be reused.
Second, there is no need to use message tags. I recommend just setting the tag to 0; in terms of performance it is simply faster.
Third, the result shouldn't be a simple variable but an array of size at least (p-1).
Fourth, I do not recommend allocating arrays such as MPI_Request and MPI_Status on the stack if the size is not a known small number. In this case the array size is (p-1), so you are better off using malloc for these data structures.
Fifth, if you do not check the statuses, use MPI_STATUSES_IGNORE.
Also, instead of sizeof(double) you should specify the number of items (1).
But of course the best option is simply to use MPI_Gather.
Moreover, there is generally no reason not to run the computation on the root node as well.
Here is slightly rewritten example:
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
#include <math.h>
#include <time.h>
#define M 1000 // Number of times
#define N 2000 // Quantity of random numbers
#define R 1000 // Max value of random numbers

double SumaDeRaices(double* valores)
{
    int i;
    double sumaTotal = 0.0;
    // Square roots of the values and their sum
    for (i = 0; i < N; i++) {
        sumaTotal = sqrt(valores[i]) + sumaTotal;
    }
    return sumaTotal;
}

int main(int argc, char* argv[]) {
    int yo; /* rank of process */
    int p;  /* number of processes */

    /* Start up MPI */
    MPI_Init(&argc, &argv);
    /* Find out process rank */
    MPI_Comm_rank(MPI_COMM_WORLD, &yo);
    /* Find out number of processes */
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    double *result;
    clock_t inicio, fin;
    double *numAleatorios;

    if (yo == 0) {
        inicio = clock();
    }

    numAleatorios = (double*) malloc(sizeof(double) * N); // array with numbers
    result = (double*) malloc(sizeof(double) * p);

    for (int i = 0; i < M; i++) { // M times
        for (int j = 0; j < N; j++) {
            numAleatorios[j] = rand() % R;
        }
        double local_result = SumaDeRaices(numAleatorios);
        MPI_Gather(&local_result, 1, MPI_DOUBLE, result, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); // send result to root
    }

    if (yo == 0) {
        fin = clock() - inicio;
        printf("Tiempo total de ejecucion %f segundos \n", ((double)fin)/CLOCKS_PER_SEC);
    }

    free(numAleatorios);
    free(result);

    /* Shut down MPI */
    MPI_Finalize();
} /* main */
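To build and run it, something along these lines should work (the file name and process count are placeholders):
mpicc gather_example.c -o gather_example -lm
mpirun -np 4 ./gather_example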
I am trying to make the 2D MPI FFTW example from http://www.fftw.org/doc/2d-MPI-example.html#g_t2d-MPI-example work.
The example code is not complete, so I had to write some extra lines to make it compile and test it (the code is below).
For some reason, the output (in-place FFT) is just zeros as far as I can tell.
The output after the transform is simply 0 0 (all zeros).
Am I using the MPI FFTW in the wrong way? The example code is simple enough.
// compile with: mpicc simple_mpi_example.c -Wl,-rpath=/usr/local/lib -lfftw3_mpi -lfftw3 -o simple_mpi_example
#include <fftw3-mpi.h>
#include <stdio.h>

int main(int argc, char **argv){
    const ptrdiff_t N0 = 1000, N1 = 1000;
    fftw_plan plan;
    fftw_complex *data; // local data of course
    ptrdiff_t alloc_local, local_n0, local_0_start, i, j;

    MPI_Init(&argc, &argv);
    fftw_mpi_init();

    /* get local data size and allocate */
    alloc_local = fftw_mpi_local_size_2d(N0, N1, MPI_COMM_WORLD,
                                         &local_n0, &local_0_start);
    data = (fftw_complex *) fftw_malloc(sizeof(fftw_complex) * alloc_local);

    MPI_Barrier(MPI_COMM_WORLD);
    printf("%ld %ld\n", (long)local_0_start, (long)local_n0);
    MPI_Barrier(MPI_COMM_WORLD);

    /* create plan for forward DFT */
    plan = fftw_mpi_plan_dft_2d(N0, N1, data, data, MPI_COMM_WORLD,
                                FFTW_FORWARD, FFTW_ESTIMATE);

    /* initialize data to some function my_function(x,y) */
    for (i = 0; i < local_n0; ++i)
        for (j = 0; j < N1; ++j){
            data[i*N1 + j][0] = local_0_start + i;
            data[i*N1 + j][1] = i;
        }

    MPI_Barrier(MPI_COMM_WORLD);
    printf("%f %f\n", data[10*N1 + 10][0], data[10*N1 + 10][1]);
    MPI_Barrier(MPI_COMM_WORLD);

    /* compute transforms, in-place, as many times as desired */
    fftw_execute(plan);

    printf("%f %f\n", data[10*N1 + 10][0], data[10*N1 + 10][1]);

    fftw_destroy_plan(plan);
    fftw_free(data);
    MPI_Finalize();
    printf("finalize\n");
    return 0;
}
Ok, I'm pretty new to CUDA, and I'm kind of lost, really lost.
I'm trying to calculate pi using the Monte Carlo method, and at the end I just get one add instead of 50.
I don't want to call the kernel in a "do while" loop, since it's too slow. My issue is that my code doesn't loop; it executes only once in the kernel.
Also, I'd like all the threads to access the same niter and pi, so that when some thread hits the counters all the others would stop.
#define SEED 35791246

__shared__ int niter;
__shared__ double pi;

__global__ void calcularPi(){
    double x;
    double y;
    int count;
    double z;

    count = 0;
    niter = 0;

    // keep looping
    do {
        niter = niter + 1;
        // Generate random number
        curandState state;
        curand_init(SEED, (int)niter, 0, &state);
        x = curand(&state);
        y = curand(&state);
        z = x*x + y*y;
        if (z <= 1) count++;
        pi = (double)count / niter * 4;
    } while (niter < 50);
}

int main(void){
    float tempoTotal;
    // Start timer
    clock_t t;
    t = clock();

    // call kernel
    calcularPi<<<1,32>>>();
    // wait while kernel finishes
    cudaDeviceSynchronize();

    typeof(pi) piFinal;
    cudaMemcpyFromSymbol(&piFinal, "pi", sizeof(piFinal), 0, cudaMemcpyDeviceToHost);
    typeof(niter) niterFinal;
    cudaMemcpyFromSymbol(&niterFinal, "niter", sizeof(niterFinal), 0, cudaMemcpyDeviceToHost);

    // End timer
    t = clock() - t;
    tempoTotal = ((double)t)/CLOCKS_PER_SEC;

    printf("Pi: %g \n", piFinal);
    printf("Adds: %d \n", niterFinal);
    printf("Total time: %f \n", tempoTotal);
}
There are a variety of issues with your code.
I suggest using proper cuda error checking and running your code with cuda-memcheck to spot any runtime errors. I've omitted proper error checking in my code below for brevity of presentation, but I've run it with cuda-memcheck to confirm there are no runtime errors.
Your usage of curand() is probably not correct (it returns integers over a large range). For this code to work correctly, you want a floating-point quantity between 0 and 1. The correct call for that is curand_uniform().
Since you want all threads to work on the same values, you must prevent those threads from stepping on each other. One way to do that is to use atomic updates of the variables in question.
It should not be necessary to re-run curand_init on each iteration. Once per thread should be sufficient.
We don't use cudaMemcpy..Symbol operations on __shared__ variables. For convenience, and to preserve something that resembles your original code, I've elected to convert those to __device__ variables.
Here's a modified version of your code that has most of the above issues fixed:
$ cat t978.cu
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <time.h>
#define ITER_MAX 5000
#define SEED 35791246

__device__ int niter;
__device__ int count;

__global__ void calcularPi(){
    double x;
    double y;
    double z;
    int lcount;
    curandState state;
    curand_init(SEED, threadIdx.x, 0, &state);
    // keep looping
    do {
        lcount = atomicAdd(&niter, 1);
        // Generate random numbers in (0, 1]
        x = curand_uniform(&state);
        y = curand_uniform(&state);
        z = x*x + y*y;
        if (z <= 1) atomicAdd(&count, 1);
    } while (lcount < ITER_MAX);
}

int main(void){
    float tempoTotal;
    // Start timer
    clock_t t;
    t = clock();

    int count_final = 0;
    int niter_final = 0;
    cudaMemcpyToSymbol(niter, &niter_final, sizeof(int));
    cudaMemcpyToSymbol(count, &count_final, sizeof(int));

    // call kernel
    calcularPi<<<1,32>>>();
    // wait while kernel finishes
    cudaDeviceSynchronize();

    cudaMemcpyFromSymbol(&count_final, count, sizeof(int));
    cudaMemcpyFromSymbol(&niter_final, niter, sizeof(int));

    // End timer
    double pi = count_final/(double)niter_final*4;
    t = clock() - t;
    tempoTotal = ((double)t)/CLOCKS_PER_SEC;

    printf("Pi: %g \n", pi);
    printf("Adds: %d \n", niter_final);
    printf("Total time: %f \n", tempoTotal);
}
$ nvcc -o t978 t978.cu -lcurand
$ cuda-memcheck ./t978
========= CUDA-MEMCHECK
Pi: 3.12083
Adds: 5032
Total time: 0.558463
========= ERROR SUMMARY: 0 errors
$
I've modified the iteration count to a larger number, but you can use 50 for ITER_MAX if you want.
Note that there are many criticisms that could be levelled against this code. My aim here, since it's clearly a learning exercise, is to point out what the minimum number of changes could be to get a functional code, using the algorithm you've outlined. As just one example, you might want to change your kernel launch config (<<<1,32>>>) to other, larger numbers, in order to more fully utilize the GPU.
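For example, a larger (hypothetical) launch configuration such as the one below still produces correct counts, because the atomicAdd updates keep niter and count consistent across all threads; you would likely also want to raise ITER_MAX so every thread has work to do:
// hypothetical larger launch: 64 blocks of 256 threads
calcularPi<<<64, 256>>>();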
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>

int main(int argc, char **argv)
{
    unsigned long long in = 1;
    unsigned long long total = 2;
    double tol, change, new, secs, old = 0.0;
    struct timeval start, end;
    int threads; /* ignored */

    if (argc < 3) {
        exit(-1);
    }
    threads = atoi(argv[1]);
    tol = atof(argv[2]);
    if ((threads < 1) || (tol < 0.0)) {
        exit(-1);
    }
    tol = tol * tol;

    srand48(clock());
    gettimeofday(&start, NULL);
    do
    {
        double x, y;
        x = drand48();
        y = drand48();
        total++;
        if ((x*x + y*y) <= 1.00)
            in++;
        new = 4.0 * (double)in / (double)total;
        change = fabs(new - old);
        old = new;
    } while (change > tol);
    gettimeofday(&end, NULL);

    secs = ((double)end.tv_sec - (double)start.tv_sec)
         + ((double)end.tv_usec - (double)start.tv_usec) / 1000000.0;
    printf("Found estimate of pi of %.12f in %llu iterations, %.6f seconds.\n\n",
           new, total - 2, secs);
}
The code above is a sequential program that takes an argument for the tolerance, i.e., how closely to estimate pi. Once the change between the old and new values falls below the tolerance, it exits.
I have to parallelize this program in pthreads. I am not trying to have someone do it for me, but rather to get some pointers and ideas to think about so that I may be able to do this. The pthreads program will take the number of threads and the tolerance as its arguments and output the estimation. I am very new to parallel programs and don't really know where to start, so I will take any advice. Thanks.
Each thread should keep its own in and total counts and use a better exit condition. Then add up the in and total values from each thread.
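As a sketch of that idea (a minimal example, not a drop-in solution: it uses a fixed number of points per thread instead of the tolerance test, and rand_r rather than drand48, since drand48 keeps shared hidden state that is not thread-safe):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define POINTS_PER_THREAD 1000000

typedef struct {
    unsigned int seed;            // per-thread PRNG state for rand_r
    unsigned long long in, total; // this thread's private counters
} worker_t;

static void *worker(void *arg) {
    worker_t *w = (worker_t *)arg;
    for (long k = 0; k < POINTS_PER_THREAD; k++) {
        double x = (double)rand_r(&w->seed) / RAND_MAX;
        double y = (double)rand_r(&w->seed) / RAND_MAX;
        w->total++;
        if (x * x + y * y <= 1.0)
            w->in++;
    }
    return NULL;
}

int main(int argc, char **argv) {
    int nthreads = (argc > 1) ? atoi(argv[1]) : 4;
    pthread_t tids[nthreads];
    worker_t w[nthreads];

    for (int i = 0; i < nthreads; i++) {
        w[i].seed = 12345u + i; // distinct seed per thread
        w[i].in = w[i].total = 0;
        pthread_create(&tids[i], NULL, worker, &w[i]);
    }
    unsigned long long in = 0, total = 0;
    for (int i = 0; i < nthreads; i++) {
        pthread_join(tids[i], NULL); // combine after joining: no locking needed
        in += w[i].in;
        total += w[i].total;
    }
    printf("pi ~= %.12f (%llu points)\n", 4.0 * (double)in / (double)total, total);
    return 0;
}

Compile and link with gcc -pthread. To keep the tolerance-based exit, each thread could instead check its own local change every few thousand points rather than reading every other thread's counters.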
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/time.h>
#include <pthread.h>

void* Function(void* i);

#define MAX_THREADS 200

unsigned long long total[MAX_THREADS] = {0}; // total points for threads
unsigned long long in[MAX_THREADS] = {0};    // points inside for threads
double tolerance, change, new, old = 0.0;
long thread_num;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

int main(int argc, char **argv)
{
    long i;
    struct timeval start, end;
    double secs;
    unsigned long long master_total;
    pthread_t* threads;

    if (argc != 3){
        printf("\nMust pass 2 arguments: (Tolerance) (# of Threads)");
        exit(-1);
    }
    thread_num = atoi(argv[1]);
    tolerance = atof(argv[2]);
    if ((thread_num < 1) || (tolerance < 0.0) || (thread_num > 200)) {
        printf("\nIncorrect tolerance or threads.");
        exit(-1);
    }

    threads = malloc(thread_num * sizeof(pthread_t)); // allocating space for threads
    tolerance = tolerance * tolerance;
    change = 0.5;

    srand48(clock());
    gettimeofday(&start, NULL);
    for (i = 0; i < thread_num; i++){
        pthread_create(&threads[i], NULL, Function, (void*)i);
    }
    for (i = 0; i < thread_num; i++){
        pthread_join(threads[i], NULL);
    }
    gettimeofday(&end, NULL);

    master_total = 0;
    for (i = 0; i < thread_num; i++){
        master_total = master_total + total[i];
    }
    secs = ((double)end.tv_sec - (double)start.tv_sec)
         + ((double)end.tv_usec - (double)start.tv_usec) / 1000000.0;
    printf("Estimation of pi is %.12f in %llu iterations, %.6f seconds.\n", new, master_total, secs);
}

// Each thread will calculate its own points for in and total.
// Per 1000 points it will calculate new and old values and compare them to the tolerance.
// If the desired accuracy is met, the threads will return.
void* Function(void* i){
    /*
       rc - return code
       total[i], in[i] - thread's own number of calculated points
       my_total, my_in - each thread calculates the global in and total per 1000 points and checks the tolerance
    */
    long my_spot = (long) i;
    long rc;
    long j;
    unsigned long long my_total;
    unsigned long long my_in;
    do
    {
        double x, y;
        x = drand48();
        y = drand48();
        total[my_spot]++;
        if ((x*x + y*y) <= 1.00){
            in[my_spot]++;
        }
        if (total[my_spot] % 1000 == 0){
            while (j < thread_num){
                my_total = my_total + total[j];
                my_in = my_in + in[j];
            }
            my_total = my_total;
            // critical section
            // old, new, and change are all global
            rc = pthread_mutex_lock(&mutex);
            new = 4.0 * (double)my_in / (double)my_total;
            change = fabs(new - old);
            old = new;
            rc = pthread_mutex_unlock(&mutex);
        }
    } while (change > tolerance);
    return NULL;
}
This is what I whipped up, but I am getting errors: it just stops. I have the threads break out of a loop and return, and then the main thread joins them. Any advice on what I am doing wrong here?
When I run it, it seems that all the threads get locked out when they reach the mutex lock. I have each thread check the change in pi every 1000 points.
Is there a simple library to benchmark the time it takes to execute a portion of C code? What I want is something like:
int main(){
    benchmarkBegin(0);
    //Do work
    double elapsedMS = benchmarkEnd(0);

    benchmarkBegin(1);
    //Do some more work
    double elapsedMS2 = benchmarkEnd(1);

    double speedup = benchmarkSpeedup(elapsedMS, elapsedMS2); //Calculates relative speedup
}
It would also be great if the library let you do many runs, averaging them and calculating the variance in timing!
Use the function clock() defined in time.h:
double startTime, endTime, timeElapsed;

startTime = (double)clock()/CLOCKS_PER_SEC;
/* Do work */
endTime = (double)clock()/CLOCKS_PER_SEC;

timeElapsed = endTime - startTime;
Basically, all you want is a high-resolution timer. The elapsed time is of course just a difference in times, and the speedup is calculated by dividing the times for each task. I have included code for a high-resolution timer that should work on at least Windows and Unix.
#ifdef WIN32

#include <windows.h>
double get_time()
{
    LARGE_INTEGER t, f;
    QueryPerformanceCounter(&t);
    QueryPerformanceFrequency(&f);
    return (double)t.QuadPart/(double)f.QuadPart;
}

#else

#include <sys/time.h>
#include <sys/resource.h>
double get_time()
{
    struct timeval t;
    struct timezone tzp;
    gettimeofday(&t, &tzp);
    return t.tv_sec + t.tv_usec*1e-6;
}

#endif
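With get_time() in hand, the pattern from the question becomes (a sketch; the two work sections are placeholders):

double t0 = get_time();
// Do work
double elapsedMS = (get_time() - t0) * 1000.0;

t0 = get_time();
// Do some more work
double elapsedMS2 = (get_time() - t0) * 1000.0;

double speedup = elapsedMS / elapsedMS2; // relative speedup of the two sections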
Benchmark C code easily
#include <stdio.h>
#include <time.h>

int main(void) {
    clock_t start_time = clock();
    // code or function to benchmark
    double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    printf("Done in %f seconds\n", elapsed_time);
}
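For the repeated runs, averaging, and variance the question asks about, a small sketch along the same lines (work() is a hypothetical placeholder for the code being benchmarked):

#include <stdio.h>
#include <time.h>

#define RUNS 10

// placeholder workload (hypothetical)
void work(void) {
    for (volatile long i = 0; i < 10000000L; i++);
}

int main(void) {
    double times[RUNS], mean = 0.0, var = 0.0;
    for (int r = 0; r < RUNS; r++) {
        clock_t t0 = clock();
        work();
        times[r] = (double)(clock() - t0) / CLOCKS_PER_SEC;
        mean += times[r];
    }
    mean /= RUNS;
    for (int r = 0; r < RUNS; r++)
        var += (times[r] - mean) * (times[r] - mean);
    var /= RUNS;
    printf("mean %f s, variance %f s^2 over %d runs\n", mean, var, RUNS);
}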
Easy benchmark of multi-threaded C code
If you want to benchmark a multithreaded program, you first need to take a closer look at clock():
Description
The clock() function returns an approximation of processor time
used by the program.
Return value
The value returned is the CPU time used so far as a clock_t; to
get the number of seconds used, divide by CLOCKS_PER_SEC. If the
processor time used is not available or its value cannot be
represented, the function returns the value (clock_t)(-1)
Hence it is very important to divide your elapsed_time by the number of threads in order to get the execution time of your function:
#include <time.h>
#include <omp.h>

#define THREADS_NB omp_get_max_threads()

clock_t start_time = clock();
// code or function to benchmark, with its loop parallelized by, e.g.:
// #pragma omp parallel for private(i) num_threads(THREADS_NB)
double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
printf("Done in %f seconds\n", elapsed_time / THREADS_NB); // divide by THREADS_NB!
Example
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>

#define N 20000
#define THREADS_NB omp_get_max_threads()

void init_arrays(double *a, double *b) {
    // note: sizeof(a) would only be the pointer size, so zero all N elements explicitly
    memset(a, 0, N * sizeof(double));
    memset(b, 0, N * sizeof(double));
    for (int i = 0; i < N; i++) {
        a[i] += 1.0;
        b[i] += 1.0;
    }
}

double func2(double i, double j) {
    double res = 0.0;
    while (i / j > 0.0) {
        res += i / j;
        i -= 0.1;
        j -= 0.000003;
    }
    return res;
}

double single_thread(double *a, double *b) {
    double res = 0;
    int i, j;
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (i == j) continue;
            res += func2(a[i], b[j]);
        }
    }
    return res;
}

double multi_threads(double *a, double *b) {
    double res = 0;
    int i, j;
    #pragma omp parallel for private(j) num_threads(THREADS_NB) reduction(+:res)
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            if (i == j) continue;
            res += func2(a[i], b[j]);
        }
    }
    return res;
}

int main(void) {
    double *a, *b;
    a = (double *)calloc(N, sizeof(double));
    b = (double *)calloc(N, sizeof(double));
    init_arrays(a, b);

    clock_t start_time = clock();
    double res = single_thread(a, b);
    double elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    printf("Default: Done with %f in %f sd\n", res, elapsed_time);

    start_time = clock();
    res = multi_threads(a, b);
    elapsed_time = (double)(clock() - start_time) / CLOCKS_PER_SEC;
    printf("With OMP: Done with %f in %f sd\n", res, elapsed_time / THREADS_NB);
}
Compile with:
gcc -O3 multithread_benchmark.c -fopenmp && time ./a.out
Output:
Default: Done with 2199909813.614555 in 4.909633 sd
With OMP: Done with 2199909799.377532 in 1.708831 sd
real 0m6.703s (from time function)
In POSIX, try getrusage. The relevant argument is RUSAGE_SELF and the relevant fields are ru_utime.tv_sec and ru_utime.tv_usec.
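A minimal sketch of that (POSIX only; the loop is just placeholder work to measure):

#include <stdio.h>
#include <sys/resource.h>

int main(void) {
    volatile double s = 0.0;
    for (long i = 1; i < 100000000L; i++) s += 1.0 / i; // placeholder work

    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) == 0) {
        double user_secs = usage.ru_utime.tv_sec + usage.ru_utime.tv_usec * 1e-6;
        printf("user CPU time: %.6f s\n", user_secs);
    }
    return 0;
}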
There may be existing utilities that help with this, but I suspect most will use some kind of sampling or possibly injection. But to get specific sections of code timed, you will probably have to add in calls to a timer like you show in your example. If you are using Windows, then the high performance timer works. I answered a similar question and showed example code that will do that. There are similar methods for Linux.