How to parallelize the below for loop - c

I am trying to integrate a function of curve, and convert the serial code to parallel program, I am using openMP for the same.
I have parallelized the for loop using OpenMP's parallel for and the program now runs faster, but the result is not the expected one: something gets messed up between the threads. I want to know how to parallelize the for loop correctly for N threads.
#include <stdio.h>
#include <omp.h>
#include <math.h>
double f(double x){
return sin(x)+0.5*x;
}
/*
 * Integrates f over [a, b] with the trapezoidal rule using n sub-intervals,
 * summing the interior points in an OpenMP parallel loop, and prints the
 * elapsed wall-clock time and the resulting integral.
 */
int main(){
int n=134217728,i;
double a=0,b=9,h,x,sum=0,integral;
double start = omp_get_wtime();
/* Width of one sub-interval. */
h=fabs(b-a)/n;
/* Disable dynamic adjustment and request 64 threads. */
omp_set_dynamic(0);
omp_set_num_threads(64);
/*
 * NOTE(review): x is listed as shared, yet every iteration first writes it
 * (x=a+i*h) and then reads it (f(x)). Nothing here stops another thread from
 * overwriting x between those two statements, which would explain a result
 * that changes from run to run. sum, by contrast, is a reduction variable.
 */
#pragma omp parallel for reduction (+:sum) shared(x)
for(i=1;i<n;i++){
x=a+i*h;
sum=sum+f(x);
}
/* Trapezoid formula: the end points count once, interior points twice. */
integral=(h/2)*(f(a)+f(b)+2*sum);
double end = omp_get_wtime();
double time = end - start;
printf("Execution time: %2.3f seconds\n",time);
printf("\nThe integral is: %lf\n",integral);
}
The expected output is 22.161130, but the result varies each time the program is run.

The loop you are trying to parallelise modifies the same variables x and sum in each iteration, this is very cumbersome to parallelize.
You could rewrite the code to make the path to parallelisation more obvious:
#include <stdio.h>
#include <omp.h>
#include <math.h>
double f(double x) {
return sin(x) + 0.5 * x;
}
/*
 * Integrates f over [a, b] with the composite trapezoidal rule using n
 * sub-intervals.
 *
 * The interior sample points 1 .. n-1 are dealt out round-robin to 64
 * independent partial sums, one per iteration of the parallel outer loop.
 * The partial sums are added together serially afterwards.
 *
 * Prints the elapsed wall-clock time and the value of the integral.
 * Returns 0.
 */
int main() {
    enum { SLOTS = 64 };
    const int n = 1 << 27;
    const double a = 0, b = 9;
    double sums[SLOTS] = { 0 };
    double start = omp_get_wtime();
    const double h = fabs(b - a) / n;

    omp_set_dynamic(0);
    omp_set_num_threads(SLOTS);

    #pragma omp parallel for
    for (int j = 0; j < SLOTS; j++) {
        /* Accumulate in a local so threads do not write to neighbouring
         * array elements on every iteration. */
        double partial = 0;

        /* The index is declared inside the parallel loop so that every
         * thread owns its own copy. Slot 0 starts at SLOTS instead of 0
         * because point 0 is the end point a, which the trapezoid formula
         * below adds separately. */
        for (int i = (j == 0) ? SLOTS : j; i < n; i += SLOTS) {
            partial += f(a + i * h);
        }
        sums[j] = partial;
    }

    double sum = 0;
    for (int j = 0; j < SLOTS; j++) {
        sum += sums[j];
    }

    /* End points count once, interior points twice. */
    double integral = (h / 2) * (f(a) + f(b) + 2 * sum);
    double end = omp_get_wtime();
    double time = end - start;
    printf("Execution time: %2.3f seconds\n", time);
    printf("\nThe integral is: %lf\n", integral);
    return 0;
}

Related

Parallelize nested loop in C with openMP

How can I parallelize the nested loop with statements using an OpenMP in C. I would be thankful to you.
#include <stdio.h>
#include <string.h>
#include <omp.h>
/*
 * Computes t = sum over i = 1..n of ( sum over j = 1..n of i / j )
 * for n = 10000 and prints it with printf("%f").
 *
 * The command-line arguments are not used. Returns 0.
 */
int main(int argc, char **argv) {
    int n = 10000;
    int i;
    int j;
    double t = 0;
    double k, f;

    for (i = 0; i < n; i++) {
        k = (double) (i + 1);
        /* f holds the inner sum for the current i and restarts each time. */
        f = 0;
        for (j = 0; j < n; j++) {
            f += k / (double) (j + 1);
        }
        t += f;
    }
    printf("%f\n", t);
    return 0;
}
You should use reduction openmp clause. Here is a short example:
/*
 * Parallel version of the inner summation loop. reduction(+:f) gives each
 * thread its own private copy of f and adds the per-thread values into the
 * original f when the loop finishes.
 */
#pragma omp parallel for reduction(+:f)
for (j=0;j<n;j++)
f += k / (double) (j+1);

OpenMP Parallelize Pi program

I have been trying to parallelize the following code using OpenMP, with no success.
I have searched in the internet several examples, yet none of them give me the same answer after executing the program several times.
#include <stdio.h>
#include <omp.h>
#define NUM_THREADS 2
/* Number of strips and the width of one strip on [0, 1]. */
long num_steps = 100000;
double step = 1.0/100000.0;

/*
 * Approximates pi as the integral of 4 / (1 + x^2) over [0, 1] using the
 * midpoint rule with num_steps strips of width step, then prints it.
 * Returns 0.
 */
int main() {
    double sum = 0.0;

    for (long i = 0; i < num_steps; ++i) {
        /* Midpoint of strip i. With i starting at 0 this must be i + 0.5;
         * i - 0.5 would put the first sample at a negative x and skip the
         * last strip. */
        double x = (i + 0.5) * step;
        sum += 4.0 / (1.0 + x * x);
    }

    double pi = step * sum;
    printf("PI value = %f\n", pi);
    return 0;
}
This is the solution I have so far:
/*
 * Attempted OpenMP version of the pi computation: the loop is split across
 * the threads of a parallel region and every update of the shared sum is
 * wrapped in a critical section. Prints step*sum as the value of pi.
 *
 * NOTE(review): num_steps is not declared in this block; presumably it is
 * the file-scope variable of the same name. Verify.
 */
int main (int argc, char **argv){
//Variables
int i=0, aux=0;
double step = 1.0/100000.0;
double x=0.0,
pi=0.0,
sum = 0.0;
#pragma omp parallel shared(sum,i) private(x)
{
x = 0.0;
/*
 * NOTE(review): sum is shared, so every thread executes this assignment on
 * the same variable, and no barrier separates it from the loop below. A
 * thread that arrives late could reset a sum other threads have already
 * started accumulating into. This looks like a source of run-to-run
 * differences; confirm.
 */
sum = 0.0;
#pragma omp for
for (i=0; i<num_steps; ++i) {
x = (i-0.5)*step;
/* Only one thread at a time may execute the update of sum. */
#pragma omp critical
sum += 4.0/(1.0+x*x);
}
}
/* All threads join master thread and terminate */
pi= step*sum;
printf("PI value = %f\n", pi);
}
Please consider to use the same instruction for your loop as mentioned in the OpenMP official website: loop parallelism, I had to change many lines in your code, hope it will be a start point for you to get more familiar with OpenMP and Loop Parallelism in C language.
#include <stdio.h>
#include <omp.h>
#define NUM_STEPS 10000000

/*
 * Approximates pi as the integral of 4 / (1 + x^2) over [0, 1] using the
 * midpoint rule with NUM_STEPS strips. The loop is shared among the threads
 * of a parallel region and the per-thread sums are combined by an OpenMP
 * reduction.
 *
 * The command-line arguments are not used. Prints pi with 24 decimals and
 * returns 0.
 */
int main (int argc, char **argv){
    //Variables
    long int i, num_steps = NUM_STEPS;
    double x, step, sum, pi;

    sum = 0.0;
    step = 1.0 / (double) num_steps;

    #pragma omp parallel private(i,x)
    {
        #pragma omp for reduction(+:sum)
        for (i = 0; i < num_steps; ++i) {
            /* Midpoint of strip i. */
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
    }
    /* All threads join master thread and terminate */
    pi = step * sum;
    printf("PI value = %.24f\n", pi);
    return 0;
}
The answer was:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
/* Number of strips and the width of one strip on [0, 1]. */
long num_steps = 100000;
double step = 1.0/100000.0;

/*
 * Approximates pi as the integral of 4 / (1 + x^2) over [0, 1] using the
 * midpoint rule. x is private to each thread and sum is combined across
 * threads by an OpenMP reduction. Prints the result and returns 0.
 */
int main() {
    int i;
    double x, pi, sum = 0.0;

    #pragma omp parallel private(x)
    {
        #pragma omp for reduction(+:sum)
        for (i = 0; i < num_steps; ++i) {
            /* Midpoint of strip i. With i starting at 0 this is i + 0.5;
             * i - 0.5 would shift every sample one strip to the left. */
            x = (i + 0.5) * step;
            sum += 4.0 / (1.0 + x * x);
        }
    }
    pi = step * sum;
    printf("PI value = %f\n", pi);
    return 0;
}
Your main problem is that you declare your loop index i as shared. This leads every thread to use the same i in the evaluation. What you actually want to do with OpenMP is to divide the whole range of i in fractions and assign a different fraction to each thread. So, assign your i as private.
Apart from this, you don't need to re-initialize x and sum in the parallel region. After fixing some irrelevant compilation errors, your code should look like this:
#include<stdio.h>
#include <omp.h>
#define NUM_THREADS 2
/*
 * Approximates pi as the integral of 4 / (1 + x^2) over [0, 1] using the
 * midpoint rule. The loop index and x are private to each thread; sum is
 * shared and each update of it is protected by a critical section.
 *
 * The command-line arguments are not used. Prints the result and returns 0.
 */
int main (int argc, char **argv){
    //Variables
    int i = 0;
    double step = 1.0/100000.0;
    long num_steps = 100000;
    double x = 0.0,
           pi = 0.0,
           sum = 0.0;

    #pragma omp parallel shared(sum) private(i,x)
    {
        #pragma omp for
        for (i = 0; i < num_steps; ++i) {
            /* Midpoint of strip i. With i starting at 0 this is i + 0.5;
             * i - 0.5 would shift every sample one strip to the left. */
            x = (i + 0.5) * step;
            /* Only one thread at a time may update the shared sum. */
            #pragma omp critical
            sum += 4.0 / (1.0 + x * x);
        }
    }
    /* All threads join master thread and terminate */
    pi = step * sum;
    printf("PI value = %f\n", pi);
    return 0;
}
Keep in mind that this is far from perfect in terms of performance, since every time you want to update the sum you pause the whole parallel region. A first step to make your code faster is by removing the critical part and declaring the sum as a reduction instead:
/*
 * Parallel region for the pi loop without a critical section: sum is a
 * reduction variable, so each thread accumulates privately and the partial
 * results are added together when the loop ends.
 */
#pragma omp parallel private(i,x)
{
#pragma omp for reduction(+:sum)
for (i=0; i<num_steps; ++i) {
/* Midpoint of strip i (i starts at 0, hence + 0.5). */
x = (i+0.5)*step;
sum += 4.0/(1.0+x*x);
}
}

Multithreaded recursive fibonacci with OpenMP Tasks

I know fibonacci is fundamentally sequential. But I just want to test OpenMP Tasks for the recursive implementation of fibonacci series. The following code in C works fine but my Problem is, instead of getting faster results with more threads, it gets worse. Why? You can try it on your self. I want best scalability.
Compile this code with "gcc -O3 -fopenmp -o fib fib.c" and run it.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
/*
 * Plain recursive Fibonacci.
 *
 * Stores the n-th Fibonacci number in a[n] and returns it. For n >= 2 the
 * value is obtained from two recursive calls, which in turn fill in the
 * entries they visit. The caller must supply an array with an element at
 * index n.
 */
double serialFib(int n, double* a) {
    double result;

    if (n >= 2) {
        const double prev1 = serialFib(n - 1, a);
        const double prev2 = serialFib(n - 2, a);
        result = prev1 + prev2;
    } else {
        /* Base cases: the value equals the index. */
        result = n;
    }

    a[n] = result;
    return result;
}
/*
 * Task-based recursive Fibonacci.
 *
 * Stores the n-th Fibonacci number in a[n] and returns it. Inputs up to 30
 * are handed to serialFib; larger inputs spawn one OpenMP task per
 * recursive call and wait for both before combining the results.
 */
double fib(int n, double* a) {
    if (n <= 30) {
        if (n < 2) {
            a[n] = n;
            return n;
        }
        /* avoid task creation overhead */
        return serialFib(n, a);
    }

    double left, right;

    #pragma omp task shared(left, a) firstprivate(n)
    {
        left = fib(n - 1, a);
    }
    #pragma omp task shared(right, a) firstprivate(n)
    {
        right = fib(n - 2, a);
    }
    /* Both child results must be available before they are read below. */
    #pragma omp taskwait

    a[n] = left + right;
    return left + right;
}
/*
 * Benchmark driver for fib.
 *
 * Usage: fib <nthreads>
 *
 * Sets the OpenMP thread count from the first argument, computes fib(N) from
 * inside a parallel/single region while timing it with omp_get_wtime(), then
 * prints every table entry a[0] .. a[N] followed by the elapsed time.
 *
 * Returns 0 on success and EXIT_FAILURE on a missing or invalid thread count
 * or on allocation failure.
 */
int main(int argc, char *argv[]) {
    // To test scalability of recursive approach
    // we take N = 40. Otherwise it will take too long.
    enum { N = 40 };

    if (argc < 2) {
        fprintf(stderr, "Usage: fib <nthreads>\n");
        return EXIT_FAILURE;
    }

    char *end = NULL;
    const long requested = strtol(argv[1], &end, 10);
    const int nthreads = (int) requested;
    /* Reject non-numeric input, trailing junk, non-positive counts and
     * values that do not fit in an int. */
    if (end == argv[1] || *end != '\0' || requested < 1 ||
        (long) nthreads != requested) {
        fprintf(stderr, "Invalid thread count: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    printf("Starting benchmark...\n");
    omp_set_num_threads(nthreads);

    /* fib(N, a) writes a[N], so the table needs N + 1 entries. */
    double *a = calloc(N + 1, sizeof *a);
    if (a == NULL) {
        fprintf(stderr, "Out of memory\n");
        return EXIT_FAILURE;
    }

    const double t0 = omp_get_wtime();
    #pragma omp parallel
    {
        /* One thread starts the recursion; the tasks it creates can be
         * picked up by the other threads of the team. */
        #pragma omp single
        {
            fib(N, a);
        }
    }
    const double t1 = omp_get_wtime();

    for (int i = 0; i <= N; ++i) {
        printf("a[%d] = %.2f\n", i, a[i]);
    }
    printf("Execution time: %f\n", t1 - t0);

    free(a);
    return 0;
}

A sample openmp program with speedup

Could someone provide an OpenMP program where the speedup is visible compared to running without it? I'm finding it extremely difficult to achieve speedup. Even this simple program runs slower with OpenMP. My processor is an Intel® Core™ i3-2370M CPU @ 2.40GHz × 4, running Linux (Ubuntu 14.10).
#include <cmath>
#include <stdio.h>
#include <time.h>
/*
 * Runs four empty busy loops inside an OpenMP parallel for and reports how
 * many clock() ticks elapsed.
 *
 * clock() reports processor time used by the program, not wall-clock time.
 * Returns 0.
 */
int main() {
    const int size = 4;
    clock_t t = clock();

    #pragma omp parallel for num_threads(4)
    for (int n = 0; n < size; ++n) {
        for (int j = 0; j < 100000000; j++) {
        }
        printf("\n");
    }

    t = clock() - t;
    /* clock_t is not guaranteed to be int, so convert explicitly to match
     * the %ld conversion. */
    printf("It took me %ld clicks (%f seconds).\n", (long)t, ((float)t)/CLOCKS_PER_SEC);
    return 0;
}
I had a problem related to this, where I wanted to find the max value of an array. I made the same mistake as you, I used clock for measuring the elapsed time. To fix this, I used clock_gettime() instead, and now it works.
As for an example code where the speedup is measurable (note that you might want to change the value of N):
#include <omp.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>
/*
 * Returns the time elapsed between start and end.
 *
 * The whole difference is expressed in nanoseconds and stored in the tv_nsec
 * member of the result; tv_sec is always 0. Callers convert to seconds by
 * dividing tv_nsec by 1e9.
 */
struct timespec diff(struct timespec start, struct timespec end)
{
    struct timespec temp;

    /* Set explicitly so the returned struct has no indeterminate member. */
    temp.tv_sec = 0;
    /* When both times fall in the same second the first term is zero, so a
     * single formula covers both cases. */
    temp.tv_nsec = ((end.tv_sec - start.tv_sec) * 1000000000)
                   + end.tv_nsec - start.tv_nsec;
    return temp;
}
/*
 * Benchmark: for array sizes N = 1e6, 2e6, ... (below 1e8), fills an array
 * with rand() values and times a maximum search twice, first with an OpenMP
 * max-reduction and then serially.
 *
 * Each line written to out.txt holds: N, parallel seconds, serial seconds,
 * separated by tabs.
 *
 * Returns 0 on success, -1 if the output file cannot be opened or an array
 * cannot be allocated.
 */
int main()
{
    struct timespec t_start, t_end;

    srand((unsigned int)time(NULL));

    FILE *f = fopen("out.txt", "w");
    if (f == NULL)
    {
        printf("Could not open output\n");
        return -1;
    }

    for (unsigned int N = 1000000; N < 100000000; N += 1000000)
    {
        /* N is unsigned, so it is printed with %u. */
        fprintf(f, "%u\t", N);

        int *array = malloc(sizeof *array * N);
        if (array == NULL)
        {
            printf("Not enough space\n");
            /* Do not leak the output stream on the error path. */
            fclose(f);
            return -1;
        }
        for (unsigned int i = 0; i < N; i++) array[i] = rand();

        /* Parallel search. */
        int max_val = 0;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        #pragma omp parallel for reduction(max:max_val)
        for (unsigned int i = 0; i < N; i++)
        {
            if (array[i] > max_val) max_val = array[i];
        }
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        fprintf(f, "%lf\t", (double)(diff(t_start, t_end).tv_nsec / 1000000000.0));

        /* Serial search over the same data. */
        max_val = 0;
        clock_gettime(CLOCK_MONOTONIC, &t_start);
        for (unsigned int i = 0; i < N; i++)
        {
            if (array[i] > max_val) max_val = array[i];
        }
        clock_gettime(CLOCK_MONOTONIC, &t_end);
        fprintf(f, "%lf\n", (double)(diff(t_start, t_end).tv_nsec / 1000000000.0));

        free(array);
    }

    fclose(f);
    return 0;
}
Calculating an integral is a classic example: adjust the parts constant to increase the execution time and see the runtime difference more clearly (more parts means more execution time). It takes 21.3 seconds with OpenMP enabled and 26.7 seconds without it, on a SINGLE core, DUAL thread Intel Pentium 4:
#include <math.h>
#include <stdio.h>
#include <omp.h>
/* Integration bounds and number of strips. */
static const double from = 0.0;
static const double to = 2.0;
enum { parts = 999999999 };

/*
 * Approximates the definite integral of x^2 + 4 over [from, to] with the
 * midpoint rule using `parts` strips, summing the strips in an OpenMP
 * parallel loop with a reduction, and prints the result. Returns 0.
 */
int main()
{
    /* Computed in double precision: with this many strips a float loop
     * index and step width would lose significant digits. */
    const double step = (to - from) / parts;
    double integralSum = 0;
    int i;

    #pragma omp parallel for reduction(+:integralSum)
    for (i = 0; i < parts; ++i)
    {
        /* Midpoint of strip i; i runs over 0 .. parts-1 so the samples
         * stay inside [from, to]. */
        const double mid = from + (i + 0.5) * step;
        integralSum = integralSum + (step * fabs(pow(mid, 2) + 4));
    }
    printf("%f\n", integralSum);
    return 0;
}
It calculates the definite integral from 0 to 2 of x^2 + 4

C code stops running

I have the following C code which gives an error:
Program stopped at 0x4019b3.
It stopped with signal SIGSEGV, Segmentation fault.
when debugging.
Here is the code:
#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <time.h>
/*
 * Fills an n-by-N table of complex symbols, each drawn at random from the
 * four values in s, then overwrites columns 11, 22, 33 and 44 of every row
 * with 1 (the pilots). The elapsed processor time is measured with clock()
 * but not printed. Returns 0.
 */
int main()
{
    static const int pilots[] = { 11, 22, 33, 44 };
    const clock_t begin = clock();
    int n = 100;
    int N = 64;
    double complex (s)[4] = {-1-1*I, -1+1*I, 1-1*I, 1+1*I};
    double complex symbol[n][N];

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < N; j++) {
            int r = rand() % 4;
            symbol[i][j] = s[r];
        }
        // Now add pilots:
        for (size_t p = 0; p < sizeof pilots / sizeof pilots[0]; p++) {
            symbol[i][pilots[p]] = 1;
        }
    }

    const clock_t end = clock();
    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    (void)time_spent;
    return 0;
}
Any idea what's wrong?
EDIT:
Now I can put it all together after these valuable discussions. Here is the working code with timing and memory allocation and every thing:
#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <time.h>
/*
 * Fills a heap-allocated n-by-N table of complex symbols, each drawn at
 * random from the four values in s, then overwrites columns 11, 22, 33 and
 * 44 of every row with 1 (the pilots). Prints the processor time spent, as
 * measured by clock(), in seconds.
 *
 * Returns 0 on success and EXIT_FAILURE if the table cannot be allocated.
 */
int main()
{
    clock_t begin, end;
    double time_spent;
    begin = clock();
    int n = 100000; int i; int j;
    int N = 64; int r;
    double complex (s)[4] = {-1-1*I, -1+1*I, 1-1*I, 1+1*I};

    /* One allocation for all n rows of N symbols each. */
    double complex (*symbol)[N] = malloc(n * sizeof *symbol);
    if (symbol == NULL) {
        fprintf(stderr, "Not enough memory for the symbol table\n");
        return EXIT_FAILURE;
    }

    for (i = 0; i < n; i++) {
        for (j = 0; j < N; j++) {
            r = rand() % 4;
            symbol[i][j] = s[r];
        }
        // Now add pilots:
        symbol[i][11] = 1;
        symbol[i][22] = 1;
        symbol[i][33] = 1;
        symbol[i][44] = 1;
    }
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("%3.7f\n", time_spent);

    /* Released after the measurement so the timed region is unchanged. */
    free(symbol);
    return 0;
}
Memory needed to hold the variable declared by the line
double complex symbol[100000][64];
is too much for the stack.
Even a simple program like the one below runs into a segmentation fault when run on a 64-bit machine.
#include <stdio.h>
#include <complex.h>
/*
 * Declares a 100000 x 64 array of double complex as a local variable and
 * prints its size in bytes.
 *
 * NOTE(review): assuming a 16-byte double complex, the array needs roughly
 * 100 MB of automatic storage, presumably far more than a default stack
 * provides. Confirm against the platform's stack limit.
 */
void foo()
{
double complex symbol[100000][64];
printf("%zu\n", sizeof(symbol));
}
/* Entry point: calls foo() once and returns 0. The arguments are unused. */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    foo();

    return 0;
}
Consider allocating that memory from the heap, for example:
double complex (*symbol)[N] = malloc(n * sizeof *symbol);
The other problem is that in the loops:
for (i=0; i<n; i++){
for (j=0; i<N; j++){ // Problem line
r = rand() % 4;
symbol[i][j]=s[r];
}
You are accessing out of bounds memory. The problem line should be changed to:
for (j=0; j<N; j++){
^^ Use j not i
You have a copy and paste error in your second loop test:
for (j=0; j<N; j++){
^
It should be j not i
[Also, unrelated but you should not use modulus % on the result of rand() because the low bits are not as random as the high bits. Use division instead.]
[Another answer points out that you may also be exhausting your stack, although I would expect a different error. Worth checking though.]

Resources