How can I parallelize the following nested loop, which has statements between the outer and the inner loop, using OpenMP in C? I would be thankful for any help.
#include <stdio.h>
#include <string.h>
#include <omp.h>
/*
 * Serial reference version: for every k in 1..n, sums k / m over m in 1..n,
 * adds the per-k results together and prints the grand total.
 */
int main(int argc, char **argv) {
    int n = 10000;
    double y[10000]; /* declared by the snippet but never used */
    double t = 0;
    int row;

    for (row = 0; row < n; row++) {
        double k = (double) (row + 1);
        double f = 0;
        int col;

        /* inner accumulation for one fixed k */
        for (col = 0; col < n; col++) {
            f += k / (double) (col + 1);
        }
        t += f;
    }

    printf("%f\n", t);
}
You should use the OpenMP `reduction` clause. Here is a short example:
/* Inner loop of the question, parallelized: f is a reduction variable, so each
   thread accumulates into its own copy and the copies are added together when
   the loop finishes. */
#pragma omp parallel for reduction(+:f)
for (j=0;j<n;j++)
f += k / (double) (j+1);
Related
I am trying to integrate a function numerically and to convert the serial code into a parallel program using OpenMP.
I have parallelized the for loop with the OpenMP `parallel for` directive and the program now runs faster, but the result is not the expected one: something gets mixed up between the threads. I want to know how to parallelize the for loop correctly for N threads.
#include <stdio.h>
#include <omp.h>
#include <math.h>
double f(double x){
return sin(x)+0.5*x;
}
/* Approximates the integral of f over [a, b] with the trapezoidal rule on n
   sub-intervals, summing the interior points in an OpenMP parallel loop, and
   prints the elapsed wall-clock time and the result. */
int main(){
int n=134217728,i;
double a=0,b=9,h,x,sum=0,integral;
double start = omp_get_wtime();
h=fabs(b-a)/n; /* width of one sub-interval */
omp_set_dynamic(0);
omp_set_num_threads(64);
/* NOTE(review): shared(x) makes all threads write the same x, so the value
   passed to f(x) below may have been stored by another thread's iteration
   (data race). sum is not affected: it is a reduction variable. */
#pragma omp parallel for reduction (+:sum) shared(x)
for(i=1;i<n;i++){
x=a+i*h;
sum=sum+f(x);
}
/* trapezoidal rule: end points weighted once, interior points twice */
integral=(h/2)*(f(a)+f(b)+2*sum);
double end = omp_get_wtime();
double time = end - start;
printf("Execution time: %2.3f seconds\n",time);
printf("\nThe integral is: %lf\n",integral);
}
The expected output is 22.161130, but the result varies each time the program is run.
The loop you are trying to parallelise modifies the same variables x and sum in every iteration, which makes it cumbersome to parallelise as written.
You could rewrite the code to make the path to parallelisation more obvious:
#include <stdio.h>
#include <omp.h>
#include <math.h>
double f(double x) {
return sin(x) + 0.5 * x;
}
/*
 * Trapezoidal-rule integration of f over [a, b] with n sub-intervals.
 *
 * The interior points x_1 .. x_(n-1) are dealt out round-robin to NUM_SLICES
 * independent partial sums, one per iteration of the parallel loop; the
 * partial sums are then added together serially.
 */
int main() {
    enum { NUM_SLICES = 64 };
    int n = 1 << 27, i, j;
    double a = 0, b = 9, h, sum, integral;
    double sums[NUM_SLICES] = { 0 };
    double start = omp_get_wtime();

    h = fabs(b - a) / n; /* width of one sub-interval */
    omp_set_dynamic(0);
    omp_set_num_threads(NUM_SLICES);

    /* i is declared outside the parallel region, so it must be made private
       explicitly; otherwise all threads would share one inner loop counter. */
    #pragma omp parallel for private(i)
    for (j = 0; j < NUM_SLICES; j++) {
        /* accumulate locally and store once, so threads do not keep writing
           to neighbouring elements of sums[] */
        double partial = 0;

        /* slice 0 starts at NUM_SLICES so that the end point x_0 = a, which
           the formula below already counts through f(a), is not added again */
        for (i = (j == 0) ? NUM_SLICES : j; i < n; i += NUM_SLICES) {
            partial += f(a + i * h);
        }
        sums[j] = partial;
    }

    sum = 0;
    for (j = 0; j < NUM_SLICES; j++) {
        sum += sums[j];
    }

    /* end points weighted once, interior points twice */
    integral = (h / 2) * (f(a) + f(b) + 2 * sum);
    double end = omp_get_wtime();
    double time = end - start;
    printf("Execution time: %2.3f seconds\n", time);
    printf("\nThe integral is: %lf\n", integral);
    return 0;
}
#include "stdio.h"
#include "math.h"
#include "stdlib.h"
#include "x86intrin.h"
/* Walks the buffer at clo as packed 4-double AVX vectors and adds a zero
   vector to each element visited. The result al is overwritten on every
   iteration and is neither stored back nor returned. */
void dd_m(double *clo, int m)
{
int j;
__m256d *vclo = (__m256d *)clo; /* view the doubles as 4-wide vectors */
__m256d al=_mm256_set_pd(0,0,0,0);
__m256d clo_n=_mm256_set_pd(0,0,0,0);
int i;
for (j = 0; j < m; j++) {
/* NOTE(review): the row stride used in the index below is m/4 vectors, but i
   runs up to m, so for the m*m-double buffer passed by main in this snippet
   the later rows index past the end of the allocation. */
for (i = 0; i < m; i++) {
al = _mm256_add_pd(vclo[m/4*j+i] , clo_n);
}
}
}
/* Allocates a 32-byte-aligned buffer of m*m doubles (m = 2^8), zero-fills it,
   hands it to dd_m and releases it again. */
int main(int argc, const char * argv[]){
    int m = (int)pow(2,8);
    double *zlo = (double *)_mm_malloc(sizeof(double) * m*m,32);
    int idx = 0;

    /* clear every element before the vector pass */
    while (idx < m*m) {
        zlo[idx] = 0.0;
        idx++;
    }

    dd_m(zlo, m);
    _mm_free(zlo);
    return 0;
}
Here's my code.
It generates an error
"Thread 1: EXC_BAD_ACCESS (code=1, address=0x102900000)"
inside for loop.
I used latest xcode with clang.
What should I do?
By casting your clo to point to 256-bit vectors as vclo, your row-length is divided by four, you changed it in the index computation, but not in the inner loop over i.
/* Corrected traversal: the row stride in the index and the inner loop bound
   are both counted in 4-double vectors (m/4 per row). */
for (j = 0; j < m; j++) {
for (i = 0; i < m/4; i++) { // in vclo, the rows are only m/4 long
al = _mm256_add_pd(vclo[m/4*j+i] , clo_n);
}
}
I am trying to compare the difference between openmp reduction sum and the normal serial summation of an array. But I only got garbage number from my subroutine "fast_sum", while the result in serial_sum is correct. I am not sure why... I can compile my code successfully using gcc-7 -fopenmp. Please help me and thank you in advance!
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <sys/time.h>
#define NUM_THREADS 8
//////// serial /////////
/*
 * Adds up A[1] .. A[N] one element at a time and returns the total.
 * Indexing is 1-based: A[0] is never read and A[N] is, so the array passed in
 * has to provide at least N+1 elements.
 */
double serial_sum(double *A, int N){
    double total = 0.0;
    int idx = 1;

    while (idx <= N) {
        total += A[idx];
        ++idx;
    }
    return total;
}
/*
 * Adds up A[1] .. A[N] with an OpenMP parallel reduction on NUM_THREADS
 * threads and returns the total.
 * Indexing is 1-based, like serial_sum: A[0] is never read and A[N] is, so
 * the array passed in has to provide at least N+1 elements.
 */
double fast_sum(double *A, int N){
    double sum = 0.0;
    int i;

    /* The directive name must be spelled "parallel": the earlier spelling
       "parallell" is not an OpenMP directive, so the loop was not
       parallelized at all. */
    #pragma omp parallel for reduction(+:sum) num_threads(NUM_THREADS)
    for (i = 1; i <= N; ++i)
        sum += A[i];

    return sum;
}
/* Fills an array with the values 1..N, sums it once with serial_sum and once
   with fast_sum, and prints both totals. */
int main(void)
{
int N = 256;
double sum1, sum2, sum3;
double *A;
int i, h;
/* room for N doubles, i.e. valid indices 0 .. N-1 */
A = (double*)malloc(N * sizeof(double));
/* NOTE(review): this loop (and both sum functions) use indices 1 .. N, so
   A[N] is written and read one element past the allocation, and A[0] is
   never initialized. */
for( i = 1; i<=N;++i){
A[i] = i;
}
/* initialization A[N] */
sum1 = serial_sum(A,N);
printf("normal_sum is %f\n", sum1);
sum2 = fast_sum(A,N);
printf("fast_sum is %f\n", sum2);
free(A);
}
You have an indexing error here. C is 0-indexed. You never initialize A[0], and you try to access A[N], even though the array is only allocated out to N-1. So who knows what values are in A[0] and A[N] - they are probably quite large, giving you the garbage value.
I have been trying to parallelize the following code using OpenMP, with no success.
I have searched in the internet several examples, yet none of them give me the same answer after executing the program several times.
#include <stdio.h>
#include <omp.h>
#define NUM_THREADS 2
long num_steps = 100000;      /* number of terms in the sum */
double step = 1.0/100000.0;   /* spacing between sample points */

/* Serial version: sums 4/(1+x^2) at num_steps sample points spaced step
   apart, scales the sum by step and prints it as the value of pi. */
int main() {
    double sum = 0.0;
    double pi;
    int i = 0;

    while (i < num_steps) {
        double x = (i-0.5)*step;

        sum += 4.0/(1.0+x*x);
        ++i;
    }

    pi = step*sum;
    printf("PI value = %f\n", pi);
}
This is the solution I have so far:
/* Parallel attempt at the pi sum: the loop is shared out with "omp for" and
   every addition to the shared sum goes through a critical section. */
int main (int argc, char **argv){
//Variables
int i=0, aux=0;
double step = 1.0/100000.0;
double x=0.0,
pi=0.0,
sum = 0.0;
#pragma omp parallel shared(sum,i) private(x)
{
x = 0.0;
/* NOTE(review): every thread executes this reset of the shared sum, and there
   is no barrier between it and the loop below, so one thread can zero sum
   after another thread has already added terms to it. */
sum = 0.0;
#pragma omp for
for (i=0; i<num_steps; ++i) {
x = (i-0.5)*step;
/* the critical section serializes the update of the shared sum */
#pragma omp critical
sum += 4.0/(1.0+x*x);
}
}
/* All threads join master thread and terminate */
pi= step*sum;
printf("PI value = %f\n", pi);
}
Please consider using the same loop construct that the official OpenMP website shows for loop parallelism. I had to change many lines in your code; I hope it will be a starting point for you to become more familiar with OpenMP and loop parallelism in C.
#include <stdio.h>
#include <omp.h>
#define NUM_STEPS 10000000
/*
 * Parallel pi estimate: sums 4/(1+x^2) at the midpoints of NUM_STEPS equal
 * sub-intervals of [0, 1] and scales the sum by the sub-interval width.
 * sum is an OpenMP reduction variable, so no critical section is needed.
 */
int main (int argc, char **argv){
    //Variables
    long int i, num_steps = NUM_STEPS;
    double x, step, sum, pi;

    sum = 0.0;
    step = 1.0 / (double) num_steps;

    #pragma omp parallel private(i,x)
    {
        #pragma omp for reduction(+:sum)
        for (i=0; i<num_steps; ++i) {
            x = (i+0.5)*step;   /* midpoint of sub-interval i */
            sum += 4.0/(1.0+x*x);
        }
    }
    /* All threads join master thread and terminate */

    pi = step*sum;
    printf("PI value = %.24f\n", pi);
    return 0;
}
The answer was:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
long num_steps = 100000;      /* number of terms in the sum */
double step = 1.0/100000.0;   /* spacing between sample points */

/* Parallel pi sum using a reduction: each thread accumulates its share of the
   terms and OpenMP combines the per-thread sums after the loop. */
int main() {
    double sum = 0.0;
    double pi;
    int i;

    #pragma omp parallel for reduction(+:sum)
    for (i = 0; i < num_steps; ++i) {
        /* declared inside the loop body, so every iteration has its own x */
        double x = (i-0.5)*step;

        sum += 4.0/(1.0+x*x);
    }

    pi = step*sum;
    printf("PI value = %f\n", pi);
}
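One problem is that you declare your loop index i as shared on the parallel region. What you actually want OpenMP to do is divide the whole range of i into chunks and assign a different chunk to each thread, so every thread needs its own i; declare i as private. (OpenMP makes the iteration variable of the loop associated with `#pragma omp for` private automatically, but stating it explicitly keeps the intent clear.)
Apart from this, you should not re-initialize x and sum in the parallel region. For x it is merely unnecessary, but sum is shared, so a thread that enters the region late can reset it to zero after other threads have already added to it, which makes the result change from run to run. After fixing some unrelated compilation errors, your code should look like this: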
#include<stdio.h>
#include <omp.h>
#define NUM_THREADS 2
/* Parallel pi sum in which the loop is shared out with "omp for" and every
   addition to the shared sum is serialized through a critical section. */
int main (int argc, char **argv){
    //Variables
    int i = 0;
    int aux = 0;                  /* never read in this snippet */
    long num_steps = 100000;
    double step = 1.0/100000.0;
    double x = 0.0;
    double sum = 0.0;
    double pi = 0.0;

    #pragma omp parallel shared(sum) private(i,x)
    {
        #pragma omp for
        for (i = 0; i < num_steps; ++i) {
            x = (i-0.5)*step;
            /* only one thread at a time may update the shared sum */
            #pragma omp critical
            {
                sum += 4.0/(1.0+x*x);
            }
        }
    }
    /* All threads join master thread and terminate */

    pi = step*sum;
    printf("PI value = %f\n", pi);
}
Keep in mind that this is far from perfect in terms of performance, since every time you want to update the sum you pause the whole parallel region. A first step to make your code faster is by removing the critical part and declaring the sum as a reduction instead:
/* Same loop without the critical section: sum is a reduction variable, so
   each thread adds into its own copy and the copies are combined after the
   loop. */
#pragma omp parallel private(i,x)
{
#pragma omp for reduction(+:sum)
for (i=0; i<num_steps; ++i) {
x = (i-0.5)*step;
sum += 4.0/(1.0+x*x);
}
}
I have the following C code which gives an error:
Program stopped at 0x4019b3.
It stopped with signal SIGSEGV, Segmentation fault.
when debugging.
Here is the code:
#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <time.h>
/* Fills an n x N table with symbols drawn at random from a four-point complex
   constellation, overwrites four fixed columns of every row with the pilot
   value 1, and measures the CPU time taken. */
int main()
{
    static const int pilot_cols[] = {11, 22, 33, 44};
    clock_t begin = clock();
    clock_t end;
    double time_spent;
    int n = 100;
    int N = 64;
    double complex (s)[4] = {-1-1*I, -1+1*I, 1-1*I, 1+1*I};
    double complex symbol[n][N];

    for (int row = 0; row < n; row++) {
        for (int col = 0; col < N; col++) {
            int r = rand() % 4;

            symbol[row][col] = s[r];
        }
        // Now add pilots:
        for (size_t p = 0; p < sizeof pilot_cols / sizeof pilot_cols[0]; p++) {
            symbol[row][pilot_cols[p]] = 1;
        }
    }

    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    return 0;
}
Any idea what's wrong?
EDIT:
Now I can put it all together after these valuable discussions. Here is the working code with timing and memory allocation and every thing:
#include <stdio.h>
#include <complex.h>
#include <stdlib.h>
#include <time.h>
/*
 * Fills an n x N heap-allocated table with symbols drawn at random from a
 * four-point complex constellation, overwrites four fixed columns of every
 * row with the pilot value 1, and prints the CPU time taken in seconds.
 *
 * Returns EXIT_FAILURE if the table cannot be allocated, 0 otherwise.
 */
int main()
{
    clock_t begin, end;
    double time_spent;
    begin = clock();
    int n = 100000; int i; int j;
    int N = 64; int r;
    double complex (s)[4] = {-1-1*I, -1+1*I, 1-1*I, 1+1*I};

    /* pointer to rows of N symbols: one contiguous block, indexed [i][j] */
    double complex (*symbol)[N] = malloc(n * sizeof *symbol);
    if (symbol == NULL) {
        /* the table is large, so the allocation can realistically fail */
        fprintf(stderr, "could not allocate %d x %d symbols\n", n, N);
        return EXIT_FAILURE;
    }

    for (i = 0; i < n; i++) {
        for (j = 0; j < N; j++) {
            r = rand() % 4;
            symbol[i][j] = s[r];
        }
        // Now add pilots:
        symbol[i][11] = 1;
        symbol[i][22] = 1;
        symbol[i][33] = 1;
        symbol[i][44] = 1;
    }

    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("%3.7f\n", time_spent);

    free(symbol);
    return 0;
}
Memory needed to hold the variable declared by the line
double complex symbol[100000][64];
is too much for the stack.
Even a simple program like the one below runs into a segmentation fault when run on a 64-bit machine.
#include <stdio.h>
#include <complex.h>
/* Declares the full-size symbol table as a local (automatic) array and prints
   its size in bytes; used here to show that the array is too large for the
   stack. */
void foo()
{
/* 100000 * 64 complex doubles: roughly 100 MB if double complex is 16 bytes */
double complex symbol[100000][64];
printf("%zu\n", sizeof(symbol));
}
/* Calls foo once and exits with status 0; argc and argv are not used. */
int main(int argc, char** argv)
{
foo();
return 0;
}
Consider allocating that memory from the heap, for example:
double complex (*symbol)[N] = malloc(n * sizeof *symbol);
The other problem is that in the loops:
for (i=0; i<n; i++){
for (j=0; i<N; j++){ // Problem line
r = rand() % 4;
symbol[i][j]=s[r];
}
You are accessing out of bounds memory. The problem line should be changed to:
for (j=0; j<N; j++){
^^ Use j not i
You have a copy and paste error in your second loop test:
for (j=0; j<N; j++){
^
It should be j not i
[Also, unrelated but you should not use modulus % on the result of rand() because the low bits are not as random as the high bits. Use division instead.]
[Another answer points out that you may also be exhausting your stack, although I would expect a different error. Worth checking though.]