When is a variable firstprivate in a task?
int a;

void foo()
{
    int b, c;
    #pragma omp parallel shared(c)
    {
        int d;
        #pragma omp task
        {
            int e;
            // Want to find out the scope of the variables in the task region
        }
    }
}
What should the scope of the variables a, b, c, d and e be?
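No answer was recorded here, but annotating the code under the OpenMP default data-sharing rules for tasks (my reading of the spec, not from the thread) gives the following: a variable that is shared in all enclosing constructs stays shared in the task; anything else becomes firstprivate by default.

int a;                             // global: shared in the parallel region and the task

void foo()
{
    int b, c;
    #pragma omp parallel shared(c) // b and c default to shared here
    {
        int d;                     // d: private to each thread
        #pragma omp task
        {
            int e;                 // e: private to the task (declared inside it)
            // a, b, c: shared in the task (shared in all enclosing constructs)
            // d: firstprivate in the task (not shared in the enclosing
            //    parallel region, so the task captures its value at creation)
        }
    }
}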
I am trying to write a parallel program which takes an error rate (e.g. 0.01) and uses a Monte Carlo simulation to return an estimate of π that is closer to π than that error. I wrote a simple function, however it does not terminate, as the error is always around 11. I appreciate your comments.
#include "stdio.h"
#include "omp.h"
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
double drand48(void);
double monte_carlo(double epsilon){
double x,y, pi_estimate = 0.0;
double drand48(void);
double error = 10000.0;
int n = 0; // total number of points
int i = 0; // total numbers of points inside circle
int p = omp_get_num_threads();
while(error>=epsilon){
#pragma omp parallel private(x, y) reduction(+:i)//OMP parallel directive
{
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){i+=1;}
}
n+=p;
printf("%lf\n", error);
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
}
return pi_estimate;
}
int main(int argc, char* argv[]) {
double epsilon = 0.01;
printf("PI estimate: %lf",monte_carlo(epsilon));
return 0;
}
Calling omp_get_num_threads() outside a parallel region will always return 1, as there is only one active thread at the moment the function is called. The following code should give a correct result, but it will be much slower than the serial version because of the large parallelization and synchronization overhead spent on a very simple operation.
#pragma omp parallel private(x, y) reduction(+:i) // OMP parallel directive
{
    x = drand48();
    y = drand48();
    if((x*x + y*y) <= 1.0){ i += 1; }
    #pragma omp master
    n += omp_get_num_threads();
}
The following avoids repeatedly spawning threads and may be more efficient, but is still probably slower than the serial version.
#pragma omp parallel private(x, y)
while(error >= epsilon){
    x = drand48();
    y = drand48();
    if((x*x + y*y) <= 1.0){
        #pragma omp atomic
        i++;
    }
    #pragma omp barrier /* make sure every atomic update to i has completed */
    #pragma omp single
    {
        n += omp_get_num_threads();
        pi_estimate = 4.0*(double)i/(double)n;
        error = fabs(M_PI - pi_estimate)/M_PI;
        printf("%lf\n", error);
    } // implicit barrier here
}
In order to really go faster, a minimum number of iterations should be done per round, such as:
#define ITER 1000

#pragma omp parallel private(x, y)
while(error >= epsilon){
    #pragma omp for reduction(+:i)
    for (int j = 0; j < ITER; j++){
        x = drand48();
        y = drand48();
        if((x*x + y*y) <= 1.0) i += 1;
    }
    /* implicit barrier + implicit atomic addition
     * of thread-private accumulator to shared variable i
     */
    #pragma omp single
    {
        n += ITER;
        pi_estimate = 4.0*(double)i/(double)n;
        error = fabs(M_PI - pi_estimate)/M_PI;
        printf("%lf\n", error);
    } // implicit barrier
}
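One caveat none of the variants above addresses: drand48() keeps a single hidden state, so calling it from several threads at once is not guaranteed to be safe, and at best serializes the threads on that shared state. A minimal sketch of one way around it, replacing the inner sampling loop with erand48() from <stdlib.h> (POSIX, may need _XOPEN_SOURCE) and a per-thread state array; the variable names px, py, state and the seeding scheme are my own, not part of the original code:

#pragma omp parallel
{
    /* each thread owns its 48-bit RNG state, seeded from its thread id */
    unsigned short state[3] = { 0x330E, 0xABCD,
                                (unsigned short) omp_get_thread_num() };
    #pragma omp for reduction(+:i)
    for (int j = 0; j < ITER; j++){
        double px = erand48(state); /* uniform in [0,1), thread-private state */
        double py = erand48(state);
        if (px*px + py*py <= 1.0) i += 1;
    }
}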
I'm trying to add up all the members of an array using OpenMP this way:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char *argv[])
{
    int v[] = {1,2,3,4,5,6,7,8,9};
    int sum = 0;
    #pragma omp parallel private(v, sum)
    {
        #pragma reduction(+: sum)
        {
            for (int i = 0; i < sizeof(v)/sizeof(int); i++){
                sum += v[i];
            }
        }
    }
    printf("%d\n", sum);
}
But when I print sum, the result is 0.
You are very confused about data-sharing attributes and worksharing in OpenMP. This answer does not attempt to properly teach them to you, but only gives you a concise, specific example.
Your code does not make any sense and does not compile.
You do not need multiple regions or anything like that, and there are only two variables. v, which is defined outside, is read by all threads and must be shared, which it implicitly is because it is defined outside the parallel region. Then there is sum, which is a reduction variable.
Further, you need to apply worksharing (for) to the loop. In the end it looks like this:
int v[] = {1,2,3,4,5,6,7,8,9};
int sum = 0;
#pragma omp parallel for reduction(+: sum)
for (int i = 0; i < sizeof(v)/sizeof(int); i++){
    sum += v[i];
}
printf("%d\n", sum);
Note that there are no private variables in this example. Private variables are very dangerous because they are uninitialized inside the parallel region, so simply don't use them explicitly. If you need something local, declare it inside the parallel region.
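A minimal illustration of that last point (my own example, not from the answer): anything declared inside the parallel region is automatically private to each thread, with no clause needed.

#pragma omp parallel
{
    int tid = omp_get_thread_num(); /* declared inside the region: every thread gets its own copy */
    printf("hello from thread %d\n", tid);
}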
I sequentially generate a number which is then looked up in an array that never changes. It would therefore be convenient if this array somehow stayed in the spawned threads' caches.
#include <stdio.h>

unsigned int unordered_set[65535];

void init_set(unsigned int *a) {
    ...
}

unsigned int generate_number() {
    ...
}

unsigned int find_number(unsigned int a) {
    unsigned int result = 0;
    #pragma omp parallel for
    for(unsigned int i = 0; i < 65535; i++) {
        if (unordered_set[i] == a)
            result = i;
    }
    return result;
}

int main() {
    unsigned int x = 1; /* nonzero so the loop runs at least once */
    /* Fill the array with unique numbers */
    init_set(unordered_set);
    while(x > 0) {
        /* This loop can only be done sequentially */
        x = generate_number();
        unsigned int r = find_number(x);
        if (r > 0)
            printf("Found: %u %u", x, r);
    }
}
I suppose that this never happens in the above code: every time find_number is called, the array unordered_set is loaded into the threads' caches again. What can be done to ensure that the array stays in cache?
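No answer was given here, but one thing worth trying (my suggestion, not from the original post) is thread affinity: if the OpenMP runtime is allowed to migrate threads between cores, any warm per-core cache is lost between the repeated parallel regions. OpenMP 4.0 lets you pin threads, either via the OMP_PROC_BIND environment variable or with a proc_bind clause:

unsigned int find_number(unsigned int a) {
    unsigned int result = 0;
    /* proc_bind(close) asks the runtime to keep the team on nearby,
     * consistent cores, so the slice of unordered_set each thread
     * scanned last call may still sit in that core's cache */
    #pragma omp parallel for proc_bind(close)
    for(unsigned int i = 0; i < 65535; i++) {
        if (unordered_set[i] == a)
            result = i;
    }
    return result;
}

Setting OMP_PROC_BIND=true in the environment has a similar effect without touching the code.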
I have a simple program that uses OpenMP to run 4 threads which read in 4 different text files and find anagrams. I am just trying to figure out why the last thread reported shows a thread number of 26478. The function countAnagrams doesn't do anything with tid; it just prints it to the screen when the function is done running.
Below are my code and the output. Any help would be greatly appreciated.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void countAnagrams(char* fileName, int threadNum);

int main ()
{
    char *fileNames[] = {"AnagramA.txt", "AnagramB.txt", "AnagramC.txt", "AnagramD.txt"};
    int tid;
    int nthreads = 4;
    omp_set_num_threads(nthreads);
    #pragma omp parallel
    {
        #pragma omp sections
        {
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[0], tid); }
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[1], tid); }
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[2], tid); }
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[3], tid); }
        }
    }
}
Output:
Filename: AnagramD.txt
Hello from thread: 1
Number of anagrams: 286
Longest anagram: 8
Filename: AnagramB.txt
Hello from thread: 0
Number of anagrams: 1148
Longest anagram: 8
Filename: AnagramC.txt
Hello from thread: 2
Number of anagrams: 5002
Longest anagram: 8
Filename: AnagramA.txt
Hello from thread: 26478
Number of anagrams: 3184
Longest anagram: 8
What's causing your issue is that you have not declared your thread ID variable private when creating your parallel region. Thus, the threads stomp over each other's values and garbage can result. To fix this, make sure that every variable that should only be accessible by a single thread is declared private, like so:
#pragma omp parallel private(tid)
The thing that may cause this problem is that tid is declared in the main function and is therefore shared between the threads. Try declaring it inside the parallel region instead, in the following manner:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void countAnagrams(char* fileName, int threadNum);

int main ()
{
    char *fileNames[] = {"AnagramA.txt", "AnagramB.txt", "AnagramC.txt", "AnagramD.txt"};
    int nthreads = 4;
    omp_set_num_threads(nthreads);
    #pragma omp parallel
    {
        int tid; // declared inside the parallel region, so each thread has its own private copy
        #pragma omp sections
        {
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[0], tid); }
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[1], tid); }
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[2], tid); }
            #pragma omp section
            { tid = omp_get_thread_num();
              countAnagrams(fileNames[3], tid); }
        }
    }
}
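An even simpler variant (my own suggestion, not from either answer) is to drop tid entirely and pass the result of omp_get_thread_num() straight to the function, so there is no shared variable left to race on:

#pragma omp section
countAnagrams(fileNames[0], omp_get_thread_num());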
I wrote a simple test code to use SIMD in OpenMP 4.0, but I got no speedup.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N 40000000

#pragma omp declare simd
double am(double a, double b){
    return a*b + a*b + a*b + a*b + a*b;
}

int main(){
    int i, s;
    double *a = (double *)malloc(sizeof(double)*N);
    double *b = (double *)malloc(sizeof(double)*N);
    double *c = (double *)malloc(sizeof(double)*N);
    for(i = 0; i < N; i++){
        a[i] = 0.56;
        b[i] = 0.66;
    }
    clock_t t1 = clock();
    #pragma omp simd
    for(i = 0; i < N; i++){
        c[i] = am(a[i], b[i]);
    }
    clock_t t2 = clock();
    printf("%d\n", (int)(t2 - t1));
    scanf("%d", &s);
    free(a);
    free(b);
    free(c);
    return 0;
}
The time with and without "#pragma omp declare simd" and "#pragma omp simd" is about 2900 either way on my Sandy Bridge CPU.
What's more, the warning "...\opm2.c(7,30): warning #13401: vector function was emulated" occurred, and I really want to know why.
The same problem also occurred in the following code (with C99 enabled):
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

int main(){
    int n = 40000000;
    int i, s;
    double *a, *b, *c;
    a = (double *)malloc(sizeof(double)*n);
    b = (double *)malloc(sizeof(double)*n);
    c = (double *)malloc(sizeof(double)*n);
    for(i = 0; i < n; i++){
        a[i] = 0.56;
        b[i] = 0.66;
    }
    clock_t t1 = clock();
    #pragma omp simd
    for(i = 0; i < n; i++){
        c[i] = a[i]*b[i] + a[i]*b[i] + a[i]*b[i] - a[i]*b[i] + a[i]*b[i] + a[i]*b[i];
    }
    clock_t t2 = clock();
    printf("%d\n", (int)(t2 - t1));
    scanf("%d", &s);
    free(a);
    free(b);
    free(c);
    return 0;
}
I really did enable OpenMP, and "#pragma omp parallel for" works well. I use Intel C/C++ 2015.
If you can find the reason, thank you very much.
Although I don't know why, on gcc 4.9 "#pragma omp simd" works well when I use -O1/-O2/-O3, and it doesn't work when I use -O0. P.S.: if you want to use AVX rather than SSE, don't forget "-mavx".
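For reference, a compile line along these lines (the file name is my own) matches that setup on gcc 4.9 or later; without -fopenmp, or the lighter -fopenmp-simd which enables only the simd pragmas, the directive is silently ignored:

gcc -O2 -fopenmp-simd -mavx simd_test.c -o simd_test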