Why does the compiler ignore OpenMP pragmas? - c

In the following C code I am using OpenMP in a nested loop. Since a race condition occurs, I want to perform an atomic operation at the end:
double mysumallatomic() {
    double S2 = 0.;
    #pragma omp parallel for shared(S2)
    for(int a=0; a<128; a++){
        for(int b=0; b<128; b++){
            double myterm = (double)a*b;
            #pragma omp atomic
            S2 += myterm;
        }
    }
    return S2;
}
The thing is that #pragma omp atomic has no effect on the program's behaviour: even if I remove it, nothing changes. Even if I change it to #pragma oh_my_god, I get no error!
I wonder what is going wrong here, whether I can tell the compiler to be stricter when checking omp pragmas, and why I do not get an error when I make the last change.
PS: For compilation I use:
gcc-4.2 -fopenmp main.c functions.c -o main_elec_gcc.exe
PS2: New code that gives me the same problem, based on gillespie's idea:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>

#define NRACK 64
#define NSTARS 1024

double mysumallatomic_serial(float rocks[NRACK][3], float moon[NSTARS][3],
                             float qr[NRACK], float ql[NSTARS]) {
    int j, i;
    float temp_div=0., temp_sqrt=0.;
    float difx, dify, difz;
    float mod2x, mod2y, mod2z;
    double S2 = 0.;
    for(j=0; j<NRACK; j++){
        for(i=0; i<NSTARS; i++){
            difx=rocks[j][0]-moon[i][0];
            dify=rocks[j][1]-moon[i][1];
            difz=rocks[j][2]-moon[i][2];
            mod2x=difx*difx;
            mod2y=dify*dify;
            mod2z=difz*difz;
            temp_sqrt=sqrt(mod2x+mod2y+mod2z);
            temp_div=1/temp_sqrt;
            S2 += ql[i]*temp_div*qr[j];
        }
    }
    return S2;
}

double mysumallatomic(float rocks[NRACK][3], float moon[NSTARS][3],
                      float qr[NRACK], float ql[NSTARS]) {
    float temp_div=0., temp_sqrt=0.;
    float difx, dify, difz;
    float mod2x, mod2y, mod2z;
    double S2 = 0.;
    #pragma omp parallel for shared(S2)
    for(int j=0; j<NRACK; j++){
        for(int i=0; i<NSTARS; i++){
            difx=rocks[j][0]-moon[i][0];
            dify=rocks[j][1]-moon[i][1];
            difz=rocks[j][2]-moon[i][2];
            mod2x=difx*difx;
            mod2y=dify*dify;
            mod2z=difz*difz;
            temp_sqrt=sqrt(mod2x+mod2y+mod2z);
            temp_div=1/temp_sqrt;
            float myterm=ql[i]*temp_div*qr[j];
            #pragma omp atomic
            S2 += myterm;
        }
    }
    return S2;
}

int main(int argc, char *argv[]) {
    float rocks[NRACK][3], moon[NSTARS][3];
    float qr[NRACK], ql[NSTARS];
    int i, j;
    for(j=0; j<NRACK; j++){
        rocks[j][0]=j;
        rocks[j][1]=j+1;
        rocks[j][2]=j+2;
        qr[j] = j*1e-4+1e-3;
        //qr[j] = 1;
    }
    for(i=0; i<NSTARS; i++){
        moon[i][0]=12000+i;
        moon[i][1]=12000+i+1;
        moon[i][2]=12000+i+2;
        ql[i] = i*1e-3 + 1e-2;
        //ql[i] = 1;
    }
    printf(" serial: %f\n", mysumallatomic_serial(rocks,moon,qr,ql));
    printf(" openmp: %f\n", mysumallatomic(rocks,moon,qr,ql));
    return(0);
}

Using the flag -Wall highlights pragma errors. For example, when I misspell atomic I get the following warning.
main.c:15: warning: ignoring #pragma omp atomic1
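With the compile line from the question, that means building with:

gcc-4.2 -Wall -fopenmp main.c functions.c -o main_elec_gcc.exe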
I'm sure you know, but just in case: your example should be handled with a reduction.
When you use omp parallel, the default is for all variables to be shared. This is not what you want in your case. For example, each thread needs its own value of difx, but with the default sharing all threads write to the same variable. Instead, your loop should be:
#pragma omp parallel for default(none),\
    private(difx, dify, difz, mod2x, mod2y, mod2z, temp_sqrt, temp_div, i, j),\
    shared(rocks, moon, ql, qr), reduction(+:S2)
for(j=0; j<NRACK; j++){
    for(i=0; i<NSTARS; i++){
        difx=rocks[j][0]-moon[i][0];
        dify=rocks[j][1]-moon[i][1];
        difz=rocks[j][2]-moon[i][2];
        mod2x=difx*difx;
        mod2y=dify*dify;
        mod2z=difz*difz;
        temp_sqrt=sqrt(mod2x+mod2y+mod2z);
        temp_div=1/temp_sqrt;
        S2 += ql[i]*temp_div*qr[j];
    }
}

I know this is an old post, but I think the problem is the order of the gcc parameters: -fopenmp should be at the end of the compilation line.
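With the compile line from the question, that suggested ordering would be:

gcc-4.2 main.c functions.c -o main_elec_gcc.exe -fopenmp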

First, depending on the implementation, reduction might be better than using atomic. I would try both and time them to see for sure.
Second, if you leave off the atomic, you may or may not see the problem (wrong result) associated with the race. It is all about timing, which from one run to the next can be quite different. I have seen cases where the result was wrong only once in 150,000 runs and others where it has been wrong all the time.
Third, the idea behind pragmas is that the user doesn't need to know about them if they have no effect. Besides that, the philosophy in Unix (and its derivatives) is to stay quiet unless there is a problem. That said, many implementations have some sort of flag so the user can get more information when they don't know what is happening. You can try -Wall with gcc; at the very least it should flag the oh_my_god pragma as being ignored.
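For example, a reduction version of your first function might look something like this (a sketch only; the function name mysumallreduction is just illustrative, and timing it against the atomic version will tell you which wins on your system):

double mysumallreduction() {
    double S2 = 0.;
    /* reduction(+:S2): each thread accumulates into a private copy of S2,
       and the private copies are combined once at the end of the loop */
    #pragma omp parallel for reduction(+:S2)
    for (int a = 0; a < 128; a++) {
        for (int b = 0; b < 128; b++) {
            S2 += (double)a * b;
        }
    }
    return S2;
}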

You have
#pragma omp parallel for shared(S2)
for(int a=0; a<128; a++){
....
So the only parallelization will be of the for loop.
If you want to have the atomic or a reduction, you have to do:
#pragma omp parallel shared(S2)
{
    #pragma omp for
    for(int a=0; a<128; a++){
        for(int b=0; b<128; b++){
            double myterm = (double)a*b;
            #pragma omp atomic
            S2 += myterm;
        }  // end of second for
    }  // end of 1st for
}  // end of parallel code
return S2;
}  // end of function
Otherwise everything after the # will be treated as a comment.

Related

Is it beneficial to parallelize variable declaration?

I wonder whether it is beneficial, when writing a parallel program, to move variable declarations into the parallel section. Amdahl's law says that the larger the parallel portion of the program, the better, but I don't see the point of parallelizing variable declarations and return statements. For example, this is the normal parallel code:
#include <omp.h>

int main(void) {
    int a = 0;
    int b[5];
    #pragma omp parallel
    {
        #pragma omp for
        for (int i = 0; i < 5; ++i) {
            b[i] = a;
        }
    }
    return 0;
}
Would it be beneficial, in terms of Amdahl's law, to write this instead (so that 100% of the program is parallel):
#include <omp.h>

int main(void) {
    #pragma omp parallel
    {
        int a = 0;
        int b[5];
        #pragma omp for
        for (int i = 0; i < 5; ++i) {
            b[i] = a;
        }
        return 0;
    }
}
These codes are not equivalent: in the first case, a and b are shared variables (since shared is the default behavior for variables); in the second case, they are thread-private variables that do not exist beyond the scope of the parallel region.
Besides, the return statement within the parallel region in the second piece of code is illegal and must cause a compilation error.
As seen, for instance, in this OpenMP 4.0 reference card:
An OpenMP executable directive applies to the succeeding structured
block or an OpenMP construct. Each directive starts with #pragma omp.
The remainder of the directive follows the conventions of the C and
C++ standards for compiler directives. A structured-block is a single
statement or a compound statement with a single entry at the top and a
single exit at the bottom.
A block that contains the return statement is not a structured-block since it does not have a single exit at the bottom (i.e. the closing brace } is not the only exit since return is another one). It may not legally follow the #pragma omp parallel directive.
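For illustration, here is a sketch of a version that keeps the declarations inside the parallel region but stays legal by leaving the return outside of it (note that a and b are then still thread-private, so it is still not equivalent to the first program):

#include <omp.h>

int main(void) {
    #pragma omp parallel
    {
        int a = 0;   /* thread-private: each thread gets its own copy */
        int b[5];    /* thread-private: gone once the region ends */
        #pragma omp for
        for (int i = 0; i < 5; ++i) {
            b[i] = a;
        }
    }                /* single exit at the bottom: a valid structured block */
    return 0;        /* legal: outside the parallel region */
}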

Reordered output despite critical section

I'm trying to adapt this Pascal's triangle program to a parallel program using OpenMP. I used the for directive to parallelize the for loop in the printPas function, and put the conditional statements inside a critical section so only one thread can print at a time, but it seems like I'm still getting a data race because my output is really inconsistent.
#include <stdio.h>

#ifndef N
#define N 2
#endif

unsigned int t1[2*N+1], t2[2*N+1];
unsigned int *e=t1, *r=t2;
int l = 0;

//the problem is here in this function
void printPas() {
    #pragma omp parallel for private(l)
    for (l=0; l<2*N+1; l++) {
        #pragma omp critical
        if (e[l]==0)
            printf(" ");
        else
            printf("%6u", e[l]);
    }
    printf("\n");
}

void update() {
    r[0] = e[1];
    #pragma omp parallel for
    for (int u=1; u<2*N; u++)
        r[u] = e[u-1]+e[u+1];
    r[2*N] = e[2*N-1];
    unsigned int *tmp = e; e=r; r=tmp;
}

int main() {
    e[N] = 1;
    for (int i=0; i<N; i++) {
        printPas();
        update();
    }
    printPas();
}
Your critical section is causing the prints to run sequentially. Therefore, the code takes longer using 'critical' than it would if you didn't attempt to parallelise it.
With different threads doing the printing, you have no idea which one will reach the critical section first, so the for loop will not execute in the order that you would hope.
I suggest either removing the parallel directive ("#pragma omp parallel for private(l)"), or removing the 'critical' and accepting that the prints will come out in a different order every time.
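For example, a sketch of the first option, reusing the globals e and N from the question (the loop body is almost entirely I/O, so there is nothing worth parallelising here anyway):

void printPas() {
    /* plain serial loop: prints in index order, no critical section needed */
    for (int l = 0; l < 2*N + 1; l++) {
        if (e[l] == 0)
            printf(" ");
        else
            printf("%6u", e[l]);
    }
    printf("\n");
}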

Counting does not work properly in OpenMP

I have the function
void collatz(int startNumber, int endNumber, int* iter, int nThreads)
{
    int i, n, counter;
    int isodd;   /* 1 if n is odd, 0 if even */

    #pragma omp parallel for
    for (i = startNumber; i <= endNumber; i++)
    {
        counter = 0;
        n = i;
        omp_set_num_threads(nThreads);
        while (n > 1)
        {
            isodd = n%2;
            if (isodd)
                n = 3*n+1;
            else
                n/=2;
            counter++;
        }
        iter[i - startNumber] = counter;
    }
}
It works as I wish when running serially (i.e. compiling without OpenMP, or commenting out #pragma omp parallel for and omp_set_num_threads(nThreads);). However, the parallel version produces the wrong result, and I think it is because the counter variable needs to be set to zero at the beginning of each loop iteration, and perhaps another thread can work with a non-zeroed counter value. But even if I use #pragma omp parallel for private(counter), the problem still occurs. What am I missing?
I compile the program as C89.
Inside your OpenMP parallel region, you are assigning values to the scalar variables counter, n and isodd. These therefore cannot simply be shared, as they are by default; you need to pay extra attention to them.
A quick analysis shows that their values are only meaningful inside the parallel region, and only for the current thread, so it becomes clear that they need to be declared private.
Adding a private( counter, n, isodd ) clause to your #pragma omp parallel directive should fix the issue.
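A sketch of the fixed function along those lines (the omp_set_num_threads() call is also moved out of the loop here, since calling it inside the parallel region does not affect that region):

void collatz(int startNumber, int endNumber, int* iter, int nThreads)
{
    int i, n, counter;
    int isodd;   /* 1 if n is odd, 0 if even */

    omp_set_num_threads(nThreads);   /* set the thread count before the region starts */
    #pragma omp parallel for private(counter, n, isodd)
    for (i = startNumber; i <= endNumber; i++)   /* the loop index i is private automatically */
    {
        counter = 0;
        n = i;
        while (n > 1)
        {
            isodd = n % 2;
            if (isodd)
                n = 3*n + 1;
            else
                n /= 2;
            counter++;
        }
        iter[i - startNumber] = counter;
    }
}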

OpenMP - dot product

I am implementing a parallel dot product in OpenMP. I have this code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <omp.h>

#define SIZE 1000

int main (int argc, char *argv[]) {
    float u[SIZE], v[SIZE], dp, dpp;
    int i, j, tid;

    dp=0.0;
    for(i=0; i<SIZE; i++){
        u[i]=1.0*(i+1);
        v[i]=1.0*(i+2);
    }
    printf("\n values of u and v:\n");
    for (i=0; i<SIZE; i++){
        printf(" u[%d]= %.1f\t v[%d]= %.1f\n", i, u[i], i, v[i]);
    }
    #pragma omp parallel shared(u,v,dp,dpp) private (tid,i)
    {
        tid=omp_get_thread_num();
        #pragma omp for private (i)
        for(i=0; i<SIZE; i++){
            dpp+=u[i]*v[i];
            printf("thread: %d\n", tid);
        }
        #pragma omp critical
        {
            dp=dpp;
            printf("thread %d\n", tid);
        }
    }
    printf("\n dot product is %f\n", dp);
}
I am compiling it with: pgcc -B -Mconcur -Minfo -o prog prog.c
And the result I get in the console is:
33, Loop not parallelized: innermost
39, Loop not vectorized/parallelized: contains call
48, Loop not vectorized/parallelized: contains call
What am I doing wrong? From my point of view, everything looks OK.
First of all, a simple 1,000-element dot product does not have enough computational cost to justify multi-threading: you will pay so much more in communication and synchronization costs than you will gain in performance that it is not worth it.
Secondly, it looks like you are computing the full dot product in each thread, not dividing the computation across multiple threads and combining the result at the end.
Here is an example of how to do vector dot products from https://computing.llnl.gov/tutorials/openMP/#SHARED
#include <omp.h>
#include <stdio.h>

int main ()
{
    int i, n, chunk;
    float a[100], b[100], result;

    /* Some initializations */
    n = 100;
    chunk = 10;
    result = 0.0;
    for (i=0; i < n; i++) {
        a[i] = i * 1.0;
        b[i] = i * 2.0;
    }

    #pragma omp parallel for \
        default(shared) private(i) \
        schedule(static,chunk) \
        reduction(+:result)
    for (i=0; i < n; i++)
        result += (a[i] * b[i]);

    printf("Final result= %f\n", result);
}
Basically, OpenMP is good for doing coarse-grained parallelism when you have large, expensive loops. In general, when you are doing parallel programming, the larger the "chunks" of computation you can do before re-synchronizing, the better. Especially as the number of cores grows, the communication and synchronization costs will grow. Pretend that each synchronization (grabbing a new index or chunk of indexes to execute, entering a critical section, etc.) costs you 10ms, or 1M instructions to get a better idea of when/where/how to parallelize your code.
The problem is still the same as in your latest question: you are accumulating values into a variable, and you must tell OpenMP how to do that:
#pragma omp for reduction(+: dpp)
for(size_t i=0; i<SIZE; i++){
    dpp += u[i]*v[i];
}
Use a loop-local variable for the index, and that is all you need; forget about all the stuff you are doing around it. If you want to see what the compiler does with your code, compile it with -S and check the assembler output. This can be very instructive, because you then learn what simple statements like this amount to when they are parallelized.
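For example, with gcc that could look like the following (the question uses pgcc, but most compilers accept -S in the same way):

gcc -fopenmp -O2 -S prog.c -o prog.s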
And don't use int for loop indices. Sizes and stuff like that are size_t.

Trouble with nested loops and OpenMP

I am having trouble applying OpenMP to a nested loop like this:
#pragma omp parallel shared(S2,nthreads,chunk) private(a,b,tid)
{
    tid = omp_get_thread_num();
    if (tid == 0)
    {
        nthreads = omp_get_num_threads();
        printf("\nNumber of threads = %d\n", nthreads);
    }
    #pragma omp for schedule(dynamic,chunk)
    for(a=0; a<NREC; a++){
        for(b=0; b<NLIG; b++){
            S2=S2+cos(1+sin(atan(sin(sqrt(a*2+b*5)+cos(a)+sqrt(b)))));
        }
    }  // end for a
}  /* end of parallel section */
When I compare the serial version with the OpenMP version, the latter gives weird results. Even when I remove #pragma omp for, the OpenMP results are not correct. Do you know why, or can you point me to a good tutorial that is explicit about double loops and OpenMP?
This is a classic example of a race condition. Each of your OpenMP threads is accessing and updating a shared value at the same time, and there's no guarantee that some of the updates won't get lost (at best) or that the resulting answer won't be gibberish (at worst).
The thing with race conditions is that they depend sensitively on timing; in a smaller case (e.g., with smaller NREC and NLIG) you might sometimes miss the problem, but in a larger case it will eventually always show up.
The reason you get wrong answers even without the #pragma omp for is that as soon as you enter the parallel region, all of your OpenMP threads start; and unless you use something like an omp for (a so-called worksharing construct) to split up the work, each thread will do everything in the parallel section, so all the threads compute the same entire sum, all updating S2 simultaneously.
You have to be careful with OpenMP threads updating shared variables. OpenMP has atomic operations to allow you to safely modify a shared variable. An example follows (unfortunately, your example is so sensitive to summation order that it's hard to see what's going on, so I've changed your sum somewhat). In mysumallatomic, each thread updates S2 as before, but this time it's done safely:
#include <omp.h>
#include <math.h>
#include <stdio.h>

double mysumorig() {
    double S2 = 0;
    int a, b;
    for(a=0; a<128; a++){
        for(b=0; b<128; b++){
            S2=S2+a*b;
        }
    }
    return S2;
}

double mysumallatomic() {
    double S2 = 0.;
    #pragma omp parallel for shared(S2)
    for(int a=0; a<128; a++){
        for(int b=0; b<128; b++){
            double myterm = (double)a*b;
            #pragma omp atomic
            S2 += myterm;
        }
    }
    return S2;
}

double mysumonceatomic() {
    double S2 = 0.;
    #pragma omp parallel shared(S2)
    {
        double mysum = 0.;
        #pragma omp for
        for(int a=0; a<128; a++){
            for(int b=0; b<128; b++){
                mysum += (double)a*b;
            }
        }
        #pragma omp atomic
        S2 += mysum;
    }
    return S2;
}

int main() {
    printf("(Serial) S2 = %f\n", mysumorig());
    printf("(All Atomic) S2 = %f\n", mysumallatomic());
    printf("(Atomic Once) S2 = %f\n", mysumonceatomic());
    return 0;
}
However, that atomic operation really hurts parallel performance (after all, the whole point is to prevent parallel operation around the variable S2!), so a better approach is to let each thread accumulate its own partial sum and do the atomic operation only once, after the loops, rather than 128*128 times; that's the mysumonceatomic() routine, which incurs the synchronization overhead only once per thread rather than 16k times per thread.
But this is such a common operation that there's no need to implement it yourself. You can use OpenMP's built-in support for reduction operations (a reduction is an operation like calculating the sum of a list, or finding its min or max, which can be done one element at a time by looking only at the result so far and the next element), as suggested by @ejd. OpenMP's reduction will work and is faster (its optimized implementation is much faster than what you can do on your own with other OpenMP operations).
As you can see, either approach works:
$ ./foo
(Serial) S2 = 66064384.000000
(All Atomic) S2 = 66064384.000000
(Atomic Once) S2 = 66064384.000000
The problem isn't with the double loops but with the variable S2. Try putting a reduction clause on your for directive:
#pragma omp for schedule(dynamic,chunk) reduction(+:S2)
