I've just started with OpenMP and I need help.
I have a program and I need to parallelize it. This is what I have:
#include <stdio.h>
#include <sys/time.h>
#include <omp.h>
#define N1 3000000
#define it 5
struct timeval t0, t1;
int i, itera_kop;
int A[N1], B[N1];
void Exe_Denbora(char * pTestu, struct timeval *pt0, struct timeval *pt1)
{
double tej;
tej = (pt1->tv_sec - pt0->tv_sec) + (pt1->tv_usec - pt0->tv_usec) / 1e6;
printf("%s = %10.3f ms (%d hari)\n",pTestu, tej*1000, omp_get_max_threads());
}
void sum(char * pTestu, int *b, int n)
{
double bat=0;
int i;
for (i=0; i<n; i++) bat+=b[i];
printf ("sum: %.1f\n",bat);
}
int main()
{
for (itera_kop=1;itera_kop<it;itera_kop++)
{
for(i=0; i<N1; i++)
{
A[i] = 1;
B[i] = 3;
}
gettimeofday(&t0, 0);
#pragma omp parallel for private(i)
for(i=2; i<N1; i++)
{
A[i] = 35 / (7/B[i-1] + 2/A[i]);
B[i] = B[i] / (A[i-1]+2) + 3 / B[i];
}
gettimeofday(&t1, 0);
Exe_Denbora("T1",&t0,&t1);
printf ("\n");
}
printf("\n\n");
sum("A",A,N1);
sum("B",B,N1);
}
If I execute the code without using #pragma omp parallel for I get:
A sum: 9000005.5
B sum: 3000005.5
But if I try to parallelize the code I get:
A sum: 9000284.0
B sum: 3000036.0
using 32 threads.
I would like to know why I can't parallelize the code that way.
As you are likely aware, your problem is in this for loop. You have a dependency between the two statements in the loop.
for(i=2; i<N1; i++)
{
A[i] = 35 / (7/B[i-1] + 2/A[i]);
B[i] = B[i] / (A[i-1]+2) + 3 / B[i];
}
We cannot know the order in which any given thread reaches either of those two statements. Therefore, when the second statement executes, for example, the value computed for B[i] will differ depending on whether A[i-1] has already been changed by another thread. The same can be said of A[i]'s dependency on the value of B[i-1]. A short and clear explanation of dependencies can be found at the following link; I recommend you take a look if this is still not clear. https://scs.senecac.on.ca/~gpu621/pages/content/omp_2.html
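For illustration, here is a sketch using the arrays from the question: the initialization loop has no cross-iteration dependence, so it parallelizes safely, whereas the recurrence loop reads values written by the previous iteration and therefore has to run in order unless it is rewritten mathematically.
/* Safe to parallelize: each iteration writes only its own A[i] and B[i]. */
#pragma omp parallel for private(i)
for (i = 0; i < N1; i++) {
    A[i] = 1;
    B[i] = 3;
}
/* Not safe to split across threads as written: iteration i reads A[i-1]
   and B[i-1], which iteration i-1 may not have produced yet. */
for (i = 2; i < N1; i++) {
    A[i] = 35 / (7 / B[i-1] + 2 / A[i]);
    B[i] = B[i] / (A[i-1] + 2) + 3 / B[i];
}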
This is the C code to edit with OpenMP directives to improve the execution speed; below is the structure of the file used by the program.
I've tried with:
#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
but it doesn't work: the result is wrong and differs from the serial one.
I think there is a logical error in the parallelization, meaning this algorithm has to be parallelized in another way.
// FEEDFORWARD AND BACKPROPAGATION ALGORITHM
// WITH IMPLEMENTATION OF BATCH TECHNIQUE
// compute the error in a batch of 5 inputs and then propagate the error, useful for the parallelization.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#define INPUTN 3 // number of neurons in the input layer
#define HN 3 // number of neurons in the hidden layer
#define OUTN 1 // number of neurons in the output layer
#define DATANUM 1000 // number of training samples
#define EPOCHS 1000
#define BATCH_SIZE 20
typedef struct DataS{
double input[INPUTN];
double teach;
}DataS;
int main(){
double alpha = 0.0000001; //learning rate
double hn_out[HN];
double price_M;
double y_out = 0.0;
double error; //loss function
int k,g;
double delta_y;
double delta_w[HN][INPUTN];
double delta_b[HN];
DataS data[DATANUM];
double w[HN][INPUTN];
double v[HN];
double b[HN];
FILE *fp1;
double relative_err = 0;
double y_avg = 0.0;
double y_out_avg = 0.0;
double y_exp_avg = 0.0;
//weights initialization
for(int i=0; i<HN; i++){
v[i]= 1.0;
for(int j=0; j<INPUTN; j++)
w[i][j]= 1.0;
b[i]=0.0;
}
//get Dataset
fp1 = fopen("Dataset_3.txt", "r");
if(fp1 == NULL)
{
printf("cannot open file");
exit(1);
}
for(int i=0;i<DATANUM; i++){
fscanf(fp1, "%lf\t%lf\t%lf\t%lf", &data[i].input[0], &data[i].input[1], &data[i].input[2], &data[i].teach);
printf("%lf\t%lf\t%lf\t%lf\n", data[i].input[0], data[i].input[1], data[i].input[2], data[i].teach);
y_avg += data[i].teach/DATANUM;
}
fclose(fp1);
//START ALGORITHM
double ti = omp_get_wtime(); //initial time
for (int i = 0; i < EPOCHS; i ++) {
printf("\nepoch %d) ", i);
relative_err=0;
#pragma omp parallel for reduction (+: hn_out, y_out) private (k,g) shared (y_out_avg, y_exp_avg)
for(int j=0; j<DATANUM/BATCH_SIZE; j++){
//FEEDFORWARD
//compute hn_out[HN]
int base = j*BATCH_SIZE;
printf("Avg of data:");
for(int i_b=0; i_b<BATCH_SIZE; i_b++){
printf(" %d", base+i_b);
for(k=0; k<HN; k++){
hn_out[k]= 0.0;
}
for(k=0; k<HN; k++){
for(g=0; g<INPUTN; g++){
hn_out[k]+= w[k][g]*data[base+i_b].input[g];
}
hn_out[k]+= b[k];
}
//compute y_out[OUTN]
y_out= 0.0;
for(g=0; g<HN; g++){
y_out += hn_out[g]*v[g];
}
y_out = y_out/HN;
y_out_avg += y_out/BATCH_SIZE;
y_exp_avg += data[base+i_b].teach/BATCH_SIZE;
}
//LOSS FUNCTION
error = pow((y_out_avg-y_exp_avg),2);
printf("\nESTIM_AVG\tREAL_AVG\tRELATIVE_ERROR");
relative_err = fabs((y_out_avg-y_exp_avg)/y_avg); // relative error: (computed price - expected price) / average expected value
printf("\n%lf\t%lf\t%lf\n", y_out_avg, y_exp_avg, relative_err);
//BACKPROPAGATION
//update bias and weight
for(k=0;k<HN;k++){
for(g=0; g<INPUTN; g++){
w[k][g] = w[k][g]-2*alpha*data[j].input[g]*(y_out_avg-y_exp_avg);
v[g]= v[g]-2*alpha*(y_out_avg-y_exp_avg);
}
b[k]= b[k]-2*alpha*(y_out_avg-y_exp_avg);
//b[k]= 0;
}
y_out_avg = 0.0;
y_exp_avg = 0.0;
}
}
double tf = omp_get_wtime(); //final time
double time = tf - ti; //effective time for the execution
printf ("Elapsed time: %lf\n", time);
return 0;
}
using a file "Dataset_3.txt" which have 1000 rows of data here an example of 10 data:
u can copy and paste and create a file of 1000 rows or edit the code to run it correctly.
121.3168139 6.873759459 7 322386.5042
99.60902165 4.63043755 7 284554.0498
135.7221604 6.663354979 4 284796.0999
133.7192657 3.496973506 7 343977.1519
155.0125801 2.259712681 8 390169.2343
152.0527816 3.643403786 4 309419.1429
64.71485146 5.10618215 7 235827.262
130.6841885 5.405015338 4 280079.0986
56.36704 1.557336041 5 193401.2459
96.33489022 2.840480371 4 234694.1379
I need some help to speed up the program execution using OpenMP.
The level at which you placed your OpenMP directive isn't the right one, as there are too many things in the j loop that are not meant to be executed in parallel.
However, you can consider parallelizing the i_b loop.
For this one, a good starting point would be, for example:
#pragma omp parallel for reduction(+:y_out_avg,y_exp_avg) private(k,g,y_out,hn_out)
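Dropped into the existing j loop, that would look roughly like the sketch below (reusing the variable names from the question; the printf calls are omitted since their output would interleave across threads):
#pragma omp parallel for reduction(+:y_out_avg,y_exp_avg) private(k,g,y_out,hn_out)
for (int i_b = 0; i_b < BATCH_SIZE; i_b++) {
    /* FEEDFORWARD for one sample of the batch */
    for (k = 0; k < HN; k++)
        hn_out[k] = 0.0;
    for (k = 0; k < HN; k++) {
        for (g = 0; g < INPUTN; g++)
            hn_out[k] += w[k][g] * data[base + i_b].input[g];
        hn_out[k] += b[k];
    }
    y_out = 0.0;
    for (g = 0; g < HN; g++)
        y_out += hn_out[g] * v[g];
    y_out = y_out / HN;
    /* the batch averages are combined across threads by the reduction */
    y_out_avg += y_out / BATCH_SIZE;
    y_exp_avg += data[base + i_b].teach / BATCH_SIZE;
}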
If/when you're happy with the correctness of the code and if you want to go further in the parallelisation, then you can consider the "BACKPROPAGATION" loops and see what could be done there...
I have been trying to parallelize computing the sum of a series with a certain number of terms, assigning blocks of terms to the processors (block allocation).
In this program, I am generating an arithmetic series and want to pass the array as a shared variable in the pragma omp parallel directive,
but an error occurs on the line #pragma omp parallel num_threads(comm_sz, number, BLOCK_LOW, BLOCK_HIGH, a[n], first, difference, global_sum1), giving the error below:
expected ')' before '[' token
I am new to OpenMP with C. I have written the code below and am facing the error above. I researched on Google but was unable to find a solution.
Kindly help me with how to declare an array as a shared variable in the pragma omp parallel directive. I am attaching the code below.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int main (int argc, char *argv[])
{
int rank, comm_sz;
int number, i, first, difference, global_sum1, global_sum, nprocs, step, local_sum1, local_n;
int* a;
int BLOCK_LOW, BLOCK_HIGH;
double t0, t1;
comm_sz = atoi(argv[1]);
first = atoi(argv[2]);
difference = atoi(argv[3]);
number = atoi(argv[4]);
omp_set_num_threads (comm_sz);
rank = omp_get_thread_num();
a = (int*) malloc (n*sizeof(int));
printf("comm_sz=%d, first=%d, difference=%d, number of terms=%d\n",comm_sz, first, difference, number);
for(i=1; i <= number; i++){
a[i-1] = first + (i-1)*difference;
printf("a[%d]=%d\n",i-1,a[i]);
}
for(i=0; i < number; i++){
printf("a[%d]=%d\n",i,a[i]);}
t0 = omp_get_wtime();
#pragma omp parallel num_threads(comm_sz, number, BLOCK_LOW, BLOCK_HIGH, a[n], first, difference, global_sum1)
{
BLOCK_LOW = (rank * number)/comm_sz;
BLOCK_HIGH = ((rank+1) * number)/comm_sz;
#pragma omp parallel while private(i, local_sum1)
//int local_sum1 = 0;
i=BLOCK_LOW;
while( i < BLOCK_HIGH )
{
printf("%d, %d\n",BLOCK_LOW,BLOCK_HIGH);
local_sum1 = local_sum1 + a[i];
i++;
}
//global_sum1 = global_sum1 + local_sum1;
#pragma omp while reduction(+:global_sum1)
i=0;
for (i < comm_sz) {
global_sum1 = global_sum1 + local_sum1;
i++;
}
}
step = 2*first + (n-1)*difference;
sum = 0.5*n*step;
printf("sum is %d\n", global_sum );
t1 = omp_get_wtime();
printf("Estimate of pi: %7.5f\n", global_sum1);
printf("Time: %7.2f\n", t1-t0);
}
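For reference, the data-sharing clauses take plain variable names (a, not a[n]), each clause is written separately, and num_threads takes a single expression; below is a rough sketch of how the directive and the block sum might look with the variables above (my guess at the intent, with global_sum1 combined via a reduction).
global_sum1 = 0;                                  /* must be initialized before the reduction */
#pragma omp parallel num_threads(comm_sz) shared(a, number) reduction(+:global_sum1)
{
    int rank = omp_get_thread_num();              /* thread id, taken inside the region */
    int low  = (rank * number) / comm_sz;         /* block bounds for this thread */
    int high = ((rank + 1) * number) / comm_sz;
    for (int i = low; i < high; i++)
        global_sum1 += a[i];
}
printf("sum is %d\n", global_sum1);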
I'm trying to parallelize the dot product operation and I measure the running time of the operation on various numbers of cores using OpenMP. I'm getting the result that if N=1e9, then for 1 core the CPU time is 5.6 seconds, for 8 cores 6.0 seconds, and for 16 cores 10.8 seconds. Why does the computation time rise when I use more cores?
Here's my code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>
#define DATA_TYPE float
const int N = 1e9;
int main ()
{
int i, nthreads, tid;
DATA_TYPE x_par, *y, *z, cput_par;
clock_t start, end;
y = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
z = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
for (i=0; i<N; i++) {
y[i] = i * 1.0;
z[i] = i * 2.0;
}
x_par = 0;
//nthreads = omp_get_max_threads();
nthreads = 1;
printf("n threads = %d\n", nthreads);
start=clock();
omp_set_num_threads(nthreads);
#pragma omp parallel for reduction(+:x_par)
for (i=0; i<N; i++)
{
x_par += y[i] * z[i];
}
end=clock();
cput_par = ((double)(end-start)/(double)(CLOCKS_PER_SEC));
printf("Parallel time use: %f\n", cput_par);
printf("x_par = %f\n", x_par);
return 0;
}
The fault was that the total CPU time of all cores/threads used was measured. To get the average CPU time per thread, that value needs to be divided by the number of threads. Another way to solve it is to measure the walltime (i.e. the difference between the actual time of day before and after the operation). If the walltime is used, the operating system might run another program in between, and that time is then also included in the walltime. To illustrate this, along with a comparison for a strictly sequential case, I post this code:
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h> //gettimeofday()
#include <time.h>
#include <omp.h>
#define DATA_TYPE float
const int N = 1e9;
int main ()
{
int i, nthreads, tid;
DATA_TYPE x_seq, x_par, *y, *z;
struct timeval time;
double tstart_cpu, tend_cpu, tstart_wall, tend_wall;
double walltime_seq, walltime_par, cputime_seq, cputime_par;
nthreads = 8;
printf("- - -DOT PROCUCT: OPENMP - - -\n");
printf("Vector size : %d\n", N);
printf("Number of threads used: %d\n", nthreads);
// INITIALIZATION
y = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
z = (DATA_TYPE*)malloc(sizeof(DATA_TYPE)*N);
for (i=0; i<N; i++) {
y[i] = i * 1.0;
z[i] = i * 2.0;
}
x_seq = 0;
x_par = 0;
// SEQUENTIAL CASE
gettimeofday(&time, NULL);
tstart_cpu = (double)clock()/CLOCKS_PER_SEC;
tstart_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
for (i=0; i<N; i++) x_seq += y[i] * z[i];
tend_cpu = (double)clock()/CLOCKS_PER_SEC;
gettimeofday(&time, NULL);
tend_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
cputime_seq = tend_cpu-tstart_cpu;
walltime_seq = tend_wall - tstart_wall;
printf("Sequential CPU time: %f\n", cputime_seq);
printf("Sequential Walltime: %f\n", walltime_seq);
printf("Sequential result : %f\n", x_seq);
// PARALLEL CASE
gettimeofday(&time, NULL);
tstart_cpu = (double)clock()/CLOCKS_PER_SEC;
tstart_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
omp_set_num_threads(nthreads);
#pragma omp parallel for reduction(+:x_par)
for (i=0; i<N; i++)
{
x_par += y[i] * z[i];
}
tend_cpu = (double)clock()/CLOCKS_PER_SEC;
gettimeofday(&time, NULL);
tend_wall = (double)time.tv_sec + (double)time.tv_usec * .000001;
cputime_par = tend_cpu - tstart_cpu;
walltime_par = tend_wall - tstart_wall;
cputime_par /= nthreads; // take the average cpu time per thread
printf("Parallel CPU time : %f\n", cputime_par);
printf("Parallel Walltime : %f\n", walltime_par);
printf("Parallel result : %f\n", x_par);
// SPEEDUP
printf("Speedup (cputime) : %f\n", cputime_seq/cputime_par);
printf("Speedup (walltime) : %f\n", walltime_seq/walltime_par);
return 0;
}
And a typical run of it outputs:
- - -DOT PRODUCT: OPENMP - - -
Vector size : 1000000000
Number of threads used: 8
Sequential CPU time: 4.871956
Sequential Walltime: 4.878946
Sequential result : 38685626227668133590597632.000000
Parallel CPU time : 0.751475
Parallel Walltime : 0.757933
Parallel result : 133586303067416523805032448.000000
Speedup (cputime) : 6.483191
Speedup (walltime) : 6.437172
As you can see, the resulting dot product is not correct, but this answers the initial question.
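As an aside, OpenMP provides omp_get_wtime() for wall-clock timing, which avoids mixing clock() and gettimeofday(); a minimal sketch of timing the same loop that way (assuming x_par has been reset to 0 beforehand):
double t0 = omp_get_wtime();                 /* wall-clock time before the loop */
#pragma omp parallel for reduction(+:x_par)
for (i = 0; i < N; i++)
    x_par += y[i] * z[i];
double t1 = omp_get_wtime();                 /* wall-clock time after the loop */
printf("Parallel walltime (omp_get_wtime): %f s\n", t1 - t0);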
I have been trying to create a multithreaded program that calculates the multiples of 3 and 5 from 1 to 999, but I can't seem to get it right: every time I run it I get a different value. I think it might have to do with the fact that I use a shared variable with 10 threads, but I have no idea how to get around that. Also, the program does work if I calculate the multiples of 3 and 5 from 1 to 9.
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <string.h>
#define NUM_THREADS 10
#define MAX 1000
//finds multiples of 3 and 5 and sums up all of the multiples
int main(int argc, char ** argv)
{
omp_set_num_threads(10);//set number of threads to be used in the parallel loop
unsigned int NUMS[1000] = { 0 };
int j = 0;
#pragma omp parallel
{
int ID = omp_get_thread_num();//get thread ID
int i;
for(i = ID + 1;i < MAX; i+= NUM_THREADS)
{
if( i % 5 == 0 || i % 3 == 0)
{
NUMS[j++] = i;//Store Multiples of 3 and 5 in an array to sum up later
}
}
}
int i = 0;
unsigned int total;
for(i = 0; NUMS[i] != 0; i++)total += NUMS[i];//add up multiples of 3 and 5
printf("Total : %d\n", total);
return 0;
}
"j++" is not an atomic operation.
It means "take the value contained at the storage location called j, use it in the current statement, add one to it, then store it back in the same location it came from".
(That's the simple answer. Optimization and whether or not the value is kept in a register can and will change things even more.)
When you have multiple threads doing that to the same variable all at the same time, you get different and unpredictable results.
You can use per-thread (private) variables to get around that.
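If you do want to keep the single shared index j, one possible fix (a sketch reusing i, j and NUMS from your code, placed inside the existing loop) is to reserve each array slot atomically:
int idx;            /* declared inside the parallel region, so each thread has its own */
#pragma omp atomic capture
idx = j++;          /* atomically read the current j and increment it */
NUMS[idx] = i;      /* each thread now fills a distinct slot */
This removes the race on j, although an atomic operation in every iteration has a cost, as the next answer points out.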
In your code j is a shared induction variable. You can't use shared induction variables efficiently with multiple threads (using atomic every iteration is not efficient).
You could find a special solution that does not use induction variables (for example using wheel factorization with seven spokes {0,3,5,6,9,10,12} out of 15), or you could find a general solution using private induction variables, like this:
#pragma omp parallel
{
int k = 0;
unsigned int NUMS_local[MAX] = {0};
#pragma omp for schedule(static) nowait reduction(+:total)
for(i=0; i<MAX; i++) {
if(i%5==0 || i%3==0) {
NUMS_local[k++] = i;
total += i;
}
}
#pragma omp for schedule(static) ordered
for(i=0; i<omp_get_num_threads(); i++) {
#pragma omp ordered
{
memcpy(&NUMS[j], NUMS_local, sizeof *NUMS *k);
j += k;
}
}
}
This solution does not make optimal use of memory, however. A better solution would use something like std::vector from C++, which you could implement in C using realloc, for example, but I'm not going to do that for you.
Edit:
Here is a special solution which does not use shared induction variables, using wheel factorization:
int wheel[] = {0,3,5,6,9,10,12};
int n = MAX/15;
#pragma omp parallel for reduction(+:total)
for(int i=0; i<n; i++) {
for(int k=0; k<7; k++) {
NUMS[7*i + k] = 15*i + wheel[k];
total += NUMS[7*i + k];
}
}
//now clean up for MAX not a multiple of 15
int j = n*7;
for(int i=n*15; i<MAX; i++) {
if(i%5==0 || i%3==0) {
NUMS[j++] = i;
total += i;
}
}
Edit: It's possible to do this without the critical section implied by the ordered clause. This does the memcpy in parallel and also makes better use of memory, at least for the shared array.
int *NUMS;
int *prefix;
int total=0, j;
#pragma omp parallel
{
int i;
int nthreads = omp_get_num_threads();
int ithread = omp_get_thread_num();
#pragma omp single
{
prefix = malloc(sizeof *prefix * (nthreads+1));
prefix[0] = 0;
}
int k = 0;
unsigned int NUMS_local[MAX] = {0};
#pragma omp for schedule(static) nowait reduction(+:total)
for(i=0; i<MAX; i++) {
if(i%5==0 || i%3==0) {
NUMS_local[k++] = i;
total += i;
}
}
prefix[ithread+1] = k;
#pragma omp barrier
#pragma omp single
{
for(i=1; i<nthreads; i++) prefix[i+1] += prefix[i]; // cumulative counts: prefix[t] = slots used by threads 0..t-1
NUMS = malloc(sizeof *NUMS * prefix[nthreads]);
j = prefix[nthreads];
}
memcpy(&NUMS[prefix[ithread]], NUMS_local, sizeof *NUMS *k);
}
free(prefix);
This is a typical thread synchronization issue. All you need to do is use a kernel synchronization object to make the desired operation atomic (incrementing the variable j in your case). It could be a mutex, a semaphore or an event object, depending on the operating system you're working on. But whatever your development environment is, to provide atomicity the fundamental flow logic should be like the following pseudo-code:
{
lock(kernel_object)
// ...
// do your critical operation (increment your variable j in your case)
// ++j;
// ...
unlock(kernel_object)
}
If you're working on the Windows operating system, there are some special synchronization mechanisms provided by the environment (e.g. InterlockedIncrement or CreateCriticalSection). If you're working on a Unix/Linux based operating system, you can use mutex or semaphore kernel synchronization objects. Actually, all of those synchronization mechanisms stem from the concept of semaphores, which was invented by Edsger W. Dijkstra at the beginning of the 1960s.
Here's some basic examples below:
Linux
#include <pthread.h>
pthread_mutex_t g_mutexObject = PTHREAD_MUTEX_INITIALIZER;
int main(int argc, char* argv[])
{
// ...
pthread_mutex_lock(&g_mutexObject);
++j; // incrementing j atomically
pthread_mutex_unlock(&g_mutexObject);
// ...
pthread_mutex_destroy(&g_mutexObject);
// ...
exit(EXIT_SUCCESS);
}
Windows
#include <Windows.h>
CRITICAL_SECTION g_csObject;
int main(void)
{
// ...
InitializeCriticalSection(&g_csObject);
// ...
EnterCriticalSection(&g_csObject);
++j; // incrementing j atomically
LeaveCriticalSection(&g_csObject);
// ...
DeleteCriticalSection(&g_csObject);
// ...
exit(EXIT_SUCCESS);
}
or just simply:
#include <Windows.h>
LONG volatile g_j; // our little j must be volatile in here now
int main(void)
{
// ...
InterlockedIncrement(&g_j); // incrementing j atomically
// ...
exit(EXIT_SUCCESS);
}
The problem you have is that threads don't necessarily execute in order, so the last thread to write may not have read the value in order, and you end up overwriting the wrong data.
OpenMP has an option to make the threads in a loop accumulate a sum when they finish: a reduction. You have to write something like this to use it:
#pragma omp parallel for reduction(+:sum)
for(k=0;k<num;k++)
{
sum = sum + A[k]*B[k];
}
/* End of the computation */
gettimeofday(&fin,NULL);
All you have to do is write the result into "sum"; this is from some old code I have that does a summation.
The other option you have is the dirty one: somehow make the threads wait and take turns using a call to the OS. This is easier than it looks. This would be a solution:
#pragma omp parallel
for(i = ID + 1;i < MAX; i+= NUM_THREADS)
{
printf("asdasdasdasdasdasdasdas");
if( i % 5 == 0 || i % 3 == 0)
{
NUMS[j++] = i;//Store Multiples of 3 and 5 in an array to sum up later
}
}
but I recommend you read the OpenMP options fully.
I'd like to get to know OpenMP a bit, because I'd like to have a huge loop parallelized. After some reading (SO, Common OMP mistakes, tutorial, etc.), I've taken as a first step the basically working C/MEX code given below (which yields different results for the first test case).
The first test sums up result values (functions serial, parallel);
the second takes values from an input array and writes the processed values to an output array (functions serial_a, parallel_a).
My questions are:
Why do the results of the first test differ, i.e. the results of the serial and parallel functions?
Surprisingly, the second test succeeds. My concern is how to handle memory (array locations) which may be read by multiple threads; in the example this is emulated by sin(a[i]) / cos(a[n-i]+1.0).
Are there some easy rules for determining which variables to declare as private, shared and reduction?
In both cases int i is declared outside the pragma, yet the second test appears to yield correct results. So is that okay, or does i have to be moved into the omp parallel region, as is said here?
Any other hints on spotted mistakes?
Code
#include "mex.h"
#include <math.h>
#include <omp.h>
#include <time.h>
double serial(int x)
{
double sum=0;
int i;
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
return sum;
}
double parallel(int x)
{
double sum=0;
int i;
#pragma omp parallel num_threads(6) shared(sum) //default(none)
{
//printf(" I'm thread no. %d\n", omp_get_thread_num());
#pragma omp for private(i, x) reduction(+: sum)
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
}
return sum;
}
void serial_a(double* a, int n, double* y2)
{
int i;
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
}
void parallel_a(double* a, int n, double* y2)
{
int i;
#pragma omp parallel num_threads(6)
{
#pragma omp for private(i)
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
}
}
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
double sum, *y1, *y2, *a, s, p;
int x, n, *d;
/* Check for proper number of arguments. */
if(nrhs!=2) {
mexErrMsgTxt("Two inputs required.");
} else if(nlhs>2) {
mexErrMsgTxt("Too many output arguments.");
}
/* Get pointer to first input */
x = (int)mxGetScalar(prhs[0]);
/* Get pointer to second input */
a = mxGetPr(prhs[1]);
d = (int*)mxGetDimensions(prhs[1]);
n = (int)d[1]; // row vector
/* Create space for output */
plhs[0] = mxCreateDoubleMatrix(2,1, mxREAL);
plhs[1] = mxCreateDoubleMatrix(n,2, mxREAL);
/* Get pointer to output array */
y1 = mxGetPr(plhs[0]);
y2 = mxGetPr(plhs[1]);
{ /* Do the calculation */
clock_t tic = clock();
y1[0] = serial(x);
s = (double) clock()-tic;
printf("serial....: %.0f ms\n", s);
mexEvalString("drawnow");
tic = clock();
y1[1] = parallel(x);
p = (double) clock()-tic;
printf("parallel..: %.0f ms\n", p);
printf("ratio.....: %.2f \n", p/s);
mexEvalString("drawnow");
tic = clock();
serial_a(a, n, y2);
s = (double) clock()-tic;
printf("serial_a..: %.0f ms\n", s);
mexEvalString("drawnow");
tic = clock();
parallel_a(a, n, &y2[n]);
p = (double) clock()-tic;
printf("parallel_a: %.0f ms\n", p);
printf("ratio.....: %.2f \n", p/s);
}
}
Output
>> mex omp1.c
>> [a, b] = omp1(1e8, 1:1e8);
serial....: 13399 ms
parallel..: 2810 ms
ratio.....: 0.21
serial_a..: 12840 ms
parallel_a: 2740 ms
ratio.....: 0.21
>> a(1) == a(2)
ans =
0
>> all(b(:,1) == b(:,2))
ans =
1
System
MATLAB Version: 8.0.0.783 (R2012b)
Operating System: Microsoft Windows 7 Version 6.1 (Build 7601: Service Pack 1)
Microsoft Visual Studio 2005 Version 8.0.50727.867
In your function parallel you have a few mistakes. The reduction should be declared where you use parallel. Private and shared variables should also be declared there. But when you do a reduction you should not declare the variable that is being reduced as shared; the reduction will take care of this.
To know what to declare private or shared you have to ask yourself which variables are being written to. If a variable is not written to, then normally you want it to be shared. In your case the variable x does not change, so you should declare it shared. The variable i, however, does change, so normally you should declare it private. So, to fix your function, you could do:
#pragma omp parallel reduction(+:sum) private(i) shared(x)
{
#pragma omp for
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
}
However, OpenMP automatically makes the loop iterator of a parallel for region private, and variables declared outside of parallel regions are shared by default, so for your parallel function you can simply do:
#pragma omp parallel for reduction(+:sum)
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
Notice that the only difference between this and your serial code is the pragma statement. OpenMP is designed so that you don't have to change your code except for pragma statements.
When it comes to arrays, as long as each iteration of a parallel for loop acts on a different array element, you don't have to worry about shared and private. So you can write your parallel_a function simply as:
#pragma omp parallel for
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
and once again it is the same as your serial_a function except for the pragma statement.
But be careful with assuming iterators are private. Consider the following double loop
for(i=0; i<n; i++) {
for(j=0; j<m; j++) {
//
}
}
If you use #pragma parallel for with that the i iterator will be made private but the j iterator will be shared. This is because the parallel for only applies to the outer loop over i and since j is shared by default it is not made private. In this case you would need to explicitly declare j private like this #pragma parallel for private(j).