I'm developing a code to simulate the response of some dynamical systems to my PhD research. To this end, I'm trying to implement parallelization with OpenMP to increase the performance of the code.
Basically I have a solution function parallel_dynamical_diagram_solution that calls other functions within the same file (realloc_vector, perturb_wolf, rk4, lyapunov_wolf, store_LE, get_attractor) that contains some operations and numerical methods. Also, the solution functions calls a pointer to a function edosys that is declared in another file. The solution function is shown below:
void parallel_dynamical_diagram_solution(int dim, int np, int ndiv, int trans, int *attrac, int maxper, double t, double **x, double *varparX, double *varparY, double *parrange, double *par, void (*edosys)(int, double *, double, double *, double *), int bifmode) {
// Allocate memory for x` = f(x)
double *f = malloc(dim * sizeof *f);
// Allocate memory for vectors necessary for lyapunov exponents calculation
double *cum = malloc(dim * sizeof *cum); // Cumulative Vector
double *lambda = malloc(dim *sizeof *lambda); // Lyapunov Exponents vector
double *s_cum = malloc(dim * sizeof *s_cum); // Short Cumulative Vector
double *s_lambda = malloc(dim * sizeof *s_lambda); // Short Lyapunov Exponents Vector
double *znorm = malloc(dim * sizeof *znorm); // Norm of Vectors
double *gsc = malloc((dim - 1) * sizeof *gsc); // Inner Products Vector
// Store Initial Conditions
double t0 = t;
double *IC = malloc(dim * sizeof *IC);
for (int i = 0; i < dim; i++) {
IC[i] = (*x)[i];
}
// Declare rk4 timestep, final time, short initial time and pi
double h, tf, s_T0;
const double pi = 4 * atan(1); // Pi number definition
// Declare and define increment of control parameters
double varstep[2];
varstep[0] = (parrange[1] - parrange[0])/(parrange[2] - 1); // -1 in the denominator ensures the input resolution
varstep[1] = (parrange[4] - parrange[3])/(parrange[5] - 1); // -1 in the denominator ensures the input resolution
// Numerical control parameters
int ndim = dim + (dim * dim); // Define new dimension to include linearized dynamical equations
// Declare vector and allocate memory to store poincare map values: poinc[number of permanent regime forcing periods][dimension original system]
double **poinc = malloc((np - trans) * sizeof **poinc);
for (int i = 0; i < np - trans; i++) {
poinc[i] = malloc(dim * sizeof **poinc);
}
// Declare vector to store the chosen Lyapunov Exponents to determine the attractor
double *LE = malloc(dim * sizeof *LE);
// Declare vector for temporary storage of periodicity values to check if all directions are equal
int *tmp_attrac = malloc(dim * sizeof *tmp_attrac);
// Declare variable to flag if all directions present same periodicity or not (0 = all the same, 1 = not the same)
int diffAttrac = -1;
// Prepare x vector to include perturbed values
realloc_vector(x, ndim);
// Starts the parallel block
int k;
#pragma omp parallel default(none) firstprivate(x, t, par, IC, varstep, diffAttrac, poinc, varparY, varparX) \
private(k, f, attrac, lambda, s_lambda, LE, cum, s_cum, znorm, gsc, h, tf, s_T0, tmp_attrac, edosys) \
shared(parrange, dim, ndim, np, ndiv, trans, t0, bifmode, maxper)
{
// Starts to increment bifurcation control parameter
#pragma omp for schedule(static)
// Loop for Y control parameter
for (k = 0; k < (int)parrange[5]; k++) {
// Value of Y control parameter based on index k
(*varparY) = parrange[3] + k*varstep[1];
printf("(Iteration: %d) varparY = %lf\n", k, (*varparY));
// Reset Initial conditions for the beggining of a horizontal line
for (int i = 0; i < dim; i++) {
(*x)[i] = IC[i];
printf("(Iteration: %d) x[%d] = %lf\n", k, i, (*x)[i]);
}
// Loop for X control parameter
for (int m = 0; m < (int)parrange[2]; m++) {
// Value of X control parameter based on index m
(*varparX) = parrange[0] + m*varstep[0];
printf("(Iteration: %d) a\n", k);
// Reset Variables
t = t0;
printf("(Iteration: %d) b\n", k);
for (int i = 0; i < dim; i++) {
lambda[i] = 0.0;
s_lambda[i] = 0.0;
LE[i] = 0.0;
printf("(Iteration: %d) c\n", k);
}
// Check the mode of the bifurcation
if (bifmode == 1) {
// Reset Initial conditions in each bifurcation step
for (int i = 0; i < dim; i++) {
(*x)[i] = IC[i];
}
}
// Vary timestep if varpar = par[0], varying also final time and short initial time
h = (2 * pi) / (ndiv * par[0]); // par[0] = OMEGA
tf = h*np*ndiv; // Final time
s_T0 = ((double) trans/ (double) np) * tf; // Advanced initial time
// Assign initial perturbation
perturb_wolf(x, dim, ndim, &cum, &s_cum);
// Call Runge-Kutta 4th order integrator N = nP * nDiv times
for (int i = 0; i < np; i++) {
for (int j = 0; j < ndiv; j++) {
rk4(ndim, *x, t, h, par, f, edosys);
lyapunov_wolf(x, t, h, dim, ndim, s_T0, &cum, &s_cum, &lambda, &s_lambda, &znorm, &gsc);
t = t + h;
// Apply poincare map at permanent regime
if (i >= trans) {
// Choose any point in the trajectory for poincare section placement
if (j == 1) {
// Stores poincare values in poinc[np - trans][dim] vector
for (int p = 0; p < dim; p++) {
poinc[i - trans][p] = (*x)[p];
}
}
}
}
}
// Define which lyapunov will be taken: lambda[dim] or s_lambda[dim]
store_LE(dim, lambda, s_lambda, LE);
// Verify the type of motion of the system
(*attrac) = get_attractor(poinc, LE, dim, np, trans, tmp_attrac, &diffAttrac, maxper);
printf("[k = %d] [m = %d]: Attractor = %d, lambda_1 = %lf, lambda_2 = %lf\n",k, m, (*attrac), lambda[0], lambda[1]);
}
}
// Free memory
free(f); free(cum); free(s_cum); free(lambda); free(s_lambda);
free(znorm); free(gsc); free(LE); free(tmp_attrac); free(IC);
for (int i = 0; i < np - trans; i++) {
free(poinc[i]);
}
free(poinc);
} // Ends Parallel Block
}
When I run the code, a segmentation fault error occurs as shown below:
(Iteration: 0) varparY = 0.010000
(Iteration: 0) x[0] = 1.000000
(Iteration: 0) x[1] = 0.000000
(Iteration: 0) a
(Iteration: 0) b
zsh: segmentation fault ./dyndiag
It appears to be happening when I assign the values of lambda, s_lambda and LE, but I don't know why as these variables are declared as private. I'm new to OpenMP and parallelization in general, could someone help me? What am I missing here?
Thanks in advance!
The easy part is to tell why you get segmentation faults: many of your variables are unitialized inside the parallel block (attrac, lambda, s_lambda, LE, cum, s_cum, gsc, etc). Consider the following code segment:
double *lambda = malloc(dim *sizeof *lambda);
#pragma omp parallel private(lambda)
{
lambda[0]=0; // lambda is unitialized --> undefined behavior
}
The private clause makes the pointer private, but will not initialize your private variable or allocate memory for each thread. The code above practically means the following (it may be easier to understand actually what is happening):
double *lambda = malloc(dim *sizeof *lambda);
#pragma omp parallel
{
double* lambda; // a local variable is created for each thread
lambda[0]=0; // BUT it is unitialized --> undefined behavior
}
The solution is to allocate (and free) the memory inside the parallel block:
#pragma omp parallel
{
double *lambda = malloc(dim *sizeof *lambda);
....
lambda[0]=0; // lambda is private and initialized -- OK
...
free(lambda);
}
This will solve your segmentation fault problem, but unfortunately your code will not work properly. You still have issues with your firstprvate variables, but the biggest problem is that you wish to parallelize iterations. As far as I understand your code the output of an iteration is the input of the next one. You simply cannot parallelize it, iteration is a sequential process. You should first read a decent book on OpenMP then rewrite your code and parallelize work inside an iteration.
Related
I am struggling to figure out how to parallelize this code with OpenMP, any help is appreciated. Below is the base code and a description.
In the simulation of a collection of soft particles (such as proteins in a fluid), there is a repulsive force between a pair of particles when they overlap. The goal of this assignment is to use parallel computing to accelerate the computation of these repulsive forces, using multiple cores with Open-MP.
In the force repulsion function, the particles are assumed to have unit radius. The particles are in a “simulation box” of dimensions L × L × L. The dimension L is chosen such that the volume fraction of particles is φ = 0.3. The simulation box has periodic (wrap-around) boundary conditions, which explains why we need to use the remainder function to compute the distance between two particles. If the particles overlap, i.e., the distance s between two particles is less than 2, then the repulsive force is proportional to k(2−s) where k is a force constant. The force is along the vector joining the two particles.
Write a program that tests the correctness of your code. This can be done by computing the correct forces and comparing them to the forces computed by your optimized code. Give evidence in your report that your program works correctly using your test program
How much faster is your accelerated code compared to the provided baseline code? Include timings for different problem sizes. Be sure to include a listing of your code in your report.
Code to parallelize
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
double get_walltime() {
struct timeval tp;
gettimeofday(&tp, NULL);
return (double) (tp.tv_sec + tp.tv_usec*1e-6); }
void force_repulsion(int np, const double *pos, double L, double krepulsion, double *forces)
{
int i, j;
double posi [4]; double rvec [4];
double s2, s, f;
// initialize forces to zero
for (i=0; i<3*np; i++)
forces[i] = 0.;
// loop over all pairs
for (i=0; i<np; i++)
{
posi[0] = pos[3*i ];
posi[1] = pos[3*i+1]; posi[2] = pos[3*i+2];
for (j=i+1; j<np; j++)
{
// compute minimum image difference
rvec[0] = remainder(posi[0] - pos[3*j ], L);
rvec[1] = remainder(posi[1] - pos[3*j+1], L);
rvec[2] = remainder(posi[2] - pos[3*j+2], L);
s2 = rvec [0]* rvec [0] + rvec [1]* rvec [1] + rvec [2]* rvec [2];
if (s2 < 4)
{
s = sqrt(s2);
rvec[0] /= s; rvec[1] /= s;
rvec[2] /= s;
f = krepulsion*(2.-s);
forces[3*i ] += f*rvec[0];
forces[3*i+1] += f*rvec[1];
forces[3*i+2] += f*rvec[2];
forces[3*j ] += -f*rvec[0];
forces[3*j+1] += -f*rvec[1];
forces[3*j+2] += -f*rvec[2]; }
} }
}
int main(int argc, char *argv[]) {
int i;
int np = 100; // default number of particles
double phi = 0.3; // volume fraction
double krepulsion = 125.; // force constant
double *pos; double *forces;
double L, time0 , time1;
if (argc > 1)
np = atoi(argv[1]);
L = pow(4./3.*3.1415926536*np/phi, 1./3.);
// generate random particle positions inside simulation box
forces = (double *) malloc(3*np*sizeof(double));
pos = (double *) malloc(3*np*sizeof(double));
for (i=0; i<3*np; i++)
pos[i] = rand()/(double)RAND_MAX*L;
// measure execution time of this function
time0 = get_walltime ();
force_repulsion(np, pos, L, krepulsion, forces);
time1 = get_walltime ();
printf("number of particles: %d\n", np);
printf("elapsed time: %f\n", time1-time0);
free(forces);
free(pos);
return 0; }
Theoretically, it would be as simple as this:
void force_repulsion(int np, const double *pos, double L, double krepulsion,
double *forces)
{
// initialize forces to zero
#pragma omp parallel for
for (int i = 0; i < 3 * np; i++)
forces[i] = 0.;
// loop over all pairs
#pragma omp parallel for
for (int i = 0; i < np; i++)
{
double posi[4];
double rvec[4];
double s2, s, f;
posi[0] = pos[3 * i];
//...
Compilation:
g++ -fopenmp example.cc -o example
Note that I did not check for correctness. Make sure you won't have global variable inside the parallel for (as I updated your code..)
!!! HOMEWORK - ASSIGNMENT !!!
Please do not post code as I would like to complete myself but rather if possible point me in the right direction with general information or by pointing out mistakes in thought or other possible useful and relevant resources.
I have a method that creates my square npages * npages matrix hat of double for use in my pagerank algorithm.
I have made it with pthreads, SIMD and with both pthreads and SIMD. I have used xcode instruments time profiler and found that the pthreads only version is the fastest, next is the SIMD only version and slowest is the version with both SIMD and pthreads.
As it is homework it can be run on multiple different machines however we were given the header #include so it is to be assumed we can use upto AVX at least. We are given how many threads the program will use as the argument to the program and store it in a global variable g_nthreads.
In my tests I have been testing it on my machine which is an IvyBridge with 4 hardware cores and 8 logical cores and I've been testing it with 4 threads as an arguments and with 8 threads as an argument.
RUNNING TIMES:
SIMD ONLY:
*331ms - for consturct_matrix_hat function *
PTHREADS ONLY (8 threads):
70ms - each thread concurrently
SIMD & PTHREADS (8 threads):
110ms - each thread concurrently
What am I doing that is slowing it down more when using both forms of optimisation?
I will post each implementation:
All versions share these macros:
#define BIG_CHUNK (g_n2/g_nthreads)
#define SMALL_CHUNK (g_npages/g_nthreads)
#define MOD BIG_CHUNK - (BIG_CHUNK % 4)
#define IDX(a, b) ((a * g_npages) + b)
Pthreads:
// struct used for passing arguments
typedef struct {
double* restrict m;
double* restrict m_hat;
int t_id;
char padding[44];
} t_arg_matrix_hat;
// Construct matrix_hat with pthreads
static void* pthread_construct_matrix_hat(void* arg) {
t_arg_matrix_hat* t_arg = (t_arg_matrix_hat*) arg;
// set coordinate limits thread is able to act upon
size_t start = t_arg->t_id * BIG_CHUNK;
size_t end = t_arg->t_id + 1 != g_nthreads ? (t_arg->t_id + 1) * BIG_CHUNK : g_n2;
// Initialise coordinates with given uniform value
for (size_t i = start; i < end; i++) {
t_arg->m_hat[i] = ((g_dampener * t_arg->m[i]) + HAT);
}
return NULL;
}
// Construct matrix_hat
double* construct_matrix_hat(double* matrix) {
double* matrix_hat = malloc(sizeof(double) * g_n2);
// create structs to send and retrieve matrix and value from threads
t_arg_matrix_hat t_args[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
t_args[i] = (t_arg_matrix_hat) {
.m = matrix,
.m_hat = matrix_hat,
.t_id = i
};
}
// create threads and send structs with matrix and value to divide the matrix and
// initialise the coordinates with the given value
pthread_t threads[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
pthread_create(threads + i, NULL, pthread_construct_matrix_hat, t_args + i);
}
// join threads after all coordinates have been intialised
for (size_t i = 0; i < g_nthreads; i++) {
pthread_join(threads[i], NULL);
}
return matrix_hat;
}
SIMD:
// Construct matrix_hat
double* construct_matrix_hat(double* matrix) {
double* matrix_hat = malloc(sizeof(double) * g_n2);
double dampeners[4] = {g_dampener, g_dampener, g_dampener, g_dampener};
__m256d b = _mm256_loadu_pd(dampeners);
// Use simd to subtract values from each other
for (size_t i = 0; i < g_mod; i += 4) {
__m256d a = _mm256_loadu_pd(matrix + i);
__m256d res = _mm256_mul_pd(a, b);
_mm256_storeu_pd(&matrix_hat[i], res);
}
// Subtract values from each other that weren't included in simd
for (size_t i = g_mod; i < g_n2; i++) {
matrix_hat[i] = g_dampener * matrix[i];
}
double hats[4] = {HAT, HAT, HAT, HAT};
b = _mm256_loadu_pd(hats);
// Use simd to raise each value to the power 2
for (size_t i = 0; i < g_mod; i += 4) {
__m256d a = _mm256_loadu_pd(matrix_hat + i);
__m256d res = _mm256_add_pd(a, b);
_mm256_storeu_pd(&matrix_hat[i], res);
}
// Raise each value to the power 2 that wasn't included in simd
for (size_t i = g_mod; i < g_n2; i++) {
matrix_hat[i] += HAT;
}
return matrix_hat;
}
Pthreads & SIMD:
// struct used for passing arguments
typedef struct {
double* restrict m;
double* restrict m_hat;
int t_id;
char padding[44];
} t_arg_matrix_hat;
// Construct matrix_hat with pthreads
static void* pthread_construct_matrix_hat(void* arg) {
t_arg_matrix_hat* t_arg = (t_arg_matrix_hat*) arg;
// set coordinate limits thread is able to act upon
size_t start = t_arg->t_id * BIG_CHUNK;
size_t end = t_arg->t_id + 1 != g_nthreads ? (t_arg->t_id + 1) * BIG_CHUNK : g_n2;
size_t leftovers = start + MOD;
__m256d b1 = _mm256_loadu_pd(dampeners);
//
for (size_t i = start; i < leftovers; i += 4) {
__m256d a1 = _mm256_loadu_pd(t_arg->m + i);
__m256d r1 = _mm256_mul_pd(a1, b1);
_mm256_storeu_pd(&t_arg->m_hat[i], r1);
}
//
for (size_t i = leftovers; i < end; i++) {
t_arg->m_hat[i] = dampeners[0] * t_arg->m[i];
}
__m256d b2 = _mm256_loadu_pd(hats);
//
for (size_t i = start; i < leftovers; i += 4) {
__m256d a2 = _mm256_loadu_pd(t_arg->m_hat + i);
__m256d r2 = _mm256_add_pd(a2, b2);
_mm256_storeu_pd(&t_arg->m_hat[i], r2);
}
//
for (size_t i = leftovers; i < end; i++) {
t_arg->m_hat[i] += hats[0];
}
return NULL;
}
// Construct matrix_hat
double* construct_matrix_hat(double* matrix) {
double* matrix_hat = malloc(sizeof(double) * g_n2);
// create structs to send and retrieve matrix and value from threads
t_arg_matrix_hat t_args[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
t_args[i] = (t_arg_matrix_hat) {
.m = matrix,
.m_hat = matrix_hat,
.t_id = i
};
}
// create threads and send structs with matrix and value to divide the matrix and
// initialise the coordinates with the given value
pthread_t threads[g_nthreads];
for (size_t i = 0; i < g_nthreads; i++) {
pthread_create(threads + i, NULL, pthread_construct_matrix_hat, t_args + i);
}
// join threads after all coordinates have been intialised
for (size_t i = 0; i < g_nthreads; i++) {
pthread_join(threads[i], NULL);
}
return matrix_hat;
}
I think it's because your SIMD code is horribly inefficient: It loops over the memory twice, instead of doing the add with the multiply, before storing. You didn't test SIMD vs. a scalar baseline, but if you had you'd probably find that your SIMD code wasn't a speedup with a single thread either.
STOP READING HERE if you want to solve the rest of your homework yourself.
If you used gcc -O3 -march=ivybridge, the simple scalar loop in the pthread version probably auto-vectorized into something like what you should have done with intrinsics. You even used restrict, so it might realize that the pointers can't overlap with each other, or with g_dampener.
// this probably autovectorizes well.
// Initialise coordinates with given uniform value
for (size_t i = start; i < end; i++) {
t_arg->m_hat[i] = ((g_dampener * t_arg->m[i]) + HAT);
}
// but this would be even safer to help the compiler's aliasing analysis:
double dampener = g_dampener; // in case the compiler things one of the pointers might point at the global
double *restrict hat = t_arg->hat;
const double *restrict mat = t_arg->m;
... same loop but using these locals instead of
It's probably not a problem for an FP loop, since double definitely can't alias with double *.
The coding style is also pretty nasty. You should give meaningful names to your __m256d variables whenever possible.
Also, you use malloc, which doesn't guarantee that matrix_hat will be aligned to a 32B boundary. C11's aligned_alloc is probably the nicest way, vs. posix_memalign (clunky interface), _mm_malloc (have to free with _mm_free, not free(3)), or other options.
double* construct_matrix_hat(const double* matrix) {
// double* matrix_hat = malloc(sizeof(double) * g_n2);
double* matrix_hat = aligned_alloc(64, sizeof(double) * g_n2);
// double dampeners[4] = {g_dampener, g_dampener, g_dampener, g_dampener}; // This idiom is terrible, and might actually compile to code that stores it 4 times on the stack and then loads.
__m256d vdamp = _mm256_set1_pd(g_dampener); // will compile to a broadcast-load (vbroadcastsd)
__m256d vhat = _mm256_set1_pd(HAT);
size_t last_full_vector = g_n2 & ~3ULL; // don't load this from a global.
// it's better for the compiler to see how it's calculated from g_n2
// ??? Use simd to subtract values from each other // huh? this is a multiply, not a subtract. Also, everyone can see it's using SIMD, that part adds no new information
// if you really want to manually vectorize this, instead of using an OpenMP pragma or -O3 on the scalar loop, then:
for (size_t i = 0; i < last_full_vector; i += 4) {
__m256d vmat = _mm256_loadu_pd(matrix + i);
__m256d vmul = _mm256_mul_pd(vmat, vdamp);
__m256d vres = _mm256_add_pd(vmul, vhat);
_mm256_store_pd(&matrix_hat[i], vres); // aligned store. Doesn't matter for performance.
}
#if 0
// Scalar cleanup
for (size_t i = last_vector; i < g_n2; i++) {
matrix_hat[i] = g_dampener * matrix[i] + HAT;
}
#else
// assume that g_n2 >= 4, and do a potentially-overlapping unaligned vector
if (last_full_vector != g_n2) {
// Or have this always run, and have the main loop stop one element sooner (so this overlaps by 0..3 instead of by 1..3 with a conditional)
assert(g_n2 >= 4);
__m256d vmat = _mm256_loadu_pd(matrix + g_n2 - 4);
__m256d vmul = _mm256_mul_pd(vmat, vdamp);
__m256d vres = _mm256_add_pd(vmul, vhat);
_mm256_storeu_pd(&matrix_hat[g_n2-4], vres);
}
#endif
return matrix_hat;
}
This version compiles (after defining a couple globals) to the asm we expect. BTW, normal people pass sizes around as function arguments. This is another way of avoiding optimization-failure due to C aliasing rules.
Anyway, really your best bet is to let OpenMP auto-vectorize it, because then you don't have to write a cleanup loop yourself. There's nothing tricky about the data organization, so it vectorizes trivially. (And it's not a reduction, like in your other question, so there's no loop-carried dependency or order-of-operations concern).
I'm trying to implement a kernel which does parallel reduction. The code below works on occasion, I have not been able to pin down why it goes wrong on the occasions it does.
__kernel void summation(__global float* input, __global float* partialSum, __local float *localSum){
int local_id = get_local_id(0);
int workgroup_size = get_local_size(0);
localSum[local_id] = input[get_global_id(0)];
for(int step = workgroup_size/2; step>0; step/=2){
barrier(CLK_LOCAL_MEM_FENCE);
if(local_id < step){
localSum[local_id] += localSum[local_id + step];
}
}
if(local_id == 0){
partialSum[get_group_id(0)] = localSum[0];
}}
Essentially I'm summing the values per work group and storing each work group's total into partialSum, the final summation is done on the host. Below is the code which sets up the values for the summation.
size_t global[1];
size_t local[1];
const int DATA_SIZE = 15000;
float *input = NULL;
float *partialSum = NULL;
int count = DATA_SIZE;
local[0] = 2;
global[0] = count;
input = (float *)malloc(count * sizeof(float));
partialSum = (float *)malloc(global[0]/local[0] * sizeof(float));
int i;
for (i = 0; i < count; i++){
input[i] = (float)i+1;
}
I'm thinking it has something to do when the size of the input is not a power of two? I noticed it begins to go off for numbers around 8000 and beyond. Any assistance is welcome. Thanks.
I'm thinking it has something to do when the size of the input is not a power of two?
Yes. Consider what happens when you try to reduce, say, 9 elements. Suppose you launch 1 work-group of 9 work-items:
for (int step = workgroup_size / 2; step > 0; step /= 2){
// At iteration 0: step = 9 / 2 = 4
barrier(CLK_LOCAL_MEM_FENCE);
if (local_id < step) {
// Branch taken by threads 0 to 3
// Only 8 numbers added up together!
localSum[local_id] += localSum[local_id + step];
}
}
You're never summing the 9th element, hence the reduction is incorrect. An easy solution is to pad the input data with enough zeroes to make the work-group size the immediate next power-of-two.
I'd like get to know OpenMP a bit, cause I'd like to have a huge loop parallelized. After some reading (SO, Common OMP mistakes, tutorial, etc), I've taken as a first step the basically working c/mex code given below (which yields different results for the first test case).
The first test does sum up result values - functions serial, parallel -,
the second takes values from an input array and writes the processed values to an output array - functions serial_a, parallel_a.
My questions are:
Why differ the results of the first test, i. e. the results of the serial and parallel
Suprisingly the second test succeeds. My concern is about, how to handle memory (array locations) which possibly are read by multiple threads? In the example this should be emulated by a[i])/cos(a[n-i].
Are there some easy rules how to determine which variables to declare as private, shared and reduction?
In both cases int i is outside the pragma, however the second test appears to yield correct results. So is that okay or has i to be moved into the pragma omp parallel region, as being said here?
Any other hints on spoted mistakes?
Code
#include "mex.h"
#include <math.h>
#include <omp.h>
#include <time.h>
double serial(int x)
{
double sum=0;
int i;
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
return sum;
}
double parallel(int x)
{
double sum=0;
int i;
#pragma omp parallel num_threads(6) shared(sum) //default(none)
{
//printf(" I'm thread no. %d\n", omp_get_thread_num());
#pragma omp for private(i, x) reduction(+: sum)
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
}
return sum;
}
void serial_a(double* a, int n, double* y2)
{
int i;
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
}
void parallel_a(double* a, int n, double* y2)
{
int i;
#pragma omp parallel num_threads(6)
{
#pragma omp for private(i)
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
}
}
void mexFunction(int nlhs, mxArray* plhs[], int nrhs, const mxArray* prhs[])
{
double sum, *y1, *y2, *a, s, p;
int x, n, *d;
/* Check for proper number of arguments. */
if(nrhs!=2) {
mexErrMsgTxt("Two inputs required.");
} else if(nlhs>2) {
mexErrMsgTxt("Too many output arguments.");
}
/* Get pointer to first input */
x = (int)mxGetScalar(prhs[0]);
/* Get pointer to second input */
a = mxGetPr(prhs[1]);
d = (int*)mxGetDimensions(prhs[1]);
n = (int)d[1]; // row vector
/* Create space for output */
plhs[0] = mxCreateDoubleMatrix(2,1, mxREAL);
plhs[1] = mxCreateDoubleMatrix(n,2, mxREAL);
/* Get pointer to output array */
y1 = mxGetPr(plhs[0]);
y2 = mxGetPr(plhs[1]);
{ /* Do the calculation */
clock_t tic = clock();
y1[0] = serial(x);
s = (double) clock()-tic;
printf("serial....: %.0f ms\n", s);
mexEvalString("drawnow");
tic = clock();
y1[1] = parallel(x);
p = (double) clock()-tic;
printf("parallel..: %.0f ms\n", p);
printf("ratio.....: %.2f \n", p/s);
mexEvalString("drawnow");
tic = clock();
serial_a(a, n, y2);
s = (double) clock()-tic;
printf("serial_a..: %.0f ms\n", s);
mexEvalString("drawnow");
tic = clock();
parallel_a(a, n, &y2[n]);
p = (double) clock()-tic;
printf("parallel_a: %.0f ms\n", p);
printf("ratio.....: %.2f \n", p/s);
}
}
Output
>> mex omp1.c
>> [a, b] = omp1(1e8, 1:1e8);
serial....: 13399 ms
parallel..: 2810 ms
ratio.....: 0.21
serial_a..: 12840 ms
parallel_a: 2740 ms
ratio.....: 0.21
>> a(1) == a(2)
ans =
0
>> all(b(:,1) == b(:,2))
ans =
1
System
MATLAB Version: 8.0.0.783 (R2012b)
Operating System: Microsoft Windows 7 Version 6.1 (Build 7601: Service Pack 1)
Microsoft Visual Studio 2005 Version 8.0.50727.867
In your function parallel you have a few mistakes. The reduction should be declared when you use parallel. Private and share variables should also be declared when you use parallel. But when you do a reduction you should not declare the variable that is being reduced as shared. The reduction will take care of this.
To know what to declare private or shared you have to ask yourself which variables are being written to. If a variable is not being written to then normally you want it to be shared. In your case the variable x does not change so you should declare it shared. The variable i, however, does change so normally you should declare it private so to fix your function you could do
#pragma omp parallel reduction(+:sum) private(i) shared(x)
{
#pragma omp for
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
}
However, OpenMP automatically makes the iterator of a parallel for region private and variables declared outside of parallel regions are shared by default so for your parallel function you can simply do
#pragma omp parallel for reduction(+:sum)
for(i = 0; i<x; i++){
sum += sin(x*i) / cos(x*i+1.0);
}
Notice that the only difference between this and your serial code is the pragma statment. OpenMP is designed so that you don't have to change your code except for pragma statments.
When it comes to arrays as long as each iteration of a parallel for loop acts on a different array element then you don't have to worry about shared and private. So you can write your private_a function simply as
#pragma omp parallel for
for(i = 0; i<n; i++){
y2[i] = sin(a[i]) / cos(a[n-i]+1.0);
}
and once again it is the same as your serial_a function except for the pragma statement.
But be careful with assuming iterators are private. Consider the following double loop
for(i=0; i<n; i++) {
for(j=0; j<m; j++) {
//
}
}
If you use #pragma parallel for with that the i iterator will be made private but the j iterator will be shared. This is because the parallel for only applies to the outer loop over i and since j is shared by default it is not made private. In this case you would need to explicitly declare j private like this #pragma parallel for private(j).
/*
Matricefilenames:
small matrix A.bin of dimension 100 × 50
small matrix B.bin of dimension 50 × 100
large matrix A.bin of dimension 1000 × 500
large matrix B.bin of dimension 500 × 1000
An MPI program should be implemented such that it can
• accept two file names at run-time,
• let process 0 read the A and B matrices from the two data files,
• let process 0 distribute the pieces of A and B to all the other processes,
• involve all the processes to carry out the the chosen parallel algorithm
for matrix multiplication C = A * B ,
• let process 0 gather, from all the other processes, the different pieces
of C ,
• let process 0 write out the entire C matrix to a data file.
*/
int main(int argc, char *argv[]) {
printf("Oblig 2 \n");
double **matrixa;
double **matrixb;
int ma,na,my_ma,my_na;
int mb,nb,my_mb,my_nb;
int i,j,k;
int myrank,numprocs;
int konstanta,konstantb;
MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
if(myrank==0) {
read_matrix_binaryformat ("small_matrix_A.bin", &matrixa, &ma, &na);
read_matrix_binaryformat ("small_matrix_B.bin", &matrixb, &mb, &nb);
}
//mpi broadcast
MPI_Bcast(&ma,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&mb,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&na,1,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&nb,1,MPI_INT,0,MPI_COMM_WORLD);
fflush(stdout);
int resta = ma % numprocs;//rest antall som har den største verdien
//int restb = mb % numprocs;
if (myrank == 0) {
printf("ma : %d",ma);
fflush(stdout);
printf("mb : %d",mb);
fflush(stdout);
}
MPI_Barrier(MPI_COMM_WORLD);
if (resta == 0) {
my_ma = ma / numprocs;
printf("null rest\n ");
fflush(stdout);
} else {
if (myrank < resta) {
my_ma = ma / numprocs + 1;//husk + 1
} else {
my_ma = ma / numprocs; //heltalls divisjon gir nedre verdien !
}
}
my_na = na;
my_nb = nb;
double **myblock = malloc(my_ma*sizeof(double*));
for(i=0;i<na;i++) {
myblock[i] = malloc(my_na*sizeof(double));
}
//send_cnt for scatterv
//________________________________________________________________________________________________________________________________________________
int* send_cnta = (int*)malloc(numprocs*sizeof(int));//array med antall elementer sendt til hver prosess array[i] = antall elementer , i er process
int tot_elemsa = my_ma*my_na;
MPI_Allgather(&tot_elemsa,1,MPI_INT,&send_cnta[0],1,MPI_INT,MPI_COMM_WORLD);//arrays i c må sendes &array[0]
//send_disp for scatterv
//__________________________________________________________________________________
int* send_dispa = (int*)malloc(numprocs*sizeof(int)); //hvorfor trenger disp
// int* send_dispb = (int*)malloc(numprocs*sizeof(int));
//disp hvor i imagechars første element til hver prosess skal til
fflush(stdout);
if(resta==0) {
send_dispa[myrank]=myrank*my_ma*my_na;
} else if(myrank<=resta) {
if(myrank<resta) {
send_dispa[myrank]=myrank*my_ma*my_na;
} else {//my_rank == rest
send_dispa[myrank]=myrank*(my_ma+1)*my_na;
konstanta=myrank*(my_ma+1)*my_na;
}
}
MPI_Bcast(&konstanta,1,MPI_INT,resta,MPI_COMM_WORLD);
if (myrank>resta){
send_dispa[myrank]=((myrank-resta)*(my_ma*my_na))+konstanta;
}
MPI_Allgather(&send_dispa[myrank],1,MPI_INT,&send_dispa[0],1,MPI_INT,MPI_COMM_WORLD);
//___________________________________________________________________________________
printf("print2: %d" , myrank);
fflush(stdout);
//recv_buffer for scatterv
double *recv_buffera=malloc((my_ma*my_na)*sizeof(double));
MPI_Scatterv(&matrixa[0], &send_cnta[0], &send_dispa[0], MPI_UNSIGNED_CHAR, &recv_buffera[0], my_ma*my_na, MPI_UNSIGNED_CHAR, 0, MPI_COMM_WORLD);
for(i=0; i<my_ma; i++) {
for(j=0; j<my_na; j++) {
myblock[i][j]=recv_buffera[i*my_na + j];
}
}
MPI_Finalize();
return 0;
}
OLD:I get three type of errors. I can get scatterv count error, segmentationfault 11, or the processes just get stuck. It seems to be random which error I get. I run the code with 2 procs each time. When it gets stuck it gets stuck before the printf("print2: %d" , myrank);. When my friend runs the code on his own computer also with two prosesses, he does not get past by the first MPI_Bcast. Nothing is printed out when he runs it. Here is a link for the errors I get: http://justpaste.it/zs0
UPDATED PROBLEM: Now I get only a segmentation fault after " printf("print2: %d" , myrank); " before the scatterv call. EVEN if I remove all the code after the printf statement I get the segmentation fault, but only if I run the code for more than two procs.
I'm having a little difficulty tracing what you were trying to do. I think you're making the scatterv call more complicated than it needs to be though. Here's a snippet I had from a similar assignment this year. Hopefully it's a clearer example of how scatterv works.
/*********************************************************************
* Scatter A to All Processes
* - Using Scatterv for versatility.
*********************************************************************/
int *send_counts; // Send Counts
int *displacements; // Send Offsets
int chunk; // Number of Rows per Process (- Root)
int chunk_size; // Number of Doubles per Chunk
int remainder; // Number of Rows for Root Process
double * rbuffer; // Receive Buffer
// Do Some Math
chunk = m / (p - 1);
remainder = m % (p - 1);
chunk_size = chunk * n;
// Setup Send Counts
send_counts = malloc(p * sizeof(int));
send_counts[0] = remainder * n;
for (i = 1; i < p; i++)
send_counts[i] = chunk_size;
// Setup Displacements
displacements = malloc(p * sizeof(int));
displacements[0] = 0;
for (i = 1; i < p; i++)
displacements[i] = (remainder * n) + ((i - 1) * chunk_size);
// Allocate Receive Buffer
rbuffer = malloc(send_counts[my_rank] * sizeof(double));
// Scatter A Over All Processes!
MPI_Scatterv(A, // A
send_counts, // Array of counts [int]
displacements, // Array of displacements [int]
MPI_DOUBLE, // Sent Data Type
rbuffer, // Receive Buffer
send_counts[my_rank], // Receive Count - Per Process
MPI_DOUBLE, // Received Data Type
root, // Root
comm); // Comm World
MPI_Barrier(comm);
Also, this causes a segfault on my machine, no mpi... Pretty sure it's the way myblock is being allocated. You should do what #Hristo suggested in the comments. Allocate both matrices and the resultant matrix as flat arrays. That would eliminate the use of double pointers and make your life a whole lot simpler.
#include <stdio.h>
#include <stdlib.h>
void main ()
{
int na = 5;
int my_ma = 5;
int my_na = 5;
int i;
int j;
double **myblock = malloc(my_ma*sizeof(double*));
for(i=0;i<na;i++) {
myblock = malloc(my_na*sizeof(double));
}
unsigned char *recv_buffera=malloc((my_ma*my_na)*sizeof(unsigned char));
for(i=0; i<my_ma; i++) {
for(j=0; j<my_na; j++) {
myblock[i][j]=(float)recv_buffera[i*my_na + j];
}
}
}
Try allocating more like this:
// Allocate A, b, and y. Generate random A and b
double *buff=0;
if (my_rank==0)
{
int A_size = m*n, b_size = n, y_size = m;
int size = (A_size+b_size+y_size)*sizeof(double);
buff = (double*)malloc(size);
if (buff==NULL)
{
printf("Process %d failed to allocate %d bytes\n", my_rank, size);
MPI_Abort(comm,-1);
return 1;
}
// Set pointers
A = buff; b = A+m*n; y = b+n;
// Generate matrix and vector
genMatrix(m, n, A);
genVector(n, b);
}