I want to do a reduction on an array using OpenMP and SIMD. I read that a reduction in OpenMP is equivalent to:
inline float sum_scalar_openmp2(const float a[], const size_t N) {
float sum = 0.0f;
#pragma omp parallel
{
float sum_private = 0.0f;
#pragma omp for nowait
for(int i=0; i<N; i++) {
sum_private += a[i];
}
#pragma omp atomic
sum += sum_private;
}
return sum;
}
I got this idea from the following link:
http://bisqwit.iki.fi/story/howto/openmp/#ReductionClause
But atomic does not support complex operators such as SSE intrinsics either. So I replaced atomic with critical and implemented the reduction with OpenMP and SSE like this:
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
inline float sum_vector4_openmp(const float a[], const size_t N) {
__m128 sum4 = _mm_set1_ps(0.0f);
#pragma omp parallel
{
__m128 sum4_private = _mm_set1_ps(0.0f);
#pragma omp for nowait
for(int i=0; i < ROUND_DOWN(N, 4); i+=4) {
__m128 a4 = _mm_load_ps(a + i);
sum4_private = _mm_add_ps(a4, sum4_private);
}
#pragma omp critical
sum4 = _mm_add_ps(sum4_private, sum4);
}
__m128 t1 = _mm_hadd_ps(sum4,sum4);
__m128 t2 = _mm_hadd_ps(t1,t1);
float sum = _mm_cvtss_f32(t2);
for(int i = ROUND_DOWN(N, 4); i < N; i++) {
sum += a[i];
}
return sum;
}
However, this function does not perform as well as I hoped. I'm using Visual Studio 2012 Express. I know I can improve the performance a bit by unrolling the SSE load/add a few times, but that is still less than I expect.
I get much better performance by splitting the array into as many slices as there are threads:
inline float sum_slice(const float a[], const size_t N) {
int nthreads = 4;
const int offset = ROUND_DOWN(N/nthreads, nthreads);
float suma[8] = {0};
#pragma omp parallel for num_threads(nthreads)
for(int i=0; i<nthreads; i++) {
suma[i] = sum_vector4(&a[i*offset], offset);
}
float sum = 0.0f;
for(int i=0; i<nthreads; i++) {
sum += suma[i];
}
for(int i=nthreads*offset; i < N; i++) {
sum += a[i];
}
return sum;
}
inline float sum_vector4(const float a[], const size_t N) {
__m128 sum4 = _mm_set1_ps(0.0f);
int i = 0;
for(; i < ROUND_DOWN(N, 4); i+=4) {
__m128 a4 = _mm_load_ps(a + i);
sum4 = _mm_add_ps(sum4, a4);
}
__m128 t1 = _mm_hadd_ps(sum4,sum4);
__m128 t2 = _mm_hadd_ps(t1,t1);
float sum = _mm_cvtss_f32(t2);
for(; i < N; i++) {
sum += a[i];
}
return sum;
}
Does someone know if there is a better way of doing reductions with more complicated operators in OpenMP?
I guess the answer to your question is no: I don't think there is a better way of doing reductions with more complicated operators in OpenMP.
Assuming that the array is 16-byte aligned and the number of OpenMP threads is 4, one might expect a 12x - 16x performance gain from OpenMP + SIMD. In reality it might not produce that much gain, because:
There is an overhead in creating the OpenMP threads.
The code does only 1 addition per 1 load, so the CPU isn't doing enough computation; it spends most of its time loading data, i.e. the loop is essentially memory-bandwidth bound.
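To put a rough number on that (an illustrative estimate, not a measurement): summing 10 million floats streams 40 MB; assuming a sustained memory bandwidth of about 20 GB/s, just reading the data takes about 2 ms, which caps the rate at roughly 5 billion additions per second no matter how many cores participate. Once the array no longer fits in cache, that ceiling sets the run time, which is why the measured speedup falls well short of 12x - 16x.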
I want to write a parallel program that computes the perimeter, area and diagonal of multiple rectangles. My idea is to use functional parallelism to structure the program.
First, the perimeter, area and diagonal measures will be parallelized.
In my program, to compute these measures, I'm using #pragma omp for to divide loop iterations among threads and #pragma omp simd to vectorize the computation. The problem is that I also need to use #pragma omp task, which represents non-structured functional parallelism, but I don't know how to combine it with #pragma omp simd.
I know that if I use it the way shown below, I'll be vectorizing the task creation, which makes no sense for a SIMD architecture.
If someone could explain how to use #pragma omp for, #pragma omp simd and #pragma omp task, I'll be very grateful.
My code:
#include<stdio.h>
#include<stdlib.h>
#include<omp.h>
#include<math.h>
#include<time.h>
#define NUM_THREADS 4
/**
* This program computes three measures of a rectangle
* 1. Perimeter ==> P = 2B + 2H
* 2. Area ==> A = BH
* 3. Diagonal ==> D^2 = B^2 + H^2
*
*
*
* You must use these 4 directives: parallel, for, simd, task
*/
enum {
BASE = 0,
HEIGHT = 1,
};
#pragma omp declare simd
double compute_perimeter(double base, double height){ return (2 * base + 2 * height); }
#pragma omp declare simd
double compute_area(double base, double height){ return (base * height); }
#pragma omp declare simd
double compute_diagonal(double base, double height){ return (sqrt((base * base) + (height * height))); }
double ** generate_rectangles(size_t n){
double ** rectangles = (double **)malloc(n * sizeof(double *));
for(size_t i = 0; i < n; ++i){
rectangles[i] = (double *)malloc(n * sizeof(double));
for(size_t j = 0; j < n; ++j) rectangles[i][j] = rand() % 100;
}
return rectangles;
}
void destroy_rectangles(double ** rectangles, size_t n){
for(size_t i = 0; i < n; ++i) free(rectangles[i]);
free(rectangles);
}
int main(){
srand(time(NULL));
size_t n = rand() % 100000;
double * perimeters = (double *)malloc(n * sizeof(double));
double * areas = (double *)malloc(n * sizeof(double));
double * diagonals = (double *)malloc(n * sizeof(double));
double ** rectangles = generate_rectangles(n);
#pragma omp parallel for simd num_threads(NUM_THREADS) schedule(static)
for(size_t i = 0; i < n; ++i){
#pragma omp task
{perimeters[i] = compute_perimeter(rectangles[i][BASE], rectangles[i][HEIGHT]);}
#pragma omp task
{areas[i] = compute_area(rectangles[i][BASE], rectangles[i][HEIGHT]);}
#pragma omp task
{diagonals[i] = compute_diagonal(rectangles[i][BASE], rectangles[i][HEIGHT]);}
}
for(size_t i = 0; i < n; ++i) printf("rectangle %zu:\n base = %lf, height = %lf, perimeter = %lf, area = %lf, diagonal = %lf\n\n", i, rectangles[i][BASE], rectangles[i][HEIGHT], perimeters[i], areas[i], diagonals[i]);
free(perimeters);
free(areas);
free(diagonals);
destroy_rectangles(rectangles, n);
return 0;
}
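One constraint worth keeping in mind: a worksharing construct such as #pragma omp for may not be closely nested inside a task region, so the three directives cannot simply be stacked on one loop. The sketch below shows one common arrangement (only one of several valid layouts, reusing the names from the code above): the functional parallelism stays at the task level, with one task per measure, and each task vectorizes its own loop with #pragma omp simd. It is meant to replace the loop in main().
#pragma omp parallel num_threads(NUM_THREADS)
#pragma omp single
{
    #pragma omp task
    {
        #pragma omp simd
        for(size_t i = 0; i < n; ++i)
            perimeters[i] = compute_perimeter(rectangles[i][BASE], rectangles[i][HEIGHT]);
    }
    #pragma omp task
    {
        #pragma omp simd
        for(size_t i = 0; i < n; ++i)
            areas[i] = compute_area(rectangles[i][BASE], rectangles[i][HEIGHT]);
    }
    #pragma omp task
    {
        #pragma omp simd
        for(size_t i = 0; i < n; ++i)
            diagonals[i] = compute_diagonal(rectangles[i][BASE], rectangles[i][HEIGHT]);
    }
    #pragma omp taskwait
}
With only three tasks the load is not perfectly balanced over four threads; #pragma omp taskloop simd (available from OpenMP 4.5) is an alternative when each measure's loop should itself be split into several tasks.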
I have created a C program that calculates values of the function y(x) = sin(nx) for
n = 1, 2, 3, 4. The constant M_PI is defined in the math.h header file.
What is the best way to parallelize this program using OpenMP to ensure
that it can scale to make effective use of a modern multi-core processor?
#include <stdio.h>
#include <math.h>
int main(){
const int NPOINTS=1001;
const int NCURVES=4;
double dtheta;
double y[NCURVES][NPOINTS];
double theta[NPOINTS];
dtheta = 2*M_PI / ( (float) (NPOINTS-1) );
for (int n=0; n<NCURVES; n++){
for(int i=0; i<NPOINTS; i++){
theta[i] = ( (float) i) * dtheta;
y[n][i] = sin( ((float) (n+1)) * theta[i]);
}
}
return 0;
}
Your example is pretty straightforward since there are no loop dependencies or potential race conditions. You just need to distribute the iterations of the two loops among threads, as follows:
#pragma omp parallel for collapse(2)
for (int n=0; n<NCURVES; n++){
for(int i=0; i<NPOINTS; i++){
theta[i] = ( (float) i) * dtheta;
y[n][i] = sin( ((float) (n+1)) * theta[i]);
}
}
You can swap both loops as well:
#pragma omp parallel for
for (int i=0; i < NPOINTS; i++){
theta[i] = ((float) i) * dtheta;
for(int n=0; n < NCURVES; n++){
y[n][i] = sin( ((float) (n+1)) * theta[i]);
}
}
You need to test and see which one scales best.
As pointed out in the comments by tstanisl, there is little point in storing ((float) i) * dtheta in theta[i], "because modern CPU will compute it much faster than fetching data from cache":
#pragma omp parallel for collapse(2)
for (int n=0; n<NCURVES; n++){
for(int i=0; i<NPOINTS; i++){
y[n][i] = sin( ((float) (n+1)) * ( (float) i) * dtheta);
}
}
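To see which variant scales best, a small self-contained harness along these lines can be used; omp_get_wtime() comes from the OpenMP runtime, the repetition count is arbitrary, the swapped-loop variant is used here just for illustration, and the checksum only exists to keep the compiler from optimizing the loop away:
#include <stdio.h>
#include <math.h>
#include <omp.h>

int main(){
    const int NPOINTS = 1001;
    const int NCURVES = 4;
    double y[NCURVES][NPOINTS];
    double theta[NPOINTS];
    double dtheta = 2*M_PI / ((float)(NPOINTS-1));

    double t0 = omp_get_wtime();
    for (int rep = 0; rep < 10000; rep++){   /* one pass is too short to time */
        #pragma omp parallel for
        for (int i = 0; i < NPOINTS; i++){
            theta[i] = ((float) i) * dtheta;
            for (int n = 0; n < NCURVES; n++){
                y[n][i] = sin(((float)(n+1)) * theta[i]);
            }
        }
    }
    double checksum = 0.0;
    for (int n = 0; n < NCURVES; n++) checksum += y[n][NPOINTS-1];
    printf("checksum = %f, threads = %d, elapsed = %f s\n",
           checksum, omp_get_max_threads(), omp_get_wtime() - t0);
    return 0;
}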
I wrote a matrix-vector product program using OpenMP and AVX2.
However, I get the wrong answer when OpenMP is used.
The correct result is that every value of array c becomes 100.
My result was a mix of 98, 99, and 100.
The actual code is below.
I compiled it with Clang using -fopenmp, -mavx, and -mfma.
#include "stdio.h"
#include "math.h"
#include "stdlib.h"
#include "omp.h"
#include "x86intrin.h"
void mv(double *a,double *b,double *c, int m, int n, int l)
{
int k;
#pragma omp parallel
{
__m256d va,vb,vc;
int i;
#pragma omp for private(i, va, vb, vc) schedule(static)
for (k = 0; k < l; k++) {
vb = _mm256_broadcast_sd(&b[k]);
for (i = 0; i < m; i+=4) {
va = _mm256_loadu_pd(&a[m*k+i]);
vc = _mm256_loadu_pd(&c[i]);
vc = _mm256_fmadd_pd(vc, va, vb);
_mm256_storeu_pd( &c[i], vc );
}
}
}
}
int main(int argc, char* argv[]) {
// set variables
int m;
double* a;
double* b;
double* c;
int i;
m=100;
// main program
// set vector or matrix
a=(double *)malloc(sizeof(double) * m*m);
b=(double *)malloc(sizeof(double) * m*1);
c=(double *)malloc(sizeof(double) * m*1);
//preset
for (i=0;i<m;i++) {
a[i]=1;
b[i]=1;
c[i]=0.0;
}
for (i=m;i<m*m;i++) {
a[i]=1;
}
mv(a, b, c, m, 1, m);
for (i=0;i<m;i++) {
printf("%e\n", c[i]);
}
free(a);
free(b);
free(c);
return 0;
}
I know a critical section would help, but a critical section was slow.
So, how can I solve the problem?
The fundamental operation you want is
c[i] += a[i,k]*b[k] (summed over k)
If you use row-major order storage this becomes
c[i] += a[i*l + k]*b[k]
If you use column-major order storage this becomes
c[i] += a[k*m + i]*b[k]
For row-major order you can parallelize like this
#pragma omp parallel for
for(int i=0; i<m; i++) {
for(int k=0; k<l; k++) {
c[i] += a[i*l+k]*b[k];
}
}
For column-major order you can parallelize like this
#pragma omp parallel
for(int k=0; k<l; k++) {
#pragma omp for
for(int i=0; i<m; i++) {
c[i] += a[k*m+i]*b[k];
}
}
Matrix-vector operations are Level 2 BLAS operations, which are memory-bandwidth bound. Level 1 and Level 2 operations don't scale with, e.g., the number of cores. Only the Level 3 operations (e.g. dense matrix multiplication) scale: https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3.
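To make that concrete with a back-of-the-envelope count: a matrix-vector product performs about 2*m*l flops while reading the m*l doubles of the matrix exactly once, roughly 0.25 flops per byte, so one or two cores already saturate the memory bus. A dense n x n matrix-matrix product performs about 2*n^3 flops on roughly 3*n^2 doubles of data, so its flops-per-byte ratio grows with n and additional cores keep paying off.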
The issue is not with your AVX intrinsics, let's look at the code without the intrinsics for a minute:
void mv(double *a,double *b,double *c, int m, int n, int l)
{
#pragma omp parallel for schedule(static)
for (int k = 0; k < l; k++) {
double xb = b[k];
for (int i = 0; i < m; i++) {
double xa = a[m*k+i];
double xc = c[i];
xc = xc + xa * xb;
c[i] = xc;
}
}
}
Note: your private declaration was technically correct but redundant, because those variables are already declared inside the parallel region and are therefore private to each thread; it is just so much easier to reason about the code if you declare the variables as locally as possible.
The race condition in your code is on c[i], which multiple threads try to update. Even if you could protect that with, say, an atomic update, the performance would be horrible: not only because of the protection itself, but because the data of c[i] has to be constantly shuttled back and forth between the caches of different cores.
One thing you can do about this is to use an array reduction on c. This makes a private copy of c for each thread and they get merged at the end:
void mv(double *a,double *b,double *c, int m, int n, int l)
{
#pragma omp parallel for schedule(static) reduction(+:c[:m])
for (int k = 0; k < l; k++) {
for (int i = 0; i < m; i++) {
c[i] += a[m*k+i] * b[k];
}
}
}
This should be reasonably efficient as long as two m-vectors fit in your cache, but you may still see significant thread-management overhead. Eventually you will be limited by memory bandwidth, because in a vector-matrix multiplication you only have one computation per element read from a.
Anyway, you can of course swap the i and k loops and avoid the reduction, but then your memory access pattern on a will be inefficient (strided), so you should block the loop to avoid that.
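For illustration, a blocked version of the swapped loops might look like the sketch below. It keeps the column-major layout a[m*k + i] from the question, each thread owns whole tiles of c (so no reduction is needed), and the function name and tile width TI are my own assumptions (8 doubles, i.e. one cache line, to be tuned):
void mv_blocked(double *a, double *b, double *c, int m, int l)
{
    const int TI = 8;   /* assumed tile width: one cache line of doubles */
    #pragma omp parallel for schedule(static)
    for (int ii = 0; ii < m; ii += TI) {
        int iend = (ii + TI < m) ? ii + TI : m;
        for (int k = 0; k < l; k++) {
            double bk = b[k];
            for (int i = ii; i < iend; i++) {
                c[i] += a[m*k + i] * bk;   /* contiguous within the tile */
            }
        }
    }
}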
Now if you look at the output of any modern compiler, it will generate SIMD code on its own. Of course you can apply your own SIMD intrinsics if you want to. But make sure that you handle the edge cases correctly if m is not divisible by 4 (you did not in your original version).
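As an illustration of handling the tail, the inner i loop (operating on the per-thread copy of c from the reduction version above) could be written with intrinsics plus an explicit scalar remainder for the last m % 4 elements; note the usual a*b + c operand order of _mm256_fmadd_pd:
__m256d vb = _mm256_broadcast_sd(&b[k]);
int i = 0;
for (; i + 4 <= m; i += 4) {
    __m256d va = _mm256_loadu_pd(&a[m*k + i]);
    __m256d vc = _mm256_loadu_pd(&c[i]);
    vc = _mm256_fmadd_pd(va, vb, vc);   /* c[i..i+3] += a[m*k+i..] * b[k] */
    _mm256_storeu_pd(&c[i], vc);
}
for (; i < m; i++) {
    c[i] += a[m*k + i] * b[k];          /* scalar tail */
}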
At the end of the day, if you really want performance, use the functions from a BLAS library (e.g. MKL). If you want to play around with optimization, there are ample opportunities to go into deep detail.
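For reference, with a CBLAS interface (as shipped by MKL or OpenBLAS, for instance) the whole mv() from the question reduces to a single dgemv call; the column-major layout and leading dimension m match the question's a[m*k + i] indexing:
#include <cblas.h>   /* or mkl.h / mkl_cblas.h when using MKL */

/* c = 1.0*A*b + 1.0*c, where A is m x l, stored column-major with lda = m */
cblas_dgemv(CblasColMajor, CblasNoTrans, m, l, 1.0, a, m, b, 1, 1.0, c, 1);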
For example:
int a[100];
int i=0;
for(i=0;i<100;i++)
{
a[i]=i;
}
#pragma omp parallel for firstprivate(a[0])
for(i=0;i<100;i++)
{
a[i]=a[i]+a[((i+1)%100)];
}
However, it fails to compile:
error: syntax error in omp clause
#pragma omp parallel for firstprivate(a[0])
^
Sorry for my previous description. I want the result to be a[i] = a[i] + a[((i+1)%100)], for example a[10] = a[10] + a[11] and a[99] = a[99] + a[0]. However, when the thread executes a[99] = a[99] + a[0], the value of a[0] may already have been changed to a[0] + a[1] by another thread (there is a dependency). What should I do to guarantee that each thread uses the original value of a[((i+1)%100)] when computing a[i] = a[i] + a[((i+1)%100)]?
Your code is not guaranteed to get the correct result because it has a race condition. One way to fix this is to do it out-of-place like this
void foo2(int * __restrict a, int * __restrict b) {
int i;
#pragma omp parallel for schedule(static)
for (i = 0; i<N; i++) {
b[i] = a[i] + a[((i + 1) % N)];
}
}
If you really want to do it in-place then it's a bit complicated
void foo3(int a[]) {
    int n = N - 1;
    int a0 = a[0];            // save the original a[0] before any thread overwrites it
    #pragma omp parallel
    {
        int i;
        int ithread = omp_get_thread_num();
        int nthread = omp_get_num_threads();
        int start  = ithread*n / nthread;
        int finish = (ithread + 1)*n / nthread;
        int tmp = a[finish];  // save the first element of the next chunk
        #pragma omp barrier   // all boundary values are saved before any thread writes
        for (i = start; i < finish-1; i++) {
            a[i] += a[i + 1];
        }
        a[finish-1] += tmp;
    }
    a[n] += a0;
}
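The key point is that each thread copies the element just past its own chunk (and the serial code copies a[0]) before anything is overwritten; the barrier guarantees all of those boundary values are still the original ones, after which every chunk can be updated in place independently of the others.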
I am an OpenMP beginner. I have come across the following problem.
I have a mask array M of length N, whose elements are either 0 or 1. I want to extract all indices i that satisfy M[i]=1 and store them in a new array T.
Can this problem be accelerated by OpenMP?
I have tried the following code, but its performance is not good.
int count = 0;
#pragma omp parallel for
for(int i = 0; i < N; ++i) {
if(M[i] == hashtag) {
int pos = 0;
#pragma omp critical (c1)
pos = count++;
T[pos] = i;
}
}
I am not 100% sure this will be much better, but you could try the following:
int count = 0;
#pragma omp parallel for
for(int i = 0; i < N; ++i) {
if(M[i]) {
#pragma omp atomic
T[count++] = i;
}
}
If the array is quite sparse, threads will be able to zip through a lot of zeros without waiting for others. But you can only update one index at a time. The problem is really that different threads are writing to the same memory block (T), which means you will be running into issues of caching: every time one thread writes to T, the cache of all the other cores is "dirty" - so when they try to modify it, a lot of shuffling goes on behind the scenes. All this is transparent to you (you don't need to write code to handle it) but it slows things down significantly - I suspect that's your real bottleneck. If your matrix is big enough to make it worth your while, you might try to do the following:
Create as many arrays T as there are threads
Let each thread update its own version of T
Combine all the T arrays into one after the loops have completed
It might be faster (because the different threads don't write to the same memory) - but since there are so few statements inside the loop, I suspect it won't be.
EDIT I created a complete test program, and found two things. First, the atomic directive doesn't work in all versions of omp, and you may well have to use T[count++] += i; for it to even compile (which is OK since T can be set to all zeros initially); more troubling, you will not get the same answer twice if you do this (the final value of count changes from one pass to the next); if you use critical, that doesn't happen.
A second observation is that the speed of the program really slows down when you increase the number of threads, which confirms what I was suspecting about shared memory (times for 10M elements processed):
threads elapsed
1 0.09s
2 0.73s
3 1.21s
4 1.62s
5 2.34s
You can see this is true by changing how sparse the matrix M is - when I create M as a random array, and test for M[i] < 0.01 * RAND_MAX (0.1% dense matrix), things run much more quickly than if I make it 10% dense - showing that the part inside the critical section is really slowing us down.
That being the case, I don't think there is a way of speeding up this task in OMP - the job of consolidating the outputs of all the threads into a single list at the end is just going to eat up any speed advantage you may have had, given how little is going on inside the inner loop. So rather than using multiple threads, I suggest you rewrite the loop as efficiently as possible - for example:
for( i = 0; i < N; i++) {
T[count] = i;
count += M[i];
}
In my quick benchmark, this was faster than the OMP solution - comparable with the threads = 1 solution. Again - this is because of the way memory is being accessed here. Note that I avoid using an if statement - this keeps the code as fast as possible. Instead, I take advantage of the fact that M[i] is always zero or one. At the end of the loop you have to discard the element T[count] because it will be invalid... the "good elements" are T[0] ... T[count-1]. An array M with 10M elements was processed by this loop in ~ 0.02 sec on my machine. Should be sufficient for most purposes?
Based on Floris's fast function, I tried to see if I could find a faster solution with OpenMP. I came up with two functions, foo_v2 and foo_v3, which are faster for larger arrays: foo_v2 is faster independent of density and foo_v3 is faster for sparser arrays. The function foo_v2 essentially treats T as a 2D array with nthreads rows of width N/nthreads, and also fills an array counta which contains the count for each thread. This is better explained with code. The following code would loop over all the elements written out to T:
for(int ithread=0; ithread<nthreads; ithread++) {
    for(int i=0; i<counta[ithread]; i++) {
        T[ithread*N/nthreads + i]
    }
}
The function foo_v3 creates a 1D array as requested. In all cases N has to be pretty large to overcome the OpenMP overhead. The code below defaults to 256 MB with a density of M of about 10%. The OpenMP functions are both faster by over a factor of 2 on my 4-core Sandy Bridge system. If you set the density to 50%, foo_v2 is still faster by about a factor of 2, but foo_v3 is no longer faster.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
int foo_v1(int *M, int *T, const int N) {
int count = 0;
for(int i = 0; i<N; i++) {
T[count] = i;
count += M[i];
}
return count;
}
int foo_v2(int *M, int *T, int *&counta, const int N) {
int nthreads;
#pragma omp parallel
{
nthreads = omp_get_num_threads();
const int ithread = omp_get_thread_num();
#pragma omp single
counta = new int[nthreads];
int count_private = 0;
#pragma omp for
for(int i = 0; i<N; i++) {
T[ithread*N/nthreads + count_private] = i;
count_private += M[i];
}
counta[ithread] = count_private;
}
return nthreads;
}
int foo_v3(int *M, int *T, const int N) {
int count = 0;
int *counta = 0;
#pragma omp parallel reduction(+:count)
{
const int nthreads = omp_get_num_threads();
const int ithread = omp_get_thread_num();
#pragma omp single
{
counta = new int[nthreads+1];
counta[0] = 0;
}
int *Tprivate = new int[N/nthreads];
int count_private = 0;
#pragma omp for nowait
for(int i = 0; i<N; i++) {
Tprivate[count_private] = i;
count_private += M[i];
}
counta[ithread+1] = count_private;
count += count_private;
#pragma omp barrier
int offset = 0;
for(int i=0; i<(ithread+1); i++) {
offset += counta[i];
}
for(int i=0; i<count_private; i++) {
T[offset + i] = Tprivate[i];
}
delete[] Tprivate;
}
delete[] counta;
return count;
}
void compare(const int *T1, const int *T2, const int N, const int count, const int *counta, const int nthreads) {
int diff = 0;
int n = 0;
for(int ithread=0; ithread<nthreads; ithread++) {
for(int i=0; i<counta[ithread]; i++) {
int i2 = N*ithread/nthreads+i;
//printf("%d %d\n", T1[n], T2[i2]);
int tmp = T1[n++] - T2[i2];
if(tmp<0) tmp*=-1;
diff += tmp;
}
}
printf("diff %d\n", diff);
}
void compare_v2(const int *T1, const int *T2, const int count) {
int diff = 0;
int n = 0;
for(int i=0; i<count; i++) {
int tmp = T1[i] - T2[i];
//if(tmp!=0) printf("%i %d %d\n", i, T1[i], T2[i]);
if(tmp<0) tmp*=-1;
diff += tmp;
}
printf("diff %d\n", diff);
}
int main() {
const int N = 1 << 26;
printf("%f MB\n", 4.0*N/1024/1024);
int *M = new int[N];
int *T1 = new int[N];
int *T2 = new int[N];
int *T3 = new int[N];
int *counta;
double dtime;
for(int i=0; i<N; i++) {
M[i] = ((rand()%10)==0);
}
//int repeat = 10000;
int repeat = 1;
int count1, count2;
int nthreads;
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) count1 = foo_v1(M, T1, N);
dtime = omp_get_wtime() - dtime;
printf("time v1 %f\n", dtime);
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) nthreads = foo_v2(M, T2, counta, N);
dtime = omp_get_wtime() - dtime;
printf("time v2 %f\n", dtime);
compare(T1, T2, N, count1, counta, nthreads);
dtime = omp_get_wtime();
for(int i=0; i<repeat; i++) count2 = foo_v3(M, T3, N);
dtime = omp_get_wtime() - dtime;
printf("time v3 %f\n", dtime);
printf("count1 %d, count2 %d\n", count1, count2);
compare_v2(T1, T3, count1);
}
The critical section should be replaced by an atomic operation; specifically, in your case you need the atomic capture clause:
int pos, count = 0; // pos declared outside the loop
#pragma omp parallel for private(pos) // and privatized, count is implicitly
for(int i = 0; i < N; ++i) { // shared by all the threads
if(M[i]) {
#pragma omp atomic capture
pos = count++;
T[pos] = i;
}
}
Take a look at this answer for an overview of all the possibilities of atomic operations with OpenMP.