I am trying to write an application that calculates the L2 norm of the difference of two arrays, and I have to parallelize the calculation.
Here is the code that I have parallelized:
double time_start_openmp = omp_get_wtime();
#pragma omp parallel for
for (i = 0; i < n; i++)
{
numberOfThreads = omp_get_num_threads();
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
time_end_openmp = omp_get_wtime();
l2_norm = sqrt(l2_norm);
openmp_exec_time = time_end_openmp - time_start_openmp;
printf("OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
I compile the code as:
gcc -fopenmp -g -ggdb -Wall -lm -o test test.c
I am running this code with 1 thread and with 32 threads. The output is the exact opposite of what I expected. Here is an example output:
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=32
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 32 0.001084 0.000000000000e+00
[hayri@hayri-durmaz MatrixMultipication_MPI]$ export OMP_NUM_THREADS=1
[hayri@hayri-durmaz MatrixMultipication_MPI]$ ./test 10000
OPENMP: 10000 1 0.000106 0.000000000000e+00
Am I misreading this, or is using 32 threads 10 times slower than using 1 thread? What am I doing wrong here?
Here is my full code:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>
#define MATSIZE 2000
static size_t totalMemUsage = 0;
size_t vectors_dot_prod(double *x, double *y, size_t n)
{
double res = 0.0;
size_t i;
for (i = 0; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
size_t vectors_dot_prod2(double *x, double *y, size_t n)
{
size_t res = 0.0;
size_t i = 0;
for (; i <= n - 4; i += 4)
{
res += (x[i] * y[i] +
x[i + 1] * y[i + 1] +
x[i + 2] * y[i + 2] +
x[i + 3] * y[i + 3]);
}
for (; i < n; i++)
{
res += x[i] * y[i];
}
return res;
}
void matrix_vector_mult(double **mat, double *vec, double *result, size_t rows, size_t cols)
{ // in matrix form: result = mat * vec;
size_t i;
for (i = 0; i < rows; i++)
{
result[i] = vectors_dot_prod2(mat[i], vec, cols);
}
}
double get_random()
{
double range = 1000;
double div = RAND_MAX / range;
double randomNumber = (rand() / div);
// printf("%d\n", randomNumber);
return randomNumber;
}
void print_2d_arr(double *arr, size_t row, size_t col)
{
size_t i, j, index;
for (i = 0; i < row; i++)
{
for (j = 0; j < col; j++)
{
index = i * col + j;
printf("%3f ", arr[index]);
}
printf("\n");
}
}
void print_1d_arr(double *arr, size_t row)
{
size_t i;
for (i = 0; i < row; i++)
{
printf("%f, ", arr[i]);
}
printf("\n");
}
size_t **fullfillArrayWithRandomNumbers(double *arr, size_t n)
{
/*
* Fulfilling the array with random numbers
* */
size_t i;
for (i = 0; i < n; i++)
{
arr[i] = get_random();
}
return 0;
}
double *allocarray1D(size_t size)
{
double *array = calloc(size, sizeof(double));
totalMemUsage = totalMemUsage + size * sizeof(double);
return array;
}
size_t ParallelRowMatrixVectorMultiply(size_t n, double *a, double *b, double *x, MPI_Comm comm)
{
size_t i, j;
size_t nlocal;
double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
x[i] += a[index] * fb[j];
}
}
free(fb);
return 0;
}
size_t ParallelRowMatrixVectorMultiply_WithoutAllgather(size_t n, double *a, double *b, double *x_partial, double *x, MPI_Comm comm)
{
// Process 0 sends b to everyone
MPI_Bcast(b, n, MPI_DOUBLE, 0, MPI_COMM_WORLD);
size_t i, j;
size_t nlocal;
// double *fb;
int npes, myrank;
MPI_Comm_size(comm, &npes);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
// fb = (double *)malloc(n * sizeof(double));
nlocal = n / npes;
// MPI_Allgather(b, nlocal, MPI_DOUBLE, fb, nlocal, MPI_DOUBLE, comm);
for (i = 0; i < nlocal; i++)
{
x_partial[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x_partial[i] += a[index] * b[j];
}
}
// free(b);
// Process 0 gathers x_partials to create x
MPI_Gather(x_partial, nlocal, MPI_DOUBLE, x, nlocal, MPI_DOUBLE, 0, MPI_COMM_WORLD);
return 0;
}
size_t SequentialMatrixMultiply(size_t n, double *a, double *b, double *x)
{
size_t i, j;
for (i = 0; i < n; i++)
{
x[i] = 0.0;
for (j = 0; j < n; j++)
{
size_t index = i * n + j;
// printf("%f x %f\n", a[index], b[j]);
x[i] += a[index] * b[j];
}
}
return 0;
}
int main(int argc, char *argv[])
{
// Global declerations
size_t i;
// MPI_Status status;
// Initialize the MPI environment
MPI_Init(&argc, &argv);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int taskid;
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
// Get the name of the processor
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
if (argc != 2)
{
if (taskid == 0)
printf("Usage: %s <N>\n", argv[0]);
MPI_Finalize();
return 0;
}
srand(time(NULL) + taskid);
size_t n = atoi(argv[1]);
size_t nOverK = n / world_size;
double *a = allocarray1D(n * n);
double *b = allocarray1D(n);
double *x = allocarray1D(n);
double *x_partial = allocarray1D(nOverK);
double *xseq = allocarray1D(n);
double *a_partial = allocarray1D(n * nOverK);
if (a == NULL || b == NULL || x == NULL || xseq == NULL || x_partial == NULL)
{
if (taskid == 0)
printf("Allocation failed\n");
MPI_Finalize();
return 0;
}
// Process 0 creates A matrix.
if (taskid == 0)
{
fullfillArrayWithRandomNumbers(a, n * n);
// Process 0 produces the b
fullfillArrayWithRandomNumbers(b, n);
}
// Process 0 sends a_partial to everyone
if (!(world_size == 1 && n == 64000))
{
MPI_Scatter(a, n * nOverK, MPI_DOUBLE, a_partial, n * nOverK, MPI_DOUBLE, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
double time_start = MPI_Wtime();
ParallelRowMatrixVectorMultiply_WithoutAllgather(n, a_partial, b, x_partial, x, MPI_COMM_WORLD);
double time_end = MPI_Wtime();
double parallel_exec_time = time_end - time_start;
double *exec_times = allocarray1D(world_size);
// Process 0 gathers x_partials to create x
MPI_Gather(&parallel_exec_time, 1, MPI_DOUBLE, exec_times, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// print_1d_arr(x, n);
if (taskid == 0)
{
SequentialMatrixMultiply(n, a, b, xseq);
// check difference between x and xseq using OpenMP
//print_1d_arr(exec_times, world_size);
// print_1d_arr(xseq, n);
double max_exec, min_exec, avg_exec;
min_exec = 1000;
for (i = 0; i < world_size; i++)
{
if (max_exec < exec_times[i])
{
max_exec = exec_times[i];
}
if (min_exec > exec_times[i])
{
min_exec = exec_times[i];
}
avg_exec += exec_times[i];
}
avg_exec = avg_exec / world_size;
long double time_start_openmp = omp_get_wtime();
long double time_end_openmp, openmp_exec_time, min_exec_time, max_exec_time, avg_exec_time;
max_exec_time = 0;
max_exec_time = 1000;
long double l2_norm = 0;
size_t numberOfThreads = 0;
size_t r = 0;
double *diff_vector = allocarray1D(n);
size_t nrepeat = 10000;
if (world_size == 1)
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
else
{
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
}
l2_norm = sqrt(l2_norm);
time_end_openmp = omp_get_wtime();
openmp_exec_time = time_end_openmp - time_start_openmp;
// print matrix size, number of processors, number of threads, time, time_openmp, L2 norm of difference of x and xseq (use %.12e while printing norm)
if (world_size == 1)
{
printf("OPENMP: %d %ld %Lf %.12e\n", n, numberOfThreads, openmp_exec_time, openmp_exec_time, l2_norm);
printf("NEW_OPENMP: %d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
printf("MIN_AVG_MAX: %d %d %f %f %f\n", n, world_size, min_exec, max_exec, avg_exec);
printf("MPI: %d %d %f %.12Lf %.12e\n", n, world_size, max_exec, l2_norm, l2_norm);
totalMemUsage = totalMemUsage / (1024 * 1024 * 1024);
printf("TOTALMEMUSAGE: %zu\n", totalMemUsage);
//printf("process: %d %d %d %f %.12e\n", taskid, n, world_size, parallel_exec_time, l2_norm);
//printf("%d %ld %f %.12e\n", n, numberOfThreads, openmp_exec_time, l2_norm);
}
MPI_Finalize();
return 0;
}
Here is the output:
cn009
36
mpicc -fopenmp -g -ggdb -lm -o rowmv rowmv.c
OPENMP: 32000 1 0.000299 2.991110086441e-04
MIN_AVG_MAX: 32000 1 3.112523 3.112523 3.112523
MPI: 32000 1 3.112523 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 2 0.000535 5.350699648261e-04
MIN_AVG_MAX: 32000 1 3.125519 3.125519 3.125519
MPI: 32000 1 3.125519 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 4 0.000434 4.341900348663e-04
MIN_AVG_MAX: 32000 1 3.170650 3.170650 3.170650
MPI: 32000 1 3.170650 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 8 0.000454 4.542167298496e-04
MIN_AVG_MAX: 32000 1 3.168685 3.168685 3.168685
MPI: 32000 1 3.168685 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 16 0.000507 5.065393634140e-04
MIN_AVG_MAX: 32000 1 3.158761 3.158761 3.158761
MPI: 32000 1 3.158761 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
OPENMP: 32000 32 0.000875 8.752988651395e-04
MIN_AVG_MAX: 32000 1 3.166051 3.166051 3.166051
MPI: 32000 1 3.166051 0.000000000000 9.532824124368e-130
TOTALMEMUSAGE: 15
Am I misreading this, or is using 32 threads 10 times slower than using 1 thread? What am I doing wrong here?
In the portion of code that is being both profiled and parallelized with OpenMP:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
there is a race condition, namely the concurrent updates of the variable l2_norm. Moreover, you can drop the private(i), since the index variable (i.e., i) of the parallelized loop is set implicitly as private by OpenMP. The race condition can be fixed with an OpenMP reduction. Furthermore, your loop is not actually distributing the iterations among threads as you wanted, because you added the parallel clause again to that #pragma omp for. Assuming that you have nested parallelism disabled, which it is by default, each of the threads created in the outer parallel region will execute "sequentially" the code within that region, namely:
#pragma omp parallel for private(i)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
Hence, each thread will execute all the N iterations of the loop that you intended to be parallelized. Consequently, you remove the parallelism and add extra overhead (e.g., thread creation) on top of the sequential code. To fix those problems (i.e., the race condition and the "nested" parallel region), change this code to:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
Now, having fixed those problems, you are still left with another problem (performance-wise), namely that the parallel loop is being performed in the context of a hybrid OpenMP + MPI parallelization, and you did not explicitly bind the OpenMP threads (within the MPI processes) to the corresponding cores. Without that explicit binding, one cannot be sure on which cores those threads will end up. More often than not, having multiple threads running on the same logical core will increase the overall execution time of the application being parallelized.
If your application uses threads, then you probably want to ensure that you are either not bound at all (by specifying --bind-to none), or bound to multiple cores using an appropriate binding level or a specific number of processing elements per application process. You can solve this problem by either:
disabling the binding with the MPI flag --bind-to none, to enable threads to be assigned to different cores;
or binding the threads explicitly. Check this SO thread on how to map the threads to cores in hybrid parallelizations such as MPI + OpenMP.
By explicitly setting the number of threads per process accordingly, you can prevent multiple threads from ending up on the same core and, consequently, avoid threads within the same core fighting over the same resources.
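As a concrete illustration of those two options (a sketch only; Open MPI is assumed here, the flag names differ between MPI implementations, and 4 ranks with 8 threads each is an arbitrary choice):
# option 1: remove the MPI-level binding and let the OS place the threads
export OMP_NUM_THREADS=8
mpirun -np 4 --bind-to none ./rowmv 32000
# option 2: give every rank 8 cores and pin its threads inside that set
export OMP_NUM_THREADS=8
export OMP_PLACES=cores
export OMP_PROC_BIND=close
mpirun -np 4 --map-by slot:PE=8 --bind-to core ./rowmv 32000
Check your MPI documentation (or the linked thread) for the exact mapping syntax; OMP_PLACES and OMP_PROC_BIND are standard OpenMP 4.0 environment variables that control where each rank's threads are pinned.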
Advice:
IMO you should first test the performance of the OpenMP version alone, without any MPI processes. In this context, test the scalability of the code by measuring the sequential version against 2 threads, then 4, 8, and so on, gradually increasing the number of threads. Eventually, there will be a number of threads at which the code simply stops scaling. Naturally, the amount of parallel work performed by the threads has to be big enough to overcome the overhead of parallelism, so you should also test with bigger and bigger inputs.
After having profiled, tested and improved your OpenMP version, you can then extend that shared-memory parallelization to multiple processes using MPI.
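To make the first step of that advice concrete, a minimal standalone benchmark of just the (fixed) OpenMP loop could look like the sketch below; the file name, the synthetic input data and the printed fields are my own choices, not code from the question:
// bench_l2.c - compile with e.g.: gcc -O2 -fopenmp bench_l2.c -o bench_l2 -lm
#include <math.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
    size_t n = (argc > 1) ? (size_t)atoll(argv[1]) : 32000;
    double *x = malloc(n * sizeof(double));
    double *xseq = malloc(n * sizeof(double));
    double *diff_vector = malloc(n * sizeof(double));
    if (x == NULL || xseq == NULL || diff_vector == NULL)
        return 1;
    // synthetic data: the two vectors differ by a tiny, known amount
    for (size_t i = 0; i < n; i++)
    {
        x[i] = 0.5 * i;
        xseq[i] = 0.5 * i + 1e-9;
    }
    double l2_norm = 0.0;
    double t0 = omp_get_wtime();
    #pragma omp parallel for reduction(+:l2_norm)
    for (long long i = 0; i < (long long)n; i++)
    {
        double local_diff = x[i] - xseq[i];
        diff_vector[i] = local_diff;
        l2_norm += local_diff * local_diff;
    }
    double t1 = omp_get_wtime();
    printf("threads=%d n=%zu time=%f norm=%.12e\n",
           omp_get_max_threads(), n, t1 - t0, sqrt(l2_norm));
    free(x);
    free(xseq);
    free(diff_vector);
    return 0;
}
Run it as ./bench_l2 32000 with OMP_NUM_THREADS set to 1, 2, 4, ... and with growing values of n; the thread count at which the time stops dropping is where this loop stops scaling on your machine.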
Besides the race condition in updating a shared variable, as noted in @dreamcrash's answer, your code is not distributing the work properly.
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
#pragma omp parallel for private(i)
~~~~~~~~
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
The parallel construct on the inner loop makes it a nested combined parallel for construct. It means that each thread in the team executing the outer parallel region spawns a brand new parallel region and distributes the i-loop over the threads in it. There is no distribution of work happening in the outer parallel region, so you end up with N threads all repeating the exact same work. By default nested parallelism is disabled, so the nested parallel region runs sequentially and your code is effectively doing this:
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
There is no distribution of work and all threads write to the same locations in the diff_vector[] array.
On one hand, this code is in general memory-bound, since the amount of computation per byte of data is low - modern CPUs can do many multiplications and subtractions per cycle, while fetching data from memory and writing results back there takes many cycles. Memory-bound problems don't get any faster with more threads, since the limiting factor is the memory bandwidth. This isn't that big of a problem in your case because 32K array entries take up 256 KB of memory, which fits in most CPU caches, and the L3 cache is blazing fast, but 256 KB is still larger than the fastest L1 cache of a single CPU core. On the other hand, writing to the same memory areas from multiple threads results in true and false sharing, with the associated inter-thread cache invalidation, which usually results in the parallel code running way slower than the sequential version.
There are tools that can help you analyse the performance of your code and spot problems. As I already wrote in a comment, Intel VTune is one of them and is freely available as part of the oneAPI toolkit. Intel Inspector is another one (again, free and part of the oneAPI toolkit) and it finds problems such as data races. The two tools work very well together and I couldn't recommend them strongly enough to any aspiring parallel programmer.
There is also a minor race condition writing to numberOfThreads, but since all values written are the same, that isn't much of a logical problem. The correct version of the code in question should be:
#pragma omp parallel
{
#pragma omp master
numberOfThreads = omp_get_num_threads();
#pragma omp for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
double local_diff = x[i] - xseq[i];
diff_vector[i] = local_diff;
l2_norm += (local_diff * local_diff);
}
}
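For completeness, since numberOfThreads is only needed for the final printout, the same fix can also be written as a single combined construct. The sketch below reads the thread count with omp_get_max_threads() beforehand, which is an assumption that matches the actual team size only as long as dynamic thread adjustment is off:
numberOfThreads = omp_get_max_threads(); /* assumes OMP_DYNAMIC is off */
#pragma omp parallel for reduction(+:l2_norm)
for (i = 0; i < n; i++)
{
    double local_diff = x[i] - xseq[i];
    diff_vector[i] = local_diff;
    l2_norm += (local_diff * local_diff);
}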
I wrote two programs in C which do a tall-skinny matrix-matrix multiplication with OpenMP.
The algorithm is memory-bound on my machine.
In one of the codes I used an array of pointers (aop) for storing the matrices.
In the other code I used just one array, in which the rows of the matrix are stored one after another, called pta from now on.
Now I observed that pta always outperforms the aop version.
Especially when using 12 instead of 6 cores, the performance of aop goes slightly down, whereas the performance of pta doubles.
I can't really explain this behavior; I just assume that the cores somehow interfere with each other during the computation.
Can somebody explain this behavior?
Pointer to array version:
int main(int argc, char *argv[])
{
// parallel region to verify that pinning works correctly
#pragma omp parallel
{
printf("OpenMP thread %d / %d runs on core %d\n", omp_get_thread_num(), omp_get_num_threads(), sched_getcpu());
}
//define dimensions
int dim_n=atoi(*(argv+1));
int dim_nb=2;
printf("n = %d, nb = %d\n",dim_n,dim_nb);
//allocate space for matrix M, V and W
//each element of **M is a pointer for the first element of an array
//size of double and double* is depending on compiler and machine
double *M = malloc((dim_nb*dim_nb) * sizeof(double));
//Initialize Matrix M
for(int i=0; i<dim_nb; i++)
{
for(int j=0; j<dim_nb; j++)
{
M[i*dim_nb+j]=((i+1)-1.0)*dim_nb+(j+1)-1.0;
}
}
double *V = malloc((dim_n*dim_nb) * sizeof(double));
double *W = malloc((dim_n*dim_nb) * sizeof(double));
// using parallel region to Initialize the matrix V
#pragma omp parallel for schedule(static)
for (int i=0; i<dim_n; i++)
{
for (int j=0; j<dim_nb; j++)
{
V[i*dim_nb+j]=j+1;
}
}
int max_iter=100;
double time = omp_get_wtime();
// calculate the matrix-matrix product VM product max_iter times
for(int iter=0; iter<max_iter; iter++)
{
// calculate matrix-matrix product in parallel
#pragma omp parallel for schedule(static)
// i < #rows of V
for(int i=0; i<dim_n; i++)
{
// j < #columns of M
for(int j=0; j<dim_nb; j++)
{
// Initialize W_ij with zero, everytime W_ij is calculated
W[i*dim_nb+j]=0;
// k < #colums of V = rows of M
for(int k=0; k<dim_nb; k++)
{
W[i*dim_nb+j] += V[i*dim_nb+k]*M[k*dim_nb+j];
}
}
}
}
time=omp_get_wtime()-time;
Array of pointers version:
int main(int argc, char *argv[])
{
// parallel region to verify that pinning works correctly
#pragma omp parallel
{
printf("OpenMP thread %d / %d runs on core %d\n", omp_get_thread_num(), omp_get_num_threads(), sched_getcpu());
}
//define dimensions
int dim_n=atoi(*(argv+1));
int dim_nb=2;
printf("n = %d, nb = %d\n",dim_n,dim_nb);
//allocate space for matrix M, V and W
// each element of **M is a pointer for the first element of an array
//size of double and double* is depending on compiler and machine
double **M = malloc(dim_nb * sizeof(double *));
for(int i = 0; i < dim_nb; i++)
{
M[i] = malloc(dim_nb * sizeof(double));
}
//Initialize Matrix
for(int i=0; i<dim_nb; i++)
{
for(int j=0; j<dim_nb; j++)
{
M[i][j]=((i+1)-1.0)*dim_nb+(j+1)-1.0;
}
}
double **V = malloc(dim_n * sizeof(double *));
for(int i=0; i<dim_n; i++)
{
V[i] = malloc(dim_nb * sizeof(double));
}
double **W = malloc(dim_n * sizeof(double *));
for(int i=0; i<dim_n; i++)
{
W[i] = malloc(dim_nb * sizeof(double));
}
// using parallel region to Initialize the matrix V
#pragma omp parallel for schedule(static)
for (int i=0; i<dim_n; i++)
{
for (int j=0; j<dim_nb; j++)
{
V[i][j]=j+1;
}
}
int max_iter=100;
double time = omp_get_wtime();
// calculate the matrix-matrix product VM product max_iter times
for(int iter=0; iter<max_iter; iter++)
{
// calculate matrix-matrix product in parallel
#pragma omp parallel for schedule(static)
// i < #rows of V
for(int i=0; i<dim_n; i++)
{
// j < #columns of M
for(int j=0; j<dim_nb; j++)
{
// Initialize W_ij with zero, everytime W_ij is calculated
W[i][j]=0;
// k < #colums of V = rows of M
for(int k=0; k<dim_nb; k++)
{
W[i][j] += V[i][k]*M[k][j];
}
}
}
}
time=omp_get_wtime()-time;
This is quite easy to explain: the pointer version has to load the pointer first and then dereference it. Those memory locations can be very far from each other, and it is much more likely that cache lines get evicted as well. The data in the single-array version is stored in one contiguous memory chunk, so fewer memory accesses are needed and the CPU is much less likely to miss the cache.
https://godbolt.org/z/c_8c7c
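If the convenient M[i][j] indexing of the aop version is still wanted, a common compromise (a sketch of the general technique, not code taken from the question) is to allocate the data as one contiguous block and build only a table of row pointers on top of it:
#include <stdlib.h>
/* Allocate an n_rows x n_cols matrix as a single contiguous block,
 * plus a row-pointer table so it can still be indexed as M[i][j]. */
double **alloc_matrix(size_t n_rows, size_t n_cols)
{
    double *data = malloc(n_rows * n_cols * sizeof(double));
    double **rows = malloc(n_rows * sizeof(double *));
    if (data == NULL || rows == NULL)
    {
        free(data);
        free(rows);
        return NULL;
    }
    for (size_t i = 0; i < n_rows; i++)
        rows[i] = data + i * n_cols; /* row i starts inside the single block */
    return rows;
}
void free_matrix(double **rows)
{
    if (rows != NULL)
    {
        free(rows[0]); /* rows[0] points at the contiguous data block */
        free(rows);
    }
}
The inner loops then walk one memory chunk just like the pta version; the data itself stays contiguous, which is what matters for the cache behavior described above.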
I wrote a matrix-vector product program using OpenMP and AVX2.
However, I get the wrong answer because of OpenMP.
The correct answer is that all of the values of array c become 100.
My answer was a mix of 98, 99, and 100.
The actual code is below.
I compiled it with Clang using -fopenmp, -mavx, and -mfma.
#include "stdio.h"
#include "math.h"
#include "stdlib.h"
#include "omp.h"
#include "x86intrin.h"
void mv(double *a,double *b,double *c, int m, int n, int l)
{
int k;
#pragma omp parallel
{
__m256d va,vb,vc;
int i;
#pragma omp for private(i, va, vb, vc) schedule(static)
for (k = 0; k < l; k++) {
vb = _mm256_broadcast_sd(&b[k]);
for (i = 0; i < m; i+=4) {
va = _mm256_loadu_pd(&a[m*k+i]);
vc = _mm256_loadu_pd(&c[i]);
vc = _mm256_fmadd_pd(vc, va, vb);
_mm256_storeu_pd( &c[i], vc );
}
}
}
}
int main(int argc, char* argv[]) {
// set variables
int m;
double* a;
double* b;
double* c;
int i;
m=100;
// main program
// set vector or matrix
a=(double *)malloc(sizeof(double) * m*m);
b=(double *)malloc(sizeof(double) * m*1);
c=(double *)malloc(sizeof(double) * m*1);
//preset
for (i=0;i<m;i++) {
a[i]=1;
b[i]=1;
c[i]=0.0;
}
for (i=m;i<m*m;i++) {
a[i]=1;
}
mv(a, b, c, m, 1, m);
for (i=0;i<m;i++) {
printf("%e\n", c[i]);
}
free(a);
free(b);
free(c);
return 0;
}
I know a critical section would help. However, the critical section was slow.
So, how can I solve this problem?
The fundamental operation you want is
c[i] = a[i,k]*b[k]
If you use row-major order storage this becomes
c[i] = a[i*l + k]*b[k]
If you use column-major order storage this becomes
c[i] = a[k*m + i]*b[k]
For row-major order you can parallelize like this
#pragma omp parallel for
for(int i=0; i<m; i++) {
for(int k=0; k<l; k++) {
c[i] += a[i*l+k]*b[k];
}
}
For column-major order you can parallelize like this
#pragma omp parallel
for(int k=0; k<l; k++) {
#pragma omp for
for(int i=0; i<m; i++) {
c[i] += a[k*m+i]*b[k];
}
}
Matrix-vector operations are BLAS Level 2 operations, which are memory-bandwidth-bound. Level 1 and Level 2 operations don't scale with, e.g., the number of cores. It's only the Level 3 operations (e.g. dense matrix multiplication) which scale: https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3
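If you would rather leave the vectorization to the compiler than write intrinsics, the column-major version can also be expressed with the combined worksharing-SIMD construct; this is a sketch and assumes a compiler with OpenMP 4.0 support:
#pragma omp parallel
for(int k=0; k<l; k++) {
    double bk = b[k];
    #pragma omp for simd
    for(int i=0; i<m; i++) {
        c[i] += a[k*m+i]*bk;
    }
}
The implicit barrier at the end of each omp for keeps the updates of c[i] from different k iterations ordered, exactly as in the plain column-major version above.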
The issue is not with your AVX intrinsics, let's look at the code without the intrinsics for a minute:
void mv(double *a,double *b,double *c, int m, int n, int l)
{
#pragma omp parallel for schedule(static)
for (int k = 0; k < l; k++) {
double xb = b[k];
for (int i = 0; i < m; i++) {
double xa = a[m*k+i];
double xc = c[i];
xc = xc + xa * xb;
c[i] = xc;
}
}
}
Note: your private declaration was technically correct but redundant, since the variables are declared inside the parallel region anyway; it is also just so much easier to reason about the code if you declare the variables as locally as possible.
The race condition on your code is on c[i] - which multiple threads try to update. Now even if you could protect that with say an atomic update, the performance would be horrible: Not only because of the protection, but because the data of c[i] has to be constantly shifted around between caches of different cores.
One thing you can do about this is to use an array reduction on c. This makes a private copy of c for each thread and they get merged at the end:
void mv(double *a,double *b,double *c, int m, int n, int l)
{
#pragma omp parallel for schedule(static) reduction(+:c[:m])
for (int k = 0; k < l; k++) {
for (int i = 0; i < m; i++) {
c[i] += a[m*k+i] * b[k];
}
}
}
This should be reasonably efficient as long as two m-vectors fit in your cache, but you may still get a lot of overhead from thread management. Eventually you will be limited by memory bandwidth, because in a vector-matrix multiplication you only have one computation per element read from a.
Anyway, you can of course swap i and k loops and save the reduction, but then your memory access pattern on a will be inefficient (strided) - so you should block the loop to avoid that.
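One possible shape for that blocking (a sketch of my interpretation, assuming the question's column-major layout a[m*k+i]; the block size is an arbitrary tuning parameter): parallelize over blocks of i, so each thread owns its part of c and no reduction is needed, keep k as the middle loop, and sweep i contiguously inside the block:
void mv_blocked(const double *a, const double *b, double *c, int m, int l)
{
    const int BI = 512; /* i-block size, tune for your cache */
    #pragma omp parallel for schedule(static)
    for (int ib = 0; ib < m; ib += BI) {
        int iend = (ib + BI < m) ? ib + BI : m;
        for (int k = 0; k < l; k++) {
            const double *ak = a + (size_t)m * k; /* column k of a */
            double bk = b[k];
            for (int i = ib; i < iend; i++)
                c[i] += ak[i] * bk; /* contiguous access, no race: this thread owns [ib, iend) */
        }
    }
}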
Now if you look at the output of any modern compiler, it will generate SIMD code on its own. Of course you can apply your own SIMD intrinsics if you want to. But make sure that you handle the edge cases correctly if m is not divisible by 4 (you did not in your original version).
At the end of the day, if you really want performance - use the functions from a BLAS library (e.g. MKL). If you want to play around with optimization, there are ample of opportunities to go in deep details.
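And if you do go the BLAS route, a call matching the layout used in the question (column-major a, i.e. element (i,k) at a[m*k+i]) might look like this sketch; it assumes a CBLAS implementation such as OpenBLAS or MKL provides cblas.h and is linked in (e.g. with -lopenblas):
#include <cblas.h> /* provided by e.g. OpenBLAS or MKL */
/* c = 1.0 * A * b + 0.0 * c, with the m-by-l matrix A stored column-major,
 * leading dimension m, exactly like a[m*k+i] in the question. */
void mv_blas(const double *a, const double *b, double *c, int m, int l)
{
    cblas_dgemv(CblasColMajor, CblasNoTrans, m, l,
                1.0, a, m, b, 1, 0.0, c, 1);
}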
I want to do a reduction on an array using OpenMP and SIMD. I read that a reduction in OpenMP is equivalent to:
inline float sum_scalar_openmp2(const float a[], const size_t N) {
float sum = 0.0f;
#pragma omp parallel
{
float sum_private = 0.0f;
#pragma omp for nowait
for(int i=0; i<N; i++) {
sum_private += a[i];
}
#pragma omp atomic
sum += sum_private;
}
return sum;
}
I got this idea from the follow link:
http://bisqwit.iki.fi/story/howto/openmp/#ReductionClause
But atomic also does not support complex operators. What I did was replace atomic with critical and implemented the reduction with OpenMP and SSE like this:
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
inline float sum_vector4_openmp(const float a[], const size_t N) {
__m128 sum4 = _mm_set1_ps(0.0f);
#pragma omp parallel
{
__m128 sum4_private = _mm_set1_ps(0.0f);
#pragma omp for nowait
for(int i=0; i < ROUND_DOWN(N, 4); i+=4) {
__m128 a4 = _mm_load_ps(a + i);
sum4_private = _mm_add_ps(a4, sum4_private);
}
#pragma omp critical
sum4 = _mm_add_ps(sum4_private, sum4);
}
__m128 t1 = _mm_hadd_ps(sum4,sum4);
__m128 t2 = _mm_hadd_ps(t1,t1);
float sum = _mm_cvtss_f32(t2);
for(int i = ROUND_DOWN(N, 4); i < N; i++) {
sum += a[i];
}
return sum;
}
However, this function does not perform as well as I hoped. I'm using Visual Studio 2012 Express. I know I can improve the performance a bit by unrolling the SSE load/add a few times, but that is still less than I expect.
I get much better performance by running over slices of the arrays equal to the number of threads:
inline float sum_slice(const float a[], const size_t N) {
int nthreads = 4;
const int offset = ROUND_DOWN(N/nthreads, nthreads);
float suma[8] = {0};
#pragma omp parallel for num_threads(nthreads)
for(int i=0; i<nthreads; i++) {
suma[i] = sum_vector4(&a[i*offset], offset);
}
float sum = 0.0f;
for(int i=0; i<nthreads; i++) {
sum += suma[i];
}
for(int i=nthreads*offset; i < N; i++) {
sum += a[i];
}
return sum;
}
inline float sum_vector4(const float a[], const size_t N) {
__m128 sum4 = _mm_set1_ps(0.0f);
int i = 0;
for(; i < ROUND_DOWN(N, 4); i+=4) {
__m128 a4 = _mm_load_ps(a + i);
sum4 = _mm_add_ps(sum4, a4);
}
__m128 t1 = _mm_hadd_ps(sum4,sum4);
__m128 t2 = _mm_hadd_ps(t1,t1);
float sum = _mm_cvtss_f32(t2);
for(; i < N; i++) {
sum += a[i];
}
return sum;
}
Does someone know if there is a better way of doing reductions with more complicated operators in OpenMP?
I guess the answer to your question is no. I don't think there is a better way of doing reductions with more complicated operators in OpenMP.
Assuming that the array is 16-byte aligned and the number of OpenMP threads is 4, one might expect a performance gain of 12x - 16x from OpenMP + SIMD. Realistically, it might not produce that much gain, because:
there is overhead in creating the OpenMP threads;
the code does 1 addition operation per 1 load operation, so the CPU isn't doing enough computation; it almost looks like the CPU spends most of its time loading data, i.e. the code is memory-bandwidth bound.