I have a function that I want to parallelize. This is the serial version.
void parallelCSC_SpMV(float *x, float *b)
{
int i, j;
for(i = 0; i < numcols; i++)
{
for(j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++)
{
b[irem[j] - 1] += xrem[j]*x[i];
}
}
}
I figured a decent way to do this was to have each thread write to a private copy of the b array (which does not need to be protected as a critical section because it's a private copy); after a thread is done, it then copies its results into the actual b array. Here is my code.
void parallelCSC_SpMV(float *x, float *b)
{
int i, j, k;
#pragma omp parallel private(i, j, k)
{
float* b_local = (float*)malloc(sizeof(b));
#pragma omp for nowait
for(i = 0; i < numcols; i++)
{
for(j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++)
{
float current_add = xrem[j]*x[i];
int index = irem[j] - 1;
b_local[index] += current_add;
}
}
for (k = 0; k < sizeof(b) / sizeof(b[0]); k++)
{
// Separate question: Is this if statement allowed?
//if (b_local[k] == 0) { continue; }
#pragma omp atomic
b[k] += b_local[k];
}
}
}
However, I get a segmentation fault as a result of the second for loop. I do not need a "#pragma omp for" on that loop because I want each thread to execute it fully. If I comment out the body of that loop, there is no segmentation fault. I am not sure what the issue is.
You're probably trying to access an out-of-range position in the dynamic array b_local.
Note that sizeof(b) returns the size in bytes of float* (the size of a float pointer), not the size of the array it points to.
If you want to know the size of the array that you are passing to the function, I would suggest adding it to the function's parameters:
void parallelCSC_SpMV(float *x, float *b, int b_size){
...
float* b_local = (float*) malloc(sizeof(float)*b_size);
...
}
Also, if the size of colptrs is numcols, I would be careful with colptrs[i+1], since when i = numcols-1 you will have another out-of-range access.
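For reference, a small sanity check along those lines could catch the problem before the kernel runs; the helper name checkCSC and the nnz parameter (the number of stored entries in xrem/irem) are my own, not from the question:
#include <assert.h>

// Assumes colptrs holds numcols + 1 one-based offsets, as in a standard CSC layout.
void checkCSC(const int *colptrs, int numcols, int nnz)
{
    for (int i = 0; i < numcols; i++)
        assert(colptrs[i] >= 1 && colptrs[i] <= colptrs[i + 1]);
    assert(colptrs[numcols] - 1 <= nnz);   // last offset must stay within xrem/irem
}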
First, as pointed out by Jim Cownie:
In all of these answers, b_local is uninitialised, yet you are adding
to it. You need to use calloc instead of malloc
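In code, that change would look like this, using the b_size parameter suggested in the accepted answer:
// calloc zero-initializes the per-thread buffer, so the += accumulation
// starts from 0 instead of from whatever garbage malloc leaves behind.
float *b_local = (float*) calloc(b_size, sizeof(float));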
Just to add to the accepted answer, I think you can try the following approach to avoid calling malloc inside the parallel region, and also to avoid the overhead of #pragma omp atomic.
void parallelCSC_SpMV(float *x, float *b, int b_size, int num_threads) {
float* b_local[num_threads];
for(int i = 0; i < num_threads; i++)
b_local[i] = calloc(b_size, sizeof(float));
#pragma omp parallel num_threads(num_threads)
{
int tid = omp_get_thread_num();
#pragma omp for
for(int i = 0; i < numcols; i++){
for(int j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++){
float current_add = xrem[j]*x[i];
int index = irem[j] - 1;
b_local[tid][index] += current_add;
}
}
}
for(int id = 0; id < num_threads; id++)
{
#pragma omp for simd
for (int k = 0; k < b_size; k++)
{
b[k] += b_local[id][k];
}
free(b_local[id]);
}
}
I have not tested the performance of this, so please feel free to do so and provide feedback.
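For reference, a call site could look like the following sketch; numrows (the length of b) is an assumed name, and the globals numcols, colptrs, irem and xrem are initialized elsewhere, as in the question:
// Hypothetical driver code.
float *x = malloc(numcols * sizeof(float));
float *b = calloc(numrows, sizeof(float));   // zeroed result vector
for (int i = 0; i < numcols; i++)
    x[i] = 1.0f;
parallelCSC_SpMV(x, b, numrows, omp_get_max_threads());
free(x);
free(b);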
You can further optimize by reusing the original b for the master thread, instead of creating a b_local copy for it, as follows:
void parallelCSC_SpMV(float *x, float *b, int b_size, int num_threads) {
float* b_local[num_threads-1];
for(int i = 0; i < num_threads-1; i++)
b_local[i] = calloc(b_size, sizeof(float));
#pragma omp parallel num_threads(num_threads)
{
int tid = omp_get_thread_num();
float *thread_b = (tid == 0) ? b : b_local[tid-1];
#pragma omp for
for(int i = 0; i < numcols; i++){
for(int j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++){
float current_add = xrem[j]*x[i];
int index = irem[j] - 1;
thread_b[index] += current_add;
}
}
}
for(int id = 0; id < num_threads-1; id++)
{
#pragma omp for simd
for (int k = 0; k < b_size; k++)
{
b[k] += b_local[id][k];
}
free(b_local[id]);
}
}
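If the compiler supports OpenMP 4.5 array-section reductions, the whole per-thread-buffer machinery can also be delegated to the runtime. This is just an alternative sketch for comparison (same globals as in the question), not a claim about its performance:
void parallelCSC_SpMV_reduction(float *x, float *b, int b_size)
{
    // The runtime allocates a zero-initialized private copy of b per thread
    // and combines the copies into b at the end of the loop.
    #pragma omp parallel for reduction(+ : b[:b_size])
    for (int i = 0; i < numcols; i++)
        for (int j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++)
            b[irem[j] - 1] += xrem[j] * x[i];
}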
I have implemented the backward (BW) and forward (FW) substitution algorithms to solve the L and U triangular systems.
My implementation runs very fast serially, but I cannot figure out whether this is the best way to parallelize it.
I think I have taken into account every possible data race (on alpha); am I right?
void solveInverse (double **U, double **L, double **P, int rw, int cw) {
double **inverseA = allocateMatrix(rw,cw);
double* x = allocateArray(rw);
double* y = allocateArray(rw);
double alpha;
//int i, j, t;
// Iterate along the columns, so at each iteration we generate a column of the inverse matrix
for (int j = 0; j < rw; j++) {
// Lower triangular solve Ly=P
y[0] = P[0][j];
#pragma omp parallel for reduction(+:alpha)
for (int i = 1; i < rw; i++) {
alpha = 0;
for (int t = 0; t <= i-1; t++)
alpha += L[i][t] * y[t];
y[i] = P[i][j] - alpha;
}
// Upper triangular solve Ux=y
x[rw-1] = y[rw-1] / U[rw-1][rw-1];
#pragma omp parallel for reduction(+:alpha)
for (int i = rw-2; (i < rw) && (i >= 0); i--) {
alpha = 0;
for (int t = i+1; t < rw; t++)
alpha += U[i][t]*x[t];
x[i] = (y[i] - alpha) / U[i][i];
}
for (int i = 0; i < rw; i++)
inverseA[i][j] = x[i];
}
freeMemory(inverseA,rw);
free(x);
free(y);
}
After a private discussion with the user dreamcrash, we came to the solution proposed in his comments: creating a pair of vectors x and y for each thread, so that each thread works independently on a single column.
After a discussion with the OP in the comments (which were removed afterwards), we both came to the conclusion that:
You do not need a reduction on the alpha variable, because every iteration resets it to zero before using it. Instead, make alpha private, for example by declaring it inside the loop:
#pragma omp parallel for
for (int i = 1; i < rw; i++) {
double alpha = 0;
for (int t = 0; t <= i-1; t++)
alpha += L[i][t] * y[t];
y[i] = P[i][j] - alpha;
}
and the same applies to the second parallel region as well.
#pragma omp parallel for
for (int i = rw-2; (i < rw) && (i >= 0); i--) {
double alpha = 0;
for (int t = i+1; t < rw; t++)
alpha += U[i][t]*x[t];
x[i] = (y[i] - alpha) / U[i][i];
}
Instead of having one parallel region per j iteration, you can extract the parallel region so that it encapsulates the entire outermost loop, and use #pragma omp for instead of #pragma omp parallel for. Although this reduces the number of parallel regions created from rw to just 1, the speedup achieved by this optimization should not be that significant, because an efficient OpenMP implementation will use a pool of threads: the threads are initialized on the first parallel region and reused on subsequent ones, which already saves most of the overhead of creating and destroying threads.
#pragma omp parallel
{
for (int j = 0; j < rw; j++)
{
y[0] = P[0][j];
#pragma omp for
for (int i = 1; i < rw; i++) {
double alpha = 0;
for (int t = 0; t <= i-1; t++)
alpha += L[i][t] * y[t];
y[i] = P[i][j] - alpha;
}
x[rw-1] = y[rw-1] / U[rw-1][rw-1];
#pragma omp for
for (int i = rw-2; (i < rw) && (i >= 0); i--) {
double alpha = 0;
for (int t = i+1; t < rw; t++)
alpha += U[i][t]*x[t];
x[i] = (y[i] - alpha) / U[i][i];
}
#pragma omp for
for (int i = 0; i < rw; i++)
inverseA[i][j] = x[i];
}
}
I have shown you these code transformations so that you can see some potential tricks to use in future parallelizations. Unfortunately, as it is, that parallelization will not work.
Why?
Let us look at the first loop:
#pragma omp parallel for
for (int i = 1; i < rw; i++) {
double alpha = 0;
for (int t = 0; t <= i-1; t++)
alpha += L[i][t] * y[t];
y[i] = P[i][j] - alpha;
}
there is a dependency between y[t] being read in alpha += L[i][t] * y[t]; and y[i] being written in y[i] = P[i][j] - alpha;.
So what you can do instead is parallelize the outermost loop (i.e., assign each column to a thread) and create separate x and y arrays for each thread, so that there are no race conditions during the updates/reads of those arrays.
#pragma omp parallel
{
double* x = allocateArray(rw);
double* y = allocateArray(rw);
#pragma omp for
for (int j = 0; j < rw; j++)
{
y[0] = P[0][j];
for (int i = 1; i < rw; i++) {
double alpha = 0;
for (int t = 0; t <= i-1; t++)
alpha += L[i][t] * y[t];
y[i] = P[i][j] - alpha;
}
x[rw-1] = y[rw-1] / U[rw-1][rw-1];
for (int i = rw-2; i >= 0; i--) {
double alpha = 0;
for (int t = i+1; t < rw; t++)
alpha += U[i][t]*x[t];
x[i] = (y[i] - alpha) / U[i][i];
}
for (int i = 0; i < rw; i++)
inverseA[i][j] = x[i];
}
free(x);
free(y);
}
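As a usage note, the team size for the region above can be picked at the call site; a minimal sketch, assuming U, L, P, rw and cw are set up as in the question:
// Request a specific number of threads before entering the parallel region;
// setting OMP_NUM_THREADS in the environment works as well.
omp_set_num_threads(4);
solveInverse(U, L, P, rw, cw);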
I'm doing a university assignment and I have to obtain the histogram of an image.
Everything is going well, but when I try to parallelize the code with OpenMP, the compiler returns this error: user defined reduction not found for 'histog'.
The code that I used is this:
void HistogramaParaleloRed(int *histog)
{
#pragma omp parallel
{
#pragma omp for
for (int i = 0; i < NG; i++)
{
histog[i] = 0;
}
#pragma omp for reduction(+ : histog)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
histog[IMAGEN[i][j]]++;
}
}
}
}
And the call to the function in Main is: HistogramaParaleloRed(histog_pal_red);
The error
user defined reduction not found for
can happen either because the code was compiled with a compiler that does not support the OpenMP 4.5 array-section reduction feature (or that compiler is misconfigured), or because you are trying to reduce a naked pointer (as is the case in your example). In the latter case, the compiler cannot tell how many elements are to be reduced.
So either use a compiler that supports OpenMP 4.5 and take advantage of the array-section feature as follows:
void HistogramaParaleloRed(int *histog)
{
#pragma omp parallel
{
#pragma omp for
for (int i = 0; i < NG; i++)
{
histog[i] = 0;
}
#pragma omp for reduction(+ : histog[:NG])
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
histog[IMAGEN[i][j]]++;
}
}
}
}
or alternatively, implement the reduction yourself.
Implement the Reduction manually
One approach is to create a scratch structure shared among the threads (i.e., thread_histog); each thread updates its own row of it, and afterwards the threads reduce the values of the shared structure into the original histog array.
void HistogramaParaleloRed(int *histog, int number_threads)
{
    // A VLA cannot take an initializer, so zero the scratch array explicitly
    // (memset needs <string.h>).
    int thread_histog[number_threads][NG];
    memset(thread_histog, 0, sizeof(thread_histog));

    #pragma omp parallel
    {
        int thread_id = omp_get_thread_num();

        #pragma omp for
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
                thread_histog[thread_id][IMAGEN[i][j]]++;

        // The implicit barrier of the loop above guarantees every private
        // histogram is complete before the reduction below starts.
        #pragma omp for nowait
        for (int i = 0; i < NG; i++)
        {
            histog[i] = 0;
            for (int j = 0; j < number_threads; j++)
                histog[i] += thread_histog[j][i];
        }
    }
}
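A possible call site for this version, sized with omp_get_max_threads() (that choice is mine, not from the original code):
int histog[NG];   // output histogram; the function zeroes and fills it
HistogramaParaleloRed(histog, omp_get_max_threads());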
Another approach is to create an array of locks, one for each element of the histog array. Whenever a thread updates a given histog position, it first acquires the lock corresponding to that position, so that no other thread updates the same array position concurrently.
void HistogramaParaleloRed(int *histog)
{
    omp_lock_t locks[NG];
    #pragma omp parallel
    {
        #pragma omp for
        for (int i = 0; i < NG; i++)
        {
            histog[i] = 0;
            omp_init_lock(&locks[i]);
        }

        #pragma omp for
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
            {
                int pos = IMAGEN[i][j];
                // Only the bin being updated is locked, so threads working on
                // different gray levels do not serialize each other.
                omp_set_lock(&locks[pos]);
                histog[pos]++;
                omp_unset_lock(&locks[pos]);
            }

        #pragma omp for nowait
        for (int i = 0; i < NG; i++)
            omp_destroy_lock(&locks[i]);
    }
}
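If managing one lock per bin feels heavy, the same idea can also be written with #pragma omp atomic on the shared array. This is just a sketch for comparison (the function name HistogramaParaleloRedAtomic is mine):
void HistogramaParaleloRedAtomic(int *histog)
{
    #pragma omp parallel
    {
        #pragma omp for
        for (int i = 0; i < NG; i++)
            histog[i] = 0;

        #pragma omp for
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++)
            {
                // The atomic update only serializes increments that hit the
                // same bin at the same time.
                #pragma omp atomic
                histog[IMAGEN[i][j]]++;
            }
    }
}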
The error comes from attempting to directly reduce the argument pointer int *histog. You must instead reduce a local array, and then copy the reduced results over to histog.
void HistogramaParaleloRed(int *histog)
{
int localHistog[NG];
/*
* your code as before, replacing histog with localHistog
*/
#pragma omp parallel
{
#pragma omp for
for (int i = 0; i < NG; i++)
{
localHistog[i] = 0;
}
#pragma omp for reduction(+ : localHistog)
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
localHistog[IMAGEN[i][j]]++;
}
}
/*
* copy localHistog into output histog
*/
#pragma omp for
for (int i = 0; i < NG; i++)
{
histog[i] = localHistog[i];
}
}
}
I was doing an assignment at my university that requires populating a [2000][2000] matrix and then computing, in parallel, the sum of all elements that are multiples of 5.
At first I tried it with a 5 x 5 matrix: I computed a partial sum (sumP) per thread and then added all the partial sums into a variable called sum inside a critical region.
On my university's computer the partial sum was receiving garbage values (like 36501) when the values should all be lower than 100; I noticed that this only happened on row zero ([0][i]) of the matrix.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define N 5
int main() {
int i, j, k, l;
int sum = 0;
int sumP = 0;
int A[N][N];
printf("sumP : %i\n", sumP );
printf("sum: %i\n", sum);
#pragma omp parallel shared (A) private (i, j)
{
#pragma omp for
for (i = 0; i < N; i++) {
for(j = 0; j < N; j++){
A[i][j] = i%5;
printf("Number: %i, pos[%i][%i]\n", A[i][j], i, j);
}
}
}
#pragma omp parallel shared(A, sum) private (k, l, sumP)
{
#pragma omp for
for (k = 0; k < N; k++) {
for (l = 0; l < N; l++){
if (A[l][k] % 5 == 0 && A[l][k] != 0){
sumP = sumP + A[k][l];
printf("numero: %i, pos [%i],[%i] sumP: %i\n", A[k][l], k, l, sumP);
}
}
}
#pragma omp critical
sum += sumP;
}
//printf("sumP: %i\n", sumP);
printf("sum: %i\n", sum);
return (0);
}
I tested setting sumP to 0 between the "for" statements, and it worked:
#pragma omp parallel shared(A, sum) private (k, l, sumP)
{
#pragma omp for
for (k = 0; k < N; k++) {
sumP = 0;
for (l = 0; l < N; l++){
When I tested it at home it worked without having to set sumP to 0 (the partial sum), like I did above, but now the final sum result is not correct...
You observe this behavior because private variables in OpenMP are uninitialized. To be precise, they are initialized as if they were local variables without an explicit initialization, which means it is undefined what value they hold initially. You observe different behavior on different systems because some combinations of compiler, options, and OS treat this "undefined" differently. Your code is incorrect in any case, even if it sometimes happens to produce the correct result.
Now you could set the variable to zero as you tried. However, I would generally suggest declaring variables as locally as possible instead. This makes reasoning about the (parallel) code much easier, and you can omit the private/shared declarations. Your code would then look like this:
#pragma omp parallel
{
int sumP = 0;
#pragma omp for
for (int k = 0; k < N; k++) {
for (int l = 0; l < N; l++) {
if (A[l][k] % 5 == 0 && A[l][k] != 0) {
sumP = sumP + A[k][l];
printf("numero: %i, pos [%i],[%i] sumP: %i\n", A[k][l], k, l, sumP);
}
}
}
#pragma omp critical
sum += sumP;
}
In addition to that, there is another way to drastically simplify this code by using a reduction:
#pragma omp parallel for reduction(+:sum)
for (int k = 0; k < N; k++) {
for (int l = 0; l < N; l++) {
if (A[l][k] % 5 == 0 && A[l][k] != 0) {
sum += A[k][l];
}
}
}
The compiler will basically do the same thing for you (but better) and the code is much cleaner.
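If N is small relative to the number of threads, you can also expose more parallelism by collapsing the two loops into a single iteration space; this is just an optional variant of the reduction above:
// collapse(2) distributes all N*N (k, l) pairs across the threads.
#pragma omp parallel for collapse(2) reduction(+:sum)
for (int k = 0; k < N; k++) {
    for (int l = 0; l < N; l++) {
        if (A[l][k] % 5 == 0 && A[l][k] != 0) {
            sum += A[k][l];
        }
    }
}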
Considering that your code would spend most of its time doing I/O, it would be a good idea to comment out the printf calls.
But, as I understand it, sumP should contain the partial sum of your inner loop.
Pragmas have been compressed for readability.
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#define N 1000
int main() {
int i, j;
int sum = 0;
int sumP = 0;
int A[N][N]; // will cause segfault with large N
printf("sumP : %i\n", sumP );
printf("sum: %i\n", sum);
#pragma omp parallel for shared (A) private (i, j)
for (i = 0; i < N; i++) {
for(j = 0; j < N; j++){
A[i][j] = i%5; // populate array with numbers in [0,1,2,3,4]
//printf("Number: %i, pos[%i][%i]\n", A[i][j], i, j);
}
}
#pragma omp parallel for shared(A) private (i, j, sumP) reduction(+: sum)
for (i = 0; i< N; i++) { // outer (parallel)loop
sumP = 0; // initialize partial sum
for (j = 0; j < N; j++){ // inner sequential loop
//if (A[i][j] % 5 == 0 && A[i][j] != 0){ // Explain this condition
sumP += A[i][j];
//printf("numero: %i, pos [%i],[%i] sumP: %i\n", A[i][j], i, j, sumP);
//}
}
//printf("sumP: %i\n", sumP);
sum += sumP; // add partial sum
}
//printf("sumP: %i\n", sumP);
printf("sum: %i\n", sum);
return (0);
}
I have the following radix sort algorithm that I am trying to parallelize using OpenMP:
void radixSortEdgesBySource(struct Edge *edges_sorted, struct Edge *edges, int numVertices, int numEdges) {
int i, j, d = 0, c;
int key;
int pos;
int maximum = 0;
int *vertex_cnt = (int*)malloc(numVertices*sizeof(int));
maximum = edges[0].src;
for (c = 0; c < numEdges; c++)
{
if (edges[c].src > maximum)
{
maximum = edges[c].src;
}
}
while(maximum != 0)
{
maximum /= 10;
++d;
}
for (j = 1; j < d; j++)
{
#pragma omp parallel for num_threads(4)
for(i = 0; i < numVertices; ++i)
vertex_cnt[i] = 0;
}
#pragma omp parallel for num_threads(4)
for(i = 0; i < numEdges; ++i)
{
key = edges[i].src;
vertex_cnt[key]++;
}
for(i = 1; i < numVertices; ++i) {
vertex_cnt[i] += vertex_cnt[i - 1];
}
#pragma omp parallel for num_threads(4)
for (i = numEdges - 1; i >= 0; --i) {
key = edges[i].src;
pos = vertex_cnt[key] - 1;
edges_sorted[pos] = edges[i];
vertex_cnt[key]--;
}
free(vertex_cnt);
}
I want to know whether the way I have used #pragma omp is correct, because I am not really seeing any considerable change in execution speed.
And also how would I go about parallelizing the loop block that does the cumulative summing?
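One common way to parallelize that cumulative sum is a blocked two-pass scan: each thread scans its own chunk, a serial pass turns the per-chunk totals into offsets, and a second parallel pass adds the offsets back. The sketch below is only an illustration of that idea; the helper name prefixSum and the chunking scheme are my own, not from the post.
#include <omp.h>
#include <stdlib.h>

/* Blocked two-pass inclusive scan over vertex_cnt. */
void prefixSum(int *vertex_cnt, int numVertices, int num_threads)
{
    int *chunk_sum = calloc(num_threads + 1, sizeof(int));
    #pragma omp parallel num_threads(num_threads)
    {
        int tid = omp_get_thread_num();
        int chunk = (numVertices + num_threads - 1) / num_threads;
        int start = tid * chunk;
        int end = (start + chunk > numVertices) ? numVertices : start + chunk;

        // Pass 1: inclusive scan of this thread's own chunk.
        for (int i = start + 1; i < end; i++)
            vertex_cnt[i] += vertex_cnt[i - 1];
        if (start < end)
            chunk_sum[tid + 1] = vertex_cnt[end - 1];

        #pragma omp barrier
        #pragma omp single
        for (int t = 1; t <= num_threads; t++)   // serial scan of the chunk totals
            chunk_sum[t] += chunk_sum[t - 1];
        // The implicit barrier of single ensures the offsets are ready.

        // Pass 2: add the combined total of all preceding chunks.
        for (int i = start; i < end; i++)
            vertex_cnt[i] += chunk_sum[tid];
    }
    free(chunk_sum);
}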
I am new to OpenMP and not sure what is wrong with this code; the results are not making sense.
Thanks.
#include <omp.h>
#include <stdio.h>
#define N 20
int cnt = 0;
int A[N];
int main (int argc, char *argv[]) {
#pragma omp parallel for
for (int i = 0; i <= N; i++) {
if ((i%2)==0) cnt++;
A[i] = cnt;
printf("i=%d, cnt=%d\n", i, cnt);
}
printf("outside the parallel cnt=%d\n", cnt);
for (int i = 0; i <= N; i++)
printf("A[%d]=%d\n", i, A[i]);
}
Edit:
The cnt outside the parallel region should be 11; most of the time it was correct, but sometimes it gave me 10. For array A I understand why the values do not match the indices, but I would hope array A could end up like the following. Is that possible?
A[0]=1 A[1]=1 A[2]=2 A[3]=2 A[4]=3 A[5]=3 A[6]=4 A[7]=4 A[8]=5 A[9]=5 A[10]=6
A[11]=6 A[12]=7 A[13]=7 A[14]=8 A[15]=8 A[16]=9 A[17]=9 A[18]=10 A[19]=10
A[20]=11
Your code has multiple bugs. Let's address the silly one first. You write to N+1 elements but only allocate N elements. Change N to 21 and then change
for (int i = 0; i <= N; i++)
to
for (int i = 0; i < N; i++)
But your code has another more subtle bug. You're using an induction variable. I don't know an easy way to use induction variables with OpenMP.
In your case one easy fix is to not use an induction variable at all and instead do
#pragma omp parallel for
for (int i = 0; i < N; i++) {
int j = i / 2 + 1;
A[i] = j;
}
cnt = (N + 1) / 2;  // number of even indices in [0, N)
You can also use a reduction for the final value of cnt but it's redundant and less efficient.
#pragma omp parallel for reduction(+:cnt)
for (int i = 0; i < N; i++) {
if ((i % 2) == 0) cnt++;
int j = i / 2 + 1;
A[i] = j;
}
If you really want to use an induction variable then you have to do something like this:
#pragma omp parallel
{
int ithread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int start = ithread*N/nthreads;
int finish = (ithread + 1)*N/nthreads;
int j = start / 2;
if (start % 2) j++;
for (int i = start; i < finish; i++) {
if ((i % 2) == 0) j++;
A[i] = j;
}
}
cnt = (N + 1) / 2;  // number of even indices in [0, N)
You can also use a reduction for the final value of cnt but as is clear in the code below it's redundant.
#pragma omp parallel reduction(+:cnt)
{
int ithread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int start = ithread*N/nthreads;
int finish = (ithread + 1)*N/nthreads;
int j = start / 2;
if (start % 2) j++;
for (int i = start; i <finish; i++) {
if ((i % 2) == 0) {
j++; cnt++;
}
A[i] = j;
}
}
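For completeness, compilers that implement the OpenMP 5.0 scan directive can express this running count directly; the following is only a sketch under that assumption (reusing the global cnt and A from the question), not part of the original answer:
// Inclusive scan: cnt accumulates across iterations in parallel, and each
// A[i] observes the prefix value up to and including iteration i.
cnt = 0;
#pragma omp parallel
#pragma omp for reduction(inscan, +:cnt)
for (int i = 0; i < N; i++) {
    if ((i % 2) == 0) cnt++;        // update phase, before the scan point
    #pragma omp scan inclusive(cnt)
    A[i] = cnt;                     // read phase sees the inclusive prefix
}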