I have the following radix sort algorithm that I am trying to parallelize using OpenMP:
void radixSortEdgesBySource(struct Edge *edges_sorted, struct Edge *edges, int numVertices, int numEdges) {
    int i, j, d, c;
    int key;
    int pos;
    int maximum = 0;
    int *vertex_cnt = (int*)malloc(numVertices * sizeof(int));

    /* find the largest source vertex id */
    maximum = edges[0].src;
    for (c = 0; c < numEdges; c++)
    {
        if (edges[c].src > maximum)
        {
            maximum = edges[c].src;
        }
    }

    /* count its decimal digits (d must start at 0) */
    d = 0;
    while (maximum != 0)
    {
        maximum /= 10;
        ++d;
    }

    for (j = 1; j < d; j++)
    {
        #pragma omp parallel for num_threads(4)
        for (i = 0; i < numVertices; ++i)
            vertex_cnt[i] = 0;

        #pragma omp parallel for num_threads(4)
        for (i = 0; i < numEdges; ++i)
        {
            key = edges[i].src;
            vertex_cnt[key]++;
        }

        /* cumulative sum */
        for (i = 1; i < numVertices; ++i) {
            vertex_cnt[i] += vertex_cnt[i - 1];
        }

        #pragma omp parallel for num_threads(4)
        for (i = numEdges - 1; i >= 0; --i) {
            key = edges[i].src;
            pos = vertex_cnt[key] - 1;
            edges_sorted[pos] = edges[i];
            vertex_cnt[key]--;
        }
    }
    free(vertex_cnt);
}
I want to know whether the way I have used #pragma omp is correct, because I am not seeing any considerable change in execution speed.
Also, how would I go about parallelizing the loop that does the cumulative summing?
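One common way to parallelize the cumulative sum is a two-pass block scan: each thread scans its own contiguous block, a short serial step turns the per-thread block totals into starting offsets, and a second parallel pass adds each block's offset back in. Below is a minimal sketch (my own illustration, not from the post; the function name parallel_prefix_sum is made up). It relies on the OpenMP guarantee that two schedule(static) loops with the same iteration count in the same parallel region assign the same iterations to the same threads:

#include <stdlib.h>
#include <omp.h>

/* Two-pass parallel inclusive prefix sum over cnt[0..n-1] (sketch; no error handling). */
void parallel_prefix_sum(int *cnt, int n)
{
    int nthreads;
    int *offset;
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        #pragma omp single
        {
            nthreads = omp_get_num_threads();
            offset = calloc(nthreads + 1, sizeof(int));  /* offset[0] = 0 */
        }
        /* Pass 1: each thread scans its own block; record the block total. */
        int sum = 0;
        #pragma omp for schedule(static)
        for (int i = 0; i < n; i++) {
            sum += cnt[i];
            cnt[i] = sum;
        }
        offset[tid + 1] = sum;
        #pragma omp barrier
        /* Serial scan over the (few) per-thread block totals. */
        #pragma omp single
        {
            for (int t = 1; t <= nthreads; t++)
                offset[t] += offset[t - 1];
        }
        /* Pass 2: shift each block by the total of all blocks before it. */
        #pragma omp for schedule(static)
        for (int i = 0; i < n; i++)
            cnt[i] += offset[tid];
    }
    free(offset);
}

For an array of only numVertices counters this pays off only when the array is large; the two extra sweeps can easily cost more than the simple serial scan. Also be aware that, as written, your counting loop (vertex_cnt[key]++) and the shared key/pos variables race between threads, which affects correctness before speed.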
I have a function that I want to parallelize. This is the serial version.
void parallelCSC_SpMV(float *x, float *b)
{
    int i, j;
    for (i = 0; i < numcols; i++)
    {
        for (j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++)
        {
            b[irem[j] - 1] += xrem[j] * x[i];
        }
    }
}
I figured a decent way to do this was to have each thread write to a private copy of the b array (which does not need to be protected by a critical section because it is a private copy); after a thread is done, it copies its results into the actual b array. Here is my code.
void parallelCSC_SpMV(float *x, float *b)
{
    int i, j, k;
    #pragma omp parallel private(i, j, k)
    {
        float* b_local = (float*)malloc(sizeof(b));
        #pragma omp for nowait
        for (i = 0; i < numcols; i++)
        {
            for (j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++)
            {
                float current_add = xrem[j] * x[i];
                int index = irem[j] - 1;
                b_local[index] += current_add;
            }
        }
        for (k = 0; k < sizeof(b) / sizeof(b[0]); k++)
        {
            // Separate question: Is this if statement allowed?
            //if (b_local[k] == 0) { continue; }
            #pragma omp atomic
            b[k] += b_local[k];
        }
    }
}
However, I get a segmentation fault as a result of the second for loop. I do not need a "#pragma omp for" on that loop because I want each thread to execute it fully. If I comment out the body of that loop, there is no segmentation fault. I am not sure what the issue is.
You're probably trying to access an out-of-range position in the dynamic array b_local.
Note that sizeof(b) returns the size in bytes of a float* (the size of a pointer), not the length of the array it points to.
If you want to know the size of the array you are passing to the function, I would suggest adding it to the function's parameters.
void parallelCSC_SpMV(float *x, float *b, int b_size) {
    ...
    float* b_local = (float*)malloc(sizeof(float) * b_size);
    ...
}
And if the size of colptrs is numcols, I would be careful with colptrs[i+1], since when i = numcols - 1 that will be another out-of-range access.
First, as pointed out by Jim Cownie:
In all of these answers, b_local is uninitialised, yet you are adding
to it. You need to use calloc instead of malloc
Just to add to the accepted answer, I think you can try the following approach to avoid calling malloc inside the parallel region, and also to avoid the overhead of #pragma omp atomic.
void parallelCSC_SpMV(float *x, float *b, int b_size, int num_threads) {
    float* b_local[num_threads];
    for (int i = 0; i < num_threads; i++)
        b_local[i] = calloc(b_size, sizeof(float));

    #pragma omp parallel num_threads(num_threads)
    {
        int tid = omp_get_thread_num();
        #pragma omp for
        for (int i = 0; i < numcols; i++) {
            for (int j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++) {
                float current_add = xrem[j] * x[i];
                int index = irem[j] - 1;
                b_local[tid][index] += current_add;
            }
        }
    }
    for (int id = 0; id < num_threads; id++)
    {
        #pragma omp for simd
        for (int k = 0; k < b_size; k++)
        {
            b[k] += b_local[id][k];
        }
        free(b_local[id]);
    }
}
I have not tested the performance of this, so please feel free to do so and provide feedback.
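For anyone who does benchmark it, omp_get_wtime() is the usual way to time an OpenMP region; a minimal sketch, assuming x, b, b_size, and num_threads are already set up by the caller as above:

double t0 = omp_get_wtime();
parallelCSC_SpMV(x, b, b_size, num_threads);
double t1 = omp_get_wtime();
printf("parallelCSC_SpMV took %.6f s with %d threads\n", t1 - t0, num_threads);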
You can optimize further by reusing the original b for the master thread instead of creating a b_local for it, as follows:
void parallelCSC_SpMV(float *x, float *b, int b_size, int num_threads) {
    float* b_local[num_threads - 1];
    for (int i = 0; i < num_threads - 1; i++)
        b_local[i] = calloc(b_size, sizeof(float));

    #pragma omp parallel num_threads(num_threads)
    {
        int tid = omp_get_thread_num();
        float *thread_b = (tid == 0) ? b : b_local[tid - 1];
        #pragma omp for
        for (int i = 0; i < numcols; i++) {
            for (int j = colptrs[i] - 1; j < colptrs[i+1] - 1; j++) {
                float current_add = xrem[j] * x[i];
                int index = irem[j] - 1;
                thread_b[index] += current_add;
            }
        }
    }
    for (int id = 0; id < num_threads - 1; id++)
    {
        #pragma omp for simd
        for (int k = 0; k < b_size; k++)
        {
            b[k] += b_local[id][k];
        }
        free(b_local[id]);
    }
}
int v[10] = {2,9,1,3,5,7,1,2,0,0};
int maximo = 0;
int b = 0;
int i;

#pragma omp parallel for shared(v) private(i) reduction(max:maximo)
for (i = 0; i < 10; i++) {
    if (v[i] > maximo)
        maximo = v[i];
    b = i + 100;
}
How can I get the value that b takes during the iteration in which maximo reaches its maximum (and therefore b's value after the for loop)?
TL;DR: You can use a User-Defined Reduction.
First, instead of:
for (i = 0; i < 10; i++) {
    if (v[i] > maximo)
        maximo = v[i];
    b = i + 100;
}
you meant this:
for (i = 0; i < 10; i++) {
    if (v[i] > maximo) {
        maximo = v[i];
        b = i + 100;
    }
}
OpenMP has built-in reduction operators that handle a single target value; in your case, however, you want to reduce over two values: the max and its array index. Since OpenMP 4.0 you can create your own reduction (i.e., a User-Defined Reduction).
First, create a struct to store the two relevant values:
struct MyMax {
    int max;
    int index;
};
then we need to teach the OpenMP implementation how to reduce it:
#pragma omp declare reduction(maximo : struct MyMax : omp_out = omp_in.max > omp_out.max ? omp_in : omp_out)
we set our parallel region accordingly:
#pragma omp parallel for reduction(maximo:myMaxStruct)
for (int i = 0; i < 10; i++) {
    if (v[i] > myMaxStruct.max) {
        myMaxStruct.max = v[i];
        myMaxStruct.index = i + 100;
    }
}
Side note: you do not really need private(i); with #pragma omp parallel for, the loop index variable is implicitly private anyway.
All put together:
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

struct MyMax {
    int max;
    int index;
};

int main(void)
{
    #pragma omp declare reduction(maximo : struct MyMax : omp_out = omp_in.max > omp_out.max ? omp_in : omp_out)

    struct MyMax myMaxStruct;
    myMaxStruct.max = 0;
    myMaxStruct.index = 0;

    int v[10] = {2,9,1,3,5,7,1,2,0,0};
    #pragma omp parallel for reduction(maximo:myMaxStruct)
    for (int i = 0; i < 10; i++) {
        if (v[i] > myMaxStruct.max) {
            myMaxStruct.max = v[i];
            myMaxStruct.index = i + 100;
        }
    }
    printf("Max %d : Index %d\n", myMaxStruct.max, myMaxStruct.index);
}
OUTPUT:
Max 9 : Index 101
(Index is 101 because you have b = i + 100)
I've coded this but not compiled or tested it:
int v[10] = { 2, 9, 1, 3, 5, 7, 1, 2, 0, 0 };
int maximo = 0;
int b = 0;
int i;

int nt = omp_get_num_threads();
int bv[nt];
memset(bv, 0, sizeof(bv));  // a VLA cannot take an initializer list

#pragma omp parallel for shared(v) shared(bv) private(i) reduction(max:maximo)
for (i = 0; i < 10; i++) {
    if (v[i] > maximo) {
        maximo = v[i];
        bv[omp_get_thread_num()] = i + 100;
    }
}

for (i = 0; i < nt; ++i)
    printf("bv[%d] = %d\n", i, bv[i]);
Beware that omp_get_num_threads "returns the number of threads in the current team. In a sequential section of the program omp_get_num_threads returns 1" — so nt is 1 here.
Okay, I've recoded it [and built/run it] and it does produce one non-zero bv output:
#include <stdio.h>
#include <omp.h>

int
main(void)
{
    int v[10] = { 2, 9, 1, 3, 5, 7, 1, 2, 0, 0 };
    int i;
    int nt;
    int maximo = 0;
    int index = 0;
    int bv[32] = { 0 };
    int max[32] = { 0 };

    #pragma omp parallel shared(v, bv)
    {
        nt = omp_get_num_threads();
        int thread_id = omp_get_thread_num();
        #pragma omp for private(i)
        for (i = 0; i < 10; i++) {
            if (v[i] > max[thread_id]) {
                max[thread_id] = v[i];
                bv[thread_id] = i + 100;
            }
        }
    }

    // Reducing sequentially
    for (i = 0; i < nt; ++i) {
        if (max[i] > maximo) {
            maximo = max[i];
            index = bv[i];
        }
    }

    printf("Max %d at index %d\n", maximo, index);
    return 0;
}
Here is the program output:
Max 9 at index 101
I have a for loop in the following code.
int min = -1;
int pos;
int i;
int array[100];

for (i = 0; i < 100; i++) {
    if (array[i] < min || min == -1) {
        min = array[i];
        pos = i;
    }
}
I think the following code is a correct OpenMP implementation, but it is too slow.
int min = -1;
int pos;
int i;
int array[100];

#pragma omp parallel for default(none) shared(array, min, pos)
for (i = 0; i < 100; i++) {
    #pragma omp critical
    {
        if (array[i] < min || min == -1) {
            min = array[i];
            pos = i;
        }
    }
}
I think there could be data hazards if I put the critical section inside the condition instead of around it. Is there a smarter way to implement this? Any suggestions?
I've coded up a small parallel search function. I've only tested that it compiles, but I believe the principle is sound:
#include <stddef.h>

#define MINDIVIDE 1024

int parallelminsearch(int const *array, size_t size)
{
    int minimum;
    if (size < MINDIVIDE)
    {
        minimum = array[0];
        for (size_t i = 1; i < size; i++)
        {
            if (array[i] < minimum)
                minimum = array[i];
        }
        return minimum;
    }
    int pmin[2];
    #pragma omp parallel for
    for (size_t i = 0; i < 2; i++)
    {
        pmin[i] = parallelminsearch(&array[i*size/2], (size+1)/2);
    }
    minimum = (pmin[0] < pmin[1]) ? pmin[0] : pmin[1];
    return minimum;
}
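Note that this returns only the minimum, while the question also asks for pos. To keep the position as well, a user-defined reduction analogous to the max/index reduction shown earlier in this thread works; here is a minimal sketch (the struct, the reduction name, and the sample data are mine, and it assumes OpenMP 4.0+):

#include <limits.h>
#include <stdio.h>

struct MinPos { int min; int pos; };

/* combiner keeps whichever operand holds the smaller minimum */
#pragma omp declare reduction(minpos : struct MinPos : \
        omp_out = (omp_in.min < omp_out.min ? omp_in : omp_out)) \
        initializer(omp_priv = { INT_MAX, -1 })

int main(void)
{
    int array[100];
    for (int i = 0; i < 100; i++)
        array[i] = 100 - i;            /* sample data: minimum 1 at i = 99 */

    struct MinPos r = { INT_MAX, -1 };
    #pragma omp parallel for reduction(minpos:r)
    for (int i = 0; i < 100; i++) {
        if (array[i] < r.min) {
            r.min = array[i];
            r.pos = i;
        }
    }
    printf("min %d at pos %d\n", r.min, r.pos);
    return 0;
}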
I am new to OpenMP and not sure what is wrong with this code; the results do not make sense. Thanks.
#include <omp.h>
#include <stdio.h>

#define N 20

int cnt = 0;
int A[N];

int main(int argc, char *argv[]) {
    #pragma omp parallel for
    for (int i = 0; i <= N; i++) {
        if ((i % 2) == 0) cnt++;
        A[i] = cnt;
        printf("i=%d, cnt=%d\n", i, cnt);
    }
    printf("outside the parallel cnt=%d\n", cnt);
    for (int i = 0; i <= N; i++)
        printf("A[%d]=%d\n", i, A[i]);
}
Edit:
The cnt outside the parallel region should be 11; most of the time it was correct, but sometimes it gave me 10. For array A I understand why the values do not match the indices, but I would hope array A could end up like the following. Is that possible?
A[0]=1 A[1]=1 A[2]=2 A[3]=2 A[4]=3 A[5]=3 A[6]=4 A[7]=4 A[8]=5 A[9]=5 A[10]=6
A[11]=6 A[12]=7 A[13]=7 A[14]=8 A[15]=8 A[16]=9 A[17]=9 A[18]=10 A[19]=10
A[20]=11
Your code has multiple bugs. Let's address the silly one first: you write to N+1 elements, but A only has N elements. Change N to 21 and then change
for (int i = 0; i <= N; i++)
to
for (int i = 0; i < N; i++)
But your code has another, more subtle bug: cnt is an induction variable, so each iteration depends on the result of earlier iterations and the loop carries a dependence. I don't know an easy way to use induction variables with OpenMP.
In your case one easy fix is to not use an induction variable and instead do
#pragma omp parallel for
for (int i = 0; i < N; i++) {
    int j = i / 2 + 1;
    A[i] = j;
}
cnt = (N + 1) / 2;  // number of even indices in [0, N); N/2 is off by one for odd N
You can also use a reduction for the final value of cnt but it's redundant and less efficient.
#pragma omp parallel for reduction(+:cnt)
for (int i = 0; i < N; i++) {
    if ((i % 2) == 0) cnt++;
    int j = i / 2 + 1;
    A[i] = j;
}
If you really want to use an induction variable then you have to do something like this:
#pragma omp parallel
{
    int ithread  = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    int start  = ithread * N / nthreads;
    int finish = (ithread + 1) * N / nthreads;
    int j = start / 2;
    if (start % 2) j++;
    for (int i = start; i < finish; i++) {
        if ((i % 2) == 0) j++;
        A[i] = j;
    }
}
cnt = (N + 1) / 2;
You can also use a reduction for the final value of cnt but as is clear in the code below it's redundant.
#pragma omp parallel reduction(+:cnt)
{
    int ithread  = omp_get_thread_num();
    int nthreads = omp_get_num_threads();
    int start  = ithread * N / nthreads;
    int finish = (ithread + 1) * N / nthreads;
    int j = start / 2;
    if (start % 2) j++;
    for (int i = start; i < finish; i++) {
        if ((i % 2) == 0) {
            j++; cnt++;
        }
        A[i] = j;
    }
}
How do I parallelize this function with OpenMP in C?
int zeroRow(int **A, int n) {
    int i, j, sum, num = 0;
    for (i = 0; i < n; i++) {
        sum = 0;
        for (j = 0; j < n; j++) {
            sum += A[i][j];
        }
        if (sum == 0) {
            num++;
        }
    }
    return num;
}
I did this; please check whether it is the right procedure.
int zeroRow(int **A, int n) {
    int num = 0;
    #pragma omp parallel for reduction(+:num);
    for (int i = 0; i < n; i++) {
        int sum = 0;
        for (int j = 0; j < n; j++) {
            sum += A[i][j];
        }
        if (sum == 0) {
            num++;
        }
    }
    return num;
}
Please tell me whether what I have done is right or wrong: I have parallelized the outer loop with a reduction, so each thread gets its own copy of num.
It looks correctly parallelized. The only thing you should add is a clause specifying the data-sharing of A. You are currently relying on the default being shared; you should state it explicitly with
#pragma omp parallel for reduction(+:num) default(shared)
or
#pragma omp parallel for reduction(+:num) shared(A)
Also, do not write a semicolon (;) at the end of the pragma line — some compilers (GCC, for example) reject the stray token when OpenMP is enabled.
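For completeness, default(none) goes one step further: the compiler refuses to guess and forces you to state the sharing of every variable, which catches exactly this kind of omission. A sketch of the same function with explicit data-sharing clauses (my wording of the clauses, not from the answer):

int zeroRow(int **A, int n) {
    int num = 0;
    /* i is the loop index and j, sum are block-locals, so all three are
       implicitly private; A and n must be listed because of default(none) */
    #pragma omp parallel for reduction(+:num) default(none) shared(A, n)
    for (int i = 0; i < n; i++) {
        int sum = 0;
        for (int j = 0; j < n; j++)
            sum += A[i][j];
        if (sum == 0)
            num++;
    }
    return num;
}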