I am currently new to OpenMp and trying to write a simple OpenMP-C matrix-vector multiplication program. On increasing the matrix size to 750x750 elements, my program stops responding and the window hangs. I would like to know if that is a limitation of my laptop or is it a data-race condition I am facing.
I am trying to define a matrix A and a vector u and put random elements (0-10). Then I am calculating the vector result b.
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
int main()
{
int x_range = 50;
int y_range = 50;
int A[x_range][y_range];
int u[y_range];
int b[y_range];
printf("Measuring time resolution %g\n", omp_get_wtick());
printf("Parallel program start time %g\n", omp_get_wtime());
#pragma omp parallel num_threads(x_range)
{
int b_temp[y_range];
for (int j = 0; j < y_range; j++)
{
b_temp[j] = 0;
}
#pragma omp for
for (int i = 0; i < x_range; i++)
{
for (int j = 0; j < y_range; j++)
{
A[i][j] = (rand() % 10) + 1;
}
}
#pragma omp for
for (int j = 0; j < y_range; j++)
{
{
u[j] = (rand() % 10) + 1;
}
}
#pragma omp for
for (int i = 0; i < x_range; i++)
{
for(int j = 0; j < y_range; j++)
{
b_temp[i] = b_temp[i] + A[i][j]*u[j];
}
}
#pragma omp critical
for(int j = 0; j < y_range; j++)
{
b[j] = b[j] + b_temp[j];
}
}
printf("parallel program end time %g\n", omp_get_wtime());
return 0;
}
First off, operations you're performing cannot have data race conditions, because there's no RAW , WAR , WAW dependency. You can read more about them in wiki.
Secondly, Your system is hanging because you're creating 750 threads as dictated by x_range
I have the following radix sort algorithm that I am trying to parallelize using OpenMP:
void radixSortEdgesBySource(struct Edge *edges_sorted, struct Edge *edges, int numVertices, int numEdges) {
int i, j, d, c;
int key;
int pos;
int maximum = 0;
int *vertex_cnt = (int*)malloc(numVertices*sizeof(int));
maximum = edges[0].src;
for (c = 0; c < numEdges; c++)
{
if (edges[c].src > maximum)
{
maximum = edges[c].src;
}
}
while(maximum != 0)
{
maximum /= 10;
++d;
}
for (j = 1; j < d; j++)
{
#pragma omp parallel for num_threads(4)
for(i = 0; i < numVertices; ++i)
vertex_cnt[i] = 0;
}
#pragma omp parallel for num_threads(4)
for(i = 0; i < numEdges; ++i)
{
key = edges[i].src;
vertex_cnt[key]++;
}
for(i = 1; i < numVertices; ++i) {
vertex_cnt[i] += vertex_cnt[i - 1];
}
#pragma omp parallel for num_threads(4)
for (i = numEdges - 1; i >= 0; --i) {
key = edges[i].src;
pos = vertex_cnt[key] - 1;
edges_sorted[pos] = edges[i];
vertex_cnt[key]--;
}
}
free(vertex_cnt);
}
I want to know if the way I have used #pragma omp is correct? Because I am not really seeing any considerable change in the speed of execution.
And also how would I go about parallelizing the loop block that does the cumulative summing?
I have a for loop in the following code.
int min = -1;
int pos;
int array[100];
for(i = 0; i < 100; i++){
if(array[i] < min || min == -1){
min = array[i];
pos = i;
}
}
I think that the following code is a correct implementation with openMP but it is too slow.
int min = -1;
int pos;
int array[100];
#pragma omp parallel for default(none) shared(array, min)
for(i = 0; i < 100; i++){
#pragma omp critical
{
if(array[i] < min || min == -1){
min = array[i];
pos = i;
}
}
}
I think that could be data hazards if i put the critical section inside the condition instead of outside. There is a smart way to implement it? Some suggestions?
I've coded up a small parallel search function. I've only tested that it compiles, but I believe the principle is sound:
#include <stddef.h>
#define MINDIVIDE 1024
int parallelminsearch(int const *array, size_t size)
{
int minimum;
if (size < MINDIVIDE)
{
minimum = array[0];
for (size_t i = 1; i < size; i++)
{
if (array[i] < minimum)
minimum = array[i];
}
return minimum;
}
int pmin[2];
#pragma omp parallel for
for (size_t i = 0; i < 2; i++)
{
pmin[i] = parallelminsearch(&array[i*size/2], (size+1)/2);
}
minimum = (pmin[0] < pmin[1])?pmin[0]:pmin[1];
return minimum;
}
Have some problems with assigning parallel algorithm to prefix sum issue. I am using openMP for parallel implementation. I have the code in c as below.
Result showing:
seqsum[6] = 28 != parallelsum[6] = 34
Please advise. Thanks.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "omp.h"
#include <string.h>
#define N 10 //33554432 // 2 ^ 25
#define NUM_THREADS 4
void computeparallelprefix(int *iplist, int *_pprefixsum, unsigned long size)
{
int nthr, *z, *x = _pprefixsum;
int i, j, tid, work, lo, hi;
#pragma omp parallel shared(nthr,x,z) private(i,j,tid,work,lo,hi)
{
int prev_sum;
memcpy((void *)x, (void *)iplist, sizeof(int)*size);
// Assume nthr = 2^k
#pragma omp single
{
nthr = omp_get_num_threads();
z = malloc(sizeof(int)*nthr);
}
tid = omp_get_thread_num();
work = size /nthr + (i = tid < size%nthr ? 1 : 0);
lo = (size/nthr)*tid + (i==1 ? tid : size%nthr);
hi = lo + work;
if (hi > size)
hi = size;
// local prefix sum over x
for(i=lo+1; i<hi; i++)
x[i] += x[i-1];
// local prefix sum for tid
z[tid] = x[hi-1];
#pragma omp barrier
// global prefix sum over z
for(j=1; j<nthr; j=2*j) {
if (tid >= j)
z[tid] = z[tid] + z[tid-j];
#pragma omp barrier
}
// Update local prefix sum x
prev_sum = z[tid] - x[hi-1];
for(i=lo;i<hi;i++)
x[i] += prev_sum;
}
free(z);
}
void initlist(int *iplist, unsigned long size)
{
int i;
for ( i = 0; i < size; i++)
iplist[i] = i+1;
// iplist[i] = rand() % 13;
}
void printlist(int *list, unsigned long size)
{
int i;
for(i = 0; i < size; i++) {
printf("%d ", list[i]);
}
printf("\n");
}
void computeseqprefixsum(int *iplist, int *seqprefixsum, unsigned long size)
{
int i;
seqprefixsum[0] = iplist[0];
for(i = 1; i < size; i++) {
seqprefixsum[i] = seqprefixsum[i-1] + iplist[i];
}
}
void checkresults(int *seqsum, int *parallelsum, unsigned long size)
{
int i;
for(i = 0; i < size; i++)
{
if(seqsum[i] != parallelsum[i]) {
printf("seqsum[%d] = %d != parallelsum[%d] = %d\n", i, seqsum[i], i,
parallelsum[i]);
exit(1);
}
}
}
int main(int argc, char *argv[])
{
// seed the rand generator
srand(time(NULL));
double seqstart, seqend, parstart, parend, seqtime, partime;
// initialize list
int *iplist, *seqprefixsum, *pprefixsum ;
iplist = (int*) malloc(sizeof(int) * N);
seqprefixsum = (int*) malloc(sizeof(int) * N);
pprefixsum = (int*) malloc(sizeof(int) * N);
if(iplist == NULL || seqprefixsum == NULL || pprefixsum == NULL) {
printf("memory cannot be allocated\n");
exit(1);
}
initlist(iplist, N);
seqstart = omp_get_wtime();
computeseqprefixsum(iplist, seqprefixsum, N);
seqend = omp_get_wtime();
seqtime = seqend - seqstart;
omp_set_num_threads(NUM_THREADS);
parstart = omp_get_wtime();
computeparallelprefix(iplist, pprefixsum, N);
parend= omp_get_wtime();
partime = parend - parstart;
checkresults(seqprefixsum, pprefixsum, N);
printf("Seq Time : %f, Par Time : %f, Speedup : %f\n", seqtime, partime,
seqtime/partime);
free(iplist); free(seqprefixsum); free(pprefixsum);
return 0;
}
You have the right idea for the prefix sum with your code.
I'm not sure exactly why you don't get the correct result but I cleaned up your code and my version gets the correct result. See the following question for more details parallel-cumulative-prefix-sums-in-openmp-communicating-values-between-thread
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "omp.h"
#define N 11 //33554432 // 2 ^ 25
void computeparallelprefix(int *iplist, int *_pprefixsum, unsigned long size)
{
int nthr, *z, *x = _pprefixsum;
#pragma omp parallel
{
int i;
#pragma omp single
{
nthr = omp_get_num_threads();
z = malloc(sizeof(int)*nthr+1);
z[0] = 0;
}
int tid = omp_get_thread_num();
int sum = 0;
#pragma omp for schedule(static)
for(i=0; i<size; i++) {
sum += iplist[i];
x[i] = sum;
}
z[tid+1] = sum;
#pragma omp barrier
int offset = 0;
for(i=0; i<(tid+1); i++) {
offset += z[i];
}
#pragma omp for schedule(static)
for(i=0; i<size; i++) {
x[i] += offset;
}
}
free(z);
}
int main(void ) {
int *iplist, *pprefixsum ;
iplist = (int*) malloc(sizeof(int) * N);
pprefixsum = (int*) malloc(sizeof(int) * N);
for(int i=0; i<N; i++) iplist[i] = i+1;
for(int i=0; i<N; i++) printf("%d ", iplist[i]); printf("\n");
computeparallelprefix(iplist, pprefixsum, N);
for(int i=0; i<N; i++) printf("%d ", pprefixsum[i]); printf("\n");
for(int i=0; i<N; i++) printf("%d ", (i+1)*(i+2)/2); printf("\n");
return 0;
}
I am new to the OpenMP, not sure what was wrong with this code, the results are not making sense.
Thanks.
#include <omp.h>
#include <stdio.h>
#define N 20
int cnt = 0;
int A[N];
int main (int argc, char *argv[]) {
#pragma omp parallel for
for (int i = 0; i <= N; i++) {
if ((i%2)==0) cnt++;
A[i] = cnt;
printf("i=%d, cnt=%d\n", i, cnt);
}
printf("outside the parallel cnt=%d\n", cnt);
for (int i = 0; i <= N; i++)
printf("A[%d]=%d\n", i, A[i]);
}
Edit:
the cnt outside the parallel region should be 11, most time it was correct, but sometime it gave me 10. For array A I understand why the values do not match with the indices, but I would hope the array A be like this following, is it possible ?
A[0]=1 A[1]=1 A[2]=2 A[3]=2 A[4]=3 A[5]=3 A[6]=4 A[7]=4 A[8]=5 A[9]=5 A[10]=6
A[11]=6 A[12]=7 A[13]=7 A[14]=8 A[15]=8 A[16]=9 A[17]=9 A[18]=10 A[19]=10
A[20]=11
Your code has multiple bugs. Let's address the silly one first. You write to N+1 elements but only allocate N elements. Change N to 21 and then change
for (int i = 0; i <= N; i++)
to
for (int i = 0; i < N; i++)
But your code has another more subtle bug. You're using an induction variable. I don't know an easy way to use induction variables with OpenMP.
In your case one easy fix is not use an induction variable and instead do
#pragma omp parallel for
for (int i = 0; i < N; i++) {
int j = i / 2 + 1;
A[i] = j;
}
cnt = N/2;
You can also use a reduction for the final value of cnt but it's redundant and less efficient.
#pragma omp parallel for reduction(+:cnt)
for (int i = 0; i < N; i++) {
if ((i % 2) == 0) cnt++;
int j = i / 2 + 1;
A[i] = j;
}
If you really want to use an induction variable then you have to do something like this:
#pragma omp parallel
{
int ithread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int start = ithread*N/nthreads;
int finish = (ithread + 1)*N/nthreads;
int j = start / 2;
if (start % 2) j++;
for (int i = start; i < finish; i++) {
if ((i % 2) == 0) j++;
A[i] = j;
}
}
cnt = N/2;
You can also use a reduction for the final value of cnt but as is clear in the code below it's redundant.
#pragma omp parallel reduction(+:cnt)
{
int ithread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int start = ithread*N/nthreads;
int finish = (ithread + 1)*N/nthreads;
int j = start / 2;
if (start % 2) j++;
for (int i = start; i <finish; i++) {
if ((i % 2) == 0) {
j++; cnt++;
}
A[i] = j;
}
}