I have a for loop in the following code.
int min = -1;
int pos;
int array[100];
for(i = 0; i < 100; i++){
if(array[i] < min || min == -1){
min = array[i];
pos = i;
}
}
I think that the following code is a correct implementation with openMP but it is too slow.
int min = -1;
int pos;
int array[100];
#pragma omp parallel for default(none) shared(array, min)
for(i = 0; i < 100; i++){
#pragma omp critical
{
if(array[i] < min || min == -1){
min = array[i];
pos = i;
}
}
}
I think that could be data hazards if i put the critical section inside the condition instead of outside. There is a smart way to implement it? Some suggestions?
I've coded up a small parallel search function. I've only tested that it compiles, but I believe the principle is sound:
#include <stddef.h>
#define MINDIVIDE 1024
int parallelminsearch(int const *array, size_t size)
{
int minimum;
if (size < MINDIVIDE)
{
minimum = array[0];
for (size_t i = 1; i < size; i++)
{
if (array[i] < minimum)
minimum = array[i];
}
return minimum;
}
int pmin[2];
#pragma omp parallel for
for (size_t i = 0; i < 2; i++)
{
pmin[i] = parallelminsearch(&array[i*size/2], (size+1)/2);
}
minimum = (pmin[0] < pmin[1])?pmin[0]:pmin[1];
return minimum;
}
Related
I need to write a program that iterates through all possible combinations for a base-2 (binary) vector. If the size of this vector is 3 you can do this with three nested loops, like this:
bool array[3];
for(int i = 0; i < 2; i++)
{
for(int j = 0; j < 2; j++)
{
for(int k = 0; k < 2; k++)
{
array[0] = i;
array[1] = j;
array[2] = k;
}
}
}
But the problem is that in my application, the array size is variable and can basically be any number. If I'm looking to find all values of a 12-bit vector, I don't want to write 12 nested loops and so it is not maintainable to use the code above. Instead I have come up with the following solution:
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#define SIZE 12
int main(void)
{
bool array[SIZE];
for(int i = 0; i < SIZE; i++) array[i] = 1;
int max_num = pow(2, SIZE);
for(int i = 0; i < max_num; i++)
{
if(array[0] == 0) array[0]++;
else
{
array[0] = 0;
for(int j = 1; j < SIZE; j++)
{
if(array[j] == 1) array[j] = 0;
else
{
array[j] = 1;
break;
}
}
}
for(int j = 0; j < SIZE; j++)
{
printf("%d", array[j]);
if(j != SIZE - 1) printf(", ");
else printf("\n");
}
}
}
This still seems as a lot of code to me for such a relatively simple thing. My question is: is there a more efficient way to do this?
What you are doing with the array is effectively incrementing (adding one) to the number represented by the array.
Let's leave the incrementing to the compiler and use bits from the integer.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#define SIZE 12
int main(void)
{
bool array[SIZE];
int max_num = 1 << SIZE;
for(int i = 0; i < max_num; i++)
{
for(int j = 0; j < SIZE; j++)
{
array[j] = (i >> j) & 1;
}
for(int j = 0; j < SIZE; j++)
{
printf("%d", array[j]);
if(j != SIZE - 1) printf(", ");
else printf("\n");
}
}
}
As pointed out by others, it is essentially incrementing a binary number. However, in keeping with the spirit of the original code, I decided not to "cheat" by using native addition/increment operators to increment the vector, and came up with the following:
#include <stddef.h>
#include <stdbool.h>
bool first(size_t size, bool array[size])
{
size_t i;
for (i = 0; i < size; i++)
{
array[i] = 0;
}
return i > 0;
}
bool next(size_t size, bool array[size])
{
size_t i;
for (i = 0; i < size && array[i]; i++)
{
array[i] = 0;
}
if (i < size)
{
array[i] = 1;
return 1;
}
return 0;
}
#include <stdio.h>
int main(void)
{
enum { SIZE = 12 };
bool array[SIZE];
bool going;
for (going = first(SIZE, array); going; going = next(SIZE, array))
{
size_t i;
for (i = 0; i < SIZE - 1; i++)
{
printf("%d, ", array[i]);
}
printf("%d\n", array[i]);
}
return 0;
}
It could be adapted to work in other bases easily:
#include <stddef.h>
#include <stdbool.h>
bool first(size_t size, unsigned int array[size])
{
size_t i;
for (i = 0; i < size; i++)
{
array[i] = 0;
}
return i > 0;
}
bool next(size_t size, unsigned int array[size], unsigned int base)
{
size_t i;
for (i = 0; i < size && array[i] == base - 1; i++)
{
array[i] = 0;
}
if (i < size)
{
array[i]++;
return 1;
}
return 0;
}
#include <stdio.h>
int main(void)
{
enum { SIZE = 5 };
enum { BASE = 3 };
unsigned int array[SIZE];
bool going;
for (going = first(SIZE, array); going; going = next(SIZE, array, BASE))
{
size_t i;
for (i = 0; i < SIZE - 1; i++)
{
printf("%u, ", array[i]);
}
printf("%u\n", array[i]);
}
return 0;
}
I have implemented a Counting Sort in an assignment given to us by a teacher but sometimes it doesn't work for large arrays.
Here is the code:
void countingSort(int *t, int n) {
int min = findMin(t, n);
int max = findMax(t, n);
int range = max - min + 1;
int *count, *output;
int i;
count = (int *)malloc(range * sizeof(int));
output = (int *)malloc(n * sizeof(int));
for (i = 0; i < range; i++) {
count[i] = 0;
}
for (i = 0; i < n; i++) {
count[t[i] - min]++;
}
for (i = 1; i < range; i++) {
count[i] += count[i - 1];
}
for (i = n - 1; i >= 0; i--) {
output[count[t[i] - min] - 1] = t[i];
count[t[i] - min]--;
}
for (i = 0; i < n; i++) {
t[i] = output[i];
}
}
What's wrong with my code?
Your code seems to work for small values of range, but might fail if min and max are too far apart, causing the computation of range to overflow the range of int and malloc() to fail.
You should check for overflow in range and check memory allocation success. Note too that calloc() is more appropriate than malloc() for the count array. Finally, you must free the allocated arrays.
Here is a modified version:
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
int findMax(const int *t, int n) {
int max = INT_MIN;
while (n-- > 0) {
if (max < *t) max = *t;
t++;
}
return max;
}
int findMin(const int *t, int n) {
int min = INT_MAX;
while (n-- > 0) {
if (min > *t) min = *t;
t++;
}
return min;
}
int countingSort(int *t, int n) {
int min, max, range, i;
int *count, *output;
if (n <= 0)
return 0;
min = findMin(t, n);
max = findMax(t, n);
if (min < 0 && max >= 0 && (unsigned)max + (unsigned)(-min) >= INT_MAX) {
fprintf(stderr, "countingSort: value range too large: %d..%d\n", min, max);
return -1;
}
range = max - min + 1;
if ((count = (int *)calloc(range, sizeof(int))) == NULL) {
fprintf(stderr, "countingSort: cannot allocate %d element count array\n", range);
return -1;
}
if ((output = (int *)malloc(n * sizeof(int))) == NULL) {
fprintf(stderr, "countingSort: cannot allocate %d element output array\n", n);
free(count);
return -1;
}
for (i = 0; i < n; i++) {
count[t[i] - min]++;
}
for (i = 1; i < range; i++) {
count[i] += count[i - 1];
}
for (i = n; i-- > 0;) {
output[count[t[i] - min] - 1] = t[i];
count[t[i] - min]--;
}
for (i = 0; i < n; i++) {
t[i] = output[i];
}
free(count);
free(output);
return 0;
}
You can avoid the cumbersome and potentially inefficient downward loop by replacing the second and third for loops with this:
/* compute the first index for each value */
int index = 0;
for (i = 0; i < range; i++) {
incr = count[i];
count[i] = index;
index += incr;
}
/* copy each value at the corresponding index and update it */
for (i = 0; i < n; i++) {
output[count[t[i] - min]++] = t[i];
}
I have the following radix sort algorithm that I am trying to parallelize using OpenMP:
void radixSortEdgesBySource(struct Edge *edges_sorted, struct Edge *edges, int numVertices, int numEdges) {
int i, j, d, c;
int key;
int pos;
int maximum = 0;
int *vertex_cnt = (int*)malloc(numVertices*sizeof(int));
maximum = edges[0].src;
for (c = 0; c < numEdges; c++)
{
if (edges[c].src > maximum)
{
maximum = edges[c].src;
}
}
while(maximum != 0)
{
maximum /= 10;
++d;
}
for (j = 1; j < d; j++)
{
#pragma omp parallel for num_threads(4)
for(i = 0; i < numVertices; ++i)
vertex_cnt[i] = 0;
}
#pragma omp parallel for num_threads(4)
for(i = 0; i < numEdges; ++i)
{
key = edges[i].src;
vertex_cnt[key]++;
}
for(i = 1; i < numVertices; ++i) {
vertex_cnt[i] += vertex_cnt[i - 1];
}
#pragma omp parallel for num_threads(4)
for (i = numEdges - 1; i >= 0; --i) {
key = edges[i].src;
pos = vertex_cnt[key] - 1;
edges_sorted[pos] = edges[i];
vertex_cnt[key]--;
}
}
free(vertex_cnt);
}
I want to know if the way I have used #pragma omp is correct? Because I am not really seeing any considerable change in the speed of execution.
And also how would I go about parallelizing the loop block that does the cumulative summing?
I'm working on a project (written in C) involving matrix factorization and I need some help.
My objective is to allocate memory for an upper triangular matrix and I want to access it via algebraic row and column notation (i.e. i,j in {1,2,...,n} instead of i,j in {0,1,...,n-1}).
For example, in a 5x5 matrix I should be able to access the [3][4] element if I input matrix[3][4].
My code for a non-algebraic index upper triangular matrix looks like this:
double** malloc_sup_matrix (int n)
{
double** L;
int i;
L = (double**)malloc((n)*sizeof(double*));
if(L == NULL)
printerror("allocating space for the matrix (rows).");
for(i = 0; i < n; i++)
{
L[i] = (double*)malloc((n-i)*sizeof(double));
if(L[i] == NULL)
printerror("allocating space for the matrix (cols).");
L[i]-=i;
}
return L;
}
My code for the algebraic index one (I'm not checking if the allocated space is null yet, I'll do it when I stop messing around with this):
int** m;
int i, n;
n = 10;
m = (int**)malloc((n+1)*sizeof(int*));
for(i = 0; i < n; i++)
{
m[i] = (int*)calloc((n+1)-(i),sizeof(int));
m[i] -= i;
}
m--;
for(i = 0; i < n; i++)
{
m[i]--;
}
It works just the way I want it, but I have issues when freeing the space I've used. This is the way I'm doing it:
for(i = 1; i <= n; i++)
{
m[i]++;
}
for(i = 0; i < n; i++)
{
m[i] += (i);
free(m[i]);
}
m++;
free(m);
Do you guys have any suggestions? Thank you so much in advance ^^.
There's a problem on this line:
m--;
for(i = 0; i < n; i++)
{
m[i]--;
}
You're decrementing m, but then go ahead and index it from 0 ... I guess you may end up messing up the heap structures.
I managed to have your code valgrind error-free like this:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
int** m;
int i, j, n;
n = 10;
m = (int**)malloc((n+1)*sizeof(int*));
for(i = 0; i < n; i++)
{
m[i] = (int*)calloc((n+1)-(i), sizeof(int));
m[i] -= i;
}
for(i = 0; i < n; i++)
{
m[i]--;
}
m--;
/* Access it like m[1][1] ... m[n][n], m[i][j] (with i <= j) */
/*
for (i = 1; i <= n; i++) {
for (j = i; j <= n; j++) {
m[i][j] = i+j;
}
}
*/
m++;
for(i = 0; i < n; i++)
{
m[i]++;
}
for(i = 0; i < n; i++)
{
m[i] += (i);
free(m[i]);
}
free(m);
return 0;
}
I am new to the OpenMP, not sure what was wrong with this code, the results are not making sense.
Thanks.
#include <omp.h>
#include <stdio.h>
#define N 20
int cnt = 0;
int A[N];
int main (int argc, char *argv[]) {
#pragma omp parallel for
for (int i = 0; i <= N; i++) {
if ((i%2)==0) cnt++;
A[i] = cnt;
printf("i=%d, cnt=%d\n", i, cnt);
}
printf("outside the parallel cnt=%d\n", cnt);
for (int i = 0; i <= N; i++)
printf("A[%d]=%d\n", i, A[i]);
}
Edit:
the cnt outside the parallel region should be 11, most time it was correct, but sometime it gave me 10. For array A I understand why the values do not match with the indices, but I would hope the array A be like this following, is it possible ?
A[0]=1 A[1]=1 A[2]=2 A[3]=2 A[4]=3 A[5]=3 A[6]=4 A[7]=4 A[8]=5 A[9]=5 A[10]=6
A[11]=6 A[12]=7 A[13]=7 A[14]=8 A[15]=8 A[16]=9 A[17]=9 A[18]=10 A[19]=10
A[20]=11
Your code has multiple bugs. Let's address the silly one first. You write to N+1 elements but only allocate N elements. Change N to 21 and then change
for (int i = 0; i <= N; i++)
to
for (int i = 0; i < N; i++)
But your code has another more subtle bug. You're using an induction variable. I don't know an easy way to use induction variables with OpenMP.
In your case one easy fix is not use an induction variable and instead do
#pragma omp parallel for
for (int i = 0; i < N; i++) {
int j = i / 2 + 1;
A[i] = j;
}
cnt = N/2;
You can also use a reduction for the final value of cnt but it's redundant and less efficient.
#pragma omp parallel for reduction(+:cnt)
for (int i = 0; i < N; i++) {
if ((i % 2) == 0) cnt++;
int j = i / 2 + 1;
A[i] = j;
}
If you really want to use an induction variable then you have to do something like this:
#pragma omp parallel
{
int ithread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int start = ithread*N/nthreads;
int finish = (ithread + 1)*N/nthreads;
int j = start / 2;
if (start % 2) j++;
for (int i = start; i < finish; i++) {
if ((i % 2) == 0) j++;
A[i] = j;
}
}
cnt = N/2;
You can also use a reduction for the final value of cnt but as is clear in the code below it's redundant.
#pragma omp parallel reduction(+:cnt)
{
int ithread = omp_get_thread_num();
int nthreads = omp_get_num_threads();
int start = ithread*N/nthreads;
int finish = (ithread + 1)*N/nthreads;
int j = start / 2;
if (start % 2) j++;
for (int i = start; i <finish; i++) {
if ((i % 2) == 0) {
j++; cnt++;
}
A[i] = j;
}
}