OpenMP: Parallel QuickSort in C

I'm trying to use OpenMP to parallelize QuickSort in both the partition part and the recursive QuickSort part. My C code is as follows:
#include "stdlib.h"
#include "stdio.h"
#include "omp.h"
// parallel partition
int ParPartition(int *a, int p, int r) {
int b[r-p];
int key = *(a+r); // use the last element in the array as the pivot
int lt[r-p]; // mark 1 at the position where its element is smaller than the key, else 0
int gt[r-p]; // mark 1 at the position where its element is bigger than the key, else 0
int cnt_lt = 0; // count 1 in the lt array
int cnt_gt = 0; // count 1 in the gt array
int j=p;
int k = 0; // the position of the pivot
// deal with gt and lt array
#pragma omp parallel for
for ( j=p; j<r; ++j) {
b[j-p] = *(a+j);
if (*(a+j) < key) {
lt[j-p] = 1;
gt[j-p] = 0;
} else {
lt[j-p] = 0;
gt[j-p] = 1;
}
}
// calculate the new position of the elements
for ( j=0; j<(r-p); ++j) {
if (lt[j]) {
++cnt_lt;
lt[j] = cnt_lt;
} else
lt[j] = cnt_lt;
if (gt[j]) {
++cnt_gt;
gt[j] = cnt_gt;
} else
gt[j] = cnt_gt;
}
// move the pivot
k = lt[r-p-1];
*(a+p+k) = key;
// move elements to their new positon
#pragma omp parallel for
for ( j=p; j<r; ++j) {
if (b[j-p] < key)
*(a+p+lt[j-p]-1) = b[j-p];
else if (b[j-p] > key)
*(a+k+gt[j-p]) = b[j-p];
}
return (k+p);
}
void ParQuickSort(int *a, int p, int r) {
int q;
if (p<r) {
q = ParPartition(a, p, r);
#pragma omp parallel sections
{
#pragma omp section
ParQuickSort(a, p, q-1);
#pragma omp section
ParQuickSort(a, q+1, r);
}
}
}
int main() {
int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
ParQuickSort(a, 0, 9);
int i=0;
for (; i!=10; ++i)
printf("%d\t", a[i]);
printf("\n");
return 0;
}
For the example in the main function, the sorting result is:
0 9 9 2 2 2 6 7 7 7
I used gdb to debug. In the early recursions, all went well, but in some later recursions the array suddenly started to contain duplicate elements, which produced the result above.
Can someone help me figure out where the problem is?

I decided to post this answer because:
the accepted answer is wrong, and its author seems inactive these days. There is a race condition in:
#pragma omp parallel for
for(i = p; i < r; i++){
if(a[i] < a[r]){
lt[lt_n++] = a[i]; //<- race condition lt_n is shared
}else{
gt[gt_n++] = a[i]; //<- race condition gt_n is shared
}
}
Nonetheless, even if it were correct, the modern answer to this question is to use OpenMP tasks instead of sections.
I am providing the community with a full runnable example of this approach, including testing and profiling.
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#define TASK_SIZE 100
unsigned int rand_interval(unsigned int min, unsigned int max)
{
// https://stackoverflow.com/questions/2509679/
int r;
const unsigned int range = 1 + max - min;
const unsigned int buckets = RAND_MAX / range;
const unsigned int limit = buckets * range;
do
{
r = rand();
}
while (r >= limit);
return min + (r / buckets);
}
void fillupRandomly (int *m, int size, unsigned int min, unsigned int max){
for (int i = 0; i < size; i++)
m[i] = rand_interval(min, max);
}
void init(int *a, int size){
for(int i = 0; i < size; i++)
a[i] = 0;
}
void printArray(int *a, int size){
for(int i = 0; i < size; i++)
printf("%d ", a[i]);
printf("\n");
}
int isSorted(int *a, int size){
for(int i = 0; i < size - 1; i++)
if(a[i] > a[i + 1])
return 0;
return 1;
}
int partition(int * a, int p, int r)
{
int lt[r-p];
int gt[r-p];
int i;
int j;
int key = a[r];
int lt_n = 0;
int gt_n = 0;
for(i = p; i < r; i++){
if(a[i] < a[r]){
lt[lt_n++] = a[i];
}else{
gt[gt_n++] = a[i];
}
}
for(i = 0; i < lt_n; i++){
a[p + i] = lt[i];
}
a[p + lt_n] = key;
for(j = 0; j < gt_n; j++){
a[p + lt_n + j + 1] = gt[j];
}
return p + lt_n;
}
void quicksort(int * a, int p, int r)
{
int div;
if(p < r){
div = partition(a, p, r);
#pragma omp task shared(a) if(r - p > TASK_SIZE)
quicksort(a, p, div - 1);
#pragma omp task shared(a) if(r - p > TASK_SIZE)
quicksort(a, div + 1, r);
}
}
int main(int argc, char *argv[])
{
srand(123456);
int N = (argc > 1) ? atoi(argv[1]) : 10;
int print = (argc > 2) ? atoi(argv[2]) : 0;
int numThreads = (argc > 3) ? atoi(argv[3]) : 2;
int *X = malloc(N * sizeof(int));
int *tmp = malloc(N * sizeof(int));
omp_set_dynamic(0); /** Explicitly disable dynamic teams **/
omp_set_num_threads(numThreads); /** Use N threads for all parallel regions **/
// Deal with failed memory allocation
if(!X || !tmp)
{
if(X) free(X);
if(tmp) free(tmp);
return (EXIT_FAILURE);
}
fillupRandomly (X, N, 0, 5);
double begin = omp_get_wtime();
#pragma omp parallel
{
#pragma omp single
quicksort(X, 0, N - 1); // r is an inclusive index; passing N would read and write past the end of X
}
double end = omp_get_wtime();
printf("Time: %f (s) \n",end-begin);
assert(1 == isSorted(X, N));
if(print){
printArray(X, N);
}
free(X);
free(tmp);
return (EXIT_SUCCESS);
}
How to run:
This program accepts three parameters:
The size of the array;
Whether to print the array: 0 for no, anything else for yes;
The number of threads to run in parallel.
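For example, assuming the code above is saved as quicksort_tasks.c (the file name is just an assumption), it can be compiled and run with 4 threads on an array of 100000 elements like this:
gcc quicksort_tasks.c -o quicksort_tasks -fopenmp -Wall -O2
./quicksort_tasks 100000 0 4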
Mini Benchmark
On a 4-core machine, with an input of 100000:
1 Thread -> Time: 0.784504 (s)
2 Threads -> Time: 0.424008 (s) ~ speedup 1.85x
4 Threads -> Time: 0.282944 (s) ~ speedup 2.77x

Sorry about my first comment; it wasn't relevant to your problem. I haven't found the actual bug in your code (maybe it is in the part that moves the elements). Following your approach, I wrote a similar program, and it works fine. (I am also new to OpenMP.)
#include <stdio.h>
#include <stdlib.h>
int partition(int * a, int p, int r)
{
int lt[r-p];
int gt[r-p];
int i;
int j;
int key = a[r];
int lt_n = 0;
int gt_n = 0;
#pragma omp parallel for
for(i = p; i < r; i++){
if(a[i] < a[r]){
lt[lt_n++] = a[i];
}else{
gt[gt_n++] = a[i];
}
}
for(i = 0; i < lt_n; i++){
a[p + i] = lt[i];
}
a[p + lt_n] = key;
for(j = 0; j < gt_n; j++){
a[p + lt_n + j + 1] = gt[j];
}
return p + lt_n;
}
void quicksort(int * a, int p, int r)
{
int div;
if(p < r){
div = partition(a, p, r);
#pragma omp parallel sections
{
#pragma omp section
quicksort(a, p, div - 1);
#pragma omp section
quicksort(a, div + 1, r);
}
}
}
int main(void)
{
int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
int i;
quicksort(a, 0, 9);
for(i = 0;i < 10; i++){
printf("%d\t", a[i]);
}
printf("\n");
return 0;
}

I've implemented parallel quicksort in a production environment, although with concurrent processes (i.e. fork() and join()) and not OpenMP. I also found a pretty good pthread solution, but a concurrent process solution was the best in terms of worst-case runtime. Let me start by saying that it doesn't seem like you're making copies of your input array for each thread, so you'll definitely encounter race conditions which can corrupt your data.
Essentially, what is happening is that you have created an array a of N elements in shared memory, and when you use #pragma omp parallel sections, you spawn as many worker threads as there are #pragma omp sections. Each time a worker thread tries to access and modify an element of a, it executes a series of instructions: "read the n-th value of a from the given address", "modify the n-th value of a", "write the n-th value of a back to the given address". Since you have multiple threads with no locking or synchronization, the read, modify, and write instructions may be interleaved in any order across processors, so the threads may overwrite each other's modifications or read stale values.
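As a side note, one way to remove that particular read-modify-write race on a shared counter is to have each thread reserve a unique slot with an atomic capture before writing to it. The following is a minimal, self-contained sketch (not the original poster's code, and not a performance recommendation); note that the order of elements within lt and gt becomes nondeterministic, which is harmless for a partition but means the layout can differ between runs:
#include <stdio.h>

int main(void)
{
    int a[8] = {5, 3, 8, 4, 0, 9, 2, 1};
    int lt[8], gt[8];
    int key = 4;
    int lt_n = 0, gt_n = 0;

    #pragma omp parallel for
    for (int i = 0; i < 8; i++) {
        int idx;
        if (a[i] < key) {
            #pragma omp atomic capture
            idx = lt_n++;            /* reserve a unique slot in lt */
            lt[idx] = a[i];
        } else {
            #pragma omp atomic capture
            idx = gt_n++;            /* reserve a unique slot in gt */
            gt[idx] = a[i];
        }
    }
    printf("%d elements below the key, %d at or above it\n", lt_n, gt_n);
    return 0;
}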
The best solution that I found (after many weeks of testing and benchmarking many solutions that I came up with) is to subdivide the list log(n) times, where n is the number of processors. For example, if you have a quad core machine (n = 4), subdivide the list 2 times (log(4) = 2) choosing pivots that are the medians of the data set. It is important that the pivots are medians, because otherwise you can end up with a case where a poorly chosen pivot causes the lists to be distributed unevenly amongst processes. Then each process does quicksort on its local subarray, then merges its results with the results of other processes. This is called "hyperquicksort", and from an initial github search, I found this. I can't vouch for the code in there, and can't publish any of the code that I wrote since it is protected under an NDA.
By the way, one of the best parallel sorting algorithms is PSRS (Parallel Sorting by Regular Sampling), which keeps list sizes more balanced amongst processes, doesn't unnecessarily communicate keys between processes, and can work with an arbitrary number of concurrent processes (it doesn't have to be a power of 2).

Related

OpenMP parallel multiplication slower than Sequential multiplication

I'm learning OpenMP and I'm trying to do a simple task: A[r][c] * X[c] = B[r] (matrix vector multiplication).
The problem is that the sequential code is faster than the parallel version, and I don't know why!
My code:
#include <omp.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/types.h>
// Defined variables
#define row_matriz_A 80000
#define col_matriz_A 800
#define THREADS_NUM 4
// FUNCTION - GENERATE MATRICES
void gerarMatrizes(int r, int c, int mA[], int vX[], int vB[]){...}
// FUNCTION - SEQUENTIAL MULTIPLICATION
void multSequencial(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int i, j, offset, sum;
struct timeval tv1,tv2;
double t1, t2;
// Begin Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao sequencial foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCTION - PARALLEL MULTIPLICATION WITH OpenMP
void matvecHost(int r, int c, int mA[], int vX[], int vB[]){
// Variables
int tID, i, j, offset, sum;
struct timeval tv1, tv2;
double t1, t2;
// Init vB
for(i = 0; i < r; i++) vB[i] = 0;
// BEGIN Time
gettimeofday(&tv1, NULL);
t1 = (double)(tv1.tv_sec) + (double)(tv1.tv_usec)/ 1000000.00;
omp_set_num_threads(THREADS_NUM);
#pragma omp parallel private(tID, i, j) shared(mA, vB, vX)
{
tID = omp_get_thread_num();
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
offset = i * c + j;
sum += mA[offset] * vX[j];
}
vB[i] = sum;
}
}
// End time
gettimeofday(&tv2, NULL);
t2 = (double)(tv2.tv_sec) + (double)(tv2.tv_usec)/ 1000000.00;
printf("\nO tempo de execucao OpenMP foi: %lf segundos.\n", (t2 - t1));
return;
}
// FUNCTION - MAIN
int main(int argc, char * argv[]) {
int row, col;
row = row_matriz_A;
col = col_matriz_A;
int *matrizA = (int *)calloc(row * col, sizeof(int));
int *vectorX = (int *)calloc(col * 1, sizeof(int));
int *vectorB = (int *)calloc(row * 1, sizeof(int));
gerarMatrizes(row, col, matrizA, vectorX, vectorB);
multSequencial(row, col, matrizA, vectorX, vectorB);
matvecHost(row, col, matrizA, vectorX, vectorB);
return 0;
}
Previous attempts that did not work:
Use collapse on my nested for loops
Increase the row and column sizes
Increase the number of threads (a teacher recommended using a thread count equal to the number of physical cores)
Use malloc instead of m[i][j]
EDIT - ANSWER
My parallel block was corrected based on the accepted answer:
#pragma omp parallel private(i, j, sum) shared(mA, vB, vX)
{
#pragma omp for
for(i = 0; i < r; i++){
sum = 0;
for(j = 0; j < c; j++){
sum += mA[i * c + j] * vX[j];
}
vB[i] = sum;
}
}
I still have a doubt:
If I define i, j, and sum inside my parallel block, will they be made private automatically? Does this improve the speed of my code or not?
You have race conditions on sum and offset - those are shared between the threads instead of being thread-private.
This also likely explains the slowdown: On x86, the CPU will actually work hard to make sure accesses to shared variables "work". This involves flushing cache lines after every (!) write to offset and sum - so all the threads are wildly writing into the same variables, but each one has to wait until the write from the previous thread (on a different core) has arrived in the local cache again after having been flushed. And of course it will produce completely nonsensical results.
I don't know why you are declaring all your variables at the start of the function - that's prone to this kind of mistake. If you declared i, j, sum, and offset (and the unused tID) in the smallest possible scopes instead, you would never have had this problem, because they would be thread-private automatically.
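As a minimal sketch of that advice (the function name matvecScoped and the exact signature are just placeholders adapted from the question), declaring the variables inside the loop makes them thread-private without any private() clause:
void matvecScoped(int r, int c, int mA[], int vX[], int vB[]) {
    #pragma omp parallel for shared(mA, vX, vB)
    for (int i = 0; i < r; i++) {         /* loop variable: private by definition */
        int sum = 0;                      /* declared inside the loop: private */
        for (int j = 0; j < c; j++)       /* same for j */
            sum += mA[i * c + j] * vX[j];
        vB[i] = sum;
    }
}
This also answers the follow-up doubt above: variables declared inside the parallel region (or inside the loop body) are private to each thread automatically.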

OpenMP segmentation fault

Recently I have been working on a C OpenMP code that implements affinity scheduling. Basically, after a thread has finished its assigned iterations, it starts looking for the thread with the largest remaining workload and steals some jobs from it.
Everything compiles fine with icc. However, when I try to run it, it gives me a segmentation fault (core dumped). The funny thing is that the error does not always happen: even if I get an error on the first run, the program sometimes works when I run it again. This is very weird to me. I wonder what I did wrong in my code and how to fix the problem. Thank you. I only modified the functions runloop and affinity; the others were given at the start and work fine.
#include <stdio.h>
#include <math.h>
#define N 729
#define reps 1000
#include <omp.h>
double a[N][N], b[N][N], c[N];
int jmax[N];
void init1(void);
void init2(void);
void runloop(int);
void loop1chunk(int, int);
void loop2chunk(int, int);
void valid1(void);
void valid2(void);
int affinity(int*, int*, int, int, float, int*, int*);
int main(int argc, char *argv[]) {
double start1,start2,end1,end2;
int r;
init1();
start1 = omp_get_wtime();
for (r=0; r<reps; r++){
runloop(1);
}
end1 = omp_get_wtime();
valid1();
printf("Total time for %d reps of loop 1 = %f\n",reps, (float)(end1-start1));
init2();
start2 = omp_get_wtime();
for (r=0; r<reps; r++){
runloop(2);
}
end2 = omp_get_wtime();
valid2();
printf("Total time for %d reps of loop 2 = %f\n",reps, (float)(end2-start2));
}
void init1(void){
int i,j;
for (i=0; i<N; i++){
for (j=0; j<N; j++){
a[i][j] = 0.0;
b[i][j] = 3.142*(i+j);
}
}
}
void init2(void){
int i,j, expr;
for (i=0; i<N; i++){
expr = i%( 3*(i/30) + 1);
if ( expr == 0) {
jmax[i] = N;
}
else {
jmax[i] = 1;
}
c[i] = 0.0;
}
for (i=0; i<N; i++){
for (j=0; j<N; j++){
b[i][j] = (double) (i*j+1) / (double) (N*N);
}
}
}
void runloop(int loopid)
{
int nthreads = omp_get_max_threads(); // we query this before the parallel region; omp_get_num_threads() would always return 1 here
int ipt = (int) ceil((double)N/(double)nthreads);
float chunks_fraction = 1.0 / nthreads;
int threads_lo_bound[nthreads];
int threads_hi_bound[nthreads];
#pragma omp parallel default(none) shared(threads_lo_bound, threads_hi_bound, nthreads, loopid, ipt, chunks_fraction)
{
int myid = omp_get_thread_num();
int lo = myid * ipt;
int hi = (myid+1)*ipt;
if (hi > N) hi = N;
threads_lo_bound[myid] = lo;
threads_hi_bound[myid] = hi;
int current_lower_bound = 0;
int current_higher_bound = 0;
int affinity_steal = 0;
while(affinity_steal != -1)
{
switch(loopid)
{
case 1: loop1chunk(current_lower_bound, current_higher_bound); break;
case 2: loop2chunk(current_lower_bound, current_higher_bound); break;
}
#pragma omp critical
{
affinity_steal = affinity(threads_lo_bound, threads_hi_bound, nthreads, myid, chunks_fraction, &current_lower_bound, &current_higher_bound);
}
}
}
}
int affinity(int* threads_lo_bound, int* threads_hi_bound, int num_of_thread, int thread_num, float chunks_fraction, int *current_lower_bound, int *current_higher_bound)
{
int current_pos;
if (threads_hi_bound[thread_num] - threads_lo_bound[thread_num] > 0)
{
current_pos = thread_num;
}
else
{
int new_pos = -1;
int jobs_remain = 0;
int i;
for (i = 0; i < num_of_thread; i++)
{
int diff = threads_hi_bound[i] - threads_lo_bound[i];
if (diff > jobs_remain)
{
new_pos = i;
jobs_remain = diff;
}
}
current_pos = new_pos;
}
if (current_pos == -1) return -1;
int remaining_iterations = threads_hi_bound[current_pos] - threads_lo_bound[current_pos];
int iter_size_fractions = (int)ceil(chunks_fraction * remaining_iterations);
*current_lower_bound = threads_lo_bound[current_pos];
*current_higher_bound = threads_lo_bound[current_pos] + iter_size_fractions;
threads_lo_bound[current_pos] = threads_lo_bound[current_pos] + iter_size_fractions;
return current_pos;
}
void loop1chunk(int lo, int hi) {
int i,j;
for (i=lo; i<hi; i++){
for (j=N-1; j>i; j--){
a[i][j] += cos(b[i][j]);
}
}
}
void loop2chunk(int lo, int hi) {
int i,j,k;
double rN2;
rN2 = 1.0 / (double) (N*N);
for (i=lo; i<hi; i++){
for (j=0; j < jmax[i]; j++){
for (k=0; k<j; k++){
c[i] += (k+1) * log (b[i][j]) * rN2;
}
}
}
}
void valid1(void) {
int i,j;
double suma;
suma= 0.0;
for (i=0; i<N; i++){
for (j=0; j<N; j++){
suma += a[i][j];
}
}
printf("Loop 1 check: Sum of a is %lf\n", suma);
}
void valid2(void) {
int i;
double sumc;
sumc= 0.0;
for (i=0; i<N; i++){
sumc += c[i];
}
printf("Loop 2 check: Sum of c is %f\n", sumc);
}
You don't initialise the arrays threads_lo_bound and threads_hi_bound, so they initially contain some completely random values (this is source of randomness number 1).
You then enter the parallel region, where it is imperative to realise that not all threads move through the code in sync; the actual speed of each thread is fairly random, since it shares the CPU with many other programs. Even if those use only 1% of the CPU, it will still show (this is source of randomness number 2, and I'd argue it is the more relevant one to why you see the code working every now and then).
So what happens when the code crashes?
One of the threads (most likely the master) reaches the critical region before at least one of the other threads has reached the line where you set threads_lo_bound[myid] and threads_hi_bound[myid].
After that, depending on what those random values were (you can generally assume they were out of bounds; your arrays are fairly small, so the odds of those values being valid indices are pretty slim), the thread will try to steal some of the jobs (that don't exist) by setting current_lower_bound and/or current_higher_bound to values that are out of range of your arrays a, b, c.
It will then enter the second iteration of your while(affinity_steal != -1) loop and access memory that is out of bounds, inevitably leading to a segmentation fault (eventually; in principle it is undefined behaviour, and the crash can occur at any point after an invalid memory access, or in some cases never, leading you to believe everything is in order when it most definitely is not).
The fix is of course simple: add
#pragma omp barrier
just before the while(affinity_steal != -1) loop to ensure all threads have reached that point (i.e. to synchronise the threads there) and the bounds are properly set before you proceed into the loop. The overhead of this is minimal, but if for some reason you wish to avoid using barriers, you can simply set the values of the arrays before entering the parallel region.
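Concretely, the placement inside the parallel region of runloop() would look like this (only the relevant lines are shown; the rest of the loop body is unchanged):
threads_lo_bound[myid] = lo;
threads_hi_bound[myid] = hi;
int current_lower_bound = 0;
int current_higher_bound = 0;
int affinity_steal = 0;
#pragma omp barrier   /* wait until every thread has published its bounds */
while(affinity_steal != -1)
{
    ...
}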
That said, bugs like this can usually be located using a good debugger, I strongly suggest learning how to use one, they make life much easier.

Is there a way to iterate over execution order?

How can one iterate through the order of execution?
I am developing a piece of software that has several steps to compute over some data, and I was thinking of changing the order of those steps programmatically so I can check which order works best for the data.
Let me give an example. I have, let's say, 3 steps (it's actually more):
stepA(data);
stepB(data);
stepC(data);
And I want a contraption that allows me to walk through every permutation of those steps and then check the results. Something like this:
data = originalData; i=0;
while (someMagic(&data,[stepA,stepB,stepC],i++)){
checkResults(data);
data = originalData;
}
Then someMagic executes A, B, then C when i==0; A, C, then B when i==1; B, A, then C when i==2; and so on.
You can use function pointers, maybe something like the following:
typedef void (*func)(void *data);
int someMagic(void *data, func *func_list, int i) {
switch (i) {
case 0:
func_list[0](data);
func_list[1](data);
func_list[2](data);
break;
case 1:
func_list[0](data);
func_list[2](data);
func_list[1](data);
break;
case 2:
func_list[1](data);
func_list[0](data);
func_list[2](data);
break;
default: return 0;
}
return 1;
}
func steps[3] = {
stepA,
stepB,
stepC
};
while (someMagic(&data, steps, i++)) {
....
}
The key is to find a way to iterate over the set of permutations of the [0, n[ integer interval.
A permutation (in the mathematical meaning) can be seen as a bijection of [0, n[ into itself and can be represented by the image of this permutation, applied to [0, n[.
For example, consider this permutation of [0, 3[:
0 -> 1
1 -> 2
2 -> 0
It can be seen as the tuple (1, 2, 0), which in C translates naturally to the array of integers permutation = (int []){1, 2, 0};.
Suppose you have an array of function pointers steps; for each permutation, you'll then want to call steps[permutation[i]] for each value of i in [0, n[.
The following code implements this algorithm:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
static void stepA(int data) { printf("%d: %s\n", data, __func__); }
static void stepB(int data) { printf("%d: %s\n", data, __func__); }
static void stepC(int data) { printf("%d: %s\n", data, __func__); }
static void (* const steps[])(int) = {stepA, stepB, stepC,};
static int fact(int n) { return n == 0 ? 1 : fact(n - 1) * n; }
static int compare_int(const void *pa, const void *pb)
{
return *(const int *)pa - *(const int *)pb;
}
static void get_next_permutation(int tab[], size_t n)
{
int tmp;
unsigned i;
unsigned j;
unsigned k;
/* to find the next permutation in the lexicographic order
* source: question 4 (in french, sorry ^^) of
* https://liris.cnrs.fr/~aparreau/Teaching/INF233/TP2-permutation.pdf
. */
/* 1. find the biggest index i for which tab[i] < tab[i+1] */
for (k = 0; k < n - 1; k++)
if (tab[k] < tab[k + 1])
i = k;
/* 2. Find the index j of the smallest element, bigger than tab[i],
* located after i */
j = i + 1;
for (k = i + 1; k < n; k++)
if (tab[k] > tab[i] && tab[k] < tab[j])
j = k;
/* 3. Swap the elements of index i and j */
tmp = tab[i];
tab[i] = tab[j];
tab[j] = tmp;
/* 4. Sort the array in ascending order, after index i */
qsort(tab + i + 1, n - (i + 1), sizeof(*tab), compare_int);
}
int main(void)
{
int n = sizeof(steps) / sizeof(*steps);
int j;
int i;
int permutation[n];
int f = fact(n);
/* first permutation is identity */
for (i = 0; i < n; i++)
permutation[i] = i;
for (j = 0; j < f; j++) {
for (i = 0; i < n; i++)
steps[permutation[i]](i);
if (j != f - 1)
get_next_permutation(permutation, n);
}
return EXIT_SUCCESS;
}
The outer loop in main, indexed by j, iterates over all the n! permutations, while the inner one, indexed by i, iterates overs the n steps.
The get_next_permutation function modifies the permutation array in place to obtain the next permutation in lexicographical order.
Note that it doesn't work when the input permutation is the last one (n - 1, ..., 1, 0), hence the if (j != f - 1) test.
One could enhance it to detect this case (i isn't set) and to put the first permutation (0, 1, ..., n - 1) into the permutation array.
The code can be compiled with:
gcc main.c -o main -Wall -Wextra -Werror -O0 -g3
And I strongly suggest using valgrind as a way to detect off-by-one errors.
EDIT: I just realized I didn't answer the OP's question precisely. The someMagic() function would allow direct access to the i-th permutation, while my algorithm only computes the successor in lexicographic order. But if the aim is to iterate over all the permutations, it will work fine. Otherwise, maybe an answer like this one should match the requirement.
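For completeness, here is a minimal, self-contained sketch (not part of the original answers; the function name ith_permutation is made up) of how the i-th permutation could be computed directly, using the factorial number system:
#include <stdio.h>

/* Writes the i-th permutation (in lexicographic order) of {0, ..., n-1} into perm. */
static void ith_permutation(int i, int n, int perm[])
{
    int pool[n];                        /* values not used yet, in ascending order */
    for (int k = 0; k < n; k++)
        pool[k] = k;

    int fact = 1;                       /* fact = (n-1)! */
    for (int k = 2; k < n; k++)
        fact *= k;

    for (int pos = 0; pos < n; pos++) {
        int idx = i / fact;             /* which remaining value to pick */
        i %= fact;
        perm[pos] = pool[idx];
        for (int k = idx; k < n - pos - 1; k++)   /* remove it from the pool */
            pool[k] = pool[k + 1];
        if (n - pos - 1 > 0)
            fact /= (n - pos - 1);
    }
}

int main(void)
{
    int perm[3];
    for (int i = 0; i < 6; i++) {       /* 3! = 6 permutations */
        ith_permutation(i, 3, perm);
        printf("%d: %d %d %d\n", i, perm[0], perm[1], perm[2]);
    }
    return 0;
}
A someMagic(data, steps, i) along the lines of the question could then call ith_permutation(i, n, perm), run steps[perm[0]], steps[perm[1]], ... in that order, and return 0 once i reaches n!.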
I've come to a solution that is simple enough:
void stepA(STRUCT_NAME *data);
void stepB(STRUCT_NAME *data);
void stepC(STRUCT_NAME *data);
typedef void (*check)(STRUCT_NAME *data);
void swap(check *x, check *y) {
check temp;
temp = *x;
*x = *y;
*y = temp;
}
void permute(check *a, int l, int r, STRUCT_NAME *data) {
int i, j;
if (l == r) {
j = 0;
while (j <= r) a[j++](data);
} else {
for (i = l; i <= r; i++) {
swap((a + l), (a + i));
permute(a, l + 1, r, data);
swap((a + l), (a + i));
}
}
}
check checks[3] = {
stepA,
stepB,
stepC,
};
int main(void){
...
permute(checks, 0, 2, data);
}

How to protect one element in an array in OpenMP?

For example:
int a[100];
int i=0;
for(i=0;i<100;i++)
{
a[i]=i;
}
#pragma omp parallel for firstprivate(a[0])
for(i=0;i<100;i++)
{
a[i]=a[i]+a[((i+1)%100)];
}
However, compilation fails:
error: syntax error in omp clause
#pragma omp parallel for firstprivate(a[0])
^
Sorry for my previous description. I want the result to be a[i] = a[i] + a[(i+1) % 100]; for example, a[10] = a[10] + a[11] and a[99] = a[99] + a[0]. When i becomes 99, a[99] should be computed as a[99] = a[99] + a[0], but by the time that thread executes a[99] = a[99] + a[0], the value of a[0] may already have been changed to a[0] = a[0] + a[1] by another thread running in parallel (there is a dependency). What should I do to guarantee that each thread uses the original value of a[(i+1) % 100] when computing a[i] = a[i] + a[(i+1) % 100]?
Your code is not guaranteed to produce the correct result because it has a race condition. One way to fix this is to do it out-of-place, like this:
void foo2(int * __restrict a, int * __restrict b) {
int i;
#pragma omp parallel for schedule(static)
for (i = 0; i<N; i++) {
b[i] = a[i] + a[((i + 1) % N)];
}
}
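A minimal driver (not part of the original answer) showing how foo2 could be used on the question's 100-element array; it assumes foo2 is compiled with #define N 100, matching the question:
#include <stdio.h>

#define N 100
void foo2(int * __restrict a, int * __restrict b);   /* foo2 as defined above */

int main(void)
{
    int a[N], b[N];
    for (int i = 0; i < N; i++)
        a[i] = i;                       /* same initialisation as in the question */
    foo2(a, b);                         /* b[i] = a[i] + a[(i + 1) % N] */
    printf("b[10] = %d, b[99] = %d\n", b[10], b[99]);   /* expect 21 and 99 */
    return 0;
}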
If you really want to do it in-place, then it's a bit more complicated:
void foo3(int a[]) {
int n = N - 1;
#pragma omp parallel
{
int i;
int ithread = omp_get_thread_num();
int nthread = omp_get_num_threads();
int start = ithread*n / nthread;
int finish = (ithread + 1)*n/ nthread;
int tmp = a[finish];
for (i = start; i < finish-1; i++) {
a[i] += a[i + 1];
}
a[finish-1] += tmp;
}
a[n] += a[0];
}

How to make this for loop parallel in OpenMP

I know the basics of OpenMP, and I know that in order to parallelize a for loop, its iterations must not depend on previous iterations. One can also use reductions, but they support only basic operators such as +, -, /, *, &&, and ||.
How can I make this for loop parallel?
for (i = 1; i < n; ++i) {
for (j = 1; j < n; ++j) {
// stanga
if (res[i][j - 1] != res[i][j]) {
cmin2[i][j][0] = min(cmin2_res[i][j - 1][0] + 1, cmin[i][j][0]);
cmin2_res[i][j][0] = min(cmin2[i][j - 1][0] + 1, cmin_res[i][j][0]);
} else {
cmin2[i][j][0] = min(cmin2[i][j - 1][0] + 1, cmin[i][j][0]);
cmin2_res[i][j][0] = min(cmin2_res[i][j - 1][0] + 1, cmin_res[i][j][0]);
}
// sus
if (res[i - 1][j] != res[i][j]) {
cmin2[i][j][0] = min3(cmin2[i][j][0], cmin2_res[i - 1][j][0] + 1, cmin[i][j][1]);
cmin2_res[i][j][0] = min3(cmin2_res[i][j][0], cmin2[i - 1][j][0] + 1, cmin_res[i][j][1]);
} else {
cmin2[i][j][0] = min3(cmin2[i][j][0], cmin2[i - 1][j][0] + 1, cmin[i][j][1]);
cmin2_res[i][j][0] = min3(cmin2_res[i][j][0], cmin2_res[i - 1][j][0] + 1, cmin_res[i][j][1]);
}
}
}
My question is rather how I can decompose this for loop so that it can be run in parallel (and maybe use reductions, if possible).
The problem is that at each iteration the operations must be done in this order, because I have 3 more groups of for loops like this.
P.S. min and min3 are macros.
There's a brute force way to do what you want, but a better parallelization will require a little more input about what you want in and out of the routines.
The data dependencies in your loop look like this, in i-j space:
i →
..........
j .....1....
↓ ....12....
...123....
where the value at point 3 depends on the values at the point 2s, and those depend on the values at point 1, etc. Because of this diagonal structure, you can re-order the loops to traverse the grid diagonally, e.g. the first iteration is over (0,1), (1,0), then over (0,2), (1,1), (2,0), and so on. A simplified version of your problem looks like the code below:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
int **int2darray(int n, int m);
void free2darray(int **array);
void init2darray(int **array, int n, int m);
void tick(struct timeval *timer);
double tock(struct timeval *timer);
int main(int argc, char **argv) {
const int N=10000;
int **serialarr, **omparr;
struct timeval serialtimer, omptimer;
double serialtime, omptime;
serialarr = int2darray(N,N);
omparr = int2darray(N,N);
init2darray(serialarr, N, N);
init2darray(omparr, N, N);
/* serial calculation */
tick(&serialtimer);
for (int i=1; i<N; i++)
for (int j=1; j<N; j++)
serialarr[i][j] = serialarr[i-1][j] + serialarr[i][j-1];
serialtime = tock(&serialtimer);
/* omp */
tick(&omptimer);
#pragma omp parallel shared(omparr) default(none)
{
for (int ipj=1; ipj<=N; ipj++) {
#pragma omp for
for (int j=1; j<ipj; j++) {
int i = ipj - j;
omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
}
}
for (int ipj=N+1; ipj<2*N-1; ipj++) {
#pragma omp for
for (int j=ipj-N+1; j<N; j++) {
int i = ipj - j;
omparr[i][j] = omparr[i-1][j] + omparr[i][j-1];
}
}
}
omptime = tock(&omptimer);
/* compare results */
int abserr = 0;
for (int i=0; i<N; i++)
for (int j=0; j<N; j++)
abserr += abs(omparr[i][j] - serialarr[i][j]);
printf("Difference between serial and OMP array: %d\n", abserr);
printf("Serial time = %lf\n", serialtime);
printf("OMP time = %lf\n", omptime);
free2darray(omparr);
free2darray(serialarr);
return 0;
}
int **int2darray(int n, int m) {
int *data = malloc(n*m*sizeof(int));
int **array = malloc(n*sizeof(int*));
for (int i=0; i<n; i++)
array[i] = &(data[i*m]);
return array;
}
void free2darray(int **array) {
free(array[0]);
free(array);
}
void init2darray(int **array, int n, int m) {
for (int i=0; i<n; i++)
for (int j=0; j<m; j++)
array[i][j] = i*m+j;
}
void tick(struct timeval *timer) {
gettimeofday(timer, NULL);
}
double tock(struct timeval *timer) {
struct timeval now;
gettimeofday(&now, NULL);
return (now.tv_usec-timer->tv_usec)/1.0e6 + (now.tv_sec - timer->tv_sec);
}
Running gives:
$ gcc -fopenmp -Wall -O2 loops.c -o loops -std=c99
$ export OMP_NUM_THREADS=8
$ ./loops
Difference between serial and OMP array: 0
Serial time = 0.246649
OMP time = 0.174936
You'll notice the speedup is pretty poor, even with large N, because the amount of computation per iteration is small, only the inner loop is parallelized, and we're going through memory in a weird, cache-unfriendly order.
Some of the above could probably be fixed, but it would help to know more about what you're trying to do; e.g., do you care about the cmin2_res arrays, or are they just intermediate products? In words, what are you trying to calculate?
