Making 2 loops run in parallel - c

I have a task on one of my work sheets asking me to add OpenMP directives to make both of these loops run in parallel.
{
for (i = ; i < N; i += )
{
D[i] = x * A[i] + x * B[i];
}
for (i = 0; i < N; i++)
{
C[i] = c * D[i];
}
}
I made a C file to add the Openmp directives
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define THREADS 4
#define N 10
/*
 * Runs the two worksheet loops, each as its own OpenMP parallel-for:
 *   part 1: D[i] = z * A[i] + z * B[i]
 *   part 2: C[i] = x * D[i]
 * and prints which thread handled which iteration.
 */
int main (int argc, char *argv[])
{
    int i;
    double A[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, B[N] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, C[N], D[N];
    const double x = 5;
    const double z = 5;

    /* NOTE: the step is 10 and N is 10, so this loop body runs for i == 0 only;
       D[1] .. D[N-1] are never written by part 1. */
    #pragma omp parallel for schedule(static) num_threads(THREADS)
    for (i = 0; i < N; i += 10)
    {
        D[i] = z * A[i] + z * B[i];
        /* D[i] is a double, so it must be printed with %f, not %d. */
        printf("part 1 Thread %d is doing iteration %d: %f \n", omp_get_thread_num(), i, D[i]);
    }

    /* The implicit barrier at the end of the first parallel-for guarantees
       part 1 has finished before any thread starts part 2. */
    #pragma omp parallel for schedule(static) num_threads(THREADS)
    for (i = 0; i < N; i++)
    {
        C[i] = x * D[i];
        printf("part 2 Thread %d is doing iteration %d: %f \n", omp_get_thread_num(), i, C[i]);
    }
    return 0;
}
Part 1 does only one iteration and then part 2 does all of its iterations; I'm not sure where I'm going wrong.

Part 1 only does one iteration because there is only one iteration to do:
#pragma omp parallel for schedule(static) num_threads(THREADS)
for (i = 0; i < N; i += 10)
where N expands to 10 in line 7:
#define N 10
A second iteration never happens because after the first one i becomes 10, which is already outside the loop's range (i < N).

Related

The problem with parallelizing the program using OpenMP

I have a program that works with arrays and outputs a single number. To parallelize the program, I use OpenMP, but the problem is that after writing the directives, I started getting answers that are not similar to the answers of the program without parallelization. Can anyone tell me where I made a mistake?
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <math.h>
#include <float.h>
#define LOOP_COUNT 100
#define MIN_1_ARRAY_VALUE 1
#define MAX_1_ARRAY_VALUE 10
#define MIN_2_ARRAY_VALUE 10
#define MAX_2_ARRAY_VALUE 100
#define PI 3.1415926535897932384626433
#define thread_count 4
/*
 * Builds a pseudo-random double from two successive rand_r() draws on *seed:
 * an integer part in 0..9 plus the reciprocal of a second draw reduced modulo
 * (max - min + 1).
 *
 * seed      state passed to rand_r(); advanced twice per call.
 * min, max  only their difference is used, as the modulus of the second draw.
 *
 * Returns the sum described above. When the second draw reduces to 0 the
 * fractional part is taken as 0.0 instead of dividing by zero.
 */
double generate_random(unsigned int* seed, int min, int max) {
    /* Take the draws in separate statements: both calls mutate the same seed,
       and the operands of a single '+' expression may be evaluated in either order. */
    const int whole = rand_r(seed) % 10;
    const int divisor = rand_r(seed) % (max - min + 1);
    const double fraction = (divisor != 0) ? 1.0 / divisor : 0.0;
    return (double)whole + fraction;
}
/* Returns (value / PI) raised to the third power. */
double map_1(double value) {
    const double scaled = value / PI;
    return pow(scaled, 3);
}
double map_2(double value) {
return fabs(tan(value));
}
/*
 * Sorts mass[0 .. n-1] into ascending order in place by walking through the
 * array and swapping adjacent elements that are not in strictly increasing
 * order, stepping back after every swap.
 *
 * n     number of elements in mass
 * mass  array to sort
 */
void dwarf_sort(int n, double mass[]) {
    int pos = 1;     /* index of the element currently being placed */
    int resume = 2;  /* where the walk continues once pos is settled */

    while (pos < n) {
        if (!(mass[pos - 1] < mass[pos])) {
            /* Out of order (or equal): swap the pair and step back. */
            double held = mass[pos - 1];
            mass[pos - 1] = mass[pos];
            mass[pos] = held;
            pos--;
            if (pos != 0) {
                continue;
            }
        }
        /* Pair is in order, or the front of the array was reached:
           jump forward to the furthest position visited so far. */
        pos = resume;
        resume++;
    }
}
/*
 * Benchmark driver: repeats the same array pipeline LOOP_COUNT times and
 * prints the elapsed wall-clock time in milliseconds.
 * argv[1] supplies the array size; it is handed to atoi() without checking
 * argc first.
 */
int main(int argc, char* argv[]) {
printf("lab work 3 in processing...!\n");
int trial_counter;
int array_size = atoi(argv[1]);
struct timeval before, after;
long time_diff;
gettimeofday(&before, NULL);
for (trial_counter = 0; trial_counter < LOOP_COUNT; trial_counter++) {
// Per-trial working arrays; arr2 and arr2_copy hold array_size / 2 elements.
double arr1[array_size];
double arr2[array_size / 2];
double arr2_copy[array_size / 2];
double arr2_min = DBL_MAX;
// Seed for rand_r(): one variable per trial, reached through a single pointer.
unsigned int tempValue = trial_counter;
unsigned int *currentSeed = &tempValue;
//stage 1 - init
// Outer parallel region: everything up to its closing brace is executed by
// every thread of the team.
#pragma omp parallel num_threads(thread_count)
{
// Each "parallel for" below opens a further parallel region inside the outer one.
// currentSeed is listed as shared, so all threads call rand_r() on the same seed variable.
#pragma omp parallel for default(none) shared(arr1, currentSeed, array_size) schedule(guided, thread_count)
for (int i = 0; i < array_size; i++) {
arr1[i] = generate_random(currentSeed, MIN_1_ARRAY_VALUE, MAX_1_ARRAY_VALUE);
// printf("arr[%d] = %f\n", i, arr1[i]);
}
// Fills arr2 and an identical copy. arr2_min is shared and updated without
// any synchronization; it is not read again in this function.
#pragma omp parallel for default(none) shared(arr2, arr2_copy, array_size, currentSeed, arr2_min) schedule(guided, thread_count)
for (int i = 0; i < array_size / 2; i++) {
double value = generate_random(currentSeed, MIN_2_ARRAY_VALUE, MAX_2_ARRAY_VALUE);
arr2[i] = value;
arr2_copy[i] = value;
if (value < arr2_min) {
arr2_min = value;
}
}
// Map stage for arr1: every element is replaced by map_1 of itself.
#pragma omp parallel for default(none) shared(arr1, array_size) schedule(guided, thread_count)
for (int i = 0; i < array_size; i++) {
arr1[i] = map_1(arr1[i]);
}
// Map stage for arr2: starts at index 1, so arr2[0] keeps its generated value.
// Inputs come from arr2_copy, so the neighbour read is never a value written by this loop.
#pragma omp parallel for default(none) shared(arr2, arr2_copy, array_size) schedule(guided, thread_count)
for (int i = 1; i < array_size / 2; i++) {
// The critical section makes the whole loop body run one thread at a time.
#pragma omp critical
arr2[i] = map_2(arr2_copy[i] + arr2_copy[i - 1]);
}
// Merge stage: arr2[i] becomes arr1[i] raised to the power arr2[i]; only the
// first array_size / 2 elements of arr1 are used.
#pragma omp parallel for default(none) shared(arr2, arr1, array_size) schedule(guided, thread_count)
for (int i = 0; i < array_size / 2; i++) {
arr2[i] = pow(arr1[i], arr2[i]);
}
// Sort stage: a sections construct with a single section, so one thread of
// that team performs the whole sort.
#pragma omp parallel sections
{
#pragma omp section
{
dwarf_sort((int) array_size / 2, arr2);
}
}
// Reduce stage. final_sum is declared inside the outer parallel region, so
// each thread of that region has its own copy and runs this loop itself.
double final_sum = 0;
for (int i = 0; i < array_size / 2; i++) {
// Integer division: the test is true only when the truncated value is -1, 0 or 1.
if (((int) arr2[i]) / 2 == 0) {
final_sum += sin(arr2[i]);
}
}
// final_sum is only consumed by the disabled printf below.
// printf("Iteration %d, value: %f\n", trial_counter, final_sum);
}
}
gettimeofday(&after, NULL);
// Elapsed time in milliseconds: whole seconds scaled up plus microseconds scaled down.
time_diff = 1000 * (after.tv_sec - before.tv_sec) + (after.tv_usec - before.tv_usec) / 1000;
printf("\nN=%d. Milliseconds passed: %ld\n", array_size, time_diff);
return 0;
}
rand_r is thread-safe only if each thread has its own seed or if threads are guaranteed to operate on the seed in an exclusive way (e.g. using an expensive critical section). This is not the case in your code. Indeed, currentSeed is shared between threads. Thus, it causes a race condition, since multiple threads can mutate it simultaneously. You need to use a thread-private seed (with a different value per thread, so that the threads do not all generate the same sequence). The seed of each thread can be initialized from a shared array filled by the main thread (e.g. naively [0, 1, 2, 3, etc.]).
The thing is, you will still get different results when using a different number of threads with this approach. One solution is to split your data set into independent chunks, each with an associated seed, and then possibly compute the chunks in parallel.
Note that using a #pragma omp parallel for in a #pragma omp parallel section causes many threads to be created (ie. over-subscription). This is generally very inefficient. You should use #pragma omp for instead.

Printing randomly from 1 to 10 in C

I have written code for randomly printing the numbers from 1 to 10 without any repetition, but it isn't working properly: sometimes I get a number that has already been printed.
In short, I'm trying to print numbers from 1-10 randomly with no repetition.
Here is my code :
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<conio.h>
/*
 * Repeatedly draws a number in 1..10 and prints a value whenever the check
 * below reports the draw as new, waiting for a key press after every draw.
 * The loop has no exit condition.
 */
main() {
int no = 0, repeat[100] = { 0 }, i = 0, x = 0, j = 0;
srand(time(NULL));
// NOTE(review): `true` is not declared by any header this listing includes; in C it comes from <stdbool.h>.
while (true) {
no = (rand() % 10) + 1;
// x is overwritten on every pass, so once the loop ends it only reflects the
// comparison against repeat[99], not against all numbers stored so far.
for (i = 0; i < 100; i++) {
if (no != repeat[i]) {
x = 1;
} else if (no == repeat[i]) {
x = 0;
}
}
if (x == 1) {
repeat[j] = no;
// i is 100 here because the scan loop has finished, so repeat[i] indexes one
// past the end of repeat; the value just stored lives in repeat[j].
printf("\n%d", repeat[i]);
// j is never checked against the array size of 100.
j = j + 1;
}
getch();
}
}
Don't use rand() to generate the numbers directly, instead fill a sequential array, and then use rand() to shuffle the array, e.g.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/** Randomly permute the first 'n' integers of 'a' in place.
 *  Fisher-Yates: walking from the last slot down to slot 1, each slot is
 *  swapped with a slot chosen by rand() from those at or below it.
 *  Arrays of fewer than two elements are left untouched.
 */
void shuffle (int *a, int n)
{
    for (int last = n - 1; last > 0; last--) {
        int pick = rand() % (last + 1);
        int held = a[last];
        a[last] = a[pick];
        a[pick] = held;
    }
}
/* Shuffles the numbers 1..10 and prints them on one line, each preceded by a space. */
int main (void) {
    int deck[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    const int count = 10;

    /* Seed from the clock so each run gives a different order. */
    srand (time(NULL));
    shuffle (deck, count);

    int pos = 0;
    while (pos < count) {
        printf (" %d", deck[pos]);
        pos++;
    }
    putchar ('\n');
}
By shuffling the array and swapping random elements within it, you eliminate all possibility of a duplicate number.
Example Use/Output
$ ./bin/shuffle_arr
3 10 7 8 4 5 6 9 1 2
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N1 1
#define N2 10
/*
 * Prints the integers N1..N2 in random order, one per line, each exactly once.
 * The numbers are written into an array in sequence and then shuffled in
 * place, so no value can repeat.
 */
int main(void) {
    int len = N2 - N1 + 1, i, r, temp;
    int num[N2 - N1 + 1];

    /* Fill the array with N1, N1 + 1, ..., N2. */
    for (temp = 0, i = N1; temp < len; i++, temp++)
        num[temp] = i;

    srand((unsigned int)time(NULL));

    /* Fisher-Yates shuffle: slot i is swapped with a slot chosen from 0..i.
       The modulus must be i + 1; with "% i" slot i could never keep its own
       value and only some of the possible orderings could be produced. */
    for (i = len - 1; i > 0; i--) {
        r = rand() % (i + 1);
        temp = num[i];
        num[i] = num[r];
        num[r] = temp;
    }

    /* The shuffled numbers are now stored in the array; print them. */
    for (i = 0; i < len; i++)
        printf("%d\n", num[i]);
    return 0;
}

OpenMP: Prefix Sum Algorithm

I'm trying to implement a Prefix Sum Algorithm in C using OpenMP, and I'm stuck.
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
/*
 * Prints the input array X, computes Y from it in three stages (see the
 * comments below) and prints Y.
 */
int main(int argc, char* argv[])
{
int p = 5;
int X[5] = { 1, 5, 4, 2, 3 };
// Result buffer; the malloc result is not checked and the buffer is never freed.
int* Y = (int*)malloc(p * sizeof(int));
for (int i = 0; i < p; i++)
printf("%d ", X[i]);
printf("\n");
// Stage I: the first output equals the first input.
Y[0] = X[0];
int i;
// Stage II: each Y[i] is the sum of two neighbouring inputs. Reads come only
// from X and every iteration writes a different Y[i].
#pragma omp parallel for num_threads(4)
for (i = 1; i < p; i++)
Y[i] = X[i - 1] + X[i];
// Stage III: k doubles on every pass (2, 4, ...).
int k = 2;
while (k < p)
{
// This i shadows the one declared above.
int i;
// Y is updated in place: when i - k >= k, Y[i - k] is itself written by this
// same loop, so the value read here may or may not already be updated.
#pragma omp parallel for
for (i = k; i < p; i++)
Y[i] = Y[i - k] + Y[i];
k += k;
}
for (int i = 0; i < p; i++)
printf("%d ", Y[i]);
printf("\n");
// NOTE(review): presumably keeps a Windows console window open; "pause" is a shell command, not portable.
system("pause");
return 0;
}
What this code should do?
Input numbers are in X,
output numbers are (prefixes) in Y
and the number count is p.
X = 1, 5, 4, 2, 3
Stage I.
Y[0] = X[0];
Y[0] = 1
Stage II.
int i;
#pragma omp parallel for num_threads(4)
for (i = 1; i < p; i++)
Y[i] = X[i - 1] + X[i];
Example:
Y[1] = X[0] + X[1] = 6
Y[2] = X[1] + X[2] = 9
Y[3] = X[2] + X[3] = 6
Y[4] = X[3] + X[4] = 5
Stage III. (where I am stuck)
int k = 2;
while (k < p)
{
int i;
#pragma omp parallel for
for (i = k; i < p; i++)
Y[i] = Y[i - k] + Y[i];
k += k;
}
Example:
k = 2
Y[2] = Y[0] + Y[2] = 1 + 9 = 10
Y[3] = Y[1] + Y[3] = 6 + 6 = 12
Y[4] = Y[2] + Y[4] = 10 + 5 = 15
Above the 10 + 5 = 15 should be 9 + 5 = 14, but the Y[2] was overwritten by another thread. I want to use that Y[2] what was before the for-loop started.
Example:
k = 4
Y[4] = Y[0] + Y[4] = 1 + 15 = 16
Result: 1, 6, 10, 12, 16. Expected good result: 1, 6, 10, 12, 15.
Above the 10 + 5 = 15 should be 9 + 5 = 14, but the Y[2] was overwritten by another thread. I want to use that Y[2] what was before the for-loop started.
With OpenMP, you always have to consider whether your code is correct for the serial case, with a single thread, because
It might in fact run that way, and
If it's incorrect serially, then it's virtually certain to be incorrect as a parallel program, too.
Your code is not correct serially. It appears you could fix that by running the problem loop backward, from i = p - 1 to k, but in fact that's not sufficient for parallel operation.
Your best bet appears to be to accumulate your partial results into a different array than holds the results of the previous cycle. For example, you might flip between X and Y as data source and result, with a little pointer wrangling to grease the iterative wheels. Or you might do it a little more easily by using a 2D array instead of separate X and Y.
UPDATE for Stage III.
int num_threads = 8;
int k = 2;
while (k < p)
{
#pragma omp parallel for ordered num_threads(k < num_threads ? 1 : num_threads)
for (i = p - 1; i >= k; i--)
{
Y[i] = Y[i - k] + Y[i];
}
k += k;
}
The code above solved my problem. It now works in parallel, except for the first few rounds.

array operation using CUDA kernel

I'm writing CUDA kernel and threads are performing following tasks :
For example, if I have the array [1, 2, 3, 4], then I want the answer [12, 13, 14, 23, 24, 34].
Suppose I've an array with n integers and i've two indexes i and j.
simple solution for that in C language will be :
k=0;
for (i = 0; i < n - 1; i++)
for(j = i+1; j < n-1 ; j++)
{ new_array[k] = array[i]*10 + array[j];
k++;
}
In CUDA I've tried my luck :
for(i = threadIdx.x + 1; i < n-1; i++ )
new_array[i] = array[threadIdx.x] * 10 + array[i];
But I think this is not totally correct or optimal way to do this. can anyone suggest anything better?
I'm assuming that the code you want to port to CUDA is the following:
#include <stdio.h>
#define N 7
/*
 * Builds, for every index pair i < j, the value array[i]*10 + array[j],
 * stores the values consecutively in new_array and prints them.
 */
int main(){
    int array[N] = { 1, 2, 3, 4, 5, 6, 7};
    int new_array[(N-1)*N/2] = { 0 };

    /* k is the next free slot in new_array. */
    int k = 0;
    for (int i = 0; i < N; i++) {
        for (int j = i + 1; j < N; j++) {
            new_array[k++] = array[i]*10 + array[j];
        }
    }

    int i = 0;
    while (i < (N-1)*N/2) {
        printf("new_array[%d] = %d\n", i, new_array[i]);
        i++;
    }
    return 0;
}
You may wish to note that you can recast the interior loop as
for (int i = 0; i < N; i++)
for(int j = i+1; j < N; j++)
new_array[i*N+(j-(i+1))-(i)*(i+1)/2] = array[i]*10 + array[j];
which will avoid the explicit definition of an index variable k by directly using index i*N+(j-(i+1))-(i)*(i+1)/2. Such an observation is useful because, if you interpret the indices i and j as thread indices in the ported code, then you will have a mapping between the 2d thread indices and the index needed to access the target array in the __global__ function you have to define.
Accordingly, the ported code is
#include <stdio.h>
#define N 7
// Each thread takes i from threadIdx.x and j from threadIdx.y. Threads with
// j > i store array_d[i]*10 + array_d[j] at the computed slot of new_array_d;
// all other threads do nothing.
// Only threadIdx is used, so the block index plays no part in the mapping.
__global__ void kernel(int* new_array_d, int* array_d) {
int i = threadIdx.x;
int j = threadIdx.y;
// Output slot computed directly from (i, j), so no shared counter is needed.
if (j > i) new_array_d[i*N+(j-(i+1))-(i)*(i+1)/2] = array_d[i]*10 + array_d[j];
}
/*
 * Host driver: copies array to the device, launches kernel with a single
 * N x N thread block, copies the results back and prints them.
 * Return codes of the CUDA calls are not checked here.
 */
int main(){
    int array[N] = { 1, 2, 3, 4, 5, 6, 7};
    int new_array[(N-1)*N/2] = { 0 };

    /* Device buffers for the input values and the pair results. */
    int* array_d; cudaMalloc((void**)&array_d,N*sizeof(int));
    int* new_array_d; cudaMalloc((void**)&new_array_d,(N-1)*N/2*sizeof(int));
    cudaMemcpy(array_d,array,N*sizeof(int),cudaMemcpyHostToDevice);

    /* One block of N x N threads: one thread per (i, j) index pair. */
    dim3 grid(1,1);
    dim3 block(N,N);
    kernel<<<grid,block>>>(new_array_d,array_d);

    cudaMemcpy(new_array,new_array_d,(N-1)*N/2*sizeof(int),cudaMemcpyDeviceToHost);
    for (int i = 0; i < (N-1)*N/2; i++) printf("new_array[%d] = %d\n", i, new_array[i]);

    /* Release the device allocations made above. */
    cudaFree(new_array_d);
    cudaFree(array_d);
    return 0;
}
Please, add your own CUDA error check in the sense of What is the canonical way to check for errors using the CUDA runtime API?. Also, you may wish to extend the above CUDA code to the case of block grids of non-unitary sizes.

Transferring sorting loop results to another array

Here's a loop to sort an array from min to max, I need the result of this loop to be put into another array so I can filter and remove the numbers that occur only once and find the last member of what's left.
Here's the code I have so far:
#include<stdio.h>
#include<conio.h>
#define buffas 1024
/*
 * Reads a count and that many integers from standard input, sorts them in
 * ascending order in place and prints the sorted sequence.
 * Returns 0 on success, 1 when the input is not a number or the count does
 * not fit the buffer.
 */
int main(void) {
    int arr[buffas], i, j, no, temp;

    printf("\nEnter the no of Elements: ");
    /* Reject counts that would write past the end of arr. */
    if (scanf("%d", &no) != 1 || no < 0 || no > buffas) {
        printf("\nInvalid number of elements (expected 0..%d)\n", buffas);
        return 1;
    }
    for (i = 0; i < no; i++) {
        printf("\n Enter Element %d: ", i+1);
        if (scanf("%d", &arr[i]) != 1) {
            printf("\nInvalid element\n");
            return 1;
        }
    }

    /* After pass i, arr[i] holds the smallest of arr[i..no-1]. */
    for (i = 0; i < no; i++) {
        for (j = i + 1; j < no; j++) {
            if (arr[i] > arr[j]) {
                temp = arr[i];
                arr[i] = arr[j];
                arr[j] = temp;
            }
        }
    }

    printf("\nSorted array:");
    for (i = 0; i < no; i++) {
        printf("\t%d",arr[i]);
    }
    getch();
    return 0;
}
How do I change the
printf("\t%d",arr[i]);
To fill another array and then filter that to remove single entries and leave only those that repeat at least once.
eg. the first aray is
2 2 1 6 9 9
and after the second sorting the result should be
2 2 9 9
#include <stdio.h>
#define buffas 16
/*
 * Copies every run of two or more equal neighbours from the sorted array
 * arr into copy, then prints the copied values.
 */
int main(void)
{
    /* Fixed, already sorted data standing in for the input and sorting code */
    int arr[] = { 1, 2, 2, 6, 9, 9, 10, 10, 10, 11, 12, 13, 14 };
    int no = sizeof(arr) / sizeof(arr[0]);

    /* Destination for the duplicated elements; n counts how many are stored */
    int copy[buffas];
    int n = 0;

    for (int i = 0; i < no; i++)
    {
        /* j ends up one past the last element equal to arr[i] */
        int j = i + 1;
        while (j < no && arr[j] == arr[i])
            j++;

        /* A run of length one is a single entry: nothing to copy */
        if (j - i <= 1)
            continue;

        for (int k = i; k < j; k++)
            copy[n++] = arr[k];
        /* Skip the rest of the run; the loop increment moves on to j */
        i = j - 1;
    }

    /* Show what was kept */
    for (int i = 0; i < n; i++)
        printf("c[%d] = %d\n", i, copy[i]);
    return 0;
}
The code has been run with various lengths of sorted array and different data in the array; it seems to be correct. The code above produces the output:
c[0] = 2
c[1] = 2
c[2] = 9
c[3] = 9
c[4] = 10
c[5] = 10
c[6] = 10
Note that the code uses the C99 feature of declaring variables in a for loop control statement; if you're on Windows and without C99 support, you'll need to declare i and k outside the loops. If you're using GCC, you need to add -std=c99 or a similar option.

Resources