Unusual behaviour in an OpenMP program - c

I have a program and i tried to use Open MP.
The output is still correct (i tested it after multiple runs), but the times i get are quite odd.
So the time for the single threaded version is 0.1 seconds.
With 2 treads i get 0.05, but with 4 i obtain 0.15 seconds.
How is this possible?
I am just using simple parallel for's.
#pragma omp parallel for private(i, j)
for(i = 1; i <= total_height; i++){
for(j = 1; j <= total_width; j++){
int current_neighbours = neighbours[i][j];
// if(i == 2 && j == 1)
// printf("%d%d\n", current_neighbours, neighbours[2][1]);
if(current_neighbours == 0 || current_neighbours == 1 || current_neighbours > 3){
if(map[i][j] == 1){
update_maps(i, j, 0);
}
}
else if(current_neighbours == 3){
if(map[i][j] == 0){
update_maps(i, j, 1);
}
}
}
}
The update_maps functions looks like this
void update_maps(int i, int j, int value){
map[i][j] = value;
int k, neighbouri, neighbourj;
int num_of_thread = omp_get_thread_num();
if(value == 0)
value = -1;
for(k = 0; k < 8 ; k++){
neighbouri = i + di[k];
neighbourj = j + dj[k];
if(in_map(neighbouri, neighbourj)){
neighbouri--;
neighbourj--;
modify[neighbouri * total_height + neighbourj + (total_height * total_width * num_of_thread)] += value;
}
}
}

Related

Parallel c code using pthreads.h uses less than 100%CPU and is slower when using more threads

So I have a project where I needed to implement the game of life and then parallelise it in c. However, when I try using pthreads.h to parallelise it the program runs slower when introducing more threads and the %CPU is lower than 100% (when using top in the ubuntu terminal, I have an Ubuntu Windows subsystem). Here's my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <pthread.h>
/* Parallel code with PTHREADS v1 */
//Global variables
int N; // Size of the world
int nthreads; // Number of threads
pthread_mutex_t lock;
typedef struct info_thread
{
int threadID; // thread ID
int low; // lower limit of interval
int high; // higher limit of interval
int **world; // pointer to the world matrix
int **neighbors; // pointer to the neighbors matrix
//int **neighbors_2; // pointer to the neighbors_2 matrix
//int **one_step; // pointer to the one_step matrix
}t_info;
void * thread_func(void *arg);
void print_world(int **world);
void count_neighbors(int **world, int **neighbors);
void next_step(int **world, int **one_step, int **neighbors);
void update(int **world, int **one_step);
int compare(int **world, int **one_step, int **two_steps, int **old, int status);
int main(int argc, const char *argv[])
{
if (argc != 5)
{
printf("Give the following input arguments:\n");
printf("N: Size of the NxN world (integer)\n");
printf("Initial state: random (0), chessboard (1)\n");
printf("Output: Number of steps until final state (0) \n");
printf(" Number of steps until final state, initial and final states (1) \n");
printf(" Number of steps until final state and all states states (2) \n");
printf("Threads: Number of threads (integer)\n");
exit(0);
}
N = atoi(argv[1]);
const int pattern = atoi(argv[2]);
const int output = atoi(argv[3]);
nthreads = atoi(argv[4]);
// Create necessary matrices
const int n = N+1;
int **buffer = (int **)malloc(6 * n * sizeof(int *));
for(int i = 0; i < (6*n); i++)
{
buffer[i] = (int *)malloc(n*sizeof(int));
}
int **world = &buffer[0];
int **neighbors = &buffer[n];
int **neighbors_2 = &buffer[2*n];
int **one_step = &buffer[3*n];
int **two_steps = &buffer[4*n];
int **old = &buffer[5*n];
// Setting a random initial pattern
if(pattern == 0){
srand(time(0));
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
int r = rand() % 10;
if (r > 5)
world[i][j] = 1;
else
world[i][j] = 0;
}
}
}
// Setting a chessboard initial state
else if(pattern == 1){
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if(i%2 == 0){
if(j%2 == 0)
world[i][j] = 0;
else
world[i][j] = 1;
}
else{
if(j%2 == 0)
world[i][j] = 1;
else
world[i][j] = 0;
}
}
}
}
if(output==1 || output==2){
printf("Initial state:\n");
print_world(world);
}
int status = 1;
int t = 1;
update(old, world);
// Create threads and input info
pthread_t thread[nthreads];
t_info threadinfo[nthreads];
const int interval = N/nthreads;
while(status == 1)
{
for (int k=0; k<nthreads; k++)
{
threadinfo[k].threadID = k;
threadinfo[k].low = k*interval;
threadinfo[k].high = (k+1)*interval-1;
threadinfo[k].world = world;
threadinfo[k].neighbors = neighbors;
}
threadinfo[nthreads-1].high = N;
// Predict one step forward
pthread_mutex_init(&lock, NULL);
for (int k=0; k<nthreads; k++)
pthread_create(&thread[k], NULL, thread_func, (void *)&threadinfo[k]);
for (int k=0; k<nthreads; k++)
pthread_join(thread[k],NULL);
pthread_mutex_destroy(&lock);
next_step(world, one_step, neighbors);
// Predict two steps forward
for (int k=0; k<nthreads; k++)
{
threadinfo[k].world = one_step;
threadinfo[k].neighbors = neighbors_2;
}
for (int k=0; k<nthreads; k++)
pthread_create(&thread[k], NULL, thread_func, (void *)&threadinfo[k]);
for (int k=0; k<nthreads; k++)
pthread_join(thread[k],NULL);
//count_neighbors(one_step,neighbors_2);
next_step(one_step, two_steps, neighbors_2);
// Compare all predicted steps
status = compare(world, one_step, two_steps, old, status);
// Update world with two steps
update(world, two_steps);
for(int i = 0; i < N; i++)
{
for(int j = 0; j < N; j+=2)
{
neighbors[i][j] = 0;
neighbors[i][j+1] = 0;
neighbors_2[i][j] = 0;
neighbors_2[i][j+1] = 0;
}
}
if((output == 2) && (status == 1)){
printf("Step %d:\n", t);
print_world(one_step);
printf("Step %d:\n", t+1);
print_world(two_steps);
}
// Save previous step
update(old, world);
//t+=1;
t+=2;
}
//printf("It took %d steps to reach the final state\n", t-2);
printf("It took %d steps to reach the final state\n", (t-3));
if(output==1 || output ==2){
printf("Final state:\n");
print_world(world);
}
for (int i = 0; i < (6*n); i++)
{
free(buffer[i]);
}
free(buffer);
}
void * thread_func(void *arg)
{
pthread_mutex_lock(&lock);
t_info *threadinfo = arg;
int threadID = threadinfo->threadID;
int low = threadinfo->low;
int high = threadinfo->high;
//int **world = threadinfo->world;
//int **neighbors = threadinfo->neighbors;
int i; //rows
int j; //col
for (i = low; i <= high; i++){
for (j = 0; j <= N-1; j++){
if (i > 0){
if (j > 0){
if (threadinfo->world[i-1][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j < N-1){
if (threadinfo->world[i-1][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (threadinfo->world[i-1][j] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (i < N-1){
if (j > 0){
if (threadinfo->world[i+1][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j < N-1){
if (threadinfo->world[i+1][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (threadinfo->world[i+1][j] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j > 0){
if (threadinfo->world[i][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if(j < N-1){
if (threadinfo->world[i][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
}
}
pthread_mutex_unlock(&lock);
pthread_exit(NULL);
}
void print_world(int **world)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j+=2)
{
printf("%d ", world[i][j]);
printf("%d ", world[i][j+1]);
}
printf("\n");
}
printf("\n");
}
void count_neighbors(int **world, int **neighbors)
{
int i; //rows
int j; //col
for (i = 0; i <= N-1; i++){
for (j = 0; j <= N-1; j++){
if (i > 0){
if (j > 0){
if (world[i-1][j-1] == 1)
neighbors[i][j] +=1;
}
if (j < N-1){
if (world[i-1][j+1] == 1)
neighbors[i][j] +=1;
}
if (world[i-1][j] == 1)
neighbors[i][j] +=1;
}
if (i < N-1){
if (j > 0){
if (world[i+1][j-1] == 1)
neighbors[i][j] +=1;
}
if (j < N-1){
if (world[i+1][j+1] == 1)
neighbors[i][j] +=1;
}
if (world[i+1][j] == 1)
neighbors[i][j] +=1;
}
if (j > 0){
if (world[i][j-1] == 1)
neighbors[i][j] +=1;
}
if(j < N-1){
if (world[i][j+1] == 1)
neighbors[i][j] +=1;
}
}
}
}
void next_step(int **world, int **one_step, int **neighbors)
{
int i, j;
for (i = 0; i < N; i++){
for (j = 0; j < N; j++){
if (world[i][j] == 1)
{
if (neighbors[i][j] == 2 || neighbors[i][j] == 3)
one_step[i][j] = 1;
else
one_step[i][j] = 0;
}
else if (world[i][j] == 0)
{
if (neighbors[i][j] == 3)
one_step[i][j] = 1;
else
one_step[i][j] = 0;
}
}
}
}
void update(int **world, int **one_step)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j+=2)
{
world[i][j] = one_step[i][j];
world[i][j+1] = one_step[i][j+1];
}
}
}
int compare(int **world, int **one_step, int **two_steps, int **old, int status)
{
int counter1=0, counter2=0, counter3=0;
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if(world[i][j] == one_step[i][j])
counter1++;
if(world[i][j] == 0)
counter2++;
if(old[i][j] == two_steps[i][j])
counter3++;
}
}
if (counter1 == (N*N))
status = 0;
else if(counter2 == (N*N))
status = 0;
else if(counter3 == (N*N))
status = 0;
return status;
}
When I compile the code and run it using 2, 4 and 8 threads I get the following:
gcc -o gol gol.c -lpthread
time ./gol 500 1 0 2
It took 1670 steps to reach the final state
real 0m10.064s
user 0m8.971s
sys 0m0.246s
time ./gol 500 1 0 4
It took 1670 steps to reach the final state
real 0m15.694s
user 0m9.976s
sys 0m0.437s
time ./gol 500 1 0 8
It took 1670 steps to reach the final state
real 0m14.600s
user 0m10.400s
sys 0m0.855s
Also the %CPU using top is ~65% when using 2 threads, ~78% when using 4 threads and ~100% when using 8 threads. What am I doing wrong?
You've got 2 problems:
a) Creating threads and waiting for them to terminate adds overhead. Doing it inside a "while(status == 1)" loop means that you're paying for that extra overhead repeatedly. It'd be better to create the threads once (outside the loop) then re-use the existing threads, using something (e.g. condition variable) to make the threads wait for the next iteration of the loop.
b) Mutexes exist to prevent (unwanted) parallelism, and also have overhead. If the threads acquire a mutex, then do their work, then release a mutex; then you're deliberately preventing all parallelism between these threads. For your code, parallelism between the main thread and the newly created threads is also prevented (main thread sits waiting in a pthread_join()).
Essentially; you've added lots of overhead (creating and destroying threads, acquiring and releasing mutexes) and prevented all parallelism to ensure that there's no benefits to outweigh the extra overhead; leading to code that is worse than not using threads at all.
To fix this you need to find ways to ensure that threads can do useful work in parallel. The easiest way to do that would probably be to use 2 global arrays to represent the state of the world, where one of these arrays is "previous world state" and the other is "next world state", and where you swap (pointers to) the arrays between steps. In this case, during a step, "previous world state" is only being read (and many threads can read in parallel without a problem) and each thread can update different parts of "next world state" in parallel. Note that because the threads would write to each cell in "next world state" you won't need to clear "next world state" between steps either.
WARNING: To ensure that updating one element of the array won't cause "lack of atomicity" problems with other/adjacent elements in the array; you will need to either use an atomic type (sig_atomic_t, C11 I think) with a 2D array, or use "1D array of pointers to 1D arrays" (where each row can only be modified by one thread) and the element/s are volatile. Note that if the world state is an 8 * 8 grid you can probably represent a whole row with a single uint8_t (meaning that it could become "1D array of pointers to volatile uint8_t).
Basically (if you include the re-use of threads) it can be done without using any mutexes for anything other than worker threads waiting for the main thread to start the next step, and the main thread waiting for worker threads to complete the current step.
Also; instead of waiting for worker threads, the main thread can also participate in doing useful work. For example, if the world state is an 8 * 8 grid, then "main thread + 7 worker threads" can do a row each (in parallel) to ensure that all 8 rows are done. However, when threads have the same priority it's rarely sane to have more threads than CPUs; so it can be a good idea to check how many CPUs the computer has and limit the number of threads (e.g. if there are 4 CPUs, then you might have "main thread + 3 more threads do 2 rows each").

optimizing C code with imbedded for loops

void evolve(board prv, board nxt){
int i, j;
int n;
printf("\rGeneration %d\n", generation++);
if (printLazy == 1){
lazyPrint(prv);
for (j=0; j < WIDTH; ++j) {
for (i = 0; i < HEIGHT; ++i) {
n = neighbors(prv, i, j);
if (prv[i][j] && (n == 3 || n == 2))
nxt[i][j] = true;
else if (!prv[i][j] && (n == 3))
nxt[i][j] = true;
else
nxt[i][j] = false;
}
}
}
** Some asked me to add the neighbors method so
static int neighbors (board b, int i, int j) {
int n = 0;
int i_left = max(0,i-1);
int i_right = min(HEIGHT, i+2);
int j_left = max(0,j-1);
int j_right = min(WIDTH, j+2);
int ii, jj;
for (ii = i_left; ii < i_right; ++ii) {
for (jj = j_left; jj < j_right; ++jj) {
n += b[ii][jj];
}
}
return n - b[i][j];
}
So I am working on optimizing this so that it will go faster and I'm stuck on how to optimize this more. Here's what I have so far
void evolve(board prv, board nxt) {
register int i, j;
int n;
bool next;
printf("\rGeneration %d\n", generation++);
if (printLazy == 1){
lazyPrint(prv);
}
for (j=0; j < WIDTH; ++j) {
for (i = 0; i < HEIGHT; ++i) {
n = neighbors(prv, i, j);
if (prv[i][j])
if (n == 2)
next = true;
else if (n == 3)
next = true;
else
next = false;
else
if(n == 3)
next = true;
else
next = false;
nxt[i][j] = next;
}
}
}
Is there a better way to do this or are there any resources or videos y'all recommend?
Thanks, any help is appreciated.
Some ideas Inline your function neighbors(). Or turn it into a macro. Tidy up the conditional. To unroll the inner loop replace every use of i with the literal values so your code looks like :
for (j =0;.......
n = fun(prev, 0 ,j);
If.....
n = fun(prev, 1, j);
if......
and so on.
If the value of HEIGHT was let's say 100, then you get a code explosion of 100 function calls and 100 compound conditionals. Even worse if you unroll the outer loop.
If n was limited to say 8 neighbors, use a lookup table
bool foo[2][8] = { [1][2] = true, [1][3] = true, [0][3] = true };
for (j=0; j < WIDTH; ++j) {
for (i = 0; i < HEIGHT; ++i) {
n = neighbors(prv, i, j);
nxt[i][j] = foo[prv[i][j]][n];
}
}
A common weakness is the neighbors(prv, i, j) function itself. One trick to to oversize the 2D array by 1 on all four sides and populate the edge with false so neighbors() can always check 8 neighbors as it is never used on the edge/corners.
Making sure the 2nd dimension is a power of 2 helps also - simplifies index calculation. So if the original array way 12*11, make the new array (1+12+1)*(1+11+1+4) or 14*16.

Task Parallelism Open MP (C Language)

I have a problem with making from the sequential algorithm parallel one with task parallelism.
My problem is the Minimum Cut of a Graph and in sequential case, I can achieve the correct result.
Here is part of my code:
main.c
struct ProblemInstance instance = readFromFile(file);
bool *solution = malloc(sizeof(bool) * vertexCount);
for (int n = 0; n < vertexCount; n++)
solution[n] = false;
#pragma omp parallel num_threads(2)
{
printf("Number of thread: %d \n", omp_get_thread_num());
#pragma omp single
recursiveBruteForce(solution, 0, 0);
}
problem.c
void recursiveBruteForce(bool *solution, float cutSum, int depth) {
// In case if it is not correct
if (checkPartialSolution(solution, depth)) return;
if (cutSum > minCutValue) return;
if (depth == vertexCount) {
minCutValue = cutSum;
for (int i = 0; i < vertexCount; i++)
minCutArray[i] = solution[i];
return;
}
solution[depth] = false;
#pragma omp task
recursiveBruteForce(solution, minCutSum(solution, depth + 1), depth + 1);
solution[depth] = true;
#pragma omp task
recursiveBruteForce(solution, minCutSum(solution, depth + 1), depth + 1);
}
// Check Particular Solution to how many 1 and 0 have
bool checkPartialSolution(const bool *solution, int depth) {
int a = 0, n = 0;
for (int i = 0; i < depth; i++) {
if (solution[i])
a = a + 1;
else
n = n + 1;
if (a > subgroupSize || n > (vertexCount - subgroupSize))
return true;
}
return false;
}
// Get a sum of Sub-Graph
double minCutSum(const bool *solution, int depth) {
float sum = 0;
for (int i = 0; i < depth; i++) {
for (int j = 0; j < i; j++) {
if (solution[j] != solution[i])
sum += graphConnections[j][i];
}
}
return sum;
}
I have tried to look at how many calls did recursiveBruteForce function. In sequential, it is approximately 22,000, while in parallel 62. I think the problem with memory.
Any suggestions?

RoundRobin Using Linked List. Finish Time Calculation Issue

we are trying to achieve round robin algorithm using linked list.
But My logic has some errors.
When I try to run it for 3 process then the first process values are wrong and sometimes right for the other below processes.
Please Help me.
I tried Searching for logics
Code Link: https://pastebin.com/FkbtUEaQ
#include<stdio.h>
struct process
{
char na[20];
int at, bt, ft, tat, rem;
//float ntat;
} Q[5], temp;
void roundRobin()
{
int rr[20], q, x, k;
int f, r, n, i, j, tt = 0, qt, t, flag, wt = 0;
float awt = 0, antat = 0, atat = 0;
printf("Enter the no. of jobs:");
scanf("%d", &n);
for (r = 0; r < n; r++)// aceppting arrival; and burst time
{
printf("Enter process name,arrival time and burst time:\n");
scanf("%s%d%d", Q[r].na, &Q[r].at, &Q[r].bt);
}
printf("Enter quantum:\n");
scanf("%d", &qt);
for (i = 0; i < n; i++)
{
for (j = i + 1; j < n; j++)
{
if (Q[i].at < Q[j].at) {
temp = Q[i];
Q[i] = Q[j];
Q[j] = temp;
}
}
}
for (i = 0; i < n; i++)
{
Q[i].rem = Q[i].bt;
Q[i].ft = 0;
}
tt = 0;
q = 0;
rr[q] = 0;
do
{
for (j = 0; j < n; j++)
if (tt >= Q[j].at)
{
x = 0;
for (k = 0; k <= q; k++)
if (rr[k] == j)
x++;
if (x == 0)
{
q++;
rr[q] = j;
}
}
if (q == 0)
i = 0;
if (Q[i].rem == 0)
i++;
if (i > q)
i = (i - 1) % q;
if (i <= q)
{
if (Q[i].rem > 0)
{
if (Q[i].rem < qt)
{
tt += Q[i].rem;
Q[i].rem = 0;
} else
{
tt += qt;
Q[i].rem -= qt;
}
Q[i].ft = tt;
}
i++;
}
flag = 0;
for (j = 0; j < n; j++)
if (Q[j].rem > 0)
flag++;
} while (flag != 0);
printf("\n\n\t\tROUND ROBIN ALGORITHM");
printf("\n***************************");
printf("\nprocesses Arrival time burst time finish time tat wt ntat");
for (f = 0; f < n; f++) {
wt = Q[f].ft - Q[f].bt - Q[f].at;
Q[f].tat = Q[f].ft - Q[f].at;
Q[f].ntat = (float) Q[f].tat / Q[f].bt;
antat += Q[f].ntat;
atat += Q[f].tat;
awt += wt;
printf("\n\t%s\t%d\t%d\t%d\t%d\t%d %f", Q[f].na, Q[f].at, Q[f].bt,
Q[f].ft, Q[f].tat, wt, Q[f].ntat);
}
antat /= n;
atat /= n;
awt /= n;
printf("\nAverage tat is %f", atat);
printf("\nAverage normalised tat is %f", antat);
printf("\n average waiting time is %f", awt);
}
void main()
{
roundRobin();
getch();
clrscr();
}
The First Process Gives Wrong Values
processes | ArrivalTime | BurstTime | FinishTime | Tat | WaitTime
a 0 10 60 60 50
b 0 20 30 30 10
c 0 30 50 50 20
In the do while loop you use i to index the rr array with valid indexes 0 to q as well as the Q array with valid indexes 0 to n − 1, of which only the former is correct. So, you have to change every occurrence of Q[i] in this loop to Q[rr[i]].
After that, still the order of the statements
if (Q[rr[i]].rem == 0)
i++;
if (i > q)
i = (i - 1) % q;
is wrong - the test for Q[rr[i]].rem == 0 is to be done each time a new i is chosen, e. g.:
while (Q[rr[i %= q+1]].rem == 0) i++;

Why won't for loop terminate?

My function:
int checkSE(disk board[][SIZE], disk hypotheticalDisk)
{
int i;
int j;
int row;
int col;
int player;
int opponent;
int checkSEflag;
player = hypotheticalDisk.type;
(player == 0) ? (opponent = 1) : (opponent = 0);
row = hypotheticalDisk.pos.row;
col = hypotheticalDisk.pos.col;
checkSEflag = 0;
for (i = row + 2, j = col + 2; ((i < SIZE) && (j < SIZE) && (checkSEflag == 0)); i++, j++)
{
if (board[i][j].type == player)
{
for (--i, --j; board[i][j].type == opponent; i--, j--)
{
if (i == row && j == col)
{
checkSEflag = 1;
break;
}
}
}
printf("\n%d and %d and %d", i, j, checkSEflag);
}
return checkSEflag;
}
My output:
2 and 3 and 0
2 and 3 and 0
2 and 3 and 0
2 and 3 and 0
2 and 3 and 0
.
.
.
And it keeps on going...
I want both i and j to increase until they are equal to SIZE (SIZE predefined to be 8) or until checkSEflag is assigned to be equal to 1.
It looks like the values of i and j just aren't being changed...
I tried taking them out of the loop conditions and instead placed them
in the loop body, though that didn't change anything.
I doubt the post increment operators just decided to not work so I must be doing something wrong, any ideas of what that may be?
These two lines:
for(i = row+2, j = col+2; ((i < SIZE) && (j <SIZE) && (checkSEflag == 0)); i++, j++)
...
for(--i, --j; board[i][j].type == opponent; i--, j--)
so, you are both incrementing and decrementing (i,j); try sprinkling printfs around these and see if you are both incrementing and decrementing i,j on each iteration...

Resources