Pthread conditionals - c

I can post my code if necessary, but my question is primarily conceptual. I am implementing Gaussian elimination with threading. I have p pthreads operating on an nxn matrix in column major order. Before any p-thread can start operating on a column, row operations must be done to move the row with the largest value in that column up to the diagonal. So I need every thread to wait and then operate in unison. Currently, at each column, Each thread checks its id, the one with id=0 will perform the row operations. My problem is how to get all the threads but id=0 to wait and then operate in Unison.
I've tried using mutex locks and conditionals. These don't seem to work because they give all access rights to a single thread. From what I understand, one can only block a thread in this manner by having it request a lock where one already exists, so it must wait. This would be a problem in my case because I don't want any of the non-0 threads to have a lock, once they are unlocked I want them to operate freely until they finish their work on column.
I tried to avoid Mutex locks by simply having a global "colReady" variable set to 0. The non-zero threads while loop until colReady = True. Logically this makes perfect sense, but it has not worked programatically.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <pthread.h>
#define n 20
#define numThr 3
double **matrix;
int pivotReady[n] = { 0 };
pthread_cond_t cond_pivot_ready;
pthread_mutex_t cond_mutex;
int swapRows (int row1, int row2)
{
double *tempRow = matrix[row1];
matrix[row1] = matrix[row2];
matrix[row2] = tempRow;
return 0;
}
void randinit ()
{
int i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n + 1; j++) {
matrix[i][j] = drand48 ();
}
}
}
void rowReduce (void *arg);
void printMatrix ()
{
int i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n + 1; j++) {
printf (" %4.2f ", matrix[i][j]);
}
printf ("\n");
}
}
int main ()
{
pthread_cond_init (&cond_pivot_ready, NULL);
pthread_mutex_init (&cond_mutex, NULL);
int i, j;
double temp;
pthread_t p_threads[numThr];
pthread_attr_t attr;
pthread_attr_init (&attr);
//create matrix
matrix = malloc (sizeof (double *) * n);
for (i = 0; i < n; i++) {
*(matrix + i) = malloc (sizeof (double) * (n + 1));
}
randinit ();
for (i = 0; i < numThr; i++) {
pthread_create (&p_threads[i], &attr, rowReduce, (void *) ((long) i));
}
for (i = 0; i < numThr; i++) {
pthread_join (p_threads[i], NULL);
}
printf ("Final Matrix:\n");
printMatrix ();
return 0;
}
void rowReduce (void *arg)
{
int id = (int) arg;
int i, pivot, row;
double ratio, temp, max;
int maxRow;
for (pivot = 0; pivot < n - 1; pivot++) {
//PIVOT THREAD
if (id == 0) {
pthread_mutex_lock (&cond_mutex);
max = matrix[pivot][pivot]
maxRow = pivot;
for (i = pivot + 1; i < n; i++) {
temp = matrix[i][pivot];
if (temp > max) {
max = temp;
maxRow = i;
}
}
swapRows (pivot, maxRow);
pivotReady[pivot] = 1;
pthread_cond_signal (&cond_pivot_ready);
for (row = pivot + 1 + id; row < n; row += numThr) {
ratio = matrix[row][pivot] / matrix[pivot][pivot];
printf ("t1: row = %d, piv = %d, ratio = %f\n", row, pivot,
ratio);
for (int i = pivot; i < n + 1; i++) {
matrix[row][i] -= ratio * matrix[pivot][i];
}
}
pthread_mutex_unlock (&cond_mutex);
}
//NON-PIVOT THREAD
else {
pthread_mutex_lock (&cond_mutex);
while (!(pivotReady[pivot])) {
pthread_cond_wait (&cond_pivot_ready, &cond_mutex);
}
for (row = pivot + 1 + id; row < n; row += numThr) {
ratio = matrix[row][pivot] / matrix[pivot][pivot];
for (int i = pivot; i < n + 1; i++) {
matrix[row][i] -= ratio * matrix[pivot][i];
}
}
pthread_mutex_unlock (&cond_mutex);
}
}
//printf("rowReduce called with id = %d\n", id);
pthread_exit (0);
}
This program SHOULD print a random matrix that has been put in upper triangular form.

You only need to hold the cond_mutex while you are accessing pivotReady[pivot], because that's the only shared state it protects.
You also need to use pthread_cond_broadcast() rather than pthread_cond_signal(), because you need all the waiting threads to proceed once the pivot is ready.
After a minor refactoring so that the row processing code isn't repeated, it looks like:
for (pivot = 0; pivot < n - 1; pivot++) {
//PIVOT THREAD
if (id == 0) {
max = matrix[pivot][pivot];
maxRow = pivot;
for (i = pivot + 1; i < n; i++) {
temp = matrix[i][pivot];
if (temp > max) {
max = temp;
maxRow = i;
}
}
swapRows (pivot, maxRow);
pthread_mutex_lock (&cond_mutex);
pivotReady[pivot] = 1;
pthread_cond_broadcast (&cond_pivot_ready);
pthread_mutex_unlock (&cond_mutex);
}
//NON-PIVOT THREAD
else {
pthread_mutex_lock (&cond_mutex);
while (!(pivotReady[pivot])) {
pthread_cond_wait (&cond_pivot_ready, &cond_mutex);
}
pthread_mutex_unlock (&cond_mutex);
}
for (row = pivot + 1 + id; row < n; row += numThr) {
ratio = matrix[row][pivot] / matrix[pivot][pivot];
for (int i = pivot; i < n + 1; i++) {
matrix[row][i] -= ratio * matrix[pivot][i];
}
}
}

Related

Parallel c code using pthreads.h uses less than 100%CPU and is slower when using more threads

So I have a project where I needed to implement the game of life and then parallelise it in c. However, when I try using pthreads.h to parallelise it the program runs slower when introducing more threads and the %CPU is lower than 100% (when using top in the ubuntu terminal, I have an Ubuntu Windows subsystem). Here's my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <pthread.h>
/* Parallel code with PTHREADS v1 */
//Global variables
int N; // Size of the world
int nthreads; // Number of threads
pthread_mutex_t lock;
typedef struct info_thread
{
int threadID; // thread ID
int low; // lower limit of interval
int high; // higher limit of interval
int **world; // pointer to the world matrix
int **neighbors; // pointer to the neighbors matrix
//int **neighbors_2; // pointer to the neighbors_2 matrix
//int **one_step; // pointer to the one_step matrix
}t_info;
void * thread_func(void *arg);
void print_world(int **world);
void count_neighbors(int **world, int **neighbors);
void next_step(int **world, int **one_step, int **neighbors);
void update(int **world, int **one_step);
int compare(int **world, int **one_step, int **two_steps, int **old, int status);
int main(int argc, const char *argv[])
{
if (argc != 5)
{
printf("Give the following input arguments:\n");
printf("N: Size of the NxN world (integer)\n");
printf("Initial state: random (0), chessboard (1)\n");
printf("Output: Number of steps until final state (0) \n");
printf(" Number of steps until final state, initial and final states (1) \n");
printf(" Number of steps until final state and all states states (2) \n");
printf("Threads: Number of threads (integer)\n");
exit(0);
}
N = atoi(argv[1]);
const int pattern = atoi(argv[2]);
const int output = atoi(argv[3]);
nthreads = atoi(argv[4]);
// Create necessary matrices
const int n = N+1;
int **buffer = (int **)malloc(6 * n * sizeof(int *));
for(int i = 0; i < (6*n); i++)
{
buffer[i] = (int *)malloc(n*sizeof(int));
}
int **world = &buffer[0];
int **neighbors = &buffer[n];
int **neighbors_2 = &buffer[2*n];
int **one_step = &buffer[3*n];
int **two_steps = &buffer[4*n];
int **old = &buffer[5*n];
// Setting a random initial pattern
if(pattern == 0){
srand(time(0));
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
int r = rand() % 10;
if (r > 5)
world[i][j] = 1;
else
world[i][j] = 0;
}
}
}
// Setting a chessboard initial state
else if(pattern == 1){
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if(i%2 == 0){
if(j%2 == 0)
world[i][j] = 0;
else
world[i][j] = 1;
}
else{
if(j%2 == 0)
world[i][j] = 1;
else
world[i][j] = 0;
}
}
}
}
if(output==1 || output==2){
printf("Initial state:\n");
print_world(world);
}
int status = 1;
int t = 1;
update(old, world);
// Create threads and input info
pthread_t thread[nthreads];
t_info threadinfo[nthreads];
const int interval = N/nthreads;
while(status == 1)
{
for (int k=0; k<nthreads; k++)
{
threadinfo[k].threadID = k;
threadinfo[k].low = k*interval;
threadinfo[k].high = (k+1)*interval-1;
threadinfo[k].world = world;
threadinfo[k].neighbors = neighbors;
}
threadinfo[nthreads-1].high = N;
// Predict one step forward
pthread_mutex_init(&lock, NULL);
for (int k=0; k<nthreads; k++)
pthread_create(&thread[k], NULL, thread_func, (void *)&threadinfo[k]);
for (int k=0; k<nthreads; k++)
pthread_join(thread[k],NULL);
pthread_mutex_destroy(&lock);
next_step(world, one_step, neighbors);
// Predict two steps forward
for (int k=0; k<nthreads; k++)
{
threadinfo[k].world = one_step;
threadinfo[k].neighbors = neighbors_2;
}
for (int k=0; k<nthreads; k++)
pthread_create(&thread[k], NULL, thread_func, (void *)&threadinfo[k]);
for (int k=0; k<nthreads; k++)
pthread_join(thread[k],NULL);
//count_neighbors(one_step,neighbors_2);
next_step(one_step, two_steps, neighbors_2);
// Compare all predicted steps
status = compare(world, one_step, two_steps, old, status);
// Update world with two steps
update(world, two_steps);
for(int i = 0; i < N; i++)
{
for(int j = 0; j < N; j+=2)
{
neighbors[i][j] = 0;
neighbors[i][j+1] = 0;
neighbors_2[i][j] = 0;
neighbors_2[i][j+1] = 0;
}
}
if((output == 2) && (status == 1)){
printf("Step %d:\n", t);
print_world(one_step);
printf("Step %d:\n", t+1);
print_world(two_steps);
}
// Save previous step
update(old, world);
//t+=1;
t+=2;
}
//printf("It took %d steps to reach the final state\n", t-2);
printf("It took %d steps to reach the final state\n", (t-3));
if(output==1 || output ==2){
printf("Final state:\n");
print_world(world);
}
for (int i = 0; i < (6*n); i++)
{
free(buffer[i]);
}
free(buffer);
}
void * thread_func(void *arg)
{
pthread_mutex_lock(&lock);
t_info *threadinfo = arg;
int threadID = threadinfo->threadID;
int low = threadinfo->low;
int high = threadinfo->high;
//int **world = threadinfo->world;
//int **neighbors = threadinfo->neighbors;
int i; //rows
int j; //col
for (i = low; i <= high; i++){
for (j = 0; j <= N-1; j++){
if (i > 0){
if (j > 0){
if (threadinfo->world[i-1][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j < N-1){
if (threadinfo->world[i-1][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (threadinfo->world[i-1][j] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (i < N-1){
if (j > 0){
if (threadinfo->world[i+1][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j < N-1){
if (threadinfo->world[i+1][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (threadinfo->world[i+1][j] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j > 0){
if (threadinfo->world[i][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if(j < N-1){
if (threadinfo->world[i][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
}
}
pthread_mutex_unlock(&lock);
pthread_exit(NULL);
}
void print_world(int **world)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j+=2)
{
printf("%d ", world[i][j]);
printf("%d ", world[i][j+1]);
}
printf("\n");
}
printf("\n");
}
void count_neighbors(int **world, int **neighbors)
{
int i; //rows
int j; //col
for (i = 0; i <= N-1; i++){
for (j = 0; j <= N-1; j++){
if (i > 0){
if (j > 0){
if (world[i-1][j-1] == 1)
neighbors[i][j] +=1;
}
if (j < N-1){
if (world[i-1][j+1] == 1)
neighbors[i][j] +=1;
}
if (world[i-1][j] == 1)
neighbors[i][j] +=1;
}
if (i < N-1){
if (j > 0){
if (world[i+1][j-1] == 1)
neighbors[i][j] +=1;
}
if (j < N-1){
if (world[i+1][j+1] == 1)
neighbors[i][j] +=1;
}
if (world[i+1][j] == 1)
neighbors[i][j] +=1;
}
if (j > 0){
if (world[i][j-1] == 1)
neighbors[i][j] +=1;
}
if(j < N-1){
if (world[i][j+1] == 1)
neighbors[i][j] +=1;
}
}
}
}
void next_step(int **world, int **one_step, int **neighbors)
{
int i, j;
for (i = 0; i < N; i++){
for (j = 0; j < N; j++){
if (world[i][j] == 1)
{
if (neighbors[i][j] == 2 || neighbors[i][j] == 3)
one_step[i][j] = 1;
else
one_step[i][j] = 0;
}
else if (world[i][j] == 0)
{
if (neighbors[i][j] == 3)
one_step[i][j] = 1;
else
one_step[i][j] = 0;
}
}
}
}
void update(int **world, int **one_step)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j+=2)
{
world[i][j] = one_step[i][j];
world[i][j+1] = one_step[i][j+1];
}
}
}
int compare(int **world, int **one_step, int **two_steps, int **old, int status)
{
int counter1=0, counter2=0, counter3=0;
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if(world[i][j] == one_step[i][j])
counter1++;
if(world[i][j] == 0)
counter2++;
if(old[i][j] == two_steps[i][j])
counter3++;
}
}
if (counter1 == (N*N))
status = 0;
else if(counter2 == (N*N))
status = 0;
else if(counter3 == (N*N))
status = 0;
return status;
}
When I compile the code and run it using 2, 4 and 8 threads I get the following:
gcc -o gol gol.c -lpthread
time ./gol 500 1 0 2
It took 1670 steps to reach the final state
real 0m10.064s
user 0m8.971s
sys 0m0.246s
time ./gol 500 1 0 4
It took 1670 steps to reach the final state
real 0m15.694s
user 0m9.976s
sys 0m0.437s
time ./gol 500 1 0 8
It took 1670 steps to reach the final state
real 0m14.600s
user 0m10.400s
sys 0m0.855s
Also the %CPU using top is ~65% when using 2 threads, ~78% when using 4 threads and ~100% when using 8 threads. What am I doing wrong?
You've got 2 problems:
a) Creating threads and waiting for them to terminate adds overhead. Doing it inside a "while(status == 1)" loop means that you're paying for that extra overhead repeatedly. It'd be better to create the threads once (outside the loop) then re-use the existing threads, using something (e.g. condition variable) to make the threads wait for the next iteration of the loop.
b) Mutexes exist to prevent (unwanted) parallelism, and also have overhead. If the threads acquire a mutex, then do their work, then release a mutex; then you're deliberately preventing all parallelism between these threads. For your code, parallelism between the main thread and the newly created threads is also prevented (main thread sits waiting in a pthread_join()).
Essentially; you've added lots of overhead (creating and destroying threads, acquiring and releasing mutexes) and prevented all parallelism to ensure that there's no benefits to outweigh the extra overhead; leading to code that is worse than not using threads at all.
To fix this you need to find ways to ensure that threads can do useful work in parallel. The easiest way to do that would probably be to use 2 global arrays to represent the state of the world, where one of these arrays is "previous world state" and the other is "next world state", and where you swap (pointers to) the arrays between steps. In this case, during a step, "previous world state" is only being read (and many threads can read in parallel without a problem) and each thread can update different parts of "next world state" in parallel. Note that because the threads would write to each cell in "next world state" you won't need to clear "next world state" between steps either.
WARNING: To ensure that updating one element of the array won't cause "lack of atomicity" problems with other/adjacent elements in the array; you will need to either use an atomic type (sig_atomic_t, C11 I think) with a 2D array, or use "1D array of pointers to 1D arrays" (where each row can only be modified by one thread) and the element/s are volatile. Note that if the world state is an 8 * 8 grid you can probably represent a whole row with a single uint8_t (meaning that it could become "1D array of pointers to volatile uint8_t).
Basically (if you include the re-use of threads) it can be done without using any mutexes for anything other than worker threads waiting for the main thread to start the next step, and the main thread waiting for worker threads to complete the current step.
Also; instead of waiting for worker threads, the main thread can also participate in doing useful work. For example, if the world state is an 8 * 8 grid, then "main thread + 7 worker threads" can do a row each (in parallel) to ensure that all 8 rows are done. However, when threads have the same priority it's rarely sane to have more threads than CPUs; so it can be a good idea to check how many CPUs the computer has and limit the number of threads (e.g. if there are 4 CPUs, then you might have "main thread + 3 more threads do 2 rows each").

Task Parallelism Open MP (C Language)

I have a problem with making from the sequential algorithm parallel one with task parallelism.
My problem is the Minimum Cut of a Graph and in sequential case, I can achieve the correct result.
Here is part of my code:
main.c
struct ProblemInstance instance = readFromFile(file);
bool *solution = malloc(sizeof(bool) * vertexCount);
for (int n = 0; n < vertexCount; n++)
solution[n] = false;
#pragma omp parallel num_threads(2)
{
printf("Number of thread: %d \n", omp_get_thread_num());
#pragma omp single
recursiveBruteForce(solution, 0, 0);
}
problem.c
void recursiveBruteForce(bool *solution, float cutSum, int depth) {
// In case if it is not correct
if (checkPartialSolution(solution, depth)) return;
if (cutSum > minCutValue) return;
if (depth == vertexCount) {
minCutValue = cutSum;
for (int i = 0; i < vertexCount; i++)
minCutArray[i] = solution[i];
return;
}
solution[depth] = false;
#pragma omp task
recursiveBruteForce(solution, minCutSum(solution, depth + 1), depth + 1);
solution[depth] = true;
#pragma omp task
recursiveBruteForce(solution, minCutSum(solution, depth + 1), depth + 1);
}
// Check Particular Solution to how many 1 and 0 have
bool checkPartialSolution(const bool *solution, int depth) {
int a = 0, n = 0;
for (int i = 0; i < depth; i++) {
if (solution[i])
a = a + 1;
else
n = n + 1;
if (a > subgroupSize || n > (vertexCount - subgroupSize))
return true;
}
return false;
}
// Get a sum of Sub-Graph
double minCutSum(const bool *solution, int depth) {
float sum = 0;
for (int i = 0; i < depth; i++) {
for (int j = 0; j < i; j++) {
if (solution[j] != solution[i])
sum += graphConnections[j][i];
}
}
return sum;
}
I have tried to look at how many calls did recursiveBruteForce function. In sequential, it is approximately 22,000, while in parallel 62. I think the problem with memory.
Any suggestions?

First thread not running with given argument

int run_me(unsigned long prime, unsigned long max, int *ary) {
unsigned long i;
printf("\nI am %d", prime);
if(prime > sqrt(max)) {
return 1; /* do no run */
}
for(i = 3; i*prime < max; i+=2) {
ary[i*prime - 1] = 1;
}
return 0;
}
typedef struct Args {
unsigned long max, prime;
int *ary;
} args;
void *thread_runner(void *all_args) {
args *my_args = all_args;
run_me(my_args->prime, my_args->max, my_args->ary);
return 0;
}
unsigned long *sieve_of_eratosthenes(unsigned long begin, unsigned long end) {
unsigned long i, j, arylen, *ary_to_ret;
unsigned long current_primes[4] = {3, 5, 7, 11}; /* holds primes being used by threads*/
int *ary_of_all;
pthread_t threads[4];
args *curr;
curr = malloc(sizeof(args));
ary_of_all = calloc(end, sizeof(int));
arylen = end - begin + 2;
ary_to_ret = calloc(arylen, sizeof(unsigned long));
ary_of_all[0] = 1;
/*mark all even numbers*/
for(i = 1; 2 * i < end; i++) {
ary_of_all[2*i - 1] = 1;
}
while(current_primes[3] < sqrt(end)) {
/*run threads with current primes*/
for(i = 0; i < 4; i++) {
curr->prime = current_primes[i];
curr->max = end;
curr->ary = ary_of_all;
pthread_create(&threads[i], NULL, thread_runner, curr);
}
/* join all threads */
for(i = 0; i < 4; i++) {
pthread_join(threads[i], NULL);
}
j = 0; /* number of primes found */
/*find new primes*/
for(i = current_primes[3] + 2; i < end && j < 4; i+=2) {
if(ary_of_all[i - 1] == 0) {
current_primes[j] = i;
j++;
}
}
}
/*run threads one more time*/
if(current_primes[0] <= sqrt(end)) {
for(i = 0; i < 4; i++) {
curr->prime = current_primes[i];
curr->max = end;
curr->ary = ary_of_all;
pthread_create(&threads[i], NULL, thread_runner, curr);
}
/* join all threads */
for(i = 0; i < 4; i++) {
pthread_join(threads[i], NULL);
}
}
/*create the array to be returned*/
j = 0; /*pos in *ary_to_ret*/
for(i = begin; i <= end; i++) {
if(ary_of_all[i-1] == 0) {
ary_to_ret[j] = i;
j++;
}
}
ary_to_ret[j] = 0; /* null terminate */
ary_to_ret = realloc(ary_to_ret, (j+1) * sizeof(unsigned long));
return ary_to_ret;
}
I am running the above code in order to get a list of primes given a high and low value using the Sieve of Eratosthenes. I have the code mostly working, however when I run this code the thread which I create using the first element in my curr_primes array is never used and instead runs 5, 7, 11, 11. It does this every time it runs through the array and repopulates it. I was wondering if someone could explain to me why it runs in this way.
You are passing the same curr pointer to all the threads. You are lucky that it even works as well as you have observed as that is a huge race condition. Instead, the code needs to pass a seperate arg buffer to each thread. Here is one example:
/* doesn't really need to be dynamic memory in this simple example */
args curr[4];
for(i = 0; i < 4; i++) {
curr[i].prime = current_primes[i];
curr[i].max = end;
curr[i].ary = ary_of_all;
pthread_create(&threads[i], NULL, thread_runner, &curr[i]);
}
/* join all threads */
for(i = 0; i < 4; i++) {
pthread_join(threads[i], NULL);
}

Segmentation fault in function implementing Ford-Fulkerson

I'm working on a class assignment and I've run into an issue I haven't been able to figure out. I'm implementing the Ford-Fulkerson algorithm using BFS to find max flow. But while trying to set my Residual Capacity matrix to the given capacity, I hit a segmentation fault. In the test code we received, I can see that the original capacity matrix was passed by value by its address, but I have a feeling that in my code I'm not interacting with it the way I think I am? Which leads me to believe that I may have the same issue recurring elsewhere. I worked with gdb and saw that I hit a segmentation fault on this line here in my nested for loop :
resCap[i][j] = *(capacity + i*n + j);
However, nothing I have tried has worked for me though so I am pretty stumped.
void maximum_flow(int n, int s, int t, int *capacity, int *flow)
{
int i, j, resCap[n][n], path[n]; // residual capacity and BFS augmenting path
int min_path = INT_MAX; // min of the augmenting path
// Assign residual capacity equal to the given capacity
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
{
resCap[i][j] = *(capacity + i*n + j);
*(flow + i*n + j) = 0; // no initial flow
}
// Augment path with BFS from source to sink
while (bfs(n, s, t, &(resCap[0][0]), path))
{
// find min of the augmenting path
for (j = t; j != s; j = path[j])
{
i = path[j];
min_path = min(min_path, resCap[i][j]);
}
// update residual capacities and flows on both directions
for (j = t; j != s; j = path[j])
{
i = path[j];
if(*(capacity + i*n + j) > 0)
*(flow + i*n + j) += min_flow_path;
else
*(flow + j*n + i) -= min_flow_path;
resCap[i][j] -= min_flow_path;
resCap[j][i] += min_flow_path;
}
}
}
And here is the test code provided to us in case it is needed:
int main(void)
{ int cap[1000][1000], flow[1000][1000];
int i,j, flowsum;
for(i=0; i< 1000; i++)
for( j =0; j< 1000; j++ )
cap[i][j] = 0;
for(i=0; i<499; i++)
for( j=i+1; j<500; j++)
cap[i][j] = 2;
for(i=1; i<500; i++)
cap[i][500 + (i/2)] =4;
for(i=500; i < 750; i++ )
{ cap[i][i-250]=3;
cap[i][750] = 1;
cap[i][751] = 1;
cap[i][752] = 5;
}
cap[751][753] = 5;
cap[752][753] = 5;
cap[753][750] = 20;
for( i=754; i< 999; i++)
{ cap[753][i]=1;
cap[i][500]=3;
cap[i][498]=5;
cap[i][1] = 100;
}
cap[900][999] = 1;
cap[910][999] = 1;
cap[920][999] = 1;
cap[930][999] = 1;
cap[940][999] = 1;
cap[950][999] = 1;
cap[960][999] = 1;
cap[970][999] = 1;
cap[980][999] = 1;
cap[990][999] = 1;
printf("prepared capacity matrix, now executing maxflow code\n");
maximum_flow(1000,0,999,&(cap[0][0]),&(flow[0][0]));
for(i=0; i<=999; i++)
for(j=0; j<=999; j++)
{ if( flow[i][j] > cap[i][j] )
{ printf("Capacity violated\n"); exit(0);}
}
flowsum = 0;
for(i=0; i<=999; i++)
flowsum += flow[0][i];
printf("Outflow of 0 is %d, should be 10\n", flowsum);
flowsum = 0;
for(i=0; i<=999; i++)
flowsum += flow[i][999];
printf("Inflow of 999 is %d, should be 10\n", flowsum);
printf("End Test\n");
}
This line is likely going to segfault, it does using Clang.
int i, j, resCap[n][n], path[n];
You're declaring a very large array on the stack. Just how big can be seen when you try and allocated it using calloc. Try this instead and don't forget to free it using the same sort of loop.
int **resCap2 = calloc(1, n * sizeof(int *));
assert(resCap2);
for (i = 0; i < n; i++) {
resCap2[i] = calloc(1, n * sizeof(int));
assert(resCap2[i]);
}
This is a lot of space ie
(1000 * sizeof(int*) * (1000 * n * sizeof(int)))

segmentation fault while using pthread library

I am a newbie to threading and i am trying to change the sequential program of travelling salesman problem (dynamic programming) to parallel program using threading in c.
#include <stdio.h>
#include <limits.h>
#define size 10 //maximum 10 cities
#define min(a,b) (a > b ? b : a)
#define sizePOW 1024 // 2^10
//Space complexity: O(n * 2^n)
//Time complexity: O(n^2 * 2^n)
int n; npow;
int g[size][sizePOW];
int p[size][sizePOW];
int adj[size][size];
int compute(int start, int set) {
int masked, mask, temp, i;
int result = INT_MAX; //result stores the minimum
if (g[start][set] != -1) //memoization DP top-down,check for repeated subproblem
return g[start][set];
for (i = 0; i < n; i++) { //npow-1 because we always exclude "home" vertex from our set
mask = (npow - 1) - (1 << i); //remove ith vertex from this set
masked = set & mask;
if (masked != set) { //in case same set is generated(because ith vertex was not present in the set hence we get the same set on removal) eg 12&13=12
temp = adj[start][i] + compute(i, masked); //compute the removed set
if (temp < result)
result = temp,
p[start][set] = i; //removing ith vertex gave us minimum
}
}
return g[start][set] = result; //return minimum
}
void getpath(int start, int set) {
if (p[start][set] == -1)
return; //reached null set
int x = p[start][set];
int mask = (npow - 1) - (1 << x);
int masked = set & mask; //remove p from set
printf("%d ", x);
getpath(x, masked);
}
void TSP() {
int i, j;
//g(i,S) is length of shortest path starting at i visiting all vertices in S and ending at 1
for (i = 0; i < n; i++)
for (j = 0; j < npow; j++)
g[i][j] = p[i][j] = -1;
for (i = 0; i < n; i++)
g[i][0] = adj[i][0]; //g(i,nullset)= direct edge between (i,1)
int result = compute(0, npow - 2);//npow-2 to exclude our "home" vertex
printf("Tour cost:%d\n", result);
printf("Tour path:\n0 ");
getpath(0, npow - 2);
printf("0\n");
}
int main(void) {
int i, j;
printf("Enter number of cities\n");
scanf("%d",&n);
npow=(int)pow(2, n);//bit number required to represent all possible sets
printf("Enter the adjacency matrix\n");
for(i = 0; i < n; i++)
for(j = 0; j < n; j++)
scanf("%d", &adj[i][j]);
TSP();
return 0;
}
This is the sequential program from ideone code.
Here is my parallel code for this
#include <stdio.h>
#include <math.h>
#include <pthread.h>
#include <signal.h>
#include <errno.h>
#include <unistd.h>
#include<limits.h>
#define size 10 //maximum 10 cities
#define min(a,b) a > b ? b:a
#define sizePOW 1024 // 2^10
struct threadargs {
int a, b;
int *c;
};
//Space complexity: O(n * 2^n)
//Time complexity: O(n^2 * 2^n)
int n, npow;
int g[size][sizePOW];
int p[size][sizePOW];
int adj[size][size];
void printMatrix() {
int i, j;
for (i = 0; i < 4; i++) {
for (j = 0; j < 16; j++) {
printf("%d ",g[i][j]);
printf("\n");
}
printf("\n\n");
}
void *compute(void *args) {
int masked, mask, i, start, set;
int result = INT_MAX; //result stores the minimum
struct threadargs *recvargs = (struct threadargs *) args;
start = recvargs->a;
set = recvargs->b;
int *retval = recvargs->c;
if (g[start][set] != -1) { //memoization DP top-down,check for repeated subproblem
*retval += g[start][set];
return;
}
printMatrix();
//sleep(1);
int temp[n];
for (i = 0; i < n; i++)
temp[i] = INT_MAX;
pthread_t threads[n];
struct threadargs arguments[n];
int running_thread_count = 0;
for (i = 0; i < n; i++)
threads[i] == -1;
for (i = 0; i < n; i++) { //npow-1 because we always exclude "home" vertex from our set
mask= (npow - 1) - (1 << i); //remove ith vertex from this set
masked = set & mask;
//printf("hello world");
if (masked != set)//in case same set is generated(because ith vertex was not present in the set hence we get the same set on removal) eg 12&13=12
{
temp[i] = adj[start][i];
arguments[i].a = i;
arguments[i].b = masked;
arguments[i].c = &temp[i];
pthread_create(&threads[i], NULL, compute, (void *)&arguments[i] );
running_thread_count++;
}
}
for (i = 0; i < n; i++) {
if (pthread_kill(threads[i], 0) != ESRCH)
pthread_join(threads[i], NULL);
}
int ith = 0;
result = temp[0];
for (i = 1; i < n; i++) {
if(temp[i] < result) {
result = temp[i];
ith = i;
}
}
p[start][set] = ith;
if (result != INT_MAX)
g[start][set] = result; //return minimum
*retval += g[start][set];
}
void getpath(int start,int set)
{
if (p[start][set] == -1)
return; //reached null set
int x = p[start][set];
int mask= (npow - 1) - (1 << x);
int masked = set & mask;//remove p from set
printf("%d ",x);
getpath(x, masked);
}
void TSP()
{ int i, j;
//g(i,S) is length of shortest path starting at i visiting all vertices in S and ending at 1
for(i=0; i < n; i++)
for( j = 0; j < npow; j++)
g[i][j] = p[i][j] = -1;
for (i = 0; i < n; i++)
g[i][0] = adj[i][0]; //g(i,nullset)= direct edge between (i,1)
int result;
struct threadargs arguments;
arguments.a = 0;
arguments.b = npow-2;
arguments.c = &result;
compute((void *) &arguments);//npow-2 to exclude our "home" vertex
printf("Tour cost:%d\n",result);
printf("Tour path:\n0 ");
getpath(0,npow-2);
printf("0\n");
}
int main(void) {
int i, j;
printf("Enter number of cities\n");
scanf("%d", &n);
npow=(int)pow(2, n);//bit number required to represent all possible sets
printf("Enter the adjacency matrix\n");
for (i = 0; i < n; i++)
for (j = 0; j < n; j++)
scanf("%d", &adj[i][j]);
TSP();
return 0;
}
But I am getting segmentation fault while trying to execute this code. Following is the output of the gdb
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff78bf63e in pthread_join (threadid=4196128,
thread_return=0x0) at pthread_join.c:85
85 pthread_join.c: No such file or directory.
(gdb) backtrace
#0 0x00007ffff78bf63e in pthread_join (threadid=4196128,
thread_return=0x0) at pthread_join.c:85
#1 0x0000000000400b36 in compute (args=0x7fffffffde90) at tsp3.c:69
#2 0x0000000000400db2 in TSP () at tsp3.c:107
#3 0x0000000000400ec8 in main () at tsp3.c:120
(gdb)
I know this will not give any noticeable performance gain but I want to try this. Thanks in advance.
**edit : **
I have rectified the errors but now I am facing new errors.I am getting correct answer when program runs but if I delete the line `printMatrix()
, I get segmentation fault. The gdb log is as follows
(gdb) backtrace
#0 __pthread_kill (threadid=0, signo=0)
at ../nptl/sysdeps/unix/sysv/linux/pthread_kill.c:42
#1 0x0000000000400c85 in compute (args=0x7ffff74eede0) at tsp3.c:79
#2 0x00007ffff78be182 in start_thread (arg=0x7ffff64ed700)
at pthread_create.c:312
#3 0x00007ffff75eaefd in clone ()
at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
(gdb)
why is this happening. Please explain. Thanks in advance.
You are creating threads in compare function, and you pass compare as thread function, that's chaos. You maybe exceed available number of threads.

Resources