I have a problem with making from the sequential algorithm parallel one with task parallelism.
My problem is the Minimum Cut of a Graph and in sequential case, I can achieve the correct result.
Here is part of my code:
main.c
struct ProblemInstance instance = readFromFile(file);
bool *solution = malloc(sizeof(bool) * vertexCount);
for (int n = 0; n < vertexCount; n++)
solution[n] = false;
#pragma omp parallel num_threads(2)
{
printf("Number of thread: %d \n", omp_get_thread_num());
#pragma omp single
recursiveBruteForce(solution, 0, 0);
}
problem.c
void recursiveBruteForce(bool *solution, float cutSum, int depth) {
// In case if it is not correct
if (checkPartialSolution(solution, depth)) return;
if (cutSum > minCutValue) return;
if (depth == vertexCount) {
minCutValue = cutSum;
for (int i = 0; i < vertexCount; i++)
minCutArray[i] = solution[i];
return;
}
solution[depth] = false;
#pragma omp task
recursiveBruteForce(solution, minCutSum(solution, depth + 1), depth + 1);
solution[depth] = true;
#pragma omp task
recursiveBruteForce(solution, minCutSum(solution, depth + 1), depth + 1);
}
// Check Particular Solution to how many 1 and 0 have
bool checkPartialSolution(const bool *solution, int depth) {
int a = 0, n = 0;
for (int i = 0; i < depth; i++) {
if (solution[i])
a = a + 1;
else
n = n + 1;
if (a > subgroupSize || n > (vertexCount - subgroupSize))
return true;
}
return false;
}
// Get a sum of Sub-Graph
double minCutSum(const bool *solution, int depth) {
float sum = 0;
for (int i = 0; i < depth; i++) {
for (int j = 0; j < i; j++) {
if (solution[j] != solution[i])
sum += graphConnections[j][i];
}
}
return sum;
}
I have tried to look at how many calls did recursiveBruteForce function. In sequential, it is approximately 22,000, while in parallel 62. I think the problem with memory.
Any suggestions?
Related
I'm currently working on an essay, in which I compare sorting times of certain algorithms. I've written an optimized version of bubble sort, which checks if there is a swap, if not, it stops.
void bubble_sort(int* tab, int n) {
int zamiana, x, i;
do {
zamiana = 0;
for (int counter = 0, i = 1; i < n; ++counter, ++i) {
if (tab[counter] > tab[i]) {
x = tab[counter];
tab[counter] = tab[i];
tab[i] = x;
zamiana = 1;
}
}
} while (zamiana != 0);
}
I have found that it takes almost 0s to sort array sorted in ascending order, now I'm testing it on an array sorted in descending order, and times are almost the same as for ascending order. Is it normal?
Code tested:
#include <time.h>
#include <stdio.h>
void quickSort(int* tab, int lewy, int prawy) {
int x, y = lewy - 1, z = prawy + 1, pivot = tab[(lewy + prawy) / 2];
while (1) {
while (pivot < tab[++y]);
while (pivot > tab[--z]);
if (y <= z) {
x = tab[y];
tab[y] = tab[z];
tab[z] = x;
}
else {
break;
}
}
if (z > lewy) {
quickSort(tab, lewy, z);
}
if (y < prawy) {
quickSort(tab, y, prawy);
}
}
void bubble_sort(int* tab, int n) {
int zamiana, x, i;
do {
zamiana = 0;
for (int counter = 0, i = 1; i < n; ++counter, ++i) {
if (tab[counter] > tab[i]) {
x = tab[counter];
tab[counter] = tab[i];
tab[i] = x;
zamiana = 1;
}
}
} while (zamiana != 0);
}
int main (){
int* tab;
int n;
srand(time(NULL));
scanf("%d", &n); //user input array size
tab = (int*)malloc(n * sizeof(int*));
for (int counter = 0; counter < n; ++counter) {
tab[counter] = (rand() % 200) - 100; //<-100;100>
}
quickSort(tab, 0, n); //sorting array to get descending order
clock_t start = clock();
bubble_sort(tab, n); //sorting array
clock_t end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
printf("Time elapsed: %f", seconds);
}
So I have a project where I needed to implement the game of life and then parallelise it in c. However, when I try using pthreads.h to parallelise it the program runs slower when introducing more threads and the %CPU is lower than 100% (when using top in the ubuntu terminal, I have an Ubuntu Windows subsystem). Here's my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <pthread.h>
/* Parallel code with PTHREADS v1 */
//Global variables
int N; // Size of the world
int nthreads; // Number of threads
pthread_mutex_t lock;
typedef struct info_thread
{
int threadID; // thread ID
int low; // lower limit of interval
int high; // higher limit of interval
int **world; // pointer to the world matrix
int **neighbors; // pointer to the neighbors matrix
//int **neighbors_2; // pointer to the neighbors_2 matrix
//int **one_step; // pointer to the one_step matrix
}t_info;
void * thread_func(void *arg);
void print_world(int **world);
void count_neighbors(int **world, int **neighbors);
void next_step(int **world, int **one_step, int **neighbors);
void update(int **world, int **one_step);
int compare(int **world, int **one_step, int **two_steps, int **old, int status);
int main(int argc, const char *argv[])
{
if (argc != 5)
{
printf("Give the following input arguments:\n");
printf("N: Size of the NxN world (integer)\n");
printf("Initial state: random (0), chessboard (1)\n");
printf("Output: Number of steps until final state (0) \n");
printf(" Number of steps until final state, initial and final states (1) \n");
printf(" Number of steps until final state and all states states (2) \n");
printf("Threads: Number of threads (integer)\n");
exit(0);
}
N = atoi(argv[1]);
const int pattern = atoi(argv[2]);
const int output = atoi(argv[3]);
nthreads = atoi(argv[4]);
// Create necessary matrices
const int n = N+1;
int **buffer = (int **)malloc(6 * n * sizeof(int *));
for(int i = 0; i < (6*n); i++)
{
buffer[i] = (int *)malloc(n*sizeof(int));
}
int **world = &buffer[0];
int **neighbors = &buffer[n];
int **neighbors_2 = &buffer[2*n];
int **one_step = &buffer[3*n];
int **two_steps = &buffer[4*n];
int **old = &buffer[5*n];
// Setting a random initial pattern
if(pattern == 0){
srand(time(0));
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
int r = rand() % 10;
if (r > 5)
world[i][j] = 1;
else
world[i][j] = 0;
}
}
}
// Setting a chessboard initial state
else if(pattern == 1){
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if(i%2 == 0){
if(j%2 == 0)
world[i][j] = 0;
else
world[i][j] = 1;
}
else{
if(j%2 == 0)
world[i][j] = 1;
else
world[i][j] = 0;
}
}
}
}
if(output==1 || output==2){
printf("Initial state:\n");
print_world(world);
}
int status = 1;
int t = 1;
update(old, world);
// Create threads and input info
pthread_t thread[nthreads];
t_info threadinfo[nthreads];
const int interval = N/nthreads;
while(status == 1)
{
for (int k=0; k<nthreads; k++)
{
threadinfo[k].threadID = k;
threadinfo[k].low = k*interval;
threadinfo[k].high = (k+1)*interval-1;
threadinfo[k].world = world;
threadinfo[k].neighbors = neighbors;
}
threadinfo[nthreads-1].high = N;
// Predict one step forward
pthread_mutex_init(&lock, NULL);
for (int k=0; k<nthreads; k++)
pthread_create(&thread[k], NULL, thread_func, (void *)&threadinfo[k]);
for (int k=0; k<nthreads; k++)
pthread_join(thread[k],NULL);
pthread_mutex_destroy(&lock);
next_step(world, one_step, neighbors);
// Predict two steps forward
for (int k=0; k<nthreads; k++)
{
threadinfo[k].world = one_step;
threadinfo[k].neighbors = neighbors_2;
}
for (int k=0; k<nthreads; k++)
pthread_create(&thread[k], NULL, thread_func, (void *)&threadinfo[k]);
for (int k=0; k<nthreads; k++)
pthread_join(thread[k],NULL);
//count_neighbors(one_step,neighbors_2);
next_step(one_step, two_steps, neighbors_2);
// Compare all predicted steps
status = compare(world, one_step, two_steps, old, status);
// Update world with two steps
update(world, two_steps);
for(int i = 0; i < N; i++)
{
for(int j = 0; j < N; j+=2)
{
neighbors[i][j] = 0;
neighbors[i][j+1] = 0;
neighbors_2[i][j] = 0;
neighbors_2[i][j+1] = 0;
}
}
if((output == 2) && (status == 1)){
printf("Step %d:\n", t);
print_world(one_step);
printf("Step %d:\n", t+1);
print_world(two_steps);
}
// Save previous step
update(old, world);
//t+=1;
t+=2;
}
//printf("It took %d steps to reach the final state\n", t-2);
printf("It took %d steps to reach the final state\n", (t-3));
if(output==1 || output ==2){
printf("Final state:\n");
print_world(world);
}
for (int i = 0; i < (6*n); i++)
{
free(buffer[i]);
}
free(buffer);
}
void * thread_func(void *arg)
{
pthread_mutex_lock(&lock);
t_info *threadinfo = arg;
int threadID = threadinfo->threadID;
int low = threadinfo->low;
int high = threadinfo->high;
//int **world = threadinfo->world;
//int **neighbors = threadinfo->neighbors;
int i; //rows
int j; //col
for (i = low; i <= high; i++){
for (j = 0; j <= N-1; j++){
if (i > 0){
if (j > 0){
if (threadinfo->world[i-1][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j < N-1){
if (threadinfo->world[i-1][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (threadinfo->world[i-1][j] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (i < N-1){
if (j > 0){
if (threadinfo->world[i+1][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j < N-1){
if (threadinfo->world[i+1][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (threadinfo->world[i+1][j] == 1)
threadinfo->neighbors[i][j] +=1;
}
if (j > 0){
if (threadinfo->world[i][j-1] == 1)
threadinfo->neighbors[i][j] +=1;
}
if(j < N-1){
if (threadinfo->world[i][j+1] == 1)
threadinfo->neighbors[i][j] +=1;
}
}
}
pthread_mutex_unlock(&lock);
pthread_exit(NULL);
}
void print_world(int **world)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j+=2)
{
printf("%d ", world[i][j]);
printf("%d ", world[i][j+1]);
}
printf("\n");
}
printf("\n");
}
void count_neighbors(int **world, int **neighbors)
{
int i; //rows
int j; //col
for (i = 0; i <= N-1; i++){
for (j = 0; j <= N-1; j++){
if (i > 0){
if (j > 0){
if (world[i-1][j-1] == 1)
neighbors[i][j] +=1;
}
if (j < N-1){
if (world[i-1][j+1] == 1)
neighbors[i][j] +=1;
}
if (world[i-1][j] == 1)
neighbors[i][j] +=1;
}
if (i < N-1){
if (j > 0){
if (world[i+1][j-1] == 1)
neighbors[i][j] +=1;
}
if (j < N-1){
if (world[i+1][j+1] == 1)
neighbors[i][j] +=1;
}
if (world[i+1][j] == 1)
neighbors[i][j] +=1;
}
if (j > 0){
if (world[i][j-1] == 1)
neighbors[i][j] +=1;
}
if(j < N-1){
if (world[i][j+1] == 1)
neighbors[i][j] +=1;
}
}
}
}
void next_step(int **world, int **one_step, int **neighbors)
{
int i, j;
for (i = 0; i < N; i++){
for (j = 0; j < N; j++){
if (world[i][j] == 1)
{
if (neighbors[i][j] == 2 || neighbors[i][j] == 3)
one_step[i][j] = 1;
else
one_step[i][j] = 0;
}
else if (world[i][j] == 0)
{
if (neighbors[i][j] == 3)
one_step[i][j] = 1;
else
one_step[i][j] = 0;
}
}
}
}
void update(int **world, int **one_step)
{
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j+=2)
{
world[i][j] = one_step[i][j];
world[i][j+1] = one_step[i][j+1];
}
}
}
int compare(int **world, int **one_step, int **two_steps, int **old, int status)
{
int counter1=0, counter2=0, counter3=0;
for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
if(world[i][j] == one_step[i][j])
counter1++;
if(world[i][j] == 0)
counter2++;
if(old[i][j] == two_steps[i][j])
counter3++;
}
}
if (counter1 == (N*N))
status = 0;
else if(counter2 == (N*N))
status = 0;
else if(counter3 == (N*N))
status = 0;
return status;
}
When I compile the code and run it using 2, 4 and 8 threads I get the following:
gcc -o gol gol.c -lpthread
time ./gol 500 1 0 2
It took 1670 steps to reach the final state
real 0m10.064s
user 0m8.971s
sys 0m0.246s
time ./gol 500 1 0 4
It took 1670 steps to reach the final state
real 0m15.694s
user 0m9.976s
sys 0m0.437s
time ./gol 500 1 0 8
It took 1670 steps to reach the final state
real 0m14.600s
user 0m10.400s
sys 0m0.855s
Also the %CPU using top is ~65% when using 2 threads, ~78% when using 4 threads and ~100% when using 8 threads. What am I doing wrong?
You've got 2 problems:
a) Creating threads and waiting for them to terminate adds overhead. Doing it inside a "while(status == 1)" loop means that you're paying for that extra overhead repeatedly. It'd be better to create the threads once (outside the loop) then re-use the existing threads, using something (e.g. condition variable) to make the threads wait for the next iteration of the loop.
b) Mutexes exist to prevent (unwanted) parallelism, and also have overhead. If the threads acquire a mutex, then do their work, then release a mutex; then you're deliberately preventing all parallelism between these threads. For your code, parallelism between the main thread and the newly created threads is also prevented (main thread sits waiting in a pthread_join()).
Essentially; you've added lots of overhead (creating and destroying threads, acquiring and releasing mutexes) and prevented all parallelism to ensure that there's no benefits to outweigh the extra overhead; leading to code that is worse than not using threads at all.
To fix this you need to find ways to ensure that threads can do useful work in parallel. The easiest way to do that would probably be to use 2 global arrays to represent the state of the world, where one of these arrays is "previous world state" and the other is "next world state", and where you swap (pointers to) the arrays between steps. In this case, during a step, "previous world state" is only being read (and many threads can read in parallel without a problem) and each thread can update different parts of "next world state" in parallel. Note that because the threads would write to each cell in "next world state" you won't need to clear "next world state" between steps either.
WARNING: To ensure that updating one element of the array won't cause "lack of atomicity" problems with other/adjacent elements in the array; you will need to either use an atomic type (sig_atomic_t, C11 I think) with a 2D array, or use "1D array of pointers to 1D arrays" (where each row can only be modified by one thread) and the element/s are volatile. Note that if the world state is an 8 * 8 grid you can probably represent a whole row with a single uint8_t (meaning that it could become "1D array of pointers to volatile uint8_t).
Basically (if you include the re-use of threads) it can be done without using any mutexes for anything other than worker threads waiting for the main thread to start the next step, and the main thread waiting for worker threads to complete the current step.
Also; instead of waiting for worker threads, the main thread can also participate in doing useful work. For example, if the world state is an 8 * 8 grid, then "main thread + 7 worker threads" can do a row each (in parallel) to ensure that all 8 rows are done. However, when threads have the same priority it's rarely sane to have more threads than CPUs; so it can be a good idea to check how many CPUs the computer has and limit the number of threads (e.g. if there are 4 CPUs, then you might have "main thread + 3 more threads do 2 rows each").
void evolve(board prv, board nxt){
int i, j;
int n;
printf("\rGeneration %d\n", generation++);
if (printLazy == 1){
lazyPrint(prv);
for (j=0; j < WIDTH; ++j) {
for (i = 0; i < HEIGHT; ++i) {
n = neighbors(prv, i, j);
if (prv[i][j] && (n == 3 || n == 2))
nxt[i][j] = true;
else if (!prv[i][j] && (n == 3))
nxt[i][j] = true;
else
nxt[i][j] = false;
}
}
}
** Some asked me to add the neighbors method so
static int neighbors (board b, int i, int j) {
int n = 0;
int i_left = max(0,i-1);
int i_right = min(HEIGHT, i+2);
int j_left = max(0,j-1);
int j_right = min(WIDTH, j+2);
int ii, jj;
for (ii = i_left; ii < i_right; ++ii) {
for (jj = j_left; jj < j_right; ++jj) {
n += b[ii][jj];
}
}
return n - b[i][j];
}
So I am working on optimizing this so that it will go faster and I'm stuck on how to optimize this more. Here's what I have so far
void evolve(board prv, board nxt) {
register int i, j;
int n;
bool next;
printf("\rGeneration %d\n", generation++);
if (printLazy == 1){
lazyPrint(prv);
}
for (j=0; j < WIDTH; ++j) {
for (i = 0; i < HEIGHT; ++i) {
n = neighbors(prv, i, j);
if (prv[i][j])
if (n == 2)
next = true;
else if (n == 3)
next = true;
else
next = false;
else
if(n == 3)
next = true;
else
next = false;
nxt[i][j] = next;
}
}
}
Is there a better way to do this or are there any resources or videos y'all recommend?
Thanks, any help is appreciated.
Some ideas Inline your function neighbors(). Or turn it into a macro. Tidy up the conditional. To unroll the inner loop replace every use of i with the literal values so your code looks like :
for (j =0;.......
n = fun(prev, 0 ,j);
If.....
n = fun(prev, 1, j);
if......
and so on.
If the value of HEIGHT was let's say 100, then you get a code explosion of 100 function calls and 100 compound conditionals. Even worse if you unroll the outer loop.
If n was limited to say 8 neighbors, use a lookup table
bool foo[2][8] = { [1][2] = true, [1][3] = true, [0][3] = true };
for (j=0; j < WIDTH; ++j) {
for (i = 0; i < HEIGHT; ++i) {
n = neighbors(prv, i, j);
nxt[i][j] = foo[prv[i][j]][n];
}
}
A common weakness is the neighbors(prv, i, j) function itself. One trick to to oversize the 2D array by 1 on all four sides and populate the edge with false so neighbors() can always check 8 neighbors as it is never used on the edge/corners.
Making sure the 2nd dimension is a power of 2 helps also - simplifies index calculation. So if the original array way 12*11, make the new array (1+12+1)*(1+11+1+4) or 14*16.
I was playing with Knight Tour algorithm implementation in Java. All that time I was completely sure that implementation on C must be faster. So after reading GNU C Reference the code be done and logic was implemented the same way is on Java.
Can you imagine my wonder when the C variant took more time to process a 6x6 board.
So my question is how the code below can be optimized from technical perspective (i.e. without heuristic optimizations).
Some performance notes: on my i5 laptop with Ubuntu the provided implementation took more than 4 hours to solve 6x6 board. Program on Java can solve this task in about 3 hours 18 mins with single threaded approach.
Some algorithm notes: this implementation finds all possible tours from all cells on the board, not just closed tours. As well heuristic optimization isn't used as it helps to find faster first tour not all.
EDIT: code compiled without any optimization with this command: gcc knight_tour.c -o knight-tour
#include "stdio.h"
#define BOARD_SIZE 5
#define MAX_MOVE_COUNT BOARD_SIZE*BOARD_SIZE
void printBoard(int[][BOARD_SIZE], int);
void clearBoard(int[][BOARD_SIZE], int);
int knight_move(int[][BOARD_SIZE], int, int, int);
int is_valid_position(int, int);
void calc_all_knight_jumps();
static int ALL_KNIGHT_COL_JUMPS[BOARD_SIZE][BOARD_SIZE][9];
static int ALL_KNIGHT_ROW_JUMPS[BOARD_SIZE][BOARD_SIZE][8];
int main() {
int board[BOARD_SIZE][BOARD_SIZE];
clearBoard(board, BOARD_SIZE);
calc_all_knight_jumps();
int result[BOARD_SIZE][BOARD_SIZE];
for (int i = 0; i < BOARD_SIZE; i++) {
for (int j = 0; j < BOARD_SIZE; j++) {
result[i][j] = knight_move(board, i, j, 1);
}
}
printBoard(result, BOARD_SIZE);
return 0;
}
int knight_move(int board[][BOARD_SIZE], int cpos, int rpos, int level) {
if (level == MAX_MOVE_COUNT)
return 1;
board[cpos][rpos] = level;
int solved_count = 0;
int jump_count = ALL_KNIGHT_COL_JUMPS[cpos][rpos][8];
for (int i = 0; i < jump_count; i++) {
int next_cpos = ALL_KNIGHT_COL_JUMPS[cpos][rpos][i];
int next_rpos = ALL_KNIGHT_ROW_JUMPS[cpos][rpos][i];
if (board[next_cpos][next_rpos] == 0) {
solved_count += knight_move(board, next_cpos, next_rpos, level + 1);
}
}
board[cpos][rpos] = 0;
return solved_count;
}
void clearBoard(int board[][BOARD_SIZE], int size) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
board[i][j] = 0;
}
}
}
void printBoard(int board[][BOARD_SIZE], int size) {
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
printf("%8d", board[i][j]);
}
printf("\n");
}
}
int is_valid_position(int cpos, int rpos) {
if (cpos < 0 || cpos >= BOARD_SIZE) return 0;
if (rpos < 0 || rpos >= BOARD_SIZE) return 0;
return 1;
}
void calc_all_knight_jumps() {
int col_jumps[] = { 1, 2, 2, 1, -1, -2, -2, -1};
int row_jumps[] = { 2, 1, -1, -2, -2, -1, 1, 2};
int next_cpos, next_rpos;
for (int i = 0; i < BOARD_SIZE; i++) {
for (int j = 0; j < BOARD_SIZE; j++) {
int jump_count = 0;
for (int k = 0; k < 8; k++) {
next_cpos = i + col_jumps[k];
next_rpos = j + row_jumps[k];
if (is_valid_position(next_cpos, next_rpos) == 1) {
ALL_KNIGHT_COL_JUMPS[i][j][jump_count] = next_cpos;
ALL_KNIGHT_ROW_JUMPS[i][j][jump_count] = next_rpos;
jump_count++;
}
}
ALL_KNIGHT_COL_JUMPS[i][j][8] = jump_count;
}
}
}
Taking into account all the comments I modified the source code a bit.
tried out -O2 and -O3 optimization options with gcc compiler;
reduced number of top level invocation on knight_move() method. so now only unique results are calculated and then reflected horizontally, vertically and diagonally;
added code to measure performance without printf() usage;
checked that both C and Java variants are identical as much as possible;
And finally there are results I expected - C code is faster (but with optimization options)
C code with the -O2 option: duration - 1348 sec (22:28)
Java code: duration - 1995 sec (33:15)
C code without optimization: duration - 3518 sec (58:38)
C code with the -O3 option: duration - 2143 sec (35:43)
Here are both implementations in case someone be interested in knight tour algorithm on C and Java:-)
GNU C
#include "stdio.h"
#include "time.h"
#define BOARD_SIZE 6
#define MAX_MOVE_COUNT BOARD_SIZE*BOARD_SIZE
int knight_move(int[][BOARD_SIZE], int, int, int);
void pre_calc_all_knight_jumps();
void print_result(int[][BOARD_SIZE]);
static int ALL_KNIGHT_COL_JUMPS[BOARD_SIZE][BOARD_SIZE][9];
static int ALL_KNIGHT_ROW_JUMPS[BOARD_SIZE][BOARD_SIZE][8];
int main() {
// init board
int board[BOARD_SIZE][BOARD_SIZE];
for (int i = 0; i < BOARD_SIZE; i++) {
for (int j = 0; j < BOARD_SIZE; j++) {
board[i][j] = 0;
}
}
pre_calc_all_knight_jumps();
int result[BOARD_SIZE][BOARD_SIZE];
struct timespec s_time, e_time;
clock_gettime(CLOCK_MONOTONIC, &s_time);
int border = BOARD_SIZE - 1;
int center = BOARD_SIZE / 2.0 + 0.5;
for (int i = 0; i < center; i++) {
for (int j = i; j < center; j++) {
int res = knight_move(board, i, j, 1);
result[i][j] = res;
result[border - i][j] = res;
result[i][border - j] = res;
result[border - i][border - j] = res;
if (i != j) result[j][i] = res;
}
}
clock_gettime(CLOCK_MONOTONIC, &e_time);
printf("Duration in seconds: %ld\n", e_time.tv_sec - s_time.tv_sec);
print_result(result);
return 0;
}
int knight_move(int board[][BOARD_SIZE], int cpos, int rpos, int level) {
if (level == MAX_MOVE_COUNT) return 1;
board[cpos][rpos] = level;
int solved_count = 0;
int valid_move_count = ALL_KNIGHT_COL_JUMPS[cpos][rpos][8];
for (int i = 0; i < valid_move_count; i++) {
int next_cpos = ALL_KNIGHT_COL_JUMPS[cpos][rpos][i];
int next_rpos = ALL_KNIGHT_ROW_JUMPS[cpos][rpos][i];
if (board[next_cpos][next_rpos] == 0) {
solved_count += knight_move(board, next_cpos, next_rpos, level + 1);
}
}
board[cpos][rpos] = 0;
return solved_count;
}
void print_result(int board[][BOARD_SIZE]) {
for (int i = 0; i < BOARD_SIZE; i++) {
for (int j = 0; j < BOARD_SIZE; j++) {
printf("%8d", board[i][j]);
}
printf("\n");
}
}
void pre_calc_all_knight_jumps() {
int col_jumps[] = { 1, 2, 2, 1, -1, -2, -2, -1};
int row_jumps[] = { 2, 1, -1, -2, -2, -1, 1, 2};
int next_cpos, next_rpos;
for (int i = 0; i < BOARD_SIZE; i++) {
for (int j = 0; j < BOARD_SIZE; j++) {
int jump_count = 0;
for (int k = 0; k < 8; k++) {
next_cpos = i + col_jumps[k];
next_rpos = j + row_jumps[k];
if (next_cpos < 0 || next_cpos >= BOARD_SIZE) continue;
if (next_rpos < 0 || next_rpos >= BOARD_SIZE) continue;
ALL_KNIGHT_COL_JUMPS[i][j][jump_count] = next_cpos;
ALL_KNIGHT_ROW_JUMPS[i][j][jump_count] = next_rpos;
jump_count++;
}
ALL_KNIGHT_COL_JUMPS[i][j][8] = jump_count;
}
}
}
Java
import java.util.Arrays;
public class KnightTour1 {
private final static int BOARD_SIZE = 6;
private final static int MAX_MOVE_COUNT = BOARD_SIZE * BOARD_SIZE;
private static final int[][][] ALL_KNIGHT_COL_MOVES;
private static final int[][][] ALL_KNIGHT_ROW_MOVES;
static {
final int[] knightColJumps = { 1, 2, 2, 1, -1, -2, -2, -1};
final int[] knightRowJumps = { 2, 1, -1, -2, -2, -1, 1, 2};
ALL_KNIGHT_COL_MOVES = new int[BOARD_SIZE][BOARD_SIZE][];
ALL_KNIGHT_ROW_MOVES = new int[BOARD_SIZE][BOARD_SIZE][];
int[] tmpColMoves = new int[8];
int[] tmpRowMoves = new int[8];
for (int c = 0; c < BOARD_SIZE; c++) {
for (int r = 0; r < BOARD_SIZE; r++) {
int jumpCount = 0;
for (int i = 0; i < 8; i++) {
int nextColPos = c + knightColJumps[i];
int nextRowPos = r + knightRowJumps[i];
if (isValidBoardPos(nextColPos, nextRowPos)) {
tmpColMoves[jumpCount] = nextColPos;
tmpRowMoves[jumpCount] = nextRowPos;
jumpCount++;
}
}
ALL_KNIGHT_COL_MOVES[c][r] = Arrays.copyOf(tmpColMoves, jumpCount);
ALL_KNIGHT_ROW_MOVES[c][r] = Arrays.copyOf(tmpRowMoves, jumpCount);
}
}
}
private static boolean isValidBoardPos(int colPos, int rowPos) {
return colPos > -1 && colPos < BOARD_SIZE && rowPos > -1 && rowPos < BOARD_SIZE;
}
public static void main(String[] args) {
long sTime = System.currentTimeMillis();
int[][] result = findNumberOfTours();
long duration = (System.currentTimeMillis() - sTime) / 1000;
System.out.println("Duration in seconds: " + duration);
printResult(result);
}
private static int knightMove(int[][] board, int colPos, int rowPos, int level) {
if (level == MAX_MOVE_COUNT) return 1;
board[colPos][rowPos] = level;
final int[] validColMoves = ALL_KNIGHT_COL_MOVES[colPos][rowPos];
final int[] validRowMoves = ALL_KNIGHT_ROW_MOVES[colPos][rowPos];
final int validMoveCount = validColMoves.length;
int solvedTourCount = 0;
for (int i = 0; i < validMoveCount; i++) {
final int nextColPos = validColMoves[i];
final int nextRowPos = validRowMoves[i];
if (board[nextColPos][nextRowPos] == 0) {
solvedTourCount += knightMove(board, nextColPos, nextRowPos, level + 1);
}
}
board[colPos][rowPos] = 0;
return solvedTourCount;
}
private static int[][] findNumberOfTours() {
final int[][] result = new int[BOARD_SIZE][BOARD_SIZE];
final int[][] board = new int[BOARD_SIZE][BOARD_SIZE];
final int border = BOARD_SIZE - 1;
final int center = (int)(BOARD_SIZE / 2f + 0.5);
for (int i = 0; i < center; i++) {
for (int j = i; j < center; j++) {
int res = knightMove(board, i, j, 1);
result[i][j] = res;
result[border - i][j] = res;
result[i][border - j] = res;
result[border - i][border - j] = res;
if (i != j) result[j][i] = res;
}
}
return result;
}
private static void printResult(int[][] res) {
for (int i = 0; i < BOARD_SIZE; i++) {
for (int j = 0; j < BOARD_SIZE; j++) {
System.out.print(String.format("%8d", res[i][j]));
}
System.out.println();
}
}
}
Here are some suggestions about your code as well as the updated version posted in your answer:
use <> for standard header files:
#include <stdio.h>
surround expressions with parentheses in macro definitions:
#define MAX_MOVE_COUNT (BOARD_SIZE * BOARD_SIZE)
use (void) when declaring functions without arguments:
void pre_calc_all_knight_jumps(void);
avoid mixing floating point and integer calculations with implicit conversions. Use this instead:
int center = (BOARD_SIZE + 1) / 2;
Some of the symmetries are not properly reflected in the result array. You should change the main loop to:
int border = BOARD_SIZE - 1;
int center = (BOARD_SIZE + 1) / 2;
for (int i = 0; i < center; i++) {
for (int j = i; j < center; j++) {
int res = knight_move(board, i, j, 1);
result[i][j] = res;
result[j][i] = res;
result[border - i][j] = res;
result[j][border - i] = res;
result[i][border - j] = res;
result[border - j][i] = res;
result[border - i][border - j] = res;
result[border - j][border - i] = res;
}
}
I also get some cache usage improvements by making the board 8x8 and using a additional argument for the playing area size.
A more efficient algorithm is definitely needed to solve this problem for larger sizes.
I can post my code if necessary, but my question is primarily conceptual. I am implementing Gaussian elimination with threading. I have p pthreads operating on an nxn matrix in column major order. Before any p-thread can start operating on a column, row operations must be done to move the row with the largest value in that column up to the diagonal. So I need every thread to wait and then operate in unison. Currently, at each column, Each thread checks its id, the one with id=0 will perform the row operations. My problem is how to get all the threads but id=0 to wait and then operate in Unison.
I've tried using mutex locks and conditionals. These don't seem to work because they give all access rights to a single thread. From what I understand, one can only block a thread in this manner by having it request a lock where one already exists, so it must wait. This would be a problem in my case because I don't want any of the non-0 threads to have a lock, once they are unlocked I want them to operate freely until they finish their work on column.
I tried to avoid Mutex locks by simply having a global "colReady" variable set to 0. The non-zero threads while loop until colReady = True. Logically this makes perfect sense, but it has not worked programatically.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <pthread.h>
#define n 20
#define numThr 3
double **matrix;
int pivotReady[n] = { 0 };
pthread_cond_t cond_pivot_ready;
pthread_mutex_t cond_mutex;
int swapRows (int row1, int row2)
{
double *tempRow = matrix[row1];
matrix[row1] = matrix[row2];
matrix[row2] = tempRow;
return 0;
}
void randinit ()
{
int i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n + 1; j++) {
matrix[i][j] = drand48 ();
}
}
}
void rowReduce (void *arg);
void printMatrix ()
{
int i, j;
for (i = 0; i < n; i++) {
for (j = 0; j < n + 1; j++) {
printf (" %4.2f ", matrix[i][j]);
}
printf ("\n");
}
}
int main ()
{
pthread_cond_init (&cond_pivot_ready, NULL);
pthread_mutex_init (&cond_mutex, NULL);
int i, j;
double temp;
pthread_t p_threads[numThr];
pthread_attr_t attr;
pthread_attr_init (&attr);
//create matrix
matrix = malloc (sizeof (double *) * n);
for (i = 0; i < n; i++) {
*(matrix + i) = malloc (sizeof (double) * (n + 1));
}
randinit ();
for (i = 0; i < numThr; i++) {
pthread_create (&p_threads[i], &attr, rowReduce, (void *) ((long) i));
}
for (i = 0; i < numThr; i++) {
pthread_join (p_threads[i], NULL);
}
printf ("Final Matrix:\n");
printMatrix ();
return 0;
}
void rowReduce (void *arg)
{
int id = (int) arg;
int i, pivot, row;
double ratio, temp, max;
int maxRow;
for (pivot = 0; pivot < n - 1; pivot++) {
//PIVOT THREAD
if (id == 0) {
pthread_mutex_lock (&cond_mutex);
max = matrix[pivot][pivot]
maxRow = pivot;
for (i = pivot + 1; i < n; i++) {
temp = matrix[i][pivot];
if (temp > max) {
max = temp;
maxRow = i;
}
}
swapRows (pivot, maxRow);
pivotReady[pivot] = 1;
pthread_cond_signal (&cond_pivot_ready);
for (row = pivot + 1 + id; row < n; row += numThr) {
ratio = matrix[row][pivot] / matrix[pivot][pivot];
printf ("t1: row = %d, piv = %d, ratio = %f\n", row, pivot,
ratio);
for (int i = pivot; i < n + 1; i++) {
matrix[row][i] -= ratio * matrix[pivot][i];
}
}
pthread_mutex_unlock (&cond_mutex);
}
//NON-PIVOT THREAD
else {
pthread_mutex_lock (&cond_mutex);
while (!(pivotReady[pivot])) {
pthread_cond_wait (&cond_pivot_ready, &cond_mutex);
}
for (row = pivot + 1 + id; row < n; row += numThr) {
ratio = matrix[row][pivot] / matrix[pivot][pivot];
for (int i = pivot; i < n + 1; i++) {
matrix[row][i] -= ratio * matrix[pivot][i];
}
}
pthread_mutex_unlock (&cond_mutex);
}
}
//printf("rowReduce called with id = %d\n", id);
pthread_exit (0);
}
This program SHOULD print a random matrix that has been put in upper triangular form.
You only need to hold the cond_mutex while you are accessing pivotReady[pivot], because that's the only shared state it protects.
You also need to use pthread_cond_broadcast() rather than pthread_cond_signal(), because you need all the waiting threads to proceed once the pivot is ready.
After a minor refactoring so that the row processing code isn't repeated, it looks like:
for (pivot = 0; pivot < n - 1; pivot++) {
//PIVOT THREAD
if (id == 0) {
max = matrix[pivot][pivot];
maxRow = pivot;
for (i = pivot + 1; i < n; i++) {
temp = matrix[i][pivot];
if (temp > max) {
max = temp;
maxRow = i;
}
}
swapRows (pivot, maxRow);
pthread_mutex_lock (&cond_mutex);
pivotReady[pivot] = 1;
pthread_cond_broadcast (&cond_pivot_ready);
pthread_mutex_unlock (&cond_mutex);
}
//NON-PIVOT THREAD
else {
pthread_mutex_lock (&cond_mutex);
while (!(pivotReady[pivot])) {
pthread_cond_wait (&cond_pivot_ready, &cond_mutex);
}
pthread_mutex_unlock (&cond_mutex);
}
for (row = pivot + 1 + id; row < n; row += numThr) {
ratio = matrix[row][pivot] / matrix[pivot][pivot];
for (int i = pivot; i < n + 1; i++) {
matrix[row][i] -= ratio * matrix[pivot][i];
}
}
}