How to fix my Pthreads code about Mandelbrot set? - c

I have the following Pthreads code about calculating and creating a picture of the Mandelbrot set. My code in C works just fine and it prints the resulting picture nicely. The point is that using the below code, I am able to compile the code and execute it. Afterwards, if I try to view the resulting .ppm file in Gimp, it simply cannot open it. I guess I'm doing something wrong in my code. If someone could help me I would be glad.
// mandpthread.c
// to compile: gcc mandpthread.c -o mandpthread -lm -lrt -lpthread
// usage: ./mandpthread <no_of_iterations> <no_of_threads> > output.ppm
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <assert.h>
#include <pthread.h>
typedef struct {
int r, g, b;
} rgb;
int NITERATIONS, NTHREADS;
rgb **m;
void color(rgb **m, int x, int y, int red, int green, int blue)
{
m[y][x].r = red;
m[y][x].g = green;
m[y][x].b = blue;
}
void mandelbrot(int tid)
{
int w = 600, h = 400, x, y;
// each iteration, it calculates: newz = oldz*oldz + p,
// where p is the current pixel, and oldz stars at the origin
double pr, pi; // real and imaginary part of the pixel p
double newRe, newIm, oldRe, oldIm; // real and imaginary parts of new and old z
double zoom = 1, moveX = -0.5, moveY = 0; // you can change these to zoom and change position
int start = tid * NITERATIONS/NTHREADS;
int end = (tid+1) * (NITERATIONS/NTHREADS) - 1;
//loop through every pixel
for(y = 0; y < h; y++) {
for(x = 0; x < w; x++) {
// calculate the initial real and imaginary part of z,
// based on the pixel location and zoom and position values
pr = 1.5 * (x - w / 2) / (0.5 * zoom * w) + moveX;
pi = (y - h / 2) / (0.5 * zoom * h) + moveY;
newRe = newIm = oldRe = oldIm = 0; //these should start at 0,0
// i will represent the number of iterations
int i;
// start the iteration process
for(i = start; i <= end; i++) {
// remember value of previous iteration
oldRe = newRe;
oldIm = newIm;
// the actual iteration, the real and imaginary part are calculated
newRe = oldRe * oldRe - oldIm * oldIm + pr;
newIm = 2 * oldRe * oldIm + pi;
// if the point is outside the circle with radius 2: stop
if((newRe * newRe + newIm * newIm) > 4) break;
}
if(i == NITERATIONS)
color(m, x, y, 0, 0, 0); // black
else
{
// normalized iteration count method for proper coloring
double z = sqrt(newRe * newRe + newIm * newIm);
int brightness = 256. * log2(1.75 + i - log2(log2(z))) / log2((double)NITERATIONS);
color(m, x, y, brightness, brightness, 255);
}
}
}
}
// worker function which will be passed to pthread_create function
void *worker(void *arg)
{
int tid = (int)arg;
mandelbrot(tid);
}
int main(int argc, char *argv[])
{
pthread_t* threads;
int i, j, rc;
if(argc != 3)
{
printf("Usage: %s <no_of_iterations> <no_of_threads> > output.ppm\n", argv[0]);
exit(1);
}
NITERATIONS = atoi(argv[1]);
NTHREADS = atoi(argv[2]);
threads = (pthread_t*)malloc(NTHREADS * sizeof(pthread_t));
m = malloc(400 * sizeof(rgb *));
for(i = 0; i < 400; i++)
m[i] = malloc(600 * sizeof(rgb));
// declaring the needed variables for calculating the running time
struct timespec begin, end;
double time_spent;
// starting the run time
clock_gettime(CLOCK_MONOTONIC, &begin);
printf("P6\n# AUTHOR: ET\n");
printf("%d %d\n255\n",600,400);
for(i = 0; i < NTHREADS; i++) {
rc = pthread_create(&threads[i], NULL, worker, (void *)i);
assert(rc == 0); // checking whether thread creating was successfull
}
for(i = 0; i < NTHREADS; i++) {
rc = pthread_join(threads[i], NULL);
assert(rc == 0); // checking whether thread join was successfull
}
// printing to file
for(i = 0; i < 400; i++) {
for(j = 0; j < 600; j++) {
fputc((char)m[i][j].r, stdout);
fputc((char)m[i][j].g, stdout);
fputc((char)m[i][j].b, stdout);
}
}
// ending the run time
clock_gettime(CLOCK_MONOTONIC, &end);
// calculating time spent during the calculation and printing it
time_spent = end.tv_sec - begin.tv_sec;
time_spent += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
fprintf(stderr, "Elapsed time: %.2lf seconds.\n", time_spent);
for(i = 0; i < 400; i++)
free(m[i]);
free(m);
free(threads);
return 0;
}

The newest version of your code works for me with 100 iterations and 1 thread.
Doing two threads fails, because the ppm file has 2 headers one from each thread.
If I delete one of the headers, the image loads but the colours are off and there's a glitch in the image.

Related

parallelizing Mandelbrot using MPI

I am trying to parallelize the Mandelbrot.
the correct output should be around 1.510659. however I am not getting that correctly.
** PROGRAM: Mandelbrot area
**
** PURPOSE: Program to compute the area of a Mandelbrot set.
** The correct answer should be around 1.510659.
**
** USAGE: Program runs without input ... just run the executable
**
reduction for numoutside.
this is my parallelized code
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>
#define NPOINTS 1000
#define MAXITER 1000
int P = 1;
struct d_complex
{
double r;
double i;
};
int testpoint(struct d_complex);
struct d_complex c;
struct d_complex cPart;
int numoutside = 1;
int main()
{
int i, j, row;
int res;
double area, error, eps = 1.0e-5;
int myrank, mysize;
double stsec, ensec, commtime, maxcommtime;
MPI_Status status;
MPI_Init(NULL, NULL);
MPI_Comm_size(MPI_COMM_WORLD, &mysize);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
stsec = MPI_Wtime();
// Loop over grid of points in the complex plane which contains the Mandelbrot set,
// testing each point to see whether it is inside or outside the set.
/*for (i = 0; i < NPOINTS; i ++)
{
for (j = 0; j < NPOINTS ; j++)
{
c.r = -2.0 + 2.5 * (double)(i) / (double)(NPOINTS) + eps;
c.i = 1.125 * (double)(j) / (double)(NPOINTS) + eps;
testpoint(c);
}
}*/
if (myrank == 0)
{
/* Begin User Program - the master */
//*
int outsum, nb_pixel = NPOINTS*NPOINTS ;
for (i = 0; i < nb_pixel; i++)
{
MPI_Recv(&res, 1, MPI_INT, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &status);
// printf("Slave id %d has send : %d \n", status.MPI_SOURCE, data[2]);
// printf("%d: [%d,%d] -> [%d,%d] = %d\n", status.MPI_SOURCE, data[0], data[1], data[0] + MAXX, data[1] + MAXY, data[2]);
res += numoutside;
}
area = 2.0 * 2.5 * 1.125 * (double)(NPOINTS * NPOINTS - res) / (double)(NPOINTS * NPOINTS);
error = area / (double)NPOINTS;
printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n", area, error);
printf("Finish.\n");
}
else
{
for (i = myrank; i < NPOINTS; i+=mysize)
{
for (j = 0; j < NPOINTS; j++)
{
c.r = -2.0 + 2.5 * (double)(i) / (double)(NPOINTS) + eps;
c.i = 1.125 * (double)(j) / (double)(NPOINTS) + eps;
res=testpoint(c);
MPI_Send(&res, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
}
// Calculate area of set and error estimate and output the results
MPI_Finalize();
ensec = MPI_Wtime();
commtime = ensec - stsec;
// area = 2.0 * 2.5 * 1.125 * (double)(NPOINTS * NPOINTS - numoutside) / (double)(NPOINTS * NPOINTS);
// error = area / (double)NPOINTS;
printf("Area of Mandlebrot set = %12.8f +/- %12.8f\n", area, error);
if (myrank == 0)
{
printf("%.3f\n", commtime);
}
}
int testpoint(struct d_complex c)
{
// Does the iteration z=z*z+c, until |z| > 2 when point is known to be outside set
// If loop count reaches MAXITER, point is considered to be inside the set
struct d_complex z;
int iter;
double temp;
z = c;
for (iter = 0; iter < MAXITER; iter++)
{
temp = (z.r * z.r) - (z.i * z.i) + c.r;
z.i = z.r * z.i * 2 + c.i;
z.r = temp;
if ((z.r * z.r + z.i * z.i) > 4.0)
{
// MPI_Send( &numoutside, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
return numoutside;
break;
}
}
return 0;
}
the expectation is to get around 1.510659 when running the code with NPOINTS: 1000,2000 and 2,4, processors .

Ray tracer not giving different light intensities based on direction

Goal: I am trying to create a ray tracer in C. I just added in a light source that should give each of my three spheres a shading effect based on where the light is. If the light is to the left of all of them, a shadow should be cased on the right.
Problem: When changing the light intensities and position of the light, all the spheres are changed uniformly. The spheres will be more or less lit equally and there is no variation of lighting on individual pixels on the sphere.
My debugging attempts: I have tried looking through the variable outputs by printing out a lot of different info and I think the source comes from my variable
diffuse_light_intensity
which does not change much (through all the iterations on the screen the value changes twice when it should be changing quite often due to the angles of the light on the surface changing quite a bit)
My Code: (my theory is the problem lies in scene_intersect() or cast_ray())
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <limits.h>
typedef struct {
float position[3];
float intensity;
} Light;
typedef struct {
float diffuse_color[3];
} Material;
typedef struct {
float center[3];
float radius;
Material material;
} Sphere;
int arrSub(const float arr1[], const float arr2[], float subArr[], int length) {
/*
Requires 3 equally sized arrays (denoted as length),
arr1 - arr2 will result in the third array subArr
*/
for (int i = 0; i < length; i++) {
subArr[i] = arr1[i] - arr2[i];
}
return 0;
}
int arrAdd(const float arr1[], const float arr2[], float addArr[], int length) {
/*
Requires 3 equally sized arrays (denoted as length),
arr1 + arr2 will result in the third array subArr
*/
for (int i = 0; i < length; i++) {
addArr[i] = arr1[i] + arr2[i];
}
return 0;
}
int arrScalarMult(const float arr1[], float scalar, float newArr[], int length) {
/*
Requires 3 equally sized arrays (denoted as length),
arr1 - arr2 will result in the third array subArr
*/
for (int i = 0; i < length; i++) {
newArr[i] = arr1[i] * scalar;
}
return 0;
}
float dotProduct(const float arr1[], const float arr2[], int length) {
/*
Returns the dot product of two equal sized arrays
(treated as vectors)
a (dot) b = a1b1 + a2b2 + ... anbn
*/
float result = 0;
for (int i = 0; i < length; i++) {
result += arr1[i] * arr2[i];
}
return result;
}
int normalize(float arr[], int len) {
//Normalize a vector (array)
float sumSqr;
float norm;
for (int i = 0; i < len; i++) {
sumSqr += arr[i] * arr[i];
}
norm = sqrt(sumSqr);
for (int i = 0; i < len; i++) {
arr[i] = arr[i] / norm;
}
return 0;
}
bool ray_intersect(const float origin[], const float dir[], float t0, Sphere s) {
/*
Ray-Sphere Intersection
Vectors:
origin (the zero vector)
dir (direction vector)
L (vector from origin to center of sphere)
Scalars:
tca
d2
thc
t0
t1
*/
float L[3] = {0,0,0}; //The zero vector
arrSub(s.center, origin, L, 3); //L is now the vector from origin to the sphere's center
float tca = dotProduct(L, dir, 3); //Projection of L onto dir
float d2 = dotProduct(L, L, 3) - tca*tca;
if (d2 > s.radius * s.radius) return false; //There is no intersection, so return false.
float thc = sqrtf((s.radius*s.radius - d2));
t0 = tca - thc;
float t1 = tca + thc;
if (t0 < 0) {
t0 = t1;
}
if (t0 < 0) return false;
return true;
}
bool scene_intersect(const float origin[], const float dir[], const Sphere s[], int len, float hit[], float N[], Material * ptr_m) {
float sphere_dist = INT_MAX;
for (size_t i=0; i < len; i++) {
float dist_i;
if (ray_intersect(origin, dir, dist_i, s[i]) && dist_i < sphere_dist) {
sphere_dist = dist_i;
float dirDist[3];
arrScalarMult(dir, dist_i, dirDist, 3);
arrAdd(origin, dirDist, hit, 3);
float hitMinusCenter[3];
arrSub(hit, s[i].center, hitMinusCenter, 3);
normalize(hitMinusCenter, 3);
N[0] = hitMinusCenter[0];
N[1] = hitMinusCenter[1];
N[2] = hitMinusCenter[2];
* ptr_m = s[i].material;
}
}
return sphere_dist<1000;
}
int cast_ray(const float origin[], const float dir[], const Sphere s[], const Light l[], int l_size, unsigned char colorArr[]) {
float point[3], N[3];
Material m;
Material * ptr_m = &m;
if (!scene_intersect(origin, dir, s, 3, point, N, ptr_m)) {
//background
colorArr[0] = 5; //red
colorArr[1] = 100; //green
colorArr[2] = 250; //blue
} else {
float diffuse_light_intensity = 0;
float light_dir[3];
for (size_t i = 0; i < l_size; i++) {
arrSub(l[i].position, point, light_dir, 3);
normalize(light_dir, 3);
diffuse_light_intensity += l[i].intensity * ((0.f >= dotProduct(light_dir, N, 3) ? (0.f) : (dotProduct(light_dir, N, 3))));
}
//light up pixel
colorArr[0] = m.diffuse_color[0] * diffuse_light_intensity;
colorArr[1] = m.diffuse_color[1] * diffuse_light_intensity;
colorArr[2] = m.diffuse_color[2] * diffuse_light_intensity;
}
return 0;
}
int render(const Sphere s[], const Light l[], int l_length) {
/*
Creates image in a new color each step.
*/
const int width = 1024;
const int height = 768;
FILE *fp = fopen("fourth.ppm", "wb"); // Write in binary mode
(void) fprintf(fp, "P6\n%d %d\n255\n", width, height);
float fov = 3.1415926535/2.; // Field of View
#pragma omp parallel for
for (size_t j = 0; j < height; j++) {
for (size_t i = 0; i < width; i++) {
float x = (2*(i+.5)/(float)width - 1)*tan(fov/2.)*width/(float)height;
float y = -(2*(j+.5)/(float)height - 1)*tan(fov/2.);
float dir[] = {x,y,-1};
normalize(dir, 3);
unsigned char color[3];
const float origin[] = {0,0,0};
cast_ray(origin, dir, s, l, l_length, color);
(void) fwrite(color, 1, 3, fp);
}
}
(void) fclose(fp);
return 0;
}
int main(void) {
Material red = {255,0,0};
Material pink = {150,10,150};
Material gold = {255, 195, 0};
//Populate with spheres
Sphere s[3];
Sphere originalS = {{-3,0,-16},2,gold};
Sphere bigS = {{-1.0, -1.5, -12}, 3, red};
Sphere anotherS = {{7,5,-18},2,pink};
s[0] = originalS;
s[1] = bigS;
s[2] = anotherS;
//Add light source
Light l[1];
Light test_light = {{-20,20,20}, 1.5};
l[0] = test_light;
render(s,l, 1);
printf("Run success!\n");
return 0;
}
If any clarification is needed on my code please let me know, I am quite new to both C and stackoverflow.
There's a fundamental error in ray_intersect where you're passing the t0 variable by value, and not as a pointer, and therefore in the scene_intersect function its value is always zero.
The other problem is that you don't initialize the sumSqr in the normalize function, resulting in that function returning NaN for each vector component.
With those two fixed I get something approximating shaded balls. The errors in that image are caused by failing to ensure that your output pixel values fall in the range [0, 255].
NB: both of these first errors are detected if you turn on full compiler error checking, warning you of uninitialised variables being used.

MPI_SEND and MPI_RECIEVE have no Reference by Compile

I am trying to compile an MPI program with mpicc. The compiler complains only that there is no reference to MPI_RECIVE and MPI_SEND, and ends the compile error. I have #include in the .c file.
Can someone tell me how I can fix this?
Here ist the Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <mpi.h>
#include "random.h"
#include "md5tool.h"
/* horizontal size of the configuration */
#define XSIZE 1024
/* "ADT" State and line of states (plus border) */
typedef char State;
typedef State Line[XSIZE + 2];
/* determine random integer between 0 and n-1 */
#define randInt(n) ((int)(nextRandomLEcuyer() * n))
/* random starting configuration */
static void initConfig(Line *buf, int lines){
int x, y;
initRandomLEcuyer(424243);
for (y = 1; y <= lines; y++) {
for (x = 1; x <= XSIZE; x++) {
buf[y][x] = randInt(100) >= 50;
}
}
}
/* annealing rule from ChoDro96 page 34
* the table is used to map the number of nonzero
* states in the neighborhood to the new state
*/
static State anneal[10] = {0, 0, 0, 0, 1, 0, 1, 1, 1, 1};
/* a: pointer to array; x,y: coordinates; result: n-th element of anneal,
where n is the number of neighbors */
#define transition(a, x, y) \
(anneal[(a)[(y)-1][(x)-1] + (a)[(y)][(x)-1] + (a)[(y)+1][(x)-1] +\
(a)[(y)-1][(x) ] + (a)[(y)][(x) ] + (a)[(y)+1][(x) ] +\
(a)[(y)-1][(x)+1] + (a)[(y)][(x)+1] + (a)[(y)+1][(x)+1]])
/* treat torus like boundary conditions */
static void boundary(Line *buf, int lines){
int x,y;
for (y = 0; y <= lines+1; y++) {
/* copy rightmost column to the buffer column 0 */
buf[y][0 ] = buf[y][XSIZE];
/* copy leftmost column to the buffer column XSIZE + 1 */
buf[y][1+1] = buf[y][1 ];
}
for (x = 0; x <= XSIZE+1; x++) {
/* copy bottommost row to buffer row 0 */
buf[0][x ] = buf[lines][x];
/* copy topmost row to buffer row lines + 1 */
buf[lines+1][x] = buf[1][x ];
}
}
/* make one simulation iteration with lines lines.
* old configuration is in from, new one is written to to.
*/
//umschreiben
/**
static void simulate(Line *from, Line *to, int lines){
boundary(from, lines);
for (y = 1; y <= lines; y++) {
for (x = 1; x <= XSIZE; x++) {
to[y][x ] = transition(from, x , y);
}
}
}
*/
/* --------------------- measurement ---------------------------------- */
int main(int argc, char** argv){
int lines, its;
int i;
Line *from, *to, *temp, *next;
char* hash;
assert(argc == 3);
lines = atoi(argv[1]);
its = atoi(argv[2]);
from = malloc((lines + 2) * sizeof(Line));
to = malloc((lines + 2) * sizeof(Line));
MPI_Init(NULL, NULL);
// Get the number of processes
int world_size;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Get the rank of the process
int world_rank;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
if(world_rank == 0){
int y;
next = malloc((lines + 2) * sizeof(Line));
initConfig(from, lines);
for (i = 0; i < its; i++) {
boundary(from, lines);
int z;
for(z = 0; z < world_size;z++){
if(z !=world_rank ){
MPI_SEND(from,(lines + 2) * sizeof(Line),MPI_CHARACTER,z,0,MPI_COMM_WORLD);
}
}
MPI_Status status;
for(z = 0; z < world_size;z++){
if(z !=world_rank ){
MPI_RECIVE(next,(lines + 2) * sizeof(Line),MPI_CHARACTER,z,1,&status);
if(status.MPI_ERROR){
//TODO
MPI_Abort(MPI_COMM_WORLD,1);
}
for (y = 1; y <= (lines%world_size+lines/world_size); y++) {
stpcpy(to[y*z],next[y*z]);
}
}
}
temp = from;
from = to;
to = temp;
}
hash = getMD5DigestStr(from[1], sizeof(Line) * (lines));
printf("hash: %s\n", hash);
free(next);
}else{
int x,y;
MPI_Status status;
for(i = 0; i < its; i++){
MPI_RECIVE(from,(lines + 2) * sizeof(Line),MPI_CHARACTER,0,0,&status);
if(status.MPI_ERROR){
MPI_Abort(MPI_COMM_WORLD,2);
}
for (y = 1; y <= (lines%world_size+lines/world_size); y++) {
for (x = 1; x <= XSIZE; x++) {
to[y*world_rank][x ] = transition(from, x , y*world_rank);
}
}
MPI_SEND(to,(lines + 2) * sizeof(Line),MPI_CHARACTER,0,1,MPI_COMM_WORLD);
}
}
MPI_Finalize();
free(from);
free(to);
free(hash);
return 0;
}
This is a C a Sequence implementation which I wrote for the university as a homework assignment.
Are you talking about MPI_Send and MPI_Recv ?
Don't know about any MPI_SEND or MPI_RECIV function...
I think you just mispelled them.
BTW: here is a great tutorial about how to use them http://mpitutorial.com/tutorials/mpi-send-and-receive/

Adding more threads to program resulted in longer execution time for calculating trapezoidal integration

I am working on a multi-threaded numerical integration program using the trapezoidal rule.
I have a struct which contains six items:
typedef struct trapezoidalIntegrationThread{
float a;
float b;
int n;
float h;
double res;
float elTime;
}threadParams;
a is the left end point, b is the right end point, n is the number of trapezoids, h is the height, res is the result calculated within compute_with_pthread, and finally, elTime is the elapsed time for compute_with_pthread for benchmarking.
Here is my code in main:
int n = NUM_TRAPEZOIDS;
float a = LEFT_ENDPOINT;
float b = RIGHT_ENDPOINT;
pthread_t masterThread;
pthread_t slaveThread[NUM_THREADs];
threadParams *trapThread;
for(i = 0; i < NUM_THREADs; i++) {
trapThread = (threadParams *) malloc(sizeof(threadParams));
trapThread->a = a;
trapThread->b = b;
trapThread->n = n;
trapThread->h = (b - a) / (float) n;
if (pthread_create(&slaveThread[i], NULL, compute_using_pthreads, (void *) trapThread) != 0) {
printf("Looks like something went wrong..\n");
return -1;
}
}
for(i = 0; i < NUM_THREADs; i++) {
pthread_join(slaveThread[i], NULL);
}
pthread_exit((void *) masterThread);
I am basically creating the number of threads defined in NUM_THREADS (let's assume this value is 4). I am allocating how much memory the struct needs, and setting the pre-defined values of:
#define LEFT_ENDPOINT 5
#define RIGHT_ENDPOINT 1000
#define NUM_TRAPEZOIDS 100000000
#define NUM_THREADs 8 /* Number of threads to run. */
Next, I create my pthreads, and call the compute_using_pthreads function:
void *compute_using_pthreads(void *inputs)
{
double integral;
int k;
threadParams *args = (threadParams *) inputs;
unsigned long p_micros = 0;
float p_millis = 0.0;
clock_t p_start, p_end;
float a = args->a;
float b = args->b;
int n = args->n;
float h = args->h;
p_start = clock();
integral = (f(a) + f(b))/2.0;
for (k = 1; k <= n-1; k++) {
integral += f(a+k*h);
}
integral = integral*h;
p_end = clock();
p_micros = p_end - p_start;
p_millis = p_micros / 1000;
args->res = integral;
args->elTime = p_millis;
}
I ran this program and compared it against a non-multithreaded function:
double compute_gold(float a, float b, int n, float h)
{
double integral;
int k;
integral = (f(a) + f(b))/2.0;
for (k = 1; k <= n-1; k++) {
integral += f(a+k*h);
}
integral = integral*h;
return integral;
}
So here are the results:
Run-time of compute_gold:
~3000 ms
Run_time of compute_with_pthread:
Using 1 thread: ~3000 ms
Using 2 threads: ~6000 ms
Using 4 thrads: ~12000 ms
....
So for some reason, the more threads I added, the execution took n-threads more time to execute. I can't for the life of me figure out why this is happening, as I am quite new to C programming =/

Multi threaded C program freezes when I change number of threads

I am writing a multi threaded c program to multiply two matrices and find the row norm using pthreads and Blas. I thought I had it working when I set the dimension of the matrices to 4 and the number of threads to use to 2. I then changed the number of threads, and it no longer works. It does not compute the wrong answers, but gets stuck when I try to join the threads
void *matrix_norm(void *arg){
mat_norm_t *thread_mat_norm_data = arg;
int n = thread_mat_norm_data->n;
int i, j;
double norm = 0.;
for(i=0;i<thread_mat_norm_data->sub_n;i++){
double row_sum = 0.;
for(j=0;j<n;j++){
row_sum += *(thread_mat_norm_data->z+i*n+j);
}
if(row_sum>norm){
norm = row_sum;
}
}
pthread_mutex_lock(thread_mat_norm_data->mutex);
if (norm > *(thread_mat_norm_data->global_norm)){
*(thread_mat_norm_data->global_norm)=norm;
}
pthread_mutex_unlock(thread_mat_norm_data->mutex);
pthread_exit(NULL);
}
int main() {
pthread_t *working_thread;
mat_mult_t *thread_mat_mult_data;
mat_norm_t *thread_mat_norm_data;
pthread_mutex_t *mutex;
double *x, *y, *z, norm;
int i, rows_per_thread;
int n = 8;
int num_of_thrds = 4;// Works when this is 2, not when 4
if(n<=num_of_thrds && num_of_thrds < MAXTHRDS){
printf("Matrix dim must be greater than num of thrds\nand num of thrds less than 124.\n");
return (-1);
}
x = malloc(n*n*sizeof(double));
y = malloc(n*n*sizeof(double));
z = malloc(n*n*sizeof(double));
initMat(n, x);
initMat(n, y);
working_thread = malloc(num_of_thrds * sizeof(pthread_t));
thread_mat_mult_data = malloc(num_of_thrds * sizeof(mat_mult_t));
rows_per_thread = n/num_of_thrds;
for(i=0;i<num_of_thrds;i++){
thread_mat_mult_data[i].x = x + i * rows_per_thread * n;
thread_mat_mult_data[i].y = y;
thread_mat_mult_data[i].z = z + i * rows_per_thread * n;
thread_mat_mult_data[i].n = n;
thread_mat_mult_data[i].sub_n =
(i == num_of_thrds-1) ? n-(num_of_thrds-1)*rows_per_thread : rows_per_thread;
pthread_create(&working_thread[i], NULL, matrix_mult, (void *)&thread_mat_mult_data[i]);
}
for(i=0;i<num_of_thrds;i++){
pthread_join(working_thread[i], NULL);
}
free(working_thread);
working_thread = malloc(num_of_thrds * sizeof(pthread_t));
thread_mat_norm_data = malloc(num_of_thrds * sizeof(mat_norm_t));
mutex = malloc(sizeof(pthread_mutex_t));
for(i=0;i<num_of_thrds;i++){
thread_mat_norm_data[i].z = z + i * rows_per_thread * n;
thread_mat_norm_data[i].n = n;
thread_mat_norm_data[i].global_norm = &norm;
thread_mat_norm_data[i].sub_n =
(i == num_of_thrds-1) ? n-(num_of_thrds-1)*rows_per_thread : rows_per_thread;
thread_mat_norm_data[i].mutex = mutex;
pthread_create(&working_thread[i], NULL, matrix_norm, (void *)&thread_mat_norm_data[i]);
}
//Stuck running here
for(i=0;i<num_of_thrds;i++){
pthread_join(working_thread[i], NULL);
}
printMat(n, z , "z");
printf("\nRow Sum Norm = %f\n", norm);
free(x);
free(y);
free(z);
free(working_thread);
free(thread_mat_mult_data);
free(thread_mat_norm_data);
pthread_mutex_destroy(mutex);
free(mutex);
return(0);
}
I unsure why it works under certain circumstances and not others, any explanation would be great!
Forgot to initialize the mutex with pthread_mutex_init(mutex, NULL); I am still unsure why it would work with out this for two threads but not more than this?

Resources