Multithreaded program not producing desired output - c

I am writing a code that creates 10 threads and executes those threads with even thread ids first and then executes all those with odd thread ids next. I'm using the POSIX threads library. Here is the code I wrote:
#include "stdlib.h"
#include "pthread.h"
#include "stdio.h"
#define TRUE 1
#define FALSE 0
int EVEN_DONE = FALSE;
int evenThreads, oddThreads = 0;
int currentThread = 0;
//the mutex for thread synchronization
static pthread_mutex_t mymutex = PTHREAD_MUTEX_INITIALIZER;
//the condition variable;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
/*
 * Body run by every thread whose id is even.
 *
 * Prints its greeting immediately, counts itself in evenThreads, and, once
 * the combined even + odd head count has reached 10, flags EVEN_DONE and
 * wakes every thread blocked on cond. All of this happens under mymutex.
 *
 * id: numeric id of the calling thread (only used for the message).
 * Always returns NULL.
 */
void * printEven(unsigned long id)
{
    int everyone_checked_in;

    pthread_mutex_lock(&mymutex);

    evenThreads += 1;
    printf("TID: %lu, Hello from even\n", id);

    /* The odd threads may only be released after all 10 threads have arrived. */
    everyone_checked_in = (evenThreads + oddThreads >= 10);
    if (everyone_checked_in) {
        EVEN_DONE = TRUE;
        pthread_cond_broadcast(&cond);
    }

    pthread_mutex_unlock(&mymutex);
    return NULL;
}
void * printOdd(unsigned long id)
{
pthread_mutex_lock(&mymutex);
while (!EVEN_DONE) {
oddThreads++;
pthread_cond_wait(&cond, &mymutex);
printf("TID: %lu, Hello from odd\n", id);
}
pthread_mutex_unlock(&mymutex);
return NULL;
}
/*
 * Thread entry point: derives a numeric id from pthread_self() and
 * dispatches to printEven() or printOdd() based on its parity.
 *
 * NOTE(review): pthread_t is an opaque type; converting it to unsigned long
 * is not guaranteed to be meaningful, and the resulting values need not
 * contain both parities.
 *
 * arg: unused.
 * Always returns NULL.
 */
void * threadFunc(void *arg)
{
    const unsigned long self_id = (unsigned long)pthread_self();

    if (self_id & 1UL)
        printOdd(self_id);
    else
        printEven(self_id);

    return NULL;
}
/*
 * Spawns the worker threads, waits for all of them, and reports completion.
 *
 * Fixes over the previous version:
 *  - the array is sized with sizeof *threads (an element), not
 *    sizeof(threads) (a pointer);
 *  - malloc and pthread_create results are checked;
 *  - the loops use num_threads instead of a repeated literal;
 *  - the thread array is released before returning.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a resource cannot be
 * obtained.
 */
int main()
{
    pthread_t* threads;
    int num_threads = 10; /* the thread functions count up to this same total */
    int i, j;

    threads = malloc(num_threads * sizeof *threads);
    if (threads == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    for (i = 0; i < num_threads; i++) {
        int rc = pthread_create(&threads[i], NULL, threadFunc, NULL);
        if (rc != 0) {
            /* The remaining threads would wait forever for the missing one. */
            fprintf(stderr, "pthread_create failed with error %d\n", rc);
            exit(EXIT_FAILURE);
        }
    }

    for (j = 0; j < num_threads; j++) {
        pthread_join(threads[j], NULL);
    }

    free(threads);
    printf("Finished executing all threads\n");
    return 0;
}
However, when I run the code it doesn't produce the desired output. The output I'm getting is this:
Apparently, it seems that all the thread IDs are even numbers. However, I do think there is a problem with my code. What am I doing wrong? How can I achieve the desired output?
(Note: I'm at beginner level when it comes to POSIX threads and multithreading in general)
Thanks in advance.

There is no guarantee in POSIX that the pthread_t type returned by pthread_self() is a numeric type that can be cast to an unsigned long - it is allowed to be a structure type, for example.
If you want to write your code in a POSIX-conforming way, you will need to allocate numeric thread IDs yourself. For example, you could have:
/*
 * Hands out consecutive numeric ids starting at 0.
 *
 * The counter and the mutex that guards it are function-local statics, so
 * concurrent callers each receive a distinct value.
 *
 * Returns the id reserved for the caller.
 */
unsigned long allocate_id(void)
{
    static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long counter = 0;
    unsigned long ticket;

    pthread_mutex_lock(&guard);
    ticket = counter;
    counter += 1;
    pthread_mutex_unlock(&guard);

    return ticket;
}
Then in your threads use:
unsigned long id = allocate_id();
Controlling the allocation of IDs yourself also allows you to control the sequence - for example in this case you can ensure that IDs are sequentially allocated so that you will have both odd and even IDs.

Related

Why does my simple counting program take longer to run with multiple threads? (in C)

Here's my code:
#define COUNT_TO 100000000
#define MAX_CORES 4
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
long long i = 0;
/*
 * Thread body: repeatedly takes the mutex and bumps the shared counter i
 * by one until it has reached COUNT_TO, then returns.
 *
 * arg: unused.
 * Always returns NULL.
 */
void* start_counting(void *arg){
    int finished = 0;

    while (!finished) {
        pthread_mutex_lock(&mutex);
        if (i < COUNT_TO) {
            i++;
        } else {
            finished = 1;
        }
        pthread_mutex_unlock(&mutex);
    }
    return NULL;
}
/*
 * Starts MAX_CORES counting threads and waits for all of them.
 *
 * Fixes over the previous version: the allocation and each pthread_create
 * are checked, only threads that actually started are joined, and the
 * handle array is freed.
 *
 * Returns 0 when every thread was started, 1 otherwise.
 */
int main(int argc, char* argv[]){
    int i = 0; /* local loop index; hides the global counter of the same name */
    int started = 0;
    pthread_t * thread_group = malloc(sizeof *thread_group * MAX_CORES);

    if(thread_group == NULL){
        return 1;
    }
    for(i = 0; i < MAX_CORES; i++){
        if(pthread_create(&thread_group[i], NULL, start_counting, NULL) != 0){
            break;
        }
        started++;
    }
    for(i = 0; i < started; i++){
        pthread_join(thread_group[i], NULL);
    }
    free(thread_group);
    return started == MAX_CORES ? 0 : 1;
}
This is what your threads do:
Read the value of i.
Increment the value we read.
Write back the incremented value of i.
Go to step 1.
Clearly, another thread cannot read the value of i after a different thread has accomplished step 1 but before it has completed step 3. So there can be no overlap between two threads doing steps 1, 2, or 3.
So all your threads are fighting over access to the same resource -- i (or the mutex that protects it). No thread can make useful forward progress without exclusive access to one or both of those. Given that, there is no benefit to using multiple threads since only one of them can accomplish useful work at a time.

Passing threads a value from a for loop

I am attempting to create threads and pass each thread the value from a for loop. Here is the code segment
// Thread handles; the array is only allocated when impl == 1.
pthread_t *threadIDs;
// Loop counter. Every thread created below receives the address of this one variable.
int i = 0;
if(impl == 1)
{
threadIDs = (pthread_t *)malloc(sizeof(pthread_t)*reduces);
for(;i < reduces; i++)
{
// &i points at the shared counter, which keeps changing as the loop
// advances, so a thread may observe a later value than the one that was
// current when it was created.
pthread_create(&threadIDs[i], NULL, reduce,&i);
}
}
It is not passing the correct values of the loop, which makes sense since I am creating a race condition. What is the simplest way to pass the correct value of i from my loop?
Another question, will each thread finish executing before the next one is created and called?
You've already dynamically created an array of thread IDs. Do the same for the values you want to pass in.
// Thread handles; the array is only allocated when impl == 1.
pthread_t *threadIDs;
// One int per thread, so each thread gets its own stable copy of the index.
int *values;
int i = 0;
if(impl == 1)
{
threadIDs = malloc(sizeof(pthread_t)*reduces);
values = malloc(sizeof(int)*reduces);
for(;i < reduces; i++)
{
// Store the index in this thread's private slot before handing out its address.
values[i] = i;
pthread_create(&threadIDs[i], NULL, reduce, &values[i]);
}
}
Each thread will be working with a different array member, so there's no race condition.
You can define a structure and assign i to the variable of the object.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
// Per-thread argument passed through pthread_create's void* parameter.
typedef struct Param_ {
int index; // value of the creating loop's counter for this thread
}Param;
/*
 * Thread body: prints the index stored in the Param it was given.
 *
 * p: pointer to a Param; it is only read.
 * Returns NULL. The previous version fell off the end of a function
 * declared to return void*, leaving the thread's exit value undefined.
 */
static void* thread(void* p) {
    const Param* param = p;

    printf("index: %d\n", param->index);
    return NULL;
}
/*
 * Creates `reduces` threads, giving each its own Param holding the loop
 * index, then waits for all of them.
 *
 * Fixes over the previous version:
 *  - the threads are joined, so main no longer returns (ending the process)
 *    while they may still be running;
 *  - allocations are checked and released;
 *  - the Params live in one array owned by main and are freed only after
 *    every thread has finished reading them.
 *
 * Returns 0 when every thread was started, 1 otherwise.
 */
int main() {
    int i;
    int started = 0;
    int reduces = 10;
    pthread_t *threadIDs = malloc(sizeof *threadIDs * reduces);
    Param *params = malloc(sizeof *params * reduces);

    if (threadIDs == NULL || params == NULL) {
        free(params);
        free(threadIDs);
        return 1;
    }

    for (i = 0; i < reduces; i++) {
        params[i].index = i;
        if (pthread_create(&threadIDs[i], NULL, thread, &params[i]) != 0) {
            break;
        }
        started++;
    }

    for (i = 0; i < started; i++) {
        pthread_join(threadIDs[i], NULL);
    }

    free(params);
    free(threadIDs);
    return started == reduces ? 0 : 1;
}
What is the simplest way to pass the correct value of i from my loop?
What is to be considered "simple" depends on the use case, so here another approach to solve the issues you present:
#include <pthread.h>
pthread_mutex_t m_init;
pthread_cond_t c_init;
int init_done = 1;
// Thread body: copies the value pv points at while holding m_init, then
// sets init_done and signals c_init so the creator knows the copy is done.
// The "..." line is the answer's placeholder for the thread's real work.
void* thread_function(void * pv)
{
pthread_mutex_lock(&m_init);
// Take a private copy of the creator's loop counter.
size_t i = *((size_t*) pv);
init_done = 1;
pthread_cond_signal(&c_init);
pthread_mutex_unlock(&m_init);
...
}
#define THREADS_MAX (42)
/*
 * Creates THREADS_MAX threads one at a time. After each pthread_create the
 * loop waits on c_init until the new thread has set init_done, i.e. until it
 * has copied the loop counter, before the counter is advanced.
 *
 * Fix over the previous version: pthread_cond_wait takes two arguments, the
 * condition variable and the mutex the caller holds; the mutex was missing.
 *
 * The "..." line is the answer's placeholder for the rest of main.
 */
int main(void)
{
    pthread_t thread[THREADS_MAX];
    pthread_mutex_init(&m_init, NULL);
    pthread_cond_init(&c_init, NULL);
    for(size_t i = 0; i < THREADS_MAX; ++i)
    {
        pthread_mutex_lock(&m_init);
        init_done = 0;
        pthread_create(&thread[i], NULL, thread_function, &i);
        /* Loop guards against spurious wakeups. */
        while (!init_done)
        {
            pthread_cond_wait(&c_init, &m_init);
        }
        pthread_mutex_unlock(&m_init);
    }
    ...
}
(error checking omitted for the sake of legibility)

Mutex Locks Across Threads with Different Functions

The problem:
Similar to one of my other Questions Other Question
I am trying to create a program in C that allows me to Search through 10 text files with a variable amount of threads to find the largest Prime. It should also have a Manager thread that is allowed to read the Largest Prime number of a worker thread (and not modify it). The Manager thread also Posts the largest Prime number found by all of the worker threads so the worker threads can read it and use it. The worker threads must post their local Largest Prime to a global array (privateLargestPrime) and before they do this they must lock it so that the Manager Thread doesn't read it until the worker thread updates it.
The weird Part:
As I step through my program when the worker thread wants to call a lock it switches threads to the manager which calls for a lock and is granted a lock then it keeps looping starving the Worker thread. I am Not sure what is going on with that. If I could get any insight on this problem it will be greatly appreciated.
/*
* The Reason for Worker Initialization + Manager Initialization is that we need both types of threads to exist at the same time
* so I just combined them into one loop, although I believe that they could have been created seperatly.
* Basically just call pthread_Join at the end
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <string.h>
#include <fileTest.h>
clock_t Start, End;
double elapsed = 0;
pthread_cond_t managerVar;
pthread_mutex_t mutex;
unsigned int globalLargestPrime = 0;
int numThreads = 1;//Number of Threads
int LINES_PER_THREAD;
pthread_cond_t *WorkerConditionaVar;
pthread_cond_t *ManagerConditionaVar;
unsigned int *privateLocalLargest;//will need to be changed
int *statusArray;
FILE *fileOut;
// Two-state flag marking whether a shared "largest prime" value is being accessed.
typedef enum{
FREE,
IN_USE
}lrgstPrm;
lrgstPrm monLargestPrime;// only referenced from commented-out code in this listing
lrgstPrm workerLargestPrime;// guards privateLocalLargest; tested and set by Worker and manager
// Completion state driving the manager's loop. Finished is the zero value.
typedef enum{
Finished,
Not_Finished
}Status;
Status is_Finished;
// Per-worker parameters passed through pthread_create.
typedef struct threadFields{
int id;// index into privateLocalLargest and statusArray
int StartPos;//gets seek for worker thread
int EndPos;// record index at which the worker's outer loop stops
}tField;
/*
 * Trial-division primality test.
 *
 * n: value to test.
 * Returns 1 if n is prime, 0 otherwise.
 *
 * Fixes over the previous version:
 *  - the divisor range now includes the square root, so perfect squares of
 *    primes (4, 9, 25, ...) are no longer reported as prime;
 *  - 0 and 1 are reported as not prime;
 *  - the bound is checked as i <= n / i, which needs no floating point and
 *    cannot overflow.
 */
int ChkPrim(unsigned int n){
    unsigned int i;

    if(n < 2)
        return 0;
    for(i = 2; i <= n / i; i++){
        if(n % i == 0)
            return 0;
    }
    return 1;
}
/*
 * Worker thread: scans its range of 20-byte records in fileOut for the
 * largest prime, publishing its current best to privateLocalLargest[id]
 * after every batch of 1000 records.
 *
 * threadStruct: pointer to this worker's struct threadFields (id, StartPos,
 *               EndPos); it is only read.
 * Always returns NULL. Sets statusArray[id] to 1 when done.
 *
 * Fixes over the previous version:
 *  - the line buffer is on the stack (the heap buffer was never freed);
 *  - a failed fseek/fgets skips the record instead of re-parsing whatever
 *    was left in the buffer;
 *  - globalLargestPrime is read, and statusArray[id] written, while holding
 *    the mutex rather than unsynchronised;
 *  - commented-out experiments were removed.
 *
 * NOTE(review): fileOut is one FILE* shared by every worker, so another
 * thread can move the file position between this thread's fseek and fgets.
 * Confirm whether each worker should open its own stream.
 */
void *Worker(void *threadStruct){//Create Threads
    struct threadFields *info = threadStruct;
    int index;
    int id = info->id;
    int Seek = info->StartPos;
    unsigned int localLargestPrime = 0;
    char buffer[50];

    while(Seek<info->EndPos){
        for(index = 0; index < 1000; index++){//Loop 1000 times
            unsigned int currentNum;
            int gotLine = fseek(fileOut,Seek*sizeof(char)*20, SEEK_SET) == 0
                          && fgets(buffer,20,fileOut) != NULL;
            Seek++;
            if(!gotLine)
                continue;
            currentNum = atoi(buffer);
            if(currentNum>localLargestPrime && currentNum > 0){
                if(ChkPrim(currentNum) == 1)
                    localLargestPrime = currentNum;
            }
        }

        /* Publish this worker's best so far and pick up the global best. */
        if(pthread_mutex_lock(&mutex) != 0)//Lock
            printf("Failed To Lock");
        while(workerLargestPrime == IN_USE)//Wait until the shared slot is free
            pthread_cond_wait(ManagerConditionaVar, &mutex);
        workerLargestPrime = IN_USE;
        privateLocalLargest[id] = localLargestPrime;
        if(localLargestPrime < globalLargestPrime)
            localLargestPrime = globalLargestPrime;
        workerLargestPrime = FREE;
        pthread_cond_signal(ManagerConditionaVar);
        pthread_mutex_unlock(&mutex);
    }//End of While

    /* Mark completion under the mutex so the final publish is visible with it. */
    pthread_mutex_lock(&mutex);
    statusArray[id] = 1;
    pthread_mutex_unlock(&mutex);
    return NULL;
}
void *manager(){
int index, MlocalLargestPrime;
while(is_Finished==Not_Finished){
/*Should Lock the Private Largest Prime from any other thread using it*/
if(pthread_mutex_lock(&mutex) != 0)//Lock
printf("Failed To Lock");
while(workerLargestPrime == IN_USE)//Wait untill Workers largest prime is free
pthread_cond_wait(ManagerConditionaVar, &mutex);
workerLargestPrime = IN_USE;//Local Largest is in use
//Critical Zone
for(index =0; index < numThreads; index++)
if(privateLocalLargest[index] > MlocalLargestPrime)
MlocalLargestPrime = privateLocalLargest[index];
//Critical Zone
workerLargestPrime = FREE;
pthread_cond_signal(ManagerConditionaVar);//Signal to any waiting thread that wants to touch(read) this workers privateLocalLargest
pthread_mutex_unlock(&mutex);
/*
pthread_mutex_lock(&mutex);
while(workerLargestPrime == FREE){
workerLargestPrime = IN_USE;
globalLargestPrime = MlocalLargestPrime;
workerLargestPrime = FREE;
pthread_cond_signal(&managerVar);
}
pthread_mutex_unlock(&mutex);
*/
/*check if workers have finished*/
for(index = 0; index < numThreads; index++)
if(statusArray[index] == 0)
is_Finished = Not_Finished;
}
void *i = 0;
return i;
}
/*
 * Sets up the shared state, runs numThreads workers plus one manager
 * thread, waits for all of them, and prints the elapsed CPU time and the
 * largest prime found.
 *
 * Fixes over the previous version:
 *  - threads[] has numThreads + 1 slots; the manager's handle used to be
 *    written one element past the end of the array;
 *  - the per-worker result and status arrays start at 0 instead of holding
 *    indeterminate values;
 *  - fopen and malloc results are checked;
 *  - the per-worker structs are freed and the data file is closed;
 *  - the workers' published results are folded into globalLargestPrime
 *    after the joins, so the reported value reflects the final state.
 *
 * Returns 0 on success, 1 if a resource could not be obtained.
 */
int main(){
    //setFile();
    int index;

    LINES_PER_THREAD = (getLineNum()/numThreads);
    fileOut = fopen("TextFiles/dataBin.txt", "rb");
    if(fileOut == NULL){
        perror("TextFiles/dataBin.txt");
        return 1;
    }
    Start = clock();

    pthread_t threads[numThreads+1];//workers followed by the manager
    pthread_cond_t monitor[numThreads];
    pthread_cond_t managerCon;
    unsigned int WorkerSharedVar[numThreads];
    int finishedArr[numThreads];
    tField *threadFields[numThreads];//one parameter struct per worker

    WorkerConditionaVar = monitor;//Global Pointer points to the array created in main
    ManagerConditionaVar = &managerCon;
    privateLocalLargest = WorkerSharedVar;
    statusArray = finishedArr;
    is_Finished = Not_Finished;
    pthread_mutex_init(&mutex, NULL);
    pthread_cond_init(&managerCon,NULL);
    rewind(fileOut);

    for(index = 0; index < numThreads; index++){
        WorkerSharedVar[index] = 0;
        finishedArr[index] = 0;
        pthread_cond_init(&monitor[index], NULL);
        threadFields[index] = malloc(sizeof *threadFields[index]);
        if(threadFields[index] == NULL){
            perror("malloc");
            return 1;
        }
        threadFields[index]->id = index;
        threadFields[index]->StartPos = index*LINES_PER_THREAD;// Get Position for start of block
        threadFields[index]->EndPos = (index+1)*LINES_PER_THREAD-1;// Get Position for end of block
    }

    for(index = 0; index < numThreads; index++)
        pthread_create(&threads[index],NULL,Worker,(void *)threadFields[index]);//Pass struct to each worker
    pthread_create(&threads[numThreads],NULL,manager,NULL);//Last Thread is Manager Thread

    for(index = 0; index<numThreads+1; index++)
        pthread_join(threads[index], NULL);

    /* All threads are done; take the final maximum of what the workers published. */
    for(index = 0; index < numThreads; index++)
        if(WorkerSharedVar[index] > globalLargestPrime)
            globalLargestPrime = WorkerSharedVar[index];

    /*Destroy the mutexes & conditional signals*/
    for(index = 0; index < numThreads; index++){
        pthread_cond_destroy(&WorkerConditionaVar[index]);
        free(threadFields[index]);
    }
    pthread_cond_destroy(&managerCon);
    pthread_mutex_destroy(&mutex);
    fclose(fileOut);

    End = clock();
    elapsed = ((double) (End - Start)) / CLOCKS_PER_SEC;
    printf("This is the Time %f\n", elapsed);
    printf("This is the Largest Prime Number: %u", globalLargestPrime);
    return 0;
}
[1]: https://stackoverflow.com/questions/13672456/slightly-complicated-thread-synchronization
There is another C source which I only use 1 method and it is to give me the number of lines from the 10 text files, I will also post it (but not neccessary):
/*
* fileTest.c
*
* Created on: Dec 8, 2012
* Author: kevin
*
* count number of lines
* divide by number of threads
* get the positions to hand to each thread
* to get positions, one needs to get the number of lines per thread,
* add number of lines to each: Seek*sizeof(char)*10, SEEK_SET.
* and hand out these positions to each thread
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <string.h>
FILE *filesIn[10], *fileOut;
int Seek;
/*
 * Writes 10000 pseudo-random integers in the range 0..8999, one per line,
 * to "data9.txt".
 *
 * Fix over the previous version: a failed fopen is reported and the
 * function returns instead of passing a NULL stream to fprintf; a failed
 * fclose (e.g. a write error on flush) is reported too.
 */
void createText(){
    FILE *fOUT = fopen("data9.txt", "w");
    int i;

    if(fOUT == NULL){
        perror("data9.txt");
        return;
    }
    srand(time(NULL));
    for(i=0; i<10000; i++)
        fprintf(fOUT, "%d\n",rand()%9000);
    if(fclose(fOUT) != 0)
        perror("data9.txt");
}
/*
 * Merges TextFiles/primes1.txt .. primes10.txt into
 * TextFiles/dataBin.txt, writing each whitespace-separated token at a
 * 20-byte record offset. Seek ends up holding the number of records written.
 *
 * Fixes over the previous version:
 *  - the read loop is driven by fscanf's return value; testing feof()
 *    before reading wrote the last token of every file twice;
 *  - the token read is bounded to the buffer size (%49s);
 *  - every fopen is checked; an input that cannot be opened is reported
 *    and skipped;
 *  - the buffer is on the stack (the heap buffer was never freed);
 *  - the ten copy-pasted open/close lines are a loop over the file number.
 */
void setFile(){
    char name[64];
    char buffer[50];
    int index;

    Seek = 0;
    fileOut = fopen("TextFiles/dataBin.txt", "wb");//write in bin
    if(fileOut == NULL){
        perror("TextFiles/dataBin.txt");
        return;
    }

    for(index = 0; index < 10; index++){//Run through 10 files
        snprintf(name, sizeof name, "TextFiles/primes%d.txt", index+1);
        filesIn[index] = fopen(name, "r");//read Text
        if(filesIn[index] == NULL){
            perror(name);
            continue;
        }
        while(fscanf(filesIn[index], "%49s", buffer) == 1){//take token from input
            fseek(fileOut,Seek*sizeof(char)*20, SEEK_SET);
            fputs(buffer,fileOut);//Print token to output file
            Seek++;
        }
        fclose(filesIn[index]);
        filesIn[index] = NULL;
    }

    if(fclose(fileOut) != 0)
        perror("TextFiles/dataBin.txt");
    fileOut = NULL;
}
/*
 * Single-threaded scan of TextFiles/dataBin.txt: reads one 20-byte record
 * at a time, tracks the largest prime seen, and prints it at the end.
 *
 * Fixes over the previous version:
 *  - the loop stops when fseek or fgets fails; testing feof() before the
 *    read processed the stale buffer one extra time;
 *  - up to 19 characters are read per record, matching the 20-byte record
 *    width (the old limit of 9 characters could truncate long numbers);
 *  - fopen is checked, the stream is closed, and the buffer is on the stack
 *    (the heap buffer was never freed).
 */
void getFile(){
    int Seek = 0;
    int currentSeek = 0;
    int currentNum = 0;
    int localLargestPrime = 0;
    char buffer[50];

    fileOut = fopen("TextFiles/dataBin.txt", "rb");
    if(fileOut == NULL){
        perror("TextFiles/dataBin.txt");
        return;
    }

    while(fseek(fileOut,Seek*sizeof(char)*20, SEEK_SET) == 0
          && fgets(buffer,20,fileOut) != NULL){
        Seek++;
        currentNum = atoi(buffer);
        if(currentNum>localLargestPrime)
            if(ChkPrim(currentNum) == 1){
                localLargestPrime = currentNum;
                currentSeek = Seek*sizeof(char)*20;
                printf("the current seek is: %d\n", currentSeek);
            }
    }

    fclose(fileOut);
    fileOut = NULL;
    printf("This is the largest Prime: %d\n", localLargestPrime);
}
/*
 * Counts the newline characters in TextFiles/primes1.txt .. primes10.txt.
 *
 * Returns the total, which is also left in the global Seek.
 *
 * Fixes over the previous version:
 *  - fgetc's result is kept in an int; stored in a char it either never
 *    equals EOF (unsigned char) or a 0xFF data byte is mistaken for EOF
 *    (signed char);
 *  - every fopen is checked; a file that cannot be opened is reported and
 *    contributes no lines;
 *  - each file is closed after counting (ten streams used to leak per call);
 *  - the ten copy-pasted open lines are a loop over the file number.
 */
int getLineNum(){
    char name[64];
    int index;
    int c;

    Seek = 0;
    for(index = 0; index < 10; index++){
        snprintf(name, sizeof name, "TextFiles/primes%d.txt", index+1);
        filesIn[index] = fopen(name, "r");//read Text
        if(filesIn[index] == NULL){
            perror(name);
            continue;
        }
        while((c = fgetc(filesIn[index])) != EOF)
            if(c == '\n')
                Seek++;
        fclose(filesIn[index]);
        filesIn[index] = NULL;
    }
    return Seek;
}
enter link description here
You seem to be overdoing the synchronization of the access to globalLargestPrime. But instead of trying to fix that there might be a better way to communicate each thread's value to the manager - just have the thread function return the value it finds as an unsigned int cast to a void*. Then the manager can collect those values by just waiting on a pthread_join() for each thread to finish.
Something like the following pseudo code:
// Sketch of a worker that hands its result back through the thread's
// return value instead of through shared, lock-protected state.
void *Worker(void *threadStruct)
{
// Left unset in this sketch; the elided search is expected to assign it.
unsigned int largest_prime;
// do whatever you need to do to find the largest prime in the set of numbers
// this thread has to deal with
//
// Note that nothing here should require synchronization, since the data should be
// completely independent of other threads
// The integer is carried in the pointer value itself; nothing is allocated.
return (void*) largest_prime;
}
// Sketch (pseudo code, not compilable C) of a manager that gathers each
// worker's result from pthread_join and keeps the maximum.
void *manager()
{
unsigned int largest_prime = 0;
// do whatever to spin up the threads and keep track of them in a
// pthread_t[] array...
// now wait for the threads to finish up and deal with the value
// each thread has found:
for each (pthread* p in the pthread_t[]) { // remember - pseudo code
void* result = 0;
// get the result that thread found
pthread_join( p, &result);
// Undo the worker's cast: the pointer value is the integer result.
unsigned int thread_prime = (unsigned int) result;
if (largest_prime < thread_prime) {
largest_prime = thread_prime;
}
}
printf("largest prime: %u\n", largest_prime);
}
Now all of your synchronization hassles are dealt with by pthread_join().
Going by your problem, I think you can and should do without locks.
Use the global array to update the Manager thread from the worker threads. Since each worker thread writes to a separate array index, there is only one writer per array index. The main thread can keep reading from the same array.
Use one global variable for Largest prime number found so far (shared across all threads). For this variable, the main thread is the only writer and the worker threads are all readers.
Consistency will not be an issue since it's only one variable. You only need to worry about taking locks if there are several variables that need to be updated together.
Hope this helps.
Ok So I was doing something Funky with my conditional variables (I had way too many!) so Here I shall post my answer:
/*
* The Reason for Worker Initialization + Manager Initialization is that we need both types of threads to exist at the same time
* so I just combined them into one loop, although I believe that they could have been created seperatly.
* Basically just call pthread_Join at the end
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <pthread.h>
#include <time.h>
#include <string.h>
#include "fileTest.h"
clock_t Start, End;
double elapsed = 0;
pthread_cond_t managerVar;
pthread_mutex_t mutex;
unsigned int globalLargestPrime = 0;
int *numThreads;//Number of Threads ptr
int LINES_PER_THREAD;
pthread_cond_t *WorkerConditionaVar;
pthread_cond_t *ManagerConditionaVar;
unsigned int *privateLocalLargest;//will need to be changed
int *statusArray;
FILE *fileOut;
// Two-state flag marking whether a shared "largest prime" value is being accessed.
typedef enum{
FREE,
IN_USE
}lrgstPrm;
lrgstPrm managerLargestPrime;// tested before globalLargestPrime is read or written
lrgstPrm workerLargestPrime;// tested before privateLocalLargest is read or written
// Completion state driving the manager's loop. Finished is the zero value.
typedef enum{
Finished,
Not_Finished
}Status;
Status is_Finished;
// Per-worker parameters passed through pthread_create.
typedef struct threadFields{
int id;// index into privateLocalLargest and statusArray
int StartPos;//gets seek for worker thread
int EndPos;// record index at which the worker's outer loop stops
}tField;
/*
 * Trial-division primality test.
 *
 * n: value to test.
 * Returns 1 if n is prime, 0 otherwise.
 *
 * Fixes over the previous version:
 *  - the divisor range now includes the square root, so perfect squares of
 *    primes (4, 9, 25, ...) are no longer reported as prime;
 *  - 0 and 1 are reported as not prime;
 *  - the bound is checked as i <= n / i, which needs no floating point and
 *    cannot overflow.
 */
int ChkPrim(unsigned int n){
    unsigned int i;

    if(n < 2)
        return 0;
    for(i = 2; i <= n / i; i++){
        if(n % i == 0)
            return 0;
    }
    return 1;
}
/*
 * Worker thread: scans its range of 20-byte records in fileOut for the
 * largest prime. After every batch of 1000 records it publishes its best
 * to privateLocalLargest[id] and adopts globalLargestPrime if that is larger.
 *
 * threadStruct: pointer to this worker's struct threadFields (id, StartPos,
 *               EndPos); it is only read.
 * Always returns NULL. Sets statusArray[id] to 1 when done.
 *
 * Fixes over the previous version:
 *  - the line buffer is on the stack (the heap buffer was never freed);
 *  - a failed fseek/fgets skips the record instead of re-parsing whatever
 *    was left in the buffer;
 *  - statusArray[id] is written while holding the mutex.
 *
 * NOTE(review): fileOut is one FILE* shared by every worker, so another
 * thread can move the file position between this thread's fseek and fgets.
 * Confirm whether each worker should open its own stream.
 */
void *Worker(void *threadStruct){//Create Threads
    struct threadFields *info = threadStruct;
    int index;
    int id = info->id;
    int Seek = info->StartPos;
    unsigned int localLargestPrime = 0;
    char buffer[50];

    while(Seek<info->EndPos){
        for(index = 0; index < 1000; index++){//Loop 1000 times
            unsigned int currentNum;
            int gotLine = fseek(fileOut,Seek*sizeof(char)*20, SEEK_SET) == 0
                          && fgets(buffer,20,fileOut) != NULL;
            Seek++;
            if(!gotLine)
                continue;
            currentNum = atoi(buffer);
            if(currentNum>localLargestPrime && currentNum > 0){
                if(ChkPrim(currentNum) == 1)
                    localLargestPrime = currentNum;
            }
        }

        /* Publish this worker's best while the manager is kept out. */
        pthread_mutex_lock(&mutex);
        while(workerLargestPrime == IN_USE)
            pthread_cond_wait(WorkerConditionaVar,&mutex);
        privateLocalLargest[id] = localLargestPrime;
        pthread_cond_signal(WorkerConditionaVar);
        pthread_mutex_unlock(&mutex);

        /* Pick up the manager's global best once it is free to read. */
        pthread_mutex_lock(&mutex);
        while(managerLargestPrime == IN_USE)
            pthread_cond_wait(ManagerConditionaVar,&mutex);
        if(localLargestPrime < globalLargestPrime)
            localLargestPrime = globalLargestPrime;
        pthread_cond_signal(ManagerConditionaVar);
        pthread_mutex_unlock(&mutex);
    }//End of While

    /* Mark completion under the mutex so the final publish is visible with it. */
    pthread_mutex_lock(&mutex);
    statusArray[id] = 1;
    pthread_mutex_unlock(&mutex);
    return NULL;
}
void *manager(){
int index;
int ManagerLocalLargest = 0;
while(is_Finished==Not_Finished){
/*I need to wait here until it is free to read the workers Shared variable (PrivateLocalLargest)*/
pthread_mutex_lock(&mutex);
while(workerLargestPrime == IN_USE)
pthread_cond_wait(WorkerConditionaVar,&mutex);
//Critical Zone
for(index = 0; index < *numThreads; index++)
if(privateLocalLargest[index] > ManagerLocalLargest)
ManagerLocalLargest = privateLocalLargest[index];
//Critical Zone
pthread_cond_signal(WorkerConditionaVar);
pthread_mutex_unlock(&mutex);
/*Here is where I block the worker thread read while I Write*/
pthread_mutex_lock(&mutex);
while(managerLargestPrime == IN_USE)
pthread_cond_wait(ManagerConditionaVar,&mutex);
//Critical Zone
for(index = 0; index < *numThreads; index++)
if(privateLocalLargest[index] > globalLargestPrime)
globalLargestPrime = privateLocalLargest[index];
//Critical Zone
pthread_cond_signal(ManagerConditionaVar);
pthread_mutex_unlock(&mutex);
/*Here is where I block the worker thread read while I Write*/
/*check if workers have finished*/
for(index = 0; index < *numThreads; index++){
is_Finished = Finished;
if(statusArray[index] != 1){
is_Finished = Not_Finished;
}
}
}
return NULL;
}
/*
 * Parses the thread count from the command line, runs that many workers
 * plus one manager thread, waits for all of them, and prints the elapsed
 * CPU time and the largest prime found.
 *
 * Usage: program <number of threads>   (or --help)
 *
 * Fixes over the previous version:
 *  - any argument count other than exactly one is rejected; with more than
 *    one argument the thread count used to be read uninitialised;
 *  - a thread count below 1 is rejected (it caused a division by zero and
 *    zero- or negative-length arrays);
 *  - threads[] has one extra slot for the manager, whose handle used to be
 *    written one element past the end of the array;
 *  - the status array starts at 0 instead of holding indeterminate values;
 *  - fopen and malloc results are checked, the per-worker structs are
 *    freed and the data file is closed.
 *
 * Returns 0 on success or --help, -1 on a usage error, 1 if a resource
 * could not be obtained.
 */
int main(int argc, char *argv[]){
    //setFile();
    int argument = 0;
    int index;

    if(argc != 2){
        if(argc == 1)
            printf("You didn't Type the number of threads you wanted... \n");
        printf("argument format: [# of Threads]\n");
        return -1;
    }
    if(strcmp(argv[1],"--help") == 0){
        printf("argument format: [# of Threads]\n");
        return 0;
    }
    argument = atoi(argv[1]);
    if(argument < 1){
        printf("argument format: [# of Threads]\n");
        return -1;
    }
    printf("The number of threads is %d\n", argument);
    numThreads = &argument;

    LINES_PER_THREAD = (getLineNum()/(*numThreads));
    fileOut = fopen("TextFiles/dataBin.txt", "rb");
    if(fileOut == NULL){
        perror("TextFiles/dataBin.txt");
        return 1;
    }

    pthread_t threads[*numThreads+1];//workers followed by the manager
    pthread_cond_t monitor[*numThreads];
    pthread_cond_t managerCon;
    unsigned int WorkerSharedVar[*numThreads];
    int finishedArr[*numThreads];
    tField *threadFields[*numThreads];//one parameter struct per worker

    WorkerConditionaVar = monitor;//Global Pointer points to the array created in main
    ManagerConditionaVar = &managerCon;
    privateLocalLargest = WorkerSharedVar;
    statusArray = finishedArr;
    is_Finished = Not_Finished;
    pthread_mutex_init(&mutex, NULL);
    pthread_cond_init(&managerCon,NULL);
    rewind(fileOut);

    for(index = 0; index < *numThreads; index++){
        privateLocalLargest[index] = 0;
        finishedArr[index] = 0;
        pthread_cond_init(&monitor[index], NULL);
        threadFields[index] = malloc(sizeof *threadFields[index]);
        if(threadFields[index] == NULL){
            perror("malloc");
            return 1;
        }
        threadFields[index]->id = index;
        threadFields[index]->StartPos = index*LINES_PER_THREAD;// Get Position for start of block
        threadFields[index]->EndPos = (index+1)*LINES_PER_THREAD-1;// Get Position for end of block
    }

    Start = clock();
    for(index = 0; index < *numThreads; index++)
        pthread_create(&threads[index],NULL,Worker,(void *)threadFields[index]);//Pass struct to each worker
    pthread_create(&threads[*numThreads],NULL,manager,NULL);//Last Thread is Manager Thread

    for(index = 0; index<*numThreads+1; index++)
        pthread_join(threads[index], NULL);

    /*Destroy the mutexes & conditional signals*/
    for(index = 0; index < *numThreads; index++){
        pthread_cond_destroy(&WorkerConditionaVar[index]);
        free(threadFields[index]);
    }
    pthread_cond_destroy(&managerCon);
    pthread_mutex_destroy(&mutex);
    fclose(fileOut);

    End = clock();
    elapsed = ((double) (End - Start)) / CLOCKS_PER_SEC;
    printf("This is the Time %f\n", elapsed);
    printf("This is the Largest Prime Number: %u\n", globalLargestPrime);
    return 0;
}
Also I solved my problem with the number of threads and Conditional variables needing to be hard-coded in, Now It can just be entered in as a parameter. Thanks everyone for all the support.
PS.
I have noticed that using 2 threads does not speed up the process (I assumed it would), even though my PC is dual-core. It could be because of the mutex locks and all of the blocking. I also noticed that the more threads there are, the longer it takes to process the data. If anyone sees this and can give me some insight, please PM me or write a comment. Thanks (the other C file stayed the same).

How to synchronize manager/worker pthreads without a join?

I'm familiar with multithreading and I've developed many multithreaded programs in Java and Objective-C successfully. But I couldn't achieve the following in C using pthreads without using a join from the main thread:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define NUM_OF_THREADS 2
// Work description handed to each thread: a slice of a shared int array.
struct thread_data {
int start; // first index of the slice
int end;   // last index of the slice (processArray iterates up to and including it)
int *arr;  // the array being processed; the same pointer is given to every thread
};
void print(int *ints, int n);
void *processArray(void *args);
// Builds an array 0..9, splits it into NUM_OF_THREADS index ranges, and
// starts one processArray thread per range. The two numbered comments near
// the end mark the synchronisation steps the question is asking how to write.
int main(int argc, const char * argv[])
{
int numOfInts = 10;
int *ints = malloc(numOfInts * sizeof(int));
for (int i = 0; i < numOfInts; i++) {
ints[i] = i;
}
print(ints, numOfInts); // prints [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
pthread_t threads[NUM_OF_THREADS];
struct thread_data thread_data[NUM_OF_THREADS];
// these vars are used to calculate the index ranges for each thread
int remainingWork = numOfInts, amountOfWork;
int startRange, endRange = -1;
for (int i = 0; i < NUM_OF_THREADS; i++) {
// Divide what is left evenly among the threads not yet created.
amountOfWork = remainingWork / (NUM_OF_THREADS - i);
startRange = endRange + 1;
endRange = startRange + amountOfWork - 1;
thread_data[i].arr = ints;
thread_data[i].start = startRange;
thread_data[i].end = endRange;
pthread_create(&threads[i], NULL, processArray, (void *)&thread_data[i]);
remainingWork -= amountOfWork;
}
// 1. Signal to the threads to start working
// 2. Wait for them to finish
// As written, nothing waits for the threads before the next two statements run.
print(ints, numOfInts); // should print [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
free(ints);
return 0;
}
// Thread body: adds 1 to every element of its slice [start, end] of the
// shared array, then terminates the thread via pthread_exit.
// args points to this thread's struct thread_data.
void *processArray(void *args)
{
    struct thread_data *job = (struct thread_data *)args;
    int *values = job->arr;
    int idx = job->start;
    int last = job->end;

    // 1. Wait for a signal to start from the main thread

    while (idx <= last) {
        values[idx] += 1;
        idx++;
    }

    // 2. Signal to the main thread that you're done

    pthread_exit(NULL);
}
// Writes the first n elements of ints to stdout as "[a, b, c]" followed by
// a newline. The separator is emitted before every element except the first.
void print(int *ints, int n)
{
    const char *separator = "";
    int k;

    printf("[");
    for (k = 0; k < n; k++) {
        printf("%s%d", separator, ints[k]);
        separator = ", ";
    }
    printf("]\n");
}
I would like to achieve the following in the above code:
In main():
Signal to the threads to start working.
Wait for the background threads to finish.
In processArray():
Wait for a signal to start from the main thread
Signal to the main thread that you're done
I don't want to use a join in the main thread because in the real application, the main thread will create the threads once, and then it will signal to the background threads to work many times, and I can't let the main thread proceed unless all the background threads have finished working. In the processArray function, I will put an infinite loop as following:
// Looping variant of the thread body: on every pass it re-reads its slice
// description and adds 1 to each element in [start, end]. The loop has no
// exit, so the trailing pthread_exit is never reached as written.
// args points to this thread's struct thread_data.
void *processArray(void *args)
{
    struct thread_data *job = (struct thread_data *)args;

    for (;;)
    {
        // 1. Wait for a signal to start from the main thread
        int *values = job->arr;
        int idx = job->start;
        int last = job->end;

        // Process
        while (idx <= last) {
            values[idx] += 1;
            idx++;
        }

        // 2. Signal to the main thread that you're done
    }
    pthread_exit(NULL);
}
Note that I'm new to C and the posix API, so excuse me if I'm missing something obvious. But I really tried many things, starting from using a mutex, and an array of semaphores, and a mixture of both, but without success. I think a condition variable may help, but I couldn't understand how it could be used.
Thanks for your time.
Problem Solved:
Thank you guys so much! I was finally able to get this to work safely and without using a join by following your tips. Although the solution is somewhat ugly, it gets the job done and the performance gains are worth it (as you'll see below). For anyone interested, this is a simulation of the real application I'm working on, in which the main thread keeps giving work continuously to the background threads:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define NUM_OF_THREADS 5
// Work description handed to each thread: a slice of a shared int array.
struct thread_data {
int id;    // set by main to the thread's creation index
int start; // first index of the slice
int end;   // last index of the slice, as computed by main (startRange + amountOfWork - 1)
int *arr;  // the array being processed; the same pointer is given to every thread
};
pthread_mutex_t currentlyIdleMutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t currentlyIdleCond = PTHREAD_COND_INITIALIZER;
int currentlyIdle;
pthread_mutex_t workReadyMutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t workReadyCond = PTHREAD_COND_INITIALIZER;
int workReady;
pthread_cond_t currentlyWorkingCond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t currentlyWorkingMutex= PTHREAD_MUTEX_INITIALIZER;
int currentlyWorking;
pthread_mutex_t canFinishMutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t canFinishCond = PTHREAD_COND_INITIALIZER;
int canFinish;
void print(int *ints, int n);
void *processArray(void *args);
int validateResult(int *ints, int num, int start);
/*
 * Simulation driver.  Starts NUM_OF_THREADS persistent workers, each owning a
 * contiguous slice of a shared int array, then runs a fixed number of rounds.
 * In every round each worker increments its slice once; main() synchronises
 * with the workers through the four counters/flags declared at file scope.
 * After the last round the array is validated and the workers are cancelled.
 *
 * Returns 0 after a completed run and EXIT_FAILURE when the array or a worker
 * thread cannot be created.
 */
int main(int argc, const char * argv[])
{
    (void)argc;
    (void)argv;

    int numOfInts = 10;
    int *ints = malloc(numOfInts * sizeof *ints);
    if (ints == NULL) {
        fprintf(stderr, "Failed to allocate the work array.\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < numOfInts; i++) {
        ints[i] = i;
    }

    pthread_t threads[NUM_OF_THREADS];
    struct thread_data thread_data[NUM_OF_THREADS];

    workReady = 0;
    canFinish = 0;
    currentlyIdle = 0;
    currentlyWorking = 0;

    // Split [0, numOfInts) into NUM_OF_THREADS contiguous inclusive ranges.
    int remainingWork = numOfInts;
    int endRange = -1;

    // Create the threads and give each one its data struct.
    for (int i = 0; i < NUM_OF_THREADS; i++) {
        int amountOfWork = remainingWork / (NUM_OF_THREADS - i);
        int startRange = endRange + 1;
        endRange = startRange + amountOfWork - 1;

        thread_data[i].id = i;
        thread_data[i].arr = ints;
        thread_data[i].start = startRange;
        thread_data[i].end = endRange;

        // Without every worker the loop below would wait forever for
        // currentlyIdle to reach NUM_OF_THREADS, so give up right away.
        int rc = pthread_create(&threads[i], NULL, processArray, &thread_data[i]);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed for worker %d (error %d).\n", i, rc);
            free(ints);
            return EXIT_FAILURE;
        }
        remainingWork -= amountOfWork;
    }

    int loops = 1111111;
    int expectedStartingValue = ints[0] + loops; // used to validate the results

    // The elements in ints[] should be incremented by 1 in each loop.
    // NOTE: the statement order below is what makes the unlocked writes safe;
    // do not reorder.
    while (loops-- != 0) {
        // Make sure all of them are ready
        pthread_mutex_lock(&currentlyIdleMutex);
        while (currentlyIdle != NUM_OF_THREADS) {
            pthread_cond_wait(&currentlyIdleCond, &currentlyIdleMutex);
        }
        pthread_mutex_unlock(&currentlyIdleMutex);

        // Every worker has passed the canFinish gate of the previous round and
        // touches canFinish/currentlyWorking again only after it has seen
        // workReady == 1, so these two writes need no lock.
        canFinish = 0;
        currentlyWorking = NUM_OF_THREADS;

        // Signal to the threads to start
        pthread_mutex_lock(&workReadyMutex);
        workReady = 1;
        pthread_cond_broadcast(&workReadyCond);
        pthread_mutex_unlock(&workReadyMutex);

        // Wait for them to finish
        pthread_mutex_lock(&currentlyWorkingMutex);
        while (currentlyWorking != 0) {
            pthread_cond_wait(&currentlyWorkingCond, &currentlyWorkingMutex);
        }
        pthread_mutex_unlock(&currentlyWorkingMutex);

        // The workers are now waiting for permission to finish; reset the
        // start gate and the idle count before letting them loop around.
        workReady = 0;
        currentlyIdle = 0;

        // Allow them to finish
        pthread_mutex_lock(&canFinishMutex);
        canFinish = 1;
        pthread_cond_broadcast(&canFinishCond);
        pthread_mutex_unlock(&canFinishMutex);
    }

    if (validateResult(ints, numOfInts, expectedStartingValue)) {
        printf("Result correct.\n");
    }
    else {
        printf("Result invalid.\n");
    }

    // Clean up.  The workers are only cancelled, not joined: a thread that is
    // cancelled inside pthread_cond_wait() re-acquires the mutex first, so
    // joining is only safe when the worker installs cleanup handlers that
    // release that mutex.  The process exits right after this anyway.
    for (int i = 0; i < NUM_OF_THREADS; i++) {
        pthread_cancel(threads[i]);
    }
    free(ints);
    return 0;
}
/* Cancellation cleanup handler: releases the mutex passed as its argument. */
static void unlockMutexOnCancel(void *mutex)
{
    pthread_mutex_unlock(mutex);
}

/*
 * Worker thread entry point.
 *
 * args points to a struct thread_data describing the inclusive index range
 * [start, end] of arr owned by this worker.  The worker loops forever:
 *   1. report itself idle,
 *   2. wait until main() sets workReady,
 *   3. increment every element of its range by one,
 *   4. report completion by decrementing currentlyWorking,
 *   5. wait until main() sets canFinish.
 *
 * The thread never returns on its own; it is meant to be cancelled.  Both
 * condition waits are cancellation points, and a thread cancelled inside
 * pthread_cond_wait() re-acquires the mutex before it terminates.  The
 * cleanup handlers make sure that mutex is released again, so other workers
 * waiting on the same mutex are not left blocked behind a dead owner.
 */
void *processArray(void *args)
{
    struct thread_data *data = args;
    int *arr = data->arr;
    int start = data->start;
    int end = data->end;

    while (1) {
        // Set yourself as idle and signal to the main thread; when all
        // threads are idle main will start the next round.
        pthread_mutex_lock(&currentlyIdleMutex);
        currentlyIdle++;
        pthread_cond_signal(&currentlyIdleCond);
        pthread_mutex_unlock(&currentlyIdleMutex);

        // Wait for work from main.
        pthread_mutex_lock(&workReadyMutex);
        pthread_cleanup_push(unlockMutexOnCancel, &workReadyMutex);
        while (!workReady) {
            pthread_cond_wait(&workReadyCond, &workReadyMutex);
        }
        pthread_cleanup_pop(1); // runs the handler, i.e. unlocks workReadyMutex

        // Do the work.
        for (int i = start; i <= end; i++) {
            arr[i] = arr[i] + 1;
        }

        // Mark yourself as finished and signal to main.
        pthread_mutex_lock(&currentlyWorkingMutex);
        currentlyWorking--;
        pthread_cond_signal(&currentlyWorkingCond);
        pthread_mutex_unlock(&currentlyWorkingMutex);

        // Wait for permission to finish.
        pthread_mutex_lock(&canFinishMutex);
        pthread_cleanup_push(unlockMutexOnCancel, &canFinishMutex);
        while (!canFinish) {
            pthread_cond_wait(&canFinishCond, &canFinishMutex);
        }
        pthread_cleanup_pop(1); // runs the handler, i.e. unlocks canFinishMutex
    }

    return NULL; // not reached
}
/*
 * Checks that ints[0..n-1] holds the consecutive values start, start + 1, ...
 * Returns 1 when every element matches (also for n <= 0), 0 at the first
 * mismatch.
 */
int validateResult(int *ints, int n, int start)
{
    int idx = 0;

    while (idx < n) {
        if (ints[idx] != start + idx) {
            return 0;
        }
        idx++;
    }
    return 1;
}
/* Writes the n elements of ints to stdout as "[a, b, c]" followed by a newline. */
void print(int *ints, int n)
{
    printf("[");
    for (int pos = 0; pos < n; pos++) {
        // The separator goes in front of every element except the first.
        if (pos > 0) {
            printf(", ");
        }
        printf("%d", ints[pos]);
    }
    printf("]\n");
}
I'm not sure, though, whether pthread_cancel is enough for cleanup! As for the barrier, it would have been a great help if it weren't limited to some OSs, as mentioned by @Jeremy.
Benchmarks:
I wanted to make sure that all these conditions aren't actually slowing down the algorithm, so I've set up this benchmark to compare the two solutions:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
#define NUM_OF_THREADS 5
/* Slice of a shared array assigned to one worker thread. */
struct thread_data {
    int start;   /* first index of arr the worker updates (inclusive) */
    int end;     /* last index of arr the worker updates (inclusive) */
    int *arr;    /* array the worker operates on */
};
/*
 * Hand-shake state used by the persistent ("mutex") workers and by
 * doItWithMutex().  Every stage has a counter or flag plus the mutex and
 * condition variable used to wait on it.
 */

/* Stage 1: how many workers have reported themselves idle. */
pthread_mutex_t currentlyIdleMutex    = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  currentlyIdleCond     = PTHREAD_COND_INITIALIZER;
int             currentlyIdle;

/* Stage 2: non-zero once a new round of work has been published. */
pthread_mutex_t workReadyMutex        = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  workReadyCond         = PTHREAD_COND_INITIALIZER;
int             workReady;

/* Stage 3: how many workers are still processing the current round. */
pthread_mutex_t currentlyWorkingMutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  currentlyWorkingCond  = PTHREAD_COND_INITIALIZER;
int             currentlyWorking;

/* Stage 4: non-zero once the workers are allowed to end the round. */
pthread_mutex_t canFinishMutex        = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  canFinishCond         = PTHREAD_COND_INITIALIZER;
int             canFinish;

/* Entry point of a persistent worker driven by the hand-shake state above. */
void *processArrayMutex(void *args);
/* Entry point of a one-shot worker that is created and joined every round. */
void *processArrayJoin(void *args);
/* Run `loops` rounds on the persistent workers; returns the elapsed seconds. */
double doItWithMutex(pthread_t *threads, struct thread_data *data, int loops);
/* Run `loops` rounds with create/join per round; returns the elapsed seconds. */
double doItWithJoin(pthread_t *threads, struct thread_data *data, int loops);
/*
 * Benchmark driver.  Compares two ways of running NUM_OF_THREADS workers over
 * an int array: creating and joining threads every round (doItWithJoin) versus
 * persistent workers synchronised with mutexes and condition variables
 * (doItWithMutex).  Prints the average time of both and the relative gain of
 * the faster one.
 *
 * Returns 0 after a completed run and EXIT_FAILURE when an array or a
 * persistent worker cannot be created.
 */
int main(int argc, const char * argv[])
{
    (void)argc;
    (void)argv;

    int numOfInts = 10;
    int *join_ints = malloc(numOfInts * sizeof *join_ints);
    int *mutex_ints = malloc(numOfInts * sizeof *mutex_ints);
    if (join_ints == NULL || mutex_ints == NULL) {
        fprintf(stderr, "Failed to allocate the work arrays.\n");
        free(join_ints);
        free(mutex_ints);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < numOfInts; i++) {
        join_ints[i] = i;
        mutex_ints[i] = i;
    }

    pthread_t join_threads[NUM_OF_THREADS];
    pthread_t mutex_threads[NUM_OF_THREADS];
    struct thread_data join_thread_data[NUM_OF_THREADS];
    struct thread_data mutex_thread_data[NUM_OF_THREADS];

    workReady = 0;
    canFinish = 0;
    currentlyIdle = 0;
    currentlyWorking = 0;

    // Split [0, numOfInts) into NUM_OF_THREADS contiguous inclusive ranges and
    // start the persistent workers; the join workers are created per round.
    int remainingWork = numOfInts;
    int endRange = -1;
    for (int i = 0; i < NUM_OF_THREADS; i++) {
        int amountOfWork = remainingWork / (NUM_OF_THREADS - i);
        int startRange = endRange + 1;
        endRange = startRange + amountOfWork - 1;

        join_thread_data[i].arr = join_ints;
        join_thread_data[i].start = startRange;
        join_thread_data[i].end = endRange;

        mutex_thread_data[i].arr = mutex_ints;
        mutex_thread_data[i].start = startRange;
        mutex_thread_data[i].end = endRange;

        // doItWithMutex() waits until all NUM_OF_THREADS workers are idle, so
        // a missing worker would hang the benchmark.
        int rc = pthread_create(&mutex_threads[i], NULL, processArrayMutex, &mutex_thread_data[i]);
        if (rc != 0) {
            fprintf(stderr, "pthread_create failed for worker %d (error %d).\n", i, rc);
            free(join_ints);
            free(mutex_ints);
            return EXIT_FAILURE;
        }
        remainingWork -= amountOfWork;
    }

    int numOfBenchmarkTests = 100;
    int numberOfLoopsPerTest = 1000;
    double join_sum = 0.0, mutex_sum = 0.0;
    for (int i = 0; i < numOfBenchmarkTests; i++)
    {
        double joinTime = doItWithJoin(join_threads, join_thread_data, numberOfLoopsPerTest);
        double mutexTime = doItWithMutex(mutex_threads, mutex_thread_data, numberOfLoopsPerTest);
        join_sum += joinTime;
        mutex_sum += mutexTime;
    }

    double join_avg = join_sum / numOfBenchmarkTests;
    double mutex_avg = mutex_sum / numOfBenchmarkTests;
    printf("Join average : %f\n", join_avg);
    printf("Mutex average: %f\n", mutex_avg);

    // diff > 0 means the join variant took longer, i.e. mutex is faster.
    double diff = join_avg - mutex_avg;
    if (diff > 0.0)
        printf("Mutex is %.0f%% faster.\n", 100 * diff / join_avg);
    else if (diff < 0.0)
        // Negate diff so the reported gain is positive.
        printf("Join is %.0f%% faster.\n", 100 * -diff / mutex_avg);
    else
        printf("Both have the same performance.\n");

    free(join_ints);
    free(mutex_ints);
    return 0;
}
// From https://stackoverflow.com/a/2349941/408286
/*
 * Returns the current wall-clock time in seconds, with the microsecond part
 * of gettimeofday() as the fraction.
 *
 * The timezone argument is passed as NULL: POSIX leaves the behaviour
 * unspecified for a non-null pointer, and the value was never used here.
 */
double get_time(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return now.tv_sec + now.tv_usec * 1e-6;
}
/*
 * Runs num_loops rounds on the persistent workers and returns the elapsed
 * time in seconds as measured by get_time().
 *
 * The threads and data parameters are not referenced in this function; the
 * workers are driven only through the file-scope counters and flags.
 *
 * The statement order inside the loop matters: several shared variables are
 * written without holding their mutex, which relies on the workers being
 * parked at a known stage at that moment. Do not reorder.
 */
double doItWithMutex(pthread_t *threads, struct thread_data *data, int num_loops)
{
double start = get_time();
int loops = num_loops;
while (loops-- != 0) {
// Make sure all of them are ready
// (each worker increments currentlyIdle and signals currentlyIdleCond)
pthread_mutex_lock(&currentlyIdleMutex);
while (currentlyIdle != NUM_OF_THREADS) {
pthread_cond_wait(&currentlyIdleCond, &currentlyIdleMutex);
}
pthread_mutex_unlock(&currentlyIdleMutex);
// All threads are now blocked; it's safe to not lock the mutex.
// Prevent them from finishing before authorized.
canFinish = 0;
// Reset the number of currentlyWorking threads
currentlyWorking = NUM_OF_THREADS;
// Signal to the threads to start
pthread_mutex_lock(&workReadyMutex);
workReady = 1;
pthread_cond_broadcast(&workReadyCond );
pthread_mutex_unlock(&workReadyMutex);
// Wait for them to finish
// (each worker decrements currentlyWorking and signals currentlyWorkingCond)
pthread_mutex_lock(&currentlyWorkingMutex);
while (currentlyWorking != 0) {
pthread_cond_wait(&currentlyWorkingCond, &currentlyWorkingMutex);
}
pthread_mutex_unlock(&currentlyWorkingMutex);
// The threads are now waiting for permission to finish
// Prevent them from starting again
workReady = 0;
currentlyIdle = 0;
// Allow them to finish
pthread_mutex_lock(&canFinishMutex);
canFinish = 1;
pthread_cond_broadcast(&canFinishCond);
pthread_mutex_unlock(&canFinishMutex);
}
return get_time() - start;
}
/*
 * Runs num_loops rounds in which NUM_OF_THREADS threads are created, each
 * processes data[i] via processArrayJoin, and all of them are joined again.
 *
 * threads   - scratch array with room for NUM_OF_THREADS thread handles
 * data      - NUM_OF_THREADS work descriptions, one per thread
 * num_loops - number of rounds; zero or negative runs no round
 *
 * Returns the elapsed time in seconds as measured by get_time().  If a thread
 * cannot be created, the threads already started in that round are joined, an
 * error is reported on stderr and the remaining rounds are skipped.
 */
double doItWithJoin(pthread_t *threads, struct thread_data *data, int num_loops)
{
    double start = get_time();

    for (int round = 0; round < num_loops; round++) {
        // Create them, remembering how many actually started so that only
        // valid handles are joined below.
        int created = 0;
        while (created < NUM_OF_THREADS) {
            int rc = pthread_create(&threads[created], NULL, processArrayJoin, &data[created]);
            if (rc != 0) {
                fprintf(stderr, "pthread_create failed (error %d).\n", rc);
                break;
            }
            created++;
        }

        // Wait for every thread that was started.
        for (int i = 0; i < created; i++) {
            int rc = pthread_join(threads[i], NULL);
            if (rc != 0) {
                fprintf(stderr, "pthread_join failed (error %d).\n", rc);
            }
        }

        if (created < NUM_OF_THREADS) {
            break; // an incomplete round makes further timing meaningless
        }
    }
    return get_time() - start;
}
/*
 * Persistent worker.  args points to a struct thread_data giving the
 * inclusive index range of arr owned by this thread.  Each pass through the
 * loop reports the thread idle, waits for workReady, increments its slice,
 * reports completion and finally waits for canFinish.
 */
void *processArrayMutex(void *args)
{
    struct thread_data *job = (struct thread_data *)args;
    int *values = job->arr;
    int first = job->start;
    int last = job->end;

    for (;;) {
        // Step 1: announce that this worker is idle; main starts a round once
        // every worker has done so.
        pthread_mutex_lock(&currentlyIdleMutex);
        currentlyIdle++;
        pthread_cond_signal(&currentlyIdleCond);
        pthread_mutex_unlock(&currentlyIdleMutex);

        // Step 2: block until main publishes work.
        pthread_mutex_lock(&workReadyMutex);
        while (!workReady) {
            pthread_cond_wait(&workReadyCond, &workReadyMutex);
        }
        pthread_mutex_unlock(&workReadyMutex);

        // Step 3: bump every element of the owned slice.
        int idx = first;
        while (idx <= last) {
            values[idx] += 1;
            idx++;
        }

        // Step 4: tell main that this worker is done with the round.
        pthread_mutex_lock(&currentlyWorkingMutex);
        currentlyWorking--;
        pthread_cond_signal(&currentlyWorkingCond);
        pthread_mutex_unlock(&currentlyWorkingMutex);

        // Step 5: hold here until main allows the round to end.
        pthread_mutex_lock(&canFinishMutex);
        while (!canFinish) {
            pthread_cond_wait(&canFinishCond, &canFinishMutex);
        }
        pthread_mutex_unlock(&canFinishMutex);
    }

    pthread_exit(NULL);
}
/*
 * One-shot worker.  args points to a struct thread_data; every element of
 * arr in the inclusive range [start, end] is incremented once, then the
 * thread terminates through pthread_exit().
 */
void *processArrayJoin(void *args)
{
    struct thread_data *job = (struct thread_data *)args;
    int *values = job->arr;
    int last = job->end;
    int idx = job->start;

    // Do the work
    while (idx <= last) {
        values[idx] += 1;
        idx++;
    }

    pthread_exit(NULL);
}
And the output is:
Join average : 0.153074
Mutex average: 0.071588
Mutex is 53% faster.
Thank you again. I really appreciate your help!
There are several synchronization mechanisms you can use (condition variables, for example). I think the simplest would be to use a pthread_barrier to synchronize the start of the threads.
Assuming that you want all of the threads to 'sync up' on each loop iteration, you can just reuse the barrier. If you need something more flexible, a condition variable might be more appropriate.
When you decide it's time for the thread to wrap up (you haven't indicated how the threads will know to break out of the infinite loop - a simple shared variable might be used for that; the shared variable could be an atomic type or protected with a mutex), the main() thread should use pthread_join() to wait for all the threads to complete.
You need to use a different synchronization technique than join, that's clear.
Unfortunately you have a lot of options. One is a "synchronization barrier", which basically is a thing where each thread that reaches it blocks until they've all reached it (you specify the number of threads in advance). Look at pthread_barrier.
Another is to use a condition-variable/mutex pair (pthread_cond_*). When each thread finishes it takes the mutex, increments a count, signals the condvar. The main thread waits on the condvar until the count reaches the value it expects. The code looks like this:
// thread has finished
mutex_lock
++global_count
// optional optimization: only execute the next line when global_count >= N
cond_signal
mutex_unlock
// main is waiting for N threads to finish
mutex_lock
while (global_count < N) {
cond_wait
}
mutex_unlock
Another is to use a semaphore per thread -- when the thread finishes it posts its own semaphore, and the main thread waits on each semaphore in turn instead of joining each thread in turn.
You also need synchronization to re-start the threads for the next job -- this could be a second synchronization object of the same type as the first, with details changed for the fact that you have 1 poster and N waiters rather than the other way around. Or you could (with care) re-use the same object for both purposes.
If you've tried these things and your code didn't work, maybe ask a new specific question about the code you tried. All of them are adequate to the task.
You are working at the wrong level of abstraction. This problem has been solved already. You are reimplementing a work queue + thread pool.
OpenMP seems like a good fit for your problem. It converts #pragma annotations into threaded code. I believe it would let you express what you're trying to do pretty directly.
Using libdispatch, what you're trying to do would be expressed as a dispatch_apply targeting a concurrent queue. This implicitly waits for all child tasks to complete. Under OS X, it's implemented using a non-portable pthread workqueue interface; under FreeBSD, I believe it manages a group of pthreads directly.
If it is portability concerns driving you to use raw pthreads, don't use pthread barriers. Barriers are an additional extension over and above basic POSIX threads. OS X for example does not support it. For more, see POSIX.
Blocking the main thread till all child threads have completed can be done using a count protected by a condition variable or, even more simply, using a pipe and a blocking read where the number of bytes to read matches the number of threads. Each thread writes one byte on work completion, then sleeps till it gets new work from the main thread. The main thread unblocks once each thread has written its "I'm done!" byte.
Passing work to the child threads can be done using a mutex protecting the work-descriptor and a condition to signal new work. You could use a single array of work descriptors that all threads draw from. On signal, each one tries to grab the mutex. On grabbing the mutex, it would dequeue some work, signal anew if the queue is nonempty, and then process its work, after which it would signal completion to the master thread.
You could reuse this "work queue" to unblock the main thread by enqueueing the results, with the main thread waiting till the result queue length matches the number of threads; the pipe approach is just using a blocking read to do this count for you.
To tell all the threads to start working, it can be as simple as a global integer variable which is initialized to zero, and the threads simply wait until it's non-zero. This way you don't need the while (1) loop in the thread function.
For waiting until they are all done, pthread_join is simplest as it will actually block until the thread it's joining is done. It's also needed to clean up system stuff after the thread (like otherwise the return value from the thread will be stored for the remainder of the program). As you have an array of all pthread_t for the threads, just loop over them one by one. As that part of your program doesn't do anything else, and has to wait until all threads are done, just waiting for them in order is okay.

C: pthread performance woes. How can I make this code perform as expected?

I have created this little program to calculate pi using probability and ratios. In order to make it run faster I decided to give multithreading with pthreads a shot. Unfortunately, even after doing much searching around I was unable to solve the problem I have in that when I run the threadFunc function, with one thread, whether that be with a pthread, or just normally called from the calculate_pi_mt function, the performance is much better (at least twice or if not 3 times better) than when I try running it with two threads on my dual core machine. I have tried disabling optimizations to no avail. As far as I can see, when the thread is running it is using local variables apart from at the end when I have used a mutex lock to create the sum of hits...
Firstly are there any tips for creating code that will run better here? (ie style) because I'm just learning by trying this stuff.
And secondly would there be any reason for these obvious performance problems?
When running with number of threads set to 1, one of my cpus maxes out at 100%. When set to two, the second cpu rises to roughly 80%-90%, but all this extra work it is apparently doing is to no avail! Could it be the use of the rand() function?
/* State shared by calculate_pi_mt() and all of its worker threads. */
struct arguments {
    int n_threads;            /* count of threads created so far */
    int rays;                 /* number of rays each worker casts */
    int hits_in;              /* total hits; workers add to it under *mutex */
    pthread_mutex_t *mutex;   /* protects hits_in */
};
/*
 * Worker for the Monte Carlo pi estimate.
 *
 * arg points to the shared struct arguments.  The thread casts args->rays
 * random points into the unit square, counts those inside the quarter circle
 * of radius 1, and finally adds its count to args->hits_in under args->mutex.
 * Always returns NULL.
 *
 * rand() keeps hidden state shared by all threads, so it is not safe to call
 * concurrently and can serialise the workers on that state.  Each thread
 * therefore uses rand_r() with its own seed.
 */
void *threadFunc(void *arg)
{
    struct arguments *args = arg;
    const int rays = args->rays;
    int local_hits_in = 0;

    /* The address of a stack variable differs per thread, so threads started
       within the same second still get different seeds. */
    unsigned int seed = (unsigned int)time(NULL) ^ (unsigned int)(size_t)&seed;

    for (int n = 0; n < rays; n++) {
        double x = (double)rand_r(&seed) / (double)RAND_MAX;
        double y = (double)rand_r(&seed) / (double)RAND_MAX;

        /* Compare the squared distance; a square root is not needed to
           decide whether the point lies inside the unit circle. */
        if (x * x + y * y < 1.0) {
            local_hits_in++;
        }
    }

    pthread_mutex_lock(args->mutex);
    args->hits_in += local_hits_in;
    pthread_mutex_unlock(args->mutex);
    return NULL;
}
/*
 * Estimates pi with a Monte Carlo simulation spread over several threads.
 *
 * rays    - total number of random points to cast; if it is not a multiple of
 *           threads, a message is printed and the remainder is dropped
 * threads - number of worker threads, 1..MAXTHREADS
 *
 * Returns the estimate 4 * hits / rays_cast, where rays_cast counts only the
 * rays of threads that were actually started.  Returns -1.0 when threads is
 * out of range, when memory or the mutex cannot be set up, or when no ray was
 * cast.
 */
double calculate_pi_mt(int rays, int threads){
    /* thread_ary holds MAXTHREADS entries and rays is divided by threads. */
    if (threads < 1 || threads > MAXTHREADS) {
        printf("Error: number of threads must be between 1 and %d\n", MAXTHREADS);
        return -1.0;
    }

    srand((unsigned int)time(NULL));

    /* Integer remainder; the float comparison used before loses precision
       for large values and could miss an uneven split. */
    if (rays % threads != 0) {
        printf("Error: number of rays is not evenly divisible by threads\n");
    }
    const int rays_per_thread = rays / threads;

    /* argument initialization */
    struct arguments *args = malloc(sizeof *args);
    if (args == NULL) {
        printf("Error allocating thread arguments!\n");
        return -1.0;
    }
    args->hits_in = 0;
    args->rays = rays_per_thread;
    args->n_threads = 0;
    args->mutex = malloc(sizeof *args->mutex);
    if (args->mutex == NULL) {
        printf("Error allocating mutex!\n");
        free(args);
        return -1.0;
    }
    if (pthread_mutex_init(args->mutex, NULL)) {
        printf("Error creating mutex!\n");
        free(args->mutex);
        free(args);
        return -1.0;
    }

    pthread_t thread_ary[MAXTHREADS];
    int created = 0;
    while (created < threads) {
        if (pthread_create(&thread_ary[created], NULL, threadFunc, args)) {
            printf("Error when creating thread\n");
            break; /* only the threads started so far are joined below */
        }
        created++;
        args->n_threads = created;
        printf("Created Thread: %d\n", created);
    }

    for (int c = 0; c < created; c++) {
        printf("main waiting for thread %d to terminate...\n", c + 1);
        if (pthread_join(thread_ary[c], NULL)) {
            printf("Error while waiting for thread to join\n");
        }
        printf("Destroyed Thread: %d\n", c + 1);
    }

    /* Divide by the rays that were really cast, not by the requested total. */
    const int rays_cast = created * rays_per_thread;
    printf("Hits in %d\n", args->hits_in);
    printf("Rays: %d\n", rays_cast);

    double answer = -1.0;
    if (rays_cast > 0) {
        answer = 4.0 * (double)args->hits_in / (double)rays_cast;
    }

    /* freeing everything! */
    pthread_mutex_destroy(args->mutex);
    free(args->mutex);
    free(args);
    return answer;
}
There's a couple of problems I can see:
rand() is not thread-safe. Use drand48_r() (which generates a double in the range [0.0, 1.0) natively, which is what you want)
You only create one struct arguments structure, then try to use that for multiple threads. You need to create a separate one for each thread (just use an array).
Here's how I'd clean up your approach. Note how we don't need to use any mutexes - each thread just stashes its own return value in a separate location, and the main thread adds them up after the other threads have finished:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
#include <pthread.h>
/* Per-thread record: inputs for one worker and the slot for its result. */
struct thread_info {
    int thread_n;          /* index of the worker, also mixed into its seed */
    pthread_t thread_id;   /* handle filled in by pthread_create() */
    int rays;              /* number of rays this worker casts */
    int hits_in;           /* result: rays that landed inside the circle */
};
/*
 * Seeds a drand48_r() state buffer from the current time of day.  The seconds
 * value is multiplied by thread_n before the microseconds are added, so the
 * seed also depends on the caller's thread index.
 */
void seed_rand(int thread_n, struct drand48_data *buffer)
{
    struct timeval now;

    gettimeofday(&now, NULL);

    long seed = now.tv_sec * thread_n + now.tv_usec;
    srand48_r(seed, buffer);
}
void *threadFunc(void *arg)
{
struct thread_info *thread_info = arg;
struct drand48_data drand_buffer;
int n = 0;
const int rays = thread_info->rays;
int hits_in = 0;
double x;
double y;
double r;
seed_rand(thread_info->thread_n, &drand_buffer);
for (n = 0; n < rays; n++)
{
drand48_r(&drand_buffer, &x);
drand48_r(&drand_buffer, &y);
r = x * x + y * y;
if (r < 1.0){
hits_in++;
}
}
thread_info->hits_in = hits_in;
return NULL;
}
/*
 * Estimates pi with a Monte Carlo simulation spread over several threads.
 * Every worker writes its hit count into its own struct thread_info, so no
 * locking is needed; the counts are summed after each join.
 *
 * rays    - total number of random points to cast; if it is not a multiple of
 *           threads, a message is printed and the remainder is dropped
 * threads - number of worker threads, at least 1
 *
 * Returns 4 * hits / rays_cast, where rays_cast counts only the rays of
 * threads that were actually started.  Returns -1.0 when threads is below 1,
 * when the per-thread records cannot be allocated, or when no ray was cast.
 */
double calculate_pi_mt(int rays, int threads)
{
    /* rays % threads and rays / threads are undefined for threads == 0. */
    if (threads < 1) {
        printf("Error: number of threads must be at least 1\n");
        return -1.0;
    }

    if (rays % threads) {
        printf("Error: number of rays is not evenly divisible by threads\n");
        rays = (rays / threads) * threads;
    }
    const int rays_per_thread = rays / threads;

    /* argument initialization */
    struct thread_info *thr = malloc(threads * sizeof thr[0]);
    if (thr == NULL) {
        printf("Error allocating thread info\n");
        return -1.0;
    }

    int created = 0;
    for (int c = 0; c < threads; c++) {
        thr[c].thread_n = c;
        thr[c].rays = rays_per_thread;
        thr[c].hits_in = 0;
        if (pthread_create(&thr[c].thread_id, NULL, threadFunc, &thr[c])) {
            printf("Error when creating thread\n");
            break; /* thread_id is not valid; join only the earlier threads */
        }
        printf("Created Thread: %d\n", thr[c].thread_n);
        created++;
    }

    int hits_in = 0;
    for (int c = 0; c < created; c++) {
        printf("main waiting for thread %d to terminate...\n", c);
        if (pthread_join(thr[c].thread_id, NULL)) {
            printf("Error while waiting for thread to join\n");
        }
        hits_in += thr[c].hits_in;
        /* Same zero-based index as in the other two messages. */
        printf("Destroyed Thread: %d\n", c);
    }

    /* Divide by the rays that were really cast, not by the requested total. */
    const int rays_cast = created * rays_per_thread;
    printf("Hits in %d\n", hits_in);
    printf("Rays: %d\n", rays_cast);

    double answer = -1.0;
    if (rays_cast > 0) {
        answer = (4.0 * hits_in) / rays_cast;
    }

    free(thr);
    return answer;
}
You're using far too many synchronization primitives. You should sum the local_hits at the end in the main thread, and not use a mutex to update it in an asynchronous fashion. Or, at least, you could use an atomic operation (it's just an int) to do it instead of locking an entire mutex to update one int.
Threading has a cost. It may be that, as your useful computing code looks very simple, the cost of thread management (cost paid when changing thread and synchronisation cost) is much higher than the benefit.

Resources