I have a simple question for you. I made this code to calculate the factorial of a number without recursion.
int fact2(int n){
int aux=1, total = 1;
int i;
int limit = n - 1;
for (i=1; i<=limit; i+=2){
aux = i*(i+1);
total = total*aux;
}
for (;i<=n;i++){
total = total*i;
}
return total;
}
As you can see, my code uses loop unrolling to optimize clock cycles in the execution. Now I'm asked to add two-way parallelism to the same code, any idea how?
You can use ptherads library to create two separate threads. Each thread should do half of the multiplications. I could put together following solution.
#include <pthread.h>
typedef struct {
int id;
int num;
int *result;
} thread_arg_t;
void* thread_func(void *arg) {
int i;
thread_arg_t *th_arg = (thread_arg_t *)arg;
int start, end;
if(th_arg->id == 0) {
start = 1;
end = th_arg->num/2;
} else if (th_arg->id == 1) {
start = th_arg->num / 2;
end = th_arg->num + 1;
} else {
return NULL;
}
for(i=start; i < end; i++) {
th_arg->result[th_arg->id] *= i;
}
return NULL;
}
int factorial2(int n) {
pthread_t threads[2];
int rc;
int result[2];
thread_arg_t th_arg[2];
for(i=0; i<2; i++) {
th_arg[i].id = i;
th_arg[i].num = n;
th_arg[i].result = result;
rc = pthread_create(&threads[i], NULL, thread_func, (void *)&th_arg[i]);
if (rc){
printf("pthread_create() failed, rc = %d\n", rc);
exit(1);
}
}
/* wait for threads to finish */
for(i=0; i<2; i++) {
pthread_join(thread[i], NULL);
/* compute final one multiplication */
return (result[0] * result[1]);
}
The pthread library implementation should take care of parallelizing the work of two threads for you. Also, this example can be generalized for N threads with minor modifications.
Related
It's been a few hours and i can't seem to understand the issue. Build this program to count from 1 - 10. The goal of this program is to use multithreading and dynamically split the array depending on how many threads it requested. Problem is the first 2 threads are being skipped and the last thread is doing most of th e process. I suspect it's the for loop that creates the threads.
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
typedef struct
{
int *array;
int batch;
int start;
int end;
} Parameter;
void *method(void *p)
{
Parameter *param = (Parameter *)p;
for (int i = param->start; i < param->end; i++)
{
printf("Start:%d\tEnd:%d\tIndex:%d\tValue:%d\n", param->start, param->end, i,param->array[i]);
}
}
int main(int argc, char **argv)
{
// Getting the user input
int array_length = atoi(argv[1]);
int batches = atoi(argv[2]);
printf("User specified Array:%d\tBatch:%d\n", array_length, batches);
// Creating an array
int *array = (int *)calloc(array_length, sizeof(int));
// Fill it up with some data
for (int i = 0; i < array_length; i++)
{
array[i] = i;
}
// Determine the Batches
int batch_size = array_length / batches;
int remainder = array_length % batches;
printf("%d\n", batch_size);
printf("%d\n", remainder);
int start = 0;
int end = 0;
int index =0;
// List of parameters
Parameter *param = (Parameter *)calloc(batches, sizeof(Parameter));
pthread_t *threads = (pthread_t *)calloc(batches, sizeof(pthread_t));
// Loop through each batch.
for (int i = 0; i < batches; i++)
{
printf("\n\nBatch number -> %d\n", i);
end = start + batch_size;
if (remainder > 0)
{
remainder --;
end ++;
}
// Fill the parameters
param[i].array = array;
param[i].end = end;
param[i].start = start;
param[i].batch = i;
// Call the thread.
pthread_create(threads + index, NULL, method, (void *)¶m[i]);
index++;
start = end;
}
for (int i = 0; i < batches; i++)
{
pthread_join(threads[i], NULL);
}
free(param);
free(threads);
free(array);
return 0;
}
Been playing with the index of the for loop(line 57) as i'm certain it's the cause of the issue. been getting some results but the main problem still persisted.
Code Works as intended. I'm a dumbas who didn't put the printf in the void function. like so:
void *method(void *p) {
Parameter *param = (Parameter *)p;
printf("\n\nBatch number -> %d\n", param->batch); //<-- moved from main method
for (int i = param->start; i < param->end; i++)
{
printf("Start:%d\tEnd:%d\tIndex:%d\tValue:%d\n", param->start, param->end, i,param->array[i]);
} }
Thanks for pointing it out that the program works
This is a multi-threaded program that outputs prime numbers. The user runs the program and enters a number into the command line. It creates a separate thread that outputs all the prime numbers less than or equal to the number entered by the user.
I have an error: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] I'm so close but I've been staring at this for awhile now. I thought I would get some feedback.
How can I fix this? It is referring to the void here:
(void *)count);
Here is all the code:
#include <stdio.h>
#include <pthread.h>
int N = 100; //number of promes to be generated
int prime_arr[100000] = {0}; //prime arrray
void *printprime(void *ptr) //thread function
{
int j, flag;
int i = (int)(long long int)ptr; //getting thread number
//for thread 0, we check for all primes 0,4,8,12
//for thread 1, we check for all primes 1,5,9,13
while (i < N) { //while number in range
flag = 0; //check if i has factor
for (j = 2; j <= i / 2; j++) //factor can be at max i/2 value
{
if (i % j == 0) //factor found
{
flag = 1;
break;
}
}
if (flag == 0 && (i > 1)) //prime found, no factor
{
prime_arr[i] = 1;
}
i += 4; //increase by interval of 4
}
}
int main()
{
printf("Enter N: ");
scanf("%d", &N); //input N
pthread_t tid[4] = {0}; //create an array of 4 threads
int count = 0;
for (count = 0; count < 4; count++) //initialize threads and start
{
printf("\r\n CREATING THREADS %d", count);
pthread_create(&tid[count], NULL, printprime,(void *)count); //count is passed as argument, target = printprime
}
printf("\n");
for (count = 0; count < 4; count++)
{
pthread_join(tid[count], NULL); //while all thread havent finished
}
int c = 0;
for (count = 0; count < N; count++) //print primes
if (prime_arr[count] == 1)
printf("%d ", count);
printf("\n");
return 0;
}
Here you cast count to a void* which isn't a compatible type.
pthread_create(&tid[count], NULL, printprime, (void*) count);
And here you try to convert it back to an int improperly:
int i = (int)(long long int)ptr;
I suggest creating workpackages, tasks that you instead use and cast proberly to void* and back.
Example:
#include <pthread.h>
#include <stdio.h>
typedef struct {
pthread_t tid;
int count;
} task_t;
void *printprime(void *ptr) {
task_t *task = ptr;
task->count += 10; // do some work
return NULL;
}
#define TASKS (4)
int main() {
task_t tasks[TASKS] = {0}; // an array of tasks
for (int count = 0; count < TASKS; ++count) {
tasks[count].count = count; // fill the task with some job
pthread_create(&tasks[count].tid, NULL, printprime, &tasks[count]);
}
// join and take care of result from all threads
for (int count = 0; count < TASKS; ++count) {
pthread_join(tasks[count].tid, NULL);
printf("task %d value = %d\n", count, tasks[count].count);
}
}
Demo
Use a uintptr_t or a intptr_t instead of an int.
Technically, that's for storing a pointer in an integer, not for storing an integer in a pointer. So it's not exactly kosher. But it's still a common practice.
To do it properly, you would need to (statically or dynamically) allocate a variable for each thread, and pass the address of that variable to the thread.
I want to read as input a table A and B from a user , and make an inner product space from them (a1b1+a2b2+……+anbn) and save it in a local_sum and then share it to an total_sum variable. I am doing the bellow code , but there is a segment fault. For some reason table A & B can't pass to function MUL. Any help would be great, thank you!
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#define N 2
int p;
int A[N],B[N];
int local_sum;
void *mul(void *arg)
{
int lines, start, end, i, j;
int id = *(int*)arg;
lines = N / p;
start = id * lines;
end = start + lines;
for (i = start; i < end; i++)
local_sum = A[i] * B[i] + local_sum;
return NULL;
}
int main (int argc, char *argv[])
{
int i;
pthread_t *tid;
if (argc != 2)
{
printf("Provide number of threads.\n");
exit(1);
}
p = atoi(argv[1]);
tid = (pthread_t *)malloc(p * sizeof(pthread_t));
if (tid == NULL)
{
printf("Could not allocate memory.\n");
exit(1);
}
printf("Give Table A\n");
for (int i = 0; i < N; i++)
{
scanf("%d", &A[i]);
}
printf("Give Table B\n");
for (int i = 0; i < N; i++)
{
scanf("%d", &B[i]);
}
for (i = 0; i < p; i++)
{
int *a;
a = malloc(sizeof(int));
*a = 0;
pthread_create(&tid[i], NULL, mul, a);
}
for (i = 0; i < p; i++)
pthread_join(tid[i], NULL);
printf("%d", local_sum);
return 0;
}
Let's see:
You want to have p threads, working on the vectors A and B.
You must be aware of that threads share the same memory, and might be interrupted at any time.
You've got p threads, all trying to write to one shared variable local_sum. This leads to unpredictable results since one thread overwrites the value another thread has written there before.
You can bypass this problem by ensuring exclusive access of one single thread to this variable by using a mutex or the like, or you could have one variable per thread, have each thread produce an intermediate result and after joining all threads, collapse all your intermediate results into the final one.
To do this, your main should look something like (assuming your compiler supports a recent C standard):
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#define N 2
/* these are variables shared amongst all threads */
int p;
int A[N], B[N];
/* array with one slot per thread to receive the partial result of each thread */
int* partial_sum;
/* prototype of thread function, just to be independent of the place mul will be placed in the source file... */
void *mul(void *arg);
int main (int argc, char** argv)
{
pthread_t* tid;
p = atoi(argv[1]);
const size_t n_by_p = N/p;
if(n_by_p * p != N)
{
fprintf(stderr, "Number of threads must be an integral factor of N\n");
exit(EXIT_FAILURE) ;
}
tid = calloc(p, sizeof(pthread_t));
partial_sum = calloc(p, sizeof(int)) ;
printf("Give Table A\n");
for(size_t i = 0; i < N; ++i)
{
scanf("%d",&A[i]);
}
printf("Give Table B\n");
for(size_t i = 0; i < N; ++i)
{
scanf("%d",&B[i]);
}
for (size_t i =0; i < p; ++i)
{
/* clumsy way to pass a thread it's slot number, but works as a starter... */
int *a;
a = malloc(sizeof(int));
*a = i;
pthread_create(&tid[i], 0, mul, a);
}
for (size_t i = 0; i < p; ++i)
{
pthread_join(tid[i], 0);
}
free(tid);
tid = 0;
int total_sum = 0;
for (size_t i = 0; i < p; ++i)
{
total_sum += partial_sum[i] ;
}
free(partial_sum);
partial_sum = 0;
printf("%d",total_sum);
return EXIT_SUCCESS;
}
Your threaded method mul should now write to its particular partial_sum slot only :
void *mul(void *arg)
{
int slot_num = *(int*)arg;
free(arg);
arg = 0;
const size_t lines = N/p;
const size_t start = slot_num * lines;
const size_t end = start + lines;
partial_sum[slot_num] = 0;
for(size_t i = start; i < end; ++i)
{
partial_sum[slot_num] += A[i]*B[i];
}
return 0;
}
Beware: This code runs smoothly, only if N is some integral multiple of p.
If this condition is not met, due to truncation in N/p, not all elements of the vectors will be processed.
However, fixing these cases is not the core of this question IMHO.
I spared all kinds of error-checking, which you should add, should this code become part of some operational setup...
if (tid=NULL)
-->
if (tid==NULL)
and
for (i=start;i<end;i++)
I suppose we need
for (i=0;i<end-start;i++)
Issue
Hello everyone, I have got a program (from the net) that I intend to speed up by converting it into its parallel version with the use of pthreads. But surprisingly though, it runs slower than the serial version. Below is the program:
# include <stdio.h>
//fast square root algorithm
double asmSqrt(double x)
{
__asm__ ("fsqrt" : "+t" (x));
return x;
}
//test if a number is prime
bool isPrime(int n)
{
if (n <= 1) return false;
if (n == 2) return true;
if (n%2 == 0) return false;
int sqrtn,i;
sqrtn = asmSqrt(n);
for (i = 3; i <= sqrtn; i+=2) if (n%i == 0) return false;
return true;
}
//number generator iterated from 0 to n
int main()
{
n = 1000000; //maximum number
int k,j;
for (j = 0; j<= n; j++)
{
if(isPrime(j) == 1) k++;
if(j == n) printf("Count: %d\n",k);
}
return 0;
}
First attempt for parallelization
I let the pthread manage the for loop
# include <stdio.h>
.
.
int main()
{
.
.
//----->pthread code here<----
for (j = 0; j<= n; j++)
{
if(isPrime(j) == 1) k++;
if(j == n) printf("Count: %d\n",k);
}
return 0;
}
Well, it runs slower than the serial one
Second attempt
I divided the for loop into two threads and run them in parallel using pthreads
However, it still runs slower, I am intending that it may run about twice as fast or well faster. But its not!
These is my parallel code by the way:
# include <stdio.h>
# include <pthread.h>
# include <cmath>
# define NTHREADS 2
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER;
int k = 0;
double asmSqrt(double x)
{
__asm__ ("fsqrt" : "+t" (x));
return x;
}
struct arg_struct
{
int initialPrime;
int nextPrime;
};
bool isPrime(int n)
{
if (n <= 1) return false;
if (n == 2) return true;
if (n%2 == 0) return false;
int sqrtn,i;
sqrtn = asmSqrt(n);
for (i = 3; i <= sqrtn; i+=2) if (n%i == 0) return false;
return true;
}
void *parallel_launcher(void *arguments)
{
struct arg_struct *args = (struct arg_struct *)arguments;
int j = args -> initialPrime;
int n = args -> nextPrime - 1;
for (j = 0; j<= n; j++)
{
if(isPrime(j) == 1)
{
printf("This is prime: %d\n",j);
pthread_mutex_lock( &mutex1 );
k++;
pthread_mutex_unlock( &mutex1 );
}
if(j == n) printf("Count: %d\n",k);
}
pthread_exit(NULL);
}
int main()
{
int f = 100000000;
int m;
pthread_t thread_id[NTHREADS];
struct arg_struct args;
int rem = (f+1)%NTHREADS;
int n = floor((f+1)/NTHREADS);
for(int h = 0; h < NTHREADS; h++)
{
if(rem > 0)
{
m = n + 1;
rem-= 1;
}
else if(rem == 0)
{
m = n;
}
args.initialPrime = args.nextPrime;
args.nextPrime = args.initialPrime + m;
pthread_create(&thread_id[h], NULL, ¶llel_launcher, (void *)&args);
pthread_join(thread_id[h], NULL);
}
// printf("Count: %d\n",k);
return 0;
}
Note:
OS: Fedora 21 x86_64,
Compiler: gcc-4.4,
Processor: Intel Core i5 (2 physical core, 4 logical),
Mem: 6 Gb,
HDD: 340 Gb,
You need to split the range you are examining for primes up into n parts, where n is the number of threads.
The code that each thread runs becomes:
typedef struct start_end {
int start;
int end;
} start_end_t;
int find_primes_in_range(void *in) {
start_end_t *start_end = (start_end_t *) in;
int num_primes = 0;
for (int j = start_end->start; j <= start_end->end; j++) {
if (isPrime(j) == 1)
num_primes++;
}
pthread_exit((void *) num_primes;
}
The main routine first starts all the threads which call find_primes_in_range, then calls pthread_join for each thread. It sums all the values returned by find_primes_in_range. This avoids locking and unlocking a shared count variable.
This will parallelize the work, but the amount of work per thread will not be equal. This can be addressed but is more complicated.
The main design flaw: you must let each thread have its own private counter variable instead of using the shared one. Otherwise they will spend far more time waiting on and handling that mutex, than they will do on the actual calculation. You are essentially forcing the threads to execute in serial.
Instead, sum everything up with a private counter variable and once a thread is done with its work, return the counter variable and sum them up in main().
Also, you should not call printf() from inside the threads. If there is a context switch in the middle of a printf call, you'll end up with crappy output such as This is This is prime: 2. In which case you must synchronize the printf calls between threads, which will slow the program down again. Also, the printf() calls themselves are likely 90% of the work that the thread is doing. So some sort of re-design of who does the printing might be a good idea, depending on what you want to do with the results.
Summary
Indeed, the use of PThread speed up my code. It was my programming flaw of placing pthread_join right after the first pthread_create and the common counter I have set on arguments. After fixing this up, I tested my parallel code to determine the primality of 100 Million numbers then compared its processing time with a serial code. Below are the results.
http://i.stack.imgur.com/gXFyk.jpg (I could not attach the image as I don't have much reputation yet, instead, I am including a link)
I conducted three trials for each to account for the variations caused by different OS activities. We got speed up for utilizing parallel programming with PThread. What is surprising is a PThread code running in ONE thread was a bit faster than purely serial code. I could not explain this one, nevertheless using PThreads is well, surely worth a try.
Here is the corrected parallel version of the code (gcc-c++):
# include <stdio.h>
# include <pthread.h>
# include <cmath>
# define NTHREADS 4
double asmSqrt(double x)
{
__asm__ ("fsqrt" : "+t" (x));
return x;
}
struct start_end_f
{
int start;
int end;
};
//test if a number is prime
bool isPrime(int n)
{
if (n <= 1) return false;
if (n == 2) return true;
if (n%2 == 0) return false;
int sqrtn = asmSqrt(n);
for (int i = 3; i <= sqrtn; i+=2) if (n%i == 0) return false;
return true;
}
//executes the tests for prime in a certain range, other threads will test the next range and so on..
void *find_primes_in_range(void *in)
{
int k = 0;
struct start_end_f *start_end_h = (struct start_end_f *)in;
for (int j = start_end_h->start; j < (start_end_h->end +1); j++)
{
if(isPrime(j) == 1) k++;
}
int *t = new int;
*t = k;
pthread_exit(t);
}
int main()
{
int f = 100000000; //maximum number to be tested for prime
pthread_t thread_id[NTHREADS];
struct start_end_f start_end[NTHREADS];
int rem = (f+1)%NTHREADS;
int n = (f+1)/NTHREADS;
int rem_change = rem;
int m;
if(rem>0) m = n+1;
else if(rem == 0) m = n;
//distributes task 'evenly' to the number of parallel threads requested
for(int h = 0; h < NTHREADS; h++)
{
if(rem_change > 0)
{
start_end[h].start = m*h;
start_end[h].end = start_end[h].start+m-1;
rem_change -= 1;
}
else if(rem_change<= 0)
{
start_end[h].start = m*(h+rem_change)-rem_change*n;
start_end[h].end = start_end[h].start+n-1;
rem_change -= 1;
}
pthread_create(&thread_id[h], NULL, find_primes_in_range, &start_end[h]);
}
//retreiving returned values
int *t;
int c = 0;
for(int h = 0; h < NTHREADS; h++)
{
pthread_join(thread_id[h], (void **)&t);
int b = *((int *)t);
c += b;
b = 0;
}
printf("\nNumber of Primes: %d\n",c);
return 0;
}
I'm pretty new to threads and would like some insight. I'm trying to get the percentage each thread has completed for its calculation. Each thread will report its percentage to a different element of the same array. I have this working with pthread_join immediately after pthread_create and a separate thread for reading all the values of the array and printing the percentage but when I have all threads running after each other without waiting for the previous one to finish I get some weird behavior. This is how I'm accessing the shared (global) array.
//global
int *currentProgress;
//main
currentProgress = malloc(sizeof(int)*threads);
for(i=0; i<threads; i++)
currentProgress[i] = 0;
//child threads
currentProgress[myId] = (int)percent; //myId is unique
//progress thread
for(i=0; i<threads; i++)
progressTotal += currentProgress[i];
progressTotal /= threads;
printf("Percent: %d", progressTotal);
This is essentially the code I think is not being used correctly for multi-threads. When I print out the state of the shared array, I notice that as soon as another thread starts accessing the array (different element though), the previous element immediately goes to some random number... -2147483648 and when the latter element finishes the prior element continues like normal. Should I be using semaphores for this? I thought I could access different elements of an array at the same time and I thought reading them wasn't an issue.
This is the entire code:
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <stdint.h>
#include <pthread.h>
#include <string.h>
#define STDIN 0
int counter = 0;
uint64_t *factors;
void *getFactors(void *arg);
void *deleteThreads(void *arg);
void *displayProgressThread(void *arg);
int *currentProgress;
struct data
{
uint64_t num;
uint64_t incrS;
uint64_t incrF;
int threads;
int member;
} *args;
int main(int argc, char *argv[])
{
if(argc < 3) {printf("not enough arguments"); exit(1);}
int i;
int threads = atoi(argv[2]);
pthread_t thread_id[threads];
pthread_t dThread;
currentProgress = malloc(sizeof(int)*threads);
for(i=0; i<threads; i++)
currentProgress[i] = 0;
args = (struct data*)malloc(sizeof(struct data));
args->num = atoll(argv[1]);
args->threads = threads;
uint64_t increment = (uint64_t)sqrt((uint64_t)args->num)/threads;
factors = (uint64_t*)malloc(sizeof(uint64_t)*increment*threads);
pthread_create(&dThread, NULL, displayProgressThread, (void*)args);
//for the id of each thread
args->member = 0;
for(i=0; i<threads; i++)
{
args->incrS = (i)*increment +1;
args->incrF = (i+1)*increment +1;
pthread_create(&thread_id[i], NULL, getFactors, (void*)args);
usleep(5);
}
for(i=0; i<threads; i++)
{
pthread_join(thread_id[i], NULL);
}
sleep(1);
printf("done\n");
for (i=0; i<counter; i++)
printf("\n%llu : %llu", factors[++i], factors[i]);
return 0;
}
void *getFactors(void *arg)
{
uint64_t count;
int myId;
int tempCounter = 0, i;
struct data *temp = (struct data *) arg;
uint64_t number = temp->num;
float total = temp->incrF - temp->incrS, percent;
myId = temp->member++;
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
for(count=temp->incrS; count<=temp->incrF; count++)
{
percent = (float)(count-temp->incrS)/total*100;
currentProgress[myId] = (int)percent;
if (number%count == 0)
{
factors[counter++] = count;
factors[counter++] = number/count;
}
usleep(1);
}
usleep(1);
pthread_exit(NULL);
}
void *displayProgressThread(void *arg)
{
struct data *temp = (struct data *) arg;
int toDelete = 0;
while(1)
{
int i;
int progressTotal = 0;
char *percent = malloc(sizeof(char)*20);
for(i=0; i<toDelete; i++)
printf("\b \b");
for(i=0; i<temp->threads; i++){
progressTotal += currentProgress[i];
}
progressTotal /= temp->threads;
printf("|");
for(i=0; i<50; i++)
if(i<progressTotal/2)
printf("#");
else
printf("_");
printf("| ");
sprintf(percent, "Percent: %d", progressTotal);
printf("%s", percent);
toDelete = 53 + strlen(percent);
usleep(1000);
fflush(stdout);
if(progressTotal >= 100)
pthread_exit(NULL);
}
}
There are some non synchronized pieces of code that are accessed by the threads which cause this problem.
One first place to be synchronized is:
myId = temp->member++;
But more importantly is that, the main thread is doing:
args->incrS = (i)*increment +1;
args->incrF = (i+1)*increment +1;
while at the same time in the threads:
for(count=temp->incrS; count<= temp->incrF; count++)
{
percent = (float)(count-temp->incrS)/total*100;
currentProgress[myId] = (int)percent;
if (number%count == 0)
{
factors[counter++] = count;
factors[counter++] = number/count;
}
usleep(1);
}
The unsynchronized accesses mentioned above affect the calculation of percent value which results in such abnormal happenings. You have to do synchronization in all these places in order to get the kind of behavior you would expect.