Measuring speed up of a multi threaded C program (implementation using Pthreads) - c

I currently have a multi-threaded C program coded using Pthreads which uses 2 threads. I want to increase the no. of threads and measure speed up upon doing so. I would like to run my code in an automated manner where the no. of threads used keeps getting incremented and I want to graphically display running times of my code. I would love it if I could get a clue in on how to do so especially on how to automate the entire process and plotting it graphically. Here is my code:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_THREADS 2
#define VECTOR_SIZE 40
struct DOTdata
{
/* data */
long X[VECTOR_SIZE];
long Y[VECTOR_SIZE];
long sum;
long compute_length;
};
struct DOTdata dotstr;
pthread_mutex_t mutex_sum;
void *calcDOT(void *);
int main(int argc, char *argv[])
{
long vec_index;
for(vec_index = 0 ; vec_index < VECTOR_SIZE ; vec_index++){
dotstr.X[vec_index] = vec_index + 1;
dotstr.Y[vec_index] = vec_index + 2;
}
dotstr.sum = 0;
dotstr.compute_length = VECTOR_SIZE/NUM_THREADS;
pthread_t call_thread[NUM_THREADS];
pthread_attr_t attr;
void *status;
pthread_mutex_init(&mutex_sum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
long i;
for(i = 0 ; i < NUM_THREADS ; i++){
pthread_create(&call_thread[i], &attr, calcDOT, (void *)i);
}
pthread_attr_destroy(&attr);
for (i = 0 ; i < NUM_THREADS ; i++){
pthread_join(call_thread[i], &status);
}
printf("Resultant X*Y is %ld\n", dotstr.sum);
pthread_mutex_destroy(&mutex_sum);
pthread_exit(NULL);
}
void *calcDOT(void *thread_id)
{
long vec_index;
long start_index;
long end_index;
long length;
long offset;
long sum = 0;
offset = (long)thread_id;
length = dotstr.compute_length;
start_index = offset * length;
end_index = (start_index + length) - 1;
for(vec_index = start_index ; vec_index < end_index ; vec_index++){
sum += (dotstr.X[vec_index] * dotstr.Y[vec_index]);
}
pthread_mutex_lock(&mutex_sum);
dotstr.sum += sum;
pthread_mutex_unlock(&mutex_sum);
pthread_exit((void *)thread_id);
}
I would like to increment my NUM_THREADS parameter and run it after each increment, record the execution time after each increment and plot a graph of execution time vs number of threads.

I tried a naive approach by increasing the number of threads, timing it with time.h and plotting it with gnuplot. Each iteration we double the number of threads and we print the time for an iteration. We use gnuplot to display a graph with number of threads on the x-axis and execution time on the y-axis
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NUM_THREADS 2
#define VECTOR_SIZE 40
struct DOTdata {
/* data */
long X[VECTOR_SIZE];
long Y[VECTOR_SIZE];
long sum;
long compute_length;
};
struct DOTdata dotstr;
pthread_mutex_t mutex_sum;
void *calcDOT(void *);
int main(int argc, char *argv[]) {
double xvals[VECTOR_SIZE / NUM_THREADS];
double yvals[VECTOR_SIZE / NUM_THREADS];
int index = 0;
for (int count = NUM_THREADS; count < VECTOR_SIZE / NUM_THREADS; count = count * 2) {
clock_t begin = clock();
long vec_index;
for (vec_index = 0; vec_index < VECTOR_SIZE; vec_index++) {
dotstr.X[vec_index] = vec_index + 1;
dotstr.Y[vec_index] = vec_index + 2;
}
dotstr.sum = 0;
dotstr.compute_length = VECTOR_SIZE / count;
pthread_t call_thread[count];
pthread_attr_t attr;
void *status;
pthread_mutex_init(&mutex_sum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
long i;
for (i = 0; i < count; i++) {
pthread_create(&call_thread[i], &attr, calcDOT, (void *) i);
}
pthread_attr_destroy(&attr);
for (i = 0; i < count; i++) {
pthread_join(call_thread[i], &status);
}
printf("Resultant X*Y is %ld\n", dotstr.sum);
pthread_mutex_destroy(&mutex_sum);
clock_t end = clock();
double time_spent = (double) (end - begin) / CLOCKS_PER_SEC;
printf("time spent: %f NUM_THREADS: %d\n", time_spent, count);
xvals[index] = count;
yvals[index] = time_spent;
index++;
}
FILE * gnuplotPipe = popen ("gnuplot -persistent", "w");
fprintf(gnuplotPipe, "plot '-' \n");
for (int i = 0; i < VECTOR_SIZE / NUM_THREADS; i++)
{
fprintf(gnuplotPipe, "%lf %lf\n", xvals[i], yvals[i]);
}
fprintf(gnuplotPipe, "e");
pthread_exit(NULL);
}
void *calcDOT(void *thread_id) {
long vec_index;
long start_index;
long end_index;
long length;
long offset;
long sum = 0;
offset = (long) thread_id;
length = dotstr.compute_length;
start_index = offset * length;
end_index = (start_index + length) - 1;
for (vec_index = start_index; vec_index < end_index; vec_index++) {
sum += (dotstr.X[vec_index] * dotstr.Y[vec_index]);
}
pthread_mutex_lock(&mutex_sum);
dotstr.sum += sum;
pthread_mutex_unlock(&mutex_sum);
pthread_exit((void *) thread_id);
}
Output
Resultant X*Y is 20900
time spent: 0.000155 NUM_THREADS: 2
Resultant X*Y is 19860
time spent: 0.000406 NUM_THREADS: 4
Resultant X*Y is 17680
time spent: 0.000112 NUM_THREADS: 8
Resultant X*Y is 5712
time spent: 0.000587 NUM_THREADS: 16

Related

how to compute sum of n/m Gregory-Leibniz terms in C language

get the two values named m & n from the command line arguments and convert them into integers. now after that create m threads and each thread computes the sum of n/m terms in Gregory-Leibniz Series.
pi = 4 * (1 - 1/3 + 1/5 - 1/7 + 1/9 - ...)
Now when thread finishes its computation, print its partial sum and atomically add it to a shared global variable.
& how to check that all of the m computational threads have done the atomic additions?
I share my source code, what I tried
#include<stdio.h>
#include<pthread.h>
#include <stdlib.h>
#include<math.h>
pthread_barrier_t barrier;
int count;
long int term;
// int* int_arr;
double total;
void *thread_function(void *vargp)
{
int thread_rank = *(int *)vargp;
// printf("waiting for barrier... \n");
pthread_barrier_wait(&barrier);
// printf("we passed the barrier... \n");
double sum = 0.0;
int n = count * term;
int start = n - term;
// printf("start %d & end %d \n\n", start, n);
for(int i = start; i < n; i++)
{
sum += pow(-1, i) / (2*i+1);
// v += 1 / i - 1 / (i + 2);
}
total += sum;
// int_arr[count] = sum;
count++;
printf("thr %d : %lf \n", thread_rank, sum);
return NULL;
}
int main(int argc,char *argv[])
{
if (argc <= 2) {
printf("missing arguments. please pass two num. in arguments\n");
exit(-1);
}
int m = atoi(argv[1]); // get value of first argument
int n = atoi(argv[2]); // get value of second argument
// int_arr = (int*) calloc(m, sizeof(int));
count = 1;
term = n / m;
pthread_t thread_id[m];
int i, ret;
double pi;
/* Initialize the barrier. */
pthread_barrier_init(&barrier, NULL, m);
for(i = 0; i < m; i++)
{
ret = pthread_create(&thread_id[i], NULL , &thread_function, (void *)&i);
if (ret) {
printf("unable to create thread! \n");
exit(-1);
}
}
for(i = 0; i < m; i++)
{
if(pthread_join(thread_id[i], NULL) != 0) {
perror("Failed to join thread");
}
}
pi = 4 * total;
printf("%lf ", pi);
pthread_barrier_destroy(&barrier);
return 0;
}
what I need :-
create M thread & each thread computes the sum of n/m terms in the Gregory-Leibniz Series.
first thread computes the sum of term 1 to n/m , the second thread computes the sum of the terms from (n/m + 1) to 2n/m etc.
when all the thread finishes its computation than print its partial sum and Value of Pi.
I tried a lot, but I can't achieve exact what I want. I got wrong output value of PI
for example : m = 16 and n = 1024
then it sometimes return 3.125969, sometimes 12.503874 , 15.629843, sometimes 6.251937 as a output of Pi value
please help me
Edited Source Code :
#include <inttypes.h>
#include <math.h>
#include <pthread.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
struct args {
uint64_t thread_id;
struct {
uint64_t start;
uint64_t end;
} range;
};
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_barrier_t barrier;
long double total = 0;
uint64_t total_iterations = 0;
void *partial_sum(void *arg)
{
struct args *args = arg;
long double sum = 0;
printf("waiting for barrier in thread -> %" PRId64 "\n", args->thread_id);
pthread_barrier_wait(&barrier);
// printf("we passed the barrier... \n");
for (uint64_t n = args->range.start; n < args->range.end; n++)
sum += pow(-1.0, n) / (1 + n * 2);
if (pthread_mutex_lock(&mutex)) {
perror("pthread_mutex_lock");
exit(EXIT_FAILURE);
}
total += sum;
total_iterations += args->range.end - args->range.start;
if (pthread_mutex_unlock(&mutex)) {
perror("pthread_mutex_unlock");
exit(EXIT_FAILURE);
}
printf("thr %" PRId64 " : %.20Lf\n", args->thread_id, sum);
return NULL;
}
int main(int argc,char *argv[])
{
if (argc <= 2) {
fprintf(stderr, "usage: %s THREADS TERMS.\tPlease pass two num. in arguments\n", *argv);
return EXIT_FAILURE;
}
int m = atoi(argv[1]); // get value of first argument & converted into int
int n = atoi(argv[2]); // get value of second argument & converted into int
if (!m || !n) {
fprintf(stderr, "Argument is zero.\n");
return EXIT_FAILURE;
}
uint64_t threads = m;
uint64_t terms = n;
uint64_t range = terms / threads;
uint64_t excess = terms - range * threads;
pthread_t thread_id[threads];
struct args arguments[threads];
int ret;
/* Initialize the barrier. */
ret = pthread_barrier_init(&barrier, NULL, m);
if (ret) {
perror("pthread_barrier_init");
return EXIT_FAILURE;
}
for (uint64_t i = 0; i < threads; i++) {
arguments[i].thread_id = i;
arguments[i].range.start = i * range;
arguments[i].range.end = arguments[i].range.start + range;
if (threads - 1 == i)
arguments[i].range.end += excess;
printf("In main: creating thread %ld\n", i);
ret = pthread_create(thread_id + i, NULL, partial_sum, arguments + i);
if (ret) {
perror("pthread_create");
return EXIT_FAILURE;
}
}
for (uint64_t i = 0; i < threads; i++)
if (pthread_join(thread_id[i], NULL))
perror("pthread_join");
pthread_barrier_destroy(&barrier);
printf("Pi value is : %.10Lf\n", 4 * total);
printf("COMPLETE? (%s)\n", total_iterations == terms ? "YES" : "NO");
return 0;
}
In each thread, the count variable is expected to be of a steadily increasing value in this expression
int n = count * term;
being one larger than it was in the "previous" thread, but count is only increased later on in each thread.
Even if you were to "immediately" increase count, there is nothing that guards against two or more threads attempting to read from and write to the variable at the same time.
The same issue exists for total.
The unpredictability of these reads and writes will lead to indeterminate results.
When sharing resources between threads, you must take care to avoid these race conditions. The POSIX threads library does not contain any atomics for fundamental integral operations.
You should protect your critical data against a read/write race condition by using a lock to restrict access to a single thread at a time.
The POSIX threads library includes a pthread_mutex_t type for this purpose. See:
pthread_mutex_init / pthread_mutex_destroy
pthread_mutex_lock / pthread_mutex_unlock
Additionally, as pointed out by #Craig Estey, using (void *) &i as the argument to the thread functions introduces a race condition where the value of i may change before any given thread executes *(int *) vargp;.
The suggestion is to pass the value of i directly, storing it intermediately as a pointer, but you should use the appropriate type of intptr_t or uintptr_t, which are well defined for this purpose.
pthread_create(&thread_id[i], NULL , thread_function, (intptr_t) i)
int thread_rank = (intptr_t) vargp;
How to check that all of the m computational threads have done the atomic additions?
Sum up the number of terms processed by each thread, and ensure it is equal to the expected number of terms. This can also naturally be assumed to be the case if all possible errors are accounted for (ensuring all threads run to completion and assuming the algorithm used is correct).
A moderately complete example program:
#define _POSIX_C_SOURCE 200809L
#include <inttypes.h>
#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
struct args {
uint64_t thread_id;
struct {
uint64_t start;
uint64_t end;
} range;
};
pthread_mutex_t mutex;
long double total = 0;
uint64_t total_iterations = 0;
void *partial_sum(void *arg)
{
struct args *args = arg;
long double sum = 0;
for (uint64_t n = args->range.start; n < args->range.end; n++)
sum += pow(-1.0, n) / (1 + n * 2);
if (pthread_mutex_lock(&mutex)) {
perror("pthread_mutex_lock");
exit(EXIT_FAILURE);
}
total += sum;
total_iterations += args->range.end - args->range.start;
if (pthread_mutex_unlock(&mutex)) {
perror("pthread_mutex_unlock");
exit(EXIT_FAILURE);
}
printf("thread(%" PRId64 ") Partial sum: %.20Lf\n", args->thread_id, sum);
return NULL;
}
int main(int argc,char **argv)
{
if (argc < 3) {
fprintf(stderr, "usage: %s THREADS TERMS\n", *argv);
return EXIT_FAILURE;
}
uint64_t threads = strtoull(argv[1], NULL, 10);
uint64_t terms = strtoull(argv[2], NULL, 10);
if (!threads || !terms) {
fprintf(stderr, "Argument is zero.\n");
return EXIT_FAILURE;
}
uint64_t range = terms / threads;
uint64_t excess = terms - range * threads;
pthread_t thread_id[threads];
struct args arguments[threads];
if (pthread_mutex_init(&mutex, NULL)) {
perror("pthread_mutex_init");
return EXIT_FAILURE;
}
for (uint64_t i = 0; i < threads; i++) {
arguments[i].thread_id = i;
arguments[i].range.start = i * range;
arguments[i].range.end = arguments[i].range.start + range;
if (threads - 1 == i)
arguments[i].range.end += excess;
int ret = pthread_create(thread_id + i, NULL , partial_sum, arguments + i);
if (ret) {
perror("pthread_create");
return EXIT_FAILURE;
}
}
for (uint64_t i = 0; i < threads; i++)
if (pthread_join(thread_id[i], NULL))
perror("pthread_join");
pthread_mutex_destroy(&mutex);
printf("%.10Lf\n", 4 * total);
printf("COMPLETE? (%s)\n", total_iterations == terms ? "YES" : "NO");
}
Using 16 threads to process 1 billion terms:
$ ./a.out 16 10000000000
thread(14) Partial sum: 0.00000000000190476190
thread(10) Partial sum: 0.00000000000363636364
thread(2) Partial sum: 0.00000000006666666667
thread(1) Partial sum: 0.00000000020000000000
thread(8) Partial sum: 0.00000000000555555556
thread(15) Partial sum: 0.00000000000166666667
thread(0) Partial sum: 0.78539816299744868408
thread(3) Partial sum: 0.00000000003333333333
thread(13) Partial sum: 0.00000000000219780220
thread(11) Partial sum: 0.00000000000303030303
thread(4) Partial sum: 0.00000000002000000000
thread(5) Partial sum: 0.00000000001333333333
thread(7) Partial sum: 0.00000000000714285714
thread(6) Partial sum: 0.00000000000952380952
thread(12) Partial sum: 0.00000000000256410256
thread(9) Partial sum: 0.00000000000444444444
3.1415926535
COMPLETE? (YES)

Performance of multithreaded algorithm to find max number in array

I'm trying to learn about multithreaded algorithms so I've implemented a simple find max number function of an array.
I've made a baseline program (findMax1.c) which loads from a file about 263 million int numbers into memory.
Then I simply use a for loop to find the max number. Then I've made another program (findMax2.c) which uses 4 threads.
I chose 4 threads because the CPU (intel i5 4460) I'm using has 4 cores and 1 thread per core. So my guess is that
if I assign each core a chunk of the array to process it would be more efficient because that way I'll have fewer cache
misses. Now, each thread finds the max number from each chunk, then I join all threads to finally find the max number
from all those chunks. The baseline program findMax1.c takes about 660ms to complete the task, so my initial thought was
that findMax2.c (which uses 4 threads) would take about 165ms (660ms / 4) to complete since now I have 4 threads running
all in parallel to do the same task, but findMax2.c takes about 610ms. Only 50ms less than findMax1.c.
What am I missing? is there something wrong with the implementation of the threaded program?
findMax1.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
int main(void)
{
int i, *array, max = 0, position;
size_t array_size_in_bytes = 1024*1024*1024, elements_read, array_size;
FILE *f;
clock_t t;
double time;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is falsa
printf("Loading array...");
t = clock();
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
assert(elements_read == 1);
printf("done!\n");
printf("File load time: %f [s]\n", time);
fclose(f);
array_size = array_size_in_bytes / sizeof(int);
printf("Finding max...");
t = clock();
for(i = 0; i < array_size; i++)
if(array[i] > max)
{
max = array[i];
position = i;
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
return 0;
}
findMax2.c:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, elements_read;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is false
printf("Loading array...");
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
assert(elements_read == 1);
printf("done!\n");
fclose(f);
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
First of all, you're measuring your time wrong.
clock() measures process CPU time, i.e., time used by all threads. The real elapsed time will be fraction of that. clock_gettime(CLOCK_MONOTONIC,...) should yield better measurements.
Second, your core loops aren't at all comparable.
In the multithreaded program you're writing in each loop iteration to global variables that are very close to each other and that is horrible for cache contention.
You could space that global memory apart (make each array item a cache-aligned struct (_Alignas(64))) and that'll help the time, but a better and fairer approach would be to use local variables (which should go into registers), copying the approach of the first loop, and then write out the chunk result to memory at the end of the loop:
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
Here's your modified test program with expected speedups (I'm getting approx. a 2x speedup on my two-core processor).
(I've also taken the liberty of replacing the file load with in-memory initialization, to make it simpler to test.)
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <stdint.h>
struct timespec ts0,ts1;
uint64_t sc_timespec_diff(struct timespec Ts1, struct timespec Ts0) { return (Ts1.tv_sec - Ts0.tv_sec)*1000000000+(Ts1.tv_nsec - Ts0.tv_nsec); }
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
#if 1 //shouldn't make much difference
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
#endif
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
#if 0 //horrible for caches
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
#else
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
#endif
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, array_size=array_size_in_bytes/sizeof(int);
array = (int*) malloc(array_size_in_bytes);
if(array == NULL) abort(); // assert if condition is false
for(size_t i=0; i<array_size; i++) array[i]=i;
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
clock_gettime(CLOCK_MONOTONIC,&ts0);
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
clock_gettime(CLOCK_MONOTONIC,&ts1);
printf("Time2 %.6LF\n", sc_timespec_diff(ts1,ts0)/1E9L);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
My timings:
0.188917 for the signle threaded version
2.511590 for the original multithreaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
0.099802 with the modified threaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
ran on a Linux machine with Intel(R) Core(TM) i7-2620M CPU # 2.70GHz.

Will there be serious performance degradation when using multiple threads to write to memory concurrently on Linux?

I wrote a multi-thread today. The task of the thread is to write data to a large array. A single thread takes about 0.7s, but it takes more than 20 seconds to write independently and concurrently with two threads. The same operation is under Windows or Multi-process seconds under Linux all are about 0.7s.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/time.h>
#include <sys/types.h>
#define SIZE_IN_MB 256
#define NUM_BYTE (SIZE_IN_MB*1024*1024)
#define NUM_LONG (NUM_BYTE/sizeof(long))
#define CHILD_COUNT 2
#define STEP_SIZE 1 //use to avoid cache,when set to 8
unsigned long Time[CHILD_COUNT];
struct Arg {
unsigned long *data;
int index;
};
unsigned long diffTime(struct timeval *end, struct timeval *start) {
return labs((end->tv_sec - start->tv_sec) * 1000 + (end->tv_usec - start->tv_usec) / 1000);
}
void getTime(struct timeval *t) {
gettimeofday(t, NULL);
}
unsigned long writeData() {
struct timeval start, end;
getTime(&start);
unsigned long *data = (unsigned long *) malloc(NUM_LONG * sizeof(long));
for (int i = 0; i < STEP_SIZE; ++i) {
for (size_t k = i; k < NUM_LONG; k+=STEP_SIZE)
data[k] = 0x5a5a5a5a5a5a5a5a + rand();
}
getTime(&end);
free(data);
return diffTime(&end, &start);
}
void *child(void *arg) {
Time[((struct Arg *) arg)->index] = writeData();
}
void waitAll(pthread_t threads[]) {
for (int i = 0; i < CHILD_COUNT; i++) {
pthread_join(threads[i], NULL);
}
}
void printAverTime(int count) {
unsigned long time = 0;
for (int i = 0; i < count; ++i) {
time += Time[i];
}
printf("Thread: %ld\n", time / count);
}
void thread_test() {
pthread_t threads[CHILD_COUNT];
struct Arg arg[CHILD_COUNT] = {};
for (int i = 0; i < CHILD_COUNT; i++) {
arg[i].index = i;
pthread_create(&threads[i], NULL, child, (void *) &arg[i]);
}
waitAll(threads);
printAverTime(CHILD_COUNT);
}
void process_test() {
int p[CHILD_COUNT][2];
for (int i = 0; i < CHILD_COUNT; ++i) {
pipe(p[i]);
}
for (int i = 0; i < CHILD_COUNT; i++) {
if (fork() == 0) {
unsigned long t = writeData();
write(p[i][1], &t, sizeof(t));
exit(0);
}
}
unsigned long t = 0,tmp= 0;
for (int i = 0; i < CHILD_COUNT; ++i) {
read(p[i][0], &tmp, sizeof(tmp));
t += tmp;
}
printf("Process: %ld\n", t / CHILD_COUNT);
}
int main() {
thread_test();
process_test();
}
The penalty you are paying when using multiple threads is not for writing to memory but for the fact that you are calling rand(), which involves locking, many times in the following nested loops in writeData():
for (int i = 0; i < STEP_SIZE; ++i) {
for (size_t k = i; k < NUM_LONG; k+=STEP_SIZE)
data[k] = 0x5a5a5a5a5a5a5a5a + rand();
}
So you are incurring a huge penalty because for each call to rand() only one thread can get in at a time and all the other threads have to wait and there is overhead to this waiting.
You can fix your code to avoid collisions in the inner loop by using a reentrant form of rand(), such as rand_r() (which is documented at https://man7.org/linux/man-pages/man3/rand.3.html)
unsigned int seed = rand();
for (int i = 0; i < STEP_SIZE; ++i) {
for (size_t k = i; k < NUM_LONG; k+=STEP_SIZE)
data[k] = 0x5a5a5a5a5a5a5a5a + rand_r(&seed);
}

Issues with pointers when passing a struct to a thread on Win32 API

The user provides command line arguments that are used to compute the number of partitions, and the number of threads, where each thread does a minimum linear search of a specific partition of the large array. Each minimum value found by a thread is stored inside a small global array. The main function then does a minimum linear search of the small array, and also a minimum search of the large array and confirms that the minimum found in both small and large array are equal. The problem that I am encountering is that the minimums inside the small global array are sometimes garbage, and sometimes matches the minimum found in the large array. I have tried to figure out the problem but I don't seem to find it. Your help will be really appreciated. I am coding in C, using Dev-C++ on win32 API. The code is bellow:
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#define RAND_DIVISOR 800
int number_items = 0;
int size = 1;
int partits = 1;
int P = 0;
int N = 0;
int Index = 0;
int index_global = 0;
int min;
#define NUM_THREADS 65536 //or 2^16
typedef struct thread_data
{
int thread_id;
int a;
int b;
int * copy_array;
int * glob_array;
int nbr_items;
int subarraysize;
} s_param, *p_s_param;
int compare (const void *a, const void *b)
{
return( *(int*)a - *(int*)b);
}
DWORD WINAPI CompMin( LPVOID lpParam )
{
int i, tmp;
int SubArSize,nbrItems,thrid;
p_s_param param2;
param2 = (p_s_param)lpParam;
min = param2->copy_array[Index];
min = param2->copy_array[param2->a];
param2->glob_array[index_global] = min;
Index++;
index_global++;
}
int main(int argc, char *argv[])
{
int sub_array_size;
p_s_param pDataArray[NUM_THREADS];
DWORD dwThreadIdArray[NUM_THREADS];
HANDLE hThreadArray[NUM_THREADS];
HANDLE myhandle;
//pthread_t thID, thread;
p_s_param param[NUM_THREADS];
int rNum, rc = 0, i, j, large_min;
double time1, time2, time3, time4;
//get initial timestamp in micro seconds
struct timeval tv;
gettimeofday( &tv, NULL );
time1 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
printf( "Start timestamp: %f\n", time1 );
if(argc < 2 )
{
printf("Need %d arguments, only %d provided\n", 2, argc);
printf("The program will exit now!\n");
return 1;
}
P = atoi(argv[1]); /* will be used to define size of large array */
N = atoi(argv[2]); /* will be used to define number of threads */
if(N>P)
{
printf(" Argument 1 should be greater than argument 2\n");
printf("The program will exit now!\n");
return 1;
}
/*compute the size of the array*/
for (i=1; i<=P; i++)
size = size * 2;
/*Create a dynamic array of size size*/
int *array = (int*) malloc(size*sizeof(int));
srand(time(NULL));
for (i=0; i<size; i++)
{
rNum = rand() / RAND_DIVISOR;
array[i] = rNum;
}
/*compute the number of partitions*/
for (i = 1; i<=N; i++)
partits = partits * 2;
/*numbers of elements per sub array*/
sub_array_size = size/partits;
/*Global array*/
int *Globalarray = (int*) malloc(partits*sizeof(int));
for (i=0; i<partits; i++)
{
/*Allocate memory for thread data*/
param[i] = (p_s_param) HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(s_param));
if( param[i] == NULL)
{
ExitProcess(2);
}
param[i]->a=i;
param[i]->copy_array=array;
param[i]->glob_array = Globalarray;
hThreadArray[i] = CreateThread(NULL, 0, CompMin, param[i], 0, &dwThreadIdArray[i]);
if(hThreadArray[i] == NULL)
{
puts("Error, cannot create Thread!");
puts(strerror(errno));
ExitProcess(3);
}
//printf("Number of partitions: %d\n",partits );
} WaitForMultipleObjects(NUM_THREADS,hThreadArray, TRUE, INFINITE);
/*find mimimum value from Global array returned by threads*/
min = Globalarray[0];
for(i = 0; i< partits; i++)
{
printf("Index: %d, value into small array: %d\n",i, Globalarray[i] );
if(Globalarray[i] < min)
min = Globalarray[i];
}
gettimeofday( &tv, NULL );
time2 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
gettimeofday( &tv, NULL );
time3 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*sorting the large array in ascending order and find minimum value*/
//qsort(array,size, sizeof(int), compare);
large_min = array[0];
for(i = 0; i< partits; i++)
{
printf("value into large array: %d\n",array[i] );
if(array[i] < large_min)
large_min = array[i];
}
//large_min = array[0];
gettimeofday( &tv, NULL );
time4 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*display result*/
printf("Min from small array : %d\n", min);
printf("Min from large array : %d\n", large_min);
if(min == large_min)
printf("Same minimum found in small and large array! : %d\n", large_min);
else
{
printf("error!, the min from small %d array is different from large array %d!\n", min, array[0]);
return 1;
}
printf("length of time recorded to search min in small array: %f\n", time2-time1);
printf("length of time recorded to search min in large array: %f\n", time4-time3);
free((void*) Globalarray);
free((void*) array);
exit (0);
}
I just added a sleep(3) after the wait, and it fixed the problem.
Your CompMin() function is not thread-safe. It is accessing global variables that are shared and modified by multiple threads at the same time, so they are going to step over each other's data as they run in parallel to each other. You need to make your work data self-contained so each thread is only operating on the data it is given to work on, and get rid of your shared globals altogether. You designed your thread_data struct to allow partitioning the array data, but you are not actually utilizing that functionality, so each thread is not searching its individual partition of data, and not storing its search result in its individual section of the global array.
You are also passing the wrong number of thread handles to WaitForMultipleObjects(), so it will fail to wait, which you are not checking for, and then you move on to process the array data before they are actually ready to be processed.
Your are also searching the arrays incorrectly after the threads have finished running, so you are not going to end up with the correct results.
Try something more like this instead:
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#define RAND_DIVISOR 800
typedef struct thread_data
{
DWORD thread_id;
int *items;
int nbr_items;
int min;
} s_thread_data, *p_s_thread_data;
/*
int compare (const void *a, const void *b)
{
return( *(int*)a - *(int*)b);
}
*/
DWORD WINAPI CompMin( LPVOID lpParam )
{
p_s_thread_data data = (p_s_thread_data) lpParam;
int i;
data->min = data->items[0];
for(i = 1; i < data->nbr_items; i++)
{
if(data->items[i] < data->min)
data->min = data->items[i];
}
return 0;
}
int main(int argc, char *argv[])
{
int size = 1;
int partits = 1;
int sub_array_size;
int i;
if(argc < 2 )
{
printf("Need %d arguments, only %d provided\n", 2, argc);
printf("The program will exit now!\n");
return 1;
}
int P = atoi(argv[1]); /* will be used to define size of large array */
if(P < 1)
{
printf(" Argument 1 should be greater than zero\n");
printf("The program will exit now!\n");
return 1;
}
int N = atoi(argv[2]); /* will be used to define number of threads */
if(N < 1)
{
printf(" Argument 2 should be greater than zero\n");
printf("The program will exit now!\n");
return 1;
}
/*compute the size of the large array*/
for (i=1; i<=P; i++)
size = size * 2;
/*Allocate memory for large array*/
int *array = (int*) malloc(size*sizeof(int));
if(array == NULL)
return 2;
srand(time(NULL));
/*Fill the large array with random data*/
for (i=0; i<size; i++)
array[i] = rand() / RAND_DIVISOR;
/*compute the number of partitions*/
for (i = 1; i<=N; i++)
partits = partits * 2;
//printf("Number of partitions: %d\n", partits );
/*numbers of elements per partition*/
sub_array_size = size/partits;
/*Allocate memory for thread data*/
p_s_thread_data ThreadDataArray = (p_s_thread_data) malloc(partits*sizeof(s_thread_data));
if(ThreadDataArray == NULL)
return 2;
memset(ThreadDataArray, 0, partits*sizeof(s_thread_data));
/*Allocate memory for thread handles array*/
HANDLE *hThreadArray = (HANDLE*) malloc(partits*sizeof(HANDLE));
if(hThreadArray == NULL)
return 2;
memset(hThreadArray, 0, partits*sizeof(HANDLE));
double time1, time2, time3, time4;
//get initial timestamp in micro seconds
struct timeval tv;
gettimeofday( &tv, NULL );
time1 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
printf( "Start timestamp: %f\n", time1 );
for (i=0; i<partits; i++)
{
ThreadDataArray[i].items = &array[i*sub_array_size];
ThreadDataArray[1].nbr_items = sub_array_size;
hThreadArray[i] = CreateThread(NULL, 0, CompMin, &param[i], 0, &(param[i].thread_id));
if(hThreadArray[i] == NULL)
{
printf("Error, cannot create Thread! %s\n", strerror(errno));
return 3;
}
}
/*Wait for threads to finish*/
i = 0;
int nbr_handles = partits;
while (nbr_handles >= MAXIMUM_WAIT_OBJECTS)
{
if (WaitForMultipleObjects(MAXIMUM_WAIT_OBJECTS, &hThreadArray[i], TRUE, INFINITE) != WAIT_OBJECT_0)
return 4;
i = i + MAXIMUM_WAIT_OBJECTS;
nbr_handles = nbr_handles - MAXIMUM_WAIT_OBJECTS;
}
if (nbr_handles > 0)
{
if (WaitForMultipleObjects(nbr_handles, &hThreadArray[i], TRUE, INFINITE) != WAIT_OBJECT_0)
return 4;
}
/*find minimum value from thread results*/
int min = ThreadDataArray[0].min;
for(i = 0; i < partits; i++)
{
printf("Index: %d, value into small array: %d\n",i, ThreadDataArray[i].min );
if(ThreadDataArray[i].min < min)
min = ThreadDataArray[i].min;
}
gettimeofday( &tv, NULL );
time2 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
gettimeofday( &tv, NULL );
time3 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*sorting the large array in ascending order and find minimum value*/
//qsort(array,size, sizeof(int), compare);
int large_min = array[0];
for(i = 0; i < size; i++)
{
printf("value into large array: %d\n", array[i] );
if(array[i] < large_min)
large_min = array[i];
}
gettimeofday( &tv, NULL );
time4 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*display result*/
printf("Min from small array : %d\n", min);
printf("Min from large array : %d\n", large_min);
if(min == large_min)
printf("Same minimum found in small and large array! : %d\n", large_min);
else
{
printf("error!, the min from small array (%d) is different from large array (%d)!\n", min, large_min);
return 1;
}
printf("length of time recorded to search min in small array: %f\n", time2-time1);
printf("length of time recorded to search min in large array: %f\n", time4-time3);
free(array);
free(ThreadDataArray);
free(hThreadArray);
return 0;
}

Time measuring in Synchronized threaded C language

The time measuring in thread_work function is not working.
Code is a little bit nasty but I just want you to look at the thread_work function
and teach me why the print_time function keeps generating 0 value.
(I write the whole code just in case, I'm sorry for your eyes, really)
#include <stdio.h>
#include <pthread.h>
#include <time.h>
#include <stdlib.h>
#include <semaphore.h>
#include <unistd.h>
#define num_thread 20
char str[11];
void *thread_work(void *tid);
void generate_str(int n);
void str_sort(int n);
void check_sort(void);
void print_time(struct timespec *myclock);
void print_time_start(struct timespec *myclock);
void print_time_end(struct timespec *myclock);
sem_t my_sem;
int main(void)
{
pthread_t tid[num_thread];
int ret;
int t;
struct timespec t1[2];
srand(time(NULL));
ret = sem_init(&my_sem, 0, 1);
clock_gettime(CLOCK_REALTIME, &t1[0]);
print_time_start(t1);
for(t=0; t<num_thread; t++)
{
ret = pthread_create(&tid[t], NULL, thread_work, (void *)t);
usleep(1);
}
for(t=0; t<num_thread; t++)
ret = pthread_join(tid[t], NULL);
clock_gettime(CLOCK_REALTIME, &t1[1]);
print_time_end(t1);
sem_destroy(&my_sem);
return 0;
}
void *thread_work(void *t)
{
int n = (int )t;
struct timespec t2[2];
printf("########## Thread #%2d starting ########## \n",n);
sleep(1);
sem_wait(&my_sem); //Entry Section
clock_gettime(CLOCK_REALTIME, &t2[0]); //Critical Section Start
generate_str(n);
str_sort(n);
check_sort();
clock_gettime(CLOCK_REALTIME, &t2[1]);
print_time(t2); //Critical Section End
sem_post(&my_sem); //Exit Section
}
void str_sort(int n)
{
int temp;
int i, j;
for(i=0; i<9; i++)
for(j=0; j<9-i; j++)
{
if(str[j]>str[j+1])
{
temp=str[j];
str[j]=str[j+1];
str[j+1]=temp;
}
}
printf("[%2d] ",n);
for(i=0; i<10; i++)
printf("%2c", str[i]);
}
void generate_str(int n)
{
int i;
int num;
srand(n); //differentiate the string of each threads
for(i=0; i<10; i++)
{
num = (97+rand()%26);
str[i]=num;
}
str[10]='\0';
}
void check_sort(void)
{
int i;
int count=0;
for(i=0; i<9; i++)
{
if(str[i]>str[i+1])
count++;
}
if(count != 0)
printf(" [X]FALSE ");
else
printf(" [O]TRUE ");
}
void print_time(struct timespec *myclock)
{
long delay, temp, temp_n, sec;
sec = myclock[0].tv_sec % 60;
printf(" %ld.%ld -> ", sec, myclock[0].tv_nsec);
sec = myclock[1].tv_sec % 60;
printf("%ld.%ld", sec, myclock[1].tv_nsec);
if(myclock[1].tv_nsec >= myclock[0].tv_nsec)
{
temp = myclock[1].tv_sec - myclock[0].tv_sec;
temp_n = myclock[1].tv_nsec - myclock[0].tv_nsec;
delay = 1000000000 * temp + temp_n;
}
else
{
temp = myclock[1].tv_sec - myclock[0].tv_sec - 1;
temp_n = 1000000000 + myclock[1].tv_nsec - myclock[0].tv_nsec;
delay = 1000000000 * temp + temp_n;
}
printf(", Interval : %ld ns\n", delay);
}
void print_time_start(struct timespec *myclock)
{
long sec;
sec = myclock[0].tv_sec % 60;
printf("########## Thread: Start Time -> %ld.%ld\n", sec, myclock[0].tv_nsec);
}
void print_time_end(struct timespec *myclock)
{
long delay, temp, temp_n, sec;
sec = myclock[1].tv_sec % 60;
printf("########## Thread: End Time -> %ld.%ld ", sec, myclock[1].tv_nsec);
if (myclock[1].tv_nsec >= myclock[0].tv_nsec)
{
temp = myclock[1].tv_sec - myclock[0].tv_sec;
temp_n = myclock[1].tv_nsec - myclock[0].tv_nsec;
delay = 1000000000 * temp + temp_n; //The unit of delay is nano second
}
else
{
temp = myclock[1].tv_sec - myclock[0].tv_sec - 1;
temp_n = 1000000000 + myclock[1].tv_nsec - myclock[0].tv_nsec;
delay = 1000000000 * temp + temp_n; //The unit of delay is nano second
}
delay = delay / 1000; //The unit of delay is now micro second
printf("(Thread Execution Time -> %ld micro second)\n", delay);
}
clock_gettime(CLOCK_REALTIME, &t2[0]); //Critical Section Start
generate_str(n);
str_sort(n);
check_sort();
clock_gettime(CLOCK_REALTIME, &t2[1]);
Could be that the three methods execute so fast that the system clock doesn't progress. You could try and get a higher solution by changing CLOCK_REALTIME to CLOCK_THREAD_CPUTIME_ID or CLOCK_PROCESS_CPUTIME_ID.

Resources