Performance of multithreaded algorithm to find max number in array - c

I'm trying to learn about multithreaded algorithms so I've implemented a simple find max number function of an array.
I've made a baseline program (findMax1.c) which loads from a file about 263 million int numbers into memory.
Then I simply use a for loop to find the max number. Then I've made another program (findMax2.c) which uses 4 threads.
I chose 4 threads because the CPU (intel i5 4460) I'm using has 4 cores and 1 thread per core. So my guess is that
if I assign each core a chunk of the array to process it would be more efficient because that way I'll have fewer cache
misses. Now, each thread finds the max number from each chunk, then I join all threads to finally find the max number
from all those chunks. The baseline program findMax1.c takes about 660ms to complete the task, so my initial thought was
that findMax2.c (which uses 4 threads) would take about 165ms (660ms / 4) to complete since now I have 4 threads running
all in parallel to do the same task, but findMax2.c takes about 610ms. Only 50ms less than findMax1.c.
What am I missing? is there something wrong with the implementation of the threaded program?
findMax1.c
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
int main(void)
{
int i, *array, max = 0, position;
size_t array_size_in_bytes = 1024*1024*1024, elements_read, array_size;
FILE *f;
clock_t t;
double time;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is falsa
printf("Loading array...");
t = clock();
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
assert(elements_read == 1);
printf("done!\n");
printf("File load time: %f [s]\n", time);
fclose(f);
array_size = array_size_in_bytes / sizeof(int);
printf("Finding max...");
t = clock();
for(i = 0; i < array_size; i++)
if(array[i] > max)
{
max = array[i];
position = i;
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
return 0;
}
findMax2.c:
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, elements_read;
array = (int*) malloc(array_size_in_bytes);
assert(array != NULL); // assert if condition is false
printf("Loading array...");
f = fopen("numbers.bin", "rb");
assert(f != NULL);
elements_read = fread(array, array_size_in_bytes, 1, f);
assert(elements_read == 1);
printf("done!\n");
fclose(f);
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}

First of all, you're measuring your time wrong.
clock() measures process CPU time, i.e., time used by all threads. The real elapsed time will be fraction of that. clock_gettime(CLOCK_MONOTONIC,...) should yield better measurements.
Second, your core loops aren't at all comparable.
In the multithreaded program you're writing in each loop iteration to global variables that are very close to each other and that is horrible for cache contention.
You could space that global memory apart (make each array item a cache-aligned struct (_Alignas(64))) and that'll help the time, but a better and fairer approach would be to use local variables (which should go into registers), copying the approach of the first loop, and then write out the chunk result to memory at the end of the loop:
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
Here's your modified test program with expected speedups (I'm getting approx. a 2x speedup on my two-core processor).
(I've also taken the liberty of replacing the file load with in-memory initialization, to make it simpler to test.)
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <sched.h>
#include <stdint.h>
struct timespec ts0,ts1;
uint64_t sc_timespec_diff(struct timespec Ts1, struct timespec Ts0) { return (Ts1.tv_sec - Ts0.tv_sec)*1000000000+(Ts1.tv_nsec - Ts0.tv_nsec); }
#define NUM_THREADS 4
int max_chunk[NUM_THREADS], pos_chunk[NUM_THREADS];
int *array;
pthread_t tid[NUM_THREADS];
void *thread(void *arg)
{
size_t array_size_in_bytes = 1024*1024*1024;
int i, rc, offset, chunk_size, array_size, *core_id = (int*) arg, num_cores = sysconf(_SC_NPROCESSORS_ONLN);
#if 1 //shouldn't make much difference
pthread_t id = pthread_self();
cpu_set_t cpuset;
if (*core_id < 0 || *core_id >= num_cores)
return NULL;
CPU_ZERO(&cpuset);
CPU_SET(*core_id, &cpuset);
rc = pthread_setaffinity_np(id, sizeof(cpu_set_t), &cpuset);
if(rc != 0)
{
printf("pthread_setaffinity_np() failed! - rc %d\n", rc);
return NULL;
}
printf("Thread running on CPU %d\n", sched_getcpu());
#endif
array_size = (int) (array_size_in_bytes / sizeof(int));
chunk_size = (int) (array_size / NUM_THREADS);
offset = chunk_size * (*core_id);
// Find max number in the array chunk
#if 0 //horrible for caches
for(i = offset; i < (offset + chunk_size); i++)
{
if(array[i] > max_chunk[*core_id])
{
max_chunk[*core_id] = array[i];
pos_chunk[*core_id] = i;
}
}
#else
int l_max_chunk=0, l_pos_chunk=0, *a;
for(i = 0,a=array+offset; i < chunk_size; i++)
if(a[i] > l_max_chunk) l_max_chunk=a[i], l_pos_chunk=i;
max_chunk[*core_id] = l_max_chunk;
pos_chunk[*core_id] = l_pos_chunk;
#endif
return NULL;
}
void load_array(void)
{
FILE *f;
size_t array_size_in_bytes = 1024*1024*1024, array_size=array_size_in_bytes/sizeof(int);
array = (int*) malloc(array_size_in_bytes);
if(array == NULL) abort(); // assert if condition is false
for(size_t i=0; i<array_size; i++) array[i]=i;
}
int main(void)
{
int i, max = 0, position, id[NUM_THREADS], rc;
clock_t t;
double time;
load_array();
printf("Finding max...");
t = clock();
clock_gettime(CLOCK_MONOTONIC,&ts0);
// Create threads
for(i = 0; i < NUM_THREADS; i++)
{
id[i] = i; // uso id para pasarle un puntero distinto a cada thread
rc = pthread_create(&(tid[i]), NULL, &thread, (void*)(id + i));
if (rc != 0)
printf("Can't create thread! rc = %d\n", rc);
else
printf("Thread %lu created\n", tid[i]);
}
// Join threads
for(i = 0; i < NUM_THREADS; i++)
pthread_join(tid[i], NULL);
// Find max number from all chunks
for(i = 0; i < NUM_THREADS; i++)
if(max_chunk[i] > max)
{
max = max_chunk[i];
position = pos_chunk[i];
}
clock_gettime(CLOCK_MONOTONIC,&ts1);
printf("Time2 %.6LF\n", sc_timespec_diff(ts1,ts0)/1E9L);
t = clock() - t;
time = ((double) t) / CLOCKS_PER_SEC;
printf("done!\n");
free(array);
printf("----------- Program results -------------\nMax number: %d position %d\n", max, position);
printf("Time %f [s]\n", time);
pthread_exit(NULL);
return 0;
}
My timings:
0.188917 for the signle threaded version
2.511590 for the original multithreaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
0.099802 with the modified threaded version (measured with clock_gettime(CLOCK_MONOTONIC,...)
ran on a Linux machine with Intel(R) Core(TM) i7-2620M CPU # 2.70GHz.

Related

how to compute sum of n/m Gregory-Leibniz terms in C language

get the two values named m & n from the command line arguments and convert them into integers. now after that create m threads and each thread computes the sum of n/m terms in Gregory-Leibniz Series.
pi = 4 * (1 - 1/3 + 1/5 - 1/7 + 1/9 - ...)
Now when thread finishes its computation, print its partial sum and atomically add it to a shared global variable.
& how to check that all of the m computational threads have done the atomic additions?
I share my source code, what I tried
#include<stdio.h>
#include<pthread.h>
#include <stdlib.h>
#include<math.h>
pthread_barrier_t barrier;
int count;
long int term;
// int* int_arr;
double total;
void *thread_function(void *vargp)
{
int thread_rank = *(int *)vargp;
// printf("waiting for barrier... \n");
pthread_barrier_wait(&barrier);
// printf("we passed the barrier... \n");
double sum = 0.0;
int n = count * term;
int start = n - term;
// printf("start %d & end %d \n\n", start, n);
for(int i = start; i < n; i++)
{
sum += pow(-1, i) / (2*i+1);
// v += 1 / i - 1 / (i + 2);
}
total += sum;
// int_arr[count] = sum;
count++;
printf("thr %d : %lf \n", thread_rank, sum);
return NULL;
}
int main(int argc,char *argv[])
{
if (argc <= 2) {
printf("missing arguments. please pass two num. in arguments\n");
exit(-1);
}
int m = atoi(argv[1]); // get value of first argument
int n = atoi(argv[2]); // get value of second argument
// int_arr = (int*) calloc(m, sizeof(int));
count = 1;
term = n / m;
pthread_t thread_id[m];
int i, ret;
double pi;
/* Initialize the barrier. */
pthread_barrier_init(&barrier, NULL, m);
for(i = 0; i < m; i++)
{
ret = pthread_create(&thread_id[i], NULL , &thread_function, (void *)&i);
if (ret) {
printf("unable to create thread! \n");
exit(-1);
}
}
for(i = 0; i < m; i++)
{
if(pthread_join(thread_id[i], NULL) != 0) {
perror("Failed to join thread");
}
}
pi = 4 * total;
printf("%lf ", pi);
pthread_barrier_destroy(&barrier);
return 0;
}
what I need :-
create M thread & each thread computes the sum of n/m terms in the Gregory-Leibniz Series.
first thread computes the sum of term 1 to n/m , the second thread computes the sum of the terms from (n/m + 1) to 2n/m etc.
when all the thread finishes its computation than print its partial sum and Value of Pi.
I tried a lot, but I can't achieve exact what I want. I got wrong output value of PI
for example : m = 16 and n = 1024
then it sometimes return 3.125969, sometimes 12.503874 , 15.629843, sometimes 6.251937 as a output of Pi value
please help me
Edited Source Code :
#include <inttypes.h>
#include <math.h>
#include <pthread.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
struct args {
uint64_t thread_id;
struct {
uint64_t start;
uint64_t end;
} range;
};
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_barrier_t barrier;
long double total = 0;
uint64_t total_iterations = 0;
void *partial_sum(void *arg)
{
struct args *args = arg;
long double sum = 0;
printf("waiting for barrier in thread -> %" PRId64 "\n", args->thread_id);
pthread_barrier_wait(&barrier);
// printf("we passed the barrier... \n");
for (uint64_t n = args->range.start; n < args->range.end; n++)
sum += pow(-1.0, n) / (1 + n * 2);
if (pthread_mutex_lock(&mutex)) {
perror("pthread_mutex_lock");
exit(EXIT_FAILURE);
}
total += sum;
total_iterations += args->range.end - args->range.start;
if (pthread_mutex_unlock(&mutex)) {
perror("pthread_mutex_unlock");
exit(EXIT_FAILURE);
}
printf("thr %" PRId64 " : %.20Lf\n", args->thread_id, sum);
return NULL;
}
int main(int argc,char *argv[])
{
if (argc <= 2) {
fprintf(stderr, "usage: %s THREADS TERMS.\tPlease pass two num. in arguments\n", *argv);
return EXIT_FAILURE;
}
int m = atoi(argv[1]); // get value of first argument & converted into int
int n = atoi(argv[2]); // get value of second argument & converted into int
if (!m || !n) {
fprintf(stderr, "Argument is zero.\n");
return EXIT_FAILURE;
}
uint64_t threads = m;
uint64_t terms = n;
uint64_t range = terms / threads;
uint64_t excess = terms - range * threads;
pthread_t thread_id[threads];
struct args arguments[threads];
int ret;
/* Initialize the barrier. */
ret = pthread_barrier_init(&barrier, NULL, m);
if (ret) {
perror("pthread_barrier_init");
return EXIT_FAILURE;
}
for (uint64_t i = 0; i < threads; i++) {
arguments[i].thread_id = i;
arguments[i].range.start = i * range;
arguments[i].range.end = arguments[i].range.start + range;
if (threads - 1 == i)
arguments[i].range.end += excess;
printf("In main: creating thread %ld\n", i);
ret = pthread_create(thread_id + i, NULL, partial_sum, arguments + i);
if (ret) {
perror("pthread_create");
return EXIT_FAILURE;
}
}
for (uint64_t i = 0; i < threads; i++)
if (pthread_join(thread_id[i], NULL))
perror("pthread_join");
pthread_barrier_destroy(&barrier);
printf("Pi value is : %.10Lf\n", 4 * total);
printf("COMPLETE? (%s)\n", total_iterations == terms ? "YES" : "NO");
return 0;
}
In each thread, the count variable is expected to be of a steadily increasing value in this expression
int n = count * term;
being one larger than it was in the "previous" thread, but count is only increased later on in each thread.
Even if you were to "immediately" increase count, there is nothing that guards against two or more threads attempting to read from and write to the variable at the same time.
The same issue exists for total.
The unpredictability of these reads and writes will lead to indeterminate results.
When sharing resources between threads, you must take care to avoid these race conditions. The POSIX threads library does not contain any atomics for fundamental integral operations.
You should protect your critical data against a read/write race condition by using a lock to restrict access to a single thread at a time.
The POSIX threads library includes a pthread_mutex_t type for this purpose. See:
pthread_mutex_init / pthread_mutex_destroy
pthread_mutex_lock / pthread_mutex_unlock
Additionally, as pointed out by #Craig Estey, using (void *) &i as the argument to the thread functions introduces a race condition where the value of i may change before any given thread executes *(int *) vargp;.
The suggestion is to pass the value of i directly, storing it intermediately as a pointer, but you should use the appropriate type of intptr_t or uintptr_t, which are well defined for this purpose.
pthread_create(&thread_id[i], NULL , thread_function, (intptr_t) i)
int thread_rank = (intptr_t) vargp;
How to check that all of the m computational threads have done the atomic additions?
Sum up the number of terms processed by each thread, and ensure it is equal to the expected number of terms. This can also naturally be assumed to be the case if all possible errors are accounted for (ensuring all threads run to completion and assuming the algorithm used is correct).
A moderately complete example program:
#define _POSIX_C_SOURCE 200809L
#include <inttypes.h>
#include <math.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
struct args {
uint64_t thread_id;
struct {
uint64_t start;
uint64_t end;
} range;
};
pthread_mutex_t mutex;
long double total = 0;
uint64_t total_iterations = 0;
void *partial_sum(void *arg)
{
struct args *args = arg;
long double sum = 0;
for (uint64_t n = args->range.start; n < args->range.end; n++)
sum += pow(-1.0, n) / (1 + n * 2);
if (pthread_mutex_lock(&mutex)) {
perror("pthread_mutex_lock");
exit(EXIT_FAILURE);
}
total += sum;
total_iterations += args->range.end - args->range.start;
if (pthread_mutex_unlock(&mutex)) {
perror("pthread_mutex_unlock");
exit(EXIT_FAILURE);
}
printf("thread(%" PRId64 ") Partial sum: %.20Lf\n", args->thread_id, sum);
return NULL;
}
int main(int argc,char **argv)
{
if (argc < 3) {
fprintf(stderr, "usage: %s THREADS TERMS\n", *argv);
return EXIT_FAILURE;
}
uint64_t threads = strtoull(argv[1], NULL, 10);
uint64_t terms = strtoull(argv[2], NULL, 10);
if (!threads || !terms) {
fprintf(stderr, "Argument is zero.\n");
return EXIT_FAILURE;
}
uint64_t range = terms / threads;
uint64_t excess = terms - range * threads;
pthread_t thread_id[threads];
struct args arguments[threads];
if (pthread_mutex_init(&mutex, NULL)) {
perror("pthread_mutex_init");
return EXIT_FAILURE;
}
for (uint64_t i = 0; i < threads; i++) {
arguments[i].thread_id = i;
arguments[i].range.start = i * range;
arguments[i].range.end = arguments[i].range.start + range;
if (threads - 1 == i)
arguments[i].range.end += excess;
int ret = pthread_create(thread_id + i, NULL , partial_sum, arguments + i);
if (ret) {
perror("pthread_create");
return EXIT_FAILURE;
}
}
for (uint64_t i = 0; i < threads; i++)
if (pthread_join(thread_id[i], NULL))
perror("pthread_join");
pthread_mutex_destroy(&mutex);
printf("%.10Lf\n", 4 * total);
printf("COMPLETE? (%s)\n", total_iterations == terms ? "YES" : "NO");
}
Using 16 threads to process 1 billion terms:
$ ./a.out 16 10000000000
thread(14) Partial sum: 0.00000000000190476190
thread(10) Partial sum: 0.00000000000363636364
thread(2) Partial sum: 0.00000000006666666667
thread(1) Partial sum: 0.00000000020000000000
thread(8) Partial sum: 0.00000000000555555556
thread(15) Partial sum: 0.00000000000166666667
thread(0) Partial sum: 0.78539816299744868408
thread(3) Partial sum: 0.00000000003333333333
thread(13) Partial sum: 0.00000000000219780220
thread(11) Partial sum: 0.00000000000303030303
thread(4) Partial sum: 0.00000000002000000000
thread(5) Partial sum: 0.00000000001333333333
thread(7) Partial sum: 0.00000000000714285714
thread(6) Partial sum: 0.00000000000952380952
thread(12) Partial sum: 0.00000000000256410256
thread(9) Partial sum: 0.00000000000444444444
3.1415926535
COMPLETE? (YES)

Changing irrelevant part of the function changes papi measurement of branch prediction

I am playing with the codes that I found online and I want to try different branch prediction codes to have a better understanding of branch predictors.
CPU is AMD Ryzen 3600.
Basically, what I am doing is in the code below, I am trying to measure a misprediction rate of a given function/code segment. As a pseudo/shortened code, here is what I do:
int measurement(int length, int arr){
r1 = papi_read();
for(;i<len;){
if(arr[j]){
do_sth();
}
}
r2 = papi_read();
return r2-r1;
}
void warmup(){
for(volatile int i = 0; i< 10000; i++){
for(volatile int j = 0; j < 100; j++){} // important line
}
}
int main() {
init_papi();
init_others(); //creates arrays, initialize them, etc.
warmup();
for(int i = 0; i < 20; i++){
results[i] = measurement(128, array);
usleep(1200); // 2nd important line
}
print_mispredictions();
}
My setup
I have isolated the core I am working on so that there is no other user process in that core. I have also isolated the sibling one so that I am full in charge of both of the cores, except there is an interrupt or a routine.
Previously, I have seen that if I use sleep between iterations (as in main function), the CPU enters a deeper level C-state so the branch prediction units (BHT in this case) resets. This is the explanation of 2nd important line in the code.
What I want to see
Without the sleep line I am seeing that in each iteration I am having lower and lower misprediction rates. That is because of the branch predictors are learning the pattern in the array.
With the sleep line, what I want to achieve is, at each iteration, I should see similar misprediction numbers, as BPU entries are being reset.
What is the problem
The problem is when I change the warmup line from
void warmup(){
for(volatile int i = 0; i< 10000; i++){
for(volatile int j = 0; j < 100; j++){}
}
}
to
void warmup(){
for(volatile int i = 0; i< 1000000; i++){ // notice that I have the
// same amount of iteration
}
}
then the measurements are messed up. I have experienced this kind of issue in my previous question, which is never ever answered. Changing a line that is irrelevant to the measurement changes the measurement behavior.
This is my results:
# With 1 for loop, expected behavior. At every iteration, it is reset to ~ 60
# For 128 if statement, 60 misprediction is 50% guessing.
$ ./exp
0:73 #iteration count, misprediction count
1:62
2:63
3:21
4:63
...
# With 2 for loops. Unexpected behavior. It should always reset to ~ 60 but it keeps decreasing.
./exp
0:66
1:18
2:4
3:4
4:1
5:0
6:0
...
Putting a mfence or lfence instruction after the warmup doesn't change the result either.
Below, I am putting the whole code in case someone wants to try and/or has an answer for this behavior.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/sysinfo.h>
#include <time.h>
#include "papi.h"
#define stick_this_thread_to_core(retval, core_id){ \
int num_cores = sysconf(_SC_NPROCESSORS_ONLN); \
if (core_id < 0 || core_id >= num_cores) \
retval = EINVAL; \
cpu_set_t cpuset; \
CPU_ZERO(&cpuset); \
CPU_SET(core_id, &cpuset); \
retval = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);\
}
#define ERROR_RETURN(retval) { \
fprintf(stderr, "Error %d, (%s) %s:line %d: \n", retval,PAPI_strerror(retval), __FILE__,__LINE__); \
exit(retval); \
}
void papi_my_native_add_event(int* EventSet1, char* eventname, int *native){
int retval;
//printf("native add\n");
if((retval = PAPI_event_name_to_code(eventname, native)) != PAPI_OK)
ERROR_RETURN(retval);
//printf("native add to_code is successful\n");
if ((retval = PAPI_add_event(*EventSet1, *native)) != PAPI_OK)
ERROR_RETURN(retval);
//printf("native add add_event is successful\n");
int number = 0;
if((retval = PAPI_list_events(*EventSet1, NULL, &number)) != PAPI_OK)
ERROR_RETURN(retval);
//fprintf(stderr, "Added %d events.\n", number);
}
void papi_my_native_add_start_event(int* EventSet1, char* eventname, int *native){
papi_my_native_add_event(EventSet1, eventname, native);
int retval = 0;
if((retval = PAPI_start(*EventSet1)) != PAPI_OK)
ERROR_RETURN(retval);
//printf("START %s\n", eventname);
}
int RNG_SIZE = 128;
uint64_t* rng_arr;
uint64_t dummy;
// 12th core
int cpuid = 11;
// Code from
// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/tree/master/2019/11/05
// acts like a random number generator, but it is deterministic.
static inline uint64_t rng(uint64_t h) {
h ^= h >> 33;
h *= UINT64_C(0xff51afd7ed558ccd);
h ^= h >> 33;
h *= UINT64_C(0xc4ceb9fe1a85ec53);
h ^= h >> 33;
return h;
}
uint64_t measurement(int* EventSet, uint64_t howmany, uint64_t* arr){
long long reads[2] = {0};
PAPI_read(*EventSet, &reads[0]);
for(int j = 0; j < howmany; j++){
if(arr[j]){
dummy &= arr[j];
}
}
PAPI_read(*EventSet, &reads[1]);
return (reads[1] - reads[0]);
}
void precompute_rng(){
int howmany = RNG_SIZE;
for(int i = 0; i < RNG_SIZE; i++){
rng_arr[i] = rng(howmany) &0x1;
howmany--;
}
}
int stick_to_core(){
int retval = 0;
stick_this_thread_to_core(retval, cpuid);
if(retval){
printf("Affinity error: %s\n", strerror(errno));
return 1;
}
return 0;
}
void init_papi(int* EventSet, int cpuid){
int retval = 0;
// papi init
if((retval = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT )
ERROR_RETURN(retval);
PAPI_option_t opts1;
opts1.cpu.cpu_num = cpuid;
if((retval = PAPI_create_eventset(EventSet)) != PAPI_OK)
ERROR_RETURN(retval);
if((retval = PAPI_assign_eventset_component(*EventSet, 0)) != PAPI_OK)
ERROR_RETURN(retval);
opts1.cpu.eventset = *EventSet;
if((retval =PAPI_set_opt(PAPI_CPU_ATTACH, &opts1)) != PAPI_OK)
ERROR_RETURN(retval);
char* eventname = "RETIRED_BRANCH_INSTRUCTIONS_MISPREDICTED";
unsigned int native = 0x0;
papi_my_native_add_start_event(EventSet, eventname, &native);
}
void warmup(){
for(volatile int i = 0; i< 100000; i++){
for(volatile int j = 0; j < 100; j++){} // important line
}
}
int main() {
if(stick_to_core()){
printf("Error on sticking to the core\n");
return 1;
}
int EventSet = PAPI_NULL;
int* EventSetPtr = &EventSet;
init_papi(EventSetPtr, cpuid);
rng_arr = (uint64_t*) malloc(RNG_SIZE * sizeof(uint64_t));
precompute_rng(cpuid);
int iter = 4096;
uint64_t* results = (uint64_t*) malloc(iter * sizeof(uint64_t));
for(int i = 0; i < iter; i++)
results[i] = 0;
warmup();
for(int i = 0; i < 20; i++){
results[i] = measurement(&EventSet, RNG_SIZE, rng_arr);
usleep(1200);
}
// prints
for(int i = 0; i < 20; i++){
printf("%d:%ld\n", i, results[i]);
}
printf("\n");
free(results);
return 0;
}
Compile with
gcc -O0 main.c -lpthread -lpapi -o exp

Measuring speed up of a multi threaded C program (implementation using Pthreads)

I currently have a multi-threaded C program coded using Pthreads which uses 2 threads. I want to increase the no. of threads and measure speed up upon doing so. I would like to run my code in an automated manner where the no. of threads used keeps getting incremented and I want to graphically display running times of my code. I would love it if I could get a clue in on how to do so especially on how to automate the entire process and plotting it graphically. Here is my code:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#define NUM_THREADS 2
#define VECTOR_SIZE 40
struct DOTdata
{
/* data */
long X[VECTOR_SIZE];
long Y[VECTOR_SIZE];
long sum;
long compute_length;
};
struct DOTdata dotstr;
pthread_mutex_t mutex_sum;
void *calcDOT(void *);
int main(int argc, char *argv[])
{
long vec_index;
for(vec_index = 0 ; vec_index < VECTOR_SIZE ; vec_index++){
dotstr.X[vec_index] = vec_index + 1;
dotstr.Y[vec_index] = vec_index + 2;
}
dotstr.sum = 0;
dotstr.compute_length = VECTOR_SIZE/NUM_THREADS;
pthread_t call_thread[NUM_THREADS];
pthread_attr_t attr;
void *status;
pthread_mutex_init(&mutex_sum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
long i;
for(i = 0 ; i < NUM_THREADS ; i++){
pthread_create(&call_thread[i], &attr, calcDOT, (void *)i);
}
pthread_attr_destroy(&attr);
for (i = 0 ; i < NUM_THREADS ; i++){
pthread_join(call_thread[i], &status);
}
printf("Resultant X*Y is %ld\n", dotstr.sum);
pthread_mutex_destroy(&mutex_sum);
pthread_exit(NULL);
}
void *calcDOT(void *thread_id)
{
long vec_index;
long start_index;
long end_index;
long length;
long offset;
long sum = 0;
offset = (long)thread_id;
length = dotstr.compute_length;
start_index = offset * length;
end_index = (start_index + length) - 1;
for(vec_index = start_index ; vec_index < end_index ; vec_index++){
sum += (dotstr.X[vec_index] * dotstr.Y[vec_index]);
}
pthread_mutex_lock(&mutex_sum);
dotstr.sum += sum;
pthread_mutex_unlock(&mutex_sum);
pthread_exit((void *)thread_id);
}
I would like to increment my NUM_THREADS parameter and run it after each increment, record the execution time after each increment and plot a graph of execution time vs number of threads.
I tried a naive approach by increasing the number of threads, timing it with time.h and plotting it with gnuplot. Each iteration we double the number of threads and we print the time for an iteration. We use gnuplot to display a graph with number of threads on the x-axis and execution time on the y-axis
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NUM_THREADS 2
#define VECTOR_SIZE 40
struct DOTdata {
/* data */
long X[VECTOR_SIZE];
long Y[VECTOR_SIZE];
long sum;
long compute_length;
};
struct DOTdata dotstr;
pthread_mutex_t mutex_sum;
void *calcDOT(void *);
int main(int argc, char *argv[]) {
double xvals[VECTOR_SIZE / NUM_THREADS];
double yvals[VECTOR_SIZE / NUM_THREADS];
int index = 0;
for (int count = NUM_THREADS; count < VECTOR_SIZE / NUM_THREADS; count = count * 2) {
clock_t begin = clock();
long vec_index;
for (vec_index = 0; vec_index < VECTOR_SIZE; vec_index++) {
dotstr.X[vec_index] = vec_index + 1;
dotstr.Y[vec_index] = vec_index + 2;
}
dotstr.sum = 0;
dotstr.compute_length = VECTOR_SIZE / count;
pthread_t call_thread[count];
pthread_attr_t attr;
void *status;
pthread_mutex_init(&mutex_sum, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
long i;
for (i = 0; i < count; i++) {
pthread_create(&call_thread[i], &attr, calcDOT, (void *) i);
}
pthread_attr_destroy(&attr);
for (i = 0; i < count; i++) {
pthread_join(call_thread[i], &status);
}
printf("Resultant X*Y is %ld\n", dotstr.sum);
pthread_mutex_destroy(&mutex_sum);
clock_t end = clock();
double time_spent = (double) (end - begin) / CLOCKS_PER_SEC;
printf("time spent: %f NUM_THREADS: %d\n", time_spent, count);
xvals[index] = count;
yvals[index] = time_spent;
index++;
}
FILE * gnuplotPipe = popen ("gnuplot -persistent", "w");
fprintf(gnuplotPipe, "plot '-' \n");
for (int i = 0; i < VECTOR_SIZE / NUM_THREADS; i++)
{
fprintf(gnuplotPipe, "%lf %lf\n", xvals[i], yvals[i]);
}
fprintf(gnuplotPipe, "e");
pthread_exit(NULL);
}
void *calcDOT(void *thread_id) {
long vec_index;
long start_index;
long end_index;
long length;
long offset;
long sum = 0;
offset = (long) thread_id;
length = dotstr.compute_length;
start_index = offset * length;
end_index = (start_index + length) - 1;
for (vec_index = start_index; vec_index < end_index; vec_index++) {
sum += (dotstr.X[vec_index] * dotstr.Y[vec_index]);
}
pthread_mutex_lock(&mutex_sum);
dotstr.sum += sum;
pthread_mutex_unlock(&mutex_sum);
pthread_exit((void *) thread_id);
}
Output
Resultant X*Y is 20900
time spent: 0.000155 NUM_THREADS: 2
Resultant X*Y is 19860
time spent: 0.000406 NUM_THREADS: 4
Resultant X*Y is 17680
time spent: 0.000112 NUM_THREADS: 8
Resultant X*Y is 5712
time spent: 0.000587 NUM_THREADS: 16

No real acceleration with pthreads

I am trying to implement the multithreaded version of the Monte-Carlo algorithm. Here is my code:
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#include <math.h>
#include <semaphore.h>
#include <errno.h>
#include <stdbool.h>
#include <string.h>
#define MAX_THREADS 12
#define MAX_DOTS 10000000
double sum = 0.0;
sem_t sem;
void reset() {
sum = 0.0;
}
void* check_dot(void* _iterations) {
int* iterations = (int*)_iterations;
for(int i = 0; i < *iterations; ++i) {
double x = (double)(rand() % 314) / 100;
double y = (double)(rand() % 100) / 100;
if(y <= sin(x)) {
sem_wait(&sem);
sum += x * y;
sem_post(&sem);
}
}
return NULL;
}
void* check_dots_advanced(void* _iterations) {
int* iterations = (int*)_iterations;
double* res = (double*)malloc(sizeof(double));
for(int i = 0; i < *iterations; ++i) {
double x = (double)(rand() % 314) / 100;
double y = (double)(rand() % 100) / 100;
if(y <= sin(x)) *res += x * y;
}
pthread_exit((void*)res);
}
double run(int threads_num, bool advanced) {
if(!advanced) sem_init(&sem, 0, 1);
struct timespec begin, end;
double elapsed;
pthread_t threads[threads_num];
int iters = MAX_DOTS / threads_num;
for(int i = 0; i < threads_num; ++i) {
if(!advanced) pthread_create(&threads[i], NULL, &check_dot, (void*)&iters);
else pthread_create(&threads[i], NULL, &check_dots_advanced, (void*)&iters);
}
if(clock_gettime(CLOCK_REALTIME, &begin) == -1) {
perror("Unable to get time");
exit(-1);
}
for(int i = 0; i < threads_num; ++i) {
if(!advanced) pthread_join(threads[i], NULL);
else {
void* tmp;
pthread_join(threads[i], &tmp);
sum += *((double*)tmp);
free(tmp);
}
}
if(clock_gettime(CLOCK_REALTIME, &end) == -1) {
perror("Unable to get time");
exit(-1);
}
if(!advanced) sem_destroy(&sem);
elapsed = end.tv_sec - begin.tv_sec;
elapsed += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
return elapsed;
}
int main(int argc, char** argv) {
bool advanced = false;
char* filename = NULL;
for(int i = 1; i < argc; ++i) {
if(strcmp(argv[i], "-o") == 0 && argc > i + 1) {
filename = argv[i + 1];
++i;
}
else if(strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--advanced") == 0) {
advanced = true;
}
}
if(!filename) {
fprintf(stderr, "You should provide the name of the output file.\n");
exit(-1);
}
FILE* fd = fopen(filename, "w");
if(!fd) {
perror("Unable to open file");
exit(-1);
}
srand(time(NULL));
double worst_time = run(1, advanced);
double result = (3.14 / MAX_DOTS) * sum;
reset();
fprintf(fd, "Result: %f\n", result);
for(int i = 2; i <= MAX_THREADS; ++i) {
double time = run(i, advanced);
double accel = time / worst_time;
fprintf(fd, "%d:%f\n", i, accel);
reset();
}
fclose(fd);
return 0;
}
However, I can't see any real acceleration with increasing the number of threads (and it does not matter what check_dot() function I am using). I have tried to execute this code on my laptop with Intel Core i7-3517u (lscpu says that it has 4 independent CPUs) and it looks like the number of threads not really influences the execution time of my program:
Number of threads: 1, working time: 0.847277 s
Number of threads: 2, working time: 3.133838 s
Number of threads: 3, working time: 2.331216 s
Number of threads: 4, working time: 3.011819 s
Number of threads: 5, working time: 3.086003 s
Number of threads: 6, working time: 3.118296 s
Number of threads: 7, working time: 3.058180 s
Number of threads: 8, working time: 3.114867 s
Number of threads: 9, working time: 3.179515 s
Number of threads: 10, working time: 3.025266 s
Number of threads: 11, working time: 3.142141 s
Number of threads: 12, working time: 3.064318 s
I supposed that it should be some kind of linear dependence between the execution time and number of working threads for at least four first values (the more threads are working the less is execution time), but here I have pretty equal time values. Is it a real problem in my code or I am too demanding?
The problem you are experiencing is that the internal state of rand() is a shared resource between all threads, so the threads are going to serialise on access to rand().
You need to use a pseudo-random number generator with per-thread state - the rand_r() function (although marked obsolete in the latest version of POSIX) can be used as such. For serious work you would be best off importing the implementation of some specific PRNG algorithm such as Mersenne Twister.
I was able to collect the timing / scaling measurements that you would desire with two changes to your code.
First, rand() is not thread safe. Replacing the calls with calls to rand_r(seed) in the advanced check_dots showed continual scaling as threads increased. I think rand might have an internal lock that is serializing execution and preventing any speedup. This change alone shows some scaling, from 1.23s -> 0.55 sec (5 threads).
Second, I introduced barriers around the core execution region so that the cost of serially creating/joining threads and the malloc calls is not included. The core execution region shows good scaling, from 1.23sec -> 0.18sec (8 threads).
Code was compiled with gcc -O3 -pthread mcp.c -std=c11 -lm, run on Intel E3-1240 v5 (4 cores, HT), Linux 3.19.0-68-generic. Single measurements reported.
pthread_barrier_t bar;
void* check_dots_advanced(void* _iterations) {
int* iterations = (int*)_iterations;
double* res = (double*)malloc(sizeof(double));
sem_wait(&sem);
unsigned int seed = rand();
sem_post(&sem);
pthread_barrier_wait(&bar);
for(int i = 0; i < *iterations; ++i) {
double x = (double)(rand_r(&seed) % 314) / 100;
double y = (double)(rand_r(&seed) % 100) / 100;
if(y <= sin(x)) *res += x * y;
}
pthread_barrier_wait(&bar);
pthread_exit((void*)res);
}
double run(int threads_num, bool advanced) {
sem_init(&sem, 0, 1);
struct timespec begin, end;
double elapsed;
pthread_t threads[threads_num];
int iters = MAX_DOTS / threads_num;
pthread_barrier_init(&bar, NULL, threads_num + 1); // barrier init
for(int i = 0; i < threads_num; ++i) {
if(!advanced) pthread_create(&threads[i], NULL, &check_dot, (void*)&iters);
else pthread_create(&threads[i], NULL, &check_dots_advanced, (void*)&iters);
}
pthread_barrier_wait(&bar); // wait until threads are ready
if(clock_gettime(CLOCK_REALTIME, &begin) == -1) { // begin time
perror("Unable to get time");
exit(-1);
}
pthread_barrier_wait(&bar); // wait until threads finish
if(clock_gettime(CLOCK_REALTIME, &end) == -1) { // end time
perror("Unable to get time");
exit(-1);
}
for(int i = 0; i < threads_num; ++i) {
if(!advanced) pthread_join(threads[i], NULL);
else {
void* tmp;
pthread_join(threads[i], &tmp);
sum += *((double*)tmp);
free(tmp);
}
}
pthread_barrier_destroy(&bar);

Issues with pointers when passing a struct to a thread on Win32 API

The user provides command line arguments that are used to compute the number of partitions, and the number of threads, where each thread does a minimum linear search of a specific partition of the large array. Each minimum value found by a thread is stored inside a small global array. The main function then does a minimum linear search of the small array, and also a minimum search of the large array and confirms that the minimum found in both small and large array are equal. The problem that I am encountering is that the minimums inside the small global array are sometimes garbage, and sometimes matches the minimum found in the large array. I have tried to figure out the problem but I don't seem to find it. Your help will be really appreciated. I am coding in C, using Dev-C++ on win32 API. The code is bellow:
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#define RAND_DIVISOR 800
int number_items = 0;
int size = 1;
int partits = 1;
int P = 0;
int N = 0;
int Index = 0;
int index_global = 0;
int min;
#define NUM_THREADS 65536 //or 2^16
typedef struct thread_data
{
int thread_id;
int a;
int b;
int * copy_array;
int * glob_array;
int nbr_items;
int subarraysize;
} s_param, *p_s_param;
int compare (const void *a, const void *b)
{
return( *(int*)a - *(int*)b);
}
DWORD WINAPI CompMin( LPVOID lpParam )
{
int i, tmp;
int SubArSize,nbrItems,thrid;
p_s_param param2;
param2 = (p_s_param)lpParam;
min = param2->copy_array[Index];
min = param2->copy_array[param2->a];
param2->glob_array[index_global] = min;
Index++;
index_global++;
}
int main(int argc, char *argv[])
{
int sub_array_size;
p_s_param pDataArray[NUM_THREADS];
DWORD dwThreadIdArray[NUM_THREADS];
HANDLE hThreadArray[NUM_THREADS];
HANDLE myhandle;
//pthread_t thID, thread;
p_s_param param[NUM_THREADS];
int rNum, rc = 0, i, j, large_min;
double time1, time2, time3, time4;
//get initial timestamp in micro seconds
struct timeval tv;
gettimeofday( &tv, NULL );
time1 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
printf( "Start timestamp: %f\n", time1 );
if(argc < 2 )
{
printf("Need %d arguments, only %d provided\n", 2, argc);
printf("The program will exit now!\n");
return 1;
}
P = atoi(argv[1]); /* will be used to define size of large array */
N = atoi(argv[2]); /* will be used to define number of threads */
if(N>P)
{
printf(" Argument 1 should be greater than argument 2\n");
printf("The program will exit now!\n");
return 1;
}
/*compute the size of the array*/
for (i=1; i<=P; i++)
size = size * 2;
/*Create a dynamic array of size size*/
int *array = (int*) malloc(size*sizeof(int));
srand(time(NULL));
for (i=0; i<size; i++)
{
rNum = rand() / RAND_DIVISOR;
array[i] = rNum;
}
/*compute the number of partitions*/
for (i = 1; i<=N; i++)
partits = partits * 2;
/*numbers of elements per sub array*/
sub_array_size = size/partits;
/*Global array*/
int *Globalarray = (int*) malloc(partits*sizeof(int));
for (i=0; i<partits; i++)
{
/*Allocate memory for thread data*/
param[i] = (p_s_param) HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(s_param));
if( param[i] == NULL)
{
ExitProcess(2);
}
param[i]->a=i;
param[i]->copy_array=array;
param[i]->glob_array = Globalarray;
hThreadArray[i] = CreateThread(NULL, 0, CompMin, param[i], 0, &dwThreadIdArray[i]);
if(hThreadArray[i] == NULL)
{
puts("Error, cannot create Thread!");
puts(strerror(errno));
ExitProcess(3);
}
//printf("Number of partitions: %d\n",partits );
} WaitForMultipleObjects(NUM_THREADS,hThreadArray, TRUE, INFINITE);
/*find mimimum value from Global array returned by threads*/
min = Globalarray[0];
for(i = 0; i< partits; i++)
{
printf("Index: %d, value into small array: %d\n",i, Globalarray[i] );
if(Globalarray[i] < min)
min = Globalarray[i];
}
gettimeofday( &tv, NULL );
time2 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
gettimeofday( &tv, NULL );
time3 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*sorting the large array in ascending order and find minimum value*/
//qsort(array,size, sizeof(int), compare);
large_min = array[0];
for(i = 0; i< partits; i++)
{
printf("value into large array: %d\n",array[i] );
if(array[i] < large_min)
large_min = array[i];
}
//large_min = array[0];
gettimeofday( &tv, NULL );
time4 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*display result*/
printf("Min from small array : %d\n", min);
printf("Min from large array : %d\n", large_min);
if(min == large_min)
printf("Same minimum found in small and large array! : %d\n", large_min);
else
{
printf("error!, the min from small %d array is different from large array %d!\n", min, array[0]);
return 1;
}
printf("length of time recorded to search min in small array: %f\n", time2-time1);
printf("length of time recorded to search min in large array: %f\n", time4-time3);
free((void*) Globalarray);
free((void*) array);
exit (0);
}
I just added a sleep(3) after the wait, and it fixed the problem.
Your CompMin() function is not thread-safe. It is accessing global variables that are shared and modified by multiple threads at the same time, so they are going to step over each other's data as they run in parallel to each other. You need to make your work data self-contained so each thread is only operating on the data it is given to work on, and get rid of your shared globals altogether. You designed your thread_data struct to allow partitioning the array data, but you are not actually utilizing that functionality, so each thread is not searching its individual partition of data, and not storing its search result in its individual section of the global array.
You are also passing the wrong number of thread handles to WaitForMultipleObjects(), so it will fail to wait, which you are not checking for, and then you move on to process the array data before they are actually ready to be processed.
Your are also searching the arrays incorrectly after the threads have finished running, so you are not going to end up with the correct results.
Try something more like this instead:
#include <inttypes.h>
#include <stdlib.h>
#include <stdio.h>
#include <tchar.h>
#include <windows.h>
#define RAND_DIVISOR 800
typedef struct thread_data
{
DWORD thread_id;
int *items;
int nbr_items;
int min;
} s_thread_data, *p_s_thread_data;
/*
int compare (const void *a, const void *b)
{
return( *(int*)a - *(int*)b);
}
*/
DWORD WINAPI CompMin( LPVOID lpParam )
{
p_s_thread_data data = (p_s_thread_data) lpParam;
int i;
data->min = data->items[0];
for(i = 1; i < data->nbr_items; i++)
{
if(data->items[i] < data->min)
data->min = data->items[i];
}
return 0;
}
int main(int argc, char *argv[])
{
int size = 1;
int partits = 1;
int sub_array_size;
int i;
if(argc < 2 )
{
printf("Need %d arguments, only %d provided\n", 2, argc);
printf("The program will exit now!\n");
return 1;
}
int P = atoi(argv[1]); /* will be used to define size of large array */
if(P < 1)
{
printf(" Argument 1 should be greater than zero\n");
printf("The program will exit now!\n");
return 1;
}
int N = atoi(argv[2]); /* will be used to define number of threads */
if(N < 1)
{
printf(" Argument 2 should be greater than zero\n");
printf("The program will exit now!\n");
return 1;
}
/*compute the size of the large array*/
for (i=1; i<=P; i++)
size = size * 2;
/*Allocate memory for large array*/
int *array = (int*) malloc(size*sizeof(int));
if(array == NULL)
return 2;
srand(time(NULL));
/*Fill the large array with random data*/
for (i=0; i<size; i++)
array[i] = rand() / RAND_DIVISOR;
/*compute the number of partitions*/
for (i = 1; i<=N; i++)
partits = partits * 2;
//printf("Number of partitions: %d\n", partits );
/*numbers of elements per partition*/
sub_array_size = size/partits;
/*Allocate memory for thread data*/
p_s_thread_data ThreadDataArray = (p_s_thread_data) malloc(partits*sizeof(s_thread_data));
if(ThreadDataArray == NULL)
return 2;
memset(ThreadDataArray, 0, partits*sizeof(s_thread_data));
/*Allocate memory for thread handles array*/
HANDLE *hThreadArray = (HANDLE*) malloc(partits*sizeof(HANDLE));
if(hThreadArray == NULL)
return 2;
memset(hThreadArray, 0, partits*sizeof(HANDLE));
double time1, time2, time3, time4;
//get initial timestamp in micro seconds
struct timeval tv;
gettimeofday( &tv, NULL );
time1 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
printf( "Start timestamp: %f\n", time1 );
for (i=0; i<partits; i++)
{
ThreadDataArray[i].items = &array[i*sub_array_size];
ThreadDataArray[1].nbr_items = sub_array_size;
hThreadArray[i] = CreateThread(NULL, 0, CompMin, &param[i], 0, &(param[i].thread_id));
if(hThreadArray[i] == NULL)
{
printf("Error, cannot create Thread! %s\n", strerror(errno));
return 3;
}
}
/*Wait for threads to finish*/
i = 0;
int nbr_handles = partits;
while (nbr_handles >= MAXIMUM_WAIT_OBJECTS)
{
if (WaitForMultipleObjects(MAXIMUM_WAIT_OBJECTS, &hThreadArray[i], TRUE, INFINITE) != WAIT_OBJECT_0)
return 4;
i = i + MAXIMUM_WAIT_OBJECTS;
nbr_handles = nbr_handles - MAXIMUM_WAIT_OBJECTS;
}
if (nbr_handles > 0)
{
if (WaitForMultipleObjects(nbr_handles, &hThreadArray[i], TRUE, INFINITE) != WAIT_OBJECT_0)
return 4;
}
/*find minimum value from thread results*/
int min = ThreadDataArray[0].min;
for(i = 0; i < partits; i++)
{
printf("Index: %d, value into small array: %d\n",i, ThreadDataArray[i].min );
if(ThreadDataArray[i].min < min)
min = ThreadDataArray[i].min;
}
gettimeofday( &tv, NULL );
time2 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
gettimeofday( &tv, NULL );
time3 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*sorting the large array in ascending order and find minimum value*/
//qsort(array,size, sizeof(int), compare);
int large_min = array[0];
for(i = 0; i < size; i++)
{
printf("value into large array: %d\n", array[i] );
if(array[i] < large_min)
large_min = array[i];
}
gettimeofday( &tv, NULL );
time4 = tv.tv_sec + ( tv.tv_usec / 1000000.0 );
/*display result*/
printf("Min from small array : %d\n", min);
printf("Min from large array : %d\n", large_min);
if(min == large_min)
printf("Same minimum found in small and large array! : %d\n", large_min);
else
{
printf("error!, the min from small array (%d) is different from large array (%d)!\n", min, large_min);
return 1;
}
printf("length of time recorded to search min in small array: %f\n", time2-time1);
printf("length of time recorded to search min in large array: %f\n", time4-time3);
free(array);
free(ThreadDataArray);
free(hThreadArray);
return 0;
}

Resources