How to parallelize monte carlo estimation of pi using pthreads? - c

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
int main(int argc, char **argv)
{
unsigned long long in = 1;
unsigned long long total = 2;
double tol , change, new, secs , old = 0.0;
struct timeval start , end;
int threads ; /* ignored */
if ( argc < 2) {
exit (-1);
}
threads = atoi ( argv[1]);
tol = atof ( argv[2]);
if (( threads < 1) || ( tol < 0.0)) {
exit (-1);
}
tol = tol *tol;
srand48(clock());
gettimeofday (&start , NULL);
do
{
double x, y;
x = drand48();
y = drand48();
total ++;
if (( x*x + y*y) <= 1.00)
in ++;
new = 4.0 * (double)in/( double)total ;
change = fabs (new - old);
old = new;
}while (change > tol );
gettimeofday (&end, NULL);
secs = (( double)end.tv_sec - (double)start.tv_sec )
+ (( double)end.tv_usec - (double)start.tv_usec )/1000000.0;
printf ( ”Found estimate of pi of %.12f in %llu iterations , %.6f seconds.n n”,
new, total - 2, secs );
}
The code above is a sequential program that takes an argument for the tolerance for how closely to estimate pi. Once the change in these old and new values fall below the tolerance it exits.
I have to parallelize this program in pthreads. I am not trying to have someone do it for me but rather to get some pointers and ideas to think about so that I may be able to this. the pthreads program will take number of threads and tolerance as it's argument and output the estimation. I am very new to parallel programs and don't really know where to start so I will take any advice. Thanks.

Each thread should keep its own in and total counts and use a better exit condition. Then add up the in and total values from each thread.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
void* Function(void* i);
#define MAX_THREADS 200
unsigned long long total[MAX_THREADS] = {0}; //total points for threads
unsigned long long in[MAX_THREADS] = {0}; //points inside for threads
double tolerance, change, new, old = 0.0;
long thread_num;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
int main(int argc, char **argv)
{
long i;
struct timeval start, end;
double secs;
unsigned long long master_total;
pthread_t* threads;
if (argc != 3){
printf("\nMust pass 2 arguments: (Tolerance) (# of Threads)");
exit(-1);
}
thread_num = atoi ( argv[1]);
tolerance = atof ( argv[2]);
if (( thread_num < 1) || ( tolerance < 0.0) || (thread_num > 200)) {
printf("\nIncorrect tolerance or threads.");
exit (-1);
}
threads = malloc(thread_num*sizeof(pthread_t)); //allocating space for threads
tolerance = tolerance * tolerance;
change = 0.5;
srand48(clock());
gettimeofday (&start, NULL);
for( i = 0; i < thread_num; i++ ){
pthread_create(&threads[i], NULL, Function, (void*)i);
}
for( i = 0; i < thread_num; i++ ){
pthread_join(threads[i], NULL);
}
gettimeofday (&end, NULL);
master_total = 0;
for( i = 0; i < thread_num; i++ ){
master_total = master_total + total[i];
}
secs = (( double)end.tv_sec - (double)start.tv_sec )
+ (( double)end.tv_usec - (double)start.tv_usec )/1000000.0;
printf ( "Estimation of pi is %.12f in %llu iterations , %.6f seconds.\n", new, master_total, secs );
}
//Each thread will calculate it's own points for in and total
//Per 1000 points it will calculate new and old values and compare to tolerance
//If desired accuracy is met the threads will return.
void* Function(void* i){
/*
rc - return code
total[i], in[i] - threads own number of calculated points
my_total, my_in - Each thread calculates global in and total, per 1000 points and calculates tolerance
*/
long my_spot = (long) i;
long rc;
long j;
unsigned long long my_total;
unsigned long long my_in;
do
{
double x, y;
x = drand48();
y = drand48();
total[my_spot]++;
if (( x*x + y*y) <= 1.00){
in[my_spot]++;
}
if(total[my_spot] % 1000 == 0){
while ( j < thread_num){
my_total = my_total + total[j];
my_in = my_in + in[j];
}
my_total = my_total;
//critical section
//old, new, and change are all global
rc = pthread_mutex_lock(&mutex);
new = 4.0 * (double)my_in/( double)my_total;
change = fabs (new - old);
old = new;
rc = pthread_mutex_unlock(&mutex);
}
}while (change > tolerance );
return NULL;
}
This is what I whipped up but I am getting errors. It just stops. I just have the threads break out of a loop and return then the main thread joins them. Any advice on what I am doing here?
I run it and it seems that all the threads get locked out when they reach mutex lock. I have each thread check the change in pi every 1000 points.

Related

program doesen't return nothing if there isn't printf in for

I am trying to generate random arrays in my program.
If in the for i put printf("%d", i); my programs run and take his time to printf all of the values, and then print "end", but if i comment the printf in the for, when i execute the program after 1-2 sec it end without giving any type of result back (no error, no printf).
#include<stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
//Costanti
static double E = 0.001; // Errore relativo massimo ammissibile
static int nMin = 100; // A = nMin
static int nMax = 5000000;
double B;
double duration(struct timespec start, struct timespec end) {
return end.tv_sec - start.tv_sec
+ ((end.tv_nsec - start.tv_nsec) / (double)1000000000.0);
}
double getResolution() {
struct timespec start, end;
clock_gettime(CLOCK_MONOTONIC, &start);
do {
clock_gettime(CLOCK_MONOTONIC, &end);
} while (duration(start, end) == 0.0);
return duration(start, end);
}
int main() {
// Inizializzazione variabili per il calcolo del tempo
double tMin = getResolution() * ((1 / E) + 1);
B = exp((log(nMax) - log(nMin)) / 99);
srand(time(NULL));
// Generazione Input per l'algoritmo
//struct timespec inizio, fine;
//clock_gettime(CLOCK_MONOTONIC, &inizio);
for (int j = 0; j < 100; j++) {
int n = nMin * pow(B,j);
int array[n];
for (int i = 0; i < n; i++) {
array[i] = rand();
//printf("%d", i);
}
}
//clock_gettime(CLOCK_MONOTONIC, &fine);
//double quantodura = duration(inizio, fine);
//printf("generation time: %f", quantodura);
printf("ciao");
return 0;
}
Even if I comment all of the struct timespec inizio,fine; clock_gettime ecc. it doesen't work
It didn't return nothing. A program always returns an exit code. (It can be obtained using echo $? if using "sh".) I get 139, indicating a segmentation violation on my system. Using -fsanitize=address identifies a stack overflow as the cause.
AddressSanitizer:DEADLYSIGNAL
=================================================================
==1==ERROR: AddressSanitizer: stack-overflow on address 0x7fff91751728 (pc 0x00000040175f bp 0x7fff92031910 sp 0x7fff91751730 T0)
#0 0x40175f in main /app/example.c:46
#1 0x7f5eafe2c0b2 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x240b2)
#2 0x40117d in _start (/app/output.s+0x40117d)
SUMMARY: AddressSanitizer: stack-overflow /app/example.c:46 in main
==1==ABORTING
Line 46 is int array[n];. This line is creating ever bigger arrays. Eventually, the array to create is so large that it can't be accommodated by the stack. (This happened when n was 2,326,588 in my test. B was 1.115,487, and j was 92.) You'll need to allocate such large arrays on the heap (e.g. using malloc) instead of the stack.

Increasing a global counter variable from within a thread without having to wait for each individual thread

My goal is to create a program that evaluates the performance gains from increasing the number of threads the program can use. I evaluate the performance by using the Monte Carlo method to calculate pi. Each thread should create 1 random coordinate (x,y) and check if that coordinate is within the circle or not. If it is, the inCircle counter should increase. Pi is calculated as follows: 4 * inCircle/trys. Using pthread_join, there is no performance gains in a problem that should benefit from multiple threads. Is there some way to allow multiple threads to increase a counter without having to wait for each individual thread?
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <stdbool.h>
#include <pthread.h>
#define nPoints 10000000
#define NUM_THREADS 16
int inCircle = 0;
int count = 0;
double x,y;
pthread_mutex_t mutex;
bool isInCircle(double x, double y){
if(x*x+y*y<=1){
return true;
}
else{
return false;
}
}
void *piSlave(){
int myCount = 0;
time_t now;
time(&now);
srand((unsigned int)now);
for(int i = 1; i <= nPoints/NUM_THREADS; i++) {
x = (double)rand() / (double)RAND_MAX;
y = (double)rand() / (double)RAND_MAX;
if(isInCircle(x,y)){
myCount++;
}
}
pthread_mutex_lock(&mutex);
inCircle += myCount;
pthread_mutex_unlock(&mutex);
pthread_exit(0);
}
double piMaster()
{
pthread_t threads[NUM_THREADS];
int rc;
long t;
for(t=0; t<NUM_THREADS; t++){
printf("Creating thread %ld\n", t);
rc = pthread_create(&threads[t], NULL, piSlave, (void *)t);
if (rc){
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
//pthread_join(threads[t], NULL);
}
//wait(NULL);
return 4.0*inCircle/nPoints;
}
int main()
{
printf("%f\n",piMaster());
return(0);
}
There are few issues with the code.
Wait for Thread Termination
The piMaster() function should wait for the threads it created. We can do this by simply running pthread_join() in a loop:
for (t = 0; t < NUM_THREADS; t++)
pthread_join(threads[t], NULL);
Avoid Locks
We can simply atomically increase the inCircle counter at the end of the loop, so no locks are needed. The variable must be declared with _Atomic keyword as described in the Atomic operations C reference:
_Atomic long inCircle = 0;
void *piSlave(void *arg)
{
[...]
inCircle += myCount;
[...]
}
This will generate correct CPU instructions to atomically increase the variable. For example, for x86 architecture a lock prefix appears as we can confirm in disassembly:
29 inCircle += myCount;
0x0000000100000bdb <+155>: lock add %rbx,0x46d(%rip) # 0x100001050 <inCircle>
Avoid Slow and Thread Unsafe rand()
Instead, we can simply scan the whole circle in a loop as described on Approximations of Pi Wikipedia page:
for (long x = -RADIUS; x <= RADIUS; x++)
for (long y = -RADIUS; y <= RADIUS; y++)
myCount += isInCircle(x, y);
So here is the code after the changes above:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#define RADIUS 10000L
#define NUM_THREADS 10
_Atomic long inCircle = 0;
inline long isInCircle(long x, long y)
{
return x * x + y * y <= RADIUS * RADIUS ? 1 : 0;
}
void *piSlave(void *arg)
{
long myCount = 0;
long tid = (long)arg;
for (long x = -RADIUS + tid; x <= RADIUS + tid; x += NUM_THREADS)
for (long y = -RADIUS; y <= RADIUS; y++)
myCount += isInCircle(x, y);
printf("\tthread %ld count: %zd\n", tid, myCount);
inCircle += myCount;
pthread_exit(0);
}
double piMaster()
{
pthread_t threads[NUM_THREADS];
long t;
for (t = 0; t < NUM_THREADS; t++) {
printf("Creating thread %ld...\n", t);
if (pthread_create(&threads[t], NULL, piSlave, (void *)t)) {
perror("Error creating pthread");
exit(-1);
}
}
for (t = 0; t < NUM_THREADS; t++)
pthread_join(threads[t], NULL);
return (double)inCircle / (RADIUS * RADIUS);
}
int main()
{
printf("Result: %f\n", piMaster());
return (0);
}
And here is the output:
Creating thread 0...
Creating thread 1...
Creating thread 2...
Creating thread 3...
Creating thread 4...
Creating thread 5...
Creating thread 6...
Creating thread 7...
Creating thread 8...
Creating thread 9...
thread 7 count: 31415974
thread 5 count: 31416052
thread 1 count: 31415808
thread 3 count: 31415974
thread 0 count: 31415549
thread 4 count: 31416048
thread 2 count: 31415896
thread 9 count: 31415808
thread 8 count: 31415896
thread 6 count: 31416048
Result: 3.141591

Thread safe bit array?

EDIT TO QUESTION: Is it possible to have thread safe access to a bit array? My implementation below seems to require mutex locks which defeats the purpose of parallelizing.
I've been tasked with creating a parallel implementation of a twin prime generator using pthreads. I decided to use the Sieve of Eratosthenes and to divide the work of marking the factors of known primes. I staggering which factors a thread gets.
For example, if there are 4 threads:
thread one marks multiples 3, 11, 19, 27...
thread two marks multiples 5, 13, 21, 29...
thread two marks multiples 7, 15, 23, 31...
thread two marks multiples 9, 17, 25, 33...
I skipped the even multiples as well as the even base numbers. I've used a bitarray, so I run it up to INT_MAX. The problem I have is at max value of 10 million, the result varies by about 5 numbers, which is how much error there is compared to a known file. The results vary all the way down to about max value of 10000, where it changes by 1 number. Anything below that is error-free.
At first I didn't think there was a need for communication between processes. When I saw the results, I added a pthread barrier to let all the threads catch up after each set of multiples. This didn't make any change. Adding a mutex lock around the mark() function did the trick, but that slows everything down.
Here is my code. Hoping someone might see something obvious.
#include <pthread.h>
#include <stdio.h>
#include <sys/times.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <limits.h>
#include <getopt.h>
#define WORDSIZE 32
struct t_data{
int *ba;
unsigned int val;
int num_threads;
int thread_id;
};
pthread_mutex_t mutex_mark;
void mark( int *ba, unsigned int k )
{
ba[k/32] |= 1 << (k%32);
}
void mark( int *ba, unsigned int k )
{
pthread_mutex_lock(&mutex_mark);
ba[k/32] |= 1 << (k%32);
pthread_mutex_unlock(&mutex_mark);
}
void initBa(int **ba, unsigned int val)
{
*ba = calloc((val/WORDSIZE)+1, sizeof(int));
}
void getPrimes(int *ba, unsigned int val)
{
int i, p;
p = -1;
for(i = 3; i<=val; i+=2){
if(!isMarked(ba, i)){
if(++p == 8){
printf(" \n");
p = 0;
}
printf("%9d", i);
}
}
printf("\n");
}
void markTwins(int *ba, unsigned int val)
{
int i;
for(i=3; i<=val; i+=2){
if(!isMarked(ba, i)){
if(isMarked(ba, i+2)){
mark(ba, i);
}
}
}
}
void *setPrimes(void *arg)
{
int *ba, thread_id, num_threads, status;
unsigned int val, i, p, start;
struct t_data *data = (struct t_data*)arg;
ba = data->ba;
thread_id = data->thread_id;
num_threads = data->num_threads;
val = data->val;
start = (2*(thread_id+2))-1; // stagger threads
i=3;
for(i=3; i<=sqrt(val); i+=2){
if(!isMarked(ba, i)){
p=start;
while(i*p <= val){
mark(ba, (i*p));
p += (2*num_threads);
}
}
}
return 0;
}
void usage(char *filename)
{
printf("Usage: \t%s [option] [arg]\n", filename);
printf("\t-q generate #'s internally only\n");
printf("\t-m [size] maximum size twin prime to calculate\n");
printf("\t-c [threads] number of threads\n");
printf("Defaults:\n\toutput results\n\tsize = INT_MAX\n\tthreads = 1\n");
}
int main(int argc, char **argv)
{
int *ba, i, num_threads, opt, output;
unsigned int val;
output = 1;
num_threads = 1;
val = INT_MAX;
while ((opt = getopt(argc, argv, "qm:c:")) != -1){
switch (opt){
case 'q': output = 0;
break;
case 'm': val = atoi(optarg);
break;
case 'c': num_threads = atoi(optarg);
break;
default:
usage(argv[0]);
exit(EXIT_FAILURE);
}
}
struct t_data data[num_threads];
pthread_t thread[num_threads];
pthread_attr_t attr;
pthread_mutex_init(&mutex_mark, NULL);
initBa(&ba, val);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
for(i=0; i < num_threads; i++){
data[i].ba = ba;
data[i].thread_id = i;
data[i].num_threads = num_threads;
data[i].val = val;
if(0 != pthread_create(&thread[i],
&attr,
setPrimes,
(void*)&data[i])){
perror("Cannot create thread");
exit(EXIT_FAILURE);
}
}
for(i = 0; i < num_threads; i++){
pthread_join(thread[i], NULL);
}
markTwins(ba, val);
if(output)
getPrimes(ba, val);
free(ba);
return 0;
}
EDIT: I got rid of the barrier and added a mutex_lock to the mark function. Output is accurate now, but now more than one thread slows it down. Any suggestions on speeding it up?
Your currently implementation of mark is correct, but the locking is extremely coarse-grained - there's only one lock for your entire array. This means that your threads are constantly contending for that lock.
One way of improving performance is to make the lock finer-grained: each 'mark' operation only requires exclusive access to a single integer within the array, so you could have a mutex for each array entry:
struct bitarray
{
int *bits;
pthread_mutex_t *locks;
};
struct t_data
{
struct bitarray ba;
unsigned int val;
int num_threads;
int thread_id;
};
void initBa(struct bitarray *ba, unsigned int val)
{
const size_t array_size = val / WORDSIZE + 1;
size_t i;
ba->bits = calloc(array_size, sizeof ba->bits[0]);
ba->locks = calloc(array_size, sizeof ba->locks[0]);
for (i = 0; i < array_size; i++)
{
pthread_mutex_init(&ba->locks[i], NULL);
}
}
void mark(struct bitarray ba, unsigned int k)
{
const unsigned int entry = k / 32;
pthread_mutex_lock(&ba.locks[entry]);
ba.bits[entry] |= 1 << (k%32);
pthread_mutex_unlock(&ba.locks[entry]);
}
Note that your algorithm has a race-condition: consider the example where num_threads = 4, so Thread 0 starts at 3, Thread 1 starts at 5 and Thread 2 starts at 7. It is possible for Thread 2 to execute fully, marking every multiple of 7 and then start again at 15, before Thread 0 or Thread 1 get a chance to mark 15 as a multiple of 3 or 5. Thread 2 will then do useless work, marking every multiple of 15.
Another alternative, if your compiler supports Intel-style atomic builtins, is to use those instead of a lock:
void mark(int *ba, unsigned int k)
{
__sync_or_and_fetch(&ba[k/32], 1U << k % 32);
}
Your mark() funciton is not threadsafe - if two threads try to set bits within the same int location one might overwrite with 0 a bit that was just set by another thread.

Calculating pi in pthreads

I am trying to calculate pi using the bpp method but my result keeps coming up at 0.The whole idea is for each thread to compute a part of it and the sum of each thread gets summed up using the join method
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#define NUM_THREADS 20
void *pi_function(void *p);//returns the value of pi
pthread_mutex_t mutex1 = PTHREAD_MUTEX_INITIALIZER; //creates a mutex variable
double pi=0,p16=1;int k=0;
double sumvalue=0,sum=0;
main()
{
pthread_t threads[NUM_THREADS]; //creates the number of threads NUM_THREADS
int iret1; //used to ensure that threads are created properly
//pthread_create(thread,attr,start_routine,arg)
int i;
pthread_mutex_init(&mutex1, NULL);
for(i=0;i<NUM_THREADS;i++){
iret1= pthread_create(&threads[i],NULL,pie_function,(void *) i);
if(iret1){
printf("ERROR; return code from pthread_create() is %d\n", iret1);
exit(-1);
}
}
for(i=0;i<NUM_THREADS;i++){
iret1=pthread_join(threads[i],&sumvalue);
if(iret1){
printf("ERROR; return code from pthread_create() is %d\n", iret1);
exit(-1);
}
pi=pi+sumvalue; //my result here keeps returning 0
}
pthread_mutex_destroy(&mutex1);
printf("Main: program completed. Exiting.\n");
printf("The value of pi is : %f\n",pi);
exit(0);
}
void *pie_function(void * p){
int rc;
int k=(int)p;
sumvalue += 1.0/p16 * (4.0/(8* k + 1) - 2.0/(8*k + 4)
- 1.0/(8*k + 5) - 1.0/(8*k+6));
pthread_mutex_lock( &mutex1 ); //locks the share variable pi and p16
p16 *=16;
rc=pthread_mutex_unlock( &mutex1 );
if(rc){
printf("ERROR; return code from pthread_create() is %d\n", rc);
}
pthread_exit(&sumvalue);
}
For your purpose you don't need to have a mutex or other complicated structure. Just have every thread compute on its own local variables. Provide to each thread the address of a double where he receives his k and may return the result, in the same way as you already separate the ptread_t variables for each thread.
To avoid getting 0 as the output. Put pi=pi+sumvalue inside the join for loop.
since pi=pi+sumvalue is executed only once when sumvalue is very small and pi=0. pi=0 is the output. Just put pi=pi+sumvalue in the join for loop.
To get consistent value of pi follow below mentioned details.
You have to make sure two thinks:-
Just make pi as an global variable and remove all global variables. limit the use of many global variable as updating them will become a critical section. Update pi as
pthread mutex lock(&mutex1);
pi += pi + my sum;
pthread mutex unlock(&mutex1);
you can also calculate sum_values as:
void pie_function(void rank) {
long my_rank = (long) rank;
//printf("%ld \n",my_rank);
double factor, sumvalue = 0.0;
long long i;
long long n=1000000;
long long my_n = n/thread_count;
long long my_first_i = my_n*my_rank;
long long my_last_i = my_first_i + my_n;
if (my_first_i % 2 == 0)
factor = 1.0;
else
factor = -1.0;
for (i = my_first_i; i < my_last_i; i++, factor = -factor)
sumvalue+= 4factor/(2i+1);
pthread_exit(&sumvalue);
}

C pthread Segmentation fault

so I was trying to make a GPGPU emulator with c & pthreads but ran into a rather strange problem which I have no idea why its occurring. The code is as below:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <assert.h>
// simplifies malloc
#define MALLOC(a) (a *)malloc(sizeof(a))
// Index of x/y coordinate
#define x (0)
#define y (1)
// Defines size of a block
#define BLOCK_DIM_X (3)
#define BLOCK_DIM_Y (2)
// Defines size of the grid, i.e., how many blocks
#define GRID_DIM_X (5)
#define GRID_DIM_Y (7)
// Defines the number of threads in the grid
#define GRID_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y * GRID_DIM_X * GRID_DIM_Y)
// execution environment for the kernel
typedef struct exec_env {
int threadIdx[2]; // thread location
int blockIdx[2];
int blockDim[2];
int gridDim[2];
float *A,*B; // parameters for the thread
float *C;
} exec_env;
// kernel
void *kernel(void *arg)
{
exec_env *env = (exec_env *) arg;
// compute number of threads in a block
int sz = env->blockDim[x] * env->blockDim[y];
// compute the index of the first thread in the block
int k = sz * (env->blockIdx[y]*env->gridDim[x] + env->blockIdx[x]);
// compute the index of a thread inside a block
k = k + env->threadIdx[y]*env->blockDim[x] + env->threadIdx[x];
// check whether it is in range
assert(k >= 0 && k < GRID_SIZE && "Wrong index computation");
// print coordinates in block and grid and computed index
/*printf("tx:%d ty:%d bx:%d by:%d idx:%d\n",env->threadIdx[x],
env->threadIdx[y],
env->blockIdx[x],
env->blockIdx[y], k);
*/
// retrieve two operands
float *A = &env->A[k];
float *B = &env->B[k];
printf("%f %f \n",*A, *B);
// retrieve pointer to result
float *C = &env->C[k];
// do actual computation here !!!
// For assignment replace the following line with
// the code to do matrix addition and multiplication.
*C = *A + *B;
// free execution environment (not needed anymore)
free(env);
return NULL;
}
// main function
int main(int argc, char **argv)
{
float A[GRID_SIZE] = {-1};
float B[GRID_SIZE] = {-1};
float C[GRID_SIZE] = {-1};
pthread_t threads[GRID_SIZE];
int i=0, bx, by, tx, ty;
//Error location
/*for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
}*/
// Step 1: create execution environment for threads and create thread
for (bx=0;bx<GRID_DIM_X;bx++) {
for (by=0;by<GRID_DIM_Y;by++) {
for (tx=0;tx<BLOCK_DIM_X;tx++) {
for (ty=0;ty<BLOCK_DIM_Y;ty++) {
exec_env *e = MALLOC(exec_env);
assert(e != NULL && "memory exhausted");
e->threadIdx[x]=tx;
e->threadIdx[y]=ty;
e->blockIdx[x]=bx;
e->blockIdx[y]=by;
e->blockDim[x]=BLOCK_DIM_X;
e->blockDim[y]=BLOCK_DIM_Y;
e->gridDim[x]=GRID_DIM_X;
e->gridDim[y]=GRID_DIM_Y;
// set parameters
e->A = A;
e->B = B;
e->C = C;
// create thread
pthread_create(&threads[i++],NULL,kernel,(void *)e);
}
}
}
}
// Step 2: wait for completion of all threads
for (i=0;i<GRID_SIZE;i++) {
pthread_join(threads[i], NULL);
}
// Step 3: print result
for (i=0;i<GRID_SIZE;i++) {
printf("%f ",C[i]);
}
printf("\n");
return 0;
}
Ok this code here runs fine, but as soon as I uncomment the "Error Location" (for loop which assigns A[i] = i and B[i] = i + 1, I get snapped by a segmentation fault in unix, and by these random 0s within C in cygwin. I must admit my fundamentals in C is pretty poor, so it may be highly likely that I missed something. If someone can give an idea on what's going wrong it'd be greatly appreciated. Thanks.
It works when you comment that because i is still 0 when the 4 nested loops start.
You have this:
for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
}
/* What value is `i` now ? */
And then
pthread_create(&threads[i++],NULL,kernel,(void *)e);
^
So pthread_create will try to access some interesting indexes indeed.

Resources