EDIT TO QUESTION: Is it possible to have thread-safe access to a bit array? My implementation below seems to require mutex locks, which defeats the purpose of parallelizing.
I've been tasked with creating a parallel implementation of a twin prime generator using pthreads. I decided to use the Sieve of Eratosthenes and to divide the work of marking the multiples of known primes, staggering which multiples each thread gets.
For example, if there are 4 threads:
thread one marks multiples 3, 11, 19, 27...
thread two marks multiples 5, 13, 21, 29...
thread three marks multiples 7, 15, 23, 31...
thread four marks multiples 9, 17, 25, 33...
I skip the even multiples as well as the even base numbers. I've used a bit array, so I can run it up to INT_MAX. The problem is that at a max value of 10 million, the result differs by about 5 numbers from a known-good list. The error shrinks as the max value drops, down to a max of about 10,000, where it's off by 1 number. Anything below that is error-free.
At first I didn't think there was a need for communication between threads. When I saw the results, I added a pthread barrier to let all the threads catch up after each set of multiples; this didn't make any difference. Adding a mutex lock around the mark() function did the trick, but that slows everything down.
Here is my code. Hoping someone might see something obvious.
#include <pthread.h>
#include <stdio.h>
#include <sys/times.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <limits.h>
#include <getopt.h>
#define WORDSIZE 32
struct t_data{
int *ba;
unsigned int val;
int num_threads;
int thread_id;
};
pthread_mutex_t mutex_mark;
/* original, unsynchronized version:
void mark( int *ba, unsigned int k )
{
ba[k/32] |= 1 << (k%32);
}
*/
/* current version, serialized with a mutex (see EDIT below) */
void mark( int *ba, unsigned int k )
{
pthread_mutex_lock(&mutex_mark);
ba[k/32] |= 1 << (k%32);
pthread_mutex_unlock(&mutex_mark);
}
int isMarked( int *ba, unsigned int k )
{
return (ba[k/32] >> (k%32)) & 1;
}
void initBa(int **ba, unsigned int val)
{
*ba = calloc((val/WORDSIZE)+1, sizeof(int));
}
void getPrimes(int *ba, unsigned int val)
{
int i, p;
p = -1;
for(i = 3; i<=val; i+=2){
if(!isMarked(ba, i)){
if(++p == 8){
printf(" \n");
p = 0;
}
printf("%9d", i);
}
}
printf("\n");
}
void markTwins(int *ba, unsigned int val)
{
int i;
for(i=3; i<=val; i+=2){
if(!isMarked(ba, i)){
if(isMarked(ba, i+2)){
mark(ba, i);
}
}
}
}
void *setPrimes(void *arg)
{
int *ba, thread_id, num_threads;
unsigned int val, i, p, start;
struct t_data *data = (struct t_data*)arg;
ba = data->ba;
thread_id = data->thread_id;
num_threads = data->num_threads;
val = data->val;
start = (2*(thread_id+2))-1; // stagger threads
for(i=3; i<=sqrt(val); i+=2){
if(!isMarked(ba, i)){
p=start;
while(i*p <= val){
mark(ba, (i*p));
p += (2*num_threads);
}
}
}
return 0;
}
void usage(char *filename)
{
printf("Usage: \t%s [option] [arg]\n", filename);
printf("\t-q generate #'s internally only\n");
printf("\t-m [size] maximum size twin prime to calculate\n");
printf("\t-c [threads] number of threads\n");
printf("Defaults:\n\toutput results\n\tsize = INT_MAX\n\tthreads = 1\n");
}
int main(int argc, char **argv)
{
int *ba, i, num_threads, opt, output;
unsigned int val;
output = 1;
num_threads = 1;
val = INT_MAX;
while ((opt = getopt(argc, argv, "qm:c:")) != -1){
switch (opt){
case 'q': output = 0;
break;
case 'm': val = atoi(optarg);
break;
case 'c': num_threads = atoi(optarg);
break;
default:
usage(argv[0]);
exit(EXIT_FAILURE);
}
}
struct t_data data[num_threads];
pthread_t thread[num_threads];
pthread_attr_t attr;
pthread_mutex_init(&mutex_mark, NULL);
initBa(&ba, val);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
for(i=0; i < num_threads; i++){
data[i].ba = ba;
data[i].thread_id = i;
data[i].num_threads = num_threads;
data[i].val = val;
if(0 != pthread_create(&thread[i],
&attr,
setPrimes,
(void*)&data[i])){
perror("Cannot create thread");
exit(EXIT_FAILURE);
}
}
for(i = 0; i < num_threads; i++){
pthread_join(thread[i], NULL);
}
markTwins(ba, val);
if(output)
getPrimes(ba, val);
free(ba);
return 0;
}
EDIT: I got rid of the barrier and added a mutex lock to the mark() function. Output is accurate now, but using more than one thread slows it down. Any suggestions for speeding it up?
Your current implementation of mark is correct, but the locking is extremely coarse-grained: there's only one lock for your entire array, which means your threads are constantly contending for it.
One way of improving performance is to make the lock finer-grained: each 'mark' operation only requires exclusive access to a single integer within the array, so you could have a mutex for each array entry:
struct bitarray
{
int *bits;
pthread_mutex_t *locks;
};
struct t_data
{
struct bitarray ba;
unsigned int val;
int num_threads;
int thread_id;
};
void initBa(struct bitarray *ba, unsigned int val)
{
const size_t array_size = val / WORDSIZE + 1;
size_t i;
ba->bits = calloc(array_size, sizeof ba->bits[0]);
ba->locks = calloc(array_size, sizeof ba->locks[0]);
for (i = 0; i < array_size; i++)
{
pthread_mutex_init(&ba->locks[i], NULL);
}
}
void mark(struct bitarray ba, unsigned int k)
{
const unsigned int entry = k / 32;
pthread_mutex_lock(&ba.locks[entry]);
ba.bits[entry] |= 1 << (k%32);
pthread_mutex_unlock(&ba.locks[entry]);
}
Note that your algorithm has a race condition: consider the example where num_threads = 4, so Thread 0 starts at 3, Thread 1 starts at 5 and Thread 2 starts at 7. It is possible for Thread 2 to execute fully, marking every multiple of 7, and then start again at 15, before Thread 0 or Thread 1 gets a chance to mark 15 as a multiple of 3 or 5. Thread 2 will then do useless work, marking every multiple of 15. (This doesn't affect correctness, since every multiple of 15 is composite anyway; it only wastes time.)
Another alternative, if your compiler supports Intel-style atomic builtins, is to use those instead of a lock:
void mark(int *ba, unsigned int k)
{
__sync_or_and_fetch(&ba[k/32], 1U << k % 32);
}
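If you can use C11, <stdatomic.h> gives you the same thing portably. A minimal sketch, assuming you change the array's element type to an atomic one (and allocate it the same way):
#include <stdatomic.h>

/* ba must be declared and allocated as atomic_uint * for this variant */
void mark(atomic_uint *ba, unsigned int k)
{
    /* atomically ORs the bit in; no mutex needed */
    atomic_fetch_or(&ba[k / 32], 1U << (k % 32));
}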
Your mark() function is not thread-safe: if two threads try to set bits within the same int location, one might overwrite with 0 a bit that was just set by the other.
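To see why, remember that |= is not one step: it compiles to a separate load, OR, and store. One possible interleaving on a shared word (the bit positions here are just an example):
/* Thread A                          Thread B
   tmp_a = ba[0];  // reads 0
                                     tmp_b = ba[0];  // also reads 0
   ba[0] = tmp_a | 0x1;
                                     ba[0] = tmp_b | 0x4;  // the 0x1 bit is lost
*/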
Related
I'm relatively new to multithreaded programming. I wrote a program which calculates the squares from 0 to 10000 and saves them into an array. The sequential program runs much faster than the parallel one. In my parallel program I divided the loop across 8 threads (my machine has 8 cores), but it is much slower! Does anyone have an idea why this is the case? I have added the screenshots of the execution times.
/*Here is the normal program:*/
#define ARRAYSIZE 10000
int main(void) {
int array[ARRAYSIZE];
int i;
for (i=0; i<ARRAYSIZE; i++)
{
array[i]=i*i;
}
return 0;
}
/*Here is the parallelized calculation. Used from http://ramcdougal.com/threads.html*/
#include <stdio.h>
#include <pthread.h>
#define ARRAYSIZE 10000
#define NUMTHREADS 8 /*Cause have 8 core on my machine*/
struct ThreadData {
int start;
int stop;
int* array;
};
void* squarer (struct ThreadData* td);
/* puts i^2 into array positions i=start to stop-1 */
void* squarer (struct ThreadData* td)
{
struct ThreadData* data = (struct ThreadData*) td;
int start=data->start;
int stop=data->stop;
int* array=data->array;
int i;
for(i= start; i<stop; i++)
{
array[i]=i*i;
}
return NULL;
}
int main(void) {
int array[ARRAYSIZE];
pthread_t thread[NUMTHREADS];
struct ThreadData data[NUMTHREADS];
int i;
int tasksPerThread= (ARRAYSIZE + NUMTHREADS - 1)/ NUMTHREADS;
/* Divide work for threads, prepare parameters */
/* This means in my example I divide the loop into 8 regions: 0..1250, 1250..2500, 2500..3750, etc. */
for(i=0; i<NUMTHREADS;i++)
{
data[i].start=i*tasksPerThread;
data[i].stop=(i+1)*tasksPerThread;
data[i].array=array;
data[NUMTHREADS-1].stop=ARRAYSIZE;
}
for(i=0; i<NUMTHREADS;i++)
{
pthread_create(&thread[i], NULL, squarer, &data[i]);
}
for(i=0; i<NUMTHREADS;i++)
{
pthread_join(thread[i], NULL);
}
return 0;
}
You want to have a garden party. In preparation, you must move 8 chairs from the house into the garden. You call a moving company and ask them to send 8 movers. They arrive from across town and quickly complete the task, one chair each. The 8 movers drive back to the other end of the town. When they return, they call you and tell you that the task has been completed.
Question: Would the whole process have gone faster if you had moved the 8 chairs yourself?
Answer: Yes, the actual task (moving 8 chairs a short distance) is far too small to involve a moving company. The time spent on transport back and forth far exceeds the time spent on the task itself.
The example above is similar to what your code does.
Starting 8 threads is equivalent to driving from the other end of town to your house.
Stopping 8 threads is equivalent to returning back.
There is far too much wasted time compared to the size of the task to be solved.
Lesson: Only use multi-threading when the task is sufficiently big.
So for your test, you should increase ARRAYSIZE (a lot). Further, you have to add some code that prevents the compiler from doing optimizations that bypass the array assignments.
Try the code below (it's the OP's code with a few changes).
Single thread
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAYSIZE 1000000000
unsigned array[ARRAYSIZE];
int main(void) {
unsigned i;
for (i=0; i<ARRAYSIZE; i++)
{
array[i]=i*i;
}
srand(time(NULL));
return array[rand() % ARRAYSIZE] > 10000;
}
My result: 1.169 s
Multi thread
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#define ARRAYSIZE 1000000000
unsigned array[ARRAYSIZE];
#define NUMTHREADS 8 /*Cause have 8 core on my machine*/
struct ThreadData {
unsigned start;
unsigned stop;
unsigned* array;
};
/* puts i^2 into array positions i=start to stop-1 */
void* squarer (void* td)
{
struct ThreadData* data = (struct ThreadData*) td;
unsigned start=data->start;
unsigned stop=data->stop;
unsigned* array=data->array;
unsigned i;
for(i= start; i<stop; i++)
{
array[i]=i*i;
}
return NULL;
}
int main(void) {
pthread_t thread[NUMTHREADS];
struct ThreadData data[NUMTHREADS];
int i;
int tasksPerThread= (ARRAYSIZE + NUMTHREADS - 1)/ NUMTHREADS;
/* Divide work for threads, prepare parameters */
/* This divides the loop into 8 regions of ARRAYSIZE/8 elements each */
for(i=0; i<NUMTHREADS;i++)
{
data[i].start=i*tasksPerThread;
data[i].stop=(i+1)*tasksPerThread;
data[i].array=array;
data[NUMTHREADS-1].stop=ARRAYSIZE;
}
for(i=0; i<NUMTHREADS;i++)
{
pthread_create(&thread[i], NULL, squarer, &data[i]);
}
for(i=0; i<NUMTHREADS;i++)
{
pthread_join(thread[i], NULL);
}
srand(time(NULL));
return array[rand() % ARRAYSIZE] > 10000;
}
My result: 0.192 s
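If you want the measurement inside the program rather than from the shell, one option is a small clock_gettime harness; a sketch (the timings above were not necessarily taken this way):
#include <stdio.h>
#include <time.h>

/* returns a monotonic timestamp in seconds */
static double now_seconds(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec / 1e9;
}

/* usage:
   double t0 = now_seconds();
   ... the work to measure ...
   printf("%.3f s\n", now_seconds() - t0);
*/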
First time asking a question, hope it will be productive :)
I have 10 threads running, and I need the main to print 2 things:
Each value as it returns from a thread.
When all the threads are finished, a vector of all the values, in the same order as they were sent to the threads.
Right now, the program prints the "--->" lines from the thread function, which means the threads finished, but I need to print them from main.
#include <stdio.h>
#include <pthread.h>
#include <semaphore.h>
#include <unistd.h>
//sem_t mutex;
void *myThread(void *args)
{
int argptr=do123(*(int*)args);
printf("--->%d\n",argptr);
// sem_wait(&mutex);
//*(int*)args=do123((int)args);
return (void*)argptr;
}
int main()
{
int nums[10]={17,65,34,91,92,93,33,16,22,75};
int TemPnums[10]={17,65,34,91,92,93,33,16,22,75};
int res[10]={0};
//pthread_t t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
pthread_t theads[10];
for (int i = 0; i < 10; i++) {
res[i]=nums[i];
pthread_create(&theads[i], NULL, myThread, &TemPnums[i]);
}
// pthread_join(&theads[10], &status);
for (int i = 0; i < 10; i++) {
void *status;
pthread_join(theads[i], &status);
res[i]=(int)status;
}
for (int i = 0; i < 10; i++) {
printf("%d\n",res[i]);
}
}
int do123(int num)
{
int k=0;
while(num!=1){
if(num%2==1){
num=num*3+1;
k++;
}else{
num=num/2;
k++;
}
}
return k;
}
Output:
--->12
--->92
--->27
--->13
--->17
--->17
--->26
--->14
--->4
--->15
12
27
13
92
17
17
26
4
15
14
The order in which threads execute is not determined by the order in which they were created, and it can vary from system to system (and run to run); pthread_join, however, is called in index order, so res is always filled in the order the work was handed out. This means the "--->" lines I see on my system may appear in a different order on yours. For example, using this modified version of your code (see bottom of post for change notes):
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>
#include <stddef.h> /* for size_t */
//sem_t mutex;
int do123(int); // Added (1)
void *myThread(void *args)
{
size_t argptr = do123(*(int *)args);
printf("--->%d\n", argptr);
// sem_wait(&mutex);
//*(int*)args=do123((int)args);
return (void *)argptr;
}
int main()
{
int nums[10] = {17, 65, 34, 91, 92, 93, 33, 16, 22, 75};
int TemPnums[10] = {17, 65, 34, 91, 92, 93, 33, 16, 22, 75};
int res[10] = {0};
//pthread_t t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
pthread_t theads[10];
for (int i = 0; i < 10; i++)
{
res[i] = nums[i];
pthread_create(&theads[i], NULL, myThread, &TemPnums[i]);
}
// pthread_join(&theads[10], &status);
for (int i = 0; i < 10; i++)
{
void *status;
pthread_join(theads[i], &status);
res[i] = (size_t)status;
}
for (int i = 0; i < 10; i++)
{
printf("%d\n", res[i]);
}
}
int do123(int num)
{
int k = 0;
while (num != 1)
{
if (num % 2 == 1)
{
num = num * 3 + 1;
k++;
}
else
{
num = num / 2;
k++;
}
}
return k;
}
I get the output:
--->12
--->27
--->13
--->92
--->17
--->17
--->26
--->4
--->15
--->14
12
27
13
92
17
17
26
4
15
14
If your goal is to make sure the threads run and produce their values in the same order in which the results are stored in main's array, I suggest implementing a way to block subsequent threads until the current thread has had its value assigned. You can build this with semaphores or mutexes.
Documentation on semaphores: https://www.tutorialspoint.com/how-to-use-posix-semaphores-in-c-language
Documentation on mutexes: https://www.tutorialspoint.com/deadlock-with-mutex-locks
In short, the flow should be: when one thread enters do123(), block all other threads from entering the function; let that thread finish its work, return, and have its result assigned to its respective index in the array; then unlock the next thread and repeat.
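A minimal sketch of that flow with a POSIX semaphore, written as a drop-in replacement for the question's myThread (note that a plain semaphore serializes the threads but does not by itself guarantee first-come-first-served order):
#include <semaphore.h>
#include <stdio.h>

sem_t turn; /* in main, before creating the threads: sem_init(&turn, 0, 1); */

void *myThread(void *args)
{
    sem_wait(&turn);                     /* block until no other thread is inside */
    size_t result = do123(*(int *)args); /* do123 from the question */
    printf("--->%zu\n", result);
    sem_post(&turn);                     /* let the next waiting thread in */
    return (void *)result;
}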
I suggest giving those a read to better understand how threading works. Good luck.
Notes on changes:
(1) You have to declare a function before you use it. You had the definition of do123 below the point where you call it, and the compiler reads your code from the top down, so it did not know about the function.
(2) You were losing precision by casting a void* to an int, as the sizes depend on your platform (32-bit, 64-bit, etc.). I changed them to size_t, which is unsigned and wide enough to hold the value on common platforms.
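As an aside, <stdint.h> has intptr_t, which is the integer type specifically intended for round-tripping a value through void*; a sketch of the same thread function using it:
#include <stdint.h>

void *myThread(void *args)
{
    intptr_t result = do123(*(int *)args); /* do123 from the question */
    return (void *)result;                 /* intptr_t fits in a pointer by definition */
}

/* and in main: res[i] = (int)(intptr_t)status; */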
According to its source, the rand() function in C uses a mutex to lock internal state (http://sourcecodebrowser.com/uclibc/0.9.27/rand_8c.html). So if I use multiple threads that call it, my program will be slow, because all the threads will contend for that lock.
So I found drand48(), another random number generator function, which does not take locks (http://sourcecodebrowser.com/uclibc/0.9.27/drand48_8c.html#af9329f9acef07ca14ea2256191c3ce74). But somehow, my parallel program is still slower than the serial one! The code is pasted below:
Serial version:
#include <cstdlib>
#define M 100000000
int main()
{
for (int i = 0; i < M; ++i)
drand48();
return 0;
}
Parallel version:
#include <pthread.h>
#include <cstdlib>
#define M 100000000
#define N 4
pthread_t threads[N];
void* f(void* p)
{
for (int i = 0; i < M/N; ++i)
drand48();
return NULL;
}
int main()
{
for (int i = 0; i < N; ++i)
pthread_create(&threads[i], NULL, f, NULL);
for (int i = 0; i < N; ++i)
pthread_join(threads[i], NULL);
return 0;
}
I ran both programs. The serial one runs in ~0.6 seconds and the parallel one in ~2.1 seconds.
Could anyone explain why this happens?
Some additional information: I have 4 cores on my PC. I compile the serial version using
g++ serial.cpp -o serial
and the parallel using
g++ parallel.cpp -lpthread -o parallel
Edit:
Apparently, this performance loss happens whenever I update a global variable in my threads. In the example below, the x variable is global (note that in the parallel example, the operation is not thread-safe):
Serial:
#include <cstdlib>
#define M 1000000000
int x = 0;
int main()
{
for (int i = 0; i < M; ++i)
x = x + 10 - 10;
return 0;
}
Parallel:
#include <pthread.h>
#include <cstdlib>
#define M 1000000000
#define N 4
pthread_t threads[N];
int x;
void* f(void* p)
{
for (int i = 0; i < M/N; ++i)
x = x + 10 - 10;
return NULL;
}
int main()
{
for (int i = 0; i < N; ++i)
pthread_create(&threads[i], NULL, f, NULL);
for (int i = 0; i < N; ++i)
pthread_join(threads[i], NULL);
return 0;
}
Note that drand48() uses the global struct variable __libc_drand48_data.
drand48() uses the global struct variable __libc_drand48_data; it keeps state there (writes to it), and that is therefore the source of cache-line contention, which is very likely the cause of the performance degradation. It isn't false sharing, as I initially suspected and wrote in the comments; it is bona fide sharing. The reason there is no locking in the implementation of drand48() is twofold:
drand48() is not required to be thread-safe "The drand48(), lrand48(), and mrand48() functions need not be thread-safe."
If two threads happen to access it at the same time, and their writes to memory are interleaved, there is no harm done: the data structure is not corrupted, and it is, after all, supposed to return pseudo-random data.
There are some subtle considerations (race conditions) in the use of drand48() when one thread is initializing the state, but they are considered harmless.
Notice below in __drand48_iterate how it stores to three 16-bit words in the global variable; this is where the random generator keeps its state, and this is the source of the cache-line contention between your threads:
xsubi[0] = result & 0xffff;
xsubi[1] = (result >> 16) & 0xffff;
xsubi[2] = (result >> 32) & 0xffff;
Source code
You provided the link to the drand48() source code, which I've included below for reference. The problem is cache-line contention when the state is updated.
#include <stdlib.h>
/* Global state for non-reentrant functions. Defined in drand48-iter.c. */
extern struct drand48_data __libc_drand48_data;
double drand48(void)
{
double result;
erand48_r (__libc_drand48_data.__x, &__libc_drand48_data, &result);
return result;
}
And here is the source for erand48_r
extern int __drand48_iterate(unsigned short xsubi[3], struct drand48_data *buffer);
int erand48_r (xsubi, buffer, result)
unsigned short int xsubi[3];
struct drand48_data *buffer;
double *result;
{
union ieee754_double temp;
/* Compute next state. */
if (__drand48_iterate (xsubi, buffer) < 0)
return -1;
/* Construct a positive double with the 48 random bits distributed over
its fractional part so the resulting FP number is [0.0,1.0). */
temp.ieee.negative = 0;
temp.ieee.exponent = IEEE754_DOUBLE_BIAS;
temp.ieee.mantissa0 = (xsubi[2] << 4) | (xsubi[1] >> 12);
temp.ieee.mantissa1 = ((xsubi[1] & 0xfff) << 20) | (xsubi[0] << 4);
/* Please note the lower 4 bits of mantissa1 are always 0. */
*result = temp.d - 1.0;
return 0;
}
And here is the implementation of __drand48_iterate, which is where it writes back to the global:
int
__drand48_iterate (unsigned short int xsubi[3], struct drand48_data *buffer)
{
uint64_t X;
uint64_t result;
/* Initialize buffer, if not yet done. */
if (unlikely(!buffer->__init))
{
buffer->__a = 0x5deece66dull;
buffer->__c = 0xb;
buffer->__init = 1;
}
/* Do the real work. We choose a data type which contains at least
48 bits. Because we compute the modulus it does not care how
many bits really are computed. */
X = (uint64_t) xsubi[2] << 32 | (uint32_t) xsubi[1] << 16 | xsubi[0];
result = X * buffer->__a + buffer->__c;
xsubi[0] = result & 0xffff;
xsubi[1] = (result >> 16) & 0xffff;
xsubi[2] = (result >> 32) & 0xffff;
return 0;
}
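The usual fix is to give each thread its own generator state so nothing mutable is shared. POSIX's erand48() takes the 48-bit state array as an argument, which makes that straightforward; a sketch along the lines of the question's parallel version (after the one-time initialization, the shared buffer holding the a and c constants is only read):
#include <pthread.h>
#include <stdlib.h>

#define M 100000000
#define N 4

pthread_t threads[N];

void *f(void *p)
{
    /* per-thread state lives on this thread's stack, so there is no
       cache-line ping-pong between threads on every call */
    unsigned short xsubi[3] = { 0, 0, (unsigned short)(long)p };
    for (int i = 0; i < M / N; ++i)
        erand48(xsubi);
    return NULL;
}

int main(void)
{
    for (long i = 0; i < N; ++i)
        pthread_create(&threads[i], NULL, f, (void *)i); /* index doubles as seed */
    for (int i = 0; i < N; ++i)
        pthread_join(threads[i], NULL);
    return 0;
}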
I am trying to create an array of size n (where n is the user's input), and when the user runs the program the array elements should be set to 1, each in a separate thread. Here is what I have done so far:
#include <windows.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <conio.h>
#include <process.h>
int *x;
DWORD WINAPI init_X(LPVOID param)
{
int index = *(int *) param;
x[index] = 1;
return 0;
}
int main(int argc, char *argv[])
{
int n = atoi(argv[1]);
int i; // counter.
HANDLE THandles[n];
x = malloc(n * sizeof (int));
for(i = 0; i < n; i++)
{
THandles[i] = CreateThread(NULL, 0, init_X, &i, 0, NULL);
}
// Now wait for threads to finish
WaitForMultipleObjects(n, THandles, TRUE, INFINITE);
// Close the thread handle
for(i = 0; i < n; i++)
{
CloseHandle(THandles[i]);
}
printf("After initialization x = ");
for(i = 0; i < n; i++)
{
printf("%d ", x[i]);
if(i < n - 1) printf(" ");
}
// ...
return 0;
}
I ran this program and got wrong output:
> Test.exe 3
After initialization x = 11611536 11600064 50397186
It should be After initialization x = 1 1 1, though. I am not sure how to fix this, but I am sure it's something related to the pointers.
P.S.: I'm a Java programmer, so I'm not familiar with pointers.
The value you are passing as your array index will more than likely be invalid by the time the thread runs, as there is no guarantee that the thread runs immediately after the call to CreateThread.
You have two solutions: either pass by value (simple & easy, but not always safe), or allocate a temporary buffer for the value that will be freed by the thread when it's used. A sketch of the buffer approach follows.
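DWORD WINAPI init_X(LPVOID param)
{
    int index = *(int *)param;
    free(param);          /* the thread owns the buffer and releases it */
    x[index] = 1;
    return 0;
}

/* in main's loop: */
int *arg = malloc(sizeof *arg);
*arg = i;
THandles[i] = CreateThread(NULL, 0, init_X, arg, 0, NULL);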
Minor Update:
In fact, a better way would be to pass &x[i], then you can just do *(int*)param = 1;
You are passing i by pointer to the thread, so the value each thread gets depends on when int index = *(int *) param; actually executes; it could be anything between 0 and n. You can just pass i by value (cast to a pointer) to avoid this; a sketch follows.
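A sketch of that pass-by-value idea, smuggling the index through the pointer argument (INT_PTR is a Windows integer type as wide as a pointer):
DWORD WINAPI init_X(LPVOID param)
{
    int index = (int)(INT_PTR)param; /* unpack the value; nothing is dereferenced */
    x[index] = 1;
    return 0;
}

/* in main's loop: */
THandles[i] = CreateThread(NULL, 0, init_X, (LPVOID)(INT_PTR)i, 0, NULL);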
So I was trying to make a GPGPU emulator with C & pthreads, but I ran into a rather strange problem and have no idea why it's occurring. The code is as below:
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <assert.h>
// simplifies malloc
#define MALLOC(a) (a *)malloc(sizeof(a))
// Index of x/y coordinate
#define x (0)
#define y (1)
// Defines size of a block
#define BLOCK_DIM_X (3)
#define BLOCK_DIM_Y (2)
// Defines size of the grid, i.e., how many blocks
#define GRID_DIM_X (5)
#define GRID_DIM_Y (7)
// Defines the number of threads in the grid
#define GRID_SIZE (BLOCK_DIM_X * BLOCK_DIM_Y * GRID_DIM_X * GRID_DIM_Y)
// execution environment for the kernel
typedef struct exec_env {
int threadIdx[2]; // thread location
int blockIdx[2];
int blockDim[2];
int gridDim[2];
float *A,*B; // parameters for the thread
float *C;
} exec_env;
// kernel
void *kernel(void *arg)
{
exec_env *env = (exec_env *) arg;
// compute number of threads in a block
int sz = env->blockDim[x] * env->blockDim[y];
// compute the index of the first thread in the block
int k = sz * (env->blockIdx[y]*env->gridDim[x] + env->blockIdx[x]);
// compute the index of a thread inside a block
k = k + env->threadIdx[y]*env->blockDim[x] + env->threadIdx[x];
// check whether it is in range
assert(k >= 0 && k < GRID_SIZE && "Wrong index computation");
// print coordinates in block and grid and computed index
/*printf("tx:%d ty:%d bx:%d by:%d idx:%d\n",env->threadIdx[x],
env->threadIdx[y],
env->blockIdx[x],
env->blockIdx[y], k);
*/
// retrieve two operands
float *A = &env->A[k];
float *B = &env->B[k];
printf("%f %f \n",*A, *B);
// retrieve pointer to result
float *C = &env->C[k];
// do actual computation here !!!
// For assignment replace the following line with
// the code to do matrix addition and multiplication.
*C = *A + *B;
// free execution environment (not needed anymore)
free(env);
return NULL;
}
// main function
int main(int argc, char **argv)
{
float A[GRID_SIZE] = {-1};
float B[GRID_SIZE] = {-1};
float C[GRID_SIZE] = {-1};
pthread_t threads[GRID_SIZE];
int i=0, bx, by, tx, ty;
//Error location
/*for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
}*/
// Step 1: create execution environment for threads and create thread
for (bx=0;bx<GRID_DIM_X;bx++) {
for (by=0;by<GRID_DIM_Y;by++) {
for (tx=0;tx<BLOCK_DIM_X;tx++) {
for (ty=0;ty<BLOCK_DIM_Y;ty++) {
exec_env *e = MALLOC(exec_env);
assert(e != NULL && "memory exhausted");
e->threadIdx[x]=tx;
e->threadIdx[y]=ty;
e->blockIdx[x]=bx;
e->blockIdx[y]=by;
e->blockDim[x]=BLOCK_DIM_X;
e->blockDim[y]=BLOCK_DIM_Y;
e->gridDim[x]=GRID_DIM_X;
e->gridDim[y]=GRID_DIM_Y;
// set parameters
e->A = A;
e->B = B;
e->C = C;
// create thread
pthread_create(&threads[i++],NULL,kernel,(void *)e);
}
}
}
}
// Step 2: wait for completion of all threads
for (i=0;i<GRID_SIZE;i++) {
pthread_join(threads[i], NULL);
}
// Step 3: print result
for (i=0;i<GRID_SIZE;i++) {
printf("%f ",C[i]);
}
printf("\n");
return 0;
}
Ok, this code runs fine, but as soon as I uncomment the "Error Location" for loop (which assigns A[i] = i and B[i] = i + 1), I get hit by a segmentation fault on Unix, and by random 0s within C on Cygwin. I must admit my fundamentals in C are pretty poor, so it is highly likely that I missed something. If someone can give an idea of what's going wrong, it'd be greatly appreciated. Thanks.
It works when you comment that loop out because i is still 0 when the 4 nested loops start; once the loop runs, i is left at GRID_SIZE.
You have this:
for (i = 0; i < GRID_SIZE;i++){
A[i] = i;
B[i] = i+1;
printf("%f %f\n ", A[i], B[i]);
}
/* What value is `i` now ? */
And then
pthread_create(&threads[i++],NULL,kernel,(void *)e);
^
So pthread_create will try to access some interesting indexes indeed.
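The fix is small: reset the counter (or use a dedicated one) before the thread-creation loops. Sketched against the question's code:
for (i = 0; i < GRID_SIZE; i++) {
    A[i] = i;
    B[i] = i + 1;
}
i = 0; /* the loop above left i at GRID_SIZE */
for (bx = 0; bx < GRID_DIM_X; bx++) {
    /* ... the four nested loops, unchanged, ending in ... */
    /* pthread_create(&threads[i++], NULL, kernel, (void *)e); */
}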