I have been working on this program that accomplishes this:
counts the number of occurrences of a specific integer value in a 2D array (matrix). Each position of the matrix must first be initialized to an integer value between 0 and
n. Once initialized, program will search and count the total number of occurrences of a specific value.
The program is run by taking in the parameters as command line arguments:
programName rows cols n c
rows – number of rows of the matrix
cols – number of columns of the matrix
n – the upper bound of the random values of the matrix, values can be 0–(n-1)
c – the value to search for in the matrix, note c must be between 0–(n-1)
After this, the program implements the search using 1 to 10 threads and displays the execution time and number of occurrences.
I seem to have all of this working how I wish, however the problem is that whenever I enter a value over 4 in the command line for rows, I keep getting the segment fault error.
I am at a loss as to what is causing this. Please help me understand what error in my coding may be contributing to this? Thank you.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
#define NUM_THREADS 10
int **arr;
int rows, cols, n, c, totalOccurrence, done, numThreads;
int threadCounter[10];
void *matrixThread (void *threadid)
{
long tid;
tid = (long)threadid;
long lowBound = tid * (rows / numThreads);
long highBound = lowBound + (rows / numThreads);
int localcount = 0;
if (tid == numThreads - 1)
{
highBound = rows;
}
long i;
int ic;
for (i = lowBound; i < highBound; i++)
{
for (ic = 0; ic < cols; ic++)
{
if (arr[i][ic] == c)
{
localcount++;
}
}
}
threadCounter[tid] = localcount;
pthread_exit(NULL);
}
int main (int argc, char *argv[])
{
pthread_t threads[NUM_THREADS];
if (argc != 5)
{
printf("Error: Invalid number of arguments\n");
}
else
{
rows = strtol(argv[1], NULL, 10);
cols = strtol(argv[2], NULL, 10);
n = strtol(argv[3], NULL, 10);
c = strtol(argv[4], NULL, 10);
int r, cl;
arr = (int**)malloc(rows * sizeof(int));
for (r = 0; r < rows; r++)
{
arr[r] = malloc(cols * sizeof(int));
}
int randomNum;
srand(time(NULL));
for (r = 0; r < rows; r++)
{
for (cl = 0; cl < cols; cl++)
{
randomNum = rand() % n;
arr[r][cl] = randomNum;
}
}
long rc, t;
for (numThreads = 1; numThreads <= 10; numThreads++)
{
struct timeval start,end;
double elapsed_time;
gettimeofday(&start, NULL);
for (t = 0; t < numThreads; t++)
{
rc = pthread_create(&threads[t], NULL, matrixThread, (void *)t);
if (rc)
{
printf ("Error: Thread could not be created; return %d", rc);
exit(-1);
}
}
for (t = 0; t < numThreads; t++)
{
pthread_join(threads[t], NULL);
}
totalOccurrence = 0;
int q;
for (q = 0; q < numThreads; q++)
{
totalOccurrence += threadCounter[q];
}
gettimeofday(&end, NULL);
elapsed_time = (end.tv_sec + end.tv_usec/1000000.10000) - (start.tv_sec + start.tv_usec/1000000.10000);
printf("\nNumber of threads: %d " , numThreads);
printf("Total Occurrences of %d: %d " ,c, totalOccurrence);
printf("Elapsed time: %.8f\n" , elapsed_time);
totalOccurrence = 0;
}
}
pthread_exit(NULL);
}
Here is one problem:
arr = (int**)malloc(rows * sizeof(int));
should be:
arr = (int**)malloc(rows * sizeof(int *));
The allocation of the rows should be like this
arr = (int**)malloc(rows * sizeof(int*));
Because the sizeof datatypes can vary. But the sizeof a pointer will be constant in a particular machine architecture. In a 64 bit machine the sizeof a pointer will be 8 bytes. But sizeof int will be usually 4 bytes (gcc). So here you will be having allocated only 4 blocks. That why when you try to pass more than 4, it's crashing because there's an invalid memory read.
Also your program will cause memory leak, as you are not freeing the allocated memory. Use like this at the end.
for (r = 0; r < rows; r++)
{
free (arr[r]);
}
free (arr);
Related
I am trying to sum up 1000 elements integer array(where each element is 1) with pthread library by splitting the array in to segments of size 10. So effectively, 100 threads are being used to do that. The results of this parallel operation is as expected (1000). But interestingly, the sequential sum which I calculated before creating the threads is being set to zero after my first call to pthread_join(). Not sure if I am missing something here. Can someone spot the bug here?
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define SEGMENT_SIZE 10
#define NUM_THREADS 100
int *array = NULL;
void* segment_sum(void *args)
{
int index = (int)args;
int sum = 0;
for (int i = index * SEGMENT_SIZE; i < (index + 1) * SEGMENT_SIZE; i++) {
sum += array[i];
}
return (void *)sum;
}
int main()
{
pthread_t thread[NUM_THREADS];
int res = 0;
int seq_res = 0;
int par_res = 0;
array = calloc(1, sizeof(int) * NUM_THREADS * SEGMENT_SIZE);
for (int i = 0; i < NUM_THREADS * SEGMENT_SIZE; i++) {
array[i] = 1;
seq_res += 1;
}
for (int i = 0; i < NUM_THREADS; i++) {
res = pthread_create(&thread[i], NULL, segment_sum, (void *)i);
if (res != 0) {
printf("\nError creating new thread");
}
}
printf("\nindex = %d", seq_res); // the sequential sum here is 1000
for (int i = 0; i < NUM_THREADS; i++) {
int sum = 0;
res = pthread_join(thread[i], (void **)&sum);
if (res != 0) {
printf("\nError creating new thread");
}
printf("\nindex = %d", seq_res); // Here it is becoming zero!!!
par_res += sum;
}
printf("\nmultithreaded sum: %d single threaded sum: %d\n", par_res, seq_res);
}
When you compile your program, try as much as possible to eliminate
the warnings as they often point out non portable behaviors or hidden
errors. Here the compilation points out the following:
pte.c: In function 'segment_sum':
pte.c:11:21: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
11 | int index = (int)args;
| ^
pte.c:18:16: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
18 | return (void *)sum;
| ^
pte.c: In function 'main':
pte.c:36:69: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
36 | res = pthread_create(&thread[i], NULL, segment_sum, (void *)i);
| ^
The parameter passed to the threads is a cast of a pointer into an "int". It is
advised to pass the address of an "int". Hence, you can define a per-thread
context:
struct thd_ctx {
pthread_t thread;
int index;
int sum;
};
pthread_join() is passed the address of a pointer which will get the address
of the memory location into which the thread stored its result. The thread must
return the address of this memory location, not the value stored into it.
Moreover, the thread should not return the address of an automatic variable
(i.e. in its stack) as it is unspecified. The result must be the address
of a global variable (or "something" visible from the joining thread) returned either directly or through pthread_exit(). In this enhancement of the program, we use the address of the "sum" field in the thread's context:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <errno.h>
#define SEGMENT_SIZE 10
#define NUM_THREADS 100
int *array = NULL;
struct thd_ctx {
pthread_t thread;
int index;
int sum;
};
void *segment_sum(void *args)
{
int i;
struct thd_ctx *ctx = (struct thd_ctx *)args;
ctx->sum = 0;
for (i = ctx->index * SEGMENT_SIZE; i < (ctx->index + 1) * SEGMENT_SIZE; i++) {
ctx->sum += array[i];
}
return (void *)&(ctx->sum);
}
int main(void)
{
struct thd_ctx thd_ctx[NUM_THREADS];
int res = 0;
int seq_res = 0;
int par_res = 0;
int i;
array = calloc(1, sizeof(int) * NUM_THREADS * SEGMENT_SIZE);
if (!array) {
fprintf(stderr, "calloc(): error %d\n", errno);
return 1;
}
for (i = 0; i < NUM_THREADS * SEGMENT_SIZE; i++) {
array[i] = 1;
seq_res += 1;
}
for (i = 0; i < NUM_THREADS; i++) {
thd_ctx[i].index = i;
res = pthread_create(&(thd_ctx[i].thread), NULL, segment_sum, (void *)&(thd_ctx[i]));
if (res != 0) {
fprintf(stderr, "Error %d creating new thread#%d\n", res, i);
free(array);
return 1;
}
}
printf("Index = %d\n", seq_res); // the sequential sum here is 1000
for (i = 0; i < NUM_THREADS; i++) {
int *sum = 0;
res = pthread_join(thd_ctx[i].thread, (void **)&(sum));
if (res != 0) {
printf("Error %d joining thread#%d", res, i);
free(array);
return 1;
}
par_res += *sum;
printf("sum = %d\n", par_res);
}
printf("\nMultithreaded sum: %d single threaded sum: %d\n", par_res, seq_res);
free(array);
return 0;
}
I want to read as input a table A and B from a user , and make an inner product space from them (a1b1+a2b2+……+anbn) and save it in a local_sum and then share it to an total_sum variable. I am doing the bellow code , but there is a segment fault. For some reason table A & B can't pass to function MUL. Any help would be great, thank you!
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#define N 2
int p;
int A[N],B[N];
int local_sum;
void *mul(void *arg)
{
int lines, start, end, i, j;
int id = *(int*)arg;
lines = N / p;
start = id * lines;
end = start + lines;
for (i = start; i < end; i++)
local_sum = A[i] * B[i] + local_sum;
return NULL;
}
int main (int argc, char *argv[])
{
int i;
pthread_t *tid;
if (argc != 2)
{
printf("Provide number of threads.\n");
exit(1);
}
p = atoi(argv[1]);
tid = (pthread_t *)malloc(p * sizeof(pthread_t));
if (tid == NULL)
{
printf("Could not allocate memory.\n");
exit(1);
}
printf("Give Table A\n");
for (int i = 0; i < N; i++)
{
scanf("%d", &A[i]);
}
printf("Give Table B\n");
for (int i = 0; i < N; i++)
{
scanf("%d", &B[i]);
}
for (i = 0; i < p; i++)
{
int *a;
a = malloc(sizeof(int));
*a = 0;
pthread_create(&tid[i], NULL, mul, a);
}
for (i = 0; i < p; i++)
pthread_join(tid[i], NULL);
printf("%d", local_sum);
return 0;
}
Let's see:
You want to have p threads, working on the vectors A and B.
You must be aware of that threads share the same memory, and might be interrupted at any time.
You've got p threads, all trying to write to one shared variable local_sum. This leads to unpredictable results since one thread overwrites the value another thread has written there before.
You can bypass this problem by ensuring exclusive access of one single thread to this variable by using a mutex or the like, or you could have one variable per thread, have each thread produce an intermediate result and after joining all threads, collapse all your intermediate results into the final one.
To do this, your main should look something like (assuming your compiler supports a recent C standard):
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#define N 2
/* these are variables shared amongst all threads */
int p;
int A[N], B[N];
/* array with one slot per thread to receive the partial result of each thread */
int* partial_sum;
/* prototype of thread function, just to be independent of the place mul will be placed in the source file... */
void *mul(void *arg);
int main (int argc, char** argv)
{
pthread_t* tid;
p = atoi(argv[1]);
const size_t n_by_p = N/p;
if(n_by_p * p != N)
{
fprintf(stderr, "Number of threads must be an integral factor of N\n");
exit(EXIT_FAILURE) ;
}
tid = calloc(p, sizeof(pthread_t));
partial_sum = calloc(p, sizeof(int)) ;
printf("Give Table A\n");
for(size_t i = 0; i < N; ++i)
{
scanf("%d",&A[i]);
}
printf("Give Table B\n");
for(size_t i = 0; i < N; ++i)
{
scanf("%d",&B[i]);
}
for (size_t i =0; i < p; ++i)
{
/* clumsy way to pass a thread it's slot number, but works as a starter... */
int *a;
a = malloc(sizeof(int));
*a = i;
pthread_create(&tid[i], 0, mul, a);
}
for (size_t i = 0; i < p; ++i)
{
pthread_join(tid[i], 0);
}
free(tid);
tid = 0;
int total_sum = 0;
for (size_t i = 0; i < p; ++i)
{
total_sum += partial_sum[i] ;
}
free(partial_sum);
partial_sum = 0;
printf("%d",total_sum);
return EXIT_SUCCESS;
}
Your threaded method mul should now write to its particular partial_sum slot only :
void *mul(void *arg)
{
int slot_num = *(int*)arg;
free(arg);
arg = 0;
const size_t lines = N/p;
const size_t start = slot_num * lines;
const size_t end = start + lines;
partial_sum[slot_num] = 0;
for(size_t i = start; i < end; ++i)
{
partial_sum[slot_num] += A[i]*B[i];
}
return 0;
}
Beware: This code runs smoothly, only if N is some integral multiple of p.
If this condition is not met, due to truncation in N/p, not all elements of the vectors will be processed.
However, fixing these cases is not the core of this question IMHO.
I spared all kinds of error-checking, which you should add, should this code become part of some operational setup...
if (tid=NULL)
-->
if (tid==NULL)
and
for (i=start;i<end;i++)
I suppose we need
for (i=0;i<end-start;i++)
am brand new to C. I have a little program which is intended to solve find the dot product of a large 2d matrix with itself using pthread. Now when the function assigned to the pthread is called and the struct passed as a variable is accessed, the program breaks and stop working. I don't really know what am doing wrong. Here is the code:
This is the main function.
int main()
{
int rc;
int threadcount = 1;
char filename[100];
//patch to enable console printing in eclipse
setvbuf(stdout, NULL, _IONBF, 0);
do {
prompt_for_fileName(filename);
if (filename[0] == 'Q' || filename[0] == 'q') {
puts("Program ended");
return 0;
}
//read thread count
read_threadcount(&threadcount);
//initialize matrices
matrix_def matrix = initialize_matrix(filename);
//get the dimension of sub-matrices
int dfRow = (int) floor(matrix.NROWS / threadcount);
pthread_t threads[threadcount];
pthread_arg pthreadargs[threadcount];
for (int i = 0; i < threadcount; i++) {
int startRow = i * dfRow;
int endRow =
((i + 1) == threadcount) ?
matrix.NROWS : (startRow + dfRow) - 1; //we're subtracting one because its zero based.
//create a structure that we'll passed to the array.
pthread_arg arg = { matrix.NROWS, matrix.NCOLS, startRow, endRow,
0.0, NULL, NULL };
arg.data = matrix.data;
arg.result_set = create_result_memory(matrix.NCOLS);
fprintf(stderr, "before %p\n", arg.result_set);
//push arg into array.
pthreadargs[i] = arg;
rc = pthread_create(&threads[i], NULL, compute_dot_product,
(void *) &arg);
if (rc) {
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
}
/* Last thing that main() should do */
pthread_exit(NULL);
puts("Completed processing.");
double totalTime = 0.0;
for (int z = 0; z < threadcount; z++) {
pthread_arg ar = pthreadargs[z];
printf("Thread %d took %g to process %d rows and %d columns.\n", z,
ar.execution_time, ar.endz - ar.start, ar.col);
totalTime += ar.execution_time;
}
printf(
"It took the total time of %g, to compute the dot product of the matrices.\n",
totalTime);
//free memory
free(matrix.data);
for (int k = 0; k < threadcount; k++) {
free(pthreadargs[k].data);
free(pthreadargs[k].result_set);
}
} while (filename[0] != 'Q' || filename[0] != 'q');
}
This is the function being called by the pthread
void * compute_dot_product(void * inputArgs) {
double startTime, endTime;
pthread_arg * args = inputArgs;
/*Compute the dimension of the result matrix*/
int col, row, start, endz;
col = args->col;
start = args->start;
endz = args->endz;
row = endz - start;
fprintf(stderr, "after %p\n", args->result_set);
//create a pointer to the two array
double **arr1 = args->data;
double **arr2 = arr1;
//begin the computation
int x;
startTime = seconds();
//calculate the dot product the two matrices.
for (x = 0; x < col; x++) {
double colProduct = 0.0;
for (int y = start; y < endz; y++) {
colProduct += arr1[y][x] * arr2[y][x];
}
//The code breaks here.
args->result_set[x] = colProduct;
}
endTime = seconds();
double diff = endTime - startTime;
args->execution_time = diff;
return (void *) 4;
}
This is my struct definitions
typedef struct
{
int NROWS; /*for m rows*/
int NCOLS; /*for n columns*/
double ** data;
} matrix_def;
typedef struct
{
double execution_time;
matrix_def matrix;
} compute_result;
typedef struct{
int row;
int col;
int start;
int endz;
double execution_time;
double **data;
double *result_set;
} pthread_arg;
Memory allocation of the 2D matrix.
/*dynamically allocate array based on the read size*/
matrix.data = (double **) malloc(sizeof(double *) * M);
if(matrix.data != NULL){
int x;
for(x = 0; x < M; x++){
matrix.data[x] = (double) malloc(sizeof(double) * N);
}
}else{
fprintf(stderr, "Unable to allocate memory\n");
exit(1);
}
Initialize Matrix function
matrix_def initialize_matrix(char *argv)
{
int ret_code;
MM_typecode matcode;
FILE *f;
int M, N, nz;
int i;
matrix_def matrix;
if((f = fopen(argv, "r")) == NULL)
{
fprintf(stderr, "Reading file: '%s' failed", argv);
exit(1);
}
/*Read matrix banner*/
if(mm_read_banner(f, &matcode) != 0)
{
printf("Could not process Matrix Market banner. \n");
exit(1);
}
/*Check if the current matrix is supported.*/
if(mm_is_complex(matcode) && mm_is_matrix(matcode) && mm_is_sparse(matcode))
{
printf("Sorry, this application does not support ");
printf("Market Matrix type: [%s]\n", mm_typecode_to_str(matcode));
exit(1);
}
/*find out size of the sparse matrix...*/
if((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) != 0)
exit(1);
/*Assign m, n sizes.*/
matrix.NROWS = M;
matrix.NCOLS = N;
/*dynamically allocate array based on the read size*/
matrix.data = (double **) malloc(sizeof(double *) * M);
if(matrix.data != NULL){
int x;
for(x = 0; x < M; x++){
matrix.data[x] = (double *) malloc(sizeof(double) * N);
}
}else{
fprintf(stderr, "Unable to allocate memory\n");
exit(1);
}
/*Iterate through the created memory location and fill it with zeros*/
int a, b;
for(a = 0; a < M; a++){
for(b = 0; b < N; b++){
matrix.data[a][b] = 0;
}
}
/*Read the matrix*/
for(i = 0; i < nz; i++)
{
int I = 0, J = 0;
double val = 0;
fscanf(f, "%d %d %lg\n", &I, &J, &val);
//since the matrix market file starts off at
//1,1 we have to subtract 1 from the index
//to account for the array which starts off at
// 0,0
matrix.data[--I][--J] = val;
}
if(f != stdin)
fclose(f);
return matrix;
}
Any help will be appreciated, as am not very sure with the formula. Thanks
arg goes out of scope before the pthread is executed. Change your call to
rc = pthread_create(&threads[i], NULL, compute_dot_product, (void *) &pthreadargs[i]);
You will also need pthread_join before you exit, just in case the threads have not finished.
Edit
1) Replace your pthread_exit with
int rv;
for (i = 0; i < threadcount; ++i)
pthread_join(thread[i], &rv);
You would normally call pthread_exit inside a thread (like compute_dot_product) as an abnormal exit. This is a possible reason for your program breaking.
2) On your exit, I don't know how you have allocated your memory but this is a potential area where your code might be broken. If you have allocated your memory as
matrix.data = malloc(sizeof(double*) * matrix.NROWS);
matrix.data[0] = malloc(sizeof(double) * matrix.NROWS * matrix.NCOLS);
for (i = 1; i < matrix.NROWS; ++i)
matrix.data[i] = matrix.data[i - 1] + matrix.NCOLS;
Then you should free as
free(matrix.data[0]);
free(matrix.data);
If you have allocated each row individually, then free all the rows before freeing matrix.data.
3) Since matrix.data has been freed, pthreadargs[k].data should not be freed as it is pointing to the same area of memory that has already been freed.
The arg object defined here:
pthread_arg arg = { matrix.NROWS, matrix.NCOLS, startRow, endRow,
0.0, NULL, NULL };
goes out of scope while thread is still running. You need to prevent this from happening somehow, for example by allocating it on the heap instead.
I'm writing implementation of Sieve of Eratosthenes (https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes) on GPU. But no sth like this - http://developer-resource.blogspot.com/2008/07/cuda-sieve-of-eratosthenes.html
Method:
Creating n-element array with default values 0/1 (0 - prime, 1 - no) and passing it on GPU (I know that it can be done directly in kernel but it's not problem in this moment).
Each thread in block checks multiples of a single number. Each block checks in total sqrt(n) possibilities. Each block == different interval.
Marking multiples as 1 and passing data back to the host.
Code:
#include <stdio.h>
#include <stdlib.h>
#define THREADS 1024
__global__ void kernel(int *global, int threads) {
extern __shared__ int cache[];
int tid = threadIdx.x + 1;
int offset = blockIdx.x * blockDim.x;
int number = offset + tid;
cache[tid - 1] = global[number];
__syncthreads();
int start = offset + 1;
int end = offset + threads;
for (int i = start; i <= end; i++) {
if ((i != tid) && (tid != 1) && (i % tid == 0)) {
cache[i - offset - 1] = 1;
}
}
__syncthreads();
global[number] = cache[tid - 1];
}
int main(int argc, char *argv[]) {
int *array, *dev_array;
int n = atol(argv[1]);
int n_sqrt = floor(sqrt((double)n));
size_t array_size = n * sizeof(int);
array = (int*) malloc(n * sizeof(int));
array[0] = 1;
array[1] = 1;
for (int i = 2; i < n; i++) {
array[i] = 0;
}
cudaMalloc((void**)&dev_array, array_size);
cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice);
int threads = min(n_sqrt, THREADS);
int blocks = n / threads;
int shared = threads * sizeof(int);
kernel<<<blocks, threads, shared>>>(dev_array, threads);
cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost);
int count = 0;
for (int i = 0; i < n; i++) {
if (array[i] == 0) {
count++;
}
}
printf("Count: %d\n", count);
return 0;
}
Run:
./sieve 10240000
It works correctly when n = 16, 64, 1024, 102400... but for n = 10240000 I getting incorrect result. Where is problem?
This code has a variety of problems, in my view.
You are fundamentally accessing items out of range. Consider this sequence in your kernel:
int tid = threadIdx.x + 1;
int offset = blockIdx.x * blockDim.x;
int number = offset + tid;
cache[tid - 1] = global[number];
You (in some cases -- see below) have launched a thread array exactly equal in size to your global array. So what happens when the highest numbered thread runs the above code? number = threadIdx.x+1+blockIdx.x*blockDim.x. This number index will be one beyond the end of your array. This is true for many possible values of n. This problem would have been evident to you if you had either used proper cuda error checking or had run your code with cuda-memcheck. You should always do those things when you are having trouble with a CUDA code and also before asking for help from others.
The code only has a chance of working correctly if the input n is a perfect square. The reason for this is contained in these lines of code (as well as dependencies in the kernel):
int n = atol(argv[1]);
int n_sqrt = floor(sqrt((double)n));
...
int threads = min(n_sqrt, THREADS);
int blocks = n / threads;
(note that the correct function here would be atoi not atol, but I digress...) Unless n is a perfect square, the resultant n_sqrt will be somewhat less than the actual square root of n. This will lead you to compute a total thread array that is smaller than the necessary size. (It's OK if you don't believe me at this point. Run the code I will post below and input a size like 1025, then see if the number of threads * blocks is of sufficient size to cover an array of 1025.)
As you've stated:
Each block checks in total sqrt(n) possibilities.
Hopefully this also points out the danger of non-perfect square n, but we must now ask "what if n is larger than the square of the largest threadblock size (1024)? The answer is that the code will not work correctly in many cases - and your chosen input of 10240000, although a perfect square, exceeds 1024^2 (1048576) and it does not work for this reason. Your algorithm (which I claim is not a Sieve of Eratosthenes) requires that each block be able to check sqrt(n) possibilities, just as you stated in the question. When that no longer becomes possible because of the limits of threads per block, then your algorithm starts to break.
Here is a code that makes some attempt to fix issue #1 above, and at least give an explanation for the failures associated with #2 and #3:
#include <stdio.h>
#include <stdlib.h>
#define THREADS 1024
#define MAX 10240000
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void kernel(int *global, int threads) {
extern __shared__ int cache[];
int tid = threadIdx.x + 1;
int offset = blockIdx.x * blockDim.x;
int number = offset + tid;
if ((blockIdx.x != (gridDim.x-1)) || (threadIdx.x != (blockDim.x-1))){
cache[tid - 1] = global[number];
__syncthreads();
int start = offset + 1;
int end = offset + threads;
for (int i = start; i <= end; i++) {
if ((i != tid) && (tid != 1) && (i % tid == 0)) {
cache[i - offset - 1] = 1;
}
}
__syncthreads();
global[number] = cache[tid - 1];}
}
int cpu_sieve(int n){
int limit = floor(sqrt(n));
int *test_arr = (int *)malloc(n*sizeof(int));
if (test_arr == NULL) return -1;
memset(test_arr, 0, n*sizeof(int));
for (int i = 2; i < limit; i++)
if (!test_arr[i]){
int j = i*i;
while (j <= n){
test_arr[j] = 1;
j += i;}}
int count = 0;
for (int i = 2; i < n; i++)
if (!test_arr[i]) count++;
return count;
}
int main(int argc, char *argv[]) {
int *array, *dev_array;
if (argc != 2) {printf("must supply n as command line parameter\n"); return 1;}
int n = atoi(argv[1]);
if ((n < 1) || (n > MAX)) {printf("n out of range %d\n", n); return 1;}
int n_sqrt = floor(sqrt((double)n));
size_t array_size = n * sizeof(int);
array = (int*) malloc(n * sizeof(int));
array[0] = 1;
array[1] = 1;
for (int i = 2; i < n; i++) {
array[i] = 0;
}
cudaMalloc((void**)&dev_array, array_size);
cudaMemcpy(dev_array, array, array_size, cudaMemcpyHostToDevice);
int threads = min(n_sqrt, THREADS);
int blocks = n / threads;
int shared = threads * sizeof(int);
printf("threads = %d, blocks = %d\n", threads, blocks);
kernel<<<blocks, threads, shared>>>(dev_array, threads);
cudaMemcpy(array, dev_array, array_size, cudaMemcpyDeviceToHost);
cudaCheckErrors("some error");
int count = 0;
for (int i = 0; i < n; i++) {
if (array[i] == 0) {
count++;
}
}
printf("Count: %d\n", count);
printf("CPU Sieve: %d\n", cpu_sieve(n));
return 0;
}
There are a couple of issues, I think, but here's a pointer to the actual problem: The sieve of Eratosthenes removes iteratively multiples of already encountered prime numbers, and you want to separate the work-load into thread-blocks, where each thread-block operates on a piece of shared memory (cache, in your example). Thread-blocks, however, are generally independent from all other thread-blocks and cannot easily communicate with one another. One example to illustrate the problem: The thread with index 0 in thread-block with index 0 removes multiples of 2. Thread blocks with index > 0 have no way to know about this.
What the following code trying to accomplish is just to compute the Matrix Multiplication of A and B to get matrix C. It uses nXn threads to compute each entry of C independently. So the code works on Cygwin, but not on linux. I keep getting segment default with the Pthread_join calls.
#define _REENTRANT // Make sure the library functions are MT (muti-thread) safe
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#define BUFFER_SIZE 512
// Declare a structure data type that will be used to pass arguments to the worker threads
typedef struct args_for_thread_t{
int *rowA;
int rowIdx;
int *columnB;
int columnIdx;
int **matrixC;
} ARGS_FOR_THREAD;
/* Global Variables */
int numRows,numColumns;
/*Function Prototype*/
void *computeC(void *this_arg);
void printMatrix(int** matrix,int numRows,int numColumns);
int main(void){
const char filename[] = "input_data.txt";
FILE *file = fopen(filename,"r");
char *delims = " ";
int **matrixA,**matrixB,**matrixC;
int flagB = 0; //Indicate wether the program should process matrixB
int i,j;
if (file){
char line[BUFFER_SIZE];
int rowIdx = 0;
while (fgets(line,sizeof(line), file)){
char substr[BUFFER_SIZE], *result;
//fputs(line,stdout);
result = strtok(line, delims);
int columnIdx = 0;
//Once we reach a line break, we start the processing of matrix B
if (!strcmp(line,"\n")){
flagB = 1;
rowIdx = 0; //Reset the rowIdx
continue; //Skip the new line, and start to read data into matrix B
}
while (result != NULL){
if (!strcmp(result,"ROWS")){ //To retrieve the number of rows
result = strtok(NULL,delims);
numRows = atoi(result);
matrixA = (int **) malloc(numRows*sizeof(int*));
matrixB = (int **) malloc(numRows*sizeof(int*));
matrixC = (int **) malloc(numRows*sizeof(int*));
rowIdx = -1;
result = strtok(NULL,delims);
}
else if (!strcmp(result,"COLUMNS")){//To retrieve the number of Columns
result = strtok(NULL,delims);
numColumns = atoi(result);
for (i=0;i<numRows;i++){ //Malloc the columns
matrixA[i] = (int *) malloc(numColumns*sizeof(int));
matrixB[i] = (int *) malloc(numColumns*sizeof(int));
matrixC[i] = (int *) malloc(numColumns*sizeof(int));
}
rowIdx = -1;
result = strtok(NULL,delims);
}
else if (!flagB){ //Processing Matrix A
matrixA[rowIdx][columnIdx] = atoi(result);
columnIdx++;
result = strtok(NULL,delims);
}
else if (flagB){ //Processing Matrix B
matrixB[rowIdx][columnIdx] = atoi(result);
columnIdx++;
result = strtok(NULL,delims);
}
}
rowIdx++;
}
}
else{
printf("No Such File exists!\n");
}
//At this point, matrix A and matrix B are both ready for computation. We will start to compute the product of the two matrices
int num_threads = numRows*numColumns; //The toal number of worker threads
pthread_t *worker_thread = (pthread_t *) malloc(sizeof(pthread_t)*num_threads);
ARGS_FOR_THREAD *args_for_thread;
for(i = 0; i < numRows; i++){
for(j = 0; j < numColumns; j++){
args_for_thread = (ARGS_FOR_THREAD *)malloc(sizeof(ARGS_FOR_THREAD)); // Allocate memory for the structure that will be used to pack the arguments
args_for_thread-> rowA = matrixA[i];
//We need to allocate the corresponding column in B for multiplication
int k;
args_for_thread->columnB =(int *) malloc(sizeof(int)*numRows);
for (k=0;k<numRows;k++){
args_for_thread-> columnB[k] = matrixB[k][j];
}
//rowIdx and columnIdx gives the corresponding entry for matrix C
args_for_thread-> rowIdx = i;
args_for_thread-> columnIdx = j;
args_for_thread-> matrixC = matrixC;
if((pthread_create(&worker_thread[i], NULL, computeC, (void *)args_for_thread)) != 0){
printf("Cannot create thread \n");
exit(0);
}
}
}
// Wait for all the worker threads to finish
for(i = 0; i < num_threads; i++)
pthread_join(worker_thread[i], NULL);
//Print out the Final Matrix C
printMatrix(matrixC,numRows,numColumns);
//Clean up pointers
for(i = 0; i < numRows; i++){
free(matrixA[i]);
free(matrixB[i]);
free(matrixC[i]);
}
free(matrixA);
free(matrixB);
free(matrixC);
}
void printMatrix(int** matrix,int numRows, int numColumns){
int i,j;
for (i=0;i<numRows;i++){
for (j=0;j<numColumns;j++){
printf("%d ",matrix[i][j]);
if (j==numColumns-1){
printf("\n");
}
}
}
}
/* Function that will be executed by all the worker threads. It will compute the i,j entry for column C */
void *computeC(void *this_arg){
ARGS_FOR_THREAD *arg = (ARGS_FOR_THREAD *) this_arg;
int rowIdx = arg->rowIdx;
int columnIdx = arg->columnIdx;
int *rowA = arg->rowA;
int *columnB = arg->columnB;
int **matrixC = arg->matrixC;
int i;
int sum = 0;
for (i=0;i<numRows;i++){ //Compute entry for matrix C. Since A,B are nxn square matrix, we can use either numRows or numColumns as the size
sum += rowA[i]*columnB[i];
}
matrixC[rowIdx][columnIdx] = sum;
free((void *) arg); // Free up the structure
pthread_exit(NULL);
}
What is the issue here? Thank you.
Here:
pthread_create(&worker_thread[i] ...
You create i * j threads, yet you only provide worker_threads[i] hence your program keeps using the same pthread_t variables. It later fails when you try to join the threads with undefined pthread_t values.
Replace by:
pthread_create(&worker_thread[i*numColumns+j] ...