Related
I'm working on a personal project to fill an array with random numbers, split into a number of user defined segments using pthread (POSIX), search for a target in each segment, and return the number of times the target was found. I'm having bugs and issues. For more than one thread, issues like the target not being held in the struct member and a thread not being created and other things happen. I'm sure my logic is off and my code and it's output reflect this, but I'm stumped. How would you split an array into threads? What logic am I messing up?
HEADER FILE...
#ifndef COUNT_ARRAY_H
#define COUNT_ARRAY_H
// structure declarations
typedef struct
{
int threadNum;
int *array;
int first;
int last;
int target;
int numFound;
} ThreadInfo;
// function prototypes
void* ThreadFunc(void *vptr);
#endif // COUNT_ARRAY_H
MAIN FILE....
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include "count_array.h"
int main(void)
{
int numSegs;
int numSegElems;
int maxRand;
int target;
int totalElems;
int totalFound = 0;
ThreadInfo *infoPtr;
pthread_t *threadHandles;
int index = 0;
int first;
int last;
int threadNum = 0;
//get primary info from user...
printf(" Please enter the total number of elements? ");
scanf("%d", &totalElems);
printf(" Please enter the maximum random value: ");
scanf("%d", &maxRand);
printf(" Please enter the number of segments (1 to 15857): ");
scanf("%d", &numSegs);
if(numSegs > 15857)
{
puts(" Too many segments for machine!");
exit(EXIT_FAILURE);
}
numSegElems = totalElems/numSegs;
// configure the array to work with
// declare array here...
int myArray[totalElems];
//and fill array here
for(; index < totalElems; index++)
{
// % rand() and maxRand to get good range and
//not go beyond users max number
myArray[index] = (rand() % maxRand);
//test printf...ignore if still here
printf(" %d \n", myArray[index]);
}
// get the target value to look for
printf(" Please enter the target value: ");
scanf("%d",&target);
// display initial information
printf("*** Begin search: target = %d, # elements = %d, # segments = %d, "
"# segment elements = %d\n"
, target
, totalElems
, numSegs
, numSegElems);
// initialize the array first/last indexes into the integer array
if(numSegs == 1)
{
first = totalElems;
last = 0;
}
else
{
first = totalElems - numSegElems;
last = (first - numSegElems);
}
// allocate an array to store the thread handles
int size; //size of segment
if(numSegs > 1)
{
size = numSegElems;
}
else
{
size = totalElems;
}
//test printf...please ignore if still here
//printf(" size %d \n", size);
int segA[size];//not sure if I need this
// loop and create threads (# of segments)
index = 0;
for(; threadNum < numSegs; index++)
{
// allocate a thread info structure from the heap
threadHandles = calloc(totalElems, sizeof(pthread_t));
infoPtr = calloc(totalElems, sizeof(ThreadInfo));
// store the information in the allocated structure
infoPtr[index].threadNum = threadNum;
infoPtr->target = target;
infoPtr->first = first;
infoPtr->last = last;
infoPtr->array = myArray;
// create the secondary thread, passing the thread info
pthread_create(&threadHandles[index], NULL, ThreadFunc, &infoPtr[index]);
// update the first/last array indexes for the next thread
first = last;
last = first-numSegs;
++threadNum;
}
// loop and join the threads to fetch each thread's results
for(index = 0; index < numSegs; index++)
{
// join with the next thread
pthread_join(threadHandles[index], NULL);
// get the total number of matches from the thread's infoPtr
// and display a message
printf(" *** pthread_join returned: threadNum = %d, numFound = %d\n",
infoPtr[index].threadNum, infoPtr->numFound);
}
// release the infoPtr structure back to the heap
free(infoPtr);
// display the final results
// release heap memory
free(threadHandles);
return 0;
} // end of "main"
void* ThreadFunc(void *vptr)
{
//declare and set vars
ThreadInfo *ptr = vptr;
ptr->numFound = 0;
int index = ptr->first-1;
//test printf...ignore if still here
printf(" Targ %d \n", ptr->target);
//start search
for(; index >= ptr->last; --index)
{
printf(" %d \n", ptr->array[index]);
//if target found
if(ptr->target == ptr->array[index])
{
puts(" Target found! ");
//increment numFound
++ptr->numFound;
}
}
//drop out and display message
}
You've got multiple errors in your allocation of threadHandles and infoPtr. First, you don't really want to be allocating totalElems of them -- you only need numSegs. Second, and more crucially, you're reallocating them and changing the values of the pointers infoPtr and threadHandles every time through the thread invocation loop. Third, you've mixed treating infoPtr as an array of ThreadInfo structures here:
infoPtr[index].threadNum = threadNum;
with treating it as a pointer to a changing ThreadInfo structure here:
infoPtr->target = target;
infoPtr->first = first;
infoPtr->last = last;
infoPtr->array = myArray;
so every time through the loop, you're setting these parameters on the first thread only.
To fix this, edit and move the allocations before the loop and treat infoPtr consistently as an array:
threadHandles = calloc(numSegs, sizeof(pthread_t));
infoPtr = calloc(numSegs, sizeof(ThreadInfo));
for(; threadNum < numSegs; index++)
{
infoPtr[index].threadNum = threadNum;
infoPtr[index].target = target;
infoPtr[index].first = first;
infoPtr[index].last = last;
infoPtr[index].array = myArray;
and also fix up the second use of infoPtr in this printf a little further down:
printf(" *** pthread_join returned: threadNum = %d, numFound = %d\n",
infoPtr[index].threadNum, infoPtr[index].numFound);
and things will work a little better.
There are still more bugs lingering in your setting of first and last. I suggest you print out their values and make sure they are coming out the way you intend. I was able to get them to become negative (and start searching random memory) pretty easily.
I have written a program in C to count all the word occurrences of each word in a file and sort them to display the most occurring words to the least occurring words. However, I need to use pthread to create multiple threads, depending on the number entered as an argument in the command line. The file needs to be split up into the number of threads entered. For example, say 4 was entered in the command line as an argument. The file would then need to be split up into four parts with each part using a new thread. Then the four parts would need to be joined back together. My C is not very good and I am lost on how to do this. Can anyone please help with this? An example would be great.
Here is my code so far:
int main(int argc, char **argv) {
struct stat fileStat;
FILE *out;
char *address;
int size, res, file, num_threads;
list_t *words = (list_t *)malloc(sizeof(list_t));
res = access(argv[1], F_OK);
if (result != 0) {
exit(1);
}
stat(argv[1], &fileStat);
// Check if a file.
if (S_ISREG(fileStat.st_mode)) {
file = open(argv[1], O_RDONLY);
if (file < 0)
exit(1);
// Check the total size of the file
size = fileStat.st_size;
num_threads = atoi(argv[2]);
if ((addr = mmap(0, size, PROT_READ, MAP_SHARED , file, 0)) == (void *) -1) {
exit(1);
}
munmap(addr, size);
close(file);
} else {
exit(1);
}
Multiple threads can safely read a source file without issue. Writing is when you have problems.
My suggestion (without really understanding the requirements) is:
On launch determine the file size
calculate value size/threadcount
lets say the file is 4k in size, you get a value of about 1k per thread
seek into the file 1 chunk size, read single bytes until you find a word separator
This position is the end of thread 1's zone and the start of thread 2
Seek to the second and 3rd chunk sizes and do the same
At this point you have a file start and finish position for each thread
Launch each thread and pass them the positions they will be responsible for covering
Make the hashtable (or whatever method for counting words you are using) thread safe with mutual exclusion techniques and just have each thread add to the count of whatever word is found
Once all threads are done you have your list
The idea here is that dividing the work in multiple threads and joining the parts afterwards, it is much faster to do the same operation. So you need to:
Divide the work in many parts without wasting too much time
Discover a way of joining back the work easily
Solve the problem in the boundaries caused by dividing the work
The first part is simple. Just divide your data equally between threads.
The second part is easy too. Just sum the results.
The tricky part is part number 3. In your case, you could end up with a word divided between two different threads. So to avoid counting "half-words" you must maintain an separate record for the first/last word of every thread. Then, when you have all your results, you can get the last word of thread N and concatenate it with the first word of thread N+1 and only then add the word to the count. Obviusly if an separator(space, enter, ...) is the first/last char found by a thread, your respective first/last word will be empty.
In pseudo-code:
def main:
size = filesize
ptr = mmap(file)
num_threads = 4
for i in range(1, num_threads):
new_thread(exec = count_words,
start = ptr + i * size / num_threads,
length = size / num_threads)
wait_for_result(all_threads)
join_the_results
def count_words(start, length):
# Count words as if it was an entire file
# But store separatelly the first/last word if
# the segment does not start/ends with an word
# separator(" ", ".", ",", "\n", etc...)
return (count_of_words, first_word, last_word)
This is the same idea behind MapReduce.
This is not a perfect-logic code. I have used C++. If you are very particular with C, you can use POSIX threads instead of std::thread. Also I have just divided the whole file size into number of threads. You will have to take care of last chunk of data(remaining from divide by number of threads), in the last thread itself. I haven't done this.
Another point is the way I am getting the return values from the threads. As of now I am saving it to a global array. C++11 supports retrieving return values - C++: Simple return value from std::thread?
#include <iostream>
#include <fstream>
#include <thread>
#include <mutex>
using namespace std;
#define NO_OF_THREADS 4
int countArray[100];
std::mutex g_pages_mutex;
int trimWhiteSpaces(char *arr, int start, int len)
{
int i = 0;
for(; i < len; i++)
{
char c = arr[i];
if(c == ' ')
{
continue;
}
else
break;
}
return i;
}
void getWordCount(char *arr, int len, int ID)
{
int count = 0;
bool isSpace = false;
int i = 0;
i = i + trimWhiteSpaces(arr, i, len);
for(; i < len; i++)
{
char c = arr[i];
if(c == ' ')
{
i = i + trimWhiteSpaces(&arr[i], i, len) - 1;
//printf("Found space");
isSpace = true;
count++;
}
else
{
isSpace = false;
}
}
if(isSpace)
count = count - 1;
count = count + 1;
g_pages_mutex.lock();
cout << "MYCOUNT:" << count << "\n";
countArray[ID] = count;
g_pages_mutex.unlock();
}
int main(int argc, const char * argv[])
{
char fileData[5000];
std::thread threadIDs[100];
int noOfThreads = NO_OF_THREADS;
char *filePath = "/Users/abc/Desktop/test.txt";
int read_sz = 0;
int decrements = 0;
bool previousNotEndsInSpace = false;
std::ifstream is(filePath, std::ifstream::ate | std::ifstream::binary);
int fileSize = is.tellg();
int bulkSize = fileSize / NO_OF_THREADS;
is.seekg(0);
for(int iter = 0; iter < NO_OF_THREADS; iter++)
{
int old_read_sz = read_sz;
is.read(fileData, bulkSize);
read_sz = is.tellg();
fileData[read_sz - old_read_sz] = '\0';
if(read_sz > 0)
{
cout << " data size so far: " << read_sz << "\n";
cout << fileData << endl;
if(previousNotEndsInSpace && fileData[0] != ' ')
{
decrements = decrements + 1;
}
if(fileData[read_sz - 1] != ' ')
{
previousNotEndsInSpace = true;
}
else
{
previousNotEndsInSpace = false;
}
//getWordCount(fileData, strlen(fileData), iter);
threadIDs[iter] = std::thread(getWordCount, fileData, strlen(fileData), iter);
}
}
for(int iter = 0; iter < NO_OF_THREADS; iter++)
{
threadIDs[iter].join();
}
int totalCount = 0;
for(int iter = 0; iter < NO_OF_THREADS; iter++)
{
cout << "COUNT: " << countArray[iter] << "\n";
totalCount = totalCount + countArray[iter];
}
cout << "TOTAL: " << totalCount - decrements << "\n";
return 0;
}
I'm currently developping a C code with mpi for matrix multiplication. I have functions already implemented as mult or multadd defined in an other file, working well.
But my file pblas.c compiles, but crashes when running.
I run my project on a university server, which has mli installed.
Where am i wrong in my pblas code ?
/**********************************************************************
This file is just a pattern for pblas parallel multiplication
There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "commfct.h"
#include "toolsfct.h"
void usage() {
fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
exit(1);
}
int main(int argc, char **argv) {
int me,nbProc;
int ligMe,colMe;
int blockSize;
int i,j;
double t;
if (argc != 2) {
usage();
}
blockSize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
int P = (int)sqrt(nbProc); // P = the number of rows of proc.
int Q = P; // Q = the number of columns of proc.
if ((P*Q) != nbProc) {
if (me == 0) {
fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
}
exit(1);
}
createGridComm(me,P,Q);
ligMe = me / Q;
colMe = me % Q;
// allocate memory for matrices
double *A,*Btmp, *B,*C,*CC;
A = (double *)malloc(blockSize*blockSize*sizeof(double));
B = (double *)malloc(blockSize*blockSize*sizeof(double));
Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
C = (double *)malloc(blockSize*blockSize*sizeof(double));
CC = (double *)malloc(blockSize*blockSize*sizeof(double));
/* fill blocks with pseudo values
NOTE : these values should not be changed so that
the check below is valid
*/
for(i=0;i<blockSize*blockSize;i++) {
A[i] = 2.0+(double)me;
B[i] = 1.0+(double)colMe;
C[i] = (double)me / 10.0;
}
/* CAUTION : in the following, A,B C are supposed to be stored
column after column, with each column of size blockSize.
Thus A(0,0) and A(1,0) are contiguous in memory, but
A(0,0) and A(0,1) are separated by blockSize cells.
*/
t = dclock(CLOCK_S);
MPI_Status status;
//main Loop
for(i=0;i<P;i++) {
/*************************************
Etape 1 et 2: Transposition column i (in step i) of B-blocks . stock in Btmp.
**************************************/
if(colMe==i){
if(ligMe==colMe) {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,ligMe,commCol);
multadd(A,B,C,blockSize);
}
else {
int dest = colMe * Q + ligMe;
MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,dest%Q,commCol);
mult(A,Btmp,CC,blockSize);
}
}
else {
int dest = colMe*Q + ligMe;
if(dest%Q == i) {
MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
// Broadcast on the column
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,colMe,commCol);
multadd(A,Btmp,C,blockSize);
}
else {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
mult(A,Btmp,CC,blockSize);
}
}
if(colMe == i)
MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
else
MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
}
t = dclock(CLOCK_S) -t;
printf("timing for %d : %f sec\n",me,t);
// checking for result correctness
int correct = 1;
double sum = 0.0;
for(i=0;i<P;i++) {
sum += 2.0+(ligMe*Q)+(double)i;
}
for(i=0;i<blockSize;i++) {
for(j=0;j<blockSize;j++) {
if (C[i+j*blockSize] != ((double)me/10.0 + sum*blockSize*(colMe+1.0))) {
correct = 0;
}
}
}
if (correct != 1) {
printf("multiplication result is not correct\n");
}
// free memory
free(A);
free(B);
free(C);
free(CC);
releaseGridComm();
MPI_Finalize();
return 0;
}
The trouble may come from indexes in MPI_Send(), MPI_Recv() or MPI_Bcast(). For instance, in MPI_Send(), the destinator shall call MPI_Recv(). In MPI_Recv(), the origin should have called MPI_Send(). Otherwise, you will get a deadlock. The code keeps on running, but it waits for messages.
From what you gave us, I guess you have two matrix A and B.
A0 | A1
...........
A2 | A3
To compute the first column of C, you need :
A0xB0 | A1xB2
...................
A2xB0 | A3xB2
I changed your code so that :
for each column i
i_th column of B is transposed in the i_th line of Btmp
i_th line of Btmp is broadcast to Btmp in each column.
Since the issue was about MPI, i reply with a MPI code that looks very close to yours, works but it does nothing, except communication operations...
/**********************************************************************
This file is just a pattern for pblas parallel multiplication
There are comments beginning with TO ADD that tell what must be done
where they are placed. Thus, just add the correct lines of code and
everything will work fine !
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "mpi.h"
//#include "commfct.h"
//#include "toolsfct.h"
#define TAG_TRANSPOSE 42
void usage() {
fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
exit(1);
}
int main(int argc, char **argv) {
int me,nbProc;
int ligMe,colMe;
int blockSize;
int i,j;
double t;
if (argc != 2) {
usage();
}
blockSize = atoi(argv[1]);
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
int P = (int)sqrt(nbProc); // P = the number of rows of proc.
int Q = P; // Q = the number of columns of proc.
if ((P*Q) != nbProc) {
if (me == 0) {
fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
}
exit(1);
}
//createGridComm(me,P,Q);
colMe = me / Q;
ligMe = me % Q;
MPI_Comm commCol, commLine;
//comes from http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
/* Split comm into row and column comms */
MPI_Comm_split(MPI_COMM_WORLD, ligMe, colMe, &commLine);
/* color by row, rank by column */
MPI_Comm_split(MPI_COMM_WORLD, colMe, ligMe, &commCol);
/* color by column, rank by row */
printf("[%d]:My coordinates are i j (%d,%d)\n",me,ligMe,colMe);
// allocate memory for matrices
double *A,*Btmp, *B,*C,*CC;
A = (double *)malloc(blockSize*blockSize*sizeof(double));
B = (double *)malloc(blockSize*blockSize*sizeof(double));
Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
C = (double *)malloc(blockSize*blockSize*sizeof(double));
CC = (double *)malloc(blockSize*blockSize*sizeof(double));
/* fill blocks with pseudo values
NOTE : these values should not be changed so that
the check below is valid
*/
for(i=0;i<blockSize*blockSize;i++) {
A[i] = 2.0+(double)me;
B[i] = 1.0+(double)colMe;
C[i] = (double)me / 10.0;
}
/* CAUTION : in the following, A,B C are supposed to be stored
column after column, with each column of size blockSize.
Thus A(0,0) and A(1,0) are contiguous in memory, but
A(0,0) and A(0,1) are separated by blockSize cells.
*/
// t = dclock(CLOCK_S);
MPI_Status status;
//main Loop
for(i=0;i<Q;i++) {
/*************************************
Etape 1 et 2: Transposition column i (in step i) of B-blocks . stock in Btmp.
**************************************/
if(colMe==i){
if(ligMe==colMe) {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// multadd(A,B,C,blockSize);
}
else {
int dest = ligMe * Q + i;//transpose !
MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// mult(A,Btmp,CC,blockSize);
}
}
else {
int from = i*Q + colMe;// transpose !
if(ligMe == i) {
MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,from,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
// Broadcast on the column
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// multadd(A,Btmp,C,blockSize);
}
else {
MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
// mult(A,Btmp,CC,blockSize);
}
}
if(colMe == i)
MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
else
MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
}
//t = dclock(CLOCK_S) -t;
printf("timing for %d : %f sec\n",me,t);
// checking for result correctness
int correct = 1;
double sum = 0.0;
for(i=0;i<P;i++) {
sum += 2.0+(ligMe*Q)+(double)i;
}
for(i=0;i<blockSize;i++) {
for(j=0;j<blockSize;j++) {
if (C[i+j*blockSize] <0.99999*((double)me/10.0 + sum*blockSize*(colMe+1.0)) || C[i+j*blockSize] >1.00001*((double)me/10.0 + sum*blockSize*(colMe+1.0)) ) {
correct = 0;
}
}
}
if (correct != 1) {
printf("multiplication result is not correct\n");
}
// free memory
free(A);
free(B);
free(C);
free(CC);
//releaseGridComm();
MPI_Finalize();
return 0;
}
Watch for the createGridComm(me,P,Q); I had to found something equivalent at http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
I also changed the test at the end of your code. Testing equality between double precision numbers will be too strict. Inequalities are better for floating-point numbers !
I hope this will help !
Bye,
Francis
I am working on an application which divides a string into pieces and assigns each to a block. Within each block the the text is scanned character by character and a shared array of int, D is to be updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfied the condition, a global int array m is set to 1 at the position corresponding to the text. This code was executed on a NVIDIA GEForce Fermi 550, and runs even slower than the CPU version. I have just included the kernel here:
__global__ void match(uint32_t* BB_d,const char* text_d,int n, int m,int k,int J,int lc,int start_addr,int tBlockSize,int overlap ,int* matched){
__shared__ int D[MAX_THREADS+2];
__shared__ char Text_S[MAX_PATTERN_SIZE];
__shared__ int DNew[MAX_THREADS+2];
__shared__ int BB_S[4][MAX_THREADS];
int w=threadIdx.x+1;
for(int i=0;i<4;i++)
{
BB_S[i][threadIdx.x]= BB_d[i*J+threadIdx.x];
}
{
D[threadIdx.x] = 0;
{
D[w] = (1<<(k+1)) -1;
for(int i = 0; i < lc - 1; i++)
{
D[w] = (D[w] << k+2) + (1<<(k+1)) -1;
}
}
D[J+1] = (1<<((k+2)*lc)) - 1;
}
int startblock=(blockIdx.x == 0?start_addr:(start_addr+(blockIdx.x * (tBlockSize-overlap))));
int size= (((startblock + tBlockSize) > n )? ((n- (startblock))):( tBlockSize));
int copyBlock=(size/J)+ ((size%J)==0?0:1);
if((threadIdx.x * copyBlock) <= size)
memcpy(Text_S+(threadIdx.x*copyBlock),text_d+(startblock+threadIdx.x*copyBlock),(((((threadIdx.x*copyBlock))+copyBlock) > size)?(size-(threadIdx.x*copyBlock)):copyBlock));
memcpy(DNew, D, (J+2)*sizeof(int));
__syncthreads();
uint32_t initial = D[1];
uint32_t x;
uint32_t mask = 1;
for(int i = 0; i < lc - 1; i++)mask = (mask<<(k+2)) + 1;
for(int i = 0; i < size;i++)
{
{
x = ((D[w] >> (k+2)) | (D[w - 1] << ((k + 2)* (lc - 1))) | (BB_S[(((int)Text_S[i])/2)%4][w-1])) & ((1 << (k + 2)* lc) - 1);
DNew[w] = ((D[w]<<1) | mask)
& (((D[w] << k+3) | mask|((D[w +1] >>((k+2)*(lc - 1)))<<1)))
& (((x + mask) ^ x) >> 1)
& initial;
}
__syncthreads();
memcpy(D, DNew, (J+2)*sizeof(int));
if(!(D[J] & 1<<(k + (k + 2)*(lc*J -m + k ))))
{
matched[startblock+i] = 1;
D[J] |= ((1<<(k + 1 + (k + 2)*(lc*J -m + k ))) - 1);
}
}
}
I am not very familiar with CUDA so I dont quite understand issues such as shared memory bank conflicts. Could that be the bottleneck here?
As asked, this is the code where I launch the kernels:
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000
void calculateBBArray(uint32_t** BB,const char* pattern_h,int m,int k , int lc , int J){};
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
char* getTextString() {
FILE *input, *output;
char c;
char * inputbuffer=(char *)malloc(sizeof(char)*TEXT_MAX_LENGTH);
int numchars = 0, index = 0;
input = fopen("sequence.fasta", "r");
c = fgetc(input);
while(c != EOF)
{
inputbuffer[numchars] = c;
numchars++;
c = fgetc(input);
}
fclose(input);
inputbuffer[numchars] = '\0';
return inputbuffer;
}
int main(void) {
const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
char * text_h=getTextString(); //reading text from file, supported upto 200MB currently
int k = 13;
int i;
int count=0;
char *pattern_d, *text_d; // pointers to device memory
char* text_new_d;
int* matched_d;
int* matched_new_d;
uint32_t* BB_d;
uint32_t* BB_new_d;
int* matched_h = (int*)malloc(sizeof(int)* strlen(text_h));
cudaMalloc((void **) &pattern_d, sizeof(char)*strlen(pattern_h)+1);
cudaMalloc((void **) &text_d, sizeof(char)*strlen(text_h)+1);
cudaMalloc((void **) &matched_d, sizeof(int)*strlen(text_h));
cudaMemcpy(pattern_d, pattern_h, sizeof(char)*strlen(pattern_h)+1, cudaMemcpyHostToDevice);
cudaMemcpy(text_d, text_h, sizeof(char)*strlen(text_h)+1, cudaMemcpyHostToDevice);
cudaMemset(matched_d, 0,sizeof(int)*strlen(text_h));
int m = strlen(pattern_h);
int n = strlen(text_h);
uint32_t* BB_h[4];
unsigned int maxLc = ((((m-k)*(k+2)) > (31))?(31/(k+2)):(m-k));
unsigned int lc=2; // Determines the number of threads per block
// can be varied upto maxLc for tuning performance
if(lc>maxLc)
{
exit(0);
}
unsigned int noWordorNfa =((m-k)/lc) + (((m-k)%lc) == 0?0:1);
cudaMalloc((void **) &BB_d, sizeof(int)*noWordorNfa*4);
if(noWordorNfa >= MAX_THREADS)
{
printf("Error: max threads\n");
exit(0);
}
calculateBBArray(BB_h,pattern_h,m,k,lc,noWordorNfa); // not included this function
for(i=0;i<4;i++)
{
cudaMemcpy(BB_d+ i*noWordorNfa, BB_h[i], sizeof(int)*noWordorNfa, cudaMemcpyHostToDevice);
}
int overlap=m;
int textBlockSize=(((m+k+1)>n)?n:(m+k+1));
cudaStream_t stream[MAX_STREAMS];
for(i=0;i<MAX_STREAMS;i++) {
cudaStreamCreate( &stream[i] );
}
int start_addr=0,index=0,maxNoBlocks=0;
if(textBlockSize>n)
{
maxNoBlocks=1;
}
else
{
maxNoBlocks=((1 + ((n-textBlockSize)/(textBlockSize-overlap)) + (((n-textBlockSize)%(textBlockSize-overlap)) == 0?0:1)));
}
int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS)?MAX_BLOCKS:maxNoBlocks);
int blocksRemaining =maxNoBlocks;
printf(" maxNoBlocks %d kernel Blocks %d \n",maxNoBlocks,kernelBlocks);
while(blocksRemaining >0)
{
kernelBlocks = ((blocksRemaining > MAX_BLOCKS)?MAX_BLOCKS:blocksRemaining);
printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",kernelBlocks,start_addr,textBlockSize);
match<<<kernelBlocks,noWordorNfa,0,stream[(index++)%MAX_STREAMS]>>>(BB_d,text_d,n,m,k,noWordorNfa,lc,start_addr,textBlockSize,overlap,matched_d);
start_addr+=kernelBlocks*(textBlockSize-overlap);;
blocksRemaining -= kernelBlocks;
}
cudaMemcpy(matched_h, matched_d, sizeof(int)*strlen(text_h), cudaMemcpyDeviceToHost);
checkCUDAError("Matched Function");
for(i=0;i<MAX_STREAMS;i++)
cudaStreamSynchronize( stream[i] );
// do stuff with matched
// ....
// ....
free(matched_h);cudaFree(pattern_d);cudaFree(text_d);cudaFree(matched_d);
return 0;
}
Number of threads launched per block depends upon the length pattern_h(could be at most maxLc above). I expect it to be around 30 in this case. Shoudn't that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (=10) at a time since the hardware can schedule only 8 simultaneously
NOTE: I don't have GUI access.
With all the shared memory you're using, you could be running into bank conflicts if consecutive threads are not reading from consecutive addresses in the shared arrays ... that could cause serialization of the memory accesses, which in turn will kill the parallel performance of your algorithm.
I breifly looked at your code but it looks like your sending data to the gpu back and forth creating a bottle neck on the bus? did you try profiling it?
I found that I was copying the whole array Dnew to D in each thread rather than copying only the portion each thread was supposed to update D[w]. This would cause the threads to execute serially, although I don't know if it could be called a shared memory bank conflict. Now it gives 8-9x speedup for large enough patterns(=more threads). This is much less than what I expected. I will try to increase number of blocks as suggested. I dont know how to increase the # of threads
I'm working on a large scale project in which I'm designing a sparse matrix vector application but I'm still working to understand the code. I'm beginning by building the foundation for the application but I've run into a segmentation fault when executing the program. I've tracked the problem to this loop within the MatrixRead function and am enclosing the code below. When the program is executed I tried programming in some test messages and the program appears to execute all the loops but it returns the segmentation fault at the end. Of course, this is all just speculation. Any help would be awesome. Thanks!
while (ret != EOF && row <= mat->rows)
{
if (row != curr_row) // Won't execute for first iteration
{
/* store this row */
MatrixSetRow(mat, curr_row, len, ind, val);
/* check if the previous row is zero */
i = 1;
while(row != curr_row + i)
{
mat->lens[curr_row+i-1] = 0;
mat->inds[curr_row+i-1] = 0;
mat->vals[curr_row+i-1] = 0;
i++;
}
curr_row = row;
/* reset row pointer */
len = 0;
}
ind[len] = col;
val[len] = value;
len++;
ret = fscanf(file, "%lf %lf %lf", &r1, &c1, &value);
col = (int) (c1);
row = (int) (r1);
}
/* Store the final row */
if (ret == EOF || row > mat->rows)
MatrixSetRow(mat, mat->rows, len, ind, val);
Here's the code for the MatrixSetRow function:
/*--------------------------------------------------------------------------
* MatrixSetRow - Set a row in a matrix. Only local rows can be set.
* Once a row has been set, it should not be set again, or else the
* memory used by the existing row will not be recovered until
* the matrix is destroyed. "row" is in global coordinate numbering.
*--------------------------------------------------------------------------*/
void MatrixSetRow(Matrix *mat, int row, int len, int *ind, double *val)
{
row -= 1;
mat->lens[row] = len;
mat->inds[row] = (int *) MemAlloc(mat->mem, len*sizeof(int));
mat->vals[row] = (double *) MemAlloc(mat->mem, len*sizeof(double));
if (ind != NULL)
memcpy(mat->inds[row], ind, len*sizeof(int));
if (val != NULL)
memcpy(mat->vals[row], val, len*sizeof(double));
}
I'm also including the code for the Matrix.h file that went with it, where the members of Matrix are defined:
#include <stdio.h>
#include "Common.h"
#include "Mem.h"
#ifndef _MATRIX_H
#define _MATRIX_H
typedef struct
{
int rows;
int columns;
Mem *mem;
int *lens;
int **inds;
double **vals;
}
Matrix;