GPU code running slower than CPU version - c

I am working on an application which divides a string into pieces and assigns each to a block. Within each block the text is scanned character by character, and a shared array of int, D, is updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfies the condition, a global int array, matched, is set to 1 at the position corresponding to the text. This code was executed on an NVIDIA GeForce 550 (Fermi) and runs even slower than the CPU version. I have just included the kernel here:
__global__ void match(uint32_t* BB_d, const char* text_d, int n, int m, int k, int J,
                      int lc, int start_addr, int tBlockSize, int overlap, int* matched)
{
    __shared__ int  D[MAX_THREADS + 2];
    __shared__ char Text_S[MAX_PATTERN_SIZE];
    __shared__ int  DNew[MAX_THREADS + 2];
    __shared__ int  BB_S[4][MAX_THREADS];

    int w = threadIdx.x + 1;

    // Stage the bit-vector tables into shared memory
    for (int i = 0; i < 4; i++)
        BB_S[i][threadIdx.x] = BB_d[i * J + threadIdx.x];

    // Initialize the state vector D
    D[threadIdx.x] = 0;
    D[w] = (1 << (k + 1)) - 1;
    for (int i = 0; i < lc - 1; i++)
        D[w] = (D[w] << (k + 2)) + (1 << (k + 1)) - 1;
    D[J + 1] = (1 << ((k + 2) * lc)) - 1;

    int startblock = (blockIdx.x == 0 ? start_addr
                                      : start_addr + blockIdx.x * (tBlockSize - overlap));
    int size = ((startblock + tBlockSize) > n) ? (n - startblock) : tBlockSize;
    int copyBlock = (size / J) + ((size % J) == 0 ? 0 : 1);

    // Each thread copies its slice of the text into shared memory
    if ((threadIdx.x * copyBlock) <= size)
        memcpy(Text_S + threadIdx.x * copyBlock,
               text_d + startblock + threadIdx.x * copyBlock,
               ((threadIdx.x * copyBlock + copyBlock) > size)
                   ? (size - threadIdx.x * copyBlock) : copyBlock);

    memcpy(DNew, D, (J + 2) * sizeof(int));
    __syncthreads();

    uint32_t initial = D[1];
    uint32_t x;
    uint32_t mask = 1;
    for (int i = 0; i < lc - 1; i++)
        mask = (mask << (k + 2)) + 1;

    // Scan the text character by character
    for (int i = 0; i < size; i++) {
        x = ((D[w] >> (k + 2)) | (D[w - 1] << ((k + 2) * (lc - 1)))
             | BB_S[(((int)Text_S[i]) / 2) % 4][w - 1])
            & ((1 << ((k + 2) * lc)) - 1);
        DNew[w] = ((D[w] << 1) | mask)
                & ((D[w] << (k + 3)) | mask | ((D[w + 1] >> ((k + 2) * (lc - 1))) << 1))
                & (((x + mask) ^ x) >> 1)
                & initial;
        __syncthreads();
        memcpy(D, DNew, (J + 2) * sizeof(int));  // note: every thread copies the whole array
        if (!(D[J] & (1 << (k + (k + 2) * (lc * J - m + k))))) {
            matched[startblock + i] = 1;
            D[J] |= (1 << (k + 1 + (k + 2) * (lc * J - m + k))) - 1;
        }
    }
}
I am not very familiar with CUDA, so I don't quite understand issues such as shared memory bank conflicts. Could that be the bottleneck here?
As asked, this is the code where I launch the kernels:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda.h>

#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000

void calculateBBArray(uint32_t** BB, const char* pattern_h, int m, int k, int lc, int J){};

void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if (cudaSuccess != err) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

char* getTextString() {
    FILE *input;
    int c;  // int, not char, so the EOF comparison is reliable
    char *inputbuffer = (char *)malloc(sizeof(char) * TEXT_MAX_LENGTH);
    int numchars = 0;
    input = fopen("sequence.fasta", "r");
    c = fgetc(input);
    while (c != EOF) {
        inputbuffer[numchars] = c;
        numchars++;
        c = fgetc(input);
    }
    fclose(input);
    inputbuffer[numchars] = '\0';
    return inputbuffer;
}

int main(void) {
    const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
    char *text_h = getTextString();  // reading text from file, supported up to 200MB currently
    int k = 13;
    int i;
    char *pattern_d, *text_d;  // pointers to device memory
    int *matched_d;
    uint32_t *BB_d;
    int *matched_h = (int *)malloc(sizeof(int) * strlen(text_h));
    cudaMalloc((void **)&pattern_d, sizeof(char) * strlen(pattern_h) + 1);
    cudaMalloc((void **)&text_d, sizeof(char) * strlen(text_h) + 1);
    cudaMalloc((void **)&matched_d, sizeof(int) * strlen(text_h));
    cudaMemcpy(pattern_d, pattern_h, sizeof(char) * strlen(pattern_h) + 1, cudaMemcpyHostToDevice);
    cudaMemcpy(text_d, text_h, sizeof(char) * strlen(text_h) + 1, cudaMemcpyHostToDevice);
    cudaMemset(matched_d, 0, sizeof(int) * strlen(text_h));

    int m = strlen(pattern_h);
    int n = strlen(text_h);
    uint32_t *BB_h[4];
    unsigned int maxLc = ((((m - k) * (k + 2)) > 31) ? (31 / (k + 2)) : (m - k));
    unsigned int lc = 2;  // determines the number of threads per block;
                          // can be varied up to maxLc for tuning performance
    if (lc > maxLc) {
        exit(0);
    }
    unsigned int noWordorNfa = ((m - k) / lc) + (((m - k) % lc) == 0 ? 0 : 1);
    cudaMalloc((void **)&BB_d, sizeof(int) * noWordorNfa * 4);
    if (noWordorNfa >= MAX_THREADS) {
        printf("Error: max threads\n");
        exit(0);
    }
    calculateBBArray(BB_h, pattern_h, m, k, lc, noWordorNfa);  // definition not included
    for (i = 0; i < 4; i++) {
        cudaMemcpy(BB_d + i * noWordorNfa, BB_h[i], sizeof(int) * noWordorNfa, cudaMemcpyHostToDevice);
    }

    int overlap = m;
    int textBlockSize = (((m + k + 1) > n) ? n : (m + k + 1));
    cudaStream_t stream[MAX_STREAMS];
    for (i = 0; i < MAX_STREAMS; i++) {
        cudaStreamCreate(&stream[i]);
    }
    int start_addr = 0, index = 0, maxNoBlocks = 0;
    if (textBlockSize > n) {
        maxNoBlocks = 1;
    } else {
        maxNoBlocks = 1 + ((n - textBlockSize) / (textBlockSize - overlap))
                        + (((n - textBlockSize) % (textBlockSize - overlap)) == 0 ? 0 : 1);
    }
    int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS) ? MAX_BLOCKS : maxNoBlocks);
    int blocksRemaining = maxNoBlocks;
    printf(" maxNoBlocks %d kernel Blocks %d \n", maxNoBlocks, kernelBlocks);
    while (blocksRemaining > 0) {
        kernelBlocks = ((blocksRemaining > MAX_BLOCKS) ? MAX_BLOCKS : blocksRemaining);
        printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",
               kernelBlocks, start_addr, textBlockSize);
        match<<<kernelBlocks, noWordorNfa, 0, stream[(index++) % MAX_STREAMS]>>>
             (BB_d, text_d, n, m, k, noWordorNfa, lc, start_addr, textBlockSize, overlap, matched_d);
        start_addr += kernelBlocks * (textBlockSize - overlap);
        blocksRemaining -= kernelBlocks;
    }
    cudaMemcpy(matched_h, matched_d, sizeof(int) * strlen(text_h), cudaMemcpyDeviceToHost);
    checkCUDAError("Matched Function");
    for (i = 0; i < MAX_STREAMS; i++)
        cudaStreamSynchronize(stream[i]);
    // do stuff with matched
    // ....
    free(matched_h); cudaFree(pattern_d); cudaFree(text_d); cudaFree(matched_d);
    return 0;
}
The number of threads launched per block depends on the length of pattern_h (it can be at most maxLc above). I expect it to be around 30 in this case. Shouldn't that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (8 in the code above) at a time, since the hardware can only schedule 8 simultaneously.
NOTE: I don't have GUI access.

With all the shared memory you're using, you could be running into bank conflicts if consecutive threads are not reading from consecutive addresses in the shared arrays. That causes serialization of the memory accesses, which in turn will kill the parallel performance of your algorithm.
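To illustrate (a generic sketch, not taken from the code above): Fermi has 32 shared memory banks, and consecutive 32-bit words live in consecutive banks, so the stride of the index decides whether a warp's accesses serialize.

__global__ void bank_demo(int *out) {
    __shared__ int s[32 * 32];
    int t = threadIdx.x;      // assume a single warp of 32 threads

    s[t] = t;                 // stride 1: each lane hits a different bank
    int a = s[t];             // conflict-free, serviced in one transaction

    s[t * 32] = t;            // stride 32: every lane maps to the same bank
    int b = s[t * 32];        // 32-way conflict, fully serialized

    out[t] = a + b;
}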

I briefly looked at your code, but it looks like you're sending data back and forth to the GPU, creating a bottleneck on the bus? Did you try profiling it?

I found that I was copying the whole array DNew to D in each thread, rather than having each thread copy only the portion it was supposed to update, D[w]. This would cause the threads to execute serially, although I don't know if it could be called a shared memory bank conflict. Now it gives an 8-9x speedup for large enough patterns (= more threads). This is much less than what I expected. I will try to increase the number of blocks as suggested. I don't know how to increase the number of threads, though.
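For reference, a minimal sketch of that fix inside the kernel's main loop, using the names from the code above: each thread copies just the element it owns, with barriers around the copy instead of a per-thread memcpy of the whole array.

__syncthreads();             // all DNew[w] writes are finished
D[w] = DNew[w];              // each thread copies only its own element
if (threadIdx.x == 0) {      // one thread refreshes the two boundary cells
    D[0]     = DNew[0];
    D[J + 1] = DNew[J + 1];
}
__syncthreads();             // D is consistent before the next character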

Related

rand() generating different values when used in valgrind

I'm trying to randomly generate rooms in a two-dimensional array of size 100x100. If the room being generated collides with an already existing room, it generates new points for the room. The generation code makes sense conceptually, but when I try to run it, the program loops endlessly, and checking the log reveals why.
Room created successfully with dimensions x=0, y=0, width=976761120, height=809120052
For some reason, at lines 65-68, inside create_room(), the width and height for the room are being assigned huge numbers, when they should be between 1 and 11. Just for fun, I ran the program through Valgrind using the options --track-origins=yes -v, and what I found surprised me. Suddenly, the program would run!
Room created successfully with dimensions x=0, y=0, width=0, height=0
While still not exactly what I wanted, this at least prevents an infinite loop of collisions being detected with an impossibly huge room.
So, my question is: why does the code generate such large numbers when executed normally, but smaller numbers when run in Valgrind?
Here's the code for the program.
#include <time.h>
#include <stdlib.h>
#include "global.h"
#include "draw.h"
#include "log.h"
#include "generate.h"

#define NUM_ROOMS 10
#define ROOM_SIZE 10
#define MAP_HEIGHT 100
#define MAP_WIDTH 100

static struct ROOM* create_room (unsigned int);

struct ROOM {
    int x, y, width, height;
    int feature;
};

struct ROOM* rooms[NUM_ROOMS] = {NULL};
static FILE* gen_log;
static WINDOW* gen_window;

int** generate_dungeon(unsigned int seed){
    char* log_entry = malloc (80);
    int i = 0, j, k;
    gen_window = create_window (0, 0, LINES, COLS);
    gen_log = log_open (GEN_LOG);
    if (seed == 0){
        time_t t;
        seed = time (&t);
    }
    srand (seed);
    for (int i = 0; i < NUM_ROOMS; i++){
        rooms[i] = create_room (seed);
        sprintf (log_entry, "Room created successfully with dimensions x=%d, y=%d, width=%d, height=%d\n",
                 rooms[i]->x, rooms[i]->y, rooms[i]->width, rooms[i]->height);
        LOG_DEBUG (gen_log, log_entry);
    }
    LOG_DEBUG (gen_log, "Beginning to draw rooms\n");
    for (i = 0; i < NUM_ROOMS; i++){
        sprintf (log_entry, "Drawing room %d\n", i);
        LOG_DEBUG (gen_log, log_entry);
        for (j = rooms[i]->y; j < rooms[i]->y + rooms[i]->height; j++){
            for (k = rooms[i]->x; k < rooms[i]->x + rooms[i]->width; k++){
                sprintf (log_entry, "Clearing %d,%d]\n", j, k);
                LOG_DEBUG (gen_log, log_entry);
                map_array[j][k] = 1;
            }
        }
    }
    destroy_window (gen_window);
}

static struct ROOM* create_room (unsigned int seed){
    int i = 0, flag;
    srand (seed);
    if (rooms[0] == NULL)
        flag = 0;
    else
        flag = 1;
    char* log_entry = malloc (80);
    struct ROOM* new_room = malloc (sizeof(struct ROOM));
    while (flag){
        draw_notify (gen_window, "Creating room\n");
        new_room->x = (rand() % MAP_WIDTH);
        new_room->y = (rand() % MAP_HEIGHT);
        new_room->width = (rand() % ROOM_SIZE + 1);
        new_room->height = (rand() % ROOM_SIZE + 1);
        sprintf (log_entry, "New room created with points x=%d, y=%d, width=%d, height=%d\n",
                 new_room->x, new_room->y, new_room->width, new_room->height);
        LOG_DEBUG (gen_log, log_entry);
        draw_notify (gen_window, "Log entry made\n");
        if (new_room->x + new_room->width >= MAP_WIDTH || new_room->y + new_room->height >= MAP_HEIGHT){
            LOG_DEBUG (gen_log, "Room out of bounds\n");
            continue;
        }
        i = 0;
        draw_notify (gen_window, "Entering loop\n");
        while (rooms[i] != NULL && i < NUM_ROOMS){
            sprintf (log_entry, "Testing room %d\n", i);
            draw_notify (gen_window, log_entry);
            LOG_DEBUG (gen_log, log_entry);
            if (new_room->x < rooms[i]->x + rooms[i]->width &&
                new_room->x + new_room->width > rooms[i]->x &&
                new_room->y < rooms[i]->y + rooms[i]->height &&
                new_room->y + new_room->height > rooms[i]->y){
                sprintf (log_entry, "Collision detected with room %d\n", i);
                draw_notify (gen_window, log_entry);
                LOG_DEBUG (gen_log, log_entry);
                flag = 1;
                break;
            }
            else{
                sprintf (log_entry, "Room %d passed.\n", i);
                flag = 0;
                i++;
            }
        }
        draw_notify (gen_window, "Exited loop\n");
    }
    return new_room;
}
You have some logic errors and end up with uninitialized values.
You initialize rooms to be an array of NULL pointers.
In create_room, you have:
    if (rooms[0] == NULL)
        flag = 0;
    else
        flag = 1;
First time around, flag will be set to 0. And then you reach:
    struct ROOM* new_room = malloc (sizeof(struct ROOM));
    while (flag){
Since flag is 0, nothing under the while gets executed, and you end up with uninitialized members in new_room.
You need to rethink your logic and make sure that the members of new_room are always initialized.
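A minimal sketch of one possible restructuring (hypothetical, reusing the posted names, and assuming srand() is called once by the caller rather than per room): generate the members unconditionally, and loop only while the room is out of bounds or collides.

static struct ROOM* create_room (void){
    struct ROOM* new_room = malloc (sizeof(struct ROOM));
    int collides;
    do {                                    /* members are always assigned */
        new_room->x = rand() % MAP_WIDTH;
        new_room->y = rand() % MAP_HEIGHT;
        new_room->width = rand() % ROOM_SIZE + 1;
        new_room->height = rand() % ROOM_SIZE + 1;
        collides = (new_room->x + new_room->width >= MAP_WIDTH ||
                    new_room->y + new_room->height >= MAP_HEIGHT);
        for (int i = 0; !collides && i < NUM_ROOMS && rooms[i] != NULL; i++){
            if (new_room->x < rooms[i]->x + rooms[i]->width &&
                new_room->x + new_room->width > rooms[i]->x &&
                new_room->y < rooms[i]->y + rooms[i]->height &&
                new_room->y + new_room->height > rooms[i]->y)
                collides = 1;
        }
    } while (collides);
    return new_room;
}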

How to use pthread in C to count the number of word occurrences?

I have written a program in C to count all the word occurrences of each word in a file and sort them to display the most occurring words to the least occurring words. However, I need to use pthread to create multiple threads, depending on the number entered as an argument in the command line. The file needs to be split up into the number of threads entered. For example, say 4 was entered in the command line as an argument. The file would then need to be split up into four parts with each part using a new thread. Then the four parts would need to be joined back together. My C is not very good and I am lost on how to do this. Can anyone please help with this? An example would be great.
Here is my code so far:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/mman.h>

int main(int argc, char **argv) {
    struct stat fileStat;
    FILE *out;
    char *addr;
    int size, res, file, num_threads;
    list_t *words = (list_t *)malloc(sizeof(list_t));  /* list_t is defined elsewhere in the project */
    res = access(argv[1], F_OK);
    if (res != 0) {
        exit(1);
    }
    stat(argv[1], &fileStat);
    // Check if a file.
    if (S_ISREG(fileStat.st_mode)) {
        file = open(argv[1], O_RDONLY);
        if (file < 0)
            exit(1);
        // Check the total size of the file
        size = fileStat.st_size;
        num_threads = atoi(argv[2]);
        if ((addr = mmap(0, size, PROT_READ, MAP_SHARED, file, 0)) == MAP_FAILED) {
            exit(1);
        }
        munmap(addr, size);
        close(file);
    } else {
        exit(1);
    }
    return 0;
}
Multiple threads can safely read a source file without issue; writing is when you have problems.
My suggestion (without really understanding the requirements) is:
On launch, determine the file size.
Calculate size / threadcount.
Let's say the file is 4k in size: you get a value of about 1k per thread.
Seek one chunk size into the file, then read single bytes until you find a word separator.
This position is the end of thread 1's zone and the start of thread 2's.
Seek to the second and third chunk boundaries and do the same.
At this point you have a start and finish position for each thread (see the sketch after this list).
Launch each thread and pass it the positions it is responsible for covering.
Make the hashtable (or whatever method you are using for counting words) thread-safe with mutual exclusion techniques, and just have each thread add to the count of whatever word is found.
Once all threads are done, you have your list.
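A small sketch of that boundary scan (illustrative names, not from the question; it assumes the file is mmap'ed read-only as in the posted code):

#include <ctype.h>
#include <stddef.h>

/* Push a proposed split point forward to the next word separator so
 * that no word straddles two threads' zones. */
static size_t adjust_boundary(const char *data, size_t proposed, size_t file_size)
{
    size_t pos = proposed;
    while (pos < file_size && !isspace((unsigned char)data[pos]))
        pos++;                 /* finish the word we landed in */
    return pos;                /* thread N ends here, thread N+1 starts here */
}

Thread i would then cover the byte range from adjust_boundary(data, i * chunk, size) to adjust_boundary(data, (i + 1) * chunk, size).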
The idea here is that by dividing the work among multiple threads and joining the parts afterwards, the same operation runs much faster. So you need to:
Divide the work into many parts without wasting too much time.
Discover a way of joining the work back together easily.
Solve the problem at the boundaries created by dividing the work.
The first part is simple: just divide your data equally between threads.
The second part is easy too: just sum the results.
The tricky part is part number 3. In your case, you could end up with a word divided between two different threads. So to avoid counting "half-words", you must maintain a separate record of the first/last word of every thread. Then, when you have all your results, you can take the last word of thread N, concatenate it with the first word of thread N+1, and only then add the word to the count. Obviously, if a separator (space, newline, ...) is the first/last char seen by a thread, the respective first/last word will be empty.
In pseudo-code:
def main:
    size = filesize
    ptr = mmap(file)
    num_threads = 4
    for i in range(0, num_threads):
        new_thread(exec = count_words,
                   start = ptr + i * size / num_threads,
                   length = size / num_threads)
    wait_for_result(all_threads)
    join_the_results

def count_words(start, length):
    # Count words as if this segment were an entire file,
    # but store separately the first/last word if the segment
    # does not start/end with a word separator (" ", ".", ",", "\n", etc.)
    return (count_of_words, first_word, last_word)
This is the same idea behind MapReduce.
This code's logic is not perfect. I have used C++; if you are very particular about C, you can use POSIX threads instead of std::thread. Also, I have just divided the whole file size by the number of threads: you will have to take care of the last chunk of data (the remainder of that division) in the last thread yourself. I haven't done this.
Another point is the way I am getting the return values from the threads: as of now I am saving them to a global array. C++11 supports retrieving return values - see C++: Simple return value from std::thread?
#include <iostream>
#include <fstream>
#include <cstring>
#include <thread>
#include <mutex>
using namespace std;

#define NO_OF_THREADS 4

int countArray[100];
std::mutex g_pages_mutex;

int trimWhiteSpaces(char *arr, int start, int len)
{
    int i = 0;
    for (; i < len; i++)
    {
        char c = arr[i];
        if (c == ' ')
            continue;
        else
            break;
    }
    return i;
}

void getWordCount(char *arr, int len, int ID)
{
    int count = 0;
    bool isSpace = false;
    int i = 0;
    i = i + trimWhiteSpaces(arr, i, len);
    for (; i < len; i++)
    {
        char c = arr[i];
        if (c == ' ')
        {
            i = i + trimWhiteSpaces(&arr[i], i, len) - 1;
            isSpace = true;
            count++;
        }
        else
        {
            isSpace = false;
        }
    }
    if (isSpace)
        count = count - 1;
    count = count + 1;
    g_pages_mutex.lock();
    cout << "MYCOUNT:" << count << "\n";
    countArray[ID] = count;
    g_pages_mutex.unlock();
}

int main(int argc, const char *argv[])
{
    // One buffer per thread: the chunks must not be overwritten
    // while a previously launched thread is still reading them.
    char fileData[NO_OF_THREADS][5000];
    std::thread threadIDs[100];
    int noOfThreads = NO_OF_THREADS;
    const char *filePath = "/Users/abc/Desktop/test.txt";
    int read_sz = 0;
    int decrements = 0;
    bool previousNotEndsInSpace = false;
    std::ifstream is(filePath, std::ifstream::ate | std::ifstream::binary);
    int fileSize = is.tellg();
    int bulkSize = fileSize / NO_OF_THREADS;
    is.seekg(0);
    for (int iter = 0; iter < NO_OF_THREADS; iter++)
    {
        int old_read_sz = read_sz;
        is.read(fileData[iter], bulkSize);
        read_sz = is.tellg();
        int chunkLen = read_sz - old_read_sz;
        fileData[iter][chunkLen] = '\0';
        if (read_sz > 0)
        {
            cout << " data size so far: " << read_sz << "\n";
            cout << fileData[iter] << endl;
            // A word split across two chunks would be counted twice;
            // remember whether the previous chunk ended mid-word.
            if (previousNotEndsInSpace && fileData[iter][0] != ' ')
            {
                decrements = decrements + 1;
            }
            previousNotEndsInSpace = (fileData[iter][chunkLen - 1] != ' ');
            threadIDs[iter] = std::thread(getWordCount, fileData[iter], strlen(fileData[iter]), iter);
        }
    }
    for (int iter = 0; iter < NO_OF_THREADS; iter++)
    {
        threadIDs[iter].join();
    }
    int totalCount = 0;
    for (int iter = 0; iter < NO_OF_THREADS; iter++)
    {
        cout << "COUNT: " << countArray[iter] << "\n";
        totalCount = totalCount + countArray[iter];
    }
    cout << "TOTAL: " << totalCount - decrements << "\n";
    return 0;
}

Segmentation Error 11 in C while writing a file to disk and using buffers

This program is designed to create an output sine wave file, with user inputs for duration, amplitude, sampling rate and frequency, by filling a buffer with values and applying a short attack and decay ramp before writing a new .aiff file with the data.
Although my program compiles fine, it runs into a 'Segmentation fault 11' when run with arguments, which after some quick googling seems to be linked to memory problems. I've checked my code several times (mainly the areas that deal with the buffer size and pointers to it).
/* playsine.c */
/* Creates a sine wave audio file with input outfile - duration - amplitude -
   sampling rate - frequency */
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <portsf.h>

int makeSine(float *buffer, double amplitude, long numFrames, double sineFreq,
             double samplingPeriod){
    long i;
    double time;
    double twoPi = 2 * M_PI;
    for(i = 0, time = 0; i < numFrames; i++){
        buffer[i] = amplitude * sin(twoPi * sineFreq * time);
        time += samplingPeriod;
    }
    return i;
}

long attack(float *buffer, long attackFrames){
    long i = 0;
    double factor = 0.0, increment = 1.0/attackFrames;
    while(factor <= 1.0 && i < attackFrames){
        buffer[i] = factor * buffer[i];
        factor += increment;
        ++i;
    }
    return i;
}

long decay(float *endBuffer, long decayFrames){
    long i = 0;
    double factor = 1.0, decrement = 1.0/decayFrames;
    while(factor >= 0.0 && i < decayFrames){
        endBuffer[i] = decayFrames * endBuffer[i];
        factor -= decrement;
        ++i;
    }
    return i;
}

enum {nameArg, outArg, durArg, ampArg, sampArg, freqArg, numArg};

int main(int argc, char* argv[]){
    if(argc < numArg){
        printf("Usage:\toutfile.aiff\tduration(s)\tamplitude(0-1)\tsampling rate\t\
frequency(hz)\n");
        return 1;
    }
    if(psf_init()){
        printf("Error: Unable to open portsf library\n");
        return 1;
    }
    PSF_PROPS props;
    int outfile;
    long numFrames, samplingRate = atol(argv[sampArg]);
    double amps = atof(argv[ampArg]), samplingPeriod = 1.0/samplingRate;
    double sineFreq = atof(argv[freqArg]), attackFrames = 0.005 * samplingRate;
    double decayFrames = 0.01 * samplingRate;
    float *buffer, duration = atof(argv[durArg]);
    numFrames = (long)duration * samplingRate;
    float *endBuffer = buffer + (numFrames - (long)decayFrames);
    //Fill structure
    props.srate = samplingRate;
    props.chans = 1;
    props.samptype = PSF_SAMP_16;
    props.format = PSF_AIFF;
    props.chformat = MC_MONO;
    //Assign buffer
    buffer = (float*)malloc(numFrames * props.chans * sizeof(float));
    if(buffer == 0){
        printf("Error: unable to allocate buffer\n");
        return 1;
    }else{
        //Fill buffer
        if(makeSine(buffer, amps, numFrames, sineFreq, samplingPeriod) != numFrames){
            printf("Error: unable to create sinewave\n");
            return 1;
        }
        attack(buffer, attackFrames);
        decay(endBuffer, decayFrames);
    }
    //Create an outfile
    outfile = psf_sndCreate(argv[outArg], &props, 0, 0, PSF_CREATE_RDWR);
    if(outfile < 0){
        printf("Error: unable to create %s\n", argv[outArg]);
        return 1;
    }
    //Write buffer to file
    printf("Writing %s ...\n", argv[outArg]);
    if(psf_sndWriteFloatFrames(outfile, buffer, numFrames) != numFrames){
        printf("Warning: error writing %s\n", argv[outArg]);
        return 1;
    }
    //Close file
    if(psf_sndClose(outfile)){
        printf("Warning: error closing %s\n", argv[outArg]);
        return 1;
    }
    psf_finish();
    return 1;
}
Two immediate problems that I see in decay() and attack():
long attack (float *buffer, long attackFrames) {
    long i;
    ...
    buffer[i] = factor * buffer[i]; //Oops, i is never initialized
The variable i is never initialized. This is undefined behaviour and could well lead to crashes. I assume you actually wanted to do something like:
long attack (float *buffer, long attackFrames) {
    long i = 0;
    double factor = 0.0, increment = 1.0/attackFrames;
    while(factor <= 1.0 && i < attackFrames) {
        buffer[i] = factor * buffer[i];
        factor += increment;
        ++i;
    }
    return i;
}
Edit: Another problem is that you reference uninitialized memory with endBuffer:
float *buffer; // Buffer not initialized
float *endBuffer = buffer + (numFrames - (long)decayFrames); // Oops!
...
buffer = (float*)malloc(numFrames * props.chans * sizeof(float));
//endBuffer still points to buffer's original address, which is who-knows-where
You should assign endBuffer after you use malloc() to allocate buffer.
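In other words, a short sketch of the fix, using the variables from the question:

float *buffer = (float *)malloc(numFrames * props.chans * sizeof(float));
if(buffer == NULL){
    printf("Error: unable to allocate buffer\n");
    return 1;
}
/* Only now does buffer hold a real address, so endBuffer points into it. */
float *endBuffer = buffer + (numFrames - (long)decayFrames);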

CUDA C Timeout on Kernel Call: "the launch timed out and was terminated" [duplicate]

My code is a parallel implementation that calculates the nth digit of pi. When I finish the kernel and try to copy the memory back to the host, I get a "the launch timed out and was terminated" error.
I used this code for error checking after each cudaMalloc, cudaMemcpy, and kernel launch:
std::string error = cudaGetErrorString(cudaGetLastError());
printf("%s\n", error.c_str());
These calls were saying everything was fine until the first cudaMemcpy call after returning from the kernel. The error happens in the line "cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);" in main. Any help is appreciated.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define mul_mod(a,b,m) fmod( (double) a * (double) b, m)

/* return the inverse of x mod y */
__device__ int inv_mod(int x, int y) {
    int q,u,v,a,c,t;
    u=x;
    v=y;
    c=1;
    a=0;
    do {
        q=v/u;
        t=c;
        c=a-q*c;
        a=t;
        t=u;
        u=v-q*u;
        v=t;
    } while (u!=0);
    a=a%y;
    if (a<0) a=y+a;
    return a;
}

/* return the inverse of u mod v, if v is odd */
__device__ int inv_mod2(int u, int v) {
    int u1,u3,v1,v3,t1,t3;
    u1=1;
    u3=u;
    v1=v;
    v3=v;
    if ((u&1)!=0) {
        t1=0;
        t3=-v;
        goto Y4;
    } else {
        t1=1;
        t3=u;
    }
    do {
        do {
            if ((t1&1)==0) {
                t1=t1>>1;
                t3=t3>>1;
            } else {
                t1=(t1+v)>>1;
                t3=t3>>1;
            }
Y4:;
        } while ((t3&1)==0);
        if (t3>=0) {
            u1=t1;
            u3=t3;
        } else {
            v1=v-t1;
            v3=-t3;
        }
        t1=u1-v1;
        t3=u3-v3;
        if (t1<0) {
            t1=t1+v;
        }
    } while (t3 != 0);
    return u1;
}

/* return (a^b) mod m */
__device__ int pow_mod(int a, int b, int m)
{
    int r,aa;
    r=1;
    aa=a;
    while (1) {
        if (b&1) r=mul_mod(r,aa,m);
        b=b>>1;
        if (b == 0) break;
        aa=mul_mod(aa,aa,m);
    }
    return r;
}

/* return true if n is prime */
int is_prime(int n)
{
    int r,i;
    if ((n % 2) == 0) return 0;
    r=(int)(sqrtf(n));
    for(i=3;i<=r;i+=2) if ((n % i) == 0) return 0;
    return 1;
}

/* return the prime number immediately after n */
int next_prime(int n)
{
    do {
        n++;
    } while (!is_prime(n));
    return n;
}

#define DIVN(t,a,v,vinc,kq,kqinc) \
{ \
    kq+=kqinc; \
    if (kq >= a) { \
        do { kq-=a; } while (kq>=a); \
        if (kq == 0) { \
            do { \
                t=t/a; \
                v+=vinc; \
            } while ((t % a) == 0); \
        } \
    } \
}

__global__ void digi_calc(int *s, int *av, int *primes, int N, int n, int nthreads){
    int a,vmax,num,den,k,kq1,kq2,kq3,kq4,t,v,i,t1,h;
    unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // GIANT LOOP
    for (h = 0; h < 1; h++){
        if (tid >= nthreads) continue;  // >=, so thread `nthreads` does not read past the arrays
        a = primes[tid];
        vmax=(int)(logf(3*N)/logf(a));
        if (a==2) {
            vmax=vmax+(N-n);
            if (vmax<=0) continue;
        }
        av[tid]=1;
        for(i=0;i<vmax;i++) av[tid]*= a;
        s[tid]=0;
        den=1;
        kq1=0;
        kq2=-1;
        kq3=-3;
        kq4=-2;
        if (a==2) {
            num=1;
            v=-n;
        } else {
            num=pow_mod(2,n,av[tid]);
            v=0;
        }
        for(k=1;k<=N;k++) {
            t=2*k;
            DIVN(t,a,v,-1,kq1,2);
            num=mul_mod(num,t,av[tid]);
            t=2*k-1;
            DIVN(t,a,v,-1,kq2,2);
            num=mul_mod(num,t,av[tid]);
            t=3*(3*k-1);
            DIVN(t,a,v,1,kq3,9);
            den=mul_mod(den,t,av[tid]);
            t=(3*k-2);
            DIVN(t,a,v,1,kq4,3);
            if (a!=2) t=t*2; else v++;
            den=mul_mod(den,t,av[tid]);
            if (v > 0) {
                if (a!=2) t=inv_mod2(den,av[tid]);
                else t=inv_mod(den,av[tid]);
                t=mul_mod(t,num,av[tid]);
                for(i=v;i<vmax;i++) t=mul_mod(t,a,av[tid]);
                t1=(25*k-3);
                t=mul_mod(t,t1,av[tid]);
                s[tid]+=t;
                if (s[tid]>=av[tid]) s[tid]-=av[tid];  // s[tid], not the pointer s
            }
        }
        t=pow_mod(5,n-1,av[tid]);
        s[tid]=mul_mod(s[tid],t,av[tid]);
    }
    __syncthreads();
}

int main(int argc, char *argv[])
{
    int N,n,i,totalp,h;
    double sum;
    int *sdev, *avdev, *shost, *avhost, *adev, *ahost;
    argc = 2;       // hard-coded input for debugging
    argv[1] = "2";
    if (argc<2 || (n=atoi(argv[1])) <= 0) {
        printf("This program computes the n'th decimal digit of pi\n"
               "usage: pi n , where n is the digit you want\n");
        exit(1);
    }
    sum = 0;
    N=(int)((n+20)*logf(10)/logf(13.5));
    totalp=(N/logf(N))+10;
    ahost = (int *)calloc(totalp, sizeof(int));
    i = 0;
    ahost[0]=2;
    for(i=1; ahost[i-1]<=(3*N); ahost[i+1]=next_prime(ahost[i])){
        i++;
    }
    // allocate host memory
    size_t size = i*sizeof(int);
    shost = (int *)malloc(size);
    avhost = (int *)malloc(size);
    // allocate memory on device
    cudaMalloc((void **) &sdev, size);
    cudaMalloc((void **) &avdev, size);
    cudaMalloc((void **) &adev, size);
    cudaMemcpy(adev, ahost, size, cudaMemcpyHostToDevice);
    if (i >= 512){
        h = 512;
    }
    else h = i;
    dim3 dimGrid(((i+512)/512),1,1);
    dim3 dimBlock(h,1,1);
    // launch kernel
    digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
    // copy memory back to host
    cudaMemcpy(avhost, avdev, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(shost, sdev, size, cudaMemcpyDeviceToHost);
    // end malloc's, memcpy's, kernel calls
    for(h = 0; h < i; h++){   // < i: shost/avhost hold exactly i elements
        sum=fmod(sum+(double) shost[h]/ (double) avhost[h],1.0);
    }
    printf("Decimal digits of pi at position %d: %09d\n",n,(int)(sum*1e9));
    // free memory
    cudaFree(sdev);
    cudaFree(avdev);
    cudaFree(adev);
    free(shost);
    free(avhost);
    free(ahost);
    return 0;
}
This is exactly the same problem you asked about in this question. The kernel is getting terminated early by the driver because it is taking too long to finish. If you read the documentation for any of these runtime API functions you will see the following note:
Note:
Note that this function may also return error codes from previous,
asynchronous launches.
All that is happening is that the first API call after the kernel launch is returning the error incurred while the kernel was running - in this case the cudaMemcpy call. The way you can confirm this for yourself is to do something like this directly after the kernel launch:
// launch kernel
digi_calc <<<dimGrid, dimBlock >>> (sdev, avdev, adev, N, n, i);
std::string error = cudaGetErrorString(cudaPeekAtLastError());
printf("%s\n", error.c_str());
error = cudaGetErrorString(cudaThreadSynchronize());
printf("%s\n", error.c_str());
The cudaPeekAtLastError() call will show you if there are any errors in the kernel launch, and the error code returned by the cudaThreadSynchronize() call will show whether any errors were generated while the kernel was executing.
The solution is exactly as outlined in the previous question: probably the simplest way is to redesign the code so it is "re-entrant", letting you split the work over several kernel launches, with each launch safely under the display driver's watchdog timer limit.
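As a rough host-side sketch of that idea (hypothetical names: digi_calc_chunk is assumed to be a variant of the kernel that processes only the range [first, first + count)):

for (int first = 0; first < totalWork; first += chunkSize) {
    int count = (totalWork - first < chunkSize) ? (totalWork - first) : chunkSize;
    digi_calc_chunk<<<dimGrid, dimBlock>>>(sdev, avdev, adev, N, n, first, count);
    // Block until this slice finishes; each slice stays under the watchdog limit.
    cudaError_t err = cudaThreadSynchronize();
    if (err != cudaSuccess) {
        printf("%s\n", cudaGetErrorString(err));
        break;
    }
}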
CUDA queues kernel work asynchronously: a launch returns immediately, and the global-memory reads/writes it performs are only guaranteed to have completed once you synchronize. So you can batch the operations in some loop with some kernel, and the launches appear to take NO TIME at all. Then, when you call memcpy, all the queued work has to actually finish, and that is where the timeout surfaces. The way to go is to call cudaThreadSynchronize() between iterations.
So remember: if a kernel launch appears to take only nanoseconds, it doesn't mean the kernel is that fast - its writes to global memory are only observed to complete when memcpy or cudaThreadSynchronize() is called.

Matrix calculation with PBLAS algorithm - multi threads with MPI

I'm currently developing a C code with MPI for matrix multiplication. I have functions already implemented, such as mult or multadd, defined in another file and working well.
But my file pblas.c compiles, yet crashes when running.
I run my project on a university server, which has MPI installed.
Where am I wrong in my pblas code?
/**********************************************************************
 This file is just a pattern for pblas parallel multiplication.
 There are comments beginning with TO ADD that tell what must be done
 where they are placed. Thus, just add the correct lines of code and
 everything will work fine!
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "commfct.h"
#include "toolsfct.h"

void usage() {
    fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
    exit(1);
}

int main(int argc, char **argv) {
    int me,nbProc;
    int ligMe,colMe;
    int blockSize;
    int i,j;
    double t;
    if (argc != 2) {
        usage();
    }
    blockSize = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
    int P = (int)sqrt(nbProc); // P = the number of rows of proc.
    int Q = P;                 // Q = the number of columns of proc.
    if ((P*Q) != nbProc) {
        if (me == 0) {
            fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
        }
        exit(1);
    }
    createGridComm(me,P,Q);
    ligMe = me / Q;
    colMe = me % Q;
    // allocate memory for matrices
    double *A,*Btmp,*B,*C,*CC;
    A = (double *)malloc(blockSize*blockSize*sizeof(double));
    B = (double *)malloc(blockSize*blockSize*sizeof(double));
    Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
    C = (double *)malloc(blockSize*blockSize*sizeof(double));
    CC = (double *)malloc(blockSize*blockSize*sizeof(double));
    /* fill blocks with pseudo values
       NOTE : these values should not be changed so that
       the check below is valid
    */
    for(i=0;i<blockSize*blockSize;i++) {
        A[i] = 2.0+(double)me;
        B[i] = 1.0+(double)colMe;
        C[i] = (double)me / 10.0;
    }
    /* CAUTION : in the following, A, B, C are supposed to be stored
       column after column, with each column of size blockSize.
       Thus A(0,0) and A(1,0) are contiguous in memory, but
       A(0,0) and A(0,1) are separated by blockSize cells.
    */
    t = dclock(CLOCK_S);
    MPI_Status status;
    // main loop
    for(i=0;i<P;i++) {
        /*************************************
         Steps 1 and 2: transpose column i (in step i) of the B-blocks;
         store it in Btmp.
        **************************************/
        if(colMe==i){
            if(ligMe==colMe) {
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,ligMe,commCol);
                multadd(A,B,C,blockSize);
            }
            else {
                int dest = colMe * Q + ligMe;
                MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,dest%Q,commCol);
                mult(A,Btmp,CC,blockSize);
            }
        }
        else {
            int dest = colMe*Q + ligMe;
            if(dest%Q == i) {
                MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
                // Broadcast on the column
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,colMe,commCol);
                multadd(A,Btmp,C,blockSize);
            }
            else {
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                mult(A,Btmp,CC,blockSize);
            }
        }
        if(colMe == i)
            MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
        else
            MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
    }
    t = dclock(CLOCK_S) - t;
    printf("timing for %d : %f sec\n",me,t);
    // checking for result correctness
    int correct = 1;
    double sum = 0.0;
    for(i=0;i<P;i++) {
        sum += 2.0+(ligMe*Q)+(double)i;
    }
    for(i=0;i<blockSize;i++) {
        for(j=0;j<blockSize;j++) {
            if (C[i+j*blockSize] != ((double)me/10.0 + sum*blockSize*(colMe+1.0))) {
                correct = 0;
            }
        }
    }
    if (correct != 1) {
        printf("multiplication result is not correct\n");
    }
    // free memory
    free(A);
    free(B);
    free(C);
    free(CC);
    releaseGridComm();
    MPI_Finalize();
    return 0;
}
The trouble may come from the indexes in MPI_Send(), MPI_Recv() or MPI_Bcast(). For each MPI_Send(), the destination rank must call MPI_Recv(), and for each MPI_Recv(), the origin must have called MPI_Send(). Otherwise you get a deadlock: the code keeps running, but it waits for messages forever.
From what you gave us, I guess you have two matrices, A and B.
A0 | A1
...........
A2 | A3
To compute the first column of C, you need:
A0xB0 | A1xB2
...................
A2xB0 | A3xB2
I changed your code so that:
for each column i:
    the i_th column of B is transposed into the i_th line of Btmp;
    the i_th line of Btmp is broadcast to Btmp in each column.
Since the issue was about MPI, I reply with an MPI code that looks very close to yours and works, but it does nothing except the communication operations...
/**********************************************************************
 This file is just a pattern for pblas parallel multiplication.
 There are comments beginning with TO ADD that tell what must be done
 where they are placed. Thus, just add the correct lines of code and
 everything will work fine!
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "mpi.h"
//#include "commfct.h"
//#include "toolsfct.h"

#define TAG_TRANSPOSE 42

void usage() {
    fprintf(stderr,"usage : pblas bloc_size\n\t bloc_size : gives the size of blocs owned by each processor.\n");
    exit(1);
}

int main(int argc, char **argv) {
    int me,nbProc;
    int ligMe,colMe;
    int blockSize;
    int i,j;
    double t;
    if (argc != 2) {
        usage();
    }
    blockSize = atoi(argv[1]);
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    MPI_Comm_size(MPI_COMM_WORLD, &nbProc);
    int P = (int)sqrt(nbProc); // P = the number of rows of proc.
    int Q = P;                 // Q = the number of columns of proc.
    if ((P*Q) != nbProc) {
        if (me == 0) {
            fprintf(stderr,"!!! CRITICAL ERROR : number of processors must be 4, 9, 16, ...\nAborting\n");
        }
        exit(1);
    }
    //createGridComm(me,P,Q);
    colMe = me / Q;
    ligMe = me % Q;
    MPI_Comm commCol, commLine;
    // comes from http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
    /* Split comm into row and column comms */
    MPI_Comm_split(MPI_COMM_WORLD, ligMe, colMe, &commLine); /* color by row, rank by column */
    MPI_Comm_split(MPI_COMM_WORLD, colMe, ligMe, &commCol);  /* color by column, rank by row */
    printf("[%d]: My coordinates are i j (%d,%d)\n",me,ligMe,colMe);
    // allocate memory for matrices
    double *A,*Btmp,*B,*C,*CC;
    A = (double *)malloc(blockSize*blockSize*sizeof(double));
    B = (double *)malloc(blockSize*blockSize*sizeof(double));
    Btmp = (double *)malloc(blockSize*blockSize*sizeof(double));
    C = (double *)malloc(blockSize*blockSize*sizeof(double));
    CC = (double *)malloc(blockSize*blockSize*sizeof(double));
    /* fill blocks with pseudo values
       NOTE : these values should not be changed so that
       the check below is valid
    */
    for(i=0;i<blockSize*blockSize;i++) {
        A[i] = 2.0+(double)me;
        B[i] = 1.0+(double)colMe;
        C[i] = (double)me / 10.0;
    }
    /* CAUTION : in the following, A, B, C are supposed to be stored
       column after column, with each column of size blockSize.
       Thus A(0,0) and A(1,0) are contiguous in memory, but
       A(0,0) and A(0,1) are separated by blockSize cells.
    */
    // t = dclock(CLOCK_S);
    MPI_Status status;
    // main loop
    for(i=0;i<Q;i++) {
        /*************************************
         Steps 1 and 2: transpose column i (in step i) of the B-blocks;
         store it in Btmp.
        **************************************/
        if(colMe==i){
            if(ligMe==colMe) {
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // multadd(A,B,C,blockSize);
            }
            else {
                int dest = ligMe * Q + i; // transpose!
                MPI_Send(B,blockSize*blockSize,MPI_DOUBLE,dest,TAG_TRANSPOSE, MPI_COMM_WORLD);
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // mult(A,Btmp,CC,blockSize);
            }
        }
        else {
            int from = i*Q + colMe; // transpose!
            if(ligMe == i) {
                MPI_Recv(Btmp,blockSize*blockSize,MPI_DOUBLE,from,TAG_TRANSPOSE,MPI_COMM_WORLD,&status);
                // Broadcast on the column
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // multadd(A,Btmp,C,blockSize);
            }
            else {
                MPI_Bcast(Btmp,blockSize*blockSize,MPI_DOUBLE,i,commCol);
                // mult(A,Btmp,CC,blockSize);
            }
        }
        if(colMe == i)
            MPI_Reduce(MPI_IN_PLACE, C, blockSize*blockSize, MPI_DOUBLE, MPI_SUM, colMe, commLine);
        else
            MPI_Reduce(CC,NULL,blockSize*blockSize,MPI_DOUBLE,MPI_SUM,i,commLine);
    }
    //t = dclock(CLOCK_S) - t;
    printf("timing for %d : %f sec\n",me,t);
    // checking for result correctness
    int correct = 1;
    double sum = 0.0;
    for(i=0;i<P;i++) {
        sum += 2.0+(ligMe*Q)+(double)i;
    }
    for(i=0;i<blockSize;i++) {
        for(j=0;j<blockSize;j++) {
            if (C[i+j*blockSize] < 0.99999*((double)me/10.0 + sum*blockSize*(colMe+1.0)) ||
                C[i+j*blockSize] > 1.00001*((double)me/10.0 + sum*blockSize*(colMe+1.0))) {
                correct = 0;
            }
        }
    }
    if (correct != 1) {
        printf("multiplication result is not correct\n");
    }
    // free memory
    free(A);
    free(B);
    free(C);
    free(CC);
    //releaseGridComm();
    MPI_Finalize();
    return 0;
}
Watch out for createGridComm(me,P,Q); I had to find something equivalent at http://static.msi.umn.edu/tutorial/scicomp/general/MPI/communicator.html
I also changed the test at the end of your code: testing equality between double precision numbers is too strict. Inequalities are better for floating-point numbers!
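For instance, a relative-tolerance helper (not part of the original code) could replace the bare != comparison:

#include <math.h>

/* Compare two doubles up to a relative tolerance instead of with == or !=. */
static int nearly_equal(double a, double b, double rel_tol)
{
    return fabs(a - b) <= rel_tol * fmax(fabs(a), fabs(b));
}

/* e.g.: if (!nearly_equal(C[i + j*blockSize], expected, 1e-5)) correct = 0; */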
I hope this will help!
Bye,
Francis
