I'm trying to randomly generate rooms in a two-dimensional array of size 100x100. If the room being generated collides with an already existing room, it generates new points for the room. The generation code makes sense conceptually, but when I try to run, the program loops endlessly, and checking the log reveals why.
Room created successfully with dimensions x=0, y=0, width=976761120, height=809120052
For some reason, at lines 65-68, inside create_room(), the width and height for the room are being randomly assigned huge numbers, when they should be between 1 and 11. Just for fun, I ran the program through Valgrind using the options --track-origins=yes -v, and I what I found surprised me. Suddenly, the program would run!
Room created successfully with dimensions x=0, y=0, width=0, height=0
While still not exactly what I wanted, this at least prevents an infinite loop of collisions being detected with an impossibly huge room.
So, my question is, why is the code generating such large numbers when executed normally, but generate smaller numbers when in Valgrind?
Here's the code for the program.
#include <time.h>
#include <stdlib.h>
#include "global.h"
#include "draw.h"
#include "log.h"
#include "generate.h"
#define NUM_ROOMS 10
#define ROOM_SIZE 10
#define MAP_HEIGHT 100
#define MAP_WIDTH 100
static struct ROOM* create_room (unsigned int);
struct ROOM {
int x, y, width, height;
int feature;
};
struct ROOM* rooms[NUM_ROOMS] = {NULL};
static FILE* gen_log;
static WINDOW* gen_window;
int** generate_dungeon(unsigned int seed){
char* log_entry = malloc (80);
int i = 0, j, k;
gen_window = create_window (0, 0, LINES, COLS);
gen_log = log_open (GEN_LOG);
if (seed == 0){
time_t t;
seed = time (&t);
}
srand (seed);
for (int i = 0; i < NUM_ROOMS; i++){
rooms[i] = create_room (seed);
sprintf (log_entry,"Room created successfully with dimensions x=%d, y=%d, width=%d, height=%d\n", rooms[i]->x, rooms[i]->y, rooms[i]->width, rooms[i]->height);
LOG_DEBUG (gen_log,log_entry);
}
LOG_DEBUG(gen_log, "Beginning to draw rooms\n");
for (i=0;i < NUM_ROOMS;i++){
sprintf (log_entry, "Drawing room %d\n", i);
LOG_DEBUG (gen_log, log_entry);
for (j = rooms[i]->y; j < rooms[i]->y + rooms[i]->height; j++){
for (k = rooms[i]->x; k < rooms[i]->x + rooms[i]->width; k++){
sprintf (log_entry, "Clearing %d,%d]\n", j,k);
LOG_DEBUG (gen_log, log_entry);
map_array[j][k] = 1;
}
}
}
destroy_window (gen_window);
}
static struct ROOM* create_room (unsigned int seed){
int i = 0, flag;
srand (seed);
if (rooms[0] == NULL)
flag = 0;
else
flag = 1;
char* log_entry = malloc (80);
struct ROOM* new_room = malloc (sizeof(struct ROOM));
while (flag){
draw_notify (gen_window, "Creating room\n");
new_room->x = (rand() % MAP_WIDTH);
new_room->y = (rand() % MAP_HEIGHT);
new_room->width = (rand() % ROOM_SIZE + 1);
new_room->height = (rand() % ROOM_SIZE + 1);
sprintf (log_entry, "New room created with points x=%d, y=%d,width=%d, height=%d\n", new_room->x, new_room->y, new_room->width, new_room->height);
LOG_DEBUG (gen_log, log_entry);
draw_notify (gen_window, "Log entry made\n");
if (new_room->x + new_room->width >= MAP_WIDTH || new_room->y + new_room->height >= MAP_HEIGHT){
LOG_DEBUG (gen_log, "Room out of bounds\n");
continue;
}
i=0;
draw_notify(gen_window, "Entering loop\n");
while (rooms[i] != NULL && i < NUM_ROOMS){
sprintf (log_entry, "Testing room %d\n", i);
draw_notify (gen_window, log_entry);
LOG_DEBUG(gen_log, log_entry);
if (new_room->x < rooms[i]->x + rooms[i]->width &&
new_room->x + new_room->width > rooms[i]->x &&
new_room->y < rooms[i]->y + rooms[i]->height &&
new_room->y + new_room->height > rooms[i]->y){
sprintf (log_entry, "Collision detected with room %d\n", i);
draw_notify (gen_window, log_entry);
LOG_DEBUG (gen_log, log_entry);
flag = 1;
break;
}
else{
sprintf (log_entry, "Room %d passed.\n", i);
flag = 0;
i++;
}
}
draw_notify(gen_window, "Exited loop\n");
}
return new_room;
}
You have some logic errors and end up with uninitialized values.
You initialize rooms to be an array of NULL pointers.
In create_room, you have:
if (rooms[0] == NULL)
flag = 0;
else
flag = 1;
First time around, flag will be set to 0. And then, you use:
struct ROOM* new_room = malloc (sizeof(struct ROOM));
while (flag){
since flag is set to 0, nothing under the while gets executed and you end up with uninitialized members in new_room.
You need to re-think your logic and make sure that you initialize members of new_room always.
Related
What I want to try is Adding two big numbers under 600 digits.
So I making a struct in C.
But there is some error in the source below.
(The environment of practice is GCC Compiler, and Linux. The tool is VSCode with BASH Terminal.)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#define MAX_SIZE 600
#define SWAP(x,y,t) ((t)=(x), (x)=(y), (y)=(t)) //SWAP preprocessor
#define D_C(x) (x==0 ? 0 : x+'0') //Convert Decimal to Character
#define C_D(x) (x==0 ? 0 : x-'0') //Convert Character to Decimal
/*The structure to save BIG NUMBER*/
typedef struct _BIG_DECIMAL{
unsigned char *data;
int size;
} BIG_DECIMAL;
/*Make string reverse*/
void reverseString(char* s, size_t size) {
char temp;
for (size_t i = 0; i < size / 2; i++) SWAP(s[i], s[(size - 1) - i], temp);
}
/*Create Decimal data in BIG_DECIMAL struct*/
BIG_DECIMAL * createDecimal(unsigned char *str) {
//local variables in func.
size_t size_str;
BIG_DECIMAL * number = malloc(sizeof(BIG_DECIMAL));
//save str in buffer
char buffer[MAX_SIZE] = {'\0',};
strcpy(buffer, str);
//temporary value for size measure.
size_str = strlen(buffer);
printf("%d", size_str);
//Save reversed number data.
reverseString(buffer, size_str);
strcpy(number->data, buffer);
//Save size.
number->size = size_str;
//Return BIG_DECIMAL struct.
return number;
}
/*ADDITION BETWEEN TWO BIG NUMBERS. left argument's size value should be big.*/
BIG_DECIMAL * BD_addition(BIG_DECIMAL *dec1, BIG_DECIMAL *dec2) {
//local variable in this func.
int carry = 0;
BIG_DECIMAL *result = malloc(sizeof(BIG_DECIMAL));
//Adding loop start
for(int i = 0; i < (result -> size); i++) {
int digit_plus;
//if start
if(i < dec2->size) {
//there are digit in both dec so...
digit_plus = C_D(dec1->data[i]) + C_D(dec2->data[i]) + carry;
//nested-if start
if(digit_plus > 10) { //if the carry is occured
carry = digit_plus / 10; //carry can be (> 1)
result->data[i] = D_C(digit_plus % 10);
}
else { //if the carry is not occcured
carry = digit_plus / 10; //carry can be (> 1)
result->data[i] = D_C(digit_plus % 10);
}
//nested-if end
}
else if((i >= (dec2->size)) && (i < ((result->size)-1))){
digit_plus = C_D(dec1->data[i]) + carry;
//nested-if start
if(digit_plus > 10) { //if the carry is occured
carry = digit_plus / 10;
result->data[i] = D_C(digit_plus % 10);
}
else { //if the carry is not occcured
carry = 0;
result->data[i] = D_C(digit_plus);
}
//nested-if end
}
else { //if i == (result->size)-1 (the last index of result->data)
//nested-if start
if(carry > 0) result->data[i] = D_C(carry); //if carry occured
else { //if the carry doesn't occure in the last index of result->data
result->data[i] = D_C(0); //the last index value of result->data is NULL.
--(result->size); //result size - 1
}
//nested-if end
}
//if end
}
//Adding loop end
return result;
}
int main() {
/*data for operand*/
BIG_DECIMAL * op1;
BIG_DECIMAL * op2;
/*data for result*/
BIG_DECIMAL * result;
op1 = createDecimal("123456789");
op2 = createDecimal("12345678");
result = BD_addition(op1,op2);
printf("%s", result->data);
/*DeAllocation*/
free(op1);
free(op2);
free(result);
return 0;
}
This code makes Segmentation fault error.
I think that it might be a string access error first, so I tried to type-casting all of the char* type variable but it doesn't work.
As pointer in comments, you can correct your code by allocating enough space for data, you can use strdup for this:
/*Create Decimal data in BIG_DECIMAL struct*/
BIG_DECIMAL * createDecimal(unsigned char *str) {
//local variables in func.
size_t size_str;
BIG_DECIMAL * number = malloc(sizeof(BIG_DECIMAL));
//save str in buffer
char buffer[MAX_SIZE] = {'\0',};
strcpy(buffer, str);
//temporary value for size measure.
size_str = strlen(buffer);
//Save reversed number data.
reverseString(buffer, size_str);
/* here: copy buffer in a new allocated memory stored in number->data. */
number->data = strdup(buffer);
//Save size.
number->size = size_str;
//Return BIG_DECIMAL struct.
return number;
}
And do not forget to free them correctly:
/*DeAllocation*/
free(op1->data);
free(op1);
free(op2->data);
free(op2);
There are stell some errors in your code: the beginning of BD_addition function should looks like:
BIG_DECIMAL * BD_addition(BIG_DECIMAL *dec1, BIG_DECIMAL *dec2) {
//local variable in this func.
int carry = 0;
BIG_DECIMAL *result = malloc(sizeof(BIG_DECIMAL));
/* compute the size of result */
result->size = (dec1->size < dec2->size) ? dec1->size : dec2->size;
/* take in account an eventual carry */
result->size += 1;
/* allocate */
result->data = malloc(result->size+1);
//Adding loop start
....
And your macro D_C does not seem valid (0 is not converted to '0').
If you like, this comes without struct, strdup, reverse etc. just one malloc.
#include <stdlib.h>
#define toI(x) ((x)-'0')
#define toC(x) ((x)+'0')
#define max(a,b) ((a)>(b)) ? (a):(b)
char *add(char *buf1, char *buf2) {
int size, v1, v2, r, carry=0;
char *ap1, *ep1, *ap2, *ep2, *ap3, *ep3, *rp, *result;
for(ep1=ap1=buf1; *ep1; ep1++);
for(ep2=ap2=buf2; *ep2; ep2++);
size=max(ep2-ap2, ep1-ap1);
ap3=ep3=rp=result=malloc(size+10);
ep3+=size+10;
rp=ep3-1;
*rp='\0';
for(ep1--, ep2--, rp--; ep1>=ap1 || ep2>=ap2; ep1--, ep2--, rp--) {
v1 = ep1>=ap1 ? toI(*ep1) : 0;
v2 = ep2>=ap2 ? toI(*ep2) : 0;
r = v1+v2+carry;
*rp=toC(r%10);
carry=r/10;
}
if(carry!=0) *rp-- = toC(carry);
for(rp++;rp<ep3; rp++, ap3++)
*ap3=*rp;
return result;
}
int main() {
char *result = add("123456789", "12345678");
printf("\n%s\n", result);
free(result);
}
I'm using a double for-loop in order to check every point (coordinate pair) in a rectangular area from (-2.0, -1.12) to (0.47, 1.12) to see whether it belongs to the Mandelbrot set. If it does, I want to print a 1. Likewise, if it does not, I want to print a 0. The basic idea is to print, line by line, an array of characters that displays a simplified Mandelbrot set.
This is my main function:
#include <stdio.h>
#include "complex.h"
#include "mandelbrot.h"
#define STEP_X 0.06175
#define STEP_Y 0.07466
int main(void){
int i = 0;
char arr[50];
complex_t c, abs, max;
max.real = 10000;
max.imag = 0;
for (c.imag = -1.12; c.imag <= 1.12; c.imag += STEP_Y){
for (c.real = -2.0; c.real <= 0.47; c.real += STEP_X){
abs = abs_complex(mandelbrot(c,15));
if (abs.real < max.real){
arr[i] = 1;
i++;
}
else{
arr[i] = 0;
i++;
}
}
printf("%s", arr);
i = 0;
}
}
The program compiles just fine, but does not produce an output. I know I must not be printing the array the right way, but for the life of me I can not figure out how to do it.
Any feedback, hints, or tips would be greatly appreciated.
Thanks in advance!
The problems you are having are two-fold. (1) you are copying decimal values to arr (e.g. 0 and 1) instead of ASCII characters ('0' and '1'). Decimal 0 and 1 are non-printable. Ironically decimal 0 is the nul-terminating character, so if if (abs.real >= max.real) for i == 0 arr holds the empty-string.
Second you call printf without having insured the final character is the nul-terminating character. (you can do this by default by initializing char arr[MAXC] = ""; and insuring your loop is limited to i + 1 < 50 && c.real <= 0.47 or you can simply affirmatively terminate arr with arr[i] = 0; before calling i = 0; (or move your declaration of i inside the first for loop and initialize).
This is untested (I don't have your local headers), but it looks like you intended:
#include <stdio.h>
#include "complex.h"
#include "mandelbrot.h"
#define MAXC 50
#define STEP_X 0.06175
#define STEP_Y 0.07466
int main(void){
complex_t c, abs, max;
max.real = 10000;
max.imag = 0;
for (c.imag = -1.12; c.imag <= 1.12; c.imag += STEP_Y) {
int i = 0; /* declare/initialize i & arr here */
char arr[MAXC] = ""; /* set to all zero */
for (c.real = -2.0;
i + 1 < MAXC && c.real <= 0.47; /* limit to 49 chars max */
c.real += STEP_X) {
abs = abs_complex (mandelbrot (c,15));
if (abs.real < max.real)
arr[i++] = '1'; /* assign character '1' */
else
arr[i++] = '0'; /* assign character '0' */
}
arr[i] = 0; /* nul-terminate line */
printf ("%s\n", arr); /* output line */
}
return 0;
}
Give it a try and let me know if you have further questions.
I'm very new to programming in C, and have pretty rusty programming skills overall. In order to learn C and re-orient myself with programming in general, I'm challenging myself to try and make a simple rougelike using ncurses.
I've set up a "log", which I should be able to push messages to - the most recent 10 message should be displayed. In order to test this, I've made it so each time either the player or the very simple randomly-moving mob takes a step, a log message is pushed saying "step [direction]". However, even though they each only take one step, for some reason, four messages are pushed to the log. The second-to-most-recent one is always the actual direction the character moved, and I presume one of the other two is the mob moving, but I don't know the origin of the other two. Does anyone spot anything glaring in my code that might be causing this issue? All help is appreciated, thanks!
(I believe the only major relevant sections to look at should be the main() function, pushToLog(), printLog(), and moveCreature(). That said, there is a chance the problem might be somewhere else. I'm not sure.)
#include <stdlib.h>
#include <stdio.h>
#include <ncurses.h>
#include <unistd.h>
#include <string.h>
#define up 65
#define down 66
#define right 67
#define left 68
#define quit 113
struct creature {
int x;
int y;
int hp;
int maxhp;
};
void setupMap();
struct creature setupCreature();
void moveCreature();
void pushToLog();
void printLog();
int breakFlag = FALSE;
char mapShape[15][15];
char mapFeatures[15][15];
char outputLog[10][60];
int main(int argc, char *argv[]){
struct creature player = setupCreature(4, 4, 100, 100);
struct creature mob = setupCreature(5, 7, 100, 100);
setupMap();
initscr();
noecho();
curs_set(FALSE);
while(1){
for (int i = 0; i < 15; i++){
for (int c = 0; c < 15; c++){
mvprintw(c, i, "%c", mapShape[i][c]);
}
}
mvprintw(player.y, player.x, "%c", '#');
mvprintw(mob.y, mob.x, "%c", 'd');
printLog();
int input = getch();
moveCreature(input, &player);
int mobDir = rand() % (68 + 1 - 65) + 65;
moveCreature(mobDir, &mob);
refresh();
usleep(300);
if (breakFlag == TRUE){
break;
}
}
endwin();
return 0;
}
void moveCreature(int dir, struct creature *subject){
int next;
if (dir == up){
next = (subject->y - 1);
if (mapShape[subject->x][next] != '#'){
subject->y = next;
pushToLog("step up ");
}
}
else if (dir == down){
next = (subject->y + 1);
if (mapShape[subject->x][next] != '#'){
subject->y = next;
pushToLog("step down ");
}
}
else if (dir == right){
next = (subject->x + 1);
if (mapShape[next][subject->y] != '#'){
subject->x = next;
pushToLog("step right ");
}
}
else if (dir == left){
next = (subject->x - 1);
if (mapShape[next][subject->y] != '#'){
subject->x = next;
pushToLog("step left ");
}
}
else if (dir == quit){
breakFlag = TRUE;
}
}
void pushToLog(char string[]){
for (int i = 10; i > 0; i--){
strcpy(outputLog[i], outputLog[i-1]);
}
strcpy(outputLog[0], string);
}
void printLog(){
for (int i = 0; i < 10; i++){
mvprintw(28-i, 0, outputLog[i]);
}
}
struct creature setupCreature(int x,int y,int hp,int maxhp){
struct creature frankenstien;
frankenstien.x = x;
frankenstien.y = y;
frankenstien.hp = hp;
frankenstien.maxhp = maxhp;
return frankenstien;
}
void setupMap(){
for (int i = 0; i < 15; i++){
for (int c = 0; c < 15; c++){
mapShape[i][c] = '.';
}
}
for (int i = 0; i < 15; i++){
mapShape[0][i] = '#';
mapShape[14][i] = '#';
mapShape[i][0] = '#';
mapShape[i][14] = '#';
}
}
Your problem is at the input stage. You expect directional commands via the arrow keys, but those generate multiple bytes per keypress. All but one are invalid as commands.
As a secondary problem, you do not reject invalid commands. You go ahead and move the mob after each command character read, whether that command was valid or not.
The overall upshot is that when you press an arrow key, the program zips through three iterations of the main loop, one right after the other, producing log messages for one valid player move, no log messages for two invalid player moves, and log messages for each of three mob moves.
You could have detected this by logging invalid commands, or by running your program in a debugger.
I am working on an application which divides a string into pieces and assigns each to a block. Within each block the the text is scanned character by character and a shared array of int, D is to be updated by different threads in parallel based on the character read. At the end of each iteration the last element of D is checked, and if it satisfied the condition, a global int array m is set to 1 at the position corresponding to the text. This code was executed on a NVIDIA GEForce Fermi 550, and runs even slower than the CPU version. I have just included the kernel here:
__global__ void match(uint32_t* BB_d,const char* text_d,int n, int m,int k,int J,int lc,int start_addr,int tBlockSize,int overlap ,int* matched){
__shared__ int D[MAX_THREADS+2];
__shared__ char Text_S[MAX_PATTERN_SIZE];
__shared__ int DNew[MAX_THREADS+2];
__shared__ int BB_S[4][MAX_THREADS];
int w=threadIdx.x+1;
for(int i=0;i<4;i++)
{
BB_S[i][threadIdx.x]= BB_d[i*J+threadIdx.x];
}
{
D[threadIdx.x] = 0;
{
D[w] = (1<<(k+1)) -1;
for(int i = 0; i < lc - 1; i++)
{
D[w] = (D[w] << k+2) + (1<<(k+1)) -1;
}
}
D[J+1] = (1<<((k+2)*lc)) - 1;
}
int startblock=(blockIdx.x == 0?start_addr:(start_addr+(blockIdx.x * (tBlockSize-overlap))));
int size= (((startblock + tBlockSize) > n )? ((n- (startblock))):( tBlockSize));
int copyBlock=(size/J)+ ((size%J)==0?0:1);
if((threadIdx.x * copyBlock) <= size)
memcpy(Text_S+(threadIdx.x*copyBlock),text_d+(startblock+threadIdx.x*copyBlock),(((((threadIdx.x*copyBlock))+copyBlock) > size)?(size-(threadIdx.x*copyBlock)):copyBlock));
memcpy(DNew, D, (J+2)*sizeof(int));
__syncthreads();
uint32_t initial = D[1];
uint32_t x;
uint32_t mask = 1;
for(int i = 0; i < lc - 1; i++)mask = (mask<<(k+2)) + 1;
for(int i = 0; i < size;i++)
{
{
x = ((D[w] >> (k+2)) | (D[w - 1] << ((k + 2)* (lc - 1))) | (BB_S[(((int)Text_S[i])/2)%4][w-1])) & ((1 << (k + 2)* lc) - 1);
DNew[w] = ((D[w]<<1) | mask)
& (((D[w] << k+3) | mask|((D[w +1] >>((k+2)*(lc - 1)))<<1)))
& (((x + mask) ^ x) >> 1)
& initial;
}
__syncthreads();
memcpy(D, DNew, (J+2)*sizeof(int));
if(!(D[J] & 1<<(k + (k + 2)*(lc*J -m + k ))))
{
matched[startblock+i] = 1;
D[J] |= ((1<<(k + 1 + (k + 2)*(lc*J -m + k ))) - 1);
}
}
}
I am not very familiar with CUDA so I dont quite understand issues such as shared memory bank conflicts. Could that be the bottleneck here?
As asked, this is the code where I launch the kernels:
#include <stdio.h>
#include <assert.h>
#include <cuda.h>
#define uint32_t unsigned int
#define MAX_THREADS 512
#define MAX_PATTERN_SIZE 1024
#define MAX_BLOCKS 8
#define MAX_STREAMS 16
#define TEXT_MAX_LENGTH 1000000000
void calculateBBArray(uint32_t** BB,const char* pattern_h,int m,int k , int lc , int J){};
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
char* getTextString() {
FILE *input, *output;
char c;
char * inputbuffer=(char *)malloc(sizeof(char)*TEXT_MAX_LENGTH);
int numchars = 0, index = 0;
input = fopen("sequence.fasta", "r");
c = fgetc(input);
while(c != EOF)
{
inputbuffer[numchars] = c;
numchars++;
c = fgetc(input);
}
fclose(input);
inputbuffer[numchars] = '\0';
return inputbuffer;
}
int main(void) {
const char pattern_h[] = "TACACGAGGAGAGGAGAAGAACAACGCGACAGCAGCAGACTTTTTTTTTTTTACAC";
char * text_h=getTextString(); //reading text from file, supported upto 200MB currently
int k = 13;
int i;
int count=0;
char *pattern_d, *text_d; // pointers to device memory
char* text_new_d;
int* matched_d;
int* matched_new_d;
uint32_t* BB_d;
uint32_t* BB_new_d;
int* matched_h = (int*)malloc(sizeof(int)* strlen(text_h));
cudaMalloc((void **) &pattern_d, sizeof(char)*strlen(pattern_h)+1);
cudaMalloc((void **) &text_d, sizeof(char)*strlen(text_h)+1);
cudaMalloc((void **) &matched_d, sizeof(int)*strlen(text_h));
cudaMemcpy(pattern_d, pattern_h, sizeof(char)*strlen(pattern_h)+1, cudaMemcpyHostToDevice);
cudaMemcpy(text_d, text_h, sizeof(char)*strlen(text_h)+1, cudaMemcpyHostToDevice);
cudaMemset(matched_d, 0,sizeof(int)*strlen(text_h));
int m = strlen(pattern_h);
int n = strlen(text_h);
uint32_t* BB_h[4];
unsigned int maxLc = ((((m-k)*(k+2)) > (31))?(31/(k+2)):(m-k));
unsigned int lc=2; // Determines the number of threads per block
// can be varied upto maxLc for tuning performance
if(lc>maxLc)
{
exit(0);
}
unsigned int noWordorNfa =((m-k)/lc) + (((m-k)%lc) == 0?0:1);
cudaMalloc((void **) &BB_d, sizeof(int)*noWordorNfa*4);
if(noWordorNfa >= MAX_THREADS)
{
printf("Error: max threads\n");
exit(0);
}
calculateBBArray(BB_h,pattern_h,m,k,lc,noWordorNfa); // not included this function
for(i=0;i<4;i++)
{
cudaMemcpy(BB_d+ i*noWordorNfa, BB_h[i], sizeof(int)*noWordorNfa, cudaMemcpyHostToDevice);
}
int overlap=m;
int textBlockSize=(((m+k+1)>n)?n:(m+k+1));
cudaStream_t stream[MAX_STREAMS];
for(i=0;i<MAX_STREAMS;i++) {
cudaStreamCreate( &stream[i] );
}
int start_addr=0,index=0,maxNoBlocks=0;
if(textBlockSize>n)
{
maxNoBlocks=1;
}
else
{
maxNoBlocks=((1 + ((n-textBlockSize)/(textBlockSize-overlap)) + (((n-textBlockSize)%(textBlockSize-overlap)) == 0?0:1)));
}
int kernelBlocks = ((maxNoBlocks > MAX_BLOCKS)?MAX_BLOCKS:maxNoBlocks);
int blocksRemaining =maxNoBlocks;
printf(" maxNoBlocks %d kernel Blocks %d \n",maxNoBlocks,kernelBlocks);
while(blocksRemaining >0)
{
kernelBlocks = ((blocksRemaining > MAX_BLOCKS)?MAX_BLOCKS:blocksRemaining);
printf(" Calling %d Blocks with starting Address %d , textBlockSize %d \n",kernelBlocks,start_addr,textBlockSize);
match<<<kernelBlocks,noWordorNfa,0,stream[(index++)%MAX_STREAMS]>>>(BB_d,text_d,n,m,k,noWordorNfa,lc,start_addr,textBlockSize,overlap,matched_d);
start_addr+=kernelBlocks*(textBlockSize-overlap);;
blocksRemaining -= kernelBlocks;
}
cudaMemcpy(matched_h, matched_d, sizeof(int)*strlen(text_h), cudaMemcpyDeviceToHost);
checkCUDAError("Matched Function");
for(i=0;i<MAX_STREAMS;i++)
cudaStreamSynchronize( stream[i] );
// do stuff with matched
// ....
// ....
free(matched_h);cudaFree(pattern_d);cudaFree(text_d);cudaFree(matched_d);
return 0;
}
Number of threads launched per block depends upon the length pattern_h(could be at most maxLc above). I expect it to be around 30 in this case. Shoudn't that be enough to see a good amount of concurrency? As for blocks, I see no point in launching more than MAX_BLOCKS (=10) at a time since the hardware can schedule only 8 simultaneously
NOTE: I don't have GUI access.
With all the shared memory you're using, you could be running into bank conflicts if consecutive threads are not reading from consecutive addresses in the shared arrays ... that could cause serialization of the memory accesses, which in turn will kill the parallel performance of your algorithm.
I breifly looked at your code but it looks like your sending data to the gpu back and forth creating a bottle neck on the bus? did you try profiling it?
I found that I was copying the whole array Dnew to D in each thread rather than copying only the portion each thread was supposed to update D[w]. This would cause the threads to execute serially, although I don't know if it could be called a shared memory bank conflict. Now it gives 8-9x speedup for large enough patterns(=more threads). This is much less than what I expected. I will try to increase number of blocks as suggested. I dont know how to increase the # of threads
I'm working on a large scale project in which I'm designing a sparse matrix vector application but I'm still working to understand the code. I'm beginning by building the foundation for the application but I've run into a segmentation fault when executing the program. I've tracked the problem to this loop within the MatrixRead function and am enclosing the code below. When the program is executed I tried programming in some test messages and the program appears to execute all the loops but it returns the segmentation fault at the end. Of course, this is all just speculation. Any help would be awesome. Thanks!
while (ret != EOF && row <= mat->rows)
{
if (row != curr_row) // Won't execute for first iteration
{
/* store this row */
MatrixSetRow(mat, curr_row, len, ind, val);
/* check if the previous row is zero */
i = 1;
while(row != curr_row + i)
{
mat->lens[curr_row+i-1] = 0;
mat->inds[curr_row+i-1] = 0;
mat->vals[curr_row+i-1] = 0;
i++;
}
curr_row = row;
/* reset row pointer */
len = 0;
}
ind[len] = col;
val[len] = value;
len++;
ret = fscanf(file, "%lf %lf %lf", &r1, &c1, &value);
col = (int) (c1);
row = (int) (r1);
}
/* Store the final row */
if (ret == EOF || row > mat->rows)
MatrixSetRow(mat, mat->rows, len, ind, val);
Here's the code for the MatrixSetRow function:
/*--------------------------------------------------------------------------
* MatrixSetRow - Set a row in a matrix. Only local rows can be set.
* Once a row has been set, it should not be set again, or else the
* memory used by the existing row will not be recovered until
* the matrix is destroyed. "row" is in global coordinate numbering.
*--------------------------------------------------------------------------*/
void MatrixSetRow(Matrix *mat, int row, int len, int *ind, double *val)
{
row -= 1;
mat->lens[row] = len;
mat->inds[row] = (int *) MemAlloc(mat->mem, len*sizeof(int));
mat->vals[row] = (double *) MemAlloc(mat->mem, len*sizeof(double));
if (ind != NULL)
memcpy(mat->inds[row], ind, len*sizeof(int));
if (val != NULL)
memcpy(mat->vals[row], val, len*sizeof(double));
}
I'm also including the code for the Matrix.h file that went with it, where the members of Matrix are defined:
#include <stdio.h>
#include "Common.h"
#include "Mem.h"
#ifndef _MATRIX_H
#define _MATRIX_H
typedef struct
{
int rows;
int columns;
Mem *mem;
int *lens;
int **inds;
double **vals;
}
Matrix;