word frequency in a binary search tree in c? - c

i have to count how many times a word exists in the binary tree and i couldn't do this ,how can i do this? here is my code ;
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
struct treeNode
{
char data[20];
int count;
struct treeNode *leftPtr, *rightPtr;
};
int number = 1;
typedef struct treeNode TreeNode;
typedef TreeNode *TreeNodePtr;
void insertNode(TreeNodePtr *treePtr, char word[]);
void alphabetic(TreeNodePtr treePtr);
int main()
{
/*reading strings from the file and add them to the tree*/
char first[20];
FILE *fp1;
TreeNodePtr rootPtr = NULL;
int c;
fp1 = fopen("output.txt", "r");
do
{
c = fscanf(fp1, "%s", first);
if (c != EOF)
{
insertNode(&rootPtr, first);
}
} while (c != EOF);
fclose(fp1);
printf("%s", rootPtr->rightPtr->leftPtr->data);
//alphabetic(rootPtr);
system("PAUSE");
}
/*for adding nodes to tree*/
void insertNode(TreeNodePtr *treePtr, char word[20])
{
TreeNode *temp = NULL;
if (*treePtr == NULL )
{
temp = (TreeNode *) malloc(sizeof(TreeNode));
temp->leftPtr = NULL;
temp->rightPtr = NULL;
strcpy(temp->data, word);
*treePtr = temp;
}
else if (strcmp(word, (*treePtr)->data) < 0)
{
insertNode(&((*treePtr)->leftPtr), word);
}
else if (strcmp(word, (*treePtr)->data) > 0)
{
insertNode(&((*treePtr)->rightPtr), word);
}
}
/*traverse the tree*/
void alphabetic(TreeNodePtr treePtr)
{
if (treePtr != NULL )
{
alphabetic(treePtr->leftPtr);
printf("%s\n", treePtr->data);
alphabetic(treePtr->rightPtr);
}
}
i have a .txt file which contains some words more than once,and i need to count how many times a word exists in this tree.

Your code does not "work" because you are not inserting duplicate values. Since the duplicate values would return strcmp() as 0, they are not being added in the first place. Thus in the insertNode() function, you would need to consider the else case as well:
else if (strcmp(word, (*treePtr)->data) < 0) {
insertNode(&((*treePtr)->leftPtr), word);
} else if (strcmp(word, (*treePtr)->data) > 0) {
insertNode(&((*treePtr)->rightPtr), word);
} else {
//This is where the duplcate values should be inserted!
}
In fact, the else clause should simply increment the count as in (as in "(*treePtr)->count += 1;"). Also, make sure you initialize the value to 1 in the initial temp structure after you malloc the TreeNode (as in "temp->count = 1;").

Related

Learning to create my own hash table but running into a segmentation fault

I'm trying to learn how a hash table work but I'm running into a segmentation fault and I don't seem to be understanding the pointers where I load into my hash table.
I'm trying to get a dictionary of correctly spelled words into a hash table with an array of 26 nodes. My goal is to get each letter into its correct node.
I've spent almost 2 weeks on this problem. I've been moving pointers around to what I think is right but it is obviously not because it's not working. If someone could write this for my an explain what is going on I'll be able to see where I went wrong.
//Practice for my hash table
//I don't really inderstand what is going on in the cs50 version because there is just so much to look at so I am making my own.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
int const LENGTH = 45;
typedef struct node {
char word[LENGTH + 1];
struct node *next;
} node;
loadHashTable(node **n, char c[]);
int main(int argc, char *argv[]){
//Number of buckets for the hast table. 26 letters in the alphabit
const unsigned int N = 26;
//hash table
node *table[N];
//set everything to null in the table.
for (int i = 0; i < N; i++)
{
table[N] = NULL;
}
//check for correct usage
if (argc != 2)
{
printf("Usage: hashTable DICTIONARY\n");
return 1;
}
//open a file
char *fileName = argv[1];
FILE *dictionaryFile = fopen(fileName, "r");
if (!dictionaryFile)
{
printf("Can not open file: %s\n", argv[1]);
}
//read each line
const unsigned MAX_LENGTH = 256;
char buffer[MAX_LENGTH];
while (fgets(buffer, MAX_LENGTH, dictionaryFile))
{
//change the word to lower case. Could I be doing this better?
char lowerBuffer[MAX_LENGTH];
for (int i = 0; i < MAX_LENGTH; i++)
{
char ch;
ch = buffer[i];
lowerBuffer[i] = (char) tolower(ch);
}
//load each line to table in correct bucket
const int REDUCE_TO_BUCKET = 97;
int correctBucket = lowerBuffer[0] - REDUCE_TO_BUCKET;
loadHashTable(&table[correctBucket], lowerBuffer);
}
if (feof(dictionaryFile))
{
printf("\nEnd of File\n");
}
//TODO Print the hashtable
//printHashTable(table[0]);
printf("%s", *table[0]->word);
//TODO Unload the hashtable
fclose(dictionaryFile);
return 0;
}
loadHashTable(node **n, char c[]){
if (n == NULL)
{
//create temp node
node *temp= malloc(sizeof(node));
if (!temp)
{
printf("Could not create node.\n");
return 1;
}
sprintf(temp->word, c);//load the word into the temp node
temp->next = NULL;
*n = temp;//Why is n not being set here?
printf("New Node Word: %s", (*n)->word);
}
else // we never get here. Why?
{
printf("Link Node: %s", (*n)->word);
loadHashTable((*n)->next, c);
}
}
printHashTable(node *toPrint){
//TODO: Print the hash table
if (toPrint->word)
{
printf("%s\n", toPrint->word);
printHashTable(toPrint->next);
}
}
void loadHashTable(node **n, char c[]){
if (*n == NULL) //Why does this need to be *n and not n?
{
//create temp node
node *temp= malloc(sizeof(node));
if (!temp)
{
printf("Could not create node.\n");
}
sprintf(temp->word, "%s", c);//load the word into the temp node
temp->next = NULL;
*n = temp;//This sets the *n to temp
printf("New Node Word: %s", (*n)->word);
}
else // we never get here. Why?
{
printf("Link Node: %s", (*n)->word);
loadHashTable(&((*n)->next), c);//What is the &((*n)->next) doing here? Specifically the '&'
}
}

Creating a singly linked list from a .txt file and reversing odd numbers of each line in C

I have a project about linked lists but I'm having a hard time doing it. The teacher wants me to read a .txt file and create singly linked list from it. After that, I need to reverse odd numbers of every line. Then print it. Here is the code which I used for printing the linked list. But I need help to reverse the odd numbers of each line.
This is the code which I used to print the list:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct list {
char *string;
struct list *next;
};
typedef struct list LIST;
int main(void) {
FILE *fp;
char line[10];
LIST *current, *head;
head = current = NULL;
fp = fopen("data.txt", "r");
while(fgets(line, sizeof(line), fp)){
LIST *node = malloc(sizeof(LIST));
node->string = strdup(line);
node->next =NULL;
if(head == NULL){
current = head = node;
} else {
current = current->next = node;
}
}
fclose(fp);
for(current = head; current ; current=current->next){
printf("%s", current->string);
}
return 0;
}
Here is the content of the .txt file:
10
9,6,11,7,12,18,19,14,15,13
13,14,9,12,15,3,18,20,1,2
4,11,8,17,12,15,20,10,3,16
19,4,11,1,13,17,12,16,20,18
1,6,20,11,13,9,7,16,10,2
12,4,11,16,3,20,9,19,17,15
20,3,10,12,18,2,5,14,15,16
18,19,15,2,6,9,1,3,17,4
7,6,20,1,11,4,3,5,8,16
1,2,16,13,17,10,12,9,4,15
"But I need help to reverse the odd numbers of each line."
There are several other parts that need to be considered before this step can be developed.
Following are suggestions for a functions approach implementation using your problem description. A few items are simply suggestions to simplify the existing code. And a few other steps, are not mentioned as necessary, but should be considered:
Since you are not mandated to use char *string; in your problem description, choose to use a reasonable string length variable that does not require an additional layer of dynamic allocation, such as char string[260]; (or even smaller to fit your input file.) This will greatly simplify the code.
Because the input file is sized with lines ~30 char long, declare the variable line to be at least large enough to contain one line, eg 80 would allow larger values, and still allow enough space, but since memory is cheap, go with the same size as is used in the string member of your linked list.
Move the work of populating each new node to a function. It also will greatly simplify the program, and provide greater readability. Eg: void insert(LIST **head_ref, char *str);
Always test the return of fopen() before attempting to use the file descriptor.
To manipulate the contents of each odd row (eg 1, 3, 5, 7, 9), as numbers, the contents of each line read in from a file as a string, needs to first be converted to a collection of numbers. This suggests an additional member be added to the struct. For example int num[10].
The previous observation implicitly suggests the need of an additional function to parse and convert each comma delimited string into discrete integer values. Perhaps with the prototype: void parseIntArray(LIST **list);
The next and final task also suggests an additional function to reverse the contents of selected array member integer arrays. This one might use a prototype such as: void reverse_odd(LIST **list, size_t size);
Finally, because each node of LIST created required dynamically allocated memory, once finished using LIST, the memory must be given back to the OS to prevent memory leaks. An additional function to do this could be prototyped: void freeList(LIST **head);
Following are the main() function and preceding support declarations etc. It is intended here to illustrate the above suggested steps, and the benefits of breaking down a bigger problem into smaller problems, then implementing each smaller solution to support the whole. Benefits include for example readability and maintainability and potential re-use of code-base, (Note the similarity of argument lists in each supporting function.):
#define MAX_STRLEN 260 //use mnemonic values to avoid magic numbers in code
struct list {
char string[MAX_STRLEN];
int arr[10];
struct list *next;
};
typedef struct list LIST;
//Prototypes of 'smaller' solutions
void insert(LIST **head_ref, char *str);
void parseIntArray(LIST **list);
void reverse_odd(LIST **list, size_t size);
void freeList(LIST **head);
int main(void)
{
FILE *fp;
char line[MAX_STRLEN];
LIST *current, *head;
char *convPtr = NULL;
head = current = NULL;
fp = fopen("data.txt", "r");
if(fp)
{
//consume 1st line
if(fgets(line, sizeof(line), fp));//10
{
sizeArray = strtol(line, &convPtr, 10);
if(errno != ERANGE)
{
while(fgets(line, sizeof(line), fp))
{
//(see implementations of each below)
//create new node, insert num string
insert(&current, line);
//convert new->string to integers, place in new->array
parseIntArray(&current);
//reverse 'odd' contents of each array
reverse_odd(&current, sizeArray);
}
}else{//handle error and leave}
}
fclose(fp);
}else{//handle error and leave}
//At this point in code, entire file is captured into nodes of list.
//use list as needed
//When finished using list, memory must be freed to prevent memory leaks
head = current;
freeList(&head);
return 0;
}
The remaining code segments are the function implementations used above:
void freeList(LIST **head)
{
LIST *tmp;
while (*head != NULL)
{
tmp = (*head);
(*head) = (*head)->next;
free(tmp);
}
}
//create new node, insert num string
void insert(LIST **head_ref, char *str)
{
int *arr = malloc(numNodes * sizeof(*arr));
//allocate node
LIST* new = calloc(1, sizeof(*new));
//put in the data
strcpy(new->string, str);
//Make next of new node as head
new->next = (*head_ref);
//Move the head to point to the new node
(*head_ref) = new;
}
//convert new->string to integers, place in list->array
void parseIntArray(LIST **list)
{
char *tok = NULL;
int i = 0;
int tmp = 0;
char *sArray = strdup((*list)->string);
tok = strtok(sArray, ",\n ");
while(tok)
{
errno = 0;
tmp = atoi(tok);
if(errno == ERANGE)
{
printf("Error converting string to number\nExiting.");
return;
}
(*list)->arr[i] = tmp;
i++;
tok = strtok(NULL, ",\n ");
}
}
//reverse 'odd' contents of list->array
void reverse_odd(LIST **list, size_t size)
{
int *ptr = &((*list)->arr[0]);
int *tmp = malloc(size * sizeof(*tmp));
memset(tmp, -1, size*sizeof(*tmp));
for(int i=0;i<size;i++)
{
if(ptr[i]%2 != 0)
tmp[size-1-i] = ptr[i];
}
for(int i=0;i<size;i++)
{
if(tmp[i] < 0)
{
while((*ptr)%2 != 0 ) ptr++;
tmp[i] = *ptr;
ptr++;
}
}
memcpy((*list)->arr, tmp, size*sizeof(int));
}
This hope this code will do the job.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct line {
struct num *first;
struct line *next;
} LineNode;
typedef struct num {
int num;
int order;
struct num *next;
} NumNode;
int main() {
FILE *fp;
char ch;
int counter = 0;
NumNode *curr_num, *even_ptr, *odd_ptr, *odd_head, *even_head;
LineNode *curr_line, *line_head;
curr_num = even_head = odd_head = even_ptr = odd_ptr = NULL;
line_head = curr_line = NULL;
fp = fopen("data.txt", "r");
if (fp == NULL)
{
return 1;
}
ch = fgetc(fp);
while(ch != EOF){
if (ch >= 48 && ch <= 57)
{
int n = 0;
while (ch != EOF && ch != '\n' && ch >= 48 && ch <= 57)
{
int x = ch - 48;
n = n * 10 + x;
ch = fgetc(fp);
}
NumNode *node = malloc(sizeof(NumNode));
node->num = n;
node->order = counter;
node->next =NULL;
if (n % 2 == 0){
if(even_head == NULL){
even_head = even_ptr = node;
} else {
even_ptr = even_ptr->next = node;
}
}else{
if(odd_head == NULL){
odd_head = node;
} else {
node->next = odd_head;
odd_head = node;
}
}
counter++;
}
if (ch == '\n' || ch == EOF)
{
NumNode *num_node, *head;
num_node = head = NULL;
even_ptr = even_head;
odd_ptr = odd_head;
counter = 0;
if (even_head != NULL && even_head->order == counter){
head = num_node = even_ptr;
even_ptr = even_ptr->next;
} else {
head = num_node = odd_ptr;
odd_ptr = odd_ptr->next;
}
counter++;
while (even_ptr != NULL)
{
if (even_ptr->order == counter) {
num_node = num_node->next = even_ptr;
even_ptr = even_ptr->next;
}
else if (odd_ptr != NULL) {
num_node = num_node->next = odd_ptr;
odd_ptr = odd_ptr->next;
}
counter++;
}
while (odd_ptr != NULL)
{
num_node = num_node->next = odd_ptr;
odd_ptr = odd_ptr->next;
}
LineNode *node = malloc(sizeof(LineNode));
node->next =NULL;
node->first = head;
if (line_head == NULL)
line_head = curr_line = node;
else
curr_line = curr_line->next = node;
odd_head = even_head = NULL;
counter = 0;
}
ch = fgetc(fp);
}
fclose(fp);
for(curr_line = line_head; curr_line != NULL ; curr_line=curr_line->next) {
for(curr_num = curr_line->first; curr_num != NULL ; curr_num=curr_num->next) {
printf("%d", curr_num->num);
if (curr_num->next != NULL)
printf(",");
}
printf("\n");
}
return 0;
}

Reading stdin strings and adding it to BST in C

I'm attempting to insert stdin strings into a binary search tree, and then output the inorder and postorder traversals. My problem is it seems like older nodes are being overwritten by the most recent, but I can't figure out why. So my inorder/postorder methods return the last inserted string how many times there are nodes in the tree.
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <string.h>
#include <ctype.h>
#include "bstsort.h"
/* Case Sensitive String Comparison */
int strcmpCase(char* string1, char* string2) {
while(*string1 && (*string1==*string2)) {
string1++;
string2++;
}
return *string1 - *string2;
}
/* Case Insensitive String Comparison */
int strcmpNoCase(char* string1, char* string2) {
int i;
char a[100];
char b[100];
strcpy(a, string1);
strcpy(b, string2);
for (i = 0; a[i]; ++i) {
a[i] = tolower(a[i]);
}
for (i = 0; b[i]; ++i) {
b[i] = tolower(b[i]);
}
return strcmpCase(a, b);
}
/* Inserts a Node into the Binary Search Tree */
void insert(struct Node **node, char *keyStr, int cflag) {
// Creates new node
if (*node == NULL) {
*node = (struct Node*) malloc (100);
strcpy((*node)->key, keyStr);
(*node)->left = NULL;
(*node)->right = NULL;
(*node)->count = 1;
return;
}
// Compares Strings
int compareResult;
if (cflag == 1){
// Case sensitive
compareResult = strcmpCase(keyStr, (*node)->key);
} else {
// Case insensitive
compareResult = strcmpNoCase(keyStr, (*node)->key);
}
// Moves down branches of BST to insert node in correct order
if (compareResult < 0) {
insert(&((*node)->left), keyStr, cflag);
} else if (compareResult > 0) {
insert(&((*node)->right), keyStr, cflag);
}
(*node)->count++;
return;
}
/* Outputs in-order traversal or BST */
void inorder(Node* root) {
if (root != NULL) { // if current node is not null
inorder(root->left); // travel down left child, recursively
printf("%s", root->key); // prints key of current node, the root
inorder(root->right); // travel down right child after root printed, recursively
}
}
void postorder(Node* root) {
printf("Postorder: \n");
if (root != NULL) { // if current node is not null
postorder(root->left); // travel down left child, recursively
postorder(root->right); // travel down right child, recursively
printf("%s", root->key); // prints key of current node, the root
}
}
int main(int argc, char **argv) {
extern char *optarg;
extern int optind;
int c, err = 0, i = 0, numRead, isfirst = 1;
int cflag = 0, oflag = 0;
char *inName = NULL; // Input filename
char *outName = NULL; // Output filename
static char usage[] = "Usage: bstsort [-c] [-o output_file_name] [input_file_name]\n";
FILE* inFile = NULL;
FILE* outFile = NULL;
char *line;
char tmp[100] = "";
struct Node *root = NULL;
while ((c = getopt(argc, argv, "co:")) != -1)
switch (c) {
case 'c':
cflag = 1;
break;
case 'o':
oflag = 1;
outName = optarg;
break;
case '?':
err = 1;
break;
}
if (err) {
fprintf(stderr, usage, argv[0]);
exit(1);
}
/* see what we have */
printf("cflag: %d\n", cflag);
printf("oflag: %d\n", oflag);
printf("Output Filename: \"%s\"\n", outName);
/* these are the arguments after the command-line options */
if (optind < argc) {
for (; optind < argc; optind++) {
inName = argv[optind];
printf("Input Filename: \"%s\"\n", inName);
}
} else {
printf("No input filename provided.\n");
}
/* Reads stdin one line at a time when input filename not provided*/
line = (char*) malloc (100);
if (inName == NULL) {
printf("\nEnter one line at a time:\n");
fflush(stdout);
fgets(line, 100, stdin);
insert(&root, line, cflag);
while (strcmp(line, "\n") != 0) {
fflush(stdout);
fgets(line, 100, stdin);
if (strcmp(line, "\n") != 0) {
insert(&root, line, cflag);
}
}
}
inorder(root);
postorder(root)
free(line);
free(root);
fclose(inFile);
exit(0);
}
Here's my Node struct
#ifndef BSTSORT_H
#define BSTSORT_H
/* Binary Search Tree Node Struct */
typedef struct Node {
char *key;
int count;
struct Node *left;
struct Node *right;
} Node;
void insert(Node **node, char *keyStr, int cflag);
void inorder(Node* root);
void postorder(Node* root);
void display_tree(Node* nd);
#endif
Any tips would be great.

Reading file into linked list

I am trying to read a text file I made into a linked list, the text file looks like this:
around 1 2 1
bread 2 4 3 5 1
four 1 3 2
head 3 1 2 2 1 5 1
has 2 3 1 5 2
Where the first string of each line are just words from a paragraph. The first number after the word is the number of lines the word was found in, in the paragraph. Then the following numbers are pairs of (line, occurrences) in the paragraph.
For example, for the word bread:
It was found in 2 lines in the paragraph. In the first line, line 4, it was found 3 times. Then in the second line, line 5, it was found 1 time.
I am trying to create a linked list from this text file, my program looks like this so far:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#define MAXWORD 999
typedef struct node node_t;
struct node {
char *word;
int num_lines;
int paragraph;
int freq;
node_t *next;
};
int
main(int argc, char *argv[]) {
FILE *fp;
char word[MAXWORD+1];
int ch, line_count = 0, len = 0;
node_t *node = (node_t*)malloc(sizeof(*node));
node_t *curr, *prev;
fp = fopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Error reading file\n");
exit(EXIT_FAILURE);
}
/* Just trying to store the string so far */
while ((ch = getc(fp)) != EOF) {
if (ch == '\n') {
line_count++;
strcpy(node->word, word);
}
if (isalpha(ch)) {
word[len] = ch;
len++;
word[len] = '\0';
}
if (isdigit(ch)) {
len = 0;
}
}
printf("line count = %d", line_count);
free(node)
fclose(fp);
return 0;
}
In this snippet, I have been trying to store the string in the linked list data structure, but I have not yet used dynamic arrays to store the numbers after the word which occur in the text file. I know I will need to build this data structure using malloc() and realloc(), but I am unsure of how to do this.
How should I do this?
My desired output would look like this:
There are five words in the text file,
and 9 pairs of (line, occurences)
Word: pairs
"around": 2,1
"bread": 4,3; 5,1
"four": 3,2
"head": 1,2; 2,1; 5,1
"has": 3,1; 5,2
UPDATE
I have been researching this and it seems to be very similar to the inverted index problem, where I have seen that using a binary search tree would be best.
Could I implement my binary search tree like this:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#define MAXWORD 999
typedef char word_t[MAXWORD+1];
typedef struct node node_t;
struct node {
void *data;
int *ints;
node_t *rght;
node_t *left;
};
typedef struct {
node_t *root;
int (*cmp)(void*, void*);
} tree_t;
int
main(int argc, char *argv[]) {
FILE *fp;
fp = fopen(argv[1], "r");
if (fp == NULL) {
fprintf(stderr, "Error reading file\n");
exit(EXIT_FAILURE);
}
while ((ch = getc(fp)) != EOF) {
if (ch == '\n') {
line_count++;
}
}
fclose(fp);
return 0;
}
You could do something like this:
typedef struct {
int paragraph;
int freq;
} stats_t;
struct node {
char *word;
int num_lines;
stats_t *stats;
node_t *next;
};
Then after you parse the string you can do:
ps = calloc(line_count, sizeof(stats_t));
to get a pointer to an array of stats_t structs, which you can fill with line locations and frequencies. Then you can store the pointer ps in your node struct.
I wrote a program that does what I think you are looking for. I modified the structs I was thinking about before:
typedef node node_t;
struct node {
char *word;
int num_lines;
int *location;
int *frequency;
node_t *next;
};
This way the nodes contain pointers to arrays of int to store the location and frequency information. Nodes and storage for the word strings, location arrays, and frequency arrays are all dynamically allocated. Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define MAXLINE 1000
#define MAXWORD 30
typedef struct node node_t;
struct node {
char *word;
int num_lines;
int *location;
int *frequency;
node_t *next;
};
void strip(char *pln);
void normalize_word(char *pstr);
struct node * update_word(char *pwd, int lnum, struct node *phead);
struct node * find_in_list(char *pwd, struct node *phead);
int find_line_pair(int lnum, struct node *pwn);
int list_len(struct node *phead);
int num_pairs(struct node *phead);
int main(int argc, char *argv[])
{
FILE *fp;
struct node *head, *current;
char *pline, *pword;
char line[MAXLINE + 1];
char word[MAXWORD + 1];
int i, n, line_count = 0;
head = NULL;
if (argc < 2) {
fprintf(stderr, "Usage: %s filename\n", argv[0]);
exit(EXIT_FAILURE);
} else {
if ((fp = fopen(argv[1], "r")) == NULL) {
fprintf(stderr, "Unable to open file %s\n", argv[1]);
exit(EXIT_FAILURE);
}
}
/* Read in lines and process words */
pline = line;
pword = word;
while (fgets(pline, MAXLINE, fp) != NULL) {
++line_count;
strip(pline);
while ((pword = strtok(pline, " ")) != NULL) {
normalize_word(pword);
if (*pword != '\0') // don't add empty words
head = update_word(pword, line_count, head);
pline = NULL;
}
pline = line;
}
/* Display list contents */
printf("There are %d words in the text file,\n",
list_len(head));
printf("and %d pairs of (line, occurrences)\n",
num_pairs(head));
printf("Word: pairs\n");
current = head;
while (current != NULL) {
n = current->num_lines;
printf("%s:", current->word);
for (i = 0; i < n; i++) {
printf(" %d, %d;",
current->location[i], current->frequency[i]);
}
putchar('\n');
current = current->next;
}
/* Cleanup */
// close file
if (fclose(fp) != 0)
fprintf(stderr, "Error closing file %s\n", argv[1]);
// free all allocated memory
current = head;
while (current != NULL) {
free(current->word);
free(current->location);
free(current->frequency);
current = current->next;
free(head);
head = current;
}
return 0;
}
/* Remove trailing newlines */
void strip(char *pln)
{
while (*pln != '\0') {
if (*pln == '\n')
*pln = '\0';
++pln;
}
}
/* Convert word to lowercase and remove trailing
* non-alphanumeric characters */
void normalize_word(char *pstr)
{
int i = 0;
char ch;
while ((ch = pstr[i]) != '\0') {
pstr[i] = tolower(ch);
++i;
}
while ((--i >= 0) && !isalnum(pstr[i])) {
pstr[i] = '\0';
continue;
}
}
/* Update existing word node or create a new one, and return
* a pointer to the head of the list */
struct node * update_word(char *pwd, int lnum, struct node *phead)
{
struct node *found, *newnode;
char *pword;
int *ploc, *pfreq;
int index;
/* Modify existing node if word is in list */
if ((found = find_in_list(pwd, phead)) != NULL) {
// add new (location, freq) pair if word not in found line
if ((index = find_line_pair(lnum, found)) == -1) {
index = found->num_lines; // index for new pair
found->num_lines += 1; // increment number of lines
ploc = realloc(found->location, (index + 1) * sizeof(int));
pfreq = realloc(found->frequency, (index + 1) * sizeof(int));
ploc[index] = lnum; // new location
pfreq[index] = 1; // found once in this line so far
found->location = ploc; // point to new location array
found->frequency = pfreq; // point to new frequency array
}
else { // update frequency in existing line
found->frequency[index] += 1;
}
/* Set up a new node */
} else {
// allocate memory for new node
newnode = malloc(sizeof(struct node));
// allocate memory for string pointed to from node
pword = malloc((strlen (pwd) + 1) * sizeof(char));
strcpy(pword, pwd);
newnode->word = pword; // set word pointer
newnode->num_lines = 1; // only one line so far
ploc = malloc(sizeof(int));
pfreq = malloc(sizeof(int));
*ploc = lnum; // location was passed by caller
*pfreq = 1; // only one occurrence so far
newnode->location = ploc;
newnode->frequency = pfreq;
if (phead == NULL) { // if wordlist is empty
newnode->next = NULL; // only/last link in the list
phead = newnode; // newnode is the head
} else {
newnode->next = phead; // insert newnode at front of list
phead = newnode;
}
}
return phead;
}
/* Return pointer to node containing word, or NULL */
struct node * find_in_list(char *pwd, struct node *phead)
{
struct node *current = phead;
while (current != NULL) {
if (strcmp(current->word, pwd) == 0)
return current; // word already in list
current = current->next;
}
return NULL; // word not found
}
/* Return index of existing line location, or -1 */
int find_line_pair(int lnum, struct node *pwn)
{
int n = pwn->num_lines;
int index = 0;
while (index < n) {
if (pwn->location[index] == lnum)
return index; // word already found in this line
++index;
}
return -1; // word not yet found in this line
}
/* Find number of nodes in linked list */
int list_len(struct node *phead)
{
int length = 0;
struct node *current = phead;
while (current != NULL) {
++length;
current = current->next;
}
return length;
}
/* Find number of (line, occurrence) pairs */
int num_pairs(struct node *phead)
{
int num = 0;
struct node *current = phead;
while (current != NULL) {
num += current->num_lines;
current = current->next;
}
return num;
}
Note: I modified this from the previous version in the update_word() function. The original code inserted a new node at the end of the list, so the resulting list contained words in order of their first appearance in the input text. This version inserts a new node at the beginning of the list, so the resulting list contains words in reverse order of their first appearance. This speeds up node insertion and simplifies the node-insertion code from:
current = phead;
while (current->next != NULL) // find tail
current = current->next;
current->next = newnode; // add newnode to end
to:
newnode->next = phead; // insert newnode at front of list
I have no doubt that the code can be improved, but this does seem to work. I wouldn't say that this is exactly simple, but relatively straightforward. I ran it against this text file:
Three blind mice. Three blind mice.
See how they run. See how they run.
They all ran after the farmer's wife,
Who cut off their tails with a carving knife,
Did you ever see such a sight in your life,
As three blind mice?
Here are the results:
There are 31 words in the text file,
and 37 pairs of (line, occurrences)
Word: pairs
as: 6, 1;
life: 5, 1;
your: 5, 1;
in: 5, 1;
sight: 5, 1;
such: 5, 1;
ever: 5, 1;
you: 5, 1;
did: 5, 1;
knife: 4, 1;
carving: 4, 1;
a: 4, 1; 5, 1;
with: 4, 1;
tails: 4, 1;
their: 4, 1;
off: 4, 1;
cut: 4, 1;
who: 4, 1;
wife: 3, 1;
farmer's: 3, 1;
the: 3, 1;
after: 3, 1;
ran: 3, 1;
all: 3, 1;
run: 2, 2;
they: 2, 2; 3, 1;
how: 2, 2;
see: 2, 2; 5, 1;
mice: 1, 2; 6, 1;
blind: 1, 2; 6, 1;
three: 1, 2; 6, 1;
Here is my version using Binary Search Tree (BST):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
typedef struct internal_node in_node;
struct internal_node{
int line;
int freq;
in_node* next;
};
struct tree{
char *word;
int num_lines;
in_node* in_nodeptr;
in_node* current;
struct tree* right;
struct tree* left;
};
typedef struct tree* treeptr;
void free_list(in_node* in_nodeptr){
if(in_nodeptr!=NULL) {
free(in_nodeptr);
}
}
void free_bst(treeptr head){
if (head!=NULL) {
free_bst(head->right);
free_bst(head->left);
free_list(head->in_nodeptr);
free(head->word);
free(head);
}
}
void print_list(in_node* in_nodeptr){
while(in_nodeptr!=NULL){
printf("%d %d; ",in_nodeptr->line,in_nodeptr->freq);
in_nodeptr=in_nodeptr->next;
}
}
void print_bst(treeptr head){
if(head!=NULL){
printf("%s: ",head->word);
print_list(head->in_nodeptr);
printf("\n");
print_bst(head->right);
print_bst(head->left);
}
}
void input_to_bst(treeptr* head,char* word,int line){
if((*head)==NULL){
(*head)=(treeptr)malloc(sizeof(struct tree));
(*head)->word=(char*)malloc(50*sizeof(char));
strcpy(((*head)->word),word);
(*head)->num_lines=1;
(*head)->right=NULL;
(*head)->left=NULL;
(*head)->in_nodeptr=(in_node*)malloc(sizeof(in_node));
(*head)->in_nodeptr->line=line;
(*head)->in_nodeptr->freq=1;
(*head)->in_nodeptr->next=NULL;
(*head)->current=(*head)->in_nodeptr;
}
else{
int check=strcmp(((*head)->word),word);
if(check>0) input_to_bst(&((*head)->left),word,line);
else if(check<0) input_to_bst(&((*head)->right),word,line);
else{
if( (*head)->current->line==line) (*head)->current->freq++;
else {
(*head)->current->next=(in_node*)malloc(sizeof(in_node));
(*head)->current->next->line=line;
(*head)->current->next->freq=1;
(*head)->current->next->next=NULL;
}
}
}
}
int main(int argc, char *argv[]) {
treeptr head=NULL;
FILE *fp=fopen(argv[1], "r");
char word[50],ch;
int len=0,lines=1;
if (fp == NULL) {
fprintf(stderr, "Error reading file\n");
exit(1);
}
while ((ch = getc(fp)) != EOF) {
if (ch == '\n') {
word[len]='\0';
if(len>0) input_to_bst(&head,word,lines);
len=0;
lines++;
}
else if (ch==' '){
word[len]='\0';
if(len>0) input_to_bst(&head,word,lines);
len=0;
}
else if (isalpha(ch)){
word[len]=ch;
len++;
}
}
if(len>0) {
word[len]='\0';
input_to_bst(&head,word,lines);
}
print_bst(head);
fclose(fp);
free_bst(head);
return 0;
}
Every word is held as a node of the BST and also each node of BST except from the word, holds a list with all the appearances (lines and frequency ) of the word. In order to be as most efficient as possible we hold a pointer (in_node* current) to the last element of list of appearance so that we don't need to traverses every time we need to add an appearance.
As an example:
Text:
C is an imperative procedural language. It was designed to be compiled
using a relatively straightforward compiler and to require minimal
runtime support.
Output:
C: 1 1;
is: 1 1;
procedural: 1 1;
was: 1 1;
to: 1 1; 2 1;
using: 2 1;
relatively: 2 1;
straightforward: 2 1;
support: 3 1;
require: 2 1;
runtime: 3 1;
language: 1 1;
minimal: 2 1;
an: 1 1;
imperative: 1 1;
designed: 1 1;
be: 1 1;
compiled: 1 1;
compiler: 2 1;
and: 2 1;
It: 1 1;
a: 2 1;
Note that the above implementation is case sensitive for example "And" is different from "and".
If you don't wish to be case sensitive just replace the line word[len]=ch; with word[len]=tolower(ch); and works fine.
The complexity of the above algorithm is O(n^2) which would be the same if you used only linked lists but in the average case BST is O(nlogn) which is much better than linked lists and this is the reason that it is considered to be the better.
Also note that since we must keep a list for appearances of each word the complexity would be worst if we didn't keep the in_node* current pointer which gives us access to the end of each appearance list in constant time (O(1)). So I think that as terms of complexity you can't go better than O(nlogn).

finding a dangling pointer

I have a problem with my code. I am getting a segmentation fault error, which I understand is a dangling pointer problem(generally) or a faulty allocation of memory. The compiler dose not show at what line the problem might be, so my question is how do I detect these problems for further concern? and where would my problem be in the code?
here is my code:
`#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define ARRAY_SIZE(a) sizeof(a)/sizeof(a[0])
#define ALPHABET_SIZE (256)
#define CHAR_TO_INDEX(c) ((int)c - (int)'a')
#define LEVELS 255
// trie node
struct n
{
char value,level,isLeaf;
struct n* children[ALPHABET_SIZE];
struct n* failLink;
};
typedef struct n node;
//trie
struct t
{
node *root;
int count;
};
typedef struct t trie;
void bytesCpy(char *to, char *from, int len)
{
int i;
for(i=0;i<len;i++)
{
to[i]=from[i];
}
}
// Returns new trie node (initialized to NULLs)
node *getNode(trie *t, char value,char level)
{
node *pNode = NULL;
pNode = (node *)malloc(sizeof(node));
if (pNode)
{
printf("ok\n");
int i;
for (i = 0; i < ALPHABET_SIZE; i++)
{
pNode->children[i] = NULL;
}
pNode->failLink = t->root;
pNode->value=value;
pNode->level=level;
pNode->isLeaf=0;
}
else
printf("error\n");
return pNode;
}
// Initializes trie (root is dummy node)
void initialize(trie *t)
{
t->root = getNode(t, '[', 0);
//t->count = 0;
}
// If not present, inserts key into trie
// If the key is prefix of trie node, just marks leaf node
void insert(trie *t, char key[], int len)
{
int level;
char value;
node *node = t->root;
for (level = 0; level<len; level++)
{
value = key[level];
printf("value: %c\n",value);
if (node->children[value] == NULL)
{
node->children[value] = getNode(t, value, level+1);
}
node = node->children[value];
}
node->isLeaf=1;
}
// Returns non zero, if key presents in trie
int search(trie *t, char key[])
{
int level;
int length = strlen(key);
int value;
node *node;
node = t->root;
for (level = 0; level < length; level++)
{
value = key[level];//CHAR_TO_INDEX(key[level]);
if (!node->children[value])
{
node = node->failLink;
return 0;
}
node = node->children[value];
}
return (0 != node);// && node->value);
}
void search1(trie *t, char *c, int len)
{
node *curNode = t->root;
int i;
for(i=0; i<=len; i++)
{
printf("i=%d curnode=%p\n",i,curNode);
if(curNode->isLeaf) //leaf: cuvant gasit
{
printf("if1 curGasit \n");
do{
curNode=curNode->failLink;
if(curNode->isLeaf)
printf("if1 curGasit \n");
else break;
}while(1);
continue;
}
else //nu e gasit inca
{
if(curNode->children[c[i]]==NULL) //fail
{
printf("if2\n");
curNode = curNode->failLink;
continue;
}
else //litera gasita: go on
{
printf("el2\n");
curNode=curNode->children[c[i]];
}
}
}
printf("end of search\n");
}
node* searchAux(trie *t, node *curRoot, char cuv[], char len, int level ,int failLevel)
{
char cuvAux[1024];
bytesCpy(cuvAux,cuv,len);
printf("searchAux level:%d cuvAux:%s curRootLevel:%d\n",level,cuvAux,curRoot->level);
if(cuvAux[level+1] == '\0') //got to the end of cuvAux
{
printf("1st if\n");
return curRoot;
}
if(curRoot->children[cuvAux[level+1]] == NULL) //fail: letter not found
{
printf("3rd if\n");
return searchAux(t, t->root, &cuvAux[failLevel+1], len, 0, failLevel+1);
}
else //letter found: go on
{
printf("3rd else\n");
if(cuvAux[level+2] == '\0') //the found letter was the last of the string
{
printf("4th if\n");
return curRoot->children[cuvAux[level+1]]; //return final pointer
}
else //the found letter was not the last of the string: continue with the next one
{
printf("4th else\n");
return searchAux(t, curRoot->children[cuvAux[level+1]], cuvAux, len, level+1, failLevel);
}
}
}
void createFailLinks(trie *t, node* curRoot, char cuv[], int level)
{
int i;
char cuvAux[1024];
bytesCpy(cuvAux,cuv,1024);
if(curRoot == NULL)
return;
for(i=0;i<ALPHABET_SIZE/*curRoot->children[i] != NULL*/;i++)
{
if(curRoot->children[i] == NULL)
continue;
else
{
cuvAux[level] = curRoot->children[i]->value;
printf("createFailLinks %c%d\n",cuvAux[level],curRoot->children[i]->level);
curRoot->children[i]->failLink = searchAux(t, t->root, cuvAux, level+1, 0, 0);
createFailLinks(t,curRoot->children[i],cuvAux,level+1);
}
}
printf("got\n");
}
void printTrie(node *curRoot)
{
int i;
if(curRoot == NULL)
return;
printf("%c: ", curRoot->value);
for(i=0;i<ALPHABET_SIZE;i++)
if(curRoot->children[i] != NULL)
{
printf("%c ", i);
}
printf("\n");
for(i=0;i<ALPHABET_SIZE;i++)
if(curRoot->children[i] != NULL)
{
printTrie(curRoot->children[i]);
}
}
void checkLinks(node* curRoot)
{
int i;
if(curRoot == NULL)
return;
printf("node %c%d: ",curRoot->value,curRoot->level);
for(i=0;i<256;i++)
if(curRoot->children[i] != NULL)
printf("\n\t%c%d:%c%d",curRoot->children[i]->value, curRoot->children[i]->level, curRoot->children[i]->failLink->value,curRoot->children[i]->failLink->level);
printf("\n");
for(i=0;i<256;i++)
if(curRoot->children[i] != NULL)
checkLinks(curRoot->children[i]);
}
int mai()
{
FILE *fd = fopen("VirusDatabase.txt","r");//O_RDONLY);
int i;
char c;
for(i=0;i<1000;i++)
{
fscanf(fd, "%c", &c);
printf("%c",c);
}
}
int main()
{
// Input keys (use only 'a' through 'z' and lower case)
char keys[][1024] = { "he", "she", "her", "his", "heres"};
char cuv[] = {'\0','\0','\0','\0','\0','\0'};
trie t;
char output[][32] = { "Not present in trie", "Present in trie" };
int i;
char text[]={"andreiherutshevlastashecristihiskatjaheres"};
initialize(&t);
// Construct trie
for (i = 0; i < ARRAY_SIZE(keys); i++)
{
insert(&t, keys[i], strlen(keys[i]));
}
createFailLinks(&t, t.root, cuv, 0);
printTrie(t.root);
printf("\n\n");
checkLinks(t.root);
search1(&t, text, strlen(text));
return 0;
// Search for different keys
printf("%s --- %s\n", "abcd", output[search(&t, "abcd")]);
printf("%s --- %s\n", "ab", output[search(&t, "ab")]);
printf("%s --- %s\n", "ccdd", output[search(&t, "ccdd")]);
printf("%s --- %s\n", "thaw", output[search(&t, "thaw")]);
return 0;
char a = getchar();
}`
Do you have access to a debugger? I ran your code in a debugger and get a memory access violation at line 157 here:
return searchAux(t, t->root, &cuvAux[failLevel+1], len, 0, failLevel+1);
You seem to be recursively calling searchAux. ie you have:
node* searchAux(trie *t, node *curRoot, char cuv[], char len, int level ,int failLevel)
{
char cuvAux[1024];
...
return searchAux(t, t->root, &cuvAux[failLevel+1], len, 0, failLevel+1);
...
Anyway, eventually the buffer size variable failLevel exceeds the size of your buffer so you are attempting to access memory outside the bounds of your array which is why you get an access violation.
The easiest way to debug is use an interactive debugger. On Windows there is a free version of Visual Studio with a very good debugger. On linux you can use GDB.
Failing that you can embed print statements to print out variables before the crash.
You can add print statements at lines of code.
#include <iostream>
std::cout << "At Line: " << __LINE__ << endl;
putting that at various lines of code, you can see what lines got executed, and find where it crashes.
This is for C++. My bad. Same idea, but put printf() statements and see where it stopped executing to narrow down the crash location.

Resources