I am trying to create a program that takes input from a file, puts each word into a "words" structure, and then outputs the results with the frequency of each word, but whenever I try to output the string it just prints something like ?k#?? where I would expect the string to be.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct s_words {
char *str; //stores the word; no pre-determined size
int count;
struct s_words* next;
} words;
words* create_words(char* word) {
//allocate space for the structure
words* newWord = malloc(strlen(word));
if (NULL != newWord){
//allocate space for storing the new word in "str"
//if str was array of fixed size, storage wud be wasted
newWord->str = (char *)malloc(strlen(word));
strcpy(newWord->str,word); //copy “word” into newWord->str
newWord->str[strlen(word)]='\0';
newWord->count = 1; //initialize count to 1;
newWord->next = NULL; //initialize next;
}
return newWord;
}
//If the word is in the list, add 1 to count.
words* add_word(words* wordList, char* word) {
int found=0;
words *temp=wordList;
// search if word exists in the list; if so, make found=1
while (temp!=NULL) {
// printf("looptest\n");
if (strcmp(word,temp->str) == 0) { //use strcmp command
//printf("looptest0\n");
found=1;
temp->count = temp->count + 1; //increment count;
return wordList;
//printf("looptest1\n");
}
else {
temp = temp -> next; //update temp
// printf("looptest2\n");
}
}
// printf("looptest3\n");
//new word
words* newWord = create_words(word);
// printf("looptest4\n");
if (NULL != newWord) {
// printf("looptest5\n");
newWord->next = wordList;
wordList = newWord;
//Insert new word at the head of the list
}
else{
// printf("looptest6\n");
temp = wordList;
while(temp->next != NULL){
// printf("looptest7\n");
temp = temp->next;
}
temp->next = newWord;
}
return newWord;
}
int main(int argc, char* argv[]) {
words *mywords; //head of linked list containing words
mywords=NULL;
FILE *myFile;
myFile = fopen(argv[1],"r"); //first parameter is input file
if (myFile==0) {
printf("file not opened\n");
return 1;
}
else {
printf("file opened\n");
}
//start reading file character by character;
//when word has been detected; call the add_word function
int ch, word = 0, k=0;
char thisword[100];
while ( (ch = fgetc(myFile)) != EOF )
{
// printf("%c",ch);
if (ch==' ' || ch==',' || ch==';' || ch==':' || ch == '.') //detect new word? Check if ch is a delimiter
{
// printf("\ncheck2\n");
if ( word ) //make sure previous character was not delimiter
{
// printf("check\n");
word = 0;
thisword[k] = '\0'; //make the kth character of thisword as \0
// printf("test2\n");
//now call add_word to add thisword into the list
mywords = add_word(mywords,thisword);
// printf("check3\n");
k=0;
}
// printf("test\n");
}
else
{
word = 1;
thisword[k] = ch; //make the kth character of thisword equal to ch
k++;
}
if(ch == EOF){
thisword[k] = '\0';
mywords = add_word(mywords,thisword);
}
}
printf("%s\n",mywords->str);
printf("printing list\n");
//Traverse list and print each word and its count to outputfile
//output file is second parameter being passed
//haven't started to deal with the output file
words* temp = mywords;
while(temp != NULL){
printf("%s\tcount: %i\n",temp->str,temp->count);
temp = temp->next;
}
printf("list complete\n");
return 0;
}
This is all my code, I can't figure out how to error test what the problem is since I can't figure out how to output the strings. I've only started programming in C this year so I assume there's something basic I'm missing.
newWord->str = (char *)malloc(strlen(word));
strcpy(newWord->str,word); //copy “word” into newWord->str
newWord->str[strlen(word)]='\0';
.. writes the null out-of-bounds.
Assuming that strlen() returns the desired value, you should malloc an extra char:
newWord->str = (char *)malloc(1+strlen(word));
Note Olaf comment re. casting in C. Also note that it's unlikely that this is your ONLY bug.
Related
I'm running a program that creates a dictionary tree by reading in words from 'words.txt', and then can search to see if certain words are in the tree. Running this program on https://www.onlinegdb.com/online_c_compiler works perfectly, but I get a segmentation fault when I try to run it on my own Linux system. Any ideas as to why? Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Node structure of trie */
struct node
{
struct node *next[27]; // 26 for a-z and last one(26th index) is for apostrophe
int end; // value as 1 denotes the end of the word.
};
/* This method is for creating a new node */
struct node *createNode()
{
struct node *newNode = (struct node *)malloc(sizeof(struct node));
newNode->end = 0; // set end as false i.e. 0
for (int i = 0; i < 27; i++) // set all children as NULL
newNode->next[i] = NULL;
return newNode;
}
/* This method is for inserting a word in the trie */
void insert(struct node *root, char *word)
{
struct node *curr = root;
for (int i = 0; i < strlen(word); i++) // iterating character by character
{
int index;
if (word[i] == '\'') // if character is apostrophe index is 26
index = 26;
else
index = tolower(word[i]) - 'a'; // else the index as the alphabet sequence number starting from 0.
// for a - 0, b - 1, ..... z - 25
if (!curr->next[index])
curr->next[index] = createNode(); // create node of that character if not created yet
curr = curr->next[index]; // then go for next character
}
curr->end = 1; // mark end as 1 to denote the ending of the word
}
/* This method is for searching a word in the trie */
int search(struct node *root, char *word)
{
struct node *curr = root;
for (int i = 0; i < strlen(word); i++) // iterating character by character
{
/* Getting index same as insert function */
int index;
if (word[i] == '\'')
index = 26;
else
index = tolower(word[i]) - 'a';
if (!curr->next[index]) // if node of current character not found means the word doesn't exist in trie.
return 0;
curr = curr->next[index];
}
if (curr != NULL && curr->end) // if iterated all the characters and end is 1 then the word exists.
return 1;
else
return 0; // otherwise doesn't exist.
}
int main()
{
/* Reading the file line by line */
FILE *file;
size_t len = 1000;
char *word = (char *)malloc(len);
file = fopen("word.txt", "r");
struct node *root = createNode();
while (fgets(word, len, file) != NULL) // iterating line by line
{
int len = strlen(word);
if (word[len - 1] == '\n') // removing the newline which is at the end of the every line
word[len - 1] = '\0';
insert(root, word); // inserting every word
}
int ans;
word = (char *)("error's"); // checking the existence of the word "error's"
ans = search(root, word);
if (ans == 1)
printf("\"%s\" found!\n", word);
else
printf("\"%s\" not found!\n", word);
word = (char *)("hilli");// checking the existence of the word "hilli"
ans = search(root, word);
if (ans == 1)
printf("\"%s\" found!\n", word);
else
printf("\"%s\" not found!\n", word);
return 0;
}
Here's code that should work. It does work on macOS 10.15.2 Catalina using GCC 9.2.0 and XCode 11.3.1 with the compiler set fussy and a number of memory debugging options enabled. It does not attempt to free the trie that it builds; it should (it is a good exercise to be able to free the structures you build).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Node structure of trie */
struct node
{
struct node *next[27]; // 26 for a-z and last one(26th index) is for apostrophe
int end; // value as 1 denotes the end of the word.
};
/* This method is for creating a new node */
static struct node *createNode(void)
{
struct node *newNode = (struct node *)malloc(sizeof(struct node));
newNode->end = 0; // set end as false i.e. 0
for (int i = 0; i < 27; i++) // set all children as NULL
newNode->next[i] = NULL;
return newNode;
}
/* This method is for inserting a word in the trie */
static void insert(struct node *root, char *word)
{
struct node *curr = root;
int length = strlen(word);
for (int i = 0; i < length; i++) // iterating character by character
{
int index;
if (word[i] == '\'') // if character is apostrophe index is 26
index = 26;
else
index = tolower(word[i]) - 'a'; // else the index as the alphabet sequence number starting from 0.
// for a - 0, b - 1, ..... z - 25
if (!curr->next[index])
curr->next[index] = createNode(); // create node of that character if not created yet
curr = curr->next[index]; // then go for next character
}
curr->end = 1; // mark end as 1 to denote the ending of the word
}
/* This method is for searching a word in the trie */
static int search(struct node *root, char *word)
{
struct node *curr = root;
int length = strlen(word);
for (int i = 0; i < length; i++) // iterating character by character
{
/* Getting index same as insert function */
int index;
if (word[i] == '\'')
index = 26;
else
index = tolower(word[i]) - 'a';
if (!curr->next[index]) // if node of current character not found means the word doesn't exist in trie.
return 0;
curr = curr->next[index];
}
if (curr != NULL && curr->end) // if iterated all the characters and end is 1 then the word exists.
return 1;
else
return 0; // otherwise doesn't exist.
}
int main(void)
{
/* Reading the file line by line */
FILE *file;
size_t len = 1000;
char *word = (char *)malloc(len);
const char filename[] = "word.txt";
file = fopen(filename, "r");
if (file == 0)
{
fprintf(stderr, "Failed to open file '%s' for reading\n", filename);
exit(EXIT_FAILURE);
}
struct node *root = createNode();
while (fgets(word, len, file) != NULL) // iterating line by line
{
//int len = strlen(word);
//if (word[len - 1] == '\n') // removing the newline which is at the end of the every line
// word[len - 1] = '\0';
word[strcspn(word, "\r\n")] = '\0';
printf("Word: [%s]\n", word);
insert(root, word); // inserting every word
}
int ans;
word = (char *)("error's"); // checking the existence of the word "error's"
ans = search(root, word);
if (ans == 1)
printf("\"%s\" found!\n", word);
else
printf("\"%s\" not found!\n", word);
word = (char *)("hilli");// checking the existence of the word "hilli"
ans = search(root, word);
if (ans == 1)
printf("\"%s\" found!\n", word);
else
printf("\"%s\" not found!\n", word);
return 0;
}
The code runs correctly given a data file containing a suitable subset of these lines:
enough
abracadabra
acid
test
hilli
error's
tests
testing
tested
tester
testosterone
acidly
acidic
It was tested with both DOS (CRLF) and Unix (NL or LF) line endings and was safe with both because it uses strcspn() to zap either sort of line ending:
word[strcspn(word, "\r\n")] = '\0';
If you had old Mac-style line endings (CR only), then you'd have problems with fgets() not recognizing the ends of lines — but if you fixed that (using POSIX getdelim() for example), it would work correctly with such lines too.
The changes made to your code are basically cosmetic, but allow the code to compile cleanly (source trie79.c; program trie79) using fairly stringent options:
$ gcc -O3 -g -std=c11 -Wall -Wextra -Werror -Wmissing-prototypes -Wstrict-prototypes \
> trie79.c -o trie79
$
Edited question:
Hi guys, my goal is to print the top 10 occurring words in a file, I have managed to get everything to work from reading the file to counting word occurrences and printing it, but when I implement my qsort I get a segfault. I looked over my pointers and they look okay to me, I would appreciate any feedback.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#define MAX 51
struct words
{
char *ch;
int index;
struct words *pNext;
};
struct words* createWordCounter(char *ch)
{
struct words *pCounter = NULL;
pCounter = (struct words*)malloc(sizeof(char));
pCounter->ch = (char*)malloc(strlen(ch)+1);
strcpy(pCounter->ch, ch);
pCounter->index = 1;
pCounter->pNext = NULL;
return pCounter;
}
struct words *pStart = NULL;
char* removePunc(struct words* ch)
{
char *src = ch, *dst = ch;
while (*src)
{
if (ispunct((unsigned char)*src))
{
src++;
}
else if (isupper((unsigned char)*src))
{
*dst++ = tolower((unsigned char)*src);
src++;
}
else if (src == dst)
{
src++;
dst++;
}
else
{
*dst++ = *src++;
}
}
*dst = 0;
}
void addWord(char *word)
{
struct words *pCounter = NULL;
struct words *pLast = NULL;
if(pStart == NULL)
{
pStart = createWordCounter(word);
return;
}
pCounter = pStart;
while(pCounter != NULL)
{
if(strcmp(word, pCounter->ch) == 0)
{
++pCounter->index;
return;
}
pLast = pCounter;
pCounter = pCounter->pNext;
}
pLast->pNext = createWordCounter(word);
}
void printWord(struct words *pCounter)
{
printf("\n%-30s %5d\n", pCounter->ch, pCounter->index);
}
//sort
int compare (const void * a, const void * b){
struct words *A1 = (struct words *)a;
struct words *B1 = (struct words *)b;
return B1->index - A1->index;
/*
if ((A1->count - B1->count) > 0)
return -1;
else if ((A1->count - B2->count) < 0)
return 1;
else
return 0;
*/
}
int main(int argc, char * argv[])
{
struct words *pCounter = NULL;
char temp[MAX];
FILE *fpt;
if(argc == 2)
{
printf("File name is: %s\n",argv[1]);
fpt = fopen(argv[1], "r");
//fail test
if(fpt == NULL)
{
printf("cannot open file, exiting program...\n");
exit(0);
}
//get the data out of the file and insert in struct
int wordCounter = 0;
int i = 0;
int lines = 0;
while((fscanf(fpt, "%s ", &temp)) == 1)
{
removePunc(temp);
addWord(temp);
if(temp == ' ')
i++;
if(temp == '\n')
lines++;
wordCounter++;
}
/*
pCounter = pStart;
while(pCounter != NULL)
{
printWord(pCounter);
pCounter = pCounter->pNext;
}
*/
//sort
qsort(pCounter, wordCounter, sizeof(struct words), compare);
for(int j = 0; i < 10; i++)
{
printWord(pCounter);
}
}
fclose(fpt);
return 0;
}
First temp is already a pointer, so do not include '&' before it in fscanf. Second, don't skimp on buffer size (e.g. #define MAX 1024). Third, protect your array bounds with the field-width modifier and don't put trailing whitespace in your format-string.
Putting it altogether (presuming you use 1024 as MAX, you can use
fscanf(fpt, "1023%s", temp))
Well done on checking the return of fscanf during your read.
Adding to the things that have already been mentioned.
In createWordCounter(...)
pCounter = (struct words*)malloc(sizeof(char));
you are allocating memory for a char. Even though the pointer to a struct is the pointer to its first member, the first element of words is a pointer to a char. It is better to be careful and write
struct words *pCounter = malloc(sizeof *pCounter);
Also, be mindful of operator precedence.
In addWord(...) you have
++pCounter->index;
What that does is increment the pointer pCounter before accessing index. If you are trying to increment index, it should be
++(pCounter->index);
or
pCounter->index++;
I recommend striping your program down to its bare essentials and test each part one at a time systematically to narrow down the cause of your errors.
I think the main problem is the size of temp array when you try to using fscanf.
while((fscanf(fpt, "%s ", temp)) == 1)
When the length of one line is bigger than MAX, segmentation fault occur.
You can change your code like this
#define SCANF_LEN2(x) #x
#define SCANF_LEN(x) SCANF_LEN2(x)
//...
//your original code
//...
while((fscanf(fpt, "%"SCANF_LEN(MAX)"s ", temp)) == 1)
By the way, you should check
(1) compile warning about type
char* removePunc(struct words* ch)
should be char* removePunc(char *ch)
if(temp == ' ') should be if(temp[0] == ' ')
if(temp == '\n') should be if(temp[0] == '\n')
(2) malloc size
pCounter = (struct words*)malloc(sizeof(char)); should be pCounter = (struct words*)malloc(sizeof(struct words));
(3) remember free after using malloc
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 6 years ago.
Improve this question
The code is given. I need to count number of words that are non identical. In order to do that i need to compare them using stcrmp. By looking the code below, how i need to construct while or if statements to compare words in a file by using double linked list? I suppose this condition should be in the main to print it then. My condition doesn't work. Also, can you give some advice where and how to sort words by their length here?
To understand the code some explanation:
This program holds a doubly linked list that will read a file that is entered as a command line argument, read each line from file, tokenize each word from line and for each word will place it into a Word Length structure depending on its length and then will place it into a word_count structure dependent on the word's string and count each word's occurrence in a file.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define DELIM " ,.+-=!?:;\t"
#define MAXLINE 25000
typedef struct word_count {
char *word;
int count;
struct word_count *next;
struct word_count *prev;
} WORD;
typedef struct word_length_count {
int length;
int count;
WORD *words;
struct word_length_count *next;
struct word_length_count *prev;
} WLENGTH;
int splitIntoWords(char line[]);
void processLength(char *word);
void processWord(char *word, WORD *wordCount);
void printWordLength();
WLENGTH *createWordLength(char *word);
WORD *createWordCount(char *word);
WLENGTH *wordLength = NULL;
int main(unsigned int argc, unsigned char *argv[]) {
FILE *fpin;
char line[MAXLINE];
int totalWordCount = 0;
if ((fpin = fopen(argv[1], "r")) == NULL) {
printf("Can't open input file.\n");
exit(-1);
}
printf("This is the words all tokenized from the input!\n");
while (fgets(line, MAXLINE, fpin) != NULL) {
line[strcspn(line, "\n")] = '\0';
if (line[0] == '\0')
continue;
totalWordCount += splitIntoWords(line);
}
printf("Total number of words is: %d\n", totalWordCount);
printWordLength();
printf("\nFINISHED!");
}
int splitIntoWords(char line[]) {
char *word;
int count=0;
word = strtok(line, DELIM);
for (;word != NULL;) {
count++;
printf("%s\n", word);
processLength(word);
word = strtok(NULL, DELIM);
}
return count;
}
void processLength(char *word)
{
WLENGTH *wLCounter = NULL;
WLENGTH *wLLast = NULL;
if (wordLength == NULL) {
wordLength = createWordLength(word);
return;
}
wLCounter = wordLength;
while (wLCounter != NULL) {
if (strlen(word) == wLCounter->length) {
++wLCounter->count;
processWord(word, wLCounter->words);
return;
}
wLLast = wLCounter;
wLCounter = wLCounter->next;
}
wLLast->next = createWordLength(word);
}
void processWord(char *word, WORD *wordCount) {
WORD *wCounter = NULL;
WORD *wLast = NULL;
if (wordCount == NULL) {
wordCount = createWordCount(word);
return;
}
wCounter = wordCount;
while (wCounter != NULL) {
if (strcmp(word, wCounter->word) == 0) {
++wCounter->count;
return;
}
wLast = wCounter;
wCounter = wCounter->next;
}
wLast->next = createWordCount(word);
}
WLENGTH *createWordLength(char *word) {
WLENGTH *wLCounter = NULL;
wLCounter = (WLENGTH*)malloc(sizeof(WLENGTH));
wLCounter->words = createWordCount(word);
wLCounter->count = 1;
wLCounter->length = strlen(word);
wLCounter->next = NULL;
return wLCounter;
}
WORD *createWordCount(char *word) {
WORD *wCount = NULL;
wCount = (WORD*)malloc(sizeof(WORD));
wCount->word = (char*)malloc(strlen(word+1));
strcpy(wCount->word, word);
wCount->count = 1;
wCount->next = NULL;
return wCount;
}
void printWordLength() {
WLENGTH *temp = wordLength;
WORD *tempWORD = wordLength->words;
while (temp != NULL) {
WORD *tempWORD = wordLength->words;
tempWORD = temp->words;
printf("\nFor Word Length: %d : There are: %d occurances!\n", temp->length, temp->count);
while (tempWORD != NULL) {
printf("\t%s\toccurs:%d\n", tempWORD->word, tempWORD->count);
tempWORD = tempWORD->next;
}
}
}
You're missing this at the bottom of the outermost while loop of printWordLength():
temp = temp->next;
That's why it goes into an infinite loop (which you didn't tell us).
Now, to count distinct words you just need to count every WORD* in every WORDLENGTH*, which you can do while you're printing them in printWordLength():
void printWordLength()
{
WLENGTH * temp = wordLength;
WORD * tempWORD = wordLength->words;
unsigned int unique_words = 0;
while(temp != NULL)
{
WORD * tempWORD = wordLength->words;
tempWORD = temp->words;
printf("\nFor Word Length: %d : There are: %d occurences!\n",
temp->length, temp->count);
while(tempWORD != NULL)
{
printf("\t%s\toccurs:%d\n", tempWORD->word, tempWORD->count);
unique_words++;
tempWORD = tempWORD->next;
}
temp = temp->next;
}
printf("\nThere are %u unique words\n", unique_words);
}
I'm trying to count the number of occurrences of each word in the function countWords I believe i started the for loop in the function properly but how do I compare the words in the arrays together and count them and then delete the duplicates? Isn't it like a fibonacci series or am I mistaken? Also int n has the value of 756 because thats how many words are in the array and wordsArray are the elements in the array.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
int *countWords( char **words, int n);
int main(int argc, char *argv[])
{
char buffer[100]; //Maximum word size is 100 letters
FILE *textFile;
int numWords=0;
int nextWord;
int i, j, len, lastChar;
char *wordPtr;
char **wordArray;
int *countArray;
int *alphaCountArray;
char **alphaWordArray;
int *freqCountArray;
char **freqWordArray;
int choice=0;
//Check to see if command line argument (file name)
//was properly supplied. If not, terminate program
if(argc == 1)
{
printf ("Must supply a file name as command line argument\n");
return (0);
}
//Open the input file. Terminate program if open fails
textFile=fopen(argv[1], "r");
if(textFile == NULL)
{
printf("Error opening file. Program terminated.\n");
return (0);
}
//Read file to count the number of words
fscanf(textFile, "%s", buffer);
while(!feof(textFile))
{
numWords++;
fscanf(textFile, "%s", buffer);
}
printf("The total number of words is: %d\n", numWords);
//Create array to hold pointers to words
wordArray = (char **) malloc(numWords*sizeof(char *));
if (wordArray == NULL)
{
printf("malloc of word Array failed. Terminating program.\n");
return (0);
}
//Rewind file pointer and read file again to create
//wordArray
rewind(textFile);
for(nextWord=0; nextWord < numWords; nextWord++)
{
//read next word from file into buffer.
fscanf(textFile, "%s", buffer);
//Remove any punctuation at beginning of word
i=0;
while(!isalpha(buffer[i]))
{
i++;
}
if(i>0)
{
len = strlen(buffer);
for(j=i; j<=len; j++)
{
buffer[j-i] = buffer[j];
}
}
//Remove any punctuation at end of word
len = strlen(buffer);
lastChar = len -1;
while(!isalpha(buffer[lastChar]))
{
lastChar--;
}
buffer[lastChar+1] = '\0';
//make sure all characters are lower case
for(i=0; i < strlen(buffer); i++)
{
buffer[i] = tolower(buffer[i]);
}
//Now add the word to the wordArray.
//Need to malloc an array of chars to hold the word.
//Then copy the word from buffer into this array.
//Place pointer to array holding the word into next
//position of wordArray
wordPtr = (char *) malloc((strlen(buffer)+1)*sizeof(char));
if(wordPtr == NULL)
{
printf("malloc failure. Terminating program\n");
return (0);
}
strcpy(wordPtr, buffer);
wordArray[nextWord] = wordPtr;
}
//Call countWords() to create countArray and replace
//duplicate words in wordArray with NULL
countArray = countWords(wordArray, numWords);
if(countArray == NULL)
{
printf("countWords() function returned NULL; Terminating program\n");
return (0);
}
//Now call compress to remove NULL entries from wordArray
compress(&wordArray, &countArray, &numWords);
if(wordArray == NULL)
{
printf("compress() function failed; Terminating program.\n");
return(0);
}
printf("Number of words in wordArray after eliminating duplicates and compressing is: %d\n", numWords);
//Create copy of compressed countArray and wordArray and then sort them alphabetically
alphaCountArray = copyCountArray(countArray, numWords);
freqCountArray = copyCountArray(alphaCountArray, numWords);
int *countWords( char **wordArray, int n)
{
return NULL;
int i=0;
int n=0;
for(i=0;i<n;i++)
{
for(n=0;n<wordArray[i];n++)
{
}
}
}
Assuming you want the return value of countWords to be an array of integers with word counts for each unique word, you need to have a double loop. One loop goes over the whole array, the second loop goes through the rest of the array (after the current word), looking for duplicates.
You could do something like this pseudo code:
Allocate the return array countArray (n integers)
Loop over all words (as you currently do in your `for i` loop)
If the word at `i` is not null // Check we haven't already deleted this word
// Found a new word
Set countArray[i] to 1
Loop through the rest of the words e.g. for (j = i + 1; j < n; j++)
If the word at j is not NULL and matches the word at i (using strcmp)
// Found a duplicate word
Increment countArray[i] (the original word's count)
// We don't want wordArray[j] anymore, so
Free wordArray[j]
Set wordArray[j] to NULL
Else
// A null indicates this was a duplicate, set the count to 0 for consistency.
Set countArray[i] to 0
Return wordArray
I'm going to throw you a bit of a curve ball here.
Rather than fix your code, which can be easily fixed as it's pretty good on its own, but incomplete, I decided to write an example from scratch.
No need to read the file twice [first time just to get the maximum count]. This could be handled by a dynamic array and realloc.
The main point, I guess, is that it is much easier to ensure that word list has no duplicates while creating it, rather than removing duplicates at the end.
I opted for a few things.
I created a "word control" struct. You've got several separate arrays that are indexed the same way. That, sort of, "cries out" for a struct. That is, rather than [say] 5 separate arrays, have a single array of a struct that has 5 elements in it.
The word list is a linked list of these structs. It could be a dynamic array on the heap that gets realloced instead, but the linked list is actually easier to maintain for this particular usage.
Each struct has the [cleaned up] word text and a count of the occurrences (vs. your separate wordArray and countArray).
When adding a word, the list is scanned for an existing match. If one is found, the count is incremented, rather than creating a new word list element. That's the key to eliminating duplicates [i.e. don't create them in the first place].
Anyway, here it is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#define sysfault(_fmt...) \
do { \
printf(_fmt); \
exit(1); \
} while (0)
// word control
typedef struct word {
struct word *next; // linked list pointer
char *str; // pointer to word string
int count; // word frequency count
} word_t;
word_t wordlist; // list of words
// cleanword -- strip chaff and clean up word
void
cleanword(char *dst,const char *src)
{
int chr;
// NOTE: using _two_ buffers in much easier than trying to clean one
// buffer in-place
for (chr = *src++; chr != 0; chr = *src++) {
if (! isalpha(chr))
continue;
chr = tolower(chr);
*dst++ = chr;
}
*dst = 0;
}
// addword -- add unique word to list and keep count of number of words
void
addword(const char *str)
{
word_t *cur;
word_t *prev;
char word[1000];
// get the cleaned up word
cleanword(word,str);
// find a match to a previous word [if it exists]
prev = NULL;
for (cur = wordlist.next; cur != NULL; cur = cur->next) {
if (strcmp(cur->str,word) == 0)
break;
prev = cur;
}
// found a match -- just increment the count (i.e. do _not_ create a
// duplicate that has to be removed later)
if (cur != NULL) {
cur->count += 1;
return;
}
// new unique word
cur = malloc(sizeof(word_t));
if (cur == NULL)
sysfault("addword: malloc failure -- %s\n",strerror(errno));
cur->count = 1;
cur->next = NULL;
// save off the word string
cur->str = strdup(word);
if (cur->str == NULL)
sysfault("addword: strdup failure -- %s\n",strerror(errno));
// add the new word to the end of the list
if (prev != NULL)
prev->next = cur;
// add the first word
else
wordlist.next = cur;
}
int
main(int argc,char **argv)
{
FILE *xf;
char buf[1000];
char *cp;
char *bp;
word_t *cur;
--argc;
++argv;
xf = fopen(*argv,"r");
if (xf == NULL)
sysfault("main: unable to open '%s' -- %s\n",*argv,strerror(errno));
while (1) {
// get next line
cp = fgets(buf,sizeof(buf),xf);
if (cp == NULL)
break;
// loop through all words on a line
bp = buf;
while (1) {
cp = strtok(bp," \t\n");
bp = NULL;
if (cp == NULL)
break;
// add this word to the list [avoiding duplicates]
addword(cp);
}
}
fclose(xf);
// print the words and their counts
for (cur = wordlist.next; cur != NULL; cur = cur->next)
printf("%s %d\n",cur->str,cur->count);
return 0;
}
I've been trying to implement a spell-checker using a large dictionary against some text file which contains around 2000 words. However, my spell-checker returns all words as being misspelled. I honestly have no idea why — could someone help me?
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "dictionary.h"
#define lenght 45
#define hashtable_size 65536
char word[lenght+1];
int count = 0;
/*
*
* Hash function. Thanks to Brenda from cs50 reddit.
*/
int hash_it(const char* needs_hashing)
{
unsigned int hash = 0;
for (int i=0, n=strlen(needs_hashing); i<n; i++)
hash = (hash << 2) ^ needs_hashing[i];
return hash % hashtable_size;
}
typedef struct node
{
char* word;
struct node* next;
}node;
node* previous;
node* hashtable[hashtable_size];
/*
*
* Loads dictionary into memory. Returns true if successful else false.
*/
bool load(const char* dictionary)
{
char word[lenght+1];
FILE* dict = fopen(dictionary,"r");
for(int i = 0; i < 26;i++)
{
hashtable[i] = NULL;
for(int a = fgetc(dict); a != EOF; a = fgetc(dict))
{
count++;
int hashvalue = hash_it(word);
node* new = malloc(sizeof(node));
if(hashtable[hashvalue] == NULL)
{
hashtable[hashvalue] = new;
new -> next = NULL;
}
else
{
new -> next = hashtable[hashvalue];
hashtable[hashvalue] = new;
}
}
}
fclose(dict);
return true;
}
/*
*
* Returns true if word is in dictionary else false.
*/
bool check(const char* word)
{
char tmp[lenght + 1];
int lenghtw = strlen(word);
for (int i = 0; i < lenghtw; i++)
{
tmp[i] = tolower(word[i]);
}
int index = hash_it(tmp);
if (hashtable[index] == NULL)
{
return false;
}
node* cursor = hashtable[index];
while(cursor != NULL)
{
if(strcmp(tmp, cursor -> word) == 0)
{
return true;
}
cursor = cursor -> next;
}
return false;
}
/*
*
* Returns number of words in dictionary if loaded else 0 if not yet loaded.
*/
unsigned int size(void)
{
return count;
}
/*
*
* Unloads dictionary from memory. Returns true if successful else false.
*/
bool unload(void)
{
int index = 0;
while(index < hashtable_size)
{
if(hashtable[index] == NULL)
{
index++;
}
else
{
while(hashtable[index] != NULL)
{
node* cursor = hashtable[index];
hashtable[index] = cursor -> next;
free(cursor);
}
index++;
}
}
return true;
}
int main(int argc, char **argv)
{
if (argc != 2)
return 3;
if (!load("dictionary"))
return 1;
printf("loaded %d words\n", size());
printf("word '%s'%s found\n", argv[1], check(argv[1]) ? "" : " not");
unload();
return 0;
}
There are many problems in your code:
in the load function, you do not load words from the dictionary into the hash table. You read one character at a time with fgetc() and create a node from an uninitialized local buffer word.
the hash_it function only hashes the last 16 characters from the word. Furthermore, hashtable_size is a power of 2, a bad idea. Indeed only the last 8 characters participate in the hash value. This is not a bug, just an inefficient hashing method.
in the check function, you copy the word and convert it to lowercase, but you forget to set the final byte of the tmp array to '\0'.
Here is a corrected version of load that reads one word per dictionary line:
bool load(const char *dictionary) {
char line[256];
FILE *dict = fopen(dictionary, "r");
if (!dict)
return false;
while (fgets(line, sizeof line, dict) != NULL) {
char *p = line + strspn(line, " \t"); // skip blanks
p[strcspn(p, " \t\r\n")] = '\0'; // strip trailing blanks
if (*p == '\0' || *p == '#' || *p == ';')
continue; // ignore blank lines and comments
count++;
int hashvalue = hash_it(p);
node *np = malloc(sizeof(node));
np->word = strdup(p);
np->next = hashtable[hashvalue];
hashtable[hashvalue] = np;
}
fclose(dict);
return true;
}