Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 8 years ago.
Improve this question
Relevant code:
Constant Def:
#define MAX_NAME_LEN 15 /* Maximum length of any attribute or relation is 15. */
Struct Def:
typedef struct schnode {
    char name[MAX_NAME_LEN + 1]; /* Attribute name; the +1 leaves room for the terminating NUL. */
    char type;                   /* Attribute type code, 'S' or 'I'. */
    int len;                     /* Attribute length. */
    struct schnode *next;        /* Following node in the list. */
} SCHNODE;
Variable Def:
FILE *schemafile; /* A text file. */
SCHNODE *head = NULL; /* Head of the linked list. */
SCHNODE *ptrnode = NULL; /* A variable node in the linked list. */
SCHNODE *p = NULL; /* A variable node in the linked list. */
SCHNODE *tail = NULL; /* Tail of the linked list. */
char *attr_name; /* Holds the name of the current attribute. */
char *attr_type; /* Holds the current attribute type (S or I). */
int attr_len; /* Holds the current attribute length. */
int attr_loc = 0; /* Attribute location in respect to the tuple. */
int i; /* Index for loops. */
char attr_match = 0; /* 1 if the attribute is valid, 0 otherwise. */
Constant Def:
#define BUFFER_START_SIZE 20
#define BUFFER_INC_SIZE 10
NextLine Function:
/* Fallbacks so this function also builds when lifted out of this file. */
#ifndef BUFFER_START_SIZE
#define BUFFER_START_SIZE 20
#endif
#ifndef BUFFER_INC_SIZE
#define BUFFER_INC_SIZE 10
#endif

/*
 * Read the next line from f into a freshly allocated, NUL-terminated string.
 *
 * The terminating '\n' is consumed but not stored. An empty line yields "".
 * A last line that ends at EOF without '\n' is returned as it is.
 *
 * Returns NULL when f is NULL, when the stream is already at EOF, or when
 * memory cannot be obtained. The caller owns the result and must free() it.
 */
char* NextLine (FILE *f) {
    size_t size = BUFFER_START_SIZE; /* current capacity of buf */
    size_t len = 0;                  /* characters stored so far */
    char *buf;
    int c;

    if (f == NULL) {
        printf("Unable to find the file.\n"); fflush(stdout);
        return NULL;
    }
    if ((c = getc(f)) == EOF)        /* nothing left to read */
        return NULL;
    if ((buf = malloc(size)) == NULL) {
        printf("Failed to create buffer.\n"); fflush(stdout);
        return NULL;                 /* never touch a NULL buffer */
    }

    /* Stop on end of line AND on end of file, so a final unterminated
     * line cannot spin forever. */
    while (c != '\n' && c != EOF) {
        if (len + 1 >= size) {       /* always keep one byte for '\0' */
            char *bigger = realloc(buf, size + BUFFER_INC_SIZE);
            if (bigger == NULL) {
                printf("Unable to realloc memory.\n"); fflush(stdout);
                free(buf);           /* the old block is still ours */
                return NULL;
            }
            buf = bigger;
            size += BUFFER_INC_SIZE;
        }
        buf[len++] = (char) c;
        c = getc(f);
    }
    buf[len] = '\0';
    return buf;
}
Executed Code:
for (i = 0; i < num_attr; i++) {
if ((i > 0) && !attr_match) attr_loc += attr_len;
attr_name = NextLine(schemafile);
attr_name = strtok(attr_name, " \t\n");
attr_type = strtok(NULL, " \t\n");
attr_len = atoi(strtok(NULL, " \t\n"));
/* Construct linked list of schema file data. */
/* If empty: */
if (head == NULL) {
if ((head = (SCHNODE*) malloc(sizeof(SCHNODE))) == NULL) {
fprintf(stderr, "Unable to Malloc space.\n");
return;
}
strcpy(head->name, attr_name);
head->type = attr_type[0];
head->len = attr_len;
head->next = NULL;
}
/* If the list already has a head defined: */
else {
/* Start from the head if tail is NULL (only 1 element) otherwise start from tail. */
if(tail == NULL) ptrnode = head;
else ptrnode = tail;
/* Malloc space for the tail node. */
if ((tail = (SCHNODE*) malloc(sizeof(SCHNODE))) == NULL) {
fprintf(stderr, "Unable to Malloc space.\n");
return;
}
strcpy(tail->name, attr_name);
tail->type = attr_type[0];
tail->len = attr_len;
tail->next = NULL;
/* Insert tail node at the end. */
ptrnode->next = tail;
if (strcmp(tail->name, "Instr") == 0)
p = tail;
if (p != NULL) printf("%d: %s\n", i, p->name);
}
Input:
CName S 25
CId S 8
Instr S 10
Credits I 4
Output:
2: Instr
3: Insur
No other values are changed (as far as I can tell). Can someone explain why this particular value is always altered? (Instr -> Insur). I just want the entry that I read (Instr) to stay the same throughout the entire process of reading.
In your program, the name field of your data structure is defined to be a pointer. Hence, when you strcpy, there will be memory corruption as no space has been allocated to the pointer. Hence, either malloc a space for name for every node or define name to be an array of n elements.
Related
I'm working with c90 on linux.
I have a strange bug when I want to end a string,
let idx be the index, so when I get to the last index I want the list[idx] to be NULL.
example:
list[0] actually "hello"
list[1] actually "world\n"
list[2] sometimes is "" or NULL
So when I put NULL at the end of the list, it deletes one of the other words.
for: list[2] = NULL;
unexpectedly list[0] turns NULL but list[1] still "world\n" and list[2] of course NULL.
I wrote this function:
/*
 * Reads lines from fptr with fgets and hands each one to get_input together
 * with a freshly allocated list; after the loop the last list is passed to
 * free_list.
 */
void function()
{
char buffer[BUFF_LEN];
char** list = NULL;
int list_len = 0;
while (fgets(buffer, BUFF_LEN, fptr))
{
/* NOTE(review): room for exactly one char* is allocated here, and the
   pointer from the previous iteration is overwritten without being freed. */
list = (char**)malloc((sizeof(char*)));
get_input(buffer, list, &list_len);
/*
some other code
*/
}
/* NOTE(review): only the list from the last iteration reaches this call, and
   the array `list` itself is not passed to free() in this function. */
free_list(list, list_len); /*free the array of strings words*/
}
and wrote also the get_input because I work with c90
/*
 * Splits `line` on spaces with strtok, stores a heap copy of every token in
 * list[0..], writes the token count to *idx, strips the "\n" from the last
 * token and stores NULL after the last entry.
 */
void get_input(char* line, char** list, int *idx)
{
char * token;
*idx = 0;
token = strtok(line, " "); /*extract the first token*/
/* loop through the string to extract all other tokens */
while (token != NULL)
{
/* drop one leading tab; strlen(token) bytes moved = rest of token + its NUL */
if (token && token[0] == '\t')
memmove(token, token + 1, strlen(token));
printf("%s\n", token);
/* NOTE(review): `list` is never grown in this function, so every index
   beyond what the caller allocated is written out of bounds. */
list[*idx] = (char *)malloc(strlen(token)+1);
/* NOTE(review): copying exactly strlen(token) bytes leaves out the
   terminating NUL, so the copy is not a terminated string. */
strncpy(list[*idx], token, strlen(token));
token = strtok(NULL, " "); /*get every token*/
(*idx)++;
}
/* NOTE(review): when no token was found, `list` is set to NULL and then
   dereferenced at index *idx - 1 on the next line. */
if (*idx == 0)
list = NULL;
list[*idx - 1][strcspn(list[*idx - 1], "\n")] = 0; /* remove the "\n" */
list[*idx] = NULL; /* to know when the list ends */
}
the free function:
/*
 * Walks list[list_len - 1] down to list[0], clearing each entry and calling
 * free on it. The array `list` itself is not freed here.
 */
void free_list(char** list, int list_len)
{
int i;
for(i= list_len - 1; i >= 0; i--)
{
/* NOTE(review): the entry is overwritten with NULL before free() sees it,
   so free() is always called with NULL and the string is never released. */
list[i] = NULL;
free(list[i]);
}
}
You have multiple issues.
void function()
{
char buffer[BUFF_LEN];
char** list = NULL;
int list_len = 0;
while (fgets(buffer, BUFF_LEN, fptr))
{
list = (char**)malloc((sizeof(char*)));
get_input(buffer, list, &list_len);
/*
some other code
*/
}
free_list(list, list_len); /*free the array of strings words*/
}
You only allocate memory for 1 pointer.
You only free the pointers in the last list.
You never free the memory for list itself.
You should not cast the return value of malloc and friends.
This should be changed like this:
/*
 * Read fptr line by line. For every line a one-slot list is allocated,
 * filled (and grown) by get_input, used, and then released again: first the
 * strings via free_list, then the pointer array itself.
 */
void function()
{
    char buffer[BUFF_LEN];
    char** list = NULL;
    int list_len = 0;

    while (fgets(buffer, BUFF_LEN, fptr))
    {
        /* One slot is enough to start with: it holds the NULL sentinel. */
        list = malloc(sizeof *list);
        if (list == NULL)
        {
            /* get_input writes through the pointer, so stop here. */
            perror("malloc");
            break;
        }
        get_input(buffer, &list, &list_len);
        /*
        some other code
        */
        free_list(list); /* free the strings ... */
        free(list);      /* ... and then the array that held them */
        list = NULL;
    }
}
The freeing function is also broken:
void free_list(char** list, int list_len)
{
int i;
for( i= list_len - 1; i >= 0; i--)
{
list[i] = NULL;
free(list[i]);
}
}
You set the pointer within list to NULL before you free it. This causes a memory leak as the memory is not really freed.
You don't really need the length as you have added a sentinel. But that is not an error.
There is also no need to free the pointers backwards.
After cleanup the function could look like this:
/*
 * Free every string of a NULL-terminated list and clear its slot.
 * The array `list` itself is NOT freed; that remains the caller's job.
 * Passing NULL is allowed and does nothing.
 */
void free_list(char** list)
{
    size_t i;

    if (list == NULL)
        return;
    for (i = 0; list[i] != NULL; i++)
    {
        free(list[i]);
        list[i] = NULL; /* no dangling pointer is left behind */
    }
}
Now the biggest part:
void get_input(char* line, char** list, int *idx)
{
char * token;
*idx = 0;
token = strtok(line, " "); /*extract the first token*/
/* loop through the string to extract all other tokens */
while (token != NULL)
{
if (token && token[0] == '\t')
memmove(token, token + 1, strlen(token));
printf("%s\n", token);
list[*idx] = (char *)malloc(strlen(token)+1);
strncpy(list[*idx], token, strlen(token));
token = strtok(NULL, " "); /*get every token*/
(*idx)++;
}
if (*idx == 0)
list = NULL;
list[*idx - 1][strcspn(list[*idx - 1], "\n")] = 0; /* remove the "\n" */
list[*idx] = NULL; /* to know when the list ends */
}
You do not care about memory for the pointers in your list. That means you store the pointers in memory that you are not allowed to touch. By doing this you invoke undefined behaviour.
You must realloc the memory and for that you must be able to modify the passed pointer.
You should not cast the return values of malloc and friends.
You access illegal index values if *idx==0
You call strncpy with the length of the string without space for the 0 byte. That will cause the copy to be not nul terminated. Also there is no need to use strncpy over strcpy as you have reserved enough memory.
/*
 * Split `line` on spaces into a NULL-terminated array of heap-allocated
 * strings.
 *
 * line : text to tokenize; it is modified in place by strtok.
 * list : in/out. *list must be NULL or a block obtained from malloc with
 *        room for at least one char*. It is grown with realloc as needed
 *        and the (possibly moved) block is stored back into *list.
 * idx  : out. Receives the number of tokens stored.
 *
 * One leading tab is removed from each token and the "\n" is cut from the
 * last token. Each token is echoed to stdout. If memory runs out, splitting
 * stops early; the list is still NULL-terminated and *idx is still correct.
 */
void get_input(char* line, char*** list, int *idx)
{
    char *token;
    char **list_local = *list; /* avoids one level of '*' below */

    *idx = 0;
    if (list_local == NULL)
    {
        /* Make sure there is at least a slot for the sentinel. */
        list_local = malloc(sizeof *list_local);
        if (list_local == NULL)
            return;
    }

    token = strtok(line, " "); /* extract the first token */
    while (token != NULL)
    {
        char **temp;
        char *copy;

        if (token[0] == '\t')
            memmove(token, token + 1, strlen(token)); /* also moves the NUL */
        printf("%s\n", token);

        copy = malloc(strlen(token) + 1);
        if (copy == NULL)
            break;
        /* Grow BEFORE storing: *idx old entries + this one + the sentinel. */
        temp = realloc(list_local, sizeof *list_local * ((size_t)*idx + 2));
        if (temp == NULL)
        {
            free(copy);
            break;
        }
        list_local = temp;

        strcpy(copy, token);
        list_local[*idx] = copy;
        (*idx)++;
        token = strtok(NULL, " "); /* next token */
    }

    if (*idx != 0)
    {
        char *last = list_local[*idx - 1];
        last[strcspn(last, "\n")] = 0; /* remove the "\n" */
    }
    list_local[*idx] = NULL; /* to know when the list ends */
    *list = list_local;
}
I'm having problem with pointers in a C program that count the occurrences of a string or more in a bunch of file. The program take in input a file which contains the paths of the files in which search the occurrences. All the files that i will mention are contained in the same folder of the project, whose name is "find". In my case, the input file is "path.txt":
C:\Users\Utente\Desktop\find\try.txt
C:\Users\Utente\Desktop\find\try1.txt
The try.txt content is:
abc
abc
abc
ac
ac
ac
ac
The try1.txt content is:
ac
ac
ac
ac
abc
abc
abc
My program is composed by 4 files, two header-files and two source files:
find.c:
#include "find.h"
// Entry point. argv[1] names a text file that lists, one per line, the files
// to search. The words to search for are read interactively from stdin.
// For every word and every listed file the occurrences are counted with KMP
// and then printed.
int main(int argc, char * argv[]){
    FILE *fInput = NULL;
    FILE *fp = NULL;
    char *line1;
    char *line2;
    int endOfLineDetected = 0;
    size_t nrOfCharRead = 0;
    char ch = 'n';
    fWord *w = NULL;
    fWord *start = NULL;
    fWord *tail = NULL;
    fPath *head = NULL;
    fPath *current = NULL;

    if(argc < 2){
        fprintf(stderr, "Usage: %s <file-with-paths>\n", argc > 0 ? argv[0] : "find");
        exit(1);
    }
    fInput = fopen(argv[1], "r"); //the file that contains the path of the file in which search.
    if(fInput == NULL){
        fprintf(stderr, "Cannot open %s, exiting. . .\n", argv[1]);
        exit(1);
    }

    //read the input file line by line and save every path in a list node
    while(!endOfLineDetected){
        line1 = getLineOfAnySize(fInput,128,&endOfLineDetected,&nrOfCharRead);
        if(line1 == NULL){
            fprintf(stderr, "Cannot read %s, exiting. . .\n", argv[1]);
            exit(1);
        }
        if(line1[0] == '\0'){ //blank line (e.g. after a trailing newline): not a path
            free(line1);
            continue;
        }
        fPath *node = malloc(sizeof *node);
        if(node == NULL){
            fprintf(stderr, "Out of memory, exiting. . .\n");
            exit(1);
        }
        node->path = line1; //the node takes ownership of the line buffer
        node->fileOccurrences = 0;
        node->position = NULL;
        node->next = NULL;
        if(head == NULL){
            current = head = node;
        }else{
            current = current->next = node;
        }
    }
    fclose(fInput);

    //create a linked list of the type fWord, one structure for each word.
    do{
        fWord *app = malloc(sizeof *app);
        if(app == NULL){
            fprintf(stderr, "Out of memory, exiting. . .\n");
            exit(1);
        }
        printf("Insert the word to search: ");
        if(scanf("%49s", app->word) != 1){ //width keeps the input inside word[50]
            free(app);
            break;
        }
        app->totalOccurences = 0;
        app->p = head; //NOTE(review): every word points at the same path list
        app->next = NULL;
        if(start == NULL){
            tail = start = app;
        }else{
            tail = tail->next = app;
        }
        printf("Do you want to insert another word? (Y/N): ");
        if(scanf(" %c", &ch) != 1)
            ch = 'n';
    }while(ch == 'y' || ch == 'Y');

    //traverse all the words and, for each, all the files
    for(w = start; w != NULL; w = w->next){ //advancing w is what ends this loop
        while(w->p != NULL){
            fp = fopen(w->p->path, "r");
            if(fp == NULL){
                fprintf(stderr, "Cannot open %s, exiting. . .\n", w->p->path);
                exit(1);
            }
            int countLine = 0;
            w->p->fileOccurrences = 0;
            endOfLineDetected = 0;
            while(!endOfLineDetected){
                line2 = getLineOfAnySize(fp,128,&endOfLineDetected,&nrOfCharRead);
                if(line2 == NULL){
                    fprintf(stderr, "Cannot read %s, exiting. . .\n", w->p->path);
                    exit(1);
                }
                int n = (int)strlen(line2);
                int m = (int)strlen(w->word);
                w->p->fileOccurrences = w->p->fileOccurrences + KMP(line2, w->word, n, m, countLine, w->p);
                countLine = countLine + 1;
                free(line2); //KMP records only numbers, the text is no longer needed
            }
            w->totalOccurences = w->totalOccurences + w->p->fileOccurrences;
            w->p->position = getHead(); //take over the positions collected for this file
            w->p = w->p->next;
            fclose(fp);
        }
        w->p = head; //pointer back to the top of the fPath structure
    }

    //traverse all the structure and print out the occurrences and their position
    for(w = start; w != NULL; w = w->next){
        w->p = head;
        printf("WORD %s \r\n", w->word);
        printf("TOTAL %d \r\n", w->totalOccurences);
        while(w->p != NULL){
            printf("FILE %s \r\n", w->p->path);
            printf("OCCURENCES %d \r\n", w->p->fileOccurrences);
            //NOTE(review): this walk advances the stored pointer itself, so the
            //position list of a file can be printed only once.
            while (w->p->position != NULL){
                printf("%d %d\r\n", w->p->position->line, w->p->position->character);
                w->p->position = w->p->position->next;
            }
            w->p = w->p->next;
        }
    }
    printf("\r\n"); //the file ends with an empty line
    return 0;
}
// Read one line of any length from fp into a newly allocated string.
//
// fp                : stream to read from; NULL makes the function return NULL.
// typicalSize       : initial buffer size in bytes (0 is treated as 1).
// endOfLineDetected : set to 1 when EOF is reached while reading; otherwise
//                     it is left untouched.
// nrOfCharRead      : receives the number of bytes stored INCLUDING the
//                     terminating '\0' (0 when NULL is returned).
//
// The '\n' is consumed but not stored. Returns NULL on allocation failure.
// The caller owns the returned buffer and must free() it.
char * getLineOfAnySize(FILE* fp, size_t typicalSize, int *endOfLineDetected,size_t *nrOfCharRead){
    char *line;                                      // buffer for the string
    int ch;                                          // int, so EOF stays distinguishable
    size_t len = 0;                                  // characters stored so far
    size_t lineSize = typicalSize ? typicalSize : 1; // room for at least the '\0'

    *nrOfCharRead = 0;
    if(!fp) return NULL;

    line = malloc(lineSize);
    if(!line) return NULL;

    for(;;){
        ch = fgetc(fp);
        if(ch == '\n') break;                        // end of line
        if(ch == EOF){                               // end of file (or read error)
            *endOfLineDetected = 1;
            break;
        }
        line[len++] = (char)ch;
        if(len == lineSize){                         // buffer full: grow before the next store
            char *bigger = realloc(line, lineSize + 64);
            if(!bigger){
                free(line);                          // realloc left the old block alive
                return NULL;
            }
            line = bigger;
            lineSize += 64;
        }
    }
    line[len++] = '\0';                              // len < lineSize always holds here
    *nrOfCharRead = len;
    return line;
}
find.h:
#include "kmp.h"
char * getLineOfAnySize(FILE* fp, size_t typicalSize, int *endOfLineDetected,size_t *nrOfCharRead);
kmp.c:
#include "kmp.h"
fPosition *head = NULL;
fPosition *current = NULL;
// Knuth-Morris-Pratt search: count the occurrences of pattern Y (length n)
// in text X (length m), overlapping occurrences included.
//
// For every occurrence a new fPosition node (line = `line`, character =
// start index of the match in X) is appended to the file-level list
// head/current, and app->position is pointed at the newest node.
// Returns the number of occurrences found in X.
int KMP(const char* X, const char* Y, int m, int n, int line, fPath *app){
    int count = 0;

    if (n <= 0 || m < n)        // empty pattern or text too short: no match
        return 0;

    // next[k] = length of the longest proper prefix of Y[0..k-1] that is
    // also a suffix of it (the classic failure table).
    int next[n + 1];
    for (int i = 0; i < n + 1; i++)
        next[i] = 0;

    for (int i = 1; i < n; i++){
        int j = next[i];        // extend the border of the previous prefix
        while (j > 0 && Y[j] != Y[i])
            j = next[j];
        if (Y[j] == Y[i])
            next[i + 1] = j + 1;
    }

    for (int i = 0, j = 0; i < m; i++){
        if (X[i] == Y[j]){
            if (++j == n){
                count = count + 1;
                fPosition *node = malloc (sizeof *node);
                if (node == NULL){
                    fprintf(stderr, "Out of memory, exiting. . .\n");
                    exit(1);
                }
                node->line = line;              // the current line
                node->character = i - j + 1;    // index where the match starts
                node->next = NULL;
                if(head == NULL){
                    current = head = node;
                }else{
                    current = current->next = node;
                }
                if (app != NULL)
                    app->position = current;
                j = next[j];    // keep the longest border so overlaps are found
            }
        }
        else if (j > 0) {
            j = next[j];
            i--;                // retry the same text character with the shorter border
        }
    }
    return count;
}
// Hand the collected fPosition list over to the caller and start a new,
// empty list: returns the old head and resets the file-level head to NULL.
fPosition * getHead(){
    fPosition *detached = head;

    head = NULL;
    return detached;
}
kmp.h:
#ifndef KMP_H
#define KMP_H
/* Include guard: the struct definitions below must not be seen twice in one
   translation unit. */

#include<stdio.h>
#include<stdlib.h>
#include<string.h>

/* NOTE(review): find.c accesses fPath members `fileOccurrences`/`position`
   and an fWord member `p` that are not declared below -- confirm which
   version of these structs the sources are meant to use. */

/* One file to search; the nodes form a singly linked list. */
struct filePath{
    char *path; //the file path
    struct filePath *next;
};

/* One occurrence of a word. */
struct OccurrencesPosition{
    int line; //line in which an occurrence is found
    int character; //offset in the line at which the occurrence starts
    struct filePath pathInfo;
    struct OccurrencesPosition *next; //pointer to the next occurrence
};

/* One word to search for. */
struct fileWord{
    char word[50]; //the string to search
    int totalOccurences; //the total occurrences of the string
    int fileOccurrences; //the occurrences of each file
    struct OccurrencesPosition *position; //pointer to the linked list which tracks all the occurrences and their positions
    struct fileWord *next; //pointer to the next word
};

typedef struct filePath fPath;
typedef struct fileWord fWord;
typedef struct OccurrencesPosition fPosition;

fPosition * getHead(void);
int KMP(const char* X, const char* Y, int m, int n, int line, fPath *app);

#endif /* KMP_H */
The problem is that when i run my program passing in input "abc" and "ac" it returns wrong value. More precisely, returns the value corresponding to "ac" in both cases. Here's the execution:
PS C:\Users\Utente\Desktop\find> gcc find.c kmp.c -o "find.exe"
PS C:\Users\Utente\Desktop\find> .\find.exe "path.txt"
Insert the word to search: abc
Do you want to insert another word? (Y/N): Y
Insert the word to search: ac
Do you want to insert another word? (Y/N): N
WORD abc
TOTAL 6
FILE C:\Users\Utente\Desktop\find\try.txt
OCCURENCES 4
3 0
4 0
5 0
6 0
FILE C:\Users\Utente\Desktop\find\try1.txt
OCCURENCES 4
0 0
1 0
2 0
3 0
WORD ac
TOTAL 8
FILE C:\Users\Utente\Desktop\find\try.txt
OCCURENCES 4
FILE C:\Users\Utente\Desktop\find\try1.txt
OCCURENCES 4
As you can see, the WORD and TOTAL are correct in both cases, but the occurrences not. They correspond to "ac" in both cases.
The correct output should be:
WORD abc
TOTAL 6
FILE C:\Users\Utente\Desktop\find\try.txt
OCCURENCES 3
0 0
0 1
0 2
FILE C:\Users\Utente\Desktop\find\try1.txt
OCCURENCES 3
4 0
5 0
6 0
WORD ac
TOTAL 8
FILE C:\Users\Utente\Desktop\find\try.txt
OCCURENCES 4
3 0
4 0
5 0
6 0
FILE C:\Users\Utente\Desktop\find\try1.txt
OCCURENCES 4
0 0
1 0
2 0
3 0
I think that the problem is with the fPosition pointers. Thanks to anyone who helps.
You have design issue.
The problem is that you are maintaining the occurrences info as part of the filePath list.
struct filePath{
char *path; //the file path
int fileOccurrences; //the occurrences of each file
struct OccurrencesPosition *position; // here *****************
struct filePath *next;
};
And file path info you are maintaining as part of fileWord list.
struct fileWord{
char word[50]; //the string to search
int totalOccurences; //the total occurrences of the string
struct filePath *p; //pointer to the linked list of all the files
struct fileWord *next; //pointer to the next word
};
Since you only have one file path list, each word in fileWord list is actually pointing to same filepath list.
Every word is pointing to same file path list
fWord *app = malloc(sizeof(fWord));
printf("Insert the word to search: ");
scanf("%s", app->word);
app->p = head; //here
and you are updating the position info inside the filepath for every word.
w->p->position = getHead(); // //pointer back to the top of the fPosition structure
Thus filePath list is holding position info only for the latest word you search.
Update:
Your design should look as below.
struct filePath{
char *path; //the file path
struct filePath *next;
};
struct OccurrencesPosition{
int line; //line in which an occurrences is founded
int character; //shift at which the occurrences comes
struct filePath pathInfo;
struct OccurrencesPosition *next; //pointer to the next occurrences
};
struct fileWord{
char word[50]; //the string to search
int totalOccurences; //the total occurrences of the string
int fileOccurrences; //the occurrences of each file
struct OccurrencesPosition *position; //pointer to the linked list which tracks all the occurrences and their positions
struct fileWord *next; //pointer to the next word
};
I am working on an assignment that requires me to print the top 10 most occurring words in a given text file. My code is printing the words from the file, but it is not sorting them according to their frequency.
Here is some of my code below. I use a hashtable to store each unique word and its frequency. I am currently sorting the words using the wordcmp function I wrote, passing it to the built-in qsort function in main.
If anyone can guide me to fix my error, I'd be very grateful.
My current output:
the top 10 words (out of 10) are:
1 im
1 are
1 again
3 happy
2 hello
1 how
1 lets
1 you
1 try
1 this
Expected output (what I want):
The top 10 words (out of 10) are:
3 happy
2 hello
1 you
1 try
1 this
1 lets
1 im
1 how
1 are
1 again
Here is some of my code:
/* One entry of a bucket chain. */
typedef struct word word;
struct word
{
    char *s;     /* the word itself */
    int count;   /* how many times the word has been seen */
    word *next;  /* next entry in the same chain */
};

/* The table: an array of chain heads plus bookkeeping. */
typedef struct hashtable
{
    word **table;     /* array of tablesize chain heads */
    int tablesize;    /* number of slots in table */
    int currentsize;  /* number of entries added so far */
} hashtable;
/*
 * Reads words from a file with getWord, counts them in a hashtable via add,
 * sorts the table array with qsort/wordcmp and prints up to top_words
 * entries.
 */
int main(int argc, char *argv[])
{
int top_words = 10;
/* NOTE(review): from here on the identifier `word` names this pointer
   variable, not the struct typedef. */
word *word = NULL;
hashtable *hash = ht_create(5000);
/* NOTE(review): no assignment to file_name is visible before it is used. */
char *file_name;
char *file_word;
FILE *fp;
struct word *present = NULL;
fp = fopen (file_name, "r");
if (fp == NULL)
{
fprintf (stderr,"%s: No such file or directory\n", file_name);
fprintf(stderr,"The top %d words (out of 0) are:\n", top_words);
exit(-1);
}
continue_program:
while ((file_word = getWord(fp)))
{
word = add(hash, file_word, 1);
}
fclose(fp);
/* NOTE(review): this sorts the first currentsize slots of the bucket array.
   The slots are `word *` values (many of them NULL), entries chained behind
   a slot are not visited, and the comparator is handed pointers to slots
   (word **) while wordcmp is declared to take word *. */
qsort((void*)hash->table, hash->currentsize, sizeof(word),(int (*)(const void *, const void *)) wordcmp);
if(top_words > total_unique_words)
top_words = total_unique_words;
printf("the top %d words (out of %d) are:\n", top_words, total_unique_words);
int iterations =0;
/* NOTE(review): `i` is not declared in the visible code, and `<=` lets the
   index reach tablesize, one past the last slot. Only the first entry of
   each non-empty slot is printed. */
for(i =0; i <= hash->tablesize && iterations< top_words; i++)
{
present = hash->table[i];
if(present != NULL)
{
printf(" %4d %s\n", present->count, present->s);
present = present->next;
iterations++;
}
}
freetable(hash);
return 0;
}
/*
 * Compare two words by count, larger count first.
 * Returns +1 when a has the smaller count, -1 when a has the larger count,
 * and 0 when the counts are equal or either pointer is NULL.
 */
int wordcmp (word *a, word *b)
{
    if (a == NULL || b == NULL)
        return 0;
    if (a->count < b->count)
        return +1;
    if (a->count > b->count)
        return -1;
    return 0;
}
/*
 * Create a new hashtable with `size` empty slots.
 * Returns NULL when size < 1 or when memory cannot be allocated; nothing is
 * leaked in that case.
 */
struct hashtable *ht_create( int size )
{
    int i;
    hashtable *table;

    if( size < 1 )
        return NULL;

    table = malloc(sizeof *table);
    if( table == NULL )
        return NULL;            /* check BEFORE the first dereference */

    table->table = malloc(sizeof *table->table * (size_t) size);
    if( table->table == NULL )
    {
        free(table);
        return NULL;
    }
    table->currentsize = 0;
    table->tablesize = size;
    for( i = 0; i < size; i++ )
    {
        table->table[i] = NULL; /* every chain starts out empty */
    }
    return table;
}
/*
 * Record one occurrence of `key` in the hash table.
 *
 * If the key is already stored, its count is incremented and the existing
 * node is returned. Otherwise a new node holding a copy of `key` with count
 * `freq` is put at the front of its chain, the size counters are updated
 * and the new node is returned. Returns NULL if memory cannot be allocated;
 * the table is left unchanged in that case.
 */
word * add(hashtable *h, char *key, int freq)
{
    int index = hashcode(key) % h->tablesize;
    word *current;
    word *newnode;

    if (index < 0)              /* in case hashcode() can return a negative value */
        index += h->tablesize;

    /* Look for the key in its own chain and bump exactly that node. */
    for (current = h->table[index]; current != NULL; current = current->next) {
        if (strcmp(current->s, key) == 0) {
            current->count++;
            return current;
        }
    }

    /* Create a new node if no duplicate is found */
    newnode = malloc(sizeof *newnode);
    if (newnode == NULL)
        return NULL;
    newnode->s = strdup(key);
    if (newnode->s == NULL) {
        free(newnode);
        return NULL;
    }
    newnode->count = freq;
    newnode->next = h->table[index]; /* keep the existing chain reachable */
    h->table[index] = newnode;
    h->currentsize = h->currentsize + 1;
    total_unique_words++;
    return newnode;
}
The primary problem you are facing is attempting to sort a hashtable with linked-list chaining of buckets. When a hash collision occurs, your table is not resized, you simply use a linked-list to store the word causing the collision at the same table[index] linked to the word already stored there. That is what add does.
This can easily result in the contents of your hashtable looking like this:
table[ 0] = NULL
table[ 1] = foo
table[ 2] = NULL
table[ 3] = |some|->|words|->|that|->|collided| /* chained bucket */
table[ 4] = other
table[ 5] = words
table[ 6] = NULL
table[ 7] = NULL
...
You cannot simply qsort table and hope to get the correct word frequencies. qsort has no way to know that "some" is just the beginning word in a linked-list, all qsort gets is a pointer to "some" and sizeof(word).
To make life much easier, simply forget the hashtable, and use a dynamically allocated array of word**. You can use a similar add where you increment the number of occurrences for duplicates, and you avoid all problems with chained-buckets. (and if you provide automatic storage for each word, it leaves you with a simple free() of your pointers and you are done)
The following example takes 2 arguments. The first is the filename to read words from, and the (optional) second is an integer value limiting the sorted output to that number of top words. The words_t struct uses automatic storage for word, limited to 32 chars (the largest word in the unabridged dictionary is 28 characters). You can change the way words are read to parse the input and ignore punctuation and plurals as desired. The following delimits words on all punctuation (except the hyphen), and discards the possessive form of words (e.g. it stores "Mike" when "Mike's" is encountered, discarding the "'s").
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#define MAXC 32 /* max word length is 28-char, 29-char is sufficient */
#define MAXW 128 /* initial maximum number of words to allocate */
typedef struct {
char word[MAXC]; /* struct holding individual words */
size_t ninst; /* and the number of times they occur */
} words_t;
/* function prototypes */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw);
void *xrealloc (void *ptr, size_t psz, size_t *nelem);
/* qsort compare function for words_t (alphabetical) */
int cmpwrds (const void *a, const void *b)
{
return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}
/* qsort compare function for words_t (by occurrence - descending)
* and alphabetical (ascending) if occurrences are equal)
*/
int cmpinst (const void *a, const void *b)
{
int ndiff = (((words_t *)a)->ninst < ((words_t *)b)->ninst) -
(((words_t *)a)->ninst > ((words_t *)b)->ninst);
if (ndiff)
return ndiff;
return strcmp (((words_t *)a)->word, ((words_t *)b)->word);
}
/* Count the words of the file named by argv[1] and print them ordered by
 * descending number of occurrences (alphabetically within equal counts).
 * An optional argv[2] limits the output to that many entries.
 * Returns 0 on success and 1 on a usage, file, memory or word-length error.
 */
int main (int argc, char **argv) {

    int c = 0, nc = 0, prev = ' ';
    size_t maxw = MAXW, wc = 0, top = 0, total = 0, limit = 0;
    char buf[MAXC] = "";
    words_t *words = NULL;
    FILE *fp = NULL;

    if (argc < 2) { /* a filename is required */
        fprintf (stderr, "usage: %s filename [top]\n",
                 argc > 0 ? argv[0] : "wordcnt");
        return 1;
    }

    fp = fopen (argv[1], "r");
    if (!fp) {  /* validate file open for reading */
        fprintf (stderr, "error: file open failed '%s'.\n", argv[1]);
        return 1;
    }

    if (argc > 2) { /* if 2 args, convert argv[2] to number of top words */
        char *p = argv[2];
        size_t tmp;
        errno = 0;  /* strtoul only ever sets errno, it never clears it */
        tmp = strtoul (argv[2], &p, 0);
        if (p != argv[2] && !errno)
            top = tmp;
    }

    /* allocate/validate initial words */
    if (!(words = calloc (maxw, sizeof *words))) {
        perror ("calloc-words");
        fclose (fp);
        return 1;
    }

    while ((c = fgetc(fp)) != EOF) {        /* read each character in file */
        if (c != '-' && (isspace (c) || ispunct (c))) { /* word-end found */
            if (!isspace (prev) && !ispunct (prev)) {   /* a word just ended */
                if (!(prev == 's' && nc == 1)) {        /* exclude "'s" */
                    buf[nc] = 0;                        /* nul-terminate */
                    words = addword (words, buf, &wc, &maxw);
                }
                nc = 0; /* start fresh even when the "s" was dropped,
                           otherwise it would prefix the next word */
            }
        }
        else if (nc < MAXC - 1) {           /* add char to buf */
            buf[nc++] = c;
        }
        else {  /* chars exceed MAXC - 1; storage capability of struct */
            fprintf (stderr, "error: characters exceed %d.\n", MAXC);
            free (words);
            fclose (fp);
            return 1;
        }
        prev = c;   /* save previous char */
    }
    if (nc > 0 && !isspace (prev) && !ispunct (prev)) { /* handle non-POSIX end */
        buf[nc] = 0;    /* terminate the final word before storing it */
        words = addword (words, buf, &wc, &maxw);
    }

    fclose (fp);

    qsort (words, wc, sizeof *words, cmpinst);  /* sort words by frequency */

    printf ("'%s' contained '%zu' words.\n\n",  /* output total No. words */
            argv[1], wc);

    /* output top words (or all words in descending order if top not given);
     * never read past the wc entries that exist */
    limit = (top != 0 && top < wc) ? top : wc;
    for (size_t i = 0; i < limit; i++) {
        printf ("  %-28s    %5zu\n", words[i].word, words[i].ninst);
        total += words[i].ninst;
    }
    printf ("%33s------\n%34s%5zu\n", " ", "Total: ", total);

    free (words);

    return 0;
}
/** record one occurrence of 'word' in 'words'. an existing entry gets its
 *  'ninst' incremented; otherwise the word is stored in the next free
 *  element (growing the array through xrealloc when '*wc' has reached
 *  '*maxw') and '*wc' is incremented. returns the (possibly moved) array,
 *  which the caller must assign back to its pointer.
 */
void *addword (words_t *words, const char *word, size_t *wc, size_t *maxw)
{
    words_t *slot;

    /* already known? then just count it */
    for (slot = words; slot < words + *wc; slot++) {
        if (!strcmp (slot->word, word)) {
            slot->ninst += 1;
            return words;
        }
    }

    if (*wc == *maxw)   /* array full, get more room */
        words = xrealloc (words, sizeof *words, maxw);

    slot = &words[*wc];
    strcpy (slot->word, word);
    slot->ninst += 1;
    *wc += 1;

    return words;
}
/** grow 'ptr' from '*nelem' elements of 'psz' bytes to twice as many
 *  (or to 1 element when '*nelem' is 0). the added memory is zeroed and
 *  '*nelem' is updated to the new element count. returns the new block,
 *  which must be assigned to the original pointer in the caller.
 *  the program exits with EXIT_FAILURE when the new size does not fit in
 *  size_t or when memory is exhausted, so the function never returns NULL.
 */
void *xrealloc (void *ptr, size_t psz, size_t *nelem)
{
    const size_t maxsz = (size_t)-1;
    size_t oldn = *nelem;
    size_t newn = oldn ? oldn * 2 : 1;
    void *memptr;

    /* refuse sizes whose byte count would wrap around */
    if (psz == 0 || oldn > maxsz / 2 || newn > maxsz / psz) {
        fputs ("xrealloc(): requested size is invalid.\n", stderr);
        exit (EXIT_FAILURE);
    }

    memptr = realloc (ptr, newn * psz);
    if (!memptr) {
        perror ("realloc(): virtual memory exhausted.");
        exit (EXIT_FAILURE);
    }

    /* zero only the newly added elements */
    memset ((char *)memptr + oldn * psz, 0, (newn - oldn) * psz);
    *nelem = newn;

    return memptr;
}
(note: the output is sorted in descending order of occurrence, and in alphabetical order if words have the same number of occurrences)
Example Use/Output
$ ./bin/getchar_wordcnt_top dat/damages.txt 10
'dat/damages.txt' contained '109' words.
the 12
a 10
in 7
of 7
and 5
anguish 4
injury 4
jury 4
mental 4
that 4
------
Total: 61
Note: to use your hashtable as your basis for storage, you would have to, at minimum, create an array of pointers to each word in your hashtable, and then sort the array of pointers. Otherwise you would need to duplicate storage and copy the words to a new array to sort. (that would be somewhat a memory inefficient approach). Creating a separate array of pointers to each word in your hashtable to sort is about the only way you have to then call qsort and avoid the chained-bucket problem.
I'm trying to count the number of occurrences of each word in the function countWords I believe i started the for loop in the function properly but how do I compare the words in the arrays together and count them and then delete the duplicates? Isn't it like a fibonacci series or am I mistaken? Also int n has the value of 756 because thats how many words are in the array and wordsArray are the elements in the array.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
int *countWords( char **words, int n);
int main(int argc, char *argv[])
{
char buffer[100]; //Maximum word size is 100 letters
FILE *textFile;
int numWords=0;
int nextWord;
int i, j, len, lastChar;
char *wordPtr;
char **wordArray;
int *countArray;
int *alphaCountArray;
char **alphaWordArray;
int *freqCountArray;
char **freqWordArray;
int choice=0;
//Check to see if command line argument (file name)
//was properly supplied. If not, terminate program
if(argc == 1)
{
printf ("Must supply a file name as command line argument\n");
return (0);
}
//Open the input file. Terminate program if open fails
textFile=fopen(argv[1], "r");
if(textFile == NULL)
{
printf("Error opening file. Program terminated.\n");
return (0);
}
//Read file to count the number of words
fscanf(textFile, "%s", buffer);
while(!feof(textFile))
{
numWords++;
fscanf(textFile, "%s", buffer);
}
printf("The total number of words is: %d\n", numWords);
//Create array to hold pointers to words
wordArray = (char **) malloc(numWords*sizeof(char *));
if (wordArray == NULL)
{
printf("malloc of word Array failed. Terminating program.\n");
return (0);
}
//Rewind file pointer and read file again to create
//wordArray
rewind(textFile);
for(nextWord=0; nextWord < numWords; nextWord++)
{
//read next word from file into buffer.
fscanf(textFile, "%s", buffer);
//Remove any punctuation at beginning of word
i=0;
while(!isalpha(buffer[i]))
{
i++;
}
if(i>0)
{
len = strlen(buffer);
for(j=i; j<=len; j++)
{
buffer[j-i] = buffer[j];
}
}
//Remove any punctuation at end of word
len = strlen(buffer);
lastChar = len -1;
while(!isalpha(buffer[lastChar]))
{
lastChar--;
}
buffer[lastChar+1] = '\0';
//make sure all characters are lower case
for(i=0; i < strlen(buffer); i++)
{
buffer[i] = tolower(buffer[i]);
}
//Now add the word to the wordArray.
//Need to malloc an array of chars to hold the word.
//Then copy the word from buffer into this array.
//Place pointer to array holding the word into next
//position of wordArray
wordPtr = (char *) malloc((strlen(buffer)+1)*sizeof(char));
if(wordPtr == NULL)
{
printf("malloc failure. Terminating program\n");
return (0);
}
strcpy(wordPtr, buffer);
wordArray[nextWord] = wordPtr;
}
//Call countWords() to create countArray and replace
//duplicate words in wordArray with NULL
countArray = countWords(wordArray, numWords);
if(countArray == NULL)
{
printf("countWords() function returned NULL; Terminating program\n");
return (0);
}
//Now call compress to remove NULL entries from wordArray
compress(&wordArray, &countArray, &numWords);
if(wordArray == NULL)
{
printf("compress() function failed; Terminating program.\n");
return(0);
}
printf("Number of words in wordArray after eliminating duplicates and compressing is: %d\n", numWords);
//Create copy of compressed countArray and wordArray and then sort them alphabetically
alphaCountArray = copyCountArray(countArray, numWords);
freqCountArray = copyCountArray(alphaCountArray, numWords);
/*
 * countWords -- count how often each distinct word occurs in wordArray.
 *
 * wordArray: array of n pointers to heap-allocated, NUL-terminated words.
 *            Entries may be NULL. The array is modified in place: every
 *            repeat of an earlier word is freed and its slot set to NULL,
 *            so only the first occurrence of each word remains.
 * n:         number of entries in wordArray.
 *
 * Returns a newly allocated array of n ints that the caller must free().
 * Entry i holds the number of occurrences of wordArray[i] if that slot
 * still holds a word, and 0 if the slot is NULL. Returns NULL if wordArray
 * is NULL, n is not positive, or the allocation fails.
 */
int *countWords( char **wordArray, int n)
{
    int *countArray;
    int i;
    int j;

    if (wordArray == NULL || n <= 0)
        return NULL;

    /* calloc zero-fills, so slots that are (or become) NULL keep a count of 0 */
    countArray = calloc((size_t) n, sizeof *countArray);
    if (countArray == NULL)
        return NULL;

    for (i = 0; i < n; i++)
    {
        /* NULL here means the word was already folded into an earlier entry */
        if (wordArray[i] == NULL)
            continue;

        countArray[i] = 1;

        /* only the entries after i can still be duplicates of word i */
        for (j = i + 1; j < n; j++)
        {
            if (wordArray[j] != NULL && strcmp(wordArray[i], wordArray[j]) == 0)
            {
                countArray[i]++;
                free(wordArray[j]);
                wordArray[j] = NULL;
            }
        }
    }

    return countArray;
}
Assuming you want the return value of countWords to be an array of integers with word counts for each unique word, you need to have a double loop. One loop goes over the whole array, the second loop goes through the rest of the array (after the current word), looking for duplicates.
You could do something like this pseudo code:
Allocate the return array countArray (n integers)
Loop over all words (as you currently do in your `for i` loop)
If the word at `i` is not null // Check we haven't already deleted this word
// Found a new word
Set countArray[i] to 1
Loop through the rest of the words e.g. for (j = i + 1; j < n; j++)
If the word at j is not NULL and matches the word at i (using strcmp)
// Found a duplicate word
Increment countArray[i] (the original word's count)
// We don't want wordArray[j] anymore, so
Free wordArray[j]
Set wordArray[j] to NULL
Else
// A null indicates this was a duplicate, set the count to 0 for consistency.
Set countArray[i] to 0
Return countArray
I'm going to throw you a bit of a curve ball here.
Rather than fix your code, which can be easily fixed as it's pretty good on its own, but incomplete, I decided to write an example from scratch.
No need to read the file twice [first time just to get the maximum count]. This could be handled by a dynamic array and realloc.
The main point, I guess, is that it is much easier to ensure that word list has no duplicates while creating it, rather than removing duplicates at the end.
I opted for a few things.
I created a "word control" struct. You've got several separate arrays that are indexed the same way. That, sort of, "cries out" for a struct. That is, rather than [say] 5 separate arrays, have a single array of a struct that has 5 elements in it.
The word list is a linked list of these structs. It could be a dynamic array on the heap that gets realloced instead, but the linked list is actually easier to maintain for this particular usage.
Each struct has the [cleaned up] word text and a count of the occurrences (vs. your separate wordArray and countArray).
When adding a word, the list is scanned for an existing match. If one is found, the count is incremented, rather than creating a new word list element. That's the key to eliminating duplicates [i.e. don't create them in the first place].
Anyway, here it is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
// sysfault -- print a printf-style message and terminate with exit status 1
// NOTE: uses the standard C99 __VA_ARGS__ form instead of the GNU-only
// "named variadic" (_fmt...) syntax, so it builds with any conforming compiler.
// The do/while(0) wrapper makes the macro behave as a single statement.
#define sysfault(...) \
    do { \
        printf(__VA_ARGS__); \
        exit(1); \
    } while (0)
// word control -- one node per unique word
typedef struct word word_t;

struct word {
    word_t *next;   // following node in the singly linked list (NULL at the end)
    char *str;      // text of the word
    int count;      // how many times the word has been seen
};

// list of words -- used as a list head: the first real entry is wordlist.next
word_t wordlist;
// cleanword -- strip chaff and clean up word
//
// Copies only the alphabetic characters of src into dst, converted to lower
// case, and NUL-terminates dst. The result is never longer than src, so dst
// must have room for at least strlen(src) + 1 bytes.
void
cleanword(char *dst,const char *src)
{
    unsigned char chr;

    // NOTE: using _two_ buffers is much easier than trying to clean one
    // buffer in-place
    for (; *src != 0; ++src) {
        // the <ctype.h> functions require a value representable as unsigned
        // char (or EOF); a plain char may be negative, which is undefined
        chr = (unsigned char) *src;

        if (! isalpha(chr))
            continue;

        *dst++ = (char) tolower(chr);
    }

    *dst = 0;
}
// addword -- add unique word to list and keep count of number of words
//
// str is a raw token; it is cleaned with cleanword() first. If the cleaned
// word is already in wordlist its count is incremented, otherwise a new node
// with a count of 1 is appended to the end of the list. Tokens that contain
// no letters at all are ignored. Terminates the program via sysfault() if
// memory cannot be allocated.
void
addword(const char *str)
{
    word_t *cur;
    word_t *prev;
    char *word;

    // cleanword() only ever drops characters, so a buffer the size of the raw
    // token is always large enough (a fixed-size array would overflow on a
    // long token)
    word = malloc(strlen(str) + 1);
    if (word == NULL)
        sysfault("addword: malloc failure -- %s\n",strerror(errno));

    // get the cleaned up word
    cleanword(word,str);

    // pure punctuation/digits -- nothing left to count
    if (word[0] == 0) {
        free(word);
        return;
    }

    // find a match to a previous word [if it exists]
    prev = NULL;
    for (cur = wordlist.next; cur != NULL; cur = cur->next) {
        if (strcmp(cur->str,word) == 0)
            break;
        prev = cur;
    }

    // found a match -- just increment the count (i.e. do _not_ create a
    // duplicate that has to be removed later)
    if (cur != NULL) {
        cur->count += 1;
        free(word);
        return;
    }

    // new unique word
    cur = malloc(sizeof(*cur));
    if (cur == NULL)
        sysfault("addword: malloc failure -- %s\n",strerror(errno));
    cur->count = 1;
    cur->next = NULL;

    // the list node takes ownership of the cleaned string
    cur->str = word;

    // add the new word to the end of the list
    if (prev != NULL)
        prev->next = cur;

    // add the first word
    else
        wordlist.next = cur;
}
// main -- print the frequency of every word in the file named by argv[1]
//
// Reads the file line by line, splits each line on blanks, tabs and newlines,
// and passes every token to addword(). Afterwards prints one "word count"
// line per unique word. Returns 0 on success; a missing argument or a file
// that cannot be opened terminates the program through sysfault().
//
// NOTE: a line longer than sizeof(buf) - 1 characters is read in pieces, so a
// word that straddles such a split is counted as two words.
int
main(int argc,char **argv)
{
    FILE *xf;
    char buf[1000];
    char *cp;
    char *bp;
    word_t *cur;

    // skip over the program name
    --argc;
    ++argv;

    // without an argument *argv is NULL, and fopen(NULL,...) is undefined
    if (argc < 1)
        sysfault("main: missing file name argument\n");

    xf = fopen(*argv,"r");
    if (xf == NULL)
        sysfault("main: unable to open '%s' -- %s\n",*argv,strerror(errno));

    while (1) {
        // get next line
        cp = fgets(buf,sizeof(buf),xf);
        if (cp == NULL)
            break;

        // loop through all words on a line
        // (strtok wants the buffer on the first call and NULL afterwards)
        bp = buf;
        while (1) {
            cp = strtok(bp," \t\n");
            bp = NULL;

            if (cp == NULL)
                break;

            // add this word to the list [avoiding duplicates]
            addword(cp);
        }
    }

    fclose(xf);

    // print the words and their counts
    for (cur = wordlist.next; cur != NULL; cur = cur->next)
        printf("%s %d\n",cur->str,cur->count);

    return 0;
}
I have a problem with my code. I want to load a dictionary, which works fine with a small one. But when I try to load the larger version, my while loop stops at the 701st word, which is "acclimatization", and then the program continues. I searched a lot on forums and tried a lot of things, but I just can't find the reason for this. Does anyone have an idea of why this occurs?
Dictionary.c
bool load(const char* dictionary)
{
// reserve space for word
char* word = malloc(sizeof(char*));
// open file
FILE* dict = fopen(dictionary, "r");
if (dict == NULL)
{
fclose(dict);
fprintf(dict, "Could not load %s.\n", dictionary);
return 1;
}
root = (struct node *) malloc(sizeof(struct node));
root->is_word = false;
//Loops over word aslong the EOF is not reached
while (fgets(word,LENGTH,dict) != NULL)
{
printf("word = %s\n", word);
int word_length = strlen(word) -1;
node* current = root;
word_count++;
//Loops over letters
for (int i = 0; i < word_length; i++)
{
int index;
node *next_node;
// checks if letter isnt a apostrophe
if(word[i] == 39)
{
index = MAX_CHARS - 1;
}
// gets nummeric value of letter
else
{
index = tolower(word[i]) - 'a';
}
next_node = current->children[index];
// creates new node if letter didnt exists before
if(next_node == NULL)
{
next_node = malloc(sizeof(node));
current->children[index] = next_node;
current->is_word = false;
printf("new letter: %c\n", word[i]);
}
else
{
printf("letter: %c\n", word[i]);
}
// checks for end of the word
if(i == word_length - 1)
{
next_node->is_word = true;
}
current = next_node;
}
}
return true;
}
The node is defined by:
// node -- one trie node
typedef struct node node;

struct node
{
    bool is_word;        // true when the path to this node spells a complete word
    node* children[27];  // one child pointer per slot; NULL when absent
};
char* word = malloc(sizeof(char*));
sizeof(char*) is the size of a pointer, which is 4 or 8 bytes depending on the platform, not the size of a word buffer. You need to allocate more memory.
// Buffer that each dictionary line is read into.
char* word;
word = malloc(LENGTH); // same size that is passed to fgets(word,LENGTH,dict), so fgets cannot write past the end
if(word!=NULL){ // continue only if malloc succeeded
// your code
free(word); // release the buffer once it is no longer needed
return true;
}
else { // executed only if malloc fails
//handle error
}
You can give any desired size.
Note - every block of memory you allocate with malloc() must eventually be released with free().
You allocate very little space for word, it's probably 8 or 4 bytes depending on your platform.
You are allocating space for 1 char pointer, so when you read from the file LENGTH characters you can be storing bytes beyond the limits of the allocated buffer. The problem is, that the behavior is undefined thus the program might work or it might stop or anything can happen.
You don't need to allocate it dynamically, just like this it's ok
char word[100];
while (fgets(word, sizeof(word), file) != NULL) ...
/* ^ this only works with arrays, */
/* the benefit is that you can */
/* change the definition of word */
/* and resize it without changing */
/* this part. */
/* */
/* It will NOT work if you use `malloc()' */
Also, you would have a memory leak if fopen() fails: every malloc() requires a corresponding free().
Suggestion:
for (int i = 0; i < word_length; i++)
can be written like this too
for (int i = 0; ((word[i] != '\n') && (word[i] != '\0')); i++)
and you avoid calling strlen() which will also iterate through the characters.