Reading data from text file in C - c

I have a text file which contains words separated by space. I want to take each word from the file and store it. So i have opened the file but am unsure how to assign the word to a char.
FILE *fp;
fp = fopen("file.txt", "r");
//then i want
char one = the first word in the file
char two = the second word in the file

You cannot assign a word to a char. You can assign a single character to a char - hence the name. You can assign a word to an array of characters - such as s[128].
For instance:
char word[128];
fscanf(fp, "%s", word);
Note, in production code you cannot just use statically sized buffer, this will lead to buffer overflow exploitable code.

you can't hold a word in a char variable.It has to be a string or a char pointer that can be enlarged.
try this;
char p[10]; //assuming that a word can have most 10 characters.
FILE *fp;
fp = fopen("file.txt","r");
fscanf(fp,"%s",p);

If you want a bit more flexible - for example: by choosing the characters that identify a word - you could have a look at this:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// adjust size for your needs
#define MAX_WORD_LEN 1000
static char *parseable_characters_str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxy0123456789-";
static char parseable_characters_tbl[256] = {0}; // lookup index table, stores yes/no -> allowed/not allowed
/*
* builds the lookup table
*/
void build_lookup_index(char *table, const char *str)
{
int i;
// init table to zero
memset(table,0,256);
// set value to 1 at ASCII-code offset of the array if the character is allowed to be
// part of the word
for (i=0; str[i]; i++)
table[(unsigned char)str[i]] = 1;
}
/*
* returns unparsed bytes (kind of offset for next reading operation)
*/
int parse_buffer(char *buf, int size, const char *lookup_table)
{
int i,l,s;
char word[MAX_WORD_LEN+1];
i = 0;
l = 0;
s = 0;
while (i<size) {
// character not in lookup table -> delimiter
if (!lookup_table[buf[i]] || !buf[i]) {
if (l >= MAX_WORD_LEN) {
fprintf(stderr,"word exceeds bounds\n");
}
else if (l > 0) { // if word has at least 1 character...
// append string-terminator
word[l] = '\0';
printf("word found (%d): '%s'\n",l,word);
}
// reset word length
l = 0;
// save last word offset
s = i+1;
}
else {
// prevent buffer overflows
if (l < MAX_WORD_LEN)
word[l] = buf[i];
l++;
}
if (!buf[i])
break;
i++;
}
if (s > 0 && size-s > 0) {
// move rest of the buffer to the start for next iteration step
memmove(buf,buf+s,size-s);
return size-s;
}
return 0;
}
int main(int argc, char *argv[])
{
FILE *fh;
char buf[1000]; // read buffer
// "rb" because of Windows - we want all characters to be read
fh = fopen("out.txt","rb");
// initialize word index
build_lookup_index(parseable_characters_tbl,parseable_characters_str);
if (fh) {
int r,off = 0;
while (!feof(fh)) {
r = fread(buf+off,1,sizeof(buf)-off,fh);
off = parse_buffer(buf,r,parseable_characters_tbl);
}
fclose(fh);
}
return 0;
}

Related

Longest word in file

My program needs to print longest word which contains only letters from a file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
int checkString(const char s[]) {
unsigned char c;
while ((c = *s) && (isalpha(c) || isblank(c)))
++s;
return *s == '\0';
}
int main() {
char file_name[]="document.txt";
FILE *fp = fopen(file_name, "r");
char *largest = str;
int largest_len = 0;
while (fgets(file_name, 1000, fp) != NULL) {
char *temp = strtok(file_name, " ");
while (temp != NULL) {
if (strlen(temp) > largest_len) {
strcpy(largest, temp);
largest_len = strlen(largest);
}
temp = strtok(NULL, "\",.,1,2,4,5,6,7,8,9 ");
}
}
if(checkString(largest))
printf("%s", largest);
fclose(fp);
return 0;
}
In my code, if the largest word contains only letters it will be printed. How to modify this code to check next words if the largest doesn't contain only letters?
First of all, you cannot store the pointer to longest word like that. You re-use str for the next line and so the pointer is not likely to point to something useful.
Second, while strtok() appears simple, initially, I tend to apply a straightforward approach to a straightforward problem.
The problem is O(n) (where n is the length of the document). You just need to go through it character by character. Of course, since every line is ended by a \n, you can use the line based approach in this case.
So, instead of strtok, simply check each character, if it is a legal word character (an alphanumeric character, that is). You can easily do so with the standard library function isalpha() from header ctype.h.
Below is the program, copying the longest string into a dedicated buffer, using isalpha() and doing the line based reading of the file, just like the code in the original question did.
Of course, this code assumes, no line is ever longer than 999 characters.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <ctype.h>
static size_t gulp(const char* line, size_t istart, size_t len) {
size_t n = 0;
for (size_t i = istart; i < len; i++, n++) {
if (!isalpha(line[i])) {
break;
}
}
return n;
}
int main(int argc, const char * argv[]) {
FILE* f = fopen("document.txt","r");
char line[1000];
char longest_word[1000];
size_t longest_word_length = 0;
while (fgets(line, sizeof(line), f) != NULL) {
size_t i0 = 0;
size_t line_length = strlen(line);
while (i0 < line_length) {
if (isalpha(line[i0])) {
size_t n = gulp(line, i0, line_length);
if (n > longest_word_length) {
strncpy(longest_word, &line[i0], n);
longest_word[n] = '\0';
longest_word_length = n;
}
i0 = i0 + n;
} else {
i0++;
}
}
}
fclose(f);
f = NULL;
if (longest_word_length > 0) {
printf("longest word: %s (%lu characters)\n",
longest_word, longest_word_length);
}
return 0;
}
There are a number of problems here:
you use the same buffer (str) for two different uses: as a read buffer and to store the longest word. If you find the largest word in the first line, the word will be erased when reading the second line. Furthemore, if you find a rather long word at the beginning of a line, the strings pointed to by largest and temp could overlap which leads to undefined behaviour => use a different array or strdup (and free) for largest
you only use the space as possible separator. You should wonder whether you should add tab and/or punctuations
once you have got a word you should ensure that it only contains valid letters before testing its length and ignore it if for example it contains digits.
if a single line can be longer than 1000 characters, you should wrap the end of the current part before the beginning of the next one for the possible case where a long word would be splitted there.
For additional corner case processing, you should specify what to do if a word contains illegal characters but only at one side. For example if . is not used as a word delimiter, a word with an embedded . like "a.b" should be ignored, but a terminating . should only be stripped (like "example." should become "example"
I think the order you do things should be a bit different, here is an example
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
int isCandidate(char* word);
int main(int argc, char* argv[])
{
if (--argc == 0)
{
perror("not enough command line arguments, expecting a filename");
return -1;
}
++argv;
FILE* fp = fopen(*argv, "r");
if (fp == NULL)
{
perror(*argv);
return -1;
}
// get size of file
fseek(fp, 0L, SEEK_END);
long fileLength = ftell(fp);
if (fileLength < 1)
{
perror("file is empty");
return -1;
}
fseek(fp, 0L, SEEK_SET); // position file pointer at the beginning again
// allocate space for the whole file and then read it in
// for a text file it should be OK to do so since they
// normally are not that large.
char* buffer = malloc(fileLength+1);
if (fread(buffer, 1, fileLength, fp) != 0)
{
buffer[fileLength] = '\0'; // make sure the buffer ends with \0
}
else
{
perror("Failed reading into buffer");
return -1;
}
fclose(fp); // we are done with the file
const char filter[] = " \n\r";
char* longestWord = malloc(fileLength+1); // max length in theory
long unsigned int maxLength = 0;
for (char* token = strtok(buffer, filter); token != NULL; token = strtok(NULL, filter))
{
if (isCandidate(token))
{
if (strlen(token) > maxLength)
{
strcpy(longestWord, token);
maxLength = strlen(token);
}
}
}
printf("Longest word:'%s', len=%lu\n", longestWord, maxLength);
free(longestWord);
free(buffer);
}
int isCandidate(char* word)
{
if (word == NULL)
{
perror("invalid argument to isCandidate");
return 0;
}
for (char* ch = word; *ch; ++ch)
{
if (!isalpha(*ch)) return 0;
}
return 1;
}

Reading the words of a file into a dynamic 2D array

I am trying to read a file and store every word into a dynamically allocated 2D array. The size of the input file is unknown.
I am totally lost and don't know how I could "fix/finish" the program.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(void) {
char filename[25];
printf("Input the filename");
scanf("%s", filename);
fileConverter(filename);
}
int fileConverter(char filename[25]) {
//int maxLines = 50000;
//int maxWordSize = 128;
//char words[maxLines][maxWordSize];
//char **words;
char **arr = (char**) calloc(num_elements, sizeof(char*));
for ( i = 0; i < num_elements; i++ ) {
arr[i] = (char*) calloc(num_elements_sub, sizeof(char));
}
FILE *file = NULL;
int amountOfWords = 0;
file = fopen(filename, "r");
if(file == NULL) {
exit(0);
}
while(fgets(words[amountOfWords], 10000, file)) {
words[amountOfWords][strlen(words[amountOfWords]) - 1] = "\0";
amountOfWords++;
}
for(int i = 0; i < amountOfWords; i++) {
printf("a[%d] = ", i);
printf("%s\n", words[i]);
}
printf("The file contains %d words and the same amount of lines.\n", amountOfWords);
return amountOfWords;
The main challenges for this kind of problem are
reallocating the array of strings as the program reads new words, and
handling words that are larger than the buffer used by fgets.
The general approach for these kind of parsing problems, is to design a state machine. The state machine here has two states:
The current character is whitespace. Action: Continue reading whitespace until we reach the end of the buffer, or until we land on a non-whitespace character, in which case we switch to state 2.
The current character is non-whitespace (i.e. a word). Action: Continue reading non-whitespace until we reach the end of the buffer, or until we land on a whitespace character, in which case we copy the word we just read to the array of strings and switch to state 1.
Particularly difficult is the case in which we are in state 2 and reach the end of the buffer. This means that this word spans multiple buffers. To accommodate for this, we deviate slightly from a direct state machine implementation. State 2 is slightly different, depending on if we are reading a new word or continuing one that was started in a previous buffer.
We now keep track of wordSize. If we start reading from the start of a buffer, but wordSize is not 0, then we know we are continuing a previous word and we know what size it was for the realloc we need.
Below is one possible implementation. All the work is done in the wordArrayRead function. Walking through it from the top of the function:
First we declare the variables that we need across lineBuffer reads: an index for the word itself and the length of the word we are currently reading, followed by the declaration of the buffer itself. The outside loop repeatedly reads using fgets until we have exhausted the input.
We start reading at index 0 and stop at the null-terminator. The first if-statement checks if we should be in state 2: either the current character is the start of a word or we were already reading a word.
State 2
The index wordStartIdx stays at the first character of the word (segment) and we walk the wordEndIdx to the end of the word (segment) or to the end of the buffer.
We then check if we need to increase the size of the array of strings. Here we increase it to 2 times + 1 the previous size to avoid frequent reallocations.
We set a boolean value, indication whether we have reached the end of a word. If we have, we need to allocate for and write the null-terminator at the end of the string.
If wordLength == 0 it means we are reading a new word and have to allocate memory for it for the first time. If wordLength != 0, we have to reallocate to append to an existing word.
We copy the word (segment) currently in the lineBuffer to the array of strings.
Now, we do some bookkeeping. If we reached the end of a word, we write the null-terminator, increment the index to point to the next word location and reset wordLength. If this wasn't the case, we only increment the wordLength with the length of the segment we just read. Finally, we update wordStartIdx, which still points to the start of the word, to point to the end of the word, so we can continue iterating over the buffer.
State 1
Having finishing the State 2 processing, we go into State 1 which has only two lines. It simply advances the index until we land at non-whitespace. Note that the null-terminator of the lineBuffer ('\0') does not count as whitespace, so this loop will not continue past the end of the buffer.
After all input has been processed, we shrink the array of strings to the actual size of its data. This "corrects" the allocation policy of increasing the size by 2n+1 each time it wasn't large enough.
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// BUFFER_SIZE must be >1U
#define BUFFER_SIZE 1024U
struct WordArray
{
char **words;
size_t numberOfWords;
};
static struct WordArray wordArrayConstruct(void);
static void wordArrayResize(struct WordArray *wordArray, size_t const newSize);
static void wordArrayDestruct(struct WordArray *wordArray);
static void wordArrayRead(FILE *restrict stream, struct WordArray *wordArray);
static char *reallocStringWrapper(char *restrict str, size_t const newSize);
static void wordArrayPrint(struct WordArray const *wordArray);
int main(void)
{
struct WordArray wordArray = wordArrayConstruct();
wordArrayRead(stdin, &wordArray);
wordArrayPrint(&wordArray);
wordArrayDestruct(&wordArray);
}
static void wordArrayRead(FILE *restrict stream, struct WordArray *wordArray)
{
size_t wordArrayIdx = 0U;
size_t wordLength = 0U;
char lineBuffer[BUFFER_SIZE];
while (fgets(lineBuffer, sizeof lineBuffer, stream) != NULL)
{
size_t wordStartIdx = 0U;
while (lineBuffer[wordStartIdx] != '\0')
{
if (!isspace(lineBuffer[wordStartIdx]) || wordLength != 0U)
{
size_t wordEndIdx = wordStartIdx;
while (!isspace(lineBuffer[wordEndIdx]) && wordEndIdx != BUFFER_SIZE - 1U)
++wordEndIdx;
if (wordArrayIdx >= wordArray->numberOfWords)
wordArrayResize(wordArray, wordArray->numberOfWords * 2U + 1U);
size_t wordSegmentLength = wordEndIdx - wordStartIdx;
size_t foundWordEnd = wordEndIdx != BUFFER_SIZE - 1U; // 0 or 1 bool
// Allocate for a new word, or reallocate for an existing word
// If a word end was found, add 1 to the size for the '\0' character
char *dest = wordLength == 0U ? NULL : wordArray->words[wordArrayIdx];
size_t allocSize = wordLength + wordSegmentLength + foundWordEnd;
wordArray->words[wordArrayIdx] = reallocStringWrapper(dest, allocSize);
memcpy(&(wordArray->words[wordArrayIdx][wordLength]),
&lineBuffer[wordStartIdx], wordSegmentLength);
if (foundWordEnd)
{
wordArray->words[wordArrayIdx][wordLength + wordSegmentLength] = '\0';
++wordArrayIdx;
wordLength = 0U;
}
else
{
wordLength += wordSegmentLength;
}
wordStartIdx = wordEndIdx;
}
while (isspace(lineBuffer[wordStartIdx]))
++wordStartIdx;
}
}
// All done. Shrink the words array to the size of the actual data
if (wordArray->numberOfWords != 0U)
wordArrayResize(wordArray, wordArrayIdx);
}
static struct WordArray wordArrayConstruct(void)
{
return (struct WordArray) {.words = NULL, .numberOfWords = 0U};
}
static void wordArrayResize(struct WordArray *wordArray, size_t const newSize)
{
assert(newSize > 0U);
char **tmp = (char**) realloc(wordArray->words, newSize * sizeof *wordArray->words);
if (tmp == NULL)
{
wordArrayDestruct(wordArray);
fprintf(stderr, "WordArray allocation error\n");
exit(EXIT_FAILURE);
}
wordArray->words = tmp;
wordArray->numberOfWords = newSize;
}
static void wordArrayDestruct(struct WordArray *wordArray)
{
for (size_t wordStartIdx = 0U; wordStartIdx < wordArray->numberOfWords; ++wordStartIdx)
{
free(wordArray->words[wordStartIdx]);
wordArray->words[wordStartIdx] = NULL;
}
free(wordArray->words);
}
static char *reallocStringWrapper(char *restrict str, size_t const newSize)
{
char *tmp = (char*) realloc(str, newSize);
if (tmp == NULL)
{
free(str);
fprintf(stderr, "Realloc string allocation error\n");
exit(EXIT_FAILURE);
}
return tmp;
}
static void wordArrayPrint(struct WordArray const *wordArray)
{
for (size_t wordStartIdx = 0U; wordStartIdx < wordArray->numberOfWords; ++wordStartIdx)
printf("%zu: %s\n", wordStartIdx, wordArray->words[wordStartIdx]);
}
Note: This program reads input from stdin, as Unix/Linux utilities typically do. Use input redirection to read from a file, or provide a file descriptor to the readWordArray function.
to allocate dynamic 2D array you need:
void allocChar2Darray(size_t rows, size_t columns, char (**array)[columns])
{
*array = malloc(rows * sizeof(**array));
}

i want to determine the two-letter combination that occurs most often in my code

i have a file (for example lorem.txt)
so now i have two task ,
was to read file and remove space and lines.
to determine the two letter combination that occurs most often in my file.
i have done the first part. can anyone tell me what can i do to 2. part? I tried various solutions, but was unable to do it. Let me know, if I can provide anything else.
#include <stdio.h>
#include <stdlib.h> // For exit()
FILE* pInputFile;
int chr = 0;
int main()
{
FILE* fptr;
char c;
char filename[] = "Lorem.txt";
char *str, *strblank;
// char str[469], strblank[469];
int i = 0;
//int x=500;
errno_t err;
if ((err = fopen_s(&pInputFile, "Lorem.txt", "r")) == 0)
{
size_t pos = ftell(pInputFile); // Current position
fseek(pInputFile, 0, SEEK_END); // Go to end
size_t length = ftell(pInputFile); // read the position which is the size
fseek(pInputFile, pos, SEEK_SET); // restore original position
//x = length;
//char str[x], strblank[x];
printf("%d", length);
str = malloc(length * sizeof(char));
strblank = malloc(length * sizeof(char));
// file exists: don't read a char before the loop or
// it will be lost
while ((chr = getc(pInputFile)) != EOF)
{
//printf("%c", chr); //This line prints the file lines.
str[i] = chr;
i++;
}
i= 0;
printf("%s", str);
removespace(str, strblank);
printf("%s", strblank);
fclose(pInputFile);
}
else
{
fprintf(stderr, "Cannot open file, error %d\n", err);
// handle the error further if needed
}
return 0;
}
int removespace(char aj[500],char mj[500])
{
//char mj[500];
int i = 0, j = 0, len;
//printf("\n\nEnter the string: ");
//gets(aj);
len = strlen(aj); // len stores the length of the input string
while (aj[i] != '\0') // till string doesn't terminate
{
if (aj[i] != ' ' && aj[i] != '\n') // if the char is not a white space
{
/*
incrementing index j only when
the char is not space
*/
mj[j++] = aj[i];
}
/*
i is the index of the actual string and
is incremented irrespective of the spaces
*/
i++;
}
mj[j] = '\0';
// printf("\n\nThe string after removing all the spaces is: ");
return 0;
}

ascii file processing in C

I have a hard time understanding how you process ascii files in c. I have no problem opening files and closing them or reading files with one value on each line. However, when the data is separated with characters, I really don't understand what the code is doing at a lower level.
Example: I have a file containing names separated with comas that looks like this:
"MARY","PATRICIA","LINDA","BARBARA","ELIZABETH","JENNIFER"
I have created an array to store them:
char names[6000][20];
And now, my code to process it is while (fscanf(data, "\"%s\",", names[index]) != EOF) { index++; }
The code executes for the 1st iteration and names[0] contains the whole file.
How can I separate all the names?
Here is the full code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main() {
char names[6000][20]; // an array to store 6k names of max length 19
FILE * data = fopen("./022names.txt", "r");
int index = 0;
int nbNames;
while (fscanf(data, "\"%s\",", names[index]) != EOF) {
index++;
}
nbNames = index;
fclose(data);
printf("%d\n", index);
for (index=0; index<nbNames; index++) {
printf("%s \n", names[index]);
}
printf("\n");
return 0;
}
PS: I am thinking this might also be because of the data structure of my array.
If you want a simple solution, you can read the file character by character using fgetc. Since there are no newlines in the file, just ignore quotation marks and move to the next index when you find a comma.
char names[6000][20]; // an array to store 6k names of max length 19
FILE * data = fopen("./022names.txt", "r");
int name_count = 0, current_name_ind = 0;
int c;
while ((c = fgetc(data)) != EOF) {
if (c == ',') {
names[name_count][current_name_ind] = '\0';
current_name_ind = 0;
++name_count;
} else if (c != '"') {
names[name_count][current_name_ind] = c;
++current_name_ind;
}
}
names[name_count][current_name_ind] = '\0';
fclose(data);
"The code executes for the 1st iteration and names[0] contains the whole file...., How can I separate all the names?"
Regarding the first few statements:
char names[6000][20]; // an array to store 6k names of max length 19
FILE * data = fopen("./022names.txt", "r");
What if there are there are 6001 names. Or one of the names has more than 20 characters?
Or what if there are way less than 6000 names?
The point is that with some effort to enumerate the tasks you have listed, and some time mapping out what information is needed to create the code that matches your criteria, you can create a better product: The following is derived from your post:
Process ascii files in c
Read file content that is separated by characters
input is a comma separated file, with other delimiters as well
Choose a method best suited to parse a file of variable size
As mentioned in the comments under your question there are ways to create your algorithms in such way as to flexibly allow for extra long names, or for a variable number of names. This can be done using a few C standard functions commonly used in parsing files. ( Although fscanf() has it place, it is not the best option for parsing file contents into array elements.)
The following approach performs the following steps to accomplish the user needs enumerated above
Read file to determine number of, and longest element
Create array sized to contain exact contents of file using count of elements and longest element using variable length array (VLA)
Create function to parse file contents into array. (using this technique of passing VLA as function argument.)
Following is a complete example of how to implement each of these, while breaking the tasks into functions when appropriate...
Note, code below was tested using the following input file:
names.txt
"MARY","PATRICIA","LINDA","BARBARA","ELIZABETH","JENNIFER",
"Joseph","Bart","Daniel","Stephan","Karen","Beth","Marcia",
"Calmazzothoulumus"
.
//Prototypes
int count_names(const char *filename, size_t *count);
size_t filesize(const char *fn);
void populateNames(const char *fn, int longest, char arr[][longest]);
char *filename = ".\\names.txt";
int main(void)
{
size_t count = 0;
int longest = count_names(filename, &count);
char names[count][longest+1];//VLA - See linked info
// +1 is room for null termination
memset(names, 0, sizeof names);
populateNames(filename, longest+1, names);
return 0;
}
//populate VLA with names in file
void populateNames(const char *fn, int longest, char names[][longest])
{
char line[80] = {0};
char *delim = "\",\n ";
char *tok = NULL;
FILE * fp = fopen(fn, "r");
if(fp)
{
int i=0;
while(fgets(line, sizeof line, fp))
{
tok = strtok(line, delim);
while(tok)
{
strcpy(names[i], tok);
tok = strtok(NULL, delim);
i++;
}
}
fclose(fp);
}
}
//passes back count of tokens in file, and return longest token
int count_names(const char *filename, size_t *count)
{
int len=0, lenKeep = 0;
FILE *fp = fopen(filename, "r");
if(fp)
{
char *tok = NULL;
char *delim = "\",\n ";
int cnt = 0;
size_t fSize = filesize(filename);
char *buf = calloc(fSize, 1);
while(fgets(buf, fSize, fp)) //goes to newline for each get
{
tok = strtok(buf, delim);
while(tok)
{
cnt++;
len = strlen(tok);
if(lenKeep < len) lenKeep = len;
tok = strtok(NULL, delim);
}
}
*count = cnt;
fclose(fp);
free(buf);
}
return lenKeep;
}
//return file size in bytes (binary read)
size_t filesize(const char *fn)
{
size_t size = 0;
FILE*fp = fopen(fn, "rb");
if(fp)
{
fseek(fp, 0, SEEK_END);
size = ftell(fp);
fseek(fp, 0, SEEK_SET);
fclose(fp);
}
return size;
}
You can use the in-built strtok() function which is easy to use.
I have used the tok+1 instead of tok to omit the first " and strlen(tok) - 2 to omit the last ".
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main() {
char names[6000][20]; // an array to store 6k names of max length 19
FILE * data = fopen("./022names.txt", "r");
int index = 0;
int nbNames;
char *str = (char*)malloc(120000*sizeof(char));
while (fscanf(data, "%s", str) != EOF) {
char *tok = strtok(str, ",");
while(tok != 0){
strncpy(names[index++], tok+1, strlen(tok)-2);
tok = strtok(0, ",");
}
}
nbNames = index;
fclose(data);
free(str); // just to free the memory occupied by the str variable in the heap.
printf("%d\n", index);
for (index=0; index<nbNames; index++) {
printf("%s \n", names[index]);
}
printf("\n");
return 0;
}
Also, the parameter 120000 is just the maximum number of characters that can be in the file. It is just 6000 * 20 as you mentioned.

recording of each word in a text file in c

I am trying to build a function that will check if the word is in a list of words, if it is, it will increment the corresponding counter for the frequency of that word. Otherwise, it will create a copy of the
word and append it to the list. Then set the corresponding frequency counter to 1.
I get no compiler errors but when I attempt to print the frequency of any word I get a number in the 2 millions and I have no idea why.
I am given a main file I cannot modify:
#include <stdlib.h>
#include <string.h>
#define MAX_WORDS 300
#define LINE_LEN 80
void increment_word_freq(char *freq_words[MAX_WORDS], int *frequency, int *n, char *word);
int main(){
char delim[] = " ,.!-;\"\n";
char filename[] = "cookbook.txt";
char line[LINE_LEN];
char *word;
char *freq_words[MAX_WORDS]; // a list of frequent words
int frequency[MAX_WORDS]; // frequency of the words
int n = 0; // number of words in the list
int min_occr;
FILE *fp;
fp = fopen(filename, "r");
if(!fp){
printf("Could not open file %s\n", filename);
exit(1);
}
// read one line at a time
while(fgets(line, LINE_LEN, fp)){
// get the words from the line
word = strtok(line, delim);
while(word != NULL) {
// convert the word to lowercase
int i;
for(i = 0; i < strlen(word); i++)
word[i] = tolower(word[i]);
increment_word_freq(freq_words, frequency, &n, word);
word = strtok(NULL,delim);
}
}
}
this is the function I am attempting to use:
void increment_word_freq(char *freq_words[MAX_WORDS], int *frequency, int *n, char *word){
for(int i=0; i<MAX_WORDS; i++){
if(freq_words[i] == word){
frequency[i]++;
break;
}
else if(i=MAX_WORDS-1){
frequency[i]= *word;
*n++;
}
}
}
like I said before, no compiler errors but attempting to print the frequency of any word will give a number in the 2 millions and I have no idea why.
Any and all help and advice is greatly appreciated!
freq_words[i] == word only compares the pionter freq_words[i] with the pointer word. You have to campare the strings the pointers refer to. Change your code to strcmp(freq_words[i], word) == 0. Apart from this you have to allocate dynamic memory to stroe your strings. Use strcpy to copy a string int the dynamic memory. You have to do so, because word is a pointer to a char somewhere in line, but line will be overwritten if you read the next line of the file. Adapt your code like this:
#include <string.h> // strcmp, strcpy
void increment_word_freq( char *freq_words[MAX_WORDS], int *frequency, int *n, char *word)
{
for ( int i=0; i < *n; i++) // for all current members of freq_words
{
if ( strcmp( freq_words[i], word ) == 0 ) // test if word is member of freq_words
{
frequency[i]++; // increment count
return; // finished, because word was found
}
}
// word was not found in freq_words => add new word to freq_words
if ( *n < MAX_WORDS-1 ) // test if there is one more place in freq_words
{
freq_words[*n] = malloc( strlen(word) + 1 ); // allocate dynamic memory for new meber of freq_words
strcpy( freq_words[*n], word ); // copy word to freq_words[*n]
frequency[*n] = 1; // int frequency[*n] with 1
(*n)++; // increment count of members of freq_words
}
}
Note you have to free the allocated memory at the end of main, otherwise you have memory leaks.
for ( int i=0; i < *n; i++)
{
free( freq_words[i] );
}

Resources