C Word Occurrence - c

I've perused all of the related questions to quantifying the occurrence of a word in a text file and I'm still unable to get this to execute and count the words. I don't get an error but the output that is returned is:
File opened
character count = 0 4195872
There are exactly 6 words "CORRESPONDENCE" in "file.txt", I just need the counter to return the value 6. Any help would be greatly greatly appreciated.
#include <stdio.h>
#include <stdlib.h>
int main(){
int counter;
int ch;
int str;
FILE *input;
input = fopen ("file.txt", "r");
if (input == NULL)
{
printf("File failed to open \n");
}
else
{
printf("File opened \n");
do {
str = fgetc(input);
// if ((char*)str == keyword) counter++;
if (str == ch)counter++;
} while (str != EOF);
printf("character count = %i %i\n", counter, ch );
fclose(input);
}
return 0;
}

You didn't initialize your ch and counter variable. So they can be of any surprising values.
So it is not bizarre for you to get result 4194872.
Another critical error you make is that your program never actually get to count the occurrence of the word "correspondence". In your program, the variable str can only represent one alphabet, but not the word. So when you use str == ch to test if the word "correspondence" exist, you are actually testing if the letter equal to c.(And that is only true if you have correctly initialized your str variable. For clarity, it is better to declare str as a char type instead of type int).
The correct way to compare between words is to compare their letters one by one. If any letter doesn't fit, then return false.
Read this code to understand the comparing process.
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char **argv)
{
FILE * file = fopen("file.txt","r");
if(file == NULL) {printf("File not open"); return -1;}
char* str ="correspondence"; //Declare str as an array and initiate it with your word
char input[50]; //Declare an char array long enough to hold ordinary words
int i;
int count = 0;
while(fscanf(file,"%s",input) != EOF)
{
for(i = 0;i < strlen(str) && i < strlen(input) ;++ i)
{
if(input[i] != str[i]) break;
else i++;
}
if(i >= strlen(str) && strlen(str) == strlen(input))
count++;
memset(input,0,sizeof(input));
}
printf("word count = %d",count);
return 0;
}

You need to initialize ch before comparing.

Related

Longest word in file

My program needs to print longest word which contains only letters from a file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
int checkString(const char s[]) {
unsigned char c;
while ((c = *s) && (isalpha(c) || isblank(c)))
++s;
return *s == '\0';
}
int main() {
char file_name[]="document.txt";
FILE *fp = fopen(file_name, "r");
char *largest = str;
int largest_len = 0;
while (fgets(file_name, 1000, fp) != NULL) {
char *temp = strtok(file_name, " ");
while (temp != NULL) {
if (strlen(temp) > largest_len) {
strcpy(largest, temp);
largest_len = strlen(largest);
}
temp = strtok(NULL, "\",.,1,2,4,5,6,7,8,9 ");
}
}
if(checkString(largest))
printf("%s", largest);
fclose(fp);
return 0;
}
In my code, if the largest word contains only letters it will be printed. How to modify this code to check next words if the largest doesn't contain only letters?
First of all, you cannot store the pointer to longest word like that. You re-use str for the next line and so the pointer is not likely to point to something useful.
Second, while strtok() appears simple, initially, I tend to apply a straightforward approach to a straightforward problem.
The problem is O(n) (where n is the length of the document). You just need to go through it character by character. Of course, since every line is ended by a \n, you can use the line based approach in this case.
So, instead of strtok, simply check each character, if it is a legal word character (an alphanumeric character, that is). You can easily do so with the standard library function isalpha() from header ctype.h.
Below is the program, copying the longest string into a dedicated buffer, using isalpha() and doing the line based reading of the file, just like the code in the original question did.
Of course, this code assumes, no line is ever longer than 999 characters.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <ctype.h>
static size_t gulp(const char* line, size_t istart, size_t len) {
size_t n = 0;
for (size_t i = istart; i < len; i++, n++) {
if (!isalpha(line[i])) {
break;
}
}
return n;
}
int main(int argc, const char * argv[]) {
FILE* f = fopen("document.txt","r");
char line[1000];
char longest_word[1000];
size_t longest_word_length = 0;
while (fgets(line, sizeof(line), f) != NULL) {
size_t i0 = 0;
size_t line_length = strlen(line);
while (i0 < line_length) {
if (isalpha(line[i0])) {
size_t n = gulp(line, i0, line_length);
if (n > longest_word_length) {
strncpy(longest_word, &line[i0], n);
longest_word[n] = '\0';
longest_word_length = n;
}
i0 = i0 + n;
} else {
i0++;
}
}
}
fclose(f);
f = NULL;
if (longest_word_length > 0) {
printf("longest word: %s (%lu characters)\n",
longest_word, longest_word_length);
}
return 0;
}
There are a number of problems here:
you use the same buffer (str) for two different uses: as a read buffer and to store the longest word. If you find the largest word in the first line, the word will be erased when reading the second line. Furthemore, if you find a rather long word at the beginning of a line, the strings pointed to by largest and temp could overlap which leads to undefined behaviour => use a different array or strdup (and free) for largest
you only use the space as possible separator. You should wonder whether you should add tab and/or punctuations
once you have got a word you should ensure that it only contains valid letters before testing its length and ignore it if for example it contains digits.
if a single line can be longer than 1000 characters, you should wrap the end of the current part before the beginning of the next one for the possible case where a long word would be splitted there.
For additional corner case processing, you should specify what to do if a word contains illegal characters but only at one side. For example if . is not used as a word delimiter, a word with an embedded . like "a.b" should be ignored, but a terminating . should only be stripped (like "example." should become "example"
I think the order you do things should be a bit different, here is an example
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
int isCandidate(char* word);
int main(int argc, char* argv[])
{
if (--argc == 0)
{
perror("not enough command line arguments, expecting a filename");
return -1;
}
++argv;
FILE* fp = fopen(*argv, "r");
if (fp == NULL)
{
perror(*argv);
return -1;
}
// get size of file
fseek(fp, 0L, SEEK_END);
long fileLength = ftell(fp);
if (fileLength < 1)
{
perror("file is empty");
return -1;
}
fseek(fp, 0L, SEEK_SET); // position file pointer at the beginning again
// allocate space for the whole file and then read it in
// for a text file it should be OK to do so since they
// normally are not that large.
char* buffer = malloc(fileLength+1);
if (fread(buffer, 1, fileLength, fp) != 0)
{
buffer[fileLength] = '\0'; // make sure the buffer ends with \0
}
else
{
perror("Failed reading into buffer");
return -1;
}
fclose(fp); // we are done with the file
const char filter[] = " \n\r";
char* longestWord = malloc(fileLength+1); // max length in theory
long unsigned int maxLength = 0;
for (char* token = strtok(buffer, filter); token != NULL; token = strtok(NULL, filter))
{
if (isCandidate(token))
{
if (strlen(token) > maxLength)
{
strcpy(longestWord, token);
maxLength = strlen(token);
}
}
}
printf("Longest word:'%s', len=%lu\n", longestWord, maxLength);
free(longestWord);
free(buffer);
}
int isCandidate(char* word)
{
if (word == NULL)
{
perror("invalid argument to isCandidate");
return 0;
}
for (char* ch = word; *ch; ++ch)
{
if (!isalpha(*ch)) return 0;
}
return 1;
}

How to get ASCII code for characters from a text file?

Update, Hello guys Thank you all for the help, my initial approach was wrong and I did not use ASCII codes at all.
Sorry for the late replay I had a half-day off today and made a new post for the complete code
there is no errors but the prgram is not working proberly ( this is an update of old post )
I wrote the program, and it is working with no errors But it is not giving me the results I wanted
My only problem is when I read a character how to check its ASCII and store it.
#include <stdio.h>
#include <string.h>
int main()
{
char dictionary[300];
char ch, temp1, temp2;
FILE *test;
test=fopen("HW2.txt","r");
for(int i=0;i<2000;i+=1)
{ ch=fgetc(test);
printf("%c",ch);
}
}
If we are talking about plain ASCII, values goes from 0 to 127, your table shoud look like:
int dictionary[128] = {0};
Regarding your question:
how to check its ASCII and store it
Consider a char being a tiny int, they are interchangeable and you don't need any conversion.
fgetc wants an int in order to handle EOF, and trying to read 2000 characters from a file containing less than 2000 bytes can have very bad consequences, to read the whole file:
int c;
while ((c = fgetc(test)) != EOF)
{
if ((c > 0) && (c < 128))
{
dictionary[c]++;
}
}
for (int i = 1; i < 128; i++)
{
if (dictionary[i] > 0)
{
printf("%c appeared %d times\n", i, dictionary[i]);
}
}
EDIT:
Rereading, I see that you want to store words, not chars, ok, then it's a bit more difficult but nothing terrible, do not limit yourself to 300 words, use dynamic memory:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// A struct to hold the words and the
// number of times it appears
struct words
{
size_t count;
char *word;
};
int main(void)
{
FILE *file;
file = fopen("HW2.txt", "r");
// Always check the result of fopen
if (file == NULL)
{
perror("fopen");
exit(EXIT_FAILURE);
}
struct words *words = NULL;
size_t nwords = 0;
char *word = NULL;
size_t nchars = 1;
size_t i;
int c;
// while there is text to scan
while ((c = fgetc(file)) != EOF)
{
if (isspace(c))
{
if (word != NULL)
{
// Search the word in the table
for (i = 0; i < nwords; i++)
{
// Found, increment the counter
if (strcmp(word, words[i].word) == 0)
{
words[i].count++;
free(word);
break;
}
}
// Not found, add the word to the table
if (i == nwords)
{
struct words *temp;
temp = realloc(words, sizeof(*temp) * (nwords + 1));
if (temp == NULL)
{
perror("realloc");
exit(EXIT_FAILURE);
}
words = temp;
words[nwords].word = word;
words[nwords].count = 1;
nwords++;
}
// Prepare the next word
word = NULL;
nchars = 1;
}
}
else
{
char *temp;
temp = realloc(word, nchars + 1);
if (temp == NULL)
{
perror("realloc");
exit(EXIT_FAILURE);
}
word = temp;
word[nchars - 1] = (char)c;
word[nchars++] = '\0';
}
}
for (i = 0; i < nwords; i++)
{
printf("%s appeared %zu times\n", words[i].word, words[i].count);
free(words[i].word);
}
free(words);
fclose(file);
return 0;
}
In C, characters are, essentially, their ASCII code (or rather, their char or unsigned char value). So once you read a character, you have its ASCII code already.
However, fgetc() doesn't always return the character it read for you; it may fail, for which reason it returns an int, not an unsigned char, which will be -1 in case of failure.
So:
You need to define an int variable to take the result of fgetc().
If it's not EOF, you can cast the result back into a unsigned char. That's your character, and it's ASCII value, at the same time.
PS - I'm ignoring non-ASCII characters, non-Latin languages etc. (But C mostly ignores them in its basic standard library functions too.)

Selecting lines which start with a string

I want to use this function to show lines from a file that starts with a string.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include "RangeSelection.h"
bool RangeSelection(char * buffer, int len, char * argv[])
{
int i = 0, j, c;
char word[4];
bool is_break;
while((c = getchar()) != EOF)
{
is_break = false; //a variable for checking if instruction break was used
for(j = 0; j < 3; ++j) //load three first chars
{
word[j] = c;
c = getchar(); //to load chars into c
if(c == '\0' || c == EOF) //if there's less than 3 chars in this word, don't check it
{
is_break = true;
break;
} //if that's EOF or new line, break
}
if(!is_break)
{
word[3] = '\0';
}
if(strcmp(word, argv) == 0) //they're the same
{
printf("%c", word[0]);
printf("%c", word[1]);
printf("%c", word[2]);
do //do while as it will show the output at least three times
{
printf("%c", c); //print that char
}
while((c = getchar()) != '\0' && c != EOF);
}
else
{
while((c = getchar()) != '\0' && c != EOF) //load chars until a new line
{
;
}
}
}
return false;
}
But it doesn't work as I want to. Run with parameter "xxx" it has to show only lines starting with "xxx". For this input:
xxx
dasfdad
xxx works
x
it should print only:
xxx
xxx works
while it prints:
xxx
dasfdad
xxx works
x
However, while the input is like this one:
dasfdad
xxx works
x
It prints nothing although the program ought to show
xxx works
Char * argv[] is an argument provided by the user while running the program in a console(for example xxx). Thanks in advance.
Use memcmp() instead of strcmp() to make life much easier for yourself. The parameter len is never used in the RangeSelection() function. This should be removed unless it is needed for some future development. Unless the string indicated by buffer will be changed in the function, it would be good to use const char *buffer in the parameter list.
Also note that char *argv[] is converted to a pointer to a pointer to char in the function parameter list; I don't think that this is what is expected in the posted code. The function strcmp() expects pointers to char (pointers to the first elements of null-terminated character arrays), so the expression strcmp(word, argv) leads to undefined behavior due to type mismatch in the function call.
Here is an example program:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
bool filter_line(const char * buffer, const char *tag)
{
bool is_tagged;
size_t tag_sz = strlen(tag);
if (strlen(buffer) < tag_sz) {
is_tagged = false;
} else {
is_tagged = !memcmp(buffer, tag, tag_sz);
}
return is_tagged;
}
int main(int argc, char *argv[])
{
if (argc < 2) {
fprintf(stderr, "Usage: %s line_tag\n", argv[0]);
exit(EXIT_FAILURE);
}
const char *filter_tag = argv[1];
char line[4096];
while (fgets(line, sizeof line, stdin) != NULL) {
if (filter_line(line, filter_tag)) {
printf("%s", line);
}
}
return 0;
}
Sample interactions using text files as input:
test_filter_line.txt
xxx
dasfdad
xxx works
x
abc xxx
xxx
λ> ./line_filter xxx < test_filter_line.txt
xxx
xxx works
test_filter_line2.txt:
dasfdad
xxx works
x
λ> ./line_filter xxx < test_filter_line2.txt
xxx works
Your program prints the whole file when it starts with given pattern and prints nothing if it doesn't because you are not matching newline character \n at any time whereas you want to read your file line by line.
I would suggest you split the job into separate functions. This is how I would do it in C99 :
int starts_with(char *buf, char *pattern)
{
char *p = pattern;
for (int i = 0; i < strlen(pattern); i++)
{
if (!buf[i] || buf[i] != pattern[i])
return 0;
}
return 1;
}
Now you can store each line inside a buffer within your RangeSelection function and check if each buffer start with your pattern

Reading in a line from file or stdin dynamically

I am posed with a situation where my function does exactly what I want except handle higher amounts of input.
I initially thought to process each character one by one but was running into problems doing this. So fscanf not only does what I want it to do but it is essential in reading in only one line. I noticed, I cannot reallocate space for bigger array this way though. I have tried using format specifiers i.e. %*s to include a specific amount of buffer space before hand but this still does not work.
I have noticed also, I would have no way of knowing the size of the string I am reading in.
Here is my attempt and thoughts:
#define LINE_MAX 1000
char* getline(FILE* inputStream)
{
int capacity = LINE_MAX;
char* line = malloc(capacity * sizeof(char));
int ch;
/* if (sizeof(capacity) == sizeof(line)) { // Not a valid comparison? Too late?
capacity *= 2;
line = realloc(line, capacity * sizeof(line));
} */
if (fscanf(stream, "%[^\n]s", line) == 1) {
ch = fgetc(inputStream);
if (ch != '\n' && ch != EOF) {
fscanf(inputStream, "%*[^\n]");
fscanf(inputStream, "%*c");
}
free(line);
return line;
}
free(line);
return NULL;
}
I am new to memory allocation in general but I feel as though I had a good idea of what to do here. Turns out I was wrong.
Here is an example to read a line and store it in a Character array.
#include <stdio.h>
#include <stdlib.h>
int main(void) {
signed char *str;
int c;
int i;
int size = 10;
str = malloc(size*sizeof(char));
for(i=0;(c=getchar()) !='\n' && c != EOF;++i){
if( i == size){
size = 2*size;
str = realloc(str, size*sizeof(char));
if(str == NULL){
printf("Error Unable to Grow String! :(");
exit(-1);
}
}
str[i] = c;
}
if(i == size){
str = realloc(str, (size+1)*sizeof(char));
if(str == NULL){
printf("Error Unable to Grow String! :(");
exit(-1);
}
}
str[i] = '\0';
printf("My String : %s", str);
return 0;
}
The array is resized to twice it's original size if current array can't hold the characters read from input.

Find Nth Word given a file of Strings and ints C

I need to find the Nth word in a string which is given through standard input through redirection operators in Unix.
Input is something along these lines:
But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born
5
The European languages are members of the same family.
3
Can anyone give me any idea as to how to read in the string into a char array and then get the int and use it to find the given word? I've been at it for a while and can't get it to work properly.
#define INPUT_LENGTH 400
int main(void)
{
char input[INPUT_LENGTH];
char integer[INPUT_LENGTH];
int spaces = 0;
int value;
char n;
while(fgets(input, INPUT_LENGTH, stdin)) //read in string line
{
while(fgets(integer, INPUT_LENGTH,stdin)) //read in int
{
int num = sscanf(integer, "%d", &value); //assign int val to num
while(1 == sscanf(input, "%c", &n)) //go through string one char at a time
if(spaces == num && !isspace(n))
printf("%c", n); //print chars if we've reached the word
else if(isspace(n))
spaces++;
}
}
}
I've redone most of it with the comments in mind but still can't seem to have it actually reading in the input through the operator unfortunately.
I'm not certain but I don't think my fgets are correct. I'm rather new to C and am not entirely certain how they process the data even after research
Use strtok like this
#include <stdio.h>
#include <string.h>
#define INPUT_LENGTH 400
int main(void){
char input[INPUT_LENGTH];
char integer[INPUT_LENGTH];
int value;
while(fgets(input, sizeof input, stdin)) //read in string line
{
if(fgets(integer, sizeof integer, stdin)) //read in int
{
if(1==sscanf(integer, "%d", &value)) //assign int value to value
{
char *word = strtok(input, " \t\n");
int n;
for(n = 1; word != NULL && n < value; ++n){// 1 origin
word = strtok(NULL, " \t\n");
}
if(word != NULL && n == value)
puts(word);//Nth word
else
puts("No word");
}
else {
printf("Numerical value is not specified.\n");
}
}
else {
printf("There is no numeric specification line.\n");
}
}
}

Resources