I have a text file which consists of about 30000 words. My goal is to count the actual number of the words (keep in mind that multiple punctuation marks and consecutive spaces are included, as well as words connected with - (for example three-legged), so counting just the spaces isn't correct).
I have managed to count the total characters but I am struggling with the words.
Any help?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define SIZE 50
char *getfile(void);
void stats(char *filename);
int main() {
char *file;
file = getfile();
stats(file);
return 0;
}
char *getfile(void) {
char *filename;
FILE *fp;
filename = malloc(SIZE);
printf("Enter the name of the text file: ");
scanf("%49s", filename);
fp = fopen(filename, "r");
printf("\n");
if (fp == NULL) {
printf("The entered file does not exist.");
printf("\n");
} else {
printf("The file exists.");
fclose(fp);
}
return filename;
}
void stats(char *filename) {
int cnt = 0, space = 0, lines = 0;
int c;
int count = 0;
FILE *fp;
fp = fopen(filename, "r");
while (((c = fgetc(fp)) != EOF)) {
cnt++;
if (c == ' ') {
space++;
}
if (c == '\n' || c == '\0') {
lines++;
}
}
printf("\nTotal characters in file: %d", cnt);
printf("\nTotal characters (excluding spaces) in file: %d", cnt - space);
fclose(fp);
return;
}
You should make a list of all the chars that can separate between words, and count every sequence of separating characters.
The reason you are having trouble is you have no state. That is, classifying context about what came before. You can use other methods to break the file into words, but a state-machine is simple and fast. As suggested in the comments and by other answers, you need two states, a white-space came before, and a word character came before. It's sort of like the one-bit derivative, with rising edge, white-space space to word, as a the thing you count.
Stripping off most of the extraneous stuff, this might be how you do a state machine.
#include <stdio.h>
int main(void) {
unsigned char buf[16384 /*50*/]; /* 50 is small. */
enum { WHITE, WORD } state = WHITE;
size_t cnt = 0, lines = 0, words = 0, nread, i;
do { /* Fill `buf`. */
nread = fread(buf, 1, sizeof buf, stdin);
if(ferror(stdin)) { perror("wc"); return 1; }
cnt += nread;
for(i = 0; i < nread; i++) { /* Char-by-char in `buf`. */
unsigned char c = buf[i];
/* https://en.cppreference.com/w/cpp/string/byte/isspace */
switch(c) {
case '\n':
lines++; /* Fall-though. Doesn't handle CRs properly. */
case '\0': case ' ': case '\f': case '\r': case '\t': case '\v':
state = WHITE;
break;
default:
if(state == WORD) break;
state = WORD;
words++;
break;
}
}
} while(nread == sizeof buf);
printf("Total characters in file: %lu\n", (unsigned long)(cnt - lines));
printf("Total lines in file: %lu\n", (unsigned long)lines);
printf("Total words in file: %lu\n", (unsigned long)words);
return 0;
}
I off-loaded some work on the hosted-environment for brevity, ./wc < file.txt and I used a buffer.
Related
So I have been working on a code that will scan and print out all the spaces, words, sentences, digits, alphabets, punctuations that are in the text file. also some of the text file will have another text file saved in it to also be read in.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <dirent.h>
#define size 100
int words(char a[]);
int sentences (char a[]);
int sadp (char a[]);
int c;
char filename[size];
FILE *books, *books1, *books2, *books3;
int main(){
books = fopen("files.txt","r");
if(books == NULL){
printf("File is not found\n");
}
else{
while (fgets(filename, size-1 ,books) != NULL){
printf("%s",filename);
sentences (filename);
words(filename);
sadp (filename);
}
}
fclose(books);
return 0;
}
int sentences (char filename[]){
int count=0;
char i=0;
books3 =fopen(filename,"r");
if (books3 !=NULL){
while (fscanf(books3, "%d", &c)>0){
for(i = 0; filename[i] != '\0'; i++){
if(filename[i] == '.')
count++;
}
i++;
}
}
printf("\tSentences:\t\t%d\n", count);
return count;
}
int words(char filename[]){
char i;
int count=0;
books2 = fopen(filename,"r");
if (books2 != NULL){
while(fscanf(books2, "%d", &c)>0){
for (i = 0; filename[i] != '\0';i++){
if (filename[i]==' ' || filename[i]=='\n')
count++;
}
i++;
}
}
printf("\tWords:\t\t\t%d\n",count);
return count;
}
int sadp (char filename[]){
int alpha, digit, space, punct, i;
punct = 0; //for punctuations
alpha = 0; //alphabets
digit = 0; // digits
space = 0; // spaces between the words
i = 0; // place holder to count
books1 =fopen(filename,"r");
if (books1 != NULL){
while(fscanf(books1,"%d", &c) > 0){
if(isalpha(c)){
alpha++;
}//checks to see if it is alphabet
else if(isdigit(c)){
digit++;
}//checks to see if it is digits
else if(isspace(c)){
space++;
}//checks to see if it is space
else (ispunct(c));
punct++;
//checks to see if it is punctuations
i++;
}
}
printf("\tLetters:\t\t%d\n", alpha);
printf("\tDigits:\t\t\t%d\n", digit);
printf("\tSpaces:\t\t\t%d\n", space);
printf("\tPunctuations:\t%d\n\n", punct);
return (alpha /*digit, space, punct*/ );
}
However when I go to print this out I will get all the titles in the file to print out. but the counts for each of things that I wan to print out will not print out.
We don't need to read a file thrice to find statistics you're looking for, once is enough; going by one character at a time.
Alphabets, White-Spaces, Punctuations & Numerical-Digits
int print_file_stats (char filename[])
{
if (NULL == filename) {
fprintf (stderr, "\nEmpty filename %s", __FUNC__);
return -1;
}
int alpha = 0; //alphabets
int digit = 0; // digits
int punct = 0; //for punctuations
int space = 0; // spaces between the words
FILE *pFile = fopen (filename, "r");
if (NULL == pFile) {
fprintf (stderr, "\nReadingFile : %s", filename);
return -2;
}
int iCh;
while (EOF != (iCh = getc (pFile))) {
if ( isalpha (iCh)) ++alpha;
else if (isdigit (iCh)) ++digit;
else if (isspace (iCh)) ++space;
else if (ispunct (iCh)) ++punct;
}
fclose (pFile);
printf ("\tLetters:\t\t%d\n", alpha);
printf ("\tDigits:\t\t\t%d\n", digit);
printf ("\tSpaces:\t\t\t%d\n", space);
printf ("\tPunctuations:\t%d\n\n", punct);
return (alpha + digit + space + punct);
}
Function int getc(FILE *stream); will read the file one character at a time. Note it returns EOF (of size more than one byte) when it reaches end-of-file.
Words
Any alpha-numeric grouping separated by white space. When isspace() is true & it comes after one/more alpha-numeric chars then we just crossed a word.
Sentences
Any grouping of words terminated by sentence terminators (. ! ?). Meaning, ... !!! ??? are false positives for sentences.
> Is this a sentence
> "How about this? Take it"
Let's keep it simple for the time being. When ispunct() is true & it(. ! ?) comes after one or more words then we found a sentence.
You just have to keep flags(for word & sentence) of where you're and count words & sentences.
I try to count the number of characters, words, lines in a file.
The txt file is:
The snail moves like a
Hovercraft, held up by a
Rubber cushion of itself,
Sharing its secret
And here is the code,
void count_elements(FILE* fileptr, char* filename, struct fileProps* properties) // counts chars, words and lines
{
fileptr = fopen(filename, "rb");
int chars = 0, words = 0, lines = 0;
char ch;
while ((ch = fgetc(fileptr)) != EOF )
{
if(ch != ' ') chars++;
if (ch == '\n') // check lines
lines++;
if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\0') // check words
words++;
}
fclose(fileptr);
properties->char_count = chars;
properties->line_count = lines;
properties->word_count = words;
}
But when i print the num of chars, words and lines, outputs are 81, 18, 5 respectively
What am i missing?
(read mode does not changes anything, i tried "r" as well)
The solution I whipped up gives me the same results as the gedit document statistics:
#include <stdio.h>
void count_elements(char* filename)
{
// This can be a local variable as its not used externally. You do not have to put it into the functions signature.
FILE *fileptr = fopen(filename, "rb");
int chars = 0, words = 0, lines = 0;
int read;
unsigned char last_char = ' '; // Save the last char to see if really a new word was there or multiple spaces
while ((read = fgetc(fileptr)) != EOF) // Read is an int as fgetc returns an int, which is a unsigned char that got casted to int by the function (see manpage for fgetc)
{
unsigned char ch = (char)read; // This cast is safe, as it was already checked for EOF, so its an unsigned char.
if (ch >= 33 && ch <= 126) // only do printable chars without spaces
{
++chars;
}
else if (ch == '\n' || ch == '\t' || ch == '\0' || ch == ' ')
{
// Only if the last character was printable we count it as new word
if (last_char >= 33 && last_char <= 126)
{
++words;
}
if (ch == '\n')
{
++lines;
}
}
last_char = ch;
}
fclose(fileptr);
printf("Chars: %d\n", chars);
printf("Lines: %d\n", lines);
printf("Words: %d\n", words);
}
int main()
{
count_elements("test");
}
Please see the comments in the code for remarks and explanations. The code also would filter out any other special control sequences, like windows CRLF and account only the LF
Your function takes both a FILE* and filename as arguments and one of them should be removed. I've removed filename so that the function can be used with any FILE*, like stdin.
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
typedef struct { /* type defining the struct for easier usage */
uintmax_t char_count;
uintmax_t word_count;
uintmax_t line_count;
} fileProps;
/* a helper function to print the content of a fileProps */
FILE* fileProps_print(FILE *fp, const fileProps *p) {
fprintf(fp,
"chars %ju\n"
"words %ju\n"
"lines %ju\n",
p->char_count, p->word_count, p->line_count);
return fp;
}
void count_elements(FILE *fileptr, fileProps *properties) {
if(!fileptr) return;
properties->char_count = 0;
properties->line_count = 0;
properties->word_count = 0;
char ch;
while((ch = fgetc(fileptr)) != EOF) {
++properties->char_count; /* count all characters */
/* use isspace() to check for whitespace characters */
if(isspace((unsigned char)ch)) {
++properties->word_count;
if(ch == '\n') ++properties->line_count;
}
}
}
int main() {
fileProps p;
FILE *fp = fopen("the_file.txt", "r");
if(fp) {
count_elements(fp, &p);
fclose(fp);
fileProps_print(stdout, &p);
}
}
Output for the file you showed in the question:
chars 93
words 17
lines 4
Edit: I just noticed your comment "trying to count only alphabetical letters as a char". For that you can use isalpha and replace the while loop with:
while((ch = fgetc(fileptr)) != EOF) {
if(isalpha((unsigned char)ch)) ++properties->char_count;
else if(isspace((unsigned char)ch)) {
++properties->word_count;
if(ch == '\n') ++properties->line_count;
}
}
Output with the modified version:
chars 74
words 17
lines 4
A version capable of reading "wide" characters (multibyte):
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
typedef struct {
uintmax_t char_count;
uintmax_t word_count;
uintmax_t line_count;
} fileProps;
FILE* fileProps_print(FILE *fp, const fileProps *p) {
fprintf(fp,
"chars %ju\n"
"words %ju\n"
"lines %ju\n",
p->char_count, p->word_count, p->line_count);
return fp;
}
void count_elements(FILE *fileptr, fileProps *properties) {
if(!fileptr) return;
properties->char_count = 0;
properties->line_count = 0;
properties->word_count = 0;
wint_t ch;
while((ch = fgetwc(fileptr)) != WEOF) {
if(iswalpha(ch)) ++properties->char_count;
else if(iswspace(ch)) {
++properties->word_count;
if(ch == '\n') ++properties->line_count;
}
}
}
int main() {
setlocale(LC_ALL, "sv_SE.UTF-8"); // set your locale
FILE *fp = fopen("the_file.txt", "r");
if(fp) {
fileProps p;
count_elements(fp, &p);
fclose(fp);
fileProps_print(stdout, &p);
}
}
If the_file.txt contains one line with öäü it'll report
chars 3
words 1
lines 1
and for your original file, it'd report the same as above.
I am new at c and I am writing a code that get a string from the user and compare it to a strings from the text file and my code is only working when I compare between two characters and when I compare between two strings it's not working If someone know how can I fix the problem it's will be verey helpful. the compare line is in the searchFile function. the text file is a csv file so insted compare the char to string I need to compare between what befor the , to the string_to_search. example for csv file at the end of the code
Example: string to search = 'c' work, string to search 'name' doesn't work
#include <stdio.h>
#define STR_LEN 100
int searchFile(char* string_to_search, char* path);
int main(int argc, char* argv[])
{
FILE* text_file = 0;
int found = 0, choice = 0;
char string_to_search[STR_LEN] = {0};
if (!(fopen(argv[1], "r") == NULL)) //check if file exists
{
do
{
printf("Please enter your choice:\n");
printf("1 - Search a term in the document.\n");
printf("2 - change a value in a specific place.\n");
printf("3 - copy a value from one place to another\n");
printf("4 - Exit\n");
scanf("%d", &choice);
getchar();
switch (choice)
{
case 1:
fgets(string_to_search, STR_LEN, stdin);
string_to_search[strcspn(string_to_search, "\n")] = 0;
found = searchFile(string_to_search, argv[1]); //found = where the string line
if (found != 0)
printf("Value was found in row %d\n", found);
else
printf("Value Wasn't Found\n");
}
}while(choice != 4);
}
else
{
printf("file does not exists\n");
}
getchar();
return 0;
}
int searchFile(char* string_to_search, char* path)
{
FILE* file = fopen(path, "r");
char ch = ' ';
int i = 0, len = 0, count = 1;
fseek(file, 0, SEEK_END);
len = ftell(file);
fseek(file, 0, SEEK_SET);
len = len - 2;
char* string = (char*)malloc(sizeof(char) * len);
do //copying the chars to a string
{
ch = fgetc(file);
string[i] = ch;
i++;
} while (ch != EOF);
fclose(file);
for (i = 0; i < len; i++)
{
if (string[i] == *string_to_search) //the compare
{
free(string);
return count;
}
if (string[i] == '\n')
{
count++;
}
}
free(string);
return 0;
}
Example for a CSV file:
roee,itay,3,4
5,6,7,8
a,b,c,d
e,f,g,h
You have to change the following line:
if (string[i] == *string_to_search) //the compare
into
if (string[i] == string_to_search[i]) //the compare
The problem is that *string_to_search always refers to the first character of string_to_search. With the [i] you will get the nth character of the string as you have done it for the variable string. So as you noticed it works for a comparsion of two characters but not for two strings, because on a string you will always compare with the first character of string_to_search. For example if you want to compare "aaa" it will also work.
But as noted in the comment section you may also want to use strcmp() instead of the loop. There you will also have to pass string_to_search and not *string_to_search, because you want to pass the pointer to the string and not a single character.
I am trying to replace words from a file, This works fine with words of the same length.
I know it can be done by storing content in a temporary array and then shifting but I was wondering if it can be done without using array.
#include<stdio.h>
#include<string.h>
int main(int argc, char **argv)
{
char s1[20], s2[20];
FILE *fp = fopen(argv[1], "r+");
strcpy(s1, argv[2]);
strcpy(s2, argv[3]);
int l, i;
while(fscanf(fp, "%s", s1)!=EOF){
if(strcmp(s1, argv[2]) == 0){
l = strlen(s2);
fseek(fp, -l, SEEK_CUR);
i=0;
while(l>0){
fputc(argv[3][i], fp);
i++;
l--;
}
}
}
}
Here is my code for replacing same length words, what can I modify here for different lengths?
Assuming that the OP's goal is to avoid storing the whole content of the file into a byte array (maybe not enough memory) and he also said that it needs to "shift" the file's content, so it cannot use a temp file to make the text replacement (perhaps not enough room in the storage device).
Note that copying into a temp file would be the easiest method.
So as I can see the solution has two algorithms:
Shift to left: Replace a text with another of equal or smaller length.
Shift to right: Replace a text with a longer one.
Shift to left:
Maintain 2 file position pointers: one for the read position (rdPos) and another for the write position (wrPos).
Both start in zero.
read char from rdPos until find the oldText and write it into the wrPos (but only if rdPos != wrPos to avoid unnecessary write operations).
write the newText into wrPos.
repeat from step 3 until EOF.
if len(oldText) > len(newText) then truncate the file
Shift to right:
Maintain 2 file position pointers: (rdPos and wrPos).
scan the whole file to find the number of the oldText occurrences.
store their file positions into a small array (not strictly needed, but useful to avoid a second reverse scan of the oldText)
set rdPos = EOF-1 (the last char in the file)
set wrPos = EOF+foundCount*(len(newText)-len(oldText)): reserving enough extra space for the shifting.
read char from rdPos until find the position in the "found" array and write the char into the wrPos.
write the newText into wrPos.
repeat from step 6 until BOF.
I wrote the following implementation as an example of the mentioned algorithms, but without caring too much about validations and edge cases.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#define MAX_ITEMS 100
#define DO_WRITE 0x01
#define DO_FIND 0x02
FILE *fp;
long rdPos = 0L, wrPos = 0L, rdCount=0L, wrCount=0L;
int newLen, oldLen;
char *newText, *oldText;
struct found_t { int len; long pos[MAX_ITEMS];} found;
/* helper functions */
void writeChars(char *buffer, int len){
if(wrPos < rdPos){
long p = ftell(fp);
fseek(fp, wrPos, SEEK_SET);
fwrite(buffer, len, 1, fp);
fseek(fp, p, SEEK_SET);
wrCount += len;
}
wrPos += len;
}
int nextReadChar = -1;
int readChar(){
int c;
if(nextReadChar == EOF) {
if((c = fgetc(fp)) != EOF)
rdCount++;
} else {
c = nextReadChar;
nextReadChar = EOF;
}
return c;
}
int findFirstChar(int action){
int c; char ch;
for(; (c = readChar()) != EOF && c != (int)oldText[0]; rdPos++)
if(action == DO_WRITE) {
ch = (char)c;
writeChars(&ch, 1);
}
return c;
}
int testOldText(int c, int action){
char *cmp;
for(cmp = oldText; *cmp != '\0' && c == (int)*cmp; cmp++)
c = readChar();
nextReadChar = c;
if(*cmp == '\0') { /* found oldText */
if(action == DO_FIND)
found.pos[found.len++] = rdPos;
rdPos += oldLen;
if(action == DO_WRITE){
writeChars(newText, newLen);
found.len++;
}
}
else { /* some chars were equal */
if(action == DO_WRITE)
writeChars(oldText, cmp-oldText);
rdPos += cmp-oldText;
}
return c;
}
void writeReverseBlock(long firstCharPos){
for(;rdPos >= firstCharPos+oldLen; rdPos--, wrPos--, rdCount++, wrCount++){
int c;
fseek(fp, rdPos, SEEK_SET); c = fgetc(fp);
fseek(fp, wrPos, SEEK_SET); fputc(c, fp);
}
rdPos = firstCharPos-1;
wrPos -= newLen-1;
fseek(fp, wrPos--, SEEK_SET);
fwrite(newText, newLen, 1, fp);
wrCount += newLen;
}
void scanFile(int action){
int c;
do {
if( (c = findFirstChar(DO_WRITE)) == EOF ) break;
}while(testOldText(c, DO_WRITE) != EOF);
}
/** Main Algorithms */
void shiftToLeft(){
scanFile(DO_WRITE);
fflush(fp);
ftruncate(fileno(fp), wrPos);
}
void shiftToRight(){
int i;
scanFile(DO_FIND);
wrPos = --rdPos + found.len * (newLen-oldLen); /* reserve space after EOF */
for(i=found.len-1; i>=0; i--)
writeReverseBlock(found.pos[i]);
}
/* MAIN program */
int main(int argc, char **argv){
if(argc != 4){
fprintf(stderr, "Usage: %s file.ext oldText newText\n", argv[0]);
return 1;
}
if(!(fp = fopen(argv[1], "r+b"))) {
fprintf(stderr, "Cannot open file '%s'\n", argv[1]);
return 2;
}
oldLen = strlen(oldText = strdup(argv[2]));
newLen = strlen(newText = strdup(argv[3]));
found.len = 0;
/* which algorithm? */
if(newLen <= oldLen) shiftToLeft();
else shiftToRight();
fclose(fp);
printf("%7d occurrences\n"
"%7ld bytes read\n"
"%7ld bytes written\n", found.len, rdCount, wrCount);
return 0;
}
This program attempts to save the contents of a text file into a character variable array. It is then supposed to use my_getline() to print the contents of the character array. I've tested and see that the contents are in fact getting saved into char *text but I can't figure out how to print the contents of char *text using my_getline(). my_getline is a function we wrote in class that I need to use in this program. When I attempt to call it in the way that was taught, it 1 is printed to terminal but then the terminal just waits and nothing else is printed. Any guidance would be appreciated. Also, let me know if I'm missing any information that would help.
/* Include the standard input/output and string libraries */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* Define the maximum lines allowed in an input text and NEWLINE for getline funct. */
#define MAXPATTERN 15
#define MAXFILENAMELENGTH 15
#define NEWLINE '\n'
/* function prototypes */
void my_getline(char text[]);
int find_string(char text[], char pattern[], int length_text, int length_pattern);
int main()
{
FILE *fp;
long lSize;
char *text;
char fileName[MAXFILENAMELENGTH], pattern[MAXPATTERN];
char c;
int length_text, length_pattern, j, lineNumber = 1;
printf("Enter file name: ");
scanf("%s", fileName);
fp = fopen(fileName, "r");
if (fp == NULL)
{
printf("fopen failed.\n");
return(-1);
}
fseek(fp, 0L, SEEK_END);
lSize = ftell(fp);
rewind(fp);
/* allocate memory for all of text file */
text = calloc(1, lSize + 2);
if(!text)
{
fclose(fp);
fputs("memory allocs fails", stderr);
exit(1);
}
/* copy the file into text */
if(1 != fread(text, lSize, 1, fp))
{
fclose(fp);
free(text);
fputs("Entire read fails", stderr);
exit(1);
}
text[lSize + 1] = '\0';
printf("%s has been copied.\n", fileName);
rewind(fp);
printf("%d ", lineNumber);
for (j = 0; (j = getchar()) != '\0'; j++)
{
my_getline(text);
printf("%d %s\n", j+1, text);
}
printf("Enter the pattern you would like to search for: ");
scanf("%s", pattern);
printf("\nYou have chosen to search for: %s\n", pattern);
fclose(fp);
free(text);
return(0);
}
void my_getline(char text[])
{
int i = 0;
while ((text[i] = getchar()) != NEWLINE)
++i;
text[i] = '\0';
}
Your function is causing a system hang because you're calling getchar(), which returns the next character from the standard input. Is this really what you want?
At this point, your program is expecting input from the user. Try typing in the console windows and pressing to see it coming back from the "hang"
It is most likely causing an infinite loop because you are not checking whether you have reached EOF.
void my_getline(char text[])
{
int i = 0;
int c;
while ( (c = getchar()) != NEWLINE && c != EOF )
text[i++] = c;
text[i] = '\0';
}