Related
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Return the i-th (0-based) "|"-separated piece of command.
 * The string is tokenized in place with strtok, and every piece skipped
 * on the way to index i is echoed to stdout. Yields NULL when strtok
 * runs out of pieces first.
 */
char* filterstringfirst(char* command, int i){
    int seen = 0;
    char *piece = strtok(command, "|");

    while (piece != NULL && seen <= i) {
        if (seen == i) {
            return piece;
        }
        printf(" === Parsed: --%s-- ===\n", piece);
        seen++;
        piece = strtok(NULL, "|");
    }
    return piece;
}
int main () {
    /* Writable buffers; only command is actually tokenized below. */
    char command[] = "ls -a | sort -h | grep h | wc -l";
    char command2[] = "ls -a | sort -h | grep h | wc -l";

    /* First lookup: piece 0 of command. */
    char* first = filterstringfirst(command, 0);
    printf("%s\n", first);

    /* Second lookup, on the very same buffer: piece 1. */
    char* second = filterstringfirst(command, 1);
    printf("%s\n", second);

    return 0;
}
I have this function I made which is supposed to just return part of a string. The original string should be similar to "ls -l | grep temp | sort".
The idea was that it would be called with the string and a number, and return that segment. Eg. 0 -> "ls -l"
Now this works the first time I call it, but calling it again seems to break and end in a segfault.
char command[] = "ls -a | sort -h | grep h | wc -l";
char* temp = command;
char* x = filterstringfirst(temp, 0);
printf("%s\n",x);
char* temp2 = command;
char* x2 = filterstringfirst(temp2, 1);
printf("%s\n",x2);
This was my testing code
And the output:
ls -a
=== Parsed: --ls -a -- ===
[1] 1126 segmentation fault ./templ
➜ Current gcc -o templ templ.c
➜ Current ./templ ls -a
=== Parsed: --ls -a -- === [1]
1136 segmentation fault ./templ
Edit: Updated to have main too (based on comments)
strtok is destructive - it modifies the buffer passed in by replacing delimiters with null bytes.
After
char* x = filterstringfirst(temp, 0);
command will effectively be "ls -a ".
If you want to use strtok here, you will either need to:
mimic strtok in your wrapping function, by passing NULL and the position to start from in subsequent calls, or
duplicate the string before using it, and return a copy of the token.
An example of the second, with no error handling:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Return a newly allocated copy of the n-th (0-based) "|"-separated token
 * of command, or NULL when command is NULL, has no such token, or memory
 * cannot be allocated.
 *
 * strtok writes into the buffer it scans, so tokenizing runs on a private
 * duplicate and command itself is left untouched. The caller owns the
 * returned string and must free() it.
 */
char *get_token_at(char *command, size_t n) {
    if (command == NULL)
        return NULL;

    /* Duplicate with malloc + memcpy: strdup is not part of ISO C before C23. */
    size_t size = strlen(command) + 1;
    char *copy = malloc(size);
    if (copy == NULL)
        return NULL;
    memcpy(copy, command, size);

    /* Advance one token per step until index n is reached or tokens run out. */
    char *token = strtok(copy, "|");
    for (size_t position = 0; token != NULL && position < n; position++)
        token = strtok(NULL, "|");

    /* token points into copy, so it must be duplicated before copy is freed. */
    char *output = NULL;
    if (token != NULL) {
        size_t length = strlen(token) + 1;
        output = malloc(length);
        if (output != NULL)
            memcpy(output, token, length);
    }

    free(copy);
    return output;
}
/*
 * Demo driver: print tokens 0 and 1 of the same command string.
 * Exits with EXIT_FAILURE if a token cannot be produced.
 */
int main(void) {
    char command[] = "ls -a | sort -h | grep h | wc -l";

    for (size_t n = 0; n < 2; n++) {
        char *x = get_token_at(command, n);

        /* NULL means "no such token" or "out of memory"; puts(NULL) is undefined. */
        if (x == NULL) {
            fprintf(stderr, "no token at index %zu\n", n);
            return EXIT_FAILURE;
        }
        puts(x);
        free(x);
    }
    return 0;
}
stdout:
ls -a
sort -h
(Note the whitespace in these tokens.)
The function strtok changes the passed string by inserting zero characters '\0' in the positions of delimiters.
So after the first call of the function filterstringfirst
char* x = filterstringfirst(temp, 0);
the character array command looks like
"ls -a \0 sort -h | grep h | wc -l";
^^^
That is in fact you have the following string "ls -a " stored in the array command.
So calling the function the second time with the second argument greater than 0 you will get as a result a null pointer.
If you want to extract substrings specifying an index then you should use functions strspn and strcspn and return from the function a dynamically allocated array containing the target substring.
Here is a demonstration program that shows how the function can be defined using the standard string functions strspn and strcspn and without creating dynamically a copy of the source string (that is inefficient and unsafe) each time when the function is called.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Extract the i-th (0-based) token of command, where tokens are maximal
 * runs of characters not found in delimiters. command is only read.
 *
 * Returns a malloc'ed, NUL-terminated copy of the token that the caller
 * must free(), or NULL when there is no such token or malloc fails.
 */
char * filterstringfirst( const char *command, const char *delimiters, size_t i )
{
    /* Position on the first token: skip leading delimiters, then measure it. */
    const char *start = command + strspn( command, delimiters );
    size_t length = strcspn( start, delimiters );

    /* Step over one token per remaining index, stopping early at the end. */
    for ( ; *start != '\0' && i > 0; --i )
    {
        start += length;
        start += strspn( start, delimiters );
        length = strcspn( start, delimiters );
    }

    /* Ran off the end of the string: fewer tokens than requested. */
    if ( *start == '\0' )
    {
        return NULL;
    }

    char *copy = malloc( length + 1 );
    if ( copy != NULL )
    {
        memcpy( copy, start, length );
        copy[length] = '\0';
    }
    return copy;
}
/* Demo driver: print every token of command with its index, one per line. */
int main( void )
{
    char command[] = "ls -a | sort -h | grep h | wc -l";
    const char *delimiters = "|";
    size_t i = 0;
    char *substring = filterstringfirst( command, delimiters, i );

    /* A NULL result marks the first index past the last token. */
    while ( substring != NULL )
    {
        printf( "%zu: \"%s\"\n", i, substring );
        free( substring );
        substring = filterstringfirst( command, delimiters, ++i );
    }
}
The program output is
0: "ls -a "
1: " sort -h "
2: " grep h "
3: " wc -l"
You can use this function with any delimiters used to separate a string.
I am trying to write a program that automatically opens a google classroom link of the class I have two minutes before it starts.
So far I have decided to have a tsv file with three columns: Time, Subject, Gmeet link respectively.
somewhat like this:
[1]
11:15 CD https://en.wikipedia.org/wiki/Inotify
14:00 SGD https://en.wikipedia.org/wiki/Inotify
15:05 SGD https://en.wikipedia.org/wiki/Inotify
[2]
09:00 AI https://en.wikipedia.org/wiki/Inotify
The numbers in the square boxes correspond to the day of the week.
So what I am planning to do is to store the day's schedule in an array of size 9 as I can have classes at 9 different timeslots at any given day, with the link of each class being at index[hour - 9]. So the first class at 9 am will be at index zero and the timeslots that are empty will have null as their value.
This is what the code looks like:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
const char FILE_NAME[] = "schedule.txt";
const int CLASSES = 9;
/*
 * Print the links stored for one weekday section of FILE_NAME.
 *
 * The file is read line by line. Once the "[N]" header whose number equals
 * current_day has been seen, every following line is split into
 * hour ":" minutes TAB subject TAB link, and the link pointer is stored at
 * sched[hour - 9]. A line that starts with a newline ends the section.
 * Exits with status 8 when the file cannot be opened.
 */
int main()
{
time_t now = time(NULL);
struct tm *local = localtime(&now);
FILE *file = fopen(FILE_NAME, "r"); /* Input file */
char line[100];
int hour;
char *min;
char *sub;
char *link;
/* Weekday is hard-coded for now; the tm_wday lookup is disabled. */
/* char current_day = local->tm_wday; */
char current_day = 1;
char *sched[CLASSES];
if (file == NULL) {
printf("Cannot open file: \"%s\"\n", FILE_NAME);
exit(8);
}
/* Empty timeslots stay NULL. */
for (int i = 0; i < CLASSES; ++i)
sched[i] = NULL;
char dayfound = 0;
while(fgets(line, sizeof(line), file)) {
/* Section header for the wanted day: start collecting on the next line. */
if ((line[0] == '[' && atoi(&line[1]) == current_day)) {
dayfound = 1;
continue;
}
/* Blank line after the wanted section: stop reading. */
if (dayfound && line[0] == '\n') {
dayfound = 0;
break;
}
if (dayfound) {
hour = atoi(strtok(line, ":"));
min = strtok(NULL, "\t");
sub = strtok(NULL, "\t");
link = strtok(NULL, "\t");
/* strtok returns pointers into line, so this stores an address inside
   the one buffer that the next fgets call refills -- not a copy of the text. */
sched[hour - 9] = link;
}
}
/* Print whatever each stored pointer refers to now. */
for (int i = 0; i < CLASSES; ++i) {
if (sched[i] != NULL)
printf("%s", sched[i]);
}
fclose(file);
return 0;
}
For some reason the value of the first class I have that day is getting overwritten by the empty string, and I am only getting the links of the other two classes.
This is the output:
https://en.wikipedia.org/wiki/Inotify
https://en.wikipedia.org/wiki/Inotify
As you can see the first link for CD is missing.
Can anyone point out why this is happening?
You're reading into the same line string for every line of the file. strtok() returns a pointer into that string. So all your link values point to the same string.
You should make a copy of link when assigning it to sched[hour-9]:
sched[hour - 9] = strdup(link);
Or you could make sched a 2-dimensional array and use strcpy()
char sched[CLASSES][100];
...
strcpy(sched[hour - 9], link);
I'm writing a program that computes the SHA-256 of a string read from stdin.
I'm using the functions provided by openssl/sha.h into my program. I implemented a "normal hash" and a "salted hash".
Normal hash
If I hash the word password I get the following outputs:
SHA256_Update(&sha256, string, strlen(string)) --> 5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8
SHA256_Update(&sha256, string, sizeof(string)) --> 5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8
which are correct, as seen by the output of the shell command
echo -n "password" | sha256sum --> 5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8
Salted hash
Then I tried to implement a salted hash, concatenating the input string after another string (the salt), obtaining
complete = salt + string
Given that the salt is always the same (26000, computed as 13*2000), I expected output matching the shell command:
echo -n "26000password" | sha256sum --> c9bcf6ab867bdff7bf2223407c6a391f5c475fb411f7eae08fb361a671d4fd0d
Using strlen(complete) gives me the correct answer:
SHA256_Update(&sha256, complete, strlen(complete)) --> c9bcf6ab867bdff7bf2223407c6a391f5c475fb411f7eae08fb361a671d4fd0d
However, using sizeof(complete) gives me a different answer:
SHA256_Update(&sha256, complete, sizeof(complete)) --> ef73eaf729a0601f9d99ed0a11ef82ae82ca74042de5a724889f82e4f6e59bb0
So I have two problems:
The second case has something wrong (I presume is the use of sizeof instead of strlen, but that doesn't explain why I got the second problem, so I'm thinking about some "inconsistent state of the memory");
If I execute the program again, the wrong hash keeps changing every time I stop and execute the program.
So I assume that the problem is in the use of sizeof instead of strlen. I know that sizeof returns the number of bytes used to store that data type (so a pointer will be 4 bytes), while strlen will give me the length of the string, but I don't understand why I got the errors only in the second case.
Code excerpts
#include <openssl/sha.h>
#define MAX_LENGTH 1024
unsigned long salt;
// Salted ////////////////////////////////
/*
 * Salted variant that passes sizeof(complete) as the message length.
 *
 * Builds complete = decimal text of the global salt followed by string,
 * hashes it with SHA-256 and writes the digest to sha as 64 hex digits plus
 * a terminating 0, then prints it. Every failing call is reported through
 * error_handler.
 */
void compute_sha256_with_sizeof_complete(const char *string, char *sha)
{
int i;
char temp[65];
char complete[MAX_LENGTH];
char salt_str[MAX_LENGTH];
unsigned char hash[SHA256_DIGEST_LENGTH];
SHA256_CTX sha256;
/* complete = salt_str + string */
if (sprintf(salt_str, "%lu", salt) < 0)
error_handler("compute_sha256 | sprintf");
if (strcpy(complete, salt_str) == NULL)
error_handler("compute_sha256 | strcpy");
if (strcat(complete, string) == NULL)
error_handler("compute_sha256 | strcat");
/* compute sha256 of 'complete' */
if (SHA256_Init(&sha256) == 0)
error_handler("compute_sha256 | SHA256_Init");
/* sizeof(complete) is the size of the whole array (MAX_LENGTH bytes), so
   the bytes after the string's terminator -- never written above -- are
   hashed as well. */
if (SHA256_Update(&sha256, complete, sizeof(complete)) == 0)
error_handler("compute_sha256 | SHA256_Update");
if (SHA256_Final(hash, &sha256) == 0)
error_handler("compute_sha256 | SHA256_Final");
/* two hex digits per digest byte */
for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
if ((sprintf(temp + (i * 2), "%02x", hash[i])) < 0)
error_handler("compute_sha256 | sprintf");
temp[64] = 0;
if (strcpy(sha, temp) == NULL)
error_handler("compute_sha256 | strcpy");
printf("SHA256 (sizeof_complete):\t%s\n", sha);
}
/*
 * Salted variant: hash the decimal text of the global salt followed by
 * string, using strlen() of the combined text as the message length.
 *
 * The digest is written to sha as 64 lowercase hex digits plus a
 * terminating NUL (sha must have room for 65 bytes) and is also printed.
 * Every failure, including a salt + string combination that does not fit
 * in MAX_LENGTH bytes, is reported through error_handler.
 */
void compute_sha256_with_strlen_complete(const char *string, char *sha)
{
    char temp[2 * SHA256_DIGEST_LENGTH + 1];
    char complete[MAX_LENGTH] = "";
    unsigned char hash[SHA256_DIGEST_LENGTH];
    SHA256_CTX sha256;

    /* complete = salt + string; snprintf never writes past the buffer and
       its return value reveals truncation, unlike strcpy/strcat. */
    int written = snprintf(complete, sizeof(complete), "%lu%s", salt, string);
    if (written < 0 || (size_t)written >= sizeof(complete))
        error_handler("compute_sha256 | snprintf");

    /* compute sha256 of 'complete' (string bytes only, terminator excluded) */
    if (SHA256_Init(&sha256) == 0)
        error_handler("compute_sha256 | SHA256_Init");
    if (SHA256_Update(&sha256, complete, strlen(complete)) == 0)
        error_handler("compute_sha256 | SHA256_Update");
    if (SHA256_Final(hash, &sha256) == 0)
        error_handler("compute_sha256 | SHA256_Final");

    /* two hex digits per digest byte; the last call also writes the NUL */
    for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
        if (snprintf(temp + (i * 2), sizeof(temp) - (size_t)(i * 2), "%02x", hash[i]) < 0)
            error_handler("compute_sha256 | sprintf");
    temp[sizeof(temp) - 1] = '\0';

    memcpy(sha, temp, sizeof(temp));
    printf("SHA256 (strlen_complete):\t%s\n", sha);
}
// Normal ////////////////////////////////
/*
 * Unsalted variant that passes sizeof(string) as the message length.
 *
 * Hashes bytes taken from string with SHA-256 and writes the digest to sha
 * as 64 hex digits plus a terminating 0, then prints it. Every failing call
 * is reported through error_handler.
 */
void compute_sha256_with_sizeof_string(const char *string, char *sha)
{
int i;
char temp[65];
char complete[MAX_LENGTH];
char salt_str[MAX_LENGTH];
unsigned char hash[SHA256_DIGEST_LENGTH];
SHA256_CTX sha256;
/* complete = salt_str + string */
/* (complete is built here but is not what gets hashed below) */
if (sprintf(salt_str, "%lu", salt) < 0)
error_handler("compute_sha256 | sprintf");
if (strcpy(complete, salt_str) == NULL)
error_handler("compute_sha256 | strcpy");
if (strcat(complete, string) == NULL)
error_handler("compute_sha256 | strcat");
/* compute sha256 of 'string' */
if (SHA256_Init(&sha256) == 0)
error_handler("compute_sha256 | SHA256_Init");
/* string is a pointer parameter, so sizeof(string) is the size of a
   const char * and does not depend on the length of the text. */
if (SHA256_Update(&sha256, string, sizeof(string)) == 0)
error_handler("compute_sha256 | SHA256_Update");
if (SHA256_Final(hash, &sha256) == 0)
error_handler("compute_sha256 | SHA256_Final");
/* two hex digits per digest byte */
for (i = 0; i < SHA256_DIGEST_LENGTH; i++)
if ((sprintf(temp + (i * 2), "%02x", hash[i])) < 0)
error_handler("compute_sha256 | sprintf");
temp[64] = 0;
if (strcpy(sha, temp) == NULL)
error_handler("compute_sha256 | strcpy");
printf("SHA256 (sizeof_string):\t\t%s\n", sha);
}
/*
 * Unsalted variant: hash exactly the strlen(string) bytes of string.
 *
 * The digest is written to sha as 64 lowercase hex digits plus a
 * terminating NUL (sha must have room for 65 bytes) and is also printed.
 * Every failing call is reported through error_handler.
 *
 * The salt is not part of this hash, so no salted copy of the input is
 * built here; that copy was never read and could overflow its buffer.
 */
void compute_sha256_with_strlen_string(const char *string, char *sha)
{
    char temp[2 * SHA256_DIGEST_LENGTH + 1];
    unsigned char hash[SHA256_DIGEST_LENGTH];
    SHA256_CTX sha256;

    /* compute sha256 of 'string' */
    if (SHA256_Init(&sha256) == 0)
        error_handler("compute_sha256 | SHA256_Init");
    if (SHA256_Update(&sha256, string, strlen(string)) == 0)
        error_handler("compute_sha256 | SHA256_Update");
    if (SHA256_Final(hash, &sha256) == 0)
        error_handler("compute_sha256 | SHA256_Final");

    /* two hex digits per digest byte; the last call also writes the NUL */
    for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
        if (snprintf(temp + (i * 2), sizeof(temp) - (size_t)(i * 2), "%02x", hash[i]) < 0)
            error_handler("compute_sha256 | sprintf");
    temp[sizeof(temp) - 1] = '\0';

    memcpy(sha, temp, sizeof(temp));
    printf("SHA256 (strlen_string):\t\t%s\n", sha);
}
/*
 * Run all four compute_sha256_* variants on line, in a fixed order.
 * Each variant prints its own result; hash is only scratch space that every
 * call overwrites.
 */
void match_password(const char *line)
{
char hash[MAX_LENGTH];
/* Updates the file-level salt in place before any variant formats it. */
salt *= 13;
compute_sha256_with_strlen_complete(line, hash);
compute_sha256_with_sizeof_complete(line, hash);
compute_sha256_with_strlen_string(line, hash);
compute_sha256_with_sizeof_string(line, hash);
}
/*
 * Prompt for one line on stdin, strip its trailing newline and pass it to
 * match_password. The global salt is reset to 2000 before each read.
 *
 * At end of input the function returns without calling match_password;
 * only a genuine read error is reported through error_handler.
 */
void read_password_from_stdin(void)
{
    char line[MAX_LENGTH];

    salt = 2000;
    printf("> ");
    if (fgets(line, MAX_LENGTH, stdin) == NULL)
    {
        /* NULL without a stream error simply means end of input; returning
           lets a caller that polls feof(stdin) finish normally. */
        if (ferror(stdin))
            error_handler("read_password_from_stdin | fgets");
        return;
    }

    /* strcspn gives the index of the newline, or of the terminator when
       there is none, so this never indexes before the start of line. */
    line[strcspn(line, "\n")] = '\0';
    match_password(line);
}
/*
 * Entry point: takes no command-line arguments and keeps reading passwords
 * from stdin until the stream reports end of file.
 */
int main(int argc, char **argv)
{
if (argc != 1)
{
fprintf(stderr, "Usage: %s <no arguments>\n", argv[0]);
return EXIT_FAILURE;
}
/* feof() only turns true after a read has already hit end of input, so
   read_password_from_stdin is entered once more when no data is left. */
while (!feof(stdin))
read_password_from_stdin();
return EXIT_SUCCESS;
}
Using sizeof, you compute the SHA over every octet of complete (i.e. over MAX_LENGTH octets) instead of only over the useful octets.
Also, complete is not initialized, so its content can be anything, and can differ on each launch. sprintf, strcat and strcpy only initialize the part that holds the string, so the rest of the array is still uninitialized. To get the same hash every time, initialize complete:
char complete[MAX_LENGTH] = { 0 };
sizeof(complete) is always MAX_LENGTH, or 1024. In compute_sha256_with_sizeof_complete, complete and salt_str are local variables, so are allocated on the stack and may have any contents when the function starts running, depending on the compiler. I would guess that there are at least some nonzero values in complete, and strcpy and strcat won't replace those values unless the salt string, or what you read from stdin, is MAX_LENGTH characters long. Therefore, you are getting the SHA-256 not only of the data you put in, but also of whatever random data is in complete when compute_sha256_with_sizeof_complete starts running.
This is also the reason the sizeof_complete output changes every time: the random data left in the stack, and therefore in complete, can be different each time the program runs. To initialize it expressly, you could say char complete[MAX_LENGTH] = {0};, but you are better off to just not use sizeof in this situation :) .
Using strlen, by contrast, hashes only the string contents up to (but not including) the terminating NUL character ('\0'). The strcpy and strcat fill in that part of the string as you expect, which is why you get the result you expect. strlen is definitely the right choice here. (And while you're coding, use bounded functions such as strncpy and strncat, or snprintf, instead of strcpy and strcat for added protection against memory issues.)
I was writing a program that prints the words common to two strings. I use two nested loops and split both strings inside them, but I didn't get the required result. I then changed the program a bit and found that the outer loop runs only once. I can't work out why. Does anybody have any idea?
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
/*
 * Attempt to compare every word of str1 with every word of str2 by running
 * two strtok loops, one nested inside the other.
 */
int main()
{
char str1[] = "Japan Korea Spain Germany Australia France ";
char str2[] = "England USA Russia Italy Australia India Nepal France";
/* Start tokenizing str1. */
char *tar1 = strtok(str1," ");
char *tar2 = NULL;
while(tar1)
{
/* Starting on str2 here replaces the position strtok was keeping for
   str1, and str2 itself is cut at each space as it is scanned. */
tar2 = strtok(str2," ");
while(tar2)
{
/* strcmp is non-zero when the two words differ, so this prints the pairs
   that do NOT match. */
if(strcmp(tar1,tar2)) printf("%s %s\n",tar1 , tar2);
tar2 = strtok(NULL," ");
}
/* Continues from wherever the inner loop left strtok, i.e. the end of str2. */
tar1 = strtok(NULL," ");
tar2 = NULL;
}
return 0;
}
You cannot use strtok on two different strings at the same time, and you cannot parse a string more than once, because strtok has already modified the string by breaking it with nul terminators.
This example extracts the token pointers into an array of pointers for each input string, before checking for matches.
#include <stdio.h>
#include <string.h>
#define MAXSTR 20
/*
 * Print each word of str1 that also appears in str2.
 * Both strings are fully tokenized into pointer arrays first, so strtok is
 * never working on two strings at once.
 */
int main()
{
    char str1[] = "Japan Korea Spain Germany Australia France ";
    char str2[] = "England USA Russia Italy Australia India Nepal France";
    char *words1[MAXSTR];
    char *words2[MAXSTR];
    int count1 = 0, count2 = 0;
    char *tok;

    /* Collect up to MAXSTR token pointers from the first string... */
    for (tok = strtok(str1, " \t"); tok != NULL && count1 < MAXSTR; tok = strtok(NULL, " \t"))
        words1[count1++] = tok;

    /* ...and only then from the second one. */
    for (tok = strtok(str2, " \t"); tok != NULL && count2 < MAXSTR; tok = strtok(NULL, " \t"))
        words2[count2++] = tok;

    /* Report a word of str1 once, at its first match in str2. */
    for (int a = 0; a < count1; a++) {
        int b = 0;
        while (b < count2 && strcmp(words1[a], words2[b]) != 0)
            b++;
        if (b < count2)
            printf("%s\n", words1[a]);
    }
    return 0;
}
Program output:
Australia
France
The strtok() function breaks a string into a sequence of zero or more
nonempty tokens.
In other words: ' ' is replaced with a NUL (0) by strtok.
In consequence, you can not use tar2 = strtok(str2," "); twice with the same string.
And as pointed out by @WeatherVane: You cannot use strtok on two different strings at the same time.
An alternative to your code:
#include <stdio.h>
#include <string.h>
/*
 * Report whether word occurs in text as a complete space-delimited word.
 *
 * Every occurrence found by strstr is examined, not just the first one, so
 * a hit inside a longer word does not hide a later whole-word match.
 */
static int contains_word(const char *text, const char *word)
{
    size_t sz = strlen(word);

    if (sz == 0)
        return 0;
    for (const char *ptr = strstr(text, word); ptr != NULL; ptr = strstr(ptr + 1, word)) {
        /* First word of text, or preceded by a space */
        int starts_word = (ptr == text) || (*(ptr - 1) == ' ');
        /* Last word of text, or followed by a space */
        int ends_word = (ptr[sz] == ' ') || (ptr[sz] == '\0');

        if (starts_word && ends_word)
            return 1;
    }
    return 0;
}

/*
 * Print each word of str1 that also appears in str2.
 * Only str1 is tokenized; str2 is searched read-only, so a single strtok
 * sequence is active at any time.
 */
int main(void)
{
    char str1[] = "Japan Korea Spain Germany Australia France ";
    char str2[] = "England USA Russia Italy Australia India Nepal France";

    for (char *tar = strtok(str1, " "); tar != NULL; tar = strtok(NULL, " ")) {
        if (contains_word(str2, tar))
            puts(tar);
    }
    return 0;
}
Output:
Australia
France
I need to write code that reads a text file and outputs the number of words, number of distinct words, and the most frequent word in C.
I have already done the code for outputting the number of words, but I have no idea how to find the number of distinct words or the most frequent word. I know I'm supposed to use strcmp, but I don't know about doing so. Any help would be greatly appreciated. Here's what I have so far.
/*
 * Count the whitespace-separated words in COEN12_LAB1.txt and print the total.
 * Prints "File Missing!" and returns 0 when the file cannot be opened.
 */
int main(int argc, char *argv[])
{
    /* One word at a time is enough for counting; the large 2-D arrays were
       never used and took roughly 13 MB of stack. */
    char word[50];
    int wordCount = 0;
    FILE *fp;

    (void)argc;
    (void)argv;

    //reads file!
    fp = fopen("COEN12_LAB1.txt", "r");
    if (fp == NULL)
    {
        printf("File Missing!\n");
        return 0;
    }

    //counts words in file!
    /* %49s keeps fscanf inside word[50]; a longer run of non-space
       characters is read (and counted) in 49-character pieces. */
    while (fscanf(fp, "%49s", word) == 1)
        wordCount++;
    printf("Total number of words: %d\n", wordCount);
    fclose(fp);
    return 0;
}
First you probably need to implement structure that will allow you to efficiently keep distinct words. Hash table is one of the possible ones (maybe best).
Here is example of implementation and usage of hashes on C:
http://troydhanson.github.io/uthash/index.html
http://elliottback.com/wp/hashmap-implementation-in-c/
Also you can look into this question: Porting std::map to C?
I wrote program for you, see source here: http://olegh.cc.st/src/words.c.txt
Of course, did not check special situations, like many words on single line,
distinct words qty > 16,000, etc. But, basic code you can get:
Run sample:
$ cat aaa.txt
aaa
bbb
ccc
aaa
xxx
aaa
cc
$ cc words.c ; ./a.out aaa.txt
1 xxx
1 ccc
1 bbb
1 cc
3 aaa
[EDIT]
1. replaced malloc with calloc (initializes memory to 0)
2. replaced second argument in qsort
3. program now works with wider range of files (more words, more delimiters)
This is not pretty, and may need some minor debug, but it will get you started for count, number of distinct and most frequently used words:
#include <ansi_c.h>
#include <stdio.h>
#define FILENAME "c:\\dev\\play\\test3.txt" //put your own path here
#define DELIM "- .,:;//_*&\n"
int longestWord(char *file, int *cnt);
void allocMemory(int numStrings, int max);
void freeMemory(int numStrings);
static int sortstring( const void *str1, const void *str2 );
char **strings;
/*
 * Report the word count, the number of distinct words and the most frequent
 * word of FILENAME.
 *
 * Steps: size the table with longestWord, allocate it with allocMemory,
 * copy every DELIM-separated token into the global strings table, sort the
 * table with qsort, then walk neighbouring entries to count runs of equal
 * words.
 */
int main()
{
int wc, longest, cnt, distinct, i, mostFreq, mostFreqKeep=0;
char line[260];
char *buf=0;
FILE *fp;
/* longest = longest alphabetic run + 1, wc = delimiter-based word estimate */
longest = longestWord(FILENAME, &wc);
char wordKeep[longest];
allocMemory(wc, longest);
//read file into string arrays
/* NOTE(review): the result of fopen is not checked before it is used. */
fp = fopen(FILENAME, "r");
cnt=0;
while(fgets(line, 260, fp))
{
buf = strtok(line, DELIM);
while(buf)
{
/* keep tokens that are non-empty and do not start with tab, newline or a
   character whose char value is not positive */
if((strlen(buf) > 0) && (buf[0] != '\t') && (buf[0] != '\n') && (buf[0] != '\0')&& (buf[0] > 0))
{
/* NOTE(review): nothing here compares cnt with wc or the token length with
   longest -- both rely on longestWord's estimates being large enough. */
strcpy(strings[cnt], buf);
cnt++; //use as accurate count of words.
}
buf = strtok(NULL, DELIM);
}
}
fclose(fp);
//now get most frequent word
//sort
qsort(strings, cnt, sizeof(char*), sortstring);
distinct = 1;
mostFreq = 1; //every word will occur once
wordKeep[0]=0;
/* After sorting, equal words are adjacent: compare each entry with its
   successor. */
for(i=0;i<cnt-1;i++)
{
//depends on a successful sort (alphabetization)
if(strlen(strings[i]) >0)
{
if(strcmp(strings[i], strings[i+1]) == 0)
{
/* same word again: extend the current run and remember a new record */
mostFreq++;
if(mostFreq > mostFreqKeep)
{
strcpy(wordKeep, strings[i]);
mostFreqKeep = mostFreq;
}
}
else
{
/* a different word starts: reset the run length */
mostFreq = 1;
distinct++;
}
}
}
/* If no word repeats, wordKeep is still empty and mostFreqKeep is still 0 here. */
printf("number of words: %d\nNumber of distinct words:%d\nmost frequent word: %s - %d\n", cnt, distinct, wordKeep, mostFreqKeep);
freeMemory(cnt);
/* waits for a character on stdin before exiting */
getchar();
return 0;
}
/*
 * Scan file once and return sizing information for the word table.
 *
 * Returns the length of the longest run of alphabetic characters plus one
 * (room for the string terminator). *nWords receives an upper bound on the
 * number of words: one per punctuation or white-space character, plus one
 * for a word that runs up to the end of the file.
 *
 * If the file cannot be opened, *nWords is set to 0 and 1 is returned.
 */
int longestWord(char *file, int *nWords)
{
    int cnt = 0, longest = 0, numWords = 0;
    int c; /* int, not char: fgetc reports EOF as a value outside the char range */
    FILE *fp = fopen(file, "r");

    if (fp == NULL)
    {
        *nWords = 0;
        return 1;
    }

    while ((c = fgetc(fp)) != EOF)
    {
        if (isalpha(c))
        {
            cnt++;
        }
        else if (ispunct(c) || isspace(c))
        {
            /* a delimiter closes the current run */
            if (cnt > longest)
                longest = cnt;
            cnt = 0;
            numWords++;
        }
    }

    /* A last word with no delimiter after it must still be measured and counted. */
    if (cnt > 0)
    {
        if (cnt > longest)
            longest = cnt;
        numWords++;
    }

    *nWords = numWords;
    fclose(fp);
    return longest + 1;
}
/*
 * Allocate the global strings table: numStrings zero-filled rows, each with
 * room for max characters plus a terminator, and one spare zero-filled slot
 * after the last row.
 *
 * On failure strings is NULL (table could not be allocated, or an argument
 * was negative) or the affected rows are NULL; callers should check before
 * writing.
 */
void allocMemory(int numStrings, int max)
{
    int i;

    strings = NULL;
    if (numStrings < 0 || max < 0)
        return;

    /* calloc takes (element count, element size); passing a byte count as the
       element count allocated sizeof(char*) times more than needed. */
    strings = calloc((size_t)numStrings + 1, sizeof *strings);
    if (strings == NULL)
        return;

    for (i = 0; i < numStrings; i++)
    {
        strings[i] = calloc((size_t)max + 1, sizeof **strings);
    }
}
/*
 * Release the first numStrings rows of the global strings table and the
 * table itself, then reset strings to NULL so a stale pointer cannot be
 * reused or freed twice. Does nothing when the table was never allocated.
 */
void freeMemory(int numStrings)
{
    int i;

    if (strings == NULL)
        return;
    for (i = 0; i < numStrings; i++)
        free(strings[i]); /* free(NULL) is a no-op, no guard needed */
    free(strings);
    strings = NULL;
}
/*
 * qsort comparator for an array of C-string pointers.
 * Each argument is the address of an array element (a pointer to a string);
 * the pointed-to strings are ordered with strcmp.
 */
static int sortstring( const void *str1, const void *str2 )
{
    const char *const *lhs = str1;
    const char *const *rhs = str2;

    return strcmp(*lhs, *rhs);
}
You could use a simple database to compute the different word counts from the input text. For simplicity I'd suggest to use SQLite. Below I have added some example code (I left out the error handling for the sake of brevity).
For reading words I took an approach to read a single line into a buffer using fgets. I noticed that this approach works nicely as long you can guarantee that the buffer is always large enough to hold the actual lines from the input file. Otherwise words are split up at the end of the buffer, which needs to be handled somehow.
For parsing the text I have used strtok. During the implementation I have learned that it's quite hard to get the word-delimiters right. Besides this, possible spelling differences (e.g., capitalization) and inflections of otherwise equal words are completely ignored by this approach and could thus negatively affect the result.
Once the data is in the database, the query language is very well suited to formulate queries to get the maximum (distinct) word count, or word frequencies. Therefore I think this flexible approach has an advantage when you want to compute multiple statistics from the input text, as you don't have to implement every special case in C. For testing, I copied a part of the Wikipedia article on SQLite into the file words.txt.
Here's the example:
#include <sqlite3.h>
#include <stdio.h>
#include <string.h>
#define DELIM " \r\n\t,.-;:_#+*\\=)(/&%$§\"“”!1234567890}][{'"
#define BUFSIZE 4096
#define SQLSIZE 256
/*
 * sqlite3_exec callback: print one result row as "| name : value" cells and
 * close the row with "|" and a newline after the last column.
 *
 * p is the unused user pointer. A NULL entry in values (how sqlite3_exec
 * passes an SQL NULL) is printed as the text NULL instead of being handed
 * to %s. Always returns 0 so that the query keeps going.
 */
int print_row(void* p, int ncols, char **values, char **names) {
    (void)p;
    for (int i = 0; i < ncols; i++) {
        const char *value = (values[i] != 0) ? values[i] : "NULL";
        printf("| %15s : %15s %s", names[i], value, i<ncols-1?"":"|\n");
    }
    return 0;
}
int main(int argc, char * argv[]) {
/* open infile */
FILE * infile = fopen("words.txt", "r");
/* initialize database */
sqlite3 *db_handle = 0;
sqlite3_open(":memory:", &db_handle);
sqlite3_exec(db_handle, "CREATE TABLE word (word TEXT);", 0, 0, 0);
/* parse file, populate db */
char buf[BUFSIZE], sql[SQLSIZE], *word;
while(fgets(buf, BUFSIZE, infile))
for (word = strtok(buf, DELIM); word != 0; word = strtok(0, DELIM)) {
snprintf(sql, SQLSIZE, "INSERT INTO word VALUES ('%s');", word);
sqlite3_exec(db_handle, sql, 0, 0, 0);
}
/* count of words */
sqlite3_exec(db_handle,
"SELECT COUNT(word) AS total_words FROM word;",
print_row, 0, 0);
/* count of distinct words */
sqlite3_exec(db_handle,
"SELECT COUNT(DISTINCT word) AS distinct_words FROM word;",
print_row, 0, 0);
/* top five most frequent words */
sqlite3_exec(db_handle,
"SELECT word, COUNT(*) AS count FROM word "
"GROUP BY word ORDER BY count DESC LIMIT 5;",
print_row, 0, 0);
sqlite3_close(db_handle);
}
Here's my output:
$ gcc test.c -std=c99 -lsqlite3 && ./a.out
| total_words : 561 |
| distinct_words : 314 |
| word : SQLite | count : 17 |
| word : is | count : 16 |
| word : the | count : 15 |
| word : a | count : 14 |
| word : to | count : 12 |
For reference:
http://en.cppreference.com/w/c/io/fgets
http://en.cppreference.com/w/c/string/byte/strtok
http://en.cppreference.com/w/c/io/fprintf
http://www.sqlite.org/inmemorydb.html
http://sqlite.org/c3ref/open.html
http://sqlite.org/c3ref/exec.html
http://sqlite.org/lang_createtable.html
http://sqlite.org/lang_insert.html
http://sqlite.org/lang_select.html
http://sqlite.org/lang_aggfunc.html