how to make an file word counter in C - c

I need to make a program in C that accesses some files and retrieves a set of keywords and stores them in a list, then for each keyword it would loop through a set of emails and count how many times each keyword appeared PER email
#note i'm not allowed to use global variables
my code's kind of a mess as it is, but here it is:
#define _GNU_SOURCE
#include<stdio.h>
#include<string.h>
#include<dirent.h>
#include <stdlib.h>
#define MAXKEYWORDLENGTH 64
typedef struct {
char keyword[MAXKEYWORDLENGTH];
int keywordCount;
int stdev;
} keywordData;
typedef struct {
int emailCount;
} emailData;
int fetchKeywordNumber(const char* filename)
{
int keywordNumber;
FILE* keywords = fopen(filename, "r");
fscanf(keywords,"%d", &keywordNumber);
fclose(keywords);
return keywordNumber;
}
keywordData *fetchKeywords(const char* filename)
{
FILE* keywords = fopen(filename, "r");
keywordData *kd;
// accessing first line of the file which has the number of keywords
int keywordNumber;
keywordNumber = fetchKeywordNumber(filename);
// first number in the file is the number of keywords in the file, so i dont need to count them
if(keywords)
if(fscanf(keywords,"%d", &keywordNumber) != 1) { /* error handling*/}
// printf("%d\n", keywordNumber);
kd = malloc(keywordNumber * sizeof(*kd));
if(kd)
for(int i = 0; i < keywordNumber; i++)
{
fscanf(keywords,"%s", kd[i].keyword);
// printf("%s\n", kd[i].keyword);
}
if(keywords) fclose(keywords);
return kd;
}
int countWord(const char* filename, char wordname, int wordIndex)
{
FILE* email = fopen(filename, "r");
int keywordCount = 0;
fseek(email, 0, SEEK_END);
long emailSize = ftell(email);
fseek(email, 0, SEEK_SET); // same as rewind(f);
char *string = malloc(emailSize + 1);
fread(string, emailSize, 1, email);
fclose(email);
string[emailSize] = 0;
char *bodyPos = strstr(string, "Body:");
const char *next = bodyPos;
while ((next = strcasestr(next, wordname)) != NULL)
{
keywordCount++;
next++;
printf("Found a keyword: %s, occurence number: %d\n", kd[i].keyword, kd[i].keywordCount);
}
return keywordCount;
}
keywordData *emailLoop(char wordName, int wordIndex)
{
keywordData* kd;
kd = malloc((wordIndex + 1) * sizeof(keywordData));
struct dirent *de; // Pointer for directory entry
DIR *dr = opendir("/home/student/bitdefender-challenge/check_baliza/data/emails/");
if (dr == NULL) // opendir returns NULL if couldn't open directory
{
printf("Could not open current directory" );
return 0;
}
while ((de = readdir(dr)) != NULL)
{
if(!(strcmp(de->d_name, ".")) || !(strcmp(de->d_name, "..")))
continue;
char directory[256];
sprintf(directory, "%s%s", "/home/student/bitdefender-challenge/check_baliza/data/emails/", de->d_name);
kd[wordIndex].keywordCount = countWord(directory, wordName, wordIndex);
}
closedir(dr);
return kd;
}
int main()
{
char* keywordFilename = "/home/student/bitdefender-challenge/check_baliza/data/keywords";
char* emailFilename = "/home/student/bitdefender-challenge/check_baliza/data/emails/40";
// keywordData* result = readEmail(emailFilename, fetchKeywords(keywordFilename));
int keywordCount = fetchKeywordNumber(keywordFilename);
keywordData *result = fetchKeywords(keywordFilename);
for(int i = 0; i < keywordCount; i++)
{
emailLoop(result[i].keyword, i);
}
return 0;
}
i'd like to receive some advice on how to make this count the number of occurences for each word per each email and the standard deviation for each keyword
my output would need to be like this:
<word> <total occurence number found in all emails> <standard deviation>

Related

Counting the mose frequent char in a file

For my CS class I need to write a program that reads an entire file. I've researched a whole bunch of different ways to do this with a string (the two for loops inside the while loops) and I've combined it with the way I was taught to read through a whole file. The problem is you can't index the frequency list with a char variable type (line). Is there an easier way to read through the file and do this?
# define MAX 200
void replace_most_freq(const char *filename, char c, FILE *destination) {
// your code here
FILE *in_file = NULL;
in_file = fopen(filename, "r");
if (!in_file) {
fprintf(destination,
"Error(replace_most_freq): Could not open file %s\n", filename);
fclose(in_file);
return;
}
int i, max = -1, len;
int freq[256] = { 0 };
char line[MAX], result;
while (fgets(line, sizeof(line), in_file)) {
len = strlen(line);
for (i = 0; i < len; i++) {
freq[line[i]]++;
}
}
while (fgets(line, sizeof(line), in_file)) {
len = strlen(line);
for (i = 0; i < len; i++) {
if (max < freq[line[i]]) {
max = freq[line[i]];
result = line[i];
}
}
}
printf("Most frequent char = %c\n", result);
return;
}
Your initial loop is almost correct: you should convert the char to an unsigned char to avoid undefined behavior on negative char values on platforms where char is signed.
The second loop is incorrect: there is no need to read from the file, just iterate over the freq array to find the largest count.
Here is a modified version:
#include <limits.h>
#include <stdio.h>
void replace_most_freq(const char *filename, char newc, FILE *destination) {
FILE *in_file = fopen(filename, "r");
if (!in_file) {
fprintf(stderr,
"Error(replace_most_freq): Could not open file %s\n", filename);
return;
}
int c, max, maxc;
int freq[UCHAR_MAX] = { 0 };
while ((c = getc(in_file)) != EOF) {
freq[c]++;
}
max = freq[maxc = 0];
for (c = 1; c < UCHAR_MAX; c++) {
if (max < freq[c])
max = freq[maxc = c];
}
printf("Most frequent char = %c (%d)\n", max, max);
rewind(in_file);
while ((c = getc(in_file)) != EOF) {
if (c == maxc)
c = newc;
putc(c, destination);
}
}
You can read file in much larger chunks:
#define BUFFSIZE (4*1024*1024)
int findMax(const size_t *, size_t);
int replace_most_freq(const char *filename, char c, FILE *destination) {
int result = 1;
FILE *fi ;
size_t freq[256] = { 0 };
size_t dataChunkLength;
long fileLength;
unsigned char *databuff = malloc(BUFFSIZE);
if(!databuff)
{
result = -2;
goto function_exit;
}
fi = fopen(filename, "r");
if (!fi)
{
result = -1;
goto function_exit;
}
if (fseek(fi, 0, SEEK_END) == -1)
{
result = -3;
goto function_exit;
}
fileLength = ftell(fi);
if (fileLength == -1)
{
result = -4;
goto function_exit;
}
if (fseek(fi, 0, SEEK_SET) == -1)
{
result = -3;
goto function_exit;
}
while(fileLength)
{
if(fileLength <= BUFFSIZE) dataChunkLength = fileLength;
else dataChunkLength = BUFFSIZE;
size_t bytesRead = fread(databuff, 1, dataChunkLength, fi);
if(bytesRead != dataChunkLength)
{
if(feof(fi) || ferror(fi))
{
result = -4;
goto function_exit;
}
}
for(size_t index = 0; index < bytesRead; index++)
{
freq[databuff[index]]++;
}
fileLength -= bytesRead;
}
int mostFrequent;
printf("The most freq char is 0x%02x\n", mostFrequent = findMax(freq, 256));
function_exit:
free(databuff);
if (fi) fclose(fi);
return result;
}

How to find words with capital letters in a char using c?

I'm trying to find all the words with capital letters in a string, but am unable to process my data structure. i seem to be able to print out fileContent, indicating that it is loading in successfully, but my second function is not working on the file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char* loadFile(char* fileName)
{
FILE *inputFile;
inputFile = fopen(fileName, "r");
//finds the end of the file
fseek(inputFile, 0, SEEK_END);
//stores the size of the file
int size = ftell(inputFile);
//Sets the scan to the start of the file
fseek(inputFile, 0, SEEK_SET);
char *documentStore = (char*)malloc(size);
int i = 0, c;
while((c = fgetc(inputFile)) != EOF)
{
documentStore[i] = c;
i++;
}
return documentStore;
}
void countImportantWords(char* fileContent, char** importantWords, int* frequencyWords)
{
int uniqueWordCount = 0;
int lengthWordStore = 10;
int i = 0;
int recording = 0;
char wordBuffer[50];
int wordBufferCount = 0;
int isWordPresent = 0;
while(fileContent[i] != EOF)
{
//To allocate more memory incase the structure is full
if(uniqueWordCount == lengthWordStore)
{
lengthWordStore += 10;
char** newWordStore = realloc(importantWords, lengthWordStore * sizeof(char*));
int* newFrequencyStore = realloc(frequencyWords, sizeof(int));
importantWords = newWordStore;
frequencyWords = newFrequencyStore;
}
printf("%s", wordBuffer);
//Conditions to fill if its a word
if(fileContent[i] >= 'A' && fileContent[i] <= 'Z' && recording == 0)
{
wordBuffer[0] = fileContent[i];
recording = 1;
}else if(fileContent[i] >= 'a' && fileContent[i] <= 'z' && recording == 1)
{
//each if is to check if the end of word is reached. Any character that is non alphabetical is considered end of word
wordBufferCount += 1;
wordBuffer[wordBufferCount] = fileContent[i];
} else if (fileContent[i] >= 'A' && fileContent[i] <= 'Z' && recording == 1)
{
wordBufferCount += 1;
wordBuffer[wordBufferCount] = fileContent[i];
} else {
//Adding a terminating character so that it strcpy only copies until that point
wordBuffer[wordBufferCount + 1] = '\0';
recording = 0;
//check to see if that word is in the array already, and if it is, it will just increment the frequency
for(int j = 0; j < uniqueWordCount; j++){
if(strcmp(wordBuffer, importantWords[j]) == 0)
{
frequencyWords[j] += 1;
isWordPresent = 1;
}
}
//if its not present, it should assign it to the structure
if(isWordPresent == 0)
{
char* wordStore = (char*)malloc(wordBufferCount * sizeof(char));
strcpy(wordStore, wordBuffer);
uniqueWordCount += 1;
importantWords[uniqueWordCount] = wordStore;
frequencyWords[uniqueWordCount] = 1;
}
}
i++;
}
}
int main() {
char fileName[50];
char *fileContent;
char **importantWords = (char**)malloc(10*sizeof(char**));
int *frequencyWords = (int*)malloc(10*sizeof(int));
printf("Please input the full file path: ");
scanf("%s", fileName);
fileContent = loadFile(fileName);
countImportantWords(fileContent, importantWords, frequencyWords);
int i = 0;
while(importantWords[i] != '\0')
{
printf("%s %d", importantWords[i], frequencyWords[i]);
i++;
}
return 0;
}
I've put in the full file so you can see how the structure was created incase that it is the issue, but ideally what would happen is that the final loop would print out all the words that are important and they're frequency. Currently i'm getting exit code 11, which i'm not sure what it means, but may be worth mentioning. I'd really appreciate any help :)
You can simplify the process dramatically but utilising functions and learning to manage your memory. I wrote a short example which does not take punctuation into account. It just assumes every word is separated by a space, which you can customise to your discretion.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
char* readfile(char* filename){
char* data = NULL;
FILE* file = fopen(filename, "r");
if(file == NULL){
return NULL;
}
fseek(file, 0, SEEK_END);
long size = ftell(file)+1;
fseek(file, 0, SEEK_SET);
data = (char*)malloc(size);
if(data == NULL){
return NULL;
}
fgets(data, (int)size, file);
return data;
}
typedef struct uppercase_t{
char** word;
int count;
}uppercase;
void copy(uppercase* u,char* token){
size_t length = strlen(token);
u->word[u->count] = (char*)malloc(length+1);
if(u->word[u->count] == NULL){
return;
}
strcpy(u->word[u->count], token);
++u->count;
}
void createuppercasedata(uppercase* u, char* data){
const char delimeter[] = " ";
char* token = strtok(data, delimeter);
if(token == NULL){
return;
}
u->word = (char**)malloc(u->count+1);
if(u->word == NULL){
return;
}
if(isupper(token[0])){
copy(u,token);
}
while(token != NULL){
token = strtok(0, delimeter);
if(token != NULL)
if(isupper(token[0])) {
char** reallocated = (char**)realloc(u->word, u->count+1);
if(reallocated == NULL){
return;
}
u->word = reallocated;
copy(u, token);
}
}
}
void destroyuppercasedata(uppercase* u){
for(int index = 0; index < u->count; ++index){
free(u->word[index]);
}
free(u->word);
}
int main(){
char filename[] = "textfile";
char* data = readfile(filename);
if(data == NULL){
return -1;
}
uppercase u = {0};
createuppercasedata(&u, data);
printf("found %i uppercase words\n",u.count);
for(int index = 0; index < u.count; ++index){
printf("%s\n", u.word[index]);
}
destroyuppercasedata(&u);
free(data);
}
The code will allocate a new pointer for each uppercase and memory for the word to be copied too. It will free all the memory it allocated in the structure with destroyuppercasedata and it will free the initial data that was read from file. Error checking and memory management in C is really important. So utilise those properly.
This was the test file I used.
textfile
How many Uppercase words can Be Found In this text File the answer should be Seven
And this was the output to the terminal:
How
Uppercase
Be
Found
In
File
Seven

INI file reader function is addressing wrong values to my structure, How do I solve this?

This is for a project for university (small replica of a Catan game) and I'm struggling a bit with this part, we have the read an INI file with fairly simple formatting, it only has some comments starting with ';' and then it's just tags with a value in front:
xdim=4
ydim=5
N=D
S=L2
E=S10
W=D
etc...
I have this function to read from an INI file and address the read values to the correct struct element. But it seems like it doesn't even read the file, the struct is a simple struct with xdim and ydim, after I call the func xdim is '&d&d&d&d etc...' and ydim is 0
I've tried placing in some printf's just to see if the values from the INI file itself where being read wrong, but nothing is printed.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAX 128
typedef struct UNIT { /**struct used in an array for the rest of the INI values*/
char N[4];
char S[4];
char W[4];
char E[4];
char Building;
}UNIT;
typedef struct{ /**This is declared in main and passed to the functions*/
UNIT *grid;
unsigned int xdim;
unsigned int ydim;
} MAP_CONFIG;
void set_config_val(MAP_CONFIG *config, const char *key, int val) {
if (config == NULL)
return;
if (strcmp(key, "xdim") == 0){
printf("here");
config->xdim = val;
}
else if (strcmp(key, "ydim") == 0){
printf("here");
config->ydim = val;
}
else{
;
}
}
void read_config(MAP_CONFIG *config,FILE *f) {
char str[MAX];
char *token;
const char *delim = "=\n";
while (1) {
fgets(str, MAX, f);
if(feof(f)!= 0) break;
puts(str);
if (strchr(str, '=')!=NULL) {
char varname[MAX];
int value;
token = strtok(str, delim);
strcpy(varname, token);
token = strtok(NULL, delim);
value = atoi(token);
printf("&d", token);
set_config_val(config, varname, value);
}
}
config = malloc(sizeof(MAP_CONFIG));
config->grid = calloc(config->xdim * config->ydim, sizeof(UNIT));
close(f);
return;
}
open file function:
FILE *openFile(char *nome, char *mode) {
FILE *f;
printf("Opening file %s\n", nome);
f = fopen(nome, mode);
if (f == NULL) {
fprintf(stderr, "*** It was not possible to open the file %s.", nome);
exit(1);
}
return f;
}
test main im using:
int main(int argc, char **argv) {
MAP_CONFIG map;
MAP_CONFIG *mapa = &map;
FILE *f;
char *filename;
for (int i = 0; i < argc; i++)
printf("Parametro %d: %s\n", i, argv[i]);
if (argc >= 2) {
filename = argv[1];
}
else {
printf("Opening base map file..\n");
filename = "mapa.ini";
}
f = openFile(filename, "r");
read_config(mapa, f);
printf("%d %d", map.xdim, map.ydim);
return 0;
}
I just want it to read the xdim and ydim, and then repeat the process to an array of structs for each struct to get the correct value of the N,S,E,W present in the INI file... Help!

How to parse and arrange lines of a csv file based on matching word in C?

I have csv file with below format :
name,birthmonth,country,hobby
jack,jan,england,soccer
roben,july,germany,soccer
emma,dec,china,tennis
yannick,sep,france,music
alex,nov,england,cricket
thomas,apr,germany,tennis
mike,oct,netherlands,cycling
michelle,feb,france,poetry
yui,mar,japan,coding
feng,jun,china,reading
I want to parse this file using C, and put all the lines with same country name in a consecutive manner i.e shown below:
name,birthmonth,country,hobby
jack,jan,england,soccer
alex,nov,england,cricket
roben,july,germany,soccer
thomas,apr,germany,tennis
emma,dec,china,tennis
feng,jun,china,reading
yannick,sep,france,music
michelle,feb,france,poetry
mike,oct,netherlands,cycling
yui,mar,japan,coding
So far, I have tried this code below, however not able to match things properly and proceed further:
#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>
#include<fcntl.h>
#include<string.h>
int main (int argc, char **argv) {
//int line;
char line[200];
char *inputFile = argv[1];
FILE *input_csv_file;
char a,b,c,d,e;
input_csv_file = fopen(inputFile, "rt");
if(input_csv_file ==0) {
printf("Can not open input file \n");
}
else {
//while((line = fgetc(input_csv_file)) != EOF) {
while(fgets(line, sizeof line, input_csv_file) != NULL) {
printf ("line = %s\n", line);
if(sscanf(line, "%s,%s,%s,%s,%s", a,b,c,d,e)) {
//if(sscanf(line, "%[^,], %[^,], %[^,], %[^,], %[^,]", a,b,c,d,e)) {
printf("d=%s\n",d);
}
}
}
return 0;
}
I am a newbie in C/C++. Any help would be much appreciated
Thanks.
I could write the code to get the required output. Below is the code:
#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>
#include<fcntl.h>
#include<string.h>
int main(int argc, char ** argv)
{
struct filedata {
char nation[8];
char content[50];
};
char line[100];
char *inputFile = argv[1];
FILE *input_csv_file;
int iter = 0, c;
char * tok;
int count = 0;
char ch;
char country[] = "country";
char header_line[50];
input_csv_file = fopen(inputFile, "rt");
//count line numbers of the input csv
for(ch = getc(input_csv_file); ch!= EOF; ch=getc(input_csv_file))
if(ch == '\n')
count = count + 1;
fclose(input_csv_file);
count = count -1;
struct filedata * record[count];
input_csv_file = fopen(inputFile, "rt");
if(input_csv_file == 0)
{
printf("Can not open input file\n");
} else
{
while(fgets(line, sizeof line, input_csv_file) != NULL)
{
//printf("-- line = %s\n", line);
int s_line = sizeof line;
char dup_line[s_line];
strcpy(dup_line, line);
int h = 0;
int s_token;
tok = strtok(line, ",");
while(tok != NULL)
{
h++;
if(h == 3)
{
s_token = sizeof tok;
break;
}
tok = strtok(NULL, ",");
}
// skipping the line having column headers
if(compare_col(tok, country) == 0) {
strcpy(header_line, dup_line);
continue;
}
iter++;
c = iter - 1;
record[c] = (struct filedata*)malloc(sizeof(struct filedata));
strcpy(record[c]->nation, tok);
strcpy(record[c]->content, dup_line);
} //while
struct filedata * temp;
FILE * fptr;
fptr = fopen("nation_csv.txt", "w");
if(fptr == NULL)
{
printf("Error in opening the file to write\n");
exit(1);
}
// sorting the arr of struct nation wise
for(iter=1; iter < count; iter++)
for(c =0 ; c < count -1; c++) {
if(strcmp(record[c]->nation, record[c+1]->nation) > 0) {
temp = record[c];
record[c] = record[c+1];
record[c+1] = temp;
}
}
for(iter=0; iter < count; ++iter)
{
if(iter == 0) {
fprintf(fptr, "%s", header_line);
continue;
}
fprintf(fptr, "%s", record[iter]->content);
}
fclose(fptr);
}
fclose(input_csv_file);
}
int compare_col(char a[], char b[] )
{
int c = 0;
while(a[c] == b[c]) {
if(a[c] == '\0' || b[c] == '\0')
break;
c++;
}
if(a[c] == '\0' && b[c] == '\0')
return 0;
else
return -1;
}
Thanks for all your inputs. Any further inputs to make it better are much appreciated.
Thanks

Storing tokens into an array to later pass as parameters

I have a file which I have already tokenized but I need to store each token in an array to later use as parameters. How would i go about doing this?
// Read in File //
FILE *fp;
char buffer[100];
fp = fopen(params, "r");
printf("Here is filename...");
printf("%s\n", params);
fseek(fp, 0, SEEK_END);
//byte_size = ftell(fp);
rewind(fp);
if (fgets(buffer,sizeof(buffer),fp) != NULL)
{
char*p, *b;
b = buffer;
printf("parsing %s", buffer);
while ((p = strsep(&b, ",")) != NULL)
{
printf("param: %s\n",p);
}
}
fclose(fp);
Using linked list and convert it to array later may be good because we don't know how many tokens are there.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char *strsep(char **stringp, const char *delim);
typedef struct node_tag {
char *str;
struct node_tag* next;
} list_node;
list_node* create_node(const char *str) {
list_node* n = malloc(sizeof(list_node));
if (n == NULL) exit(1);
if (str == NULL) {
n->str = NULL;
} else {
n->str = malloc(sizeof(char) * (strlen(str) + 1));
if (n->str == NULL) exit(1);
strcpy(n->str, str);
}
n->next = NULL;
return n;
}
int main(void) {
const char *params = "dummy";
FILE *fp;
char buffer[100];
list_node *head = NULL;
list_node **tail = &head;
unsigned int count = 0;
unsigned int i;
char **array;
fp = stdin;//fopen(params, "r");
printf("Here is filename...");
printf("%s\n", params);
fseek(fp, 0, SEEK_END);
//byte_size = ftell(fp);
rewind(fp);
if (fgets(buffer,sizeof(buffer),fp) != NULL)
{
char*p, *b;
b = buffer;
printf("parsing %s", buffer);
while ((p = strsep(&b, ",")) != NULL)
{
printf("param: %s\n",p);
*tail = create_node(p);
tail = &(*tail)->next;
count++;
}
}
array = malloc(sizeof(char*) * count);
if (array == NULL) return 1;
for (i = 0; i < count && head != NULL; i++) {
list_node *next = head->next;
array[i] = head->str;
// Don't free(head->str) because it is used
free(head);
head = next;
}
for (i = 0; i < count; i++) {
printf("array[%u] = %s\n", i, array[i]);
}
for (i = 0; i < count; i++) free(array[i]);
free(array);
//fclose(fp);
return 0;
}
You can simply use array if you know the number of tokens.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char *strsep(char **stringp, const char *delim);
#define ARRAY_SIZE 8
int main(void) {
const char *params = "dummy";
FILE *fp;
char buffer[100];
char *array[ARRAY_SIZE];
int array_count = 0;
int i;
fp = stdin;//fopen(params, "r");
printf("Here is filename...");
printf("%s\n", params);
fseek(fp, 0, SEEK_END);
//byte_size = ftell(fp);
rewind(fp);
if (fgets(buffer,sizeof(buffer),fp) != NULL)
{
char*p, *b;
b = buffer;
printf("parsing %s", buffer);
while ((p = strsep(&b, ",")) != NULL)
{
printf("param: %s\n",p);
if (array_count < ARRAY_SIZE)
{
array[array_count] = malloc(sizeof(char) * (strlen(p) + 1));
strcpy(array[array_count], p);
array_count++;
}
}
}
for (i = 0; i < array_count; i++) {
printf("array[%u] = %s\n", i, array[i]);
}
for (i = 0; i < array_count; i++) free(array[i]);
//fclose(fp);
return 0;
}

Resources