Freeing copies of pointers to memory allocated by getline() - C

I have written the following function to read lines of a text file and save them into a global array:
/* maximum number of lines allowed in one source file */
#define MAXSRCLNS 100

char *G_source_lines[MAXSRCLNS]; /* source lines */

/* number of source lines saved into G_source_lines */
size_t G_source_lines_count = 0;

/*
 * reads the source file 'path' and puts non-empty lines into the
 * array G_source_lines. increments the number of lines
 * G_source_lines_count for each line saved.
 */
void read_lines(char *path)
{
    FILE *stream;

    stream = fopen(path, "r");
    if (!stream) {
        fprintf(stderr, "can't open source '%s'\n", path);
        exit(EXIT_FAILURE);
    }

    char *lnptr = NULL;
    size_t n = 0;
    while (getline(&lnptr, &n, stream) != -1) {
        /* throw away empty lines */
        if (!isempty(lnptr)) {
            /* assert(("line count too large", G_source_lines_count < MAXSRCLNS)); */
            G_source_lines[G_source_lines_count++] = lnptr;
        } else {
            /* I don't save an empty line in G_source_lines for later
               freeing, so free the allocation right here! */
            free(lnptr);
        }
        lnptr = NULL;
    }
    /* free the lnptr variable defined and allocated on this stack */
    /* don't forget to free its copies in G_source_lines when done with it */
    free(lnptr);
    fclose(stream);
}

void free_source_lines(void)
{
    for (size_t ln = 0; ln < G_source_lines_count; ++ln)
        free(G_source_lines[ln]);
}
I am not sure whether copying the pointers to the memory allocated by getline (saved in lnptr) into G_source_lines makes it necessary to free those copies too, as free_source_lines() does when done with G_source_lines, or whether it is enough to free lnptr once at the end of read_lines().

Yes, it's necessary to free them in free_source_lines().
Since you reset lnptr to NULL before each call to getline(), each call allocates a different buffer and stores its address in lnptr. The call to free(lnptr) at the end only frees the buffer that was allocated during the final, failing call, not any of the buffers whose addresses were saved in G_source_lines.
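If you would rather not track one heap block per line in two places, a variation (just a sketch, reusing the globals and isempty() from the question plus POSIX strdup(); the name read_lines_strdup is mine) is to let getline() reuse a single scratch buffer and copy only the lines you keep. The saved copies still have to be freed one by one in free_source_lines(); only the scratch buffer is freed exactly once:

#define _POSIX_C_SOURCE 200809L   /* for getline() and strdup() */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Sketch only: relies on MAXSRCLNS, G_source_lines, G_source_lines_count and
 * isempty() from the question. The caller opens and closes the stream. */
void read_lines_strdup(FILE *stream)
{
    char *lnptr = NULL;     /* one scratch buffer, reused by getline() */
    size_t n = 0;

    while (getline(&lnptr, &n, stream) != -1) {
        if (!isempty(lnptr) && G_source_lines_count < MAXSRCLNS) {
            char *copy = strdup(lnptr);     /* copy only the lines we keep */
            if (copy)
                G_source_lines[G_source_lines_count++] = copy;
        }
        /* nothing to free here: getline() reuses/resizes the same buffer */
    }
    free(lnptr);            /* the single scratch buffer */
}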


Fastest way to get char* from file in C

I have a function which gets two arguments: char *filedata (stores the whole file) and FILE *fp (the opened file).
void read_file(char *filedata, FILE *fp){
    char buffer[1000];
    while(fgets(buffer, sizeof(buffer), fp))
    {
        char *new_str;
        if((new_str = malloc(strlen(filedata) + strlen(buffer)+1)) != NULL)
        {
            new_str[0] = '\0'; // ensures the memory is an empty string
            strcat(new_str, filedata);
            strcat(new_str, buffer);
        }
        else
        {
            printf("malloc failed!\n");
        }
        strcpy(filedata, new_str);
    }
    fclose(fp);
}
But this isn't too fast... Is there a faster way to read the whole file?
Below's my function illustrating how I usually do it. Not sure how fast it is compared to all other possible C implementations. But I'd imagine they're all pretty similar unless poorly programmed in one way or another, which might lead to slower, less efficient execution.
/* ==========================================================================
 * Function:    readfile ( FILE *ptr, int *nbytes )
 * Purpose:     read open file ptr into internal buffer
 * --------------------------------------------------------------------------
 * Arguments:   ptr (I)     FILE * to already open (via fopen)
 *                          file, whose contents are to be read
 *              nbytes (O)  int * returning #bytes in returned buffer
 * --------------------------------------------------------------------------
 * Returns:     ( unsigned char * )  buffer with ptr's contents
 * --------------------------------------------------------------------------
 * Notes:     o caller should free() returned output buffer ptr when finished
 * ======================================================================= */
/* --- entry point --- */
unsigned char *readfile ( FILE *ptr, int *nbytes ) {
    /* ---
     * allocations and declarations
     * ------------------------------- */
    unsigned char *outbuff = NULL;      /* malloc'ed and realloc'ed below */
    int allocsz=0, reallocsz=500000,    /* total #bytes allocated, #realloc */
        blksz=9900, nread=0,            /* #bytes to read, #actually read */
        buffsz = 0;                     /* total #bytes in buffer */
    /* ---
     * collect all bytes from ptr
     * ----------------------------- */
    if ( ptr != NULL ) {                /* return NULL error if no input */
        while ( 1 ) {                   /* read all input from file */
            if ( buffsz+blksz + 99 >= allocsz ) {  /* first realloc more memory */
                allocsz += reallocsz;   /* add reallocsz to current allocation */
                if ( (outbuff=realloc(outbuff,allocsz)) == NULL ) /* reallocate */
                    goto end_of_job; }  /* quit with NULL ptr if failed */
            nread = fread(outbuff+buffsz,1,blksz,ptr); /* read next block */
            if ( nread < 1 ) break;     /* all done, nothing left to read */
            buffsz += nread;            /* add #bytes from current block */
        } /* --- end-of-while(1) --- */
        fclose(ptr);                    /* close fopen()'ed file ptr */
    } /* --- end-of-if(ptr!=NULL) --- */
end_of_job:
    if ( nbytes != NULL ) *nbytes = buffsz; /* #bytes in outbuff */
    return ( outbuff );                 /* back to caller with output or NULL */
} /* --- end-of-function readfile() --- */
With some caveats, you can read the entire file into an appropriately-sized buffer in one fell swoop using the fread() function.
The following code outlines how to open the file, determine its size, allocate a buffer of that size, then read the file's data (all of it) into that buffer. But note the caveats about the fseek and ftell functions (discussed afterwards):
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    char* filename = "MyFile.txt";            // Or whatever
    FILE* fp = fopen(filename, "rb");         // Open in binary mode
    if (fp == NULL) {
        printf("Cannot open file!\n");
        return 1;
    }
    int seek = fseek(fp, 0, SEEK_END);        // CAVEAT: Files in BINARY mode may not support SEEK_END ...
    if (seek != 0) {
        printf("Cannot fseek on binary file!\n");
        fclose(fp);
        return 1;
    }
    size_t filesize = (size_t)ftell(fp);      // ... but this is not reliable if opened in TEXT mode!
    char* filedata = calloc(filesize + 1, 1); // Add 1 for the terminating "nul" character
    rewind(fp);
    fread(filedata, 1, filesize, fp);         // Read whole file
    // Clean up ...
    fclose(fp);
    free(filedata);
    return 0;
}
Caveats:
Note that files opened in BINARY mode (as in the "rb" mode argument I gave in the fopen() call) are not required to support the SEEK_END origin in calls to fseek(); if this is the case on your platform, then this answer offers some alternatives to determine the file's size. From cppreference:
… Binary streams are not required to support SEEK_END, in
particular if additional null bytes are output.
However, on the other hand, opening the file in TEXT mode (using "rt") will make the call to ftell effectively meaningless, in terms of the required size for your input buffer and the value specified to fread; from cppreference:
If the stream is open in text mode, the value returned by this
function is unspecified and is only meaningful as the input to
fseek().
Also note that, as pointed out in the comments, the fseek() and ftell() functions will fail if the size of the file is larger than the maximum value that can be stored in a long int variable; to handle such cases, you can use the (platform-dependent) 64-bit equivalents, as I described in an answer I posted some time ago.
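For reference, a sketch of that 64-bit-safe size query on POSIX systems, using fseeko()/ftello() and off_t (Windows has _fseeki64()/_ftelli64() as rough equivalents); the helper name file_size is mine, and the SEEK_END caveat above still applies:

#define _POSIX_C_SOURCE 200809L   /* for fseeko()/ftello() */
#define _FILE_OFFSET_BITS 64      /* ask for a 64-bit off_t where supported */
#include <stdio.h>
#include <sys/types.h>

/* Sketch: query the size of an already-open stream without overflowing long.
 * Returns -1 on error. */
off_t file_size(FILE *fp)
{
    if (fseeko(fp, 0, SEEK_END) != 0)
        return -1;
    off_t size = ftello(fp);      /* off_t instead of long */
    rewind(fp);                   /* back to the start for the actual read */
    return size;
}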
Annotating your function (not mentioning the leaks, etc) and counting the operations on the character buffers:
void read_file(char *filedata, FILE *fp){
    char buffer[1000];
    while(fgets(buffer, sizeof(buffer), fp))     // <<-- NEW_SIZE
    {
        char *new_str;
        if((new_str = malloc(strlen(filedata)    // <<-- OLD_SIZE
                           + strlen(buffer)      // <<-- NEW_SIZE
                           + 1)) != NULL)
        {
            new_str[0] = '\0'; // ensures the memory is an empty string
            strcat(new_str, filedata);           // <<-- OLD_SIZE
            strcat(new_str, buffer);             // <<-- OLD_SIZE + NEW_SIZE
        }
        else
        {
            printf("malloc failed!\n");
        }
        strcpy(filedata, new_str);               // <<-- OLD_SIZE + NEW_SIZE
    }
    fclose(fp);
}
fgets(), strlen(), strcat() and strcpy() all need to loop over a character buffer.
Only the fgets() is actually needed, the rest of the copying can be avoided.
Adding up the number of passes over the buffers:

sum of operations per loop: 4 * OLD_SIZE + 4 * NEW_SIZE

Keep in mind that OLD_SIZE is the sum of all the previous NEW_SIZEs, so your function has QUADRATIC behavior with respect to the number of times the loop iterates (basically the number of lines read). So you end up with:

Number of times a character is inspected
    = 4 * N_LINE * LINE_SIZE
    + 2 * (N_LINE * (N_LINE - 1)) * LINE_SIZE

which implies that for a 100-line file you make roughly 20,000 line-sized passes over the string(s).
[this is the "Schlemiel the Painter" story]
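To see the difference, here is a sketch (not a drop-in replacement for the original signature; the name read_file_linear is mine) that remembers where the string currently ends and appends there, growing the buffer with realloc(), so each character is copied a constant number of times:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Sketch: build the file contents in one growing buffer, appending each line
 * at the remembered end instead of re-scanning the whole string every time.
 * Returns a malloc'ed, NUL-terminated buffer the caller must free(), or NULL. */
char *read_file_linear(FILE *fp)
{
    size_t used = 0, cap = 4096;
    char *data = malloc(cap);
    char line[1000];

    if (!data)
        return NULL;
    data[0] = '\0';

    while (fgets(line, sizeof line, fp)) {
        size_t len = strlen(line);              /* NEW_SIZE only */
        if (used + len + 1 > cap) {             /* grow geometrically */
            char *tmp = realloc(data, cap * 2);
            if (!tmp) {
                free(data);
                return NULL;
            }
            data = tmp;
            cap *= 2;
        }
        memcpy(data + used, line, len + 1);     /* append at the known end */
        used += len;                            /* never walk the old data again */
    }
    return data;
}

Here the total work is linear in the file size, because nothing before the remembered end is ever scanned again.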

In C, how would you save different lines of a text file to different variables

How would I save different lines from a text file to different variables of different data types? All of these variables make up a struct (in my example, a Flight struct with the following).
struct Flight
{
    int flightNum;
    char destination[30];
    char departDay[15];
};
An example of the information that I would like to add via the text file would be:
111
NYC
Monday
I obviously want to save the words NYC and Monday to char arrays, but I want to save 111 to an integer variable.
So far I have
while (fscanf(flightInfo, "%s", tempName) != EOF)
{
    fscanf(flightInfo, "%d\n", &tempNum);
    flight.flightNum = tempNum;

    fscanf(flightInfo, "%s\n", tempName);
    strcpy(flight.destination, tempName);

    fscanf(flightInfo, "%s\n", tempName);
    strcpy(flight.departDay, tempName);
}
Assume that flightInfo is the FILE * for the opened file, tempNum is an int, and tempName is a char array.
It sounds like you're on the right track.
What about something like this:
#define MAX_FLIGHTS 100
...
struct Flight flights[MAX_FLIGHTS];
int n_flights = 0;
...
while (!feof(fp) && (n_flights < MAX_FLIGHTS-1))
{
    if (fscanf(fp, "%d\n", &flights[n_flights].flightNum) != 1)
        error_handler();
    if (fscanf(fp, "%29s\n", flights[n_flights].destination) != 1)
        error_handler();
    if (fscanf(fp, "%14s\n", flights[n_flights].departDay) != 1)
        error_handler();
    ++n_flights;
}
...
ADDENDUM:
Per Chux's suggestion, I've modified the code to mitigate potential buffer overruns by setting the scanf maximum string length to 29 (1 less than the char[30] buffer size).
Here is a more detailed explanation:
SonarSource: "scanf()" and "fscanf()" format strings should specify a field width for the "%s" string placeholder
The first question you have to answer is this: how important is it for the file to be readable by people, or on other platforms?
If it isn't that important, then I recommend serializing with fwrite() and fread(). That is easier to code for each record, and - as long as your structs are all the same size - allows O(1) access to any record in the file.
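A sketch of that record-oriented approach, assuming the struct Flight from the question (the helper names and error handling here are illustrative):

#include <stdio.h>

/* Sketch: fixed-size record I/O for struct Flight (defined in the question).
 * save_flights() writes an array of records; load_flight() reads record k
 * directly with fseek() + fread(), which is the O(1) access mentioned above. */
size_t save_flights(const char *path, const struct Flight *v, size_t n)
{
    FILE *fp = fopen(path, "wb");
    if (!fp)
        return 0;
    size_t written = fwrite(v, sizeof *v, n, fp);   /* n whole records */
    fclose(fp);
    return written;
}

int load_flight(const char *path, size_t k, struct Flight *out)
{
    FILE *fp = fopen(path, "rb");
    if (!fp)
        return -1;
    int ok = fseek(fp, (long)(k * sizeof *out), SEEK_SET) == 0
          && fread(out, sizeof *out, 1, fp) == 1;   /* read record k only */
    fclose(fp);
    return ok ? 0 : -1;
}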
If you do want to store these as individual lines, the best way to read a line in from a file is with fgets().
Pseudocode follows:
typedef struct flight {
    int flightNum;
    char destination[30];
    char departDay[15];
} flight;

typedef struct flightSet {
    flight *flights;
    size_t n;       /* number of flights */
    size_t nAlloc;  /* number of flights you have space for */
} flightSet;

#define FLIGHTSET_INIT_SIZE 16
#define MAX_LINE_LENGTH 128
#define FILENAME "file.txt"

// Create a new flightSet, calling it F
// Allocate FLIGHTSET_INIT_SIZE number of flight structures for F->flights
// Set F->n to 0
// Set F->nAlloc to FLIGHTSET_INIT_SIZE

/* Set up other variables */
size_t i = 0;                  // loop iterator
char buffer[MAX_LINE_LENGTH];  // for reading with fgets()
flight *temp;                  // for realloc()ing when we have more flights to read
                               // after reaching nAlloc flights
char *endptr;                  // for using strtol() to get a number from buffer
FILE *fp;                      // for reading from the file

// Open FILENAME with fp for reading

// MAIN LOOP
// If i == F->nAlloc, use realloc() to double the allocation of F->flights
// If successful, double F->nAlloc

if (fgets(buffer, MAX_LINE_LENGTH, fp) == NULL) {
    // End of file
    // Use break to get out of the main loop
}

F->flights[i].flightNum = (int)strtol(buffer, &endptr, 10);
if (endptr == buffer) {
    // The first invalid character that can't be converted to a number is at the very beginning
    // of the buffer, so this is not a valid numerical character and your data file is corrupt
    // Print out an error message
    break;
}

if (fgets(buffer, MAX_LINE_LENGTH, fp) == NULL) {
    // End of file when expecting new line; file format error
    // Use break to get out of the main loop
} else {
    // Copy into the fixed-size field (strdup() would need char * members instead of arrays)
    snprintf(F->flights[i].destination, sizeof F->flights[i].destination, "%s", buffer);
    // Check for truncation / strip the trailing newline as needed
}

if (fgets(buffer, MAX_LINE_LENGTH, fp) == NULL) {
    // End of file when expecting new line; file format error
    // Use break to get out of the main loop
} else {
    // Copy into the fixed-size field (strdup() would need char * members instead of arrays)
    snprintf(F->flights[i].departDay, sizeof F->flights[i].departDay, "%s", buffer);
    // Check for truncation / strip the trailing newline as needed
}

// If you've gotten here so far without errors, great!
// Increment F->n to reflect the number of successful records we have in F.
// Increment i, the loop iterator

// Final cleanup. Should include closing the file, and freeing any allocated
// memory that didn't end up in a valid record.
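Filled in just enough to compile, the pseudocode above might look roughly like this (a sketch: file.txt stands in for FILENAME, the chomp() helper is mine, and error handling is trimmed to the essentials):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_LINE_LENGTH 128

typedef struct flight {
    int flightNum;
    char destination[30];
    char departDay[15];
} flight;

static void chomp(char *s) { s[strcspn(s, "\r\n")] = '\0'; } /* strip newline */

int main(void)
{
    FILE *fp = fopen("file.txt", "r");          /* hypothetical file name */
    if (!fp) { perror("fopen"); return 1; }

    size_t n = 0, nAlloc = 16;
    flight *flights = malloc(nAlloc * sizeof *flights);
    char buf[MAX_LINE_LENGTH], *end;

    while (flights && fgets(buf, sizeof buf, fp)) {         /* line 1: number */
        if (n == nAlloc) {                                  /* grow as needed */
            flight *tmp = realloc(flights, 2 * nAlloc * sizeof *flights);
            if (!tmp) break;
            flights = tmp;
            nAlloc *= 2;
        }
        flights[n].flightNum = (int)strtol(buf, &end, 10);
        if (end == buf) break;                              /* corrupt record */

        if (!fgets(buf, sizeof buf, fp)) break;             /* line 2: destination */
        chomp(buf);
        snprintf(flights[n].destination, sizeof flights[n].destination, "%s", buf);

        if (!fgets(buf, sizeof buf, fp)) break;             /* line 3: day */
        chomp(buf);
        snprintf(flights[n].departDay, sizeof flights[n].departDay, "%s", buf);
        n++;
    }

    for (size_t i = 0; i < n; i++)
        printf("%d %s %s\n", flights[i].flightNum,
               flights[i].destination, flights[i].departDay);
    free(flights);
    fclose(fp);
    return 0;
}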

A more elegant way to parse

I'm kind of new to C.
I need to write a small function that opens a configuration file that has 3 lines, each of which contains a path to a file/directory that I need to extract.
I wrote this program and it seem to work:
void readCMDFile(char* cmdFile, char directoryPath[INPUT_SIZE], char inputFilePath[INPUT_SIZE], char outputFilePath[INPUT_SIZE]) {
    //open files
    int file = open(cmdFile, O_RDONLY);
    if (file < 0) {
        handleFailure();
    }
    char buffer[BUFF_SIZE];
    int status;
    int count;
    while ((count = read(file, buffer, sizeof(buffer))) > 0)
    {
        int updateParam = UPDATE1;
        int i, j;
        i = 0;
        j = 0;
        for (; i < count; i++) {
            if (buffer[i] != '\n' && buffer[i] != SPACE && buffer[i] != '\0') {
                switch (updateParam) {
                    case UPDATE1:
                        directoryPath[j] = buffer[i];
                        break;
                    case UPDATE2:
                        inputFilePath[j] = buffer[i];
                        break;
                    case UPDATE3:
                        outputFilePath[j] = buffer[i];
                        break;
                }
                j++;
            } else {
                switch (updateParam) {
                    case UPDATE1:
                        updateParam = UPDATE2;
                        j = 0;
                        break;
                    case UPDATE2:
                        updateParam = UPDATE3;
                        j = 0;
                        break;
                }
            }
        }
    }
    if (count < 0) {
        handleFailure();
    }
}
but it is incredibly unintuitive and pretty ugly, so I thought there must be a more elegant way to do it. Are there any suggestions?
Thanks!
Update: the config file content will look like this:
/home/bla/dirname
/home/bla/bla/file1.txt
/home/bla/bla/file2.txt
Your question isn't really about parsing the contents of the file; it is simply about reading the lines of the file into adequate storage within a function, in a manner that lets the object containing the stored lines be returned to the calling function. This is fairly standard, but you have a number of ways to approach it.
The biggest consideration is not knowing the length of the lines to be read. You say there are currently 3 lines to be read, but there isn't any need to know beforehand how many lines there are (by knowing, you can avoid realloc, but that is about the only savings).
You want to create as robust and flexible a method as you can for reading the lines and storing them in a way that allocates just enough memory to hold what is read. A good approach is to declare a fixed-size temporary buffer to hold each line read from the file with fgets, and then to call strlen on the buffer to determine the number of characters required (as well as trimming the trailing newline included by fgets). Since you are reading path information, the predefined macro PATH_MAX can be used to size your temporary buffer so that it can hold the maximum path length usable by the system. You could also use POSIX getline instead of fgets, but we will stick to the C standard library for now.
The basic type that will allow you to allocate storage for multiple lines in your function and return a single pointer you can use in the calling function is char ** (a pointer to pointer to char -- or, loosely, a dynamic array of pointers). The scheme is simple: you allocate some initial number of pointers (3 in your case) and then loop over the file, reading a line at a time, getting the length of the line, and then allocating length + 1 characters of storage to hold the line. For example, if you allocate 3 pointers with:
#define NPATHS 3
...
char **readcmdfile (FILE *fp, size_t *n)
{
    ...
    char buf[PATH_MAX] = "";    /* temp buffer to hold line */
    char **paths = NULL;        /* pointer to pointer to char to return */
    size_t idx = 0;             /* index counter (avoids dereferencing) */
    ...
    paths = calloc (NPATHS, sizeof *paths);  /* allocate NPATHS pointers */
    if (!paths) {               /* validate allocation/handle error */
        perror ("calloc-paths");
        return NULL;
    }
    ...
    while (idx < NPATHS && fgets (buf, sizeof buf, fp)) {
        size_t len = strlen (buf);          /* get length of string in buf */
        ...
        paths[idx] = malloc (len + 1);      /* allocate storage for line */
        if (!paths[idx]) {                  /* validate allocation */
            perror ("malloc-paths[idx]");   /* handle error */
            return NULL;
        }
        strcpy (paths[idx++], buf);         /* copy buffer to paths[idx] */
    ...
    return paths;   /* return paths */
}
(note: you can eliminate the idx < NPATHS limit if you include the check before allocating each string and realloc more pointers as required; a sketch of that growth check follows)
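That growth check could be factored into a small helper along these lines (a sketch; the name grow_paths is mine, and zeroing the new slots keeps the unused pointers NULL):

#include <stdlib.h>
#include <string.h>

/* Sketch: double the pointer array when it fills, zeroing the new slots.
 * Returns the (possibly moved) array, or NULL on failure, in which case the
 * caller still owns the original 'paths'. */
char **grow_paths (char **paths, size_t *nptrs)
{
    char **tmp = realloc (paths, 2 * *nptrs * sizeof *paths);
    if (!tmp)                                        /* validate reallocation */
        return NULL;
    memset (tmp + *nptrs, 0, *nptrs * sizeof *tmp);  /* zero the new pointers */
    *nptrs *= 2;
    return tmp;
}

Inside the read loop you would then call it whenever idx reaches the current allocation instead of stopping at NPATHS.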
The remainder is just the handling of opening the file and passing the open file stream to your function. A basic approach is to provide the filename on the command line and open it with fopen (or read from stdin by default if no filename is given). As with every step in your program, you need to validate the return and handle any error to avoid processing garbage (and invoking Undefined Behavior).
A simple example would be:
int main (int argc, char **argv) {

    char **paths;       /* pointer to pointer to char for paths */
    size_t i, n = 0;    /* counter and n - number of paths read */
    /* open file given by 1st argument (or read stdin by default) */
    FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;

    if (!fp) {          /* validate file open for reading */
        perror ("fopen-failed");
        return 1;
    }

    paths = readcmdfile (fp, &n);   /* call function to read file */
                                    /* passing open file pointer  */
    if (!paths) {       /* validate return from function */
        fprintf (stderr, "error: readcmdfile failed.\n");
        return 1;
    }

    for (i = 0; i < n; i++) {       /* output lines read from file */
        printf ("path[%lu]: %s\n", i + 1, paths[i]);
        free (paths[i]);            /* free memory holding line */
    }
    free (paths);                   /* free pointers */

    return 0;
}
Putting all the pieces together, adding the code to trim the '\n' read and included in buf by fgets, and adding an additional test to make sure the line you read actually fit in buf, you could do something like this:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>     /* for PATH_MAX */

#define NPATHS 3

/* read lines from file, return pointer to pointer to char on success
 * otherwise return NULL. 'n' will contain number of paths read from file.
 */
char **readcmdfile (FILE *fp, size_t *n)
{
    char buf[PATH_MAX] = "";    /* temp buffer to hold line */
    char **paths = NULL;        /* pointer to pointer to char to return */
    size_t idx = 0;             /* index counter (avoids dereferencing) */

    *n = 0;                     /* zero the pointer passed as 'n' */

    paths = calloc (NPATHS, sizeof *paths);  /* allocate NPATHS pointers */
    if (!paths) {               /* validate allocation/handle error */
        perror ("calloc-paths");
        return NULL;
    }

    /* read while index < NPATHS & good read into buf
     * (note: instead of limiting to NPATHS - you can simply realloc paths
     *  when idx == NPATHS -- but that is for later)
     */
    while (idx < NPATHS && fgets (buf, sizeof buf, fp)) {
        size_t len = strlen (buf);          /* get length of string in buf */

        if (len && buf[len - 1] == '\n')    /* validate last char is '\n' */
            buf[--len] = 0;                 /* overwrite '\n' with '\0' */
        else if (len == PATH_MAX - 1) {     /* check buffer full - line too long */
            fprintf (stderr, "error: path '%lu' exceeds PATH_MAX.\n", idx);
            return NULL;
        }

        paths[idx] = malloc (len + 1);      /* allocate storage for line */
        if (!paths[idx]) {                  /* validate allocation */
            perror ("malloc-paths[idx]");   /* handle error */
            return NULL;
        }
        strcpy (paths[idx++], buf);         /* copy buffer to paths[idx] */
    }

    *n = idx;       /* update 'n' to contain index - no. of lines read */

    return paths;   /* return paths */
}

int main (int argc, char **argv) {

    char **paths;       /* pointer to pointer to char for paths */
    size_t i, n = 0;    /* counter and n - number of paths read */
    /* open file given by 1st argument (or read stdin by default) */
    FILE *fp = argc > 1 ? fopen (argv[1], "r") : stdin;

    if (!fp) {          /* validate file open for reading */
        perror ("fopen-failed");
        return 1;
    }

    paths = readcmdfile (fp, &n);   /* call function to read file */
                                    /* passing open file pointer  */
    if (!paths) {       /* validate return from function */
        fprintf (stderr, "error: readcmdfile failed.\n");
        return 1;
    }

    for (i = 0; i < n; i++) {       /* output lines read from file */
        printf ("path[%lu]: %s\n", i + 1, paths[i]);
        free (paths[i]);            /* free memory holding line */
    }
    free (paths);                   /* free pointers */

    return 0;
}
(note: if you allocate memory -- it is up to you to preserve a pointer to the beginning of each block -- so it can be freed when it is no longer needed)
Example Input File
$ cat paths.txt
/home/bla/dirname
/home/bla/bla/file1.txt
/home/bla/bla/file2.txt
Example Use/Output
$ ./bin/readpaths <paths.txt
path[1]: /home/bla/dirname
path[2]: /home/bla/bla/file1.txt
path[3]: /home/bla/bla/file2.txt
As you can see, the function has simply read each line of the input file: it allocated 3 pointers, allocated storage for each line, assigned the address of each block to the corresponding pointer, and then returned a pointer to the collection to main(), where it is assigned to paths. Look things over and let me know if you have further questions.
I recommend looking into regular expressions. That way you read everything, then match with regular expressions and handle your matches.
Regular expressions exist for this purpose: to make parsing elegant.
If I were you, I would create a method for the if/else blocks. I feel like they're redundant.
switch (updateParam) {
    case UPDATE1:
        method(); /* do if/else here */
        break;
    ...............
    ...............
}
However, you can still keep them inline if you do not need the method elsewhere and you are concerned about performance, since a function call costs more than the same instructions written inline.
In your program, you are passing 3 arrays of char to store the 3 lines read from the file. But this is very inefficient, as the input file may contain more lines, and in the future you may have the requirement to read more than 3 lines from the file. Instead, you can pass an array of char pointers, allocate memory to them, and copy into that memory the content of the lines read from the file. As pointed out by Jonathan (in a comment), if you use standard I/O then you can use a function like fgets() to read lines from the input file.
Read a line from the file, allocate memory to the pointer, and copy the line read from the file into it. If a line is too long to read in one go, you can read the remaining part in consecutive calls to fgets() and use realloc to expand the existing memory the pointer is pointing to, so that it is large enough to accommodate the rest of the line.
Putting this all together, you can do:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BUF_SZ 100
#define MAX_LINES 3  /* Maximum number of lines to be read from file */

int readCMDFile(const char* cmdFile, char *paths[MAX_LINES]) {
    int count, next_line, line_cnt, new_line_found;
    char tmpbuf[BUF_SZ];
    FILE *fp;

    fp = fopen(cmdFile, "r");
    if (fp == NULL) {
        perror ("Failed to open file");
        return -1;
    }

    next_line = 1;  /* Keep track of next line */
    count = 1;      /* Used to calculate the size of memory, if we need to reallocate
                     * in case a line in the file is too long to read in one go */
    line_cnt = 0;   /* Keep track of index of array of char pointer */
    new_line_found = 0;

    while ((line_cnt < MAX_LINES) && (fgets (tmpbuf, BUF_SZ, fp) != NULL)) {
        if (tmpbuf[strlen(tmpbuf) - 1] == '\n') {
            tmpbuf[strlen(tmpbuf) - 1] = '\0';
            new_line_found = 1;
        } else {
            new_line_found = 0;
        }

        if (next_line) {
            paths[line_cnt] = calloc (sizeof (tmpbuf), sizeof (char));
            if (paths[line_cnt] == NULL) {
                perror ("Failed to allocate memory");
                return -1;
            }
            next_line = 0;
            count = 1;
        } else {
            char *ptr = realloc (paths[line_cnt], sizeof (tmpbuf) * (++count));
            if (ptr == NULL) {
                free (paths[line_cnt]);
                perror ("Failed to reallocate memory");
                return -1;
            } else {
                paths[line_cnt] = ptr;
            }
        }

        /* Using strcat to copy the buffer to allocated memory because
         * calloc initializes the block of memory with zero, so it will
         * be the same as strcpy when copying the content of the buffer to
         * the allocated memory for the first time, and fgets adds a
         * terminating null-character to the buffer, so it will concatenate
         * the content of the buffer to the allocated memory in the case
         * where the pointer has been reallocated */
        strcat (paths[line_cnt], tmpbuf);

        if (new_line_found) {
            line_cnt++;
            next_line = 1;
        }
    }

    fclose(fp);
    return line_cnt;
}
int main(void) {
    int lines_read, index;
    const char *file_name = "cmdfile.txt";
    char *paths[MAX_LINES] = {NULL};

    lines_read = readCMDFile(file_name, paths);
    if (lines_read < 0) {
        printf ("Failed to read file %s\n", file_name);
    }

    /* Check the output */
    for (index = 0; index < lines_read; index++) {
        printf ("Line %d: %s\n", index, paths[index]);
    }

    /* Free the allocated memory */
    for (index = 0; index < lines_read; index++) {
        free (paths[index]);
        paths[index] = NULL;
    }

    return 0;
}
Output:
$ cat cmdfile.txt
/home/bla/dirname
/home/bla/bla/file1.txt
/home/bla/bla/file2.txt
$ ./a.out
Line 0: /home/bla/dirname
Line 1: /home/bla/bla/file1.txt
Line 2: /home/bla/bla/file2.txt
Note that the above program does not take care of empty lines in the file, as that was not mentioned in the question. But if you want, you can add that check just after removing the trailing newline character from the line read from the file; a sketch of that check follows.
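That check could be as small as this (a sketch against the loop above), placed right after the newline is stripped and before the if (next_line) allocation block:

/* skip a truly blank line: nothing to allocate or store for it */
if (next_line && new_line_found && tmpbuf[0] == '\0')
    continue;

The next_line test makes sure a long line that happens to end exactly at the buffer boundary is still terminated properly on the following fgets() call.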

How to read in two text files and count the amount of keywords?

I have tried looking around, but to me files are the hardest thing to understand so far as I am learning C -- especially text files; binary files were a bit easier. Basically I have to read in two text files, both of which contain words formatted like this: "hard, working, smart, works well, etc.". I am supposed to compare the text files and count the keywords. I would show some code, but honestly I am lost and the only thing I have down is just nonsense besides this.
#include <time.h>
#include <stdlib.h>
#include <stdio.h>

#define SIZE 1000

void resumeRater();

int main()
{
    int i;
    int counter = 0;
    char array[SIZE];
    char keyword[SIZE];
    FILE *fp1, *fp2;
    int ch1, ch2;

    errno_t result1 = fopen_s(&fp1, "c:\\myFiles\\resume.txt", "r");
    errno_t result2 = fopen_s(&fp2, "c:\\myFiles\\ideal.txt", "r");

    if (fp1 == NULL) {
        printf("Failed to open");
    }
    else if (fp2 == NULL) {
        printf("Failed to open");
    }
    else {
        result1 = fread(array, sizeof(char), 1, fp1);
        result2 = fread(keyword, sizeof(char), 1, fp2);
        for (i = 0; i < SIZE; i++)
        {
            if (array[i] == keyword[i])
            {
                counter++;
            }
        }
        fclose(fp1);
        fclose(fp2);
        printf("Character match: %d", counter);
    }
    system("pause");
}
When you have a situation where you are doing a multiple of something (like reading 2 files), it makes a lot of sense to plan ahead. Rather than muddying the body of main with all the code necessary to read 2 text files, create a function that reads the text file for you and have it return an array containing the lines of the file. This really helps you concentrate on the logic of what your code needs to do with the lines rather than filling space with getting the lines in the first place. Now there is nothing wrong with cramming it all into one long main, but from a readability, maintenance, and program structure standpoint, it makes everything more difficult.
If you structure the read function well, you can reduce your main to the following. This reads both text files into character arrays and provides the number of lines read, in a total of 4 lines (plus the check to make sure you provided two filenames to read):
int main (int argc, char **argv) {

    if (argc < 3 ) {
        fprintf (stderr, "error: insufficient input, usage: %s <filename1> <filename2>\n", argv[0]);
        return 1;
    }

    size_t file1_size = 0;  /* placeholders to be filled by readtxtfile */
    size_t file2_size = 0;  /* for general use, not needed to iterate   */

    /* read each file into an array of strings,
       number of lines read, returned in file_size */
    char **file1 = readtxtfile (argv[1], &file1_size);
    char **file2 = readtxtfile (argv[2], &file2_size);

    return 0;
}
At that point you have all your data and you can work on your keyword code. Reading from text files is a very simple matter; you just have to get comfortable with the tools available. When reading lines of text, the preferred approach is to use line-input to read an entire line at a time into a buffer. You then parse the buffer to get what you need. The line-input tools are fgets and getline. Once you have read the line, you have tools like strtok, strsep or sscanf to separate what you want from it. Both fgets and getline read the newline at the end of each line as part of their input, so you may need to remove the newline to meet your needs.
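Stripping that newline can be wrapped in a tiny helper, for example (a sketch; the name trim_newline is mine):

#include <string.h>

/* Sketch: trim the '\n' (and a possible '\r') that fgets()/getline() leave in
 * place. strcspn() returns the index of the first '\r' or '\n', or the index
 * of the terminating '\0' if there is none, so the last line is handled too. */
static void trim_newline (char *s)
{
    s[strcspn (s, "\r\n")] = '\0';
}

Call it on the buffer right after each successful fgets or getline.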
Storing each line read is generally done by declaring a pointer to an array of char* pointers (e.g. char **file1;). You then allocate memory for some initial number of pointers (NMAX in the example below). You then access the individual lines in the file as file1[n], where n is the line index, 0 through the last line of the file. If you have a large file and exceed the number of pointers you originally allocated, you simply reallocate additional pointers for your array with realloc. (you can set NMAX to 1 to make this happen for every line)
What you use to allocate memory and how you reallocate can influence how you make use of the arrays in your program. Careful choices of calloc to initially allocate your arrays, and then using memset when you reallocate to set all unused pointers to 0 (null), can really save you time and headache. Why? Because to iterate over your array, all you need to do is:
n = 0;
while (file1[n]) {
    <do something with file1[n]>;
    n++;
}
When you reach the first unused pointer (i.e. the first file1[n] that is 0), the loop stops.
Another very useful function when reading text files is strdup (char *line). strdup will automatically allocate space for line using malloc, copy line to the newly allocated memory, and return a pointer to the new block of memory. This means that all you need to do to allocate space for each pointer and copy the line read by getline into your array is:
file1[n] = strdup (line);
That's pretty much it. You have read your file and filled your array, and you know how to iterate over each line in the array. What is left is cleaning up and freeing the allocated memory when you no longer need it. By making sure that your unused pointers are 0, this too is a snap. You simply iterate over your file1[n] pointers again, freeing them as you go, and then free (file1) at the end. You're done.
This is a lot to take in, and there are a few more things to it. On the initial read of the file, if you noticed, we also declare a file1_size = 0; variable, and pass its address to the read function:
char **file1 = readtxtfile (argv[1], &file1_size);
Within readtxtfile, the value at the address of file1_size is incremented by 1 each time a line is read. When readtxtfile returns, file1_size contains the number of lines read. As shown, this is not needed to iterate over the file1 array, but you often need to know how many lines you have read.
To put this all together, I created a short example of the functions to read two text files, print the lines in both and free the memory associated with the file arrays. This explanation ended up longer than I anticipated, so take time to understand how it works and you will be a step closer to handling text files easily. The code below will take 2 filenames as arguments (e.g. ./progname file1 file2). Compile it with something similar to gcc -Wall -Wextra -o progname srcfilename.c:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NMAX 256

char **readtxtfile (char *fn, size_t *idx);
char **realloc_char (char **p, size_t *n);
void prn_chararray (char **ca);
void free_chararray (char **ca);

int main (int argc, char **argv) {

    if (argc < 3 ) {
        fprintf (stderr, "error: insufficient input, usage: %s <filename1> <filename2>\n", argv[0]);
        return 1;
    }

    size_t file1_size = 0;  /* placeholders to be filled by readtxtfile */
    size_t file2_size = 0;  /* for general use, not needed to iterate   */

    /* read each file into an array of strings,
       number of lines read, returned in file_size */
    char **file1 = readtxtfile (argv[1], &file1_size);
    char **file2 = readtxtfile (argv[2], &file2_size);

    /* simple print function */
    if (file1) prn_chararray (file1);
    if (file2) prn_chararray (file2);

    /* simple free memory function */
    if (file1) free_chararray (file1);
    if (file2) free_chararray (file2);

    return 0;
}

char** readtxtfile (char *fn, size_t *idx)
{
    if (!fn) return NULL;   /* validate filename provided       */

    char *ln = NULL;        /* NULL forces getline to allocate  */
    size_t n = 0;           /* max chars to read (0 - no limit) */
    ssize_t nchr = 0;       /* number of chars actually read    */
    size_t nmax = NMAX;     /* check for reallocation           */
    char **array = NULL;    /* array to hold lines read         */
    FILE *fp = NULL;        /* file pointer to open file fn     */

    /* open / validate file */
    if (!(fp = fopen (fn, "r"))) {
        fprintf (stderr, "%s() error: file open failed '%s'.", __func__, fn);
        return NULL;
    }

    /* allocate NMAX pointers to char* */
    if (!(array = calloc (NMAX, sizeof *array))) {
        fprintf (stderr, "%s() error: memory allocation failed.", __func__);
        return NULL;
    }

    /* read each line from fp - dynamically allocated */
    while ((nchr = getline (&ln, &n, fp)) != -1)
    {
        /* strip newline or carriage rtn */
        while (nchr > 0 && (ln[nchr-1] == '\n' || ln[nchr-1] == '\r'))
            ln[--nchr] = 0;

        array[*idx] = strdup (ln);  /* allocate/copy ln to array        */
        (*idx)++;                   /* increment value at index         */

        if (*idx == nmax)           /* if lines exceed nmax, reallocate */
            array = realloc_char (array, &nmax);
    }

    if (ln) free (ln);      /* free memory allocated by getline */
    if (fp) fclose (fp);    /* close open file descriptor       */

    return array;
}

/* print an array of character pointers. */
void prn_chararray (char **ca)
{
    register size_t n = 0;
    while (ca[n])
    {
        printf (" arr[%3zu] %s\n", n, ca[n]);
        n++;
    }
}

/* free array of char* */
void free_chararray (char **ca)
{
    if (!ca) return;
    register size_t n = 0;
    while (ca[n])
        free (ca[n++]);
    free (ca);
}

/* realloc an array of pointers to strings setting memory to 0.
 * reallocate an array of character arrays setting
 * newly allocated memory to 0 to allow iteration
 */
char **realloc_char (char **p, size_t *n)
{
    char **tmp = realloc (p, 2 * *n * sizeof *p);
    if (!tmp) {
        fprintf (stderr, "%s() error: reallocation failure.\n", __func__);
        // return NULL;
        exit (EXIT_FAILURE);
    }
    p = tmp;
    memset (p + *n, 0, *n * sizeof *p); /* memset new ptrs 0 */
    *n *= 2;

    return p;
}
valgrind - Don't Forget To Check For Leaks
Lastly, anytime you allocate memory in your code, make sure you use a memory checker such as valgrind to confirm you have no memory errors and to confirm you have no memory leaks (i.e. allocated blocks you have forgotten to free, or that have become unreachable). valgrind is simple to use, just valgrind ./progname [any arguments]. It can provide a wealth of information. For example, on this read example:
$ valgrind ./bin/getline_readfile_fn voidstruct.c wii-u.txt
==14690== Memcheck, a memory error detector
==14690== Copyright (C) 2002-2012, and GNU GPL'd, by Julian Seward et al.
==14690== Using Valgrind-3.8.1 and LibVEX; rerun with -h for copyright info
==14690== Command: ./bin/getline_readfile_fn voidstruct.c wii-u.txt
==14690==
<snip - program output>
==14690==
==14690== HEAP SUMMARY:
==14690== in use at exit: 0 bytes in 0 blocks
==14690== total heap usage: 61 allocs, 61 frees, 6,450 bytes allocated
==14690==
==14690== All heap blocks were freed -- no leaks are possible
==14690==
==14690== For counts of detected and suppressed errors, rerun with: -v
==14690== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 2 from 2)
Pay particular attention to the lines:
==14690== All heap blocks were freed -- no leaks are possible
and
==14690== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 2 from 2)
You can ignore the (suppressed: 2 from 2), which just indicates that I don't have the development files installed for libc.

Reading text file into an array of lines in C

Using C, I would like to read in the contents of a text file in such a way that, when all is said and done, I have an array of strings, with the nth string representing the nth line of the text file. The lines of the file can be arbitrarily long.
What's an elegant way of accomplishing this? I know of some neat tricks to read a text file directly into a single appropriately sized buffer, but breaking it down into lines makes it trickier (at least as far as I can tell).
Thanks very much!
Breaking it down into lines means parsing the text and replacing all the EOL (by EOL I mean \n and \r) characters with 0.
In this way you can actually reuse your buffer and store just the beginning of each line into a separate char * array (all by doing only 2 passes).
This way you could do one read for the whole file plus 2 parsing passes, which would probably improve performance.
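A sketch of that two-pass idea, assuming the whole file is already in a NUL-terminated buffer (the function name split_lines is mine):

#include <stdlib.h>

/* Sketch: split an in-memory, NUL-terminated file image into lines, in place.
 * Pass 1 counts EOL characters so the pointer array is always big enough
 * (it may over-allocate for "\r\n" files); pass 2 stores each line's start
 * and overwrites the EOL characters with '\0'. The returned pointers point
 * into 'data' itself, so only the returned array needs to be freed. */
char **split_lines(char *data, size_t *nlines)
{
    size_t count = 1;                       /* room for a last line with no EOL */
    for (char *p = data; *p; p++)           /* pass 1: count line endings */
        if (*p == '\n' || *p == '\r')
            count++;

    char **lines = malloc(count * sizeof *lines);
    if (!lines)
        return NULL;

    size_t n = 0;
    char *start = data;
    for (char *p = data; *p; p++) {         /* pass 2: terminate and record */
        if (*p == '\n' || *p == '\r') {
            int was_cr = (*p == '\r');
            *p = '\0';                      /* replace the EOL with 0 */
            if (was_cr && *(p + 1) == '\n') /* swallow the '\n' of a "\r\n" pair */
                *++p = '\0';
            lines[n++] = start;
            start = p + 1;
        }
    }
    if (*start)                             /* last line without a trailing EOL */
        lines[n++] = start;

    *nlines = n;
    return lines;
}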
It's possible to read the number of lines in the file (loop fgets), then create a 2-dimensional array with the first dimension being the number of lines+1. Then, just re-read the file into the array.
You'll need to define the length of the elements, though. Or, do a count for the longest line size.
Example code:
inFile = fopen(FILENAME, "r");

lineCount = 0;
while(inputError != EOF) {
    inputError = fscanf(inFile, "%s\n", word);
    lineCount++;
}
fclose(inFile);

// Above iterates lineCount++ after the EOF to allow for an array
// that matches the line numbers

char names[lineCount][MAX_LINE];

inFile = fopen(FILENAME, "r");
for(i = 1; i < lineCount; i++)
    fscanf(inFile, "%s", names[i]);
fclose(inFile);
For C (as opposed to C++), you'd probably wind up using fgets(). However, you might run into issues due to your arbitrary length lines.
Perhaps a Linked List would be the best way to do this?
The compiler won't like having an array with no idea how big to make it. With a Linked List you can have a really large text file, and not worry about allocating enough memory to the array.
Unfortunately, I haven't learned how to do linked lists, but maybe somebody else could help you.
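For illustration, a minimal sketch of that idea -- a singly linked list where each node owns one heap-allocated line (the names here are mine):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct line_node {
    char *text;
    struct line_node *next;
};

/* Sketch: append a copy of 's' after 'tail' (or start the list if tail is NULL).
 * Returns the new tail, or NULL if allocation fails. */
static struct line_node *append_line(struct line_node **head,
                                     struct line_node *tail, const char *s)
{
    struct line_node *node = malloc(sizeof *node);
    if (!node)
        return NULL;
    node->text = malloc(strlen(s) + 1);
    if (!node->text) { free(node); return NULL; }
    strcpy(node->text, s);
    node->next = NULL;
    if (tail)
        tail->next = node;
    else
        *head = node;
    return node;
}

Reading the file would then be a loop of fgets() or getline() into a temporary buffer followed by append_line(), and freeing the lines is a single walk over the list.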
If you have a good way to read the whole file into memory, you are almost there. After you've done that you could scan the file twice. Once to count the lines, and once to set the line pointers and replace '\n' and (and maybe '\r' if the file is read in Windows binary mode) with '\0'. In between scans allocate an array of pointers, now that you know how many you need.
You can do it this way:
#include <stdlib.h> /* exit, malloc, realloc, free */
#include <stdio.h>  /* fopen, fgetc, fputs, fwrite */

struct line_reader {
    /* All members are private. */
    FILE *f;
    char *buf;
    size_t siz;
};

/*
 * Initializes a line reader _lr_ for the stream _f_.
 */
void
lr_init(struct line_reader *lr, FILE *f)
{
    lr->f = f;
    lr->buf = NULL;
    lr->siz = 0;
}

/*
 * Reads the next line. If successful, returns a pointer to the line,
 * and sets *len to the number of characters, at least 1. The result is
 * _not_ a C string; it has no terminating '\0'. The returned pointer
 * remains valid until the next call to next_line() or lr_free() with
 * the same _lr_.
 *
 * next_line() returns NULL at end of file, or if there is an error (on
 * the stream, or with memory allocation).
 */
char *
next_line(struct line_reader *lr, size_t *len)
{
    size_t newsiz;
    int c;
    char *newbuf;

    *len = 0;               /* Start with empty line. */
    for (;;) {
        c = fgetc(lr->f);   /* Read next character. */
        if (ferror(lr->f))
            return NULL;

        if (c == EOF) {
            /*
             * End of file is also end of last line,
             * unless this last line would be empty.
             */
            if (*len == 0)
                return NULL;
            else
                return lr->buf;
        } else {
            /* Append c to the buffer. */
            if (*len == lr->siz) {
                /* Need a bigger buffer! */
                newsiz = lr->siz + 4096;
                newbuf = realloc(lr->buf, newsiz);
                if (newbuf == NULL)
                    return NULL;
                lr->buf = newbuf;
                lr->siz = newsiz;
            }
            lr->buf[(*len)++] = c;

            /* '\n' is end of line. */
            if (c == '\n')
                return lr->buf;
        }
    }
}

/*
 * Frees internal memory used by _lr_.
 */
void
lr_free(struct line_reader *lr)
{
    free(lr->buf);
    lr->buf = NULL;
    lr->siz = 0;
}

/*
 * Read a file line by line.
 * http://rosettacode.org/wiki/Read_a_file_line_by_line
 */
int
main()
{
    struct line_reader lr;
    FILE *f;
    size_t len;
    char *line;

    f = fopen("foobar.txt", "r");
    if (f == NULL) {
        perror("foobar.txt");
        exit(1);
    }

    /*
     * This loop reads each line.
     * Remember that line is not a C string.
     * There is no terminating '\0'.
     */
    lr_init(&lr, f);
    while ((line = next_line(&lr, &len))) {
        /*
         * Do something with line.
         */
        fputs("LINE: ", stdout);
        fwrite(line, len, 1, stdout);
    }
    if (!feof(f)) {
        perror("next_line");
        exit(1);
    }
    lr_free(&lr);

    return 0;
}
