fscanf reads one line out of 7 wrong - c

I have a zipped text file with 7 very long lines of text containing information for the decoding of a JPEG encoded file.
When I try to read the unzipped File with my C program, line by line with fscanf, I get the first 3 and the last 3 lines correctly, just the 4th line isn't read as a string as expected.
The output of the 4th line is a very long string filled with 1 and 0.
If I look at the input file with Notepad or a hex editor everything looks fine as it should.
If I manually create a text file with the same structure (but with shorter lines) fscanf works fine.
There is no difference if I unzip the File with my program or do it manually.
FILE *tmpdata;
char enc_path[256];
int arrsize;
// Building the absolute Path
sprintf(enc_path, "%s%stmp.txt", dest, src_name);
arrsize = unzip(); // gives back size of the file
// not the best way to create the output strings,
// but I don't know the size of the lines.
char masse[10];
char ytabelle[arrsize / 3];
char cbtabelle[arrsize / 3];
char crtabelle[arrsize / 2];
char ywerte[arrsize /3];
char cbwerte[arrsize / 3];
char crwerte[arrsize / 3];
if ((tmpdata = fopen(enc_path, "r")) == NULL) {
printf("Error: canĀ“t read input file\n");
return EXIT_FAILURE;
}
fscanf(tmpdata, "%s %s %s %s %s %s %s", masse, ytabelle, cbtabelle, crtabelle, ywerte, cbwerte, crwerte);
The input file looks like:
512x512
Y{42:110000;13:111000;...;0:0;}
CB{42:110000;13:111000;...;0:0;}
CR{42:110000;13:111000;...;0:0;}
000111010010111001110000111100011...
100011011101110001101000011100110...
100011101110110111011001100111011...
if I print the separate strings:
512x512
Y{42:110000;13:111000;...;0:0;}
CB{42:110000;13:111000;...;0:0;}
111001111111111000110000111111000...
000111010010111001110000111100011...
100011011101110001101000011100110...
100011101110110111011001100111011...

There are multiples reasons for your program to not behave properly:
you may allocate too much data with automatic storage (aka on the stack), causing erratic behavior.
the strings int the file might contain embedded spaces, causing fscanf() to read words instead of lines.
you do not tell fscanf() the size of the destination arrays. fscanf() may store data beyond the end of the destination arrays, overflowing into the next array (which would explain the observed behavior) or causing some other undefined behavior.
It is very cumbersome to pass the size of the destination arrays when they are not simple constants. I suggest you use fgets() instead of fscanf() to read the file contents and allocate the arrays with malloc() to a larger size to avoid problems:
FILE *tmpdata;
char enc_path[256];
size_t arrsize;
// Building the absolute path
snprintf(enc_path, sizeof enc_path, "%s%stmp.txt", dest, src_name);
arrsize = unzip(); // gives back size of the file
// not the best way to create the output strings,
// but I don't know the size of the lines.
char masse[16];
size_t ytabelle_size = arrsize + 2;
size_t cbtabelle_size = arrsize + 2;
size_t crtabelle_size = arrsize + 2;
char *ytabelle = malloc(ytabelle_size);
char *cbtabelle = malloc(cbtabelle_size);
char *crtabelle = malloc(crtabelle_size);
size_t ywerte_size = arrsize + 2;
size_t cbwerte_size = arrsize + 2;
size_t crwerte_size = arrsize + 2;
char *ywerte = malloc(ywerte_size);
char *cbwerte = malloc(cbwerte_size);
char *crwerte = malloc(crwerte_size);
if (!ytabelle ||!cbtabelle ||!crtabelle ||!ywerte ||!cbwerte ||!crwerte) {
printf("Error: cannot allocate memory\n");
return EXIT_FAILURE;
}
if ((tmpdata = fopen(enc_path, "r")) == NULL) {
printf("Error: cannot open input file\n");
return EXIT_FAILURE;
}
if (!fgets(masse, sizeof masse, tmpdata)
|| !fgets(ytabelle, ytabelle_size, tmpdata)
|| !fgets(cbtabelle, cbtabelle_size, tmpdata)
|| !fgets(crtabelle, crtabelle_size, tmpdata)
|| !fgets(ywerte, ywerte_size, tmpdata)
|| !fgets(cbwerte, cbwerte_size, tmpdata)
|| !fgets(crwerte, crwerte_size, tmpdata)) {
printf("Error: cannot read input file\n");
return EXIT_FAILURE;
}
// file contents were read, arrays should have a trailing newline, which
// you should strip or handle in the decoding phase.
...
If you are using the GNUlibc or some modern Posix systems, you could use the m prefix in fscanf() to allocate the space for the words read from the file. Using this allows for a simpler but non portable solution:
FILE *tmpdata;
char enc_path[256];
size_t arrsize;
// Building the absolute path
snprintf(enc_path, sizeof enc_path, "%s%stmp.txt", dest, src_name);
arrsize = unzip(); // gives back size of the file
// not the best way to create the output strings,
// but I don't know the size of the lines.
char masse[16];
char *ytabelle = NULL;
char *cbtabelle = NULL;
char *crtabelle = NULL;
char *ywerte = NULL;
char *cbwerte = NULL;
char *crwerte = NULL;
if ((tmpdata = fopen(enc_path, "r")) == NULL) {
printf("Error: cannot open input file\n");
return EXIT_FAILURE;
}
if (fscanf(tmpdata, "%ms %ms %ms %ms %ms %ms %ms", &masse,
&ytabelle, &cbtabelle, &crtabelle,
&ywerte, &cbwerte, &crwerte) != 7) {
printf("Error: cannot read input file\n");
return EXIT_FAILURE;
}
...
PS: Unlike German, the initial letters of nouns are not capitalized in English, except for some exceptions such as language, people and place names.

Maybe avoid stack allocation??
char masse[10];
char *ytabelle = malloc(arrsize/3); if (!ytabelle) exit(EXIT_FAILURE);
char *cbtabelle = malloc(arrsize/3); if (!cbtabelle) exit(EXIT_FAILURE);
char *crtabelle = malloc(arrsize/2); if (!crtabelle) exit(EXIT_FAILURE);
char *ywerte = malloc(arrsize/3); if (!ywerte) exit(EXIT_FAILURE);
char *cbwerte = malloc(arrsize/3); if (!cbwerte) exit(EXIT_FAILURE);
char *crwerte = malloc(arrsize/3); if (!crwerte) exit(EXIT_FAILURE);
/* use as before */
free(ytabelle);
free(cbtabelle);
free(crtabelle);
free(ywerte);
free(cbwerte);
free(crwerte);

Related

How to read from a file and parse it

I have a file .txt containing some values formatted like this:
0,30,25,10
Now, I open up the file and store it into an array
char imposta_tratt[300];
FILE *fp;
fp = fopen("/home/pi/Documents/imposta_trattamento.txt", "r");
if (fp == 0) return;
fread(imposta_tratt, sizeof(imposta_tratt), 1, fp);
fclose(fp);
Now I expect to have the array filled with my data. I have the values separated by a , so I go on and parse it:
const char delim[2] = ",";
int t=0;
char *token = strtok(imposta_tratt, delim);
while (token!=NULL){
strcpy(tratt[t],token);
token = strtok(NULL, delim);
tratt[t]=token;
t++;
}
Here, referring to what's in the file .txt, I expect to have tratt[0]=0; tratt[1]=30; tratt[2]=25; and so on, but seems like I am missing something since it's not like this.
All I want is to have the values of the txt file stored in single variables. Can someone help?
What you are trying to achieve can simply be done using fgets():
bool read_file_content(const char *filename, const size_t tsizemax, int tratt[tsizemax], size_t *tsize, const char *delim)
{
// Attempt to open filename.
FILE *fp = fopen(filename, "r");
if (!fp) return false; // Return false upon failure.
// Try to read one line. If you have more, you need a while loop.
char imposta_tratt[300];
if (!fgets(imposta_tratt, sizeof imposta_tratt, fp)) {
fclose(fp);
return false;
}
*tsize = 0;
char tmp[300]; // Temporary buffer. Used for conversion into int.
char *token = strtok(imposta_tratt, delim);
while (token && *tsize < tsizemax) {
strncpy(tmp, token, sizeof tmp);
tratt[(*tsize)++] = atoi(tmp);
token = strtok(NULL, delim);
}
fclose(fp);
return true;
}
const char *filename: The file you want to parse.
const size_t tsizemax: The maximum size of your tratt array. It is important to control the size, otherwise your code will have buffer overflow (think of when your file has more than 100 tokens, for example).
int tratt[tsizemax]: The array that will hold the values.
size_t *tsize: The number of tokens read (used in combination of tsizemax).
const char *delim: The delimiter(s), in your case a ,.
This is your main():
int main(void)
{
int tratt[100];
size_t size = 0;
if (!read_file_content("in.txt", 100, tratt, &size, ",")) {
puts("Failed");
return 1;
}
for (size_t i = 0; i < size; ++i)
printf("%d\n", tratt[i]);
}
Output:
0
30
25
10
Suppose "in.txt" has contents
0,30,25,10
The below program uses fscanf to read the integers into the tratt array, one-by-one. As we read integers using fscanf, we make sure it's return value is as expected. If not, we close the file and exit. In the event that the return value of fscanf is not as expected, the program also prints which type of error occurred. Currently, if any error occurs, the program stops. However, you can make the program behave differently depending on the error that occurred if you like.
As output, the program prints all of the integers read into the tratt array. The output is
0
30
25
10
Now this program assumes we know the number of elements we want to read into tratt. If we do not, we could allow for dynamically allocating more memory should the array need more elements or perhaps "in.txt" could contain a data structure, say, at the beginning/end of the file that records information about the file, such as the number of numbers in the file and the data type (a binary file would be best suited for this). These are just a couple of the possibilities.
A better approach might be to read characters in one-by-one (say, using getc) and use strtol to convert a sequence of character digits to a long int (I would have taken an approach similar to this).
Nevertheless, this approach is more succinct and should suffice.
#include <stdio.h>
#include <stdlib.h>
#define FILE_NAME "in.txt"
#define MAX_LEN 4
int main(void) {
int i, tratt[MAX_LEN];
FILE *fp = fopen(FILE_NAME, "r"); /* open file for reading */
/* if cannot open file */
if (fp == NULL) {
printf("Cannot open %s\n", FILE_NAME);
exit(EXIT_FAILURE);
}
/* read integer, checking return value of scanf as expected */
if (fscanf(fp, "%d", &tratt[0]) != 1) {
if (ferror(fp))
printf("fscanf: read error\n");
else if (feof(fp))
printf("fscanf: end of file\n");
else
printf("fscanf: matching failure\n");
fclose(fp);
exit(EXIT_FAILURE);
}
for (i = 1; i < MAX_LEN; i++)
/* read comma plus integer, checking return value of scanf */
if (fscanf(fp, ",%d", &tratt[i]) != 1) {
if (ferror(fp))
printf("fscanf: read error\n");
else if (feof(fp))
printf("fscanf: end of file\n");
else
printf("fscanf: matching failure\n");
fclose(fp);
exit(EXIT_FAILURE);
}
fclose(fp); /* close file */
/* print integers stored in tratt */
for (i = 0; i < MAX_LEN; i++)
printf("%d\n", tratt[i]);
return 0;
}

read write binary into a string c

So, I looked around the internet and a couple questions here and I couldn't find anything that could fix my problem here. I have an assignment for C programming, to write a program that allows user to enter words into a string, add more words, put all words in the string to a text file, delete all words in string, and when they exit it saves the words in a binary, which is loaded upon starting up the program again. I've gotten everything to work except where the binary is concerned.
I made two functions, one that loads the bin file when the program starts, one that saves the bin file when it ends. I don't know in which, or if in both, the problem starts. But basically I know it's not working right because I get garbage in my text file if I save it in a text file after the program loads the bin file into the string. I know for sure that the text file saver is working properly.
Thank you to anyone who takes the time to help me out, it's been an all-day process! lol
Here are the two snippets of my functions, everything else in my code seems to work so I don't want to blot up this post with the entire program, but if need be I'll put it up to solve this.
SIZE is a constant of 10000 to meet program specs of a 1000 words. But I couldn't get this to run even asking for only 10 elements or 1, just to clear that up
void loadBin(FILE *myBin, char *stringAll) {
myBin = fopen("myBin.bin", "rb");
if (myBin == NULL) {
saveBin(&myBin, stringAll);
}//if no bin file exists yet
fread(stringAll, sizeof(char), SIZE + 1, myBin);
fclose(myBin); }
/
void saveBin(FILE *myBin, char *stringAll) {
int stringLength = 0;
myBin = fopen("myBin.bin", "wb");
if (myBin == NULL) {
printf("Problem writing file!\n");
exit(-1);
stringLength = strlen(stringAll);
fwrite(&stringAll, sizeof(char), (stringLength + 1), myBin);
fclose(myBin); }
You are leaving bad values in your myBin FILE*, and passing the & (address) of a pointer.
Pass the filename, and you can (re) use the functions for other purposes, other files, et al.
char* filename = "myBin.bin";
Pass the filename, buffer pointer, and max size to read. You should consider using stat/fstat to discover file size
size_t loadBin(char *fn, char *stringAll, size_t size)
{
//since you never use myBin, keep this FILE* local
FILE* myBin=NULL;
if( NULL == (myBin = fopen(fn, "rb")) ) {
//create missing file
saveBin(fn, stringAll, 0);
}//if no bin file exists yet
size_t howmany = fread(stringAll, sizeof(char), size, myBin);
if( howmany < size ) printf("read fewer\n");
if(myBin) fclose(myBin);
return howmany;
}
Pass the file name, buffer pointer, and size to save
size_t saveBin(char *fn, char *stringAll, size_t size)
{
int stringLength = 0;
//again, why carry around FILE* pointer only used locally?
FILE* myBin=NULL;
if( NULL == (myBin = fopen("myBin.bin", "wb")) ) {
printf("Problem writing file!\n");
exit(-1);
}
//binary data may have embedded '\0' bytes, cannot use strlen,
//stringLength = strlen(stringAll);
size_t howmany = fwrite(stringAll, sizeof(char), size, myBin);
if( howmany < size ) printf("short write\n");
if(myBin) fclose(myBin);
return howmany;
}
Call these; you are not guaranteed to write & read the same sizes...
size_t buffer_size = SIZE;
char buffer[SIZE]; //fill this with interesting bytes
saveBin(filename, buffer, buffer_size);
size_t readcount = loadBin(filename, buffer, buffer_size);

Get the length of each line in file with C and write in output file

I am a biology student and I am trying to learn perl, python and C and also use the scripts in my work. So, I have a file as follows:
>sequence1
ATCGATCGATCG
>sequence2
AAAATTTT
>sequence3
CCCCGGGG
The output should look like this, that is the name of each sequence and the count of characters in each line and printing the total number of sequences in the end of the file.
sequence1 12
sequence2 8
sequence3 8
Total number of sequences = 3
I could make the perl and python scripts work, this is the python script as an example:
#!/usr/bin/python
import sys
my_file = open(sys.argv[1]) #open the file
my_output = open(sys.argv[2], "w") #open output file
total_sequence_counts = 0
for line in my_file:
if line.startswith(">"):
sequence_name = line.rstrip('\n').replace(">","")
total_sequence_counts += 1
continue
dna_length = len(line.rstrip('\n'))
my_output.write(sequence_name + " " + str(dna_length) + '\n')
my_output.write("Total number of sequences = " + str(total_sequence_counts) + '\n')
Now, I want to write the same script in C, this is what I have achieved so far:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char *argv[])
{
input = FILE *fopen(const char *filename, "r");
output = FILE *fopen(const char *filename, "w");
double total_sequence_counts = 0;
char sequence_name[];
char line [4095]; // set a temporary line length
char buffer = (char *) malloc (sizeof(line) +1); // allocate some memory
while (fgets(line, sizeof(line), filename) != NULL) { // read until new line character is not found in line
buffer = realloc(*buffer, strlen(line) + strlen(buffer) + 1); // realloc buffer to adjust buffer size
if (buffer == NULL) { // print error message if memory allocation fails
printf("\n Memory error");
return 0;
}
if (line[0] == ">") {
sequence_name = strcpy(sequence_name, &line[1]);
total_sequence_counts += 1
}
else {
double length = strlen(line);
fprintf(output, "%s \t %ld", sequence_name, length);
}
fprintf(output, "%s \t %ld", "Total number of sequences = ", total_sequence_counts);
}
int fclose(FILE *input); // when you are done working with a file, you should close it using this function.
return 0;
int fclose(FILE *output);
return 0;
}
But this code, of course is full of mistakes, my problem is that despite studying a lot, I still can't properly understand and use the memory allocation and pointers so I know I especially have mistakes in that part. It would be great if you could comment on my code and see how it can turn into a script that actually work. By the way, in my actual data, the length of each line is not defined so I need to use malloc and realloc for that purpose.
For a simple program like this, where you look at short lines one at a time, you shouldn't worry about dynamic memory allocation. It is probably good enough to use local buffers of a reasonable size.
Another thing is that C isn't particularly suited for quick-and-dirty string processing. For example, there isn't a strstrip function in the standard library. You usually end up implementing such behaviour yourself.
An example implementation looks like this:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#define MAXLEN 80 /* Maximum line length, including null terminator */
int main(int argc, char *argv[])
{
FILE *in;
FILE *out;
char line[MAXLEN]; /* Current line buffer */
char ref[MAXLEN] = ""; /* Sequence reference buffer */
int nseq = 0; /* Sequence counter */
if (argc != 3) {
fprintf(stderr, "Usage: %s infile outfile\n", argv[0]);
exit(1);
}
in = fopen(argv[1], "r");
if (in == NULL) {
fprintf(stderr, "Couldn't open %s.\n", argv[1]);
exit(1);
}
out = fopen(argv[2], "w");
if (in == NULL) {
fprintf(stderr, "Couldn't open %s for writing.\n", argv[2]);
exit(1);
}
while (fgets(line, sizeof(line), in)) {
int len = strlen(line);
/* Strip whitespace from end */
while (len > 0 && isspace(line[len - 1])) len--;
line[len] = '\0';
if (line[0] == '>') {
/* First char is '>': copy from second char in line */
strcpy(ref, line + 1);
} else {
/* Other lines are sequences */
fprintf(out, "%s: %d\n", ref, len);
nseq++;
}
}
fprintf(out, "Total number of sequences. %d\n", nseq);
fclose(in);
fclose(out);
return 0;
}
A lot of code is about enforcing arguments and opening and closing files. (You could cut out a lot of code if you used stdin and stdout with file redirections.)
The core is the big while loop. Things to note:
fgets returns NULL on error or when the end of file is reached.
The first lines determine the length of the line and then remove white-space from the end.
It is not enough to decrement length, at the end the stripped string must be terminated with the null character '\0'
When you check the first character in the line, you should check against a char, not a string. In C, single and double quotes are not interchangeable. ">" is a string literal of two characters, '>' and the terminating '\0'.
When dealing with countable entities like chars in a string, use integer types, not floating-point numbers. (I've used (signed) int here, but because there can't be a negative number of chars in a line, it might have been better to have used an unsigned type.)
The notation line + 1 is equivalent to &line[1].
The code I've shown doesn't check that there is always one reference per sequence. I'll leave this as exercide to the reader.
For a beginner, this can be quite a lot to keep track of. For small text-processing tasks like yours, Python and Perl are definitely better suited.
Edit: The solution above won't work for long sequences; it is restricted to MAXLEN characters. But you don't need dynamic allocation if you only need the length, not the contents of the sequences.
Here's an updated version that doesn't read lines, but read characters instead. In '>' context, it stored the reference. Otherwise it just keeps a count:
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h> /* for isspace() */
#define MAXLEN 80 /* Maximum line length, including null terminator */
int main(int argc, char *argv[])
{
FILE *in;
FILE *out;
int nseq = 0; /* Sequence counter */
char ref[MAXLEN]; /* Reference name */
in = fopen(argv[1], "r");
out = fopen(argv[2], "w");
/* Snip: Argument and file checking as above */
while (1) {
int c = getc(in);
if (c == EOF) break;
if (c == '>') {
int n = 0;
c = fgetc(in);
while (c != EOF && c != '\n') {
if (n < sizeof(ref) - 1) ref[n++] = c;
c = fgetc(in);
}
ref[n] = '\0';
} else {
int len = 0;
int n = 0;
while (c != EOF && c != '\n') {
n++;
if (!isspace(c)) len = n;
c = fgetc(in);
}
fprintf(out, "%s: %d\n", ref, len);
nseq++;
}
}
fprintf(out, "Total number of sequences. %d\n", nseq);
fclose(in);
fclose(out);
return 0;
}
Notes:
fgetc reads a single byte from a file and returns this byte or EOF when the file has ended. In this implementation, that's the only reading function used.
Storing a reference string is implemented via fgetc here too. You could probably use fgets after skipping the initial angle bracket, too.
The counting just reads bytes without storing them. n is the total count, len is the count up to the last non-space. (Your lines probably consist only of ACGT without any trailing space, so you could skip the test for space and use n instead of len.)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char *argv[]){
FILE *my_file = fopen(argv[1], "r");
FILE *my_output = fopen(argv[2], "w");
int total_sequence_coutns = 0;
char *sequence_name;
int dna_length;
char *line = NULL;
size_t size = 0;
while(-1 != getline(&line, &size, my_file)){
if(line[0] == '>'){
sequence_name = strdup(strtok(line, ">\n"));
total_sequence_coutns +=1;
continue;
}
dna_length = strlen(strtok(line, "\n"));
fprintf(my_output, "%s %d\n", sequence_name, dna_length);
free(sequence_name);
}
fprintf(my_output, "Total number of sequences = %d\n", total_sequence_coutns);
fclose(my_file);
fclose(my_output);
free(line);
return (0);
}

How do you save the characters of a file into a string or array?

For example, if I had a file name random.txt, which reads:
This is a string.
Abc
Zxy
How would you save the characters in random.txt to a string or array that includes all of the characters in the text file?
So far I have (using redirection for file)
#include <stdio.h>
#include <string.h>
int main () {
int c;
do {
c = fgetc(stdin);
putchar(c);
} while (c != EOF);
return 0;
}
First part: About file handles
The stdin variable holds a FILE handle to which the user input is redirected (the FILE data type is defined in stdio.h). You can create handles to files using the function FILE *fopen(const char *path, const char *mode).
Your example applied to a regular file would be something like this (no error checking is done):
int main() {
int c;
FILE *myfile = fopen("path/to/file", "r"); //Open file for reading
while(!feof(myfile)) {
c = fgetc(myfile);
//do stuff with 'c'
//...
}
fclose(myfile); //close the file
return 0;
}
More information about fopen here: http://linux.die.net/man/3/fopen
Second part: About C strings
C strings (char arrays terminated with the null character '\0') can be defined in several ways. One of them is by statically defining them:
char mystring[256]; //This defines an array of 256 bytes (255 characters plus end null)
It is very important to take care about the limits of the buffer. In our example, writing beyond 256 bytes in the buffer will make the program crash. If we assume our file will not have lines longer than 255 characters (including line terminators like \r and \n) we can use the fgets function (http://linux.die.net/man/3/fgets):
char *fgets(char *s, int size, FILE *stream);
Simple (newbie) example:
int main() {
char mystring[256];
FILE *myfile = fopen("path/to/file", "r"); //Open file for reading
while(!feof(myfile)) {
if (fgets(mystring, sizeof(mystring), myfile) != NULL) {
printf("%s", mystring);
}
}
fclose(myfile); //close the file
return 0;
}
Notice that fgets is used for reading lines. If you want to read characters 1 by 1, you should keep using fgetc and pushing them manually into a buffer.
Finally, if you want to read a whole text file into a C string (no error checking):
int main() {
FILE *myfile = fopen("path/to/file", "r"); //Open file for reading
//Get the file size
fseek(myfile, 0, SEEK_END);
long filesize = ftell(myfile);
fseek(myfile, 0, SEEK_SET);
//Allocate buffer dynamically (not statically as in previous examples)
//We are reserving 'filesize' bytes plus the end null required in all C strings.
char *mystring = malloc(filesize + 1); //'malloc' is defined in stdlib.h
fread(mystring, filesize, 1, myfile); //read all file
mystring[filesize] = 0; //write the end null
fclose(myfile); //close file
printf("%s", mystring); //dump contents to screen
free(mystring); //deallocate buffer. 'mystring' is no longer usable.
return 0;
}
The following will work on either stdin or an actual file (i.e. you can replace "stream" with stdin, or fopen() it with a filename), dynamically allocating as it goes. After it runs, "ptr" will be a pointer to an array holding the contents of the file.
/* read chars from stream in blocks of 4096 bytes,
dynamically allocating until eof */
size_t bytes_read = 0;
char * ptr = NULL;
while (1) {
size_t chunk_read;
/* increase size of allocation by 4096 bytes */
ptr = realloc(ptr, bytes_read + 4096);
/* read up to 4096 bytes to the newest portion of allocation */
chunk_read = fread(ptr + bytes_read, 1, 4096, stream);
bytes_read += chunk_read;
/* if fread() got less than the full amount of characters, break */
if (chunk_read < 4096) break;
}
/* resize pointer downward to actual number of bytes read,
plus an explicit null termination */
bytes_read += 1;
ptr = realloc(ptr, bytes_read);
ptr[bytes_read - 1] = '\0';

Reading text file into an array of lines in C

Using C I would like to read in the contents of a text file in such a way as to have when all is said and done an array of strings with the nth string representing the nth line of the text file. The lines of the file can be arbitrarily long.
What's an elegant way of accomplishing this? I know of some neat tricks to read a text file directly into a single appropriately sized buffer, but breaking it down into lines makes it trickier (at least as far as I can tell).
Thanks very much!
Breaking it down into lines means parsing the text and replacing all the EOL (by EOL I mean \n and \r) characters with 0.
In this way you can actually reuse your buffer and store just the beginning of each line into a separate char * array (all by doing only 2 passes).
In this way you could do one read for the whole file size+2 parses which probably would improve performance.
It's possible to read the number of lines in the file (loop fgets), then create a 2-dimensional array with the first dimension being the number of lines+1. Then, just re-read the file into the array.
You'll need to define the length of the elements, though. Or, do a count for the longest line size.
Example code:
inFile = fopen(FILENAME, "r");
lineCount = 0;
while(inputError != EOF) {
inputError = fscanf(inFile, "%s\n", word);
lineCount++;
}
fclose(inFile);
// Above iterates lineCount++ after the EOF to allow for an array
// that matches the line numbers
char names[lineCount][MAX_LINE];
fopen(FILENAME, "r");
for(i = 1; i < lineCount; i++)
fscanf(inFile, "%s", names[i]);
fclose(inFile);
For C (as opposed to C++), you'd probably wind up using fgets(). However, you might run into issues due to your arbitrary length lines.
Perhaps a Linked List would be the best way to do this?
The compiler won't like having an array with no idea how big to make it. With a Linked List you can have a really large text file, and not worry about allocating enough memory to the array.
Unfortunately, I haven't learned how to do linked lists, but maybe somebody else could help you.
If you have a good way to read the whole file into memory, you are almost there. After you've done that you could scan the file twice. Once to count the lines, and once to set the line pointers and replace '\n' and (and maybe '\r' if the file is read in Windows binary mode) with '\0'. In between scans allocate an array of pointers, now that you know how many you need.
you can use this way
#include <stdlib.h> /* exit, malloc, realloc, free */
#include <stdio.h> /* fopen, fgetc, fputs, fwrite */
struct line_reader {
/* All members are private. */
FILE *f;
char *buf;
size_t siz;
};
/*
* Initializes a line reader _lr_ for the stream _f_.
*/
void
lr_init(struct line_reader *lr, FILE *f)
{
lr->f = f;
lr->buf = NULL;
lr->siz = 0;
}
/*
* Reads the next line. If successful, returns a pointer to the line,
* and sets *len to the number of characters, at least 1. The result is
* _not_ a C string; it has no terminating '\0'. The returned pointer
* remains valid until the next call to next_line() or lr_free() with
* the same _lr_.
*
* next_line() returns NULL at end of file, or if there is an error (on
* the stream, or with memory allocation).
*/
char *
next_line(struct line_reader *lr, size_t *len)
{
size_t newsiz;
int c;
char *newbuf;
*len = 0; /* Start with empty line. */
for (;;) {
c = fgetc(lr->f); /* Read next character. */
if (ferror(lr->f))
return NULL;
if (c == EOF) {
/*
* End of file is also end of last line,
` * unless this last line would be empty.
*/
if (*len == 0)
return NULL;
else
return lr->buf;
} else {
/* Append c to the buffer. */
if (*len == lr->siz) {
/* Need a bigger buffer! */
newsiz = lr->siz + 4096;
newbuf = realloc(lr->buf, newsiz);
if (newbuf == NULL)
return NULL;
lr->buf = newbuf;
lr->siz = newsiz;
}
lr->buf[(*len)++] = c;
/* '\n' is end of line. */
if (c == '\n')
return lr->buf;
}
}
}
/*
* Frees internal memory used by _lr_.
*/
void
lr_free(struct line_reader *lr)
{
free(lr->buf);
lr->buf = NULL;
lr->siz = 0;
}
/*
* Read a file line by line.
* http://rosettacode.org/wiki/Read_a_file_line_by_line
*/
int
main()
{
struct line_reader lr;
FILE *f;
size_t len;
char *line;
f = fopen("foobar.txt", "r");
if (f == NULL) {
perror("foobar.txt");
exit(1);
}
/*
* This loop reads each line.
* Remember that line is not a C string.
* There is no terminating '\0'.
*/
lr_init(&lr, f);
while (line = next_line(&lr, &len)) {
/*
* Do something with line.
*/
fputs("LINE: ", stdout);
fwrite(line, len, 1, stdout);
}
if (!feof(f)) {
perror("next_line");
exit(1);
}
lr_free(&lr);
return 0;
}

Resources