I am writing some code that needs to read fasta files, so part of my code (included below) is a fasta parser. As a single sequence can span multiple lines in the fasta format, I need to concatenate multiple successive lines read from the file into a single string. I do this, by realloc'ing the string buffer after reading every line, to be the current length of the sequence plus the length of the line read in. I do some other stuff, like stripping white space etc. All goes well for the first sequence, but fasta files can contain multiple sequences. So similarly, I have a dynamic array of structs with a two strings (title, and actual sequence), being "char *". Again, as I encounter a new title (introduced by a line beginning with '>') I increment the number of sequences, and realloc the sequence list buffer. The realloc segfaults on allocating space for the second sequence with
*** glibc detected *** ./stackoverflow: malloc(): memory corruption: 0x09fd9210 ***
Aborted
For the life of me I can't see why. I've run it through gdb and everything seems to be working (i.e. everything is initialised, the values seems sane)... Here's the code:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <errno.h>
//a struture to keep a record of sequences read in from file, and their titles
typedef struct {
char *title;
char *sequence;
} sequence_rec;
//string convenience functions
//checks whether a string consists entirely of white space
int empty(const char *s) {
int i;
i = 0;
while (s[i] != 0) {
if (!isspace(s[i])) return 0;
i++;
}
return 1;
}
//substr allocates and returns a new string which is a substring of s from i to
//j exclusive, where i < j; If i or j are negative they refer to distance from
//the end of the s
char *substr(const char *s, int i, int j) {
char *ret;
if (i < 0) i = strlen(s)-i;
if (j < 0) j = strlen(s)-j;
ret = malloc(j-i+1);
strncpy(ret,s,j-i);
return ret;
}
//strips white space from either end of the string
void strip(char **s) {
int i, j, len;
char *tmp = *s;
len = strlen(*s);
i = 0;
while ((isspace(*(*s+i)))&&(i < len)) {
i++;
}
j = strlen(*s)-1;
while ((isspace(*(*s+j)))&&(j > 0)) {
j--;
}
*s = strndup(*s+i, j-i);
free(tmp);
}
int main(int argc, char**argv) {
sequence_rec *sequences = NULL;
FILE *f = NULL;
char *line = NULL;
size_t linelen;
int rcount;
int numsequences = 0;
f = fopen(argv[1], "r");
if (f == NULL) {
fprintf(stderr, "Error opening %s: %s\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
rcount = getline(&line, &linelen, f);
while (rcount != -1) {
while (empty(line)) rcount = getline(&line, &linelen, f);
if (line[0] != '>') {
fprintf(stderr,"Sequence input not in valid fasta format\n");
return EXIT_FAILURE;
}
numsequences++;
sequences = realloc(sequences,sizeof(sequence_rec)*numsequences);
sequences[numsequences-1].title = strdup(line+1); strip(&sequences[numsequences-1].title);
rcount = getline(&line, &linelen, f);
sequences[numsequences-1].sequence = malloc(1); sequences[numsequences-1].sequence[0] = 0;
while ((!empty(line))&&(line[0] != '>')) {
strip(&line);
sequences[numsequences-1].sequence = realloc(sequences[numsequences-1].sequence, strlen(sequences[numsequences-1].sequence)+strlen(line)+1);
strcat(sequences[numsequences-1].sequence,line);
rcount = getline(&line, &linelen, f);
}
}
return EXIT_SUCCESS;
}
You should use strings that look something like this:
struct string {
int len;
char *ptr;
};
This prevents strncpy bugs like what it seems you saw, and allows you to do strcat and friends faster.
You should also use a doubling array for each string. This prevents too many allocations and memcpys. Something like this:
int sstrcat(struct string *a, struct string *b)
{
int len = a->len + b->len;
int alen = a->len;
if (a->len < len) {
while (a->len < len) {
a->len *= 2;
}
a->ptr = realloc(a->ptr, a->len);
if (a->ptr == NULL) {
return ENOMEM;
}
}
memcpy(&a->ptr[alen], b->ptr, b->len);
return 0;
}
I now see you are doing bioinformatics, which means you probably need more performance than I thought. You should use strings like this instead:
struct string {
int len;
char ptr[0];
};
This way, when you allocate a string object, you call malloc(sizeof(struct string) + len) and avoid a second call to malloc. It's a little more work but it should help measurably, in terms of speed and also memory fragmentation.
Finally, if this isn't actually the source of error, it looks like you have some corruption. Valgrind should help you detect it if gdb fails.
One potential issue is here:
strncpy(ret,s,j-i);
return ret;
ret might not get a null terminator. See man strncpy:
char *strncpy(char *dest, const char *src, size_t n);
...
The strncpy() function is similar, except that at most n bytes of src
are copied. Warning: If there is no null byte among the first n bytes
of src, the string placed in dest will not be null terminated.
There's also a bug here:
j = strlen(*s)-1;
while ((isspace(*(*s+j)))&&(j > 0)) {
What if strlen(*s) is 0? You'll end up reading (*s)[-1].
You also don't check in strip() that the string doesn't consist entirely of spaces. If it does, you'll end up with j < i.
edit: Just noticed that your substr() function doesn't actually get called.
I think the memory corruption problem might be the result of how you're handling the data used in your getline() calls. Basically, line is reallocated via strndup() in the calls to strip(), so the buffer size being tracked in linelen by getline() will no longer be accurate. getline() may overrun the buffer.
while ((!empty(line))&&(line[0] != '>')) {
strip(&line); // <-- assigns a `strndup()` allocation to `line`
sequences[numsequences-1].sequence = realloc(sequences[numsequences-1].sequence, strlen(sequences[numsequences-1].sequence)+strlen(line)+1);
strcat(sequences[numsequences-1].sequence,line);
rcount = getline(&line, &linelen, f); // <-- the buffer `line` points to might be
// smaller than `linelen` bytes
}
Related
i'm trying to implement little program that takes a text and breaks it into lines and sort them in alphabetical order but i encountered a little problem, so i have readlines function which updates an array of pointers called lines, the problem is when i try to printf the first pointer in lines as an array using %s nothing is printed and there is no errors.
I have used strcpy to copy an every single text line(local char array) into a pointer variable and then store that pointer in lines array but it gave me the error.
Here is the code:
#include <stdio.h>
#define MAXLINES 4
#define MAXLENGTH 1000
char *lines[MAXLINES];
void readlines() {
int i;
for (i = 0; i < MAXLINES; i++) {
char c, line[MAXLENGTH];
int j;
for (j = 0; (c = getchar()) != '\0' && c != '\n' && j < MAXLENGTH; j++) {
line[j] = c;
}
lines[i] = line;
}
}
int main(void) {
readlines();
printf("%s", lines[0]);
getchar();
return 0;
}
One problem is the following line:
lines[i] = line;
In this line, you make lines[i] point to line. However, line is a local char array whose lifetime ends as soon as the current loop iteration ends. Therefore, lines[i] will contain a dangling pointer (i.e. a pointer to an object that is no longer valid) as soon as the loop iteration ends.
For this reason, when you later call
printf("%s", lines[0]);
lines[0] is pointing to an object whose lifetime has ended. Dereferencing such a pointer invokes undefined behavior. Therefore, you cannot rely on getting any meaningful output, and your program may crash.
One way to fix this would be to not make lines an array of pointers, but rather an multidimensional array of char, i.e. an array of strings:
char lines[MAXLINES][MAXLENGTH+1];
Now you have a proper place for storing the strings, and you no longer need the local array line in the function readlines.
Another issue is that the line
printf("%s", lines[0]);
requires that lines[0] points to a string, i.e. to an array of characters terminated by a null character. However, you did not put a null character at the end of the string.
After fixing all of the issues mentioned above, your code should look like this:
#include <stdio.h>
#define MAXLINES 4
#define MAXLENGTH 1000
char lines[MAXLINES][MAXLENGTH+1];
void readlines() {
int i;
for (i = 0; i < MAXLINES; i++) {
char c;
int j;
for (j = 0; (c = getchar()) != '\0' && c != '\n' && j < MAXLENGTH; j++) {
lines[i][j] = c;
}
//add terminating null character
lines[i][j] = '\0';
}
}
int main(void) {
readlines();
printf("%s", lines[0]);
return 0;
}
However, this code still has a few issues, which are probably unrelated to your immediate problem, but could cause trouble later:
The function getchar will return EOF, not '\0', when there is no more data (or when an error occurred). Therefore, you should compare the return value of getchar with EOF instead of '\0'. However, a char is not guaranteed to be able to store the value of EOF. Therefore, you should store the return value of getchar in an int instead. Note that getchar returns a value of type int, not char.
When j reaches MAX_LENGTH, you will call getchar one additional time before terminating the loop. This can cause undesired behavior, such as your program waiting for more user input or an important character being discarded from the input stream.
In order to also fix these issues, I recommend the following code:
#include <stdio.h>
#define MAXLINES 4
#define MAXLENGTH 1000
char lines[MAXLINES][MAXLENGTH+1];
void readlines() {
int i;
for (i = 0; i < MAXLINES; i++)
{
//changed type from "char" to "int"
int c;
int j;
for ( j = 0; j < MAXLENGTH; j++ )
{
if ( (c = getchar()) == EOF || c == '\n' )
break;
lines[i][j] = c;
}
//add terminating null character
lines[i][j] = '\0';
}
}
int main(void) {
readlines();
printf("%s", lines[0]);
return 0;
}
Problem 1
char *lines[MAXLINES];
For the compiler it makes no difference how you write this, but for you, as you are learning C, maybe it is worth consider different spacing and naming. Question is: what is lines[]? lines[] is supposed to be an array of strings and hold some text inside. So lines[0] is a string, lines[1] is a string and so on. As pointed in a comment you could also use char lines[MAX_LINES][MAX_LENGTH] and have a 2D box of NxM char. This way you would have a pre-determined size in terms of number and size of lines and have simpler things at a cost of wasting space in lines of less than MAX_LENGTH chars and having a fixed number of lines you can use, but no need to allocate memory.
A more flexible way is to use an array of pointers. Since each pointer will represent a line, a single one
char* line[MAXLINES];
is a better picture of the use: line[0] is char*, line[1] is char* and so on. But you will need to allocate memory for each line (and you did not) in your code.
Remember int main(int argc, char**argv)
This is the most flexible way, since in this way you can hold any number of lines. The cost? Additional allocations.
size_t n_lines;
char** line;
This may be the best representation, as known by every C program since K&R.
Problem 2
for (
j = 0;
(c = getchar()) != '\0' && c != '\n' && j < MAXLENGTH;
j++) {
line[j] = c;
}
lines[i] = line;
This loop does not copy the final 0 that terminates each string. And reuses the same line, a char[] to hold the data as being read. And the final line does not copy a string, if one existed there. There is no one since the final 0 was stripped off by the loop. And there is no data too, since the area is being reused.
A complete C example of uploading a file to a container in memory
I will let an example of a more controlled way of writing this, a container for a set of lines and even a sorting function.
a data structure
The plan is to build an array of pointers as the system does for main. Since we do no know ahead the number of lines and do not want this limitation we will allocate memory in groups of blk_size lines. At any time we have limit pointers to use. From these size are in use. line[] is char* and points to a single line of text. The struct is
typedef struct
{
size_t blk_size; // block
size_t limit; // actual allocated size
size_t size; // size in use
char** line; // the lines
} Block;
the test function
Block* load_file(const char*);
Plan is to call load_file("x.txt") and the function returns a Block* pointing to the array representing the lines in file, one by one. Then we call qsort() and sort the whole thing. If the program is called lines we will run
lines x.txt
and it will load the file x.txt, show its contents on screen, sort it, show the sorted lines and then erase everything at exit.
main() for the test
int main(int argc, char** argv)
{
char msg[80] = {0};
if (argc < 2) usage();
Block* test = load_file(argv[1]);
sprintf(msg, "==> Loading \"%s\" into memory", argv[1]);
status_blk(test, msg);
qsort(test->line, test->size, sizeof(void*), cmp_line);
sprintf(msg, "==> \"%s\" after sort", argv[1]);
status_blk(test, msg);
test = delete_blk(test);
return 0;
};
As planned
load_file() is the constructor and load the file contents into a Block.
status_blk() shows the contents and accepts a convenient optional message
qsort() sorts the lines using a one-line cmp_line() function.
status_blk() is called again and shows the now sorted contents
as in C++ delete_blk() is the destructor and erases the whole thing._
output using main() as tlines.c for testing
PS M:\> .\lines tlines.c
loading "tlines.c" into memory
Block extended for a total of 16 pointers
==> Loading "tlines.c" into memory
Status: 13 of 16 lines. [block size is 8]:
1 int main(int argc, char** argv)
2 {
3 char msg[80] = {0};
4 if (argc < 2) usage();
5 Block* test = load_file(argv[1]);
6 sprintf(msg, "==> Loading \"%s\" into memory", argv[1]);
7 status_blk(test, msg);
8 qsort(test->line, test->size, sizeof(void*), cmp_line);
9 sprintf(msg, "==> \"%s\" after sort", argv[1]);
10 status_blk(test, msg);
11 test = delete_blk(test);
12 return 0;
13 };
==> "tlines.c" after sort
Status: 13 of 16 lines. [block size is 8]:
1 Block* test = load_file(argv[1]);
2 char msg[80] = {0};
3 if (argc < 2) usage();
4 qsort(test->line, test->size, sizeof(void*), cmp_line);
5 return 0;
6 sprintf(msg, "==> Loading \"%s\" into memory", argv[1]);
7 sprintf(msg, "==> \"%s\" after sort", argv[1]);
8 status_blk(test, msg);
9 status_blk(test, msg);
10 test = delete_blk(test);
11 int main(int argc, char** argv)
12 {
13 };
About the code
I am not sure if it needs much explanation, it is a single function that does the file loading and it has around 20 lines of code. The other functions has less than 10. The whole file is represented in line that is char** and Block has the needed info about actual size.
Since line[] is an array of pointers we can call
qsort(test->line, test->size, sizeof(void*), cmp_line);
and use
int cmp_line(const void* one, const void* other)
{
return strcmp(
*((const char**)one), *((const char**)other));
}
using strcmp() to compare the strings and have the lines sorted.
create_blk() accepts a block size for use in the calls to realloc() for eficiency.
Delete a Block is a 3-step free() in the reverse order of allocation.
The complete code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct
{
size_t blk_size; // block
size_t limit; // actual allocated size
size_t size; // size in use
char** line; // the lines
} Block;
Block* create_blk(size_t);
Block* delete_blk(Block*);
int status_blk(Block*, const char*);
Block* load_file(const char*);
int cmp_line(const void*, const void*);
void usage();
int main(int argc, char** argv)
{
char msg[80] = {0};
if (argc < 2) usage();
Block* test = load_file(argv[1]);
sprintf(msg, "\n\n==> Loading \"%s\" into memory", argv[1]);
status_blk(test, msg);
qsort(test->line, test->size, sizeof(void*), cmp_line);
sprintf(msg, "\n\n==> \"%s\" after sort", argv[1]);
status_blk(test, msg);
test = delete_blk(test);
return 0;
};
int cmp_line(const void* one, const void* other)
{
return strcmp(
*((const char**)one), *((const char**)other));
}
Block* create_blk(size_t size)
{
Block* nb = (Block*)malloc(sizeof(Block));
if (nb == NULL) return NULL;
nb->blk_size = size;
nb->limit = size;
nb->size = 0;
nb->line = (char**)malloc(sizeof(char*) * size);
return nb;
}
Block* delete_blk(Block* blk)
{
if (blk == NULL) return NULL;
for (size_t i = 0; i < blk->size; i += 1)
free(blk->line[i]); // free lines
free(blk->line); // free block
free(blk); // free struct
return NULL;
}
int status_blk(Block* bl,const char* msg)
{
if (msg != NULL) printf("%s\n", msg);
if (bl == NULL)
{
printf("Status: not allocated\n");
return -1;
}
printf(
"Status: %zd of %zd lines. [block size is %zd]:\n",
bl->size, bl->limit, bl->blk_size);
for (int i = 0; i < bl->size; i += 1)
printf("%4d\t%s", 1 + i, bl->line[i]);
return 0;
}
Block* load_file(const char* f_name)
{
if (f_name == NULL) return NULL;
fprintf(stderr, "loading \"%s\" into memory\n", f_name);
FILE* F = fopen(f_name, "r");
if (F == NULL) return NULL;
// file is open
Block* nb = create_blk(8); // block size is 8
char line[200];
char* p = &line[0];
p = fgets(p, sizeof(line), F);
while (p != NULL)
{
// is block full?
if (nb->size >= nb->limit)
{
const size_t new_sz = nb->limit + nb->blk_size;
char* new_block =
realloc(nb->line, (new_sz * sizeof(char*)));
if (new_block == NULL)
{
fprintf(
stderr,
"\tCould not extend block to %zd "
"lines\n",
new_sz);
break;
}
printf(
"Block extended for a total of %zd "
"pointers\n",
new_sz);
nb->limit = new_sz;
nb->line = (char**)new_block;
}
// now copy the line
nb->line[nb->size] = (char*)malloc(1 + strlen(p));
strcpy(nb->line[nb->size], p);
nb->size += 1;
// read next line
p = fgets(p, sizeof(line), F);
}; // while()
fclose(F);
return nb;
}
void usage()
{
fprintf(stderr,"Use: program file_to_load\n");
exit(EXIT_FAILURE);
}
Try something like this:
#include <stdio.h>
#include <stdlib.h> // for malloc(), free(), exit()
#include <string.h> // for strcpy()
#define MAXLINES 4
#define MAXLENGTH 1000
char *lines[MAXLINES];
void readlines() {
for( int i = 0; i < MAXLINES; i++) {
char c, line[MAXLENGTH + 1]; // ALWAYS one extra to allow for '\0'
int j = 0;
// RE-USE(!) local array for input characters until NL or length
// NB: Casting return value to character (suppress warning)
while( (c = (char)getchar()) != '\0' && c != '\n' && j < MAXLENGTH )
line[ j++ ] = c;
line[j] = '\0'; // terminate array (transforming it to 'string')
// Attempt to get a buffer to preserve this line
// (Old) compiler insists on casting return from malloc()
if( ( lines[i] = (char*)malloc( (j + 1) * sizeof lines[0][0] ) ) == NULL ) {
fprintf( stderr, "malloc failure\n" );
exit( -1 );
}
strcpy( lines[i], line ); // preserve this line
}
}
int my_main() {
readlines(); // only returns after successfully reading 4 lines of input
for( int i = 0; i < MAXLINES; i++)
printf( "Line %d: '%s'\n", i, lines[i] ); // enhanced
/* Maybe do stuff here */
for( int j = 0; j < MAXLINES; j++) // free up allocated memory.
free( lines[j] );
return 0;
}
If you would prefer to 'factor out` some code (and have a facility that you've written is absent, here's a version:
char *my_strdup( char *str ) {
int len = strlen( str ) + 1; // ALWAYS +1
// Attempt to get a buffer to preserve this line
// (Old) compiler insists on casting return from malloc()
char *pRet = (char*)malloc( len * sizeof *pRet );
if( pRet == NULL ) {
fprintf( stderr, "malloc failure\n" );
exit( -1 );
}
return strcpy( pRet, str );
}
The the terminating and preserve is condensed to:
line[j] = '\0'; // terminate array (transforming it to 'string')
lines[i] = my_strdup( line ); // preserve this line
I probably got an easy one for the C programmers out there!
I am trying to create a simple C function that will execute a system command in and write the process output to a string buffer out (which should be initialized as an array of strings of length n). The output needs to be formatted in the following way:
Each line written to stdout should be initialized as a string. Each of these strings has variable length. The output should be an array consisting of each string. There is no way to know how many strings will be written, so this array is also technically of variable length (but for my purposes, I just create a fixed-length array outside the function and pass its length as an argument, rather than going for an array that I would have to manually allocate memory for).
Here is what I have right now:
#define MAX_LINE_LENGTH 512
int exec(const char* in, const char** out, const size_t n)
{
char buffer[MAX_LINE_LENGTH];
FILE *file;
const char terminator = '\0';
if ((file = popen(in, "r")) == NULL) {
return 1;
}
for (char** head = out; (size_t)head < (size_t)out + n && fgets(buffer, MAX_LINE_LENGTH, file) != NULL; head += strlen(buffer)) {
*head = strcat(buffer, &terminator);
}
if (pclose(file)) {
return 2;
}
return 0;
}
and I call it with
#define N 128
int main(void)
{
const char* buffer[N];
const char cmd[] = "<some system command resulting in multi-line output>";
const int code = exec(cmd, buffer, N);
exit(code);
}
I believe the error the above code results in is a seg fault, but I'm not experienced enough to figure out why or how to fix.
I'm almost positive it is with my logic here:
for (char** head = out; (size_t)head < (size_t)out + n && fgets(buffer, MAX_LINE_LENGTH, file) != NULL; head += strlen(buffer)) {
*head = strcat(buffer, &terminator);
}
What I thought this does is:
Get a mutable reference to out (i.e. the head pointer)
Save the current stdout line to buffer (via fgets)
Append a null terminator to buffer (because I don't think fgets does this?)
Overwrite the data at head pointer with the value from step 3
Move head pointer strlen(buffer) bytes over (i.e. the number of chars in buffer)
Continue until fgets returns NULL or head pointer has been moved beyond the bounds of out array
Where am I wrong? Any help appreciated, thanks!
EDIT #1
According to Barmar's suggestions, I edited my code:
#include <stdio.h>
#include <stdlib.h>
#define MAX_LINE_LENGTH 512
int exec(const char* in, const char** out, const size_t n)
{
char buffer[MAX_LINE_LENGTH];
FILE *file;
if ((file = popen(in, "r")) == NULL) return 1;
for (size_t i = 0; i < n && fgets(buffer, MAX_LINE_LENGTH, file) != NULL; i += 1) out[i] = buffer;
if (pclose(file)) return 2;
return 0;
}
#define N 128
int main(void)
{
const char* buffer[N];
const char cmd[] = "<system command to run>";
const int code = exec(cmd, buffer, N);
for (int i = 0; i < N; i += 1) printf("%s", buffer[i]);
exit(code);
}
While there were plenty of redundancies with what I wrote that are now fixed, this still causes a segmentation fault at runtime.
Focusing on the edited code, this assignment
out[i] = buffer;
has problems.
In this expression, buffer is implicitly converted to a pointer-to-its-first-element (&buffer[0], see: decay). No additional memory is allocated, and no string copying is done.
buffer is rewritten every iteration. After the loop, each valid element of out will point to the same memory location, which will contain the last line read.
buffer is an array local to the exec function. Its lifetime ends when the function returns, so the array in main contains dangling pointers. Utilizing these values is Undefined Behaviour.
Additionally,
for (int i = 0; i < N; i += 1)
always loops to the maximum storable number of lines, when it is possible that fewer lines than this were read.
A rigid solution uses an array of arrays to store the lines read. Here is a cursory example (see: this answer for additional information on using multidimensional arrays as function arguments).
#include <stdio.h>
#include <stdlib.h>
#define MAX_LINES 128
#define MAX_LINE_LENGTH 512
int exec(const char *cmd, char lines[MAX_LINES][MAX_LINE_LENGTH], size_t *lc)
{
FILE *stream = popen(cmd, "r");
*lc = 0;
if (!stream)
return 1;
while (*lc < MAX_LINES) {
if (!fgets(lines[*lc], MAX_LINE_LENGTH, stream))
break;
(*lc)++;
}
return pclose(stream) ? 2 : 0;
}
int main(void)
{
char lines[MAX_LINES][MAX_LINE_LENGTH];
size_t n;
int code = exec("ls -al", lines, &n);
for (size_t i = 0; i < n; i++)
printf("%s", lines[i]);
return code;
}
Using dynamic memory is another option. Here is a basic example using strdup(3), lacking robust error handling.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char **exec(const char *cmd, size_t *length)
{
FILE *stream = popen(cmd, "r");
if (!stream)
return NULL;
char **lines = NULL;
char buffer[4096];
*length = 0;
while (fgets(buffer, sizeof buffer, stream)) {
char **reline = realloc(lines, sizeof *lines * (*length + 1));
if (!reline)
break;
lines = reline;
if (!(lines[*length] = strdup(buffer)))
break;
(*length)++;
}
pclose(stream);
return lines;
}
int main(void)
{
size_t n = 0;
char **lines = exec("ls -al", &n);
for (size_t i = 0; i < n; i++) {
printf("%s", lines[i]);
free(lines[i]);
}
free(lines);
}
I'd like to read a big file while the first character of a line isn't " ".
But the code I have written is very slow. How can I speed up the routine?
Is there a better solution instead of getline?
void readString(const char *fn)
{
FILE *fp;
char *vString;
struct stat fdstat;
int stat_res;
stat_res = stat(fn, &fdstat);
fp = fopen(fn, "r+b");
if (fp && !stat_res)
{
vString = (char *)calloc(fdstat.st_size + 1, sizeof(char));
int dataEnd = 1;
size_t len = 0;
int emptyLine = 1;
char **linePtr = malloc(sizeof(char*));
*linePtr = NULL;
while(dataEnd)
{
// Check every line
getline(linePtr, &len, fp);
// When data ends, the line begins with space (" ")
if(*linePtr[0] == 0x20)
emptyLine = 0;
// If line begins with space, stop writing
if(emptyLine)
strcat(vString, *linePtr);
else
dataEnd = 0;
}
strcat(vString, "\0");
free(linePtr);
linePtr = NULL;
}
}
int main(int argc, char **argv){
readString(argv[1]);
return EXIT_SUCCESS;
}
How can I speed up the routine?
The most suspicious aspect of your program performance-wise is the strcat(). On each call, it needs to scan the whole destination string from the beginning to find the place to append the source string. As a result, if your file's lines have length bounded by a constant (even a large one), then your approach's performance scales with the square of the file length.
The asymptotic complexity analysis doesn't necessarily tell the whole story, though. The I/O part of your code scales linearly with file length, and since I/O is much more expensive than in-memory data manipulation, that will dominate your performance for small enough files. If you're in that regime then you're probably not going to do much better than you already do. In that event, though, you might still do a bit better by reading the whole file at once via fread(), and then scanning it for end-of-data via strstr():
size_t nread = fread(vString, 1, fdstat.st_size, fp);
// Handle nread != fdstat.st_size ...
// terminate the buffer as a string
vString[nread] = '\0';
// truncate the string after the end-of-data:
char *eod = strstr(vString, "\n ");
if (eod) {
// terminator found - truncate the string after the newline
eod[1] = '\0';
} // else no terminator found
That scales linearly, so it addresses your asymptotic complexity problem, too, but if the data of interest will often be much shorter than the file, then it will leave you in those cases doing a lot more costly I/O than you need to do. In that event, one alternative would be to read in chunks, as #laissez_faire suggested. Another would be to tweak your original algorithm to track the end of vString so as to use strcpy() instead of strcat() to append each new line. The key part of that version would look something like this:
char *linePtr = NULL;
size_t nread = 0;
size_t len = 0;
*vString = '\0'; // In case the first line is end-of-data
for (char *end = vString; ; end += nread) {
// Check every line
nread = getline(&linePtr, &len, fp);
if (nread < 0) {
// handle eof or error ...
}
// When data ends, the line begins with space (" ")
if (*linePtr == ' ') {
break;
}
strcpy(end, *linePtr);
}
free(linePtr);
Additionally, note that
you do not need to initially zero-fill the memory allocated for *vString, as you're just going to overwrite those zeroes with the data of real interest (and then ignore the rest of the buffer).
You should not cast the return value of malloc()-family functions, including calloc().
Have you tried to read the file using fread and read a bigger chunk of data in each step and then parse the data after reading it? Something like:
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
char *readString(const char *fn)
{
FILE *fp;
char *vString;
struct stat fdstat;
int stat_res;
stat_res = stat(fn, &fdstat);
fp = fopen(fn, "r+b");
if (fp && !stat_res) {
vString = (char *) calloc(fdstat.st_size + 1, sizeof(char));
int newline = 1;
int index = 0;
while (index < fdstat.st_size) {
int len =
fdstat.st_size - index >
4096 ? 4096 : fdstat.st_size - index;
char *buffer = (char *) malloc(len);
int read_len = fread(buffer, 1, len, fp);
int i;
if (newline) {
if (read_len > 0 && buffer[0] == ' ') {
return vString;
}
newline = 0;
}
for (i = 0; i < read_len; ++i) {
if (buffer[i] == '\n') {
if (i + 1 < read_len && buffer[i + 1] == ' ') {
memcpy(vString + index, buffer, i + 1);
return vString;
}
newline = 1;
}
}
memcpy(vString + index, buffer, read_len);
index += read_len;
}
}
return vString;
}
int main(int argc, char **argv)
{
char *str = readString(argv[1]);
printf("%s", str);
free(str);
return EXIT_SUCCESS;
}
I need a version of read line that is memory save. I have this "working" solution. But I'm not sure how it behaves with memory. When I enable free(text) it works for a few lines and then I get an error. So now neither text nor result is ever freed although I malloc text. Is that correct ? And why is that so ?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char* readFromIn()
{
char* text = malloc(1024);
char* result = fgets(text, 1024, stdin);
if (result[strlen(result) - 1] == 10)
result[strlen(result) - 1] = 0;
//free(text);
return result;
}
I have A LOT of short lines to read with this and I also need stdin to be replaceable with a FILE* handle. There is no need for me to realloc text because I have only short lines.
fgets returns a pointer to the string, so after the fgets line, result will be the same memory address as text. Then when you call free (text); you are returning invalid memory.
You should free the memory in the calling function when you have finished with result
You could also avoid the malloc/free stuff by structuring your code to pass a buffer something like this:
void parent_function ()
{
char *buffer[1024];
while (readFromIn(buffer)) {
// Process the contents of buffer
}
}
char *readFromIn(char *buffer)
{
char *result = fgets(buffer, 1024, stdin);
int len;
// fgets returns NULL on error of end of input,
// in which case buffer contents will be undefined
if (result == NULL) {
return NULL;
}
len = strlen (buffer);
if (len == 0) {
return NULL;
}
if (buffer[len - 1] == '\n') {
buffer[len - 1] = 0;
return buffer;
}
Trying to avoid the malloc/free is probably wise if you are dealing with many small, short lived items so that the memory doesn't get fragmented and it should faster as well.
char *fgets(char *s, int size, FILE *stream) reads in at most one less than size characters from stream and stores them into the buffer pointed to by s. Reading stops after an EOF or a newline. If a newline is read, it is stored into the buffer. A terminating null byte ('\0') is stored after the last character in the buffer.
Return Value: returns s on success, and NULL on error or when end of file occurs while no characters have been read.
So there are 2 critical problems with your code:
You don't check the return value of fgets
You want to deallocate the memory, where this string is stored and return a pointer to this memory. Accessing the memory, where such a pointer (dangling pointer) points to, leads to undefined behaviour.
Your function could look like this:
public char* readFromIn() {
char* text = malloc(1024);
if (fgets(text, 1024, stdin) != NULL) {
int textLen = strlen(text);
if (textLen > 0 && text[textLen - 1] == '\n')
text[textLen - 1] == '\0'; // getting rid of newline character
return text;
}
else {
free(text);
return NULL;
}
}
and then caller of this function should be responsible for deallocating the memory that return value of this function points to.
I know you mentioned that the lines are only short, but none of the solutions provided will work for lines greater than 1024 in length. It is for this reason that I provide a solution which will attempt to read entire lines, and resize the buffer when there's not enough space.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MINIMUM_CAPACITY 16
size_t read_line(char **buffer, size_t *capacity) {
char *buf = *buffer;
size_t cap = *capacity, pos = 0;
if (cap < MINIMUM_CAPACITY) { cap = MINIMUM_CAPACITY; }
for (;;) {
buf = realloc(buf, cap);
if (buf == NULL) { return pos; }
*buffer = buf;
*capacity = cap;
if (fgets(buf + pos, cap - pos, stdin) == NULL) {
break;
}
pos += strcspn(buf + pos, "\n");
if (buf[pos] == '\n') {
break;
}
cap *= 2;
}
return pos;
}
int main(void) {
char *line = NULL;
size_t size = 0;
for (size_t end = read_line(&line, &size); line[end] == '\n'; end = read_line(&line, &size)) {
line[end] = '\0'; // trim '\n' off the end
// process contents of buffer here
}
free(line);
return 0;
}
An ideal solution should be able to operate with a fixed buffer of 1 byte. This requires a more comprehensive understanding of the problem, however. Once achieved, adapting such a solution would achieve the most optimal solution.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char *readFromIn(FILE *fp)
{
char text[1024];
size_t len;
if (!fgets(text, sizeof text, fp)) return NULL;
len = strlen(text);
while (len && text[len-1] == '\n') text[--len] = 0;
return strdup(text);
}
Why did no one propose to move the buffer from heap to stack ? This is my solution now:
char input[1024]; // held ready as buffer for fgets
char* readFromIn()
{
char* result = fgets(input, 1024, stdin);
if (result == null)
return "";
if (result[strlen(result) - 1] == '\n')
result[strlen(result) - 1] = 0;
return result;
}
I have been trying to take chars from a txt file(in which the words of the text that will become strings will be separated by spaces) and import them into strings in my code. I tried it but I only could print the words (that are separated by spaces). How can I input them into strings?
The code that prints the words is the following, but I also need it to save the string into arrays or pointers if possible.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int main(){
FILE *fp;
int i=0;
char *words=NULL,*word=NULL,c;
if ((fp=fopen("monologue.txt","r"))==NULL){ /*Where monologue txt is a normal file with plain text*/
printf("Error Opening File\n");
exit(1);}
while ((c = fgetc(fp))!= EOF){
if (c=='\n'){ c = ' '; }
words = (char *)realloc(words, ++i*sizeof(char));
words[i-1]=c;}
word=strtok(words," ");
while(word!= NULL){
printf("%s\n",word);
word = strtok(NULL," ");}
exit(0);
}
Your code is rather hard to read. Here is almost identical code that is (I submit) considerably more readable:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int main()
{
const char filename[] = "monologue.txt";
FILE *fp;
int i = 0;
char *words = NULL;
char *word = NULL;
int c;
if ((fp = fopen(filename, "r")) == NULL)
{
/*Where monologue txt is a normal file with plain text*/
fprintf(stderr, "Error opening file %s\n", filename);
exit(1);
}
while ((c = fgetc(fp)) != EOF)
{
if (c == '\n')
c = ' ';
words = (char *)realloc(words, ++i * sizeof(char));
words[i-1] = c;
}
word = strtok(words, " ");
while (word != NULL)
{
printf("%s\n", word);
word = strtok(NULL, " ");
}
return(0);
}
This shows us that you are slurping the entire file into the string pointed to by words, but you are doing so rather inefficiently in that you are reallocating memory one byte at a time for each byte read. You should be looking to do things much more effectively, by reading bigger chunks of the file into memory. For example, you might allocate an initial buffer of 32 KiB; you could read into that buffer using fread(); if you don't encounter EOF, you could then reallocate the space, doubling the amount available to you. (For testing, you'd start with a much smaller block - maybe 16 bytes, maybe even as small as 4 bytes; this ensures you test the memory reallocation code, whereas 32 KiB would probably seldom exercise the reallocation code.)
You also need to ensure that your string is null terminated; as it stands, it is not. You would need to do a final realloc() to make space for the null terminator too.
You can avoid mapping newlines during input since strtok() can be given a list of characters on which to split, so you can add newline to that list.
To generate a list of words, you need to adapt the loop around strtok(). You might simply count the spaces and newlines and then allocate enough pointers to point to that many words; you might have an overestimate if there are adjacent spaces or newlines, but better over than under. Alternatively, you can can allocate, for sake of argument, 16 pointers. As you process the first 16 words, you use these pointers; when you run out of space, you double the number of pointers allocated, and use the new supply until that runs out. You can use any algorithm that allocates a significant number of pointers (meaning 'more than one' and 'increasing as the number already used goes up') instead of simple doubling, but doubling has its merits (notably, it is simple).
One word of caution: you should never assign the result of realloc() to the variable that is its first argument:
words = (char *)realloc(words, ++i * sizeof(char)); // Bad!
The trouble is that if realloc() fails, you've just wiped out the only pointer to the previously allocated memory, so you have leaked it all. Always assign to a new variable, test that it worked, then copy the result:
char *new_space = (char *)realloc(words, ++i * sizeof(char));
if (new_space == 0)
{
fprintf(stderr, "Memory allocation failed at size %d\n", i);
exit(1);
}
words = new_space;
I assembled this code yesterday. Notice that it uses functions to do repeated jobs - such as checking that memory allocation succeeded. There is room to improve it (there always is). It does character at a time input still (and newline mapping, therefore) but allocates increasingly large chunks of memory so that it does not do memory allocation on every character read. The err_exit() function is a useful skeleton; you can flesh it out into a much more complex system, but the basic idea of a function to report errors and exit (with a behaviour similar to fprintf() + exit() can simplify programs a lot (and error checking and reporting is important, but needs to be simple when it can be).
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
static void err_exit(const char *format, ...);
static void *emalloc(size_t nbytes);
static void *erealloc(void *old_space, size_t nbytes);
int main(void)
{
const char filename[] = "monologue.txt";
FILE *fp;
size_t i = 0;
size_t len_data = 4;
char *data = emalloc(len_data);
int c;
/* Read data from file */
if ((fp = fopen(filename, "r")) == NULL)
err_exit("Error opening file %s\n", filename);
while ((c = fgetc(fp)) != EOF)
{
if (c == '\n')
c = ' ';
if (i >= len_data)
{
assert(i == len_data);
data = realloc(data, 2 * len_data);
len_data *= 2;
}
data[i++] = c;
}
if (i >= len_data)
{
assert(i == len_data);
data = erealloc(data, len_data + 1);
len_data++;
}
data[i] = '\0';
fclose(fp);
/* Split file into words */
size_t len_wordlist = 16;
size_t num_words = 0;
char **wordlist = emalloc(len_wordlist * sizeof(char *));
char *location = data;
char *word;
for (num_words = 0; (word = strtok(location, " ")) != NULL; num_words++)
{
if (num_words >= len_wordlist)
{
assert(num_words == len_wordlist);
wordlist = erealloc(wordlist, 2 * len_wordlist * sizeof(char *));
len_wordlist *= 2;
}
wordlist[num_words] = word;
location = NULL;
}
/* Print the word list - one per line */
for (i = 0; i < num_words; i++)
printf("%zu: %s\n", i, wordlist[i]);
/* Release allocated space */
free(data);
free(wordlist);
return(0);
}
static void err_exit(const char *format, ...)
{
va_list args;
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
exit(1);
}
static void *emalloc(size_t nbytes)
{
void *new_space = malloc(nbytes);
if (new_space == 0)
err_exit("Failed to allocate %zu bytes of memory\n", nbytes);
return(new_space);
}
static void *erealloc(void *old_space, size_t nbytes)
{
void *new_space = realloc(old_space, nbytes);
if (new_space == 0)
err_exit("Failed to reallocate %zu bytes of memory\n", nbytes);
return(new_space);
}
Try this. I've modified very little about your code, just to keep it close to your starting point. The main thing I did was add allwords which is an array of char * (this is where I store each string one by one). Then right after printing each version of word (what you were already doing), I also copied it into the next open slot in the allwords array. At the end I added another printing loop to display the contents of each string.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAXWORDS 999
int main(){
FILE *fp;
int i=0, j;
char *words=NULL,*word=NULL,c;
char *allwords[MAXWORDS];
if ((fp=fopen("monologue.txt","r"))==NULL){ /*Where monologue txt is a normal file with plain text*/
printf("Error Opening File\n");
exit(1);}
while ((c = fgetc(fp))!= EOF){
if (c=='\n'){ c = ' '; }
words = (char *)realloc(words, ++i*sizeof(char));
words[i-1]=c;}
word=strtok(words," ");
i=0;
while(word!= NULL && i < MAXWORDS){
printf("%s\n",word);
allwords[i] = malloc(strlen(word));
strcpy(allwords[i], word);
word = strtok(NULL," ");
i++;
}
printf("\nNow printing each saved string:\n");
for (j=0; j<i; j++)
printf("String %d: %s\n", j, allwords[j]);
exit(0);
}