I need to print all phrases from a file (phrases can end in '.', '?' or '!')
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
char* read_file(char *name) {
FILE *file;
char *text;
long num_bytes;
file = fopen(name, "r");
if(!file) {
printf("File could not be opened!");
exit(EXIT_FAILURE);
}
fseek(file, 0, SEEK_END);
num_bytes = ftell(file);
fseek(file, 0, SEEK_SET);
text = (char*) malloc(num_bytes * sizeof(char));
fread(text, 1, num_bytes, file);
fclose(file);
return text;
}
I have this piece of code that kind of works but if my file as the following text: "My name is Maria. I'm 19." the second phrase is printed with a ' ' in the beggining.
Can someone please help finding a way to ignore those spaces? Thank you
To start, you have several problems that will invoke Undefined Behaviour. In
char *line = (char*) malloc(sizeof(text));
sizeof (text) is the size of a pointer (char *), not the length of the buffer it points to.
sizeof (char *) depends on your system, but is very likely to be 8 (go ahead and test this: printf("%zu\n", sizeof (char *));, if you are curious), which means line can hold a string of length 7 (plus the null-terminating byte).
Long sentences will easily overflow this buffer, leading to UB.
(Aside: do not cast the return of malloc in C.)
Additionally, strlen(text) may not work properly as text may not include the null-terminating byte ('\0'). fread works with raw bytes, and does not understand the concept of a null-terminated string - files do not have to be null-terminated, and fread will not null-terminate buffers for you.
You should allocate one additional byte to in the read_file function
text = malloc(num_bytes + 1);
text[num_bytes] = 0;
and place the null-terminating byte there.
(Aside: sizeof (char) is guaranteed to be 1.)
Note that ftell to determine the length of a file should not be relied upon.
isspace from <ctype.h> can be used to determine if the current character is whitespace. Its argument should be cast to unsigned char. Note this will include characters such as '\t' and '\n'. Use simple comparison if you only care about spaces (text[i + 1] == ' ').
A loop can be used to consume the trailing whitespace after matching a delimiter.
Make sure to null-terminate line before printing it, as %s expects a string.
Use %u to print an unsigned int.
Do not forget to free your dynamically allocated memory when you are done with it. Additionally, heavily consider checking any library function that can fail has not done so.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
void pdie(const char *msg) {
perror(msg);
exit(EXIT_FAILURE);
}
char *read_file(char *name) {
FILE *file = fopen(name, "r");
if (!file)
pdie(name);
fseek(file, 0, SEEK_END);
long num_bytes = ftell(file);
if (-1 == num_bytes)
pdie(name);
fseek(file, 0, SEEK_SET);
char *text = malloc(num_bytes + 1);
if (!text)
pdie("malloc");
if (-1 == num_bytes)
pdie(name);
text[num_bytes] = 0;
if (fread(text, 1, num_bytes, file) != num_bytes)
pdie(name);
fclose(file);
return text;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "usage: %s TEXT_FILE\n", argv[0]);
return EXIT_FAILURE;
}
char *text = read_file(argv[1]);
unsigned int count = 0;
size_t length = strlen(text);
size_t index = 0;
char *line = malloc(length + 1);
if (!line)
pdie("malloc");
for (size_t i = 0; i < length; i++) {
line[index++] = text[i];
if (text[i] == '.' || text[i] == '?' || text[i] == '!') {
line[index] = '\0';
index = 0;
printf("[%u] <<%s>>\n", ++count, line);
while (isspace((unsigned char) text[i + 1]))
i++;
}
}
free(text);
free(line);
return EXIT_SUCCESS;
}
Input file:
My name is Maria. I'm 19. Hello world! How are you?
stdout:
[1] <<My name is Maria.>>
[2] <<I'm 19.>>
[3] <<Hello world!>>
[4] <<How are you?>>
You can test for a whitespace character by comparing the char in question to ' '.
if(text[i] == ' ')
// text[i] is whitespace
One possible solution, advance to the next non-whitespace character when you find the end of the sentence. You also need to make sure you've mallocd enough memory for the current phrase:
#include <ctype.h> // for isspace
...
size_t textLength = strlen(text);
// malloc based on the text length here, plus 1 for the NUL terminator.
// sizeof(text) gives you the size of the pointer, not the size of the
// memory block it points to.
char *line = malloc(textLength+1);
for(size_t i = 0; i < textLength; i++) {
line[index] = text[i];
index++;
if(text[i] == '.' || text[i] == '?' || text[i] == '!') {
count++;
printf("[%d] %s\n", count, line);
memset(line, 0, index + 1);
index = 0;
// advance to the next non-whitespace char
do
{
// advance to the next char (we know the current char is not a space)
i++;
// keep advancing i while the next char is in range of the
// text and the next char is a space.
}while (i+1 < textLength && isspace(text[i+1]) != 0);
}
}
Output:
[1] My name is Maria.
[2] I'm 19.
Demonstration
There's also no need to cast the return value of malloc
Related
My program needs to print longest word which contains only letters from a file.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
int checkString(const char s[]) {
unsigned char c;
while ((c = *s) && (isalpha(c) || isblank(c)))
++s;
return *s == '\0';
}
int main() {
char file_name[]="document.txt";
FILE *fp = fopen(file_name, "r");
char *largest = str;
int largest_len = 0;
while (fgets(file_name, 1000, fp) != NULL) {
char *temp = strtok(file_name, " ");
while (temp != NULL) {
if (strlen(temp) > largest_len) {
strcpy(largest, temp);
largest_len = strlen(largest);
}
temp = strtok(NULL, "\",.,1,2,4,5,6,7,8,9 ");
}
}
if(checkString(largest))
printf("%s", largest);
fclose(fp);
return 0;
}
In my code, if the largest word contains only letters it will be printed. How to modify this code to check next words if the largest doesn't contain only letters?
First of all, you cannot store the pointer to longest word like that. You re-use str for the next line and so the pointer is not likely to point to something useful.
Second, while strtok() appears simple, initially, I tend to apply a straightforward approach to a straightforward problem.
The problem is O(n) (where n is the length of the document). You just need to go through it character by character. Of course, since every line is ended by a \n, you can use the line based approach in this case.
So, instead of strtok, simply check each character, if it is a legal word character (an alphanumeric character, that is). You can easily do so with the standard library function isalpha() from header ctype.h.
Below is the program, copying the longest string into a dedicated buffer, using isalpha() and doing the line based reading of the file, just like the code in the original question did.
Of course, this code assumes, no line is ever longer than 999 characters.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <ctype.h>
static size_t gulp(const char* line, size_t istart, size_t len) {
size_t n = 0;
for (size_t i = istart; i < len; i++, n++) {
if (!isalpha(line[i])) {
break;
}
}
return n;
}
int main(int argc, const char * argv[]) {
FILE* f = fopen("document.txt","r");
char line[1000];
char longest_word[1000];
size_t longest_word_length = 0;
while (fgets(line, sizeof(line), f) != NULL) {
size_t i0 = 0;
size_t line_length = strlen(line);
while (i0 < line_length) {
if (isalpha(line[i0])) {
size_t n = gulp(line, i0, line_length);
if (n > longest_word_length) {
strncpy(longest_word, &line[i0], n);
longest_word[n] = '\0';
longest_word_length = n;
}
i0 = i0 + n;
} else {
i0++;
}
}
}
fclose(f);
f = NULL;
if (longest_word_length > 0) {
printf("longest word: %s (%lu characters)\n",
longest_word, longest_word_length);
}
return 0;
}
There are a number of problems here:
you use the same buffer (str) for two different uses: as a read buffer and to store the longest word. If you find the largest word in the first line, the word will be erased when reading the second line. Furthemore, if you find a rather long word at the beginning of a line, the strings pointed to by largest and temp could overlap which leads to undefined behaviour => use a different array or strdup (and free) for largest
you only use the space as possible separator. You should wonder whether you should add tab and/or punctuations
once you have got a word you should ensure that it only contains valid letters before testing its length and ignore it if for example it contains digits.
if a single line can be longer than 1000 characters, you should wrap the end of the current part before the beginning of the next one for the possible case where a long word would be splitted there.
For additional corner case processing, you should specify what to do if a word contains illegal characters but only at one side. For example if . is not used as a word delimiter, a word with an embedded . like "a.b" should be ignored, but a terminating . should only be stripped (like "example." should become "example"
I think the order you do things should be a bit different, here is an example
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
int isCandidate(char* word);
int main(int argc, char* argv[])
{
if (--argc == 0)
{
perror("not enough command line arguments, expecting a filename");
return -1;
}
++argv;
FILE* fp = fopen(*argv, "r");
if (fp == NULL)
{
perror(*argv);
return -1;
}
// get size of file
fseek(fp, 0L, SEEK_END);
long fileLength = ftell(fp);
if (fileLength < 1)
{
perror("file is empty");
return -1;
}
fseek(fp, 0L, SEEK_SET); // position file pointer at the beginning again
// allocate space for the whole file and then read it in
// for a text file it should be OK to do so since they
// normally are not that large.
char* buffer = malloc(fileLength+1);
if (fread(buffer, 1, fileLength, fp) != 0)
{
buffer[fileLength] = '\0'; // make sure the buffer ends with \0
}
else
{
perror("Failed reading into buffer");
return -1;
}
fclose(fp); // we are done with the file
const char filter[] = " \n\r";
char* longestWord = malloc(fileLength+1); // max length in theory
long unsigned int maxLength = 0;
for (char* token = strtok(buffer, filter); token != NULL; token = strtok(NULL, filter))
{
if (isCandidate(token))
{
if (strlen(token) > maxLength)
{
strcpy(longestWord, token);
maxLength = strlen(token);
}
}
}
printf("Longest word:'%s', len=%lu\n", longestWord, maxLength);
free(longestWord);
free(buffer);
}
int isCandidate(char* word)
{
if (word == NULL)
{
perror("invalid argument to isCandidate");
return 0;
}
for (char* ch = word; *ch; ++ch)
{
if (!isalpha(*ch)) return 0;
}
return 1;
}
If I prompt user to give filename as input, then check for and remove newline character do I still have to add +1 to strlen(filename) when allocating memory i function, because of using strlen and the fact that it counts characters without the last character?
Or there's no need for that since I removed it in main()?
I've read that not adding +1 can allocate to little memory for string and cause problems, but I read contradictory things on the matter and would appreciate some clarification.
double** wczytaj_macierz (char* filename, int x, int y)
{
char *file = malloc(strlen(filename) + 1);
sprintf(file, "%s", filename);
FILE *fin = fopen (file, "r");
...
rest of the code
...
int main(void)
char filename[BUFSIZ];
{
printf("\nPlease enter filename, max %d characters.\n", sizeof(filename));
if (fgets(filename, sizeof(filename), stdin) != NULL)
{
if ((p = strchr(filename, '\n')) != NULL)
{
*p = '\0';
}
}
wczytaj_macierz (filename, x, y);
When you allocate memory for a string, you must always allocate one more byte than the length of the string. That extra byte is for the terminating '\0' character.
If you have a string that happens to end in a newline \n character, and you strip that character off, you will obviously make the string one character shorter, and this means you will need one less byte of memory to store it. But storing it will still require space for the \0, as always.
As an example:
char string[] = "test\n";
int stringlen = strlen(string); // length = 5
char *copy1 = malloc(stringlen + 1); // allocates 6 bytes
strcpy(copy1, string); // this works
char *p = strchr(string, '\n'); // find the \n
if(p != NULL) *p = '\0'; // strip it off
printf("string is now: \"%s\"\n", string);
stringlen = strlen(string); // length = 4
char *copy2 = malloc(stringlen + 1); // allocates 5 bytes
strcpy(copy2, string); // this works also
Now, if you wrote the code like this:
char string[] = "test\n";
int stringlen = strlen(string);
char *p = strchr(string, '\n');
if(p != NULL) *p = '\0';
printf("string is now: \"%s\"\n", string);
char *copy = malloc(stringlen); // don't have to add 1,
// since I just stripped off \n
strcpy(copy, string);
it looks like you can get away with not adding the + 1. But any time you have to add a comment to explain some code that's not there, and especially if the comment you have to write is longer than the code it replaces, it's usually a sign that you've gotten too clever, that you ought to just leave the code in. And, indeed, even though it might seem to work at first, writing the code this way would be pretty dangerous, because if you ever end up with a string that doesn't end in \n, meaning that there's nothing to strip off, the code will stop working properly.
Does this answers your question :?
#include <stdio.h>
#include <string.h>
int main()
{
char filename[10];
printf("\nPlease enter filename, max %d characters.\n", sizeof(filename));
if (fgets(filename, sizeof(filename), stdin) != NULL)
{
printf("%s %d\n", filename, strlen(filename));
if (filename[strlen(filename) - 1] == '\n')
{
filename[strlen(filename) - 1] = '\0'; // Here lies \n
}
printf("%s %d\n", filename, strlen(filename));
}
// If filename was "file\n\0", filename[strlen(filename) - 1] == '\n'
// Now with filename[strlen(filename) - 1] = '\0'
// filename is "file\0\0" that is "file\0".
// strlen("file\n") == 5
// strlen("file") = 4
// strlen(filename) was 5 and now it is 4
return 0;
}
So I am writing a little function to parse paths, it looks like this:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
int parse_path() {
char *pathname = "this/is/a/path/hello";
int char_index = 0;
char current_char = pathname[char_index];
char *buffer = malloc(2 * sizeof(char));
char *current_char_str = malloc(2 * sizeof(char));
while (current_char != '\0' && (int)current_char != 11) {
if (char_index == 0 && current_char == '/') {
char_index++; current_char = pathname[char_index];
continue;
}
while (current_char != '/' && current_char != '\0') {
current_char_str[0] = current_char;
current_char_str[1] = '\0';
buffer = (char *)realloc(buffer, (strlen(buffer) + 2) * sizeof(char));
strcat(buffer, current_char_str);
char_index++; current_char = pathname[char_index];
}
if (strlen(buffer)) {
printf("buffer(%s)\n", buffer);
current_char_str[0] = '\0';
buffer[0] = '\0';
}
char_index++; current_char = pathname[char_index];
}
};
int main(int argc, char *argv[]) {
parse_path();
printf("hello\n");
return 0;
}
Now, there is undefined behavior in my code, it looks like the printf call inside the main method is changing the buffer variable... as you can see, the output of this program is:
buffer(this)
buffer(is)
buffer(a)
buffer(path)
buffer(hello)
buffer(buffer(%s)
)
buffer(hello)
hello
I have looked at other posts where the same sort of problem is mentioned and people have told me to use a static char array etc. but that does not seem to help.
Any suggestions?
For some reason, at one time in this program the "hello" string from printf is present in my buffer variable.
Sebastian, if you are still having problems after #PaulOgilvie answer, then it is most likely due to not understanding his answer. Your problem is due to buffer being allocated but not initialized. When you call malloc, it allocates a block of at least the size requested, and returns a pointer to the beginning address for the new block -- but does nothing with the contents of the new block -- meaning the block is full random values that just happened to be in the range of addresses for the new block.
So when you call strcat(buffer, current_char_str); the first time and there is nothing but random garbage in buffer and no nul-terminating character -- you do invoke Undefined Behavior. (there is no end-of-string in buffer to be found)
To fix the error, you simply need to make buffer an empty-string after it is allocated by setting the first character to the nul-terminating character, or use calloc instead to allocate the block which will ensure all bytes are set to zero.
For example:
int parse_path (const char *pathname)
{
int char_index = 0, ccs_index = 0;
char current_char = pathname[char_index];
char *buffer = NULL;
char *current_char_str = NULL;
if (!(buffer = malloc (2))) {
perror ("malloc-buffer");
return 0;
}
*buffer = 0; /* make buffer empty-string, or use calloc */
...
Also do not hardcode paths or numbers (that includes the 0 and 2, but we will let those slide for now). Hardcoding "this/is/a/path/hello" within parse_path() make is a rather un-useful function. Instead, make your pathname variable your parameter so I can take any path you want to send to it...
While the whole idea of realloc'ing 2-characters at a time is rather inefficient, you always need to realloc with a temporary pointer rather than the pointer itself. Why? realloc can and does fail and when it does, it returns NULL. If you are using the pointer itself, you will overwrite your current pointer address with NULL in the event of failure, losing the address to your existing block of memory forever creating a memory leak. Instead,
void *tmp = realloc (buffer, strlen(buffer) + 2);
if (!tmp) {
perror ("realloc-tmp");
goto alldone; /* use goto to break nested loops */
}
...
}
alldone:;
/* return something meaningful, your function is type 'int' */
}
A short example incorporating the fixes and temporary pointer would be:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int parse_path (const char *pathname)
{
int char_index = 0, ccs_index = 0;
char current_char = pathname[char_index];
char *buffer = NULL;
char *current_char_str = NULL;
if (!(buffer = malloc (2))) {
perror ("malloc-buffer");
return 0;
}
*buffer = 0; /* make buffer empty-string, or use calloc */
if (!(current_char_str = malloc (2))) {
perror ("malloc-current_char_str");
return 0;
}
while (current_char != '\0' && (int) current_char != 11) {
if (char_index == 0 && current_char == '/') {
char_index++;
current_char = pathname[char_index];
continue;
}
while (current_char != '/' && current_char != '\0') {
current_char_str[0] = current_char;
current_char_str[1] = '\0';
void *tmp = realloc (buffer, strlen(buffer) + 2);
if (!tmp) {
perror ("realloc-tmp");
goto alldone;
}
strcat(buffer, current_char_str);
char_index++;
current_char = pathname[char_index];
}
if (strlen(buffer)) {
printf("buffer(%s)\n", buffer);
current_char_str[0] = '\0';
buffer[0] = '\0';
}
if (current_char != '\0') {
char_index++;
current_char = pathname[char_index];
}
}
alldone:;
return ccs_index;
}
int main(int argc, char* argv[]) {
parse_path ("this/is/a/path/hello");
printf ("hello\n");
return 0;
}
(note: your logic is quite tortured above and you could just use a fixed buffer of PATH_MAX size (include limits.h) and dispense with allocating. Otherwise, you should allocate some anticipated number of characters for buffer to begin with, like strlen (pathname) which would ensure sufficient space for each path component without reallocating. I'd rather over-allocate by 1000-characters than screw up indexing worrying about reallocating 2-characters at a time...)
Example Use/Output
> bin\parsepath.exe
buffer(this)
buffer(is)
buffer(a)
buffer(path)
buffer(hello)
hello
A More Straight-Forward Approach Without Allocation
Simply using a buffer of PATH_MAX size or an allocated buffer of at least strlen (pathname) size will allow you to simply step down your string without any reallocations, e.g.
#include <stdio.h>
#include <limits.h> /* for PATH_MAX - but VS doesn't provide it, so we check */
#ifndef PATH_MAX
#define PATH_MAX 2048
#endif
void parse_path (const char *pathname)
{
const char *p = pathname;
char buffer[PATH_MAX], *b = buffer;
while (*p) {
if (*p == '/') {
if (p != pathname) {
*b = 0;
printf ("buffer (%s)\n", buffer);
b = buffer;
}
}
else
*b++ = *p;
p++;
}
if (b != buffer) {
*b = 0;
printf ("buffer (%s)\n", buffer);
}
}
int main (int argc, char* argv[]) {
char *path = argc > 1 ? argv[1] : "this/is/a/path/hello";
parse_path (path);
printf ("hello\n");
return 0;
}
Example Use/Output
> parsepath2.exe
buffer (this)
buffer (is)
buffer (a)
buffer (path)
buffer (hello)
hello
Or
> parsepath2.exe another/path/that/ends/in/a/filename
buffer (another)
buffer (path)
buffer (that)
buffer (ends)
buffer (in)
buffer (a)
buffer (filename)
hello
Now you can pass any path you would like to parse as an argument to your program and it will be parsed without having to change or recompile anything. Look things over and let me know if you have questions.
You strcat something to buffer but buffer has never been initialized. strcat will first search for the first null character and then copy the string to concatenate there. You are now probably overwriting memory that is not yours.
Before the outer while loop do:
*buffer= '\0';
There are 2 main problems in your code:
the arrays allocated by malloc() are not initialized, so you have undefined behavior when you call strlen(buffer) before setting a null terminator inside the array buffer points to. The program could just crash, but in your case whatever contents is present in the memory block and after it is retained up to the first null byte.
just before the end of the outer loop, you should only take the next character from the path if the current character is a '/'. In your case, you skip the null terminator and the program has undefined behavior as you read beyond the end of the string constant. Indeed, the parse continues through another string constant "buffer(%s)\n" and through yet another one "hello". The string constants seem to be adjacent without padding on your system, which is just a coincidence.
Here is a corrected version:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
void parse_path(const char *pathname) {
int char_index = 0;
char current_char = pathname[char_index];
char *buffer = calloc(1, 1);
char *current_char_str = calloc(1, 1);
while (current_char != '\0' && current_char != 11) {
if (char_index == 0 && current_char == '/') {
char_index++; current_char = pathname[char_index];
continue;
}
while (current_char != '/' && current_char != '\0') {
current_char_str[0] = current_char;
current_char_str[1] = '\0';
buffer = (char *)realloc(buffer, strlen(buffer) + 2);
strcat(buffer, current_char_str);
char_index++; current_char = pathname[char_index];
}
if (strlen(buffer)) {
printf("buffer(%s)\n", buffer);
current_char_str[0] = '\0';
buffer[0] = '\0';
}
if (current_char == '/') {
char_index++; current_char = pathname[char_index];
}
}
}
int main(int argc, char *argv[]) {
parse_path("this/is/a/path/hello");
printf("hello\n");
return 0;
}
Output:
buffer(this)
buffer(is)
buffer(a)
buffer(path)
buffer(hello)
hello
Note however some remaining problems:
allocation failure is not tested, resulting in undefined behavior,
allocated blocks are not freed, resulting in memory leaks,
it is unclear why you test current_char != 11: did you mean to stop at TAB or newline?
Here is a much simpler version with the same behavior:
#include <stdio.h>
#include <string.h>
void parse_path(const char *pathname) {
int i, n;
for (i = 0; pathname[i] != '\0'; i += n) {
if (pathname[i] == '/') {
n = 1; /* skip path separators and empty items */
} else {
n = strcspn(pathname + i, "/"); /* get the length of the path item */
printf("buffer(%.*s)\n", n, pathname + i);
}
}
}
int main(int argc, char *argv[]) {
parse_path("this/is/a/path/hello");
printf("hello\n");
return 0;
}
I'd like to read a big file while the first character of a line isn't " ".
But the code I have written is very slow. How can I speed up the routine?
Is there a better solution instead of getline?
void readString(const char *fn)
{
FILE *fp;
char *vString;
struct stat fdstat;
int stat_res;
stat_res = stat(fn, &fdstat);
fp = fopen(fn, "r+b");
if (fp && !stat_res)
{
vString = (char *)calloc(fdstat.st_size + 1, sizeof(char));
int dataEnd = 1;
size_t len = 0;
int emptyLine = 1;
char **linePtr = malloc(sizeof(char*));
*linePtr = NULL;
while(dataEnd)
{
// Check every line
getline(linePtr, &len, fp);
// When data ends, the line begins with space (" ")
if(*linePtr[0] == 0x20)
emptyLine = 0;
// If line begins with space, stop writing
if(emptyLine)
strcat(vString, *linePtr);
else
dataEnd = 0;
}
strcat(vString, "\0");
free(linePtr);
linePtr = NULL;
}
}
int main(int argc, char **argv){
readString(argv[1]);
return EXIT_SUCCESS;
}
How can I speed up the routine?
The most suspicious aspect of your program performance-wise is the strcat(). On each call, it needs to scan the whole destination string from the beginning to find the place to append the source string. As a result, if your file's lines have length bounded by a constant (even a large one), then your approach's performance scales with the square of the file length.
The asymptotic complexity analysis doesn't necessarily tell the whole story, though. The I/O part of your code scales linearly with file length, and since I/O is much more expensive than in-memory data manipulation, that will dominate your performance for small enough files. If you're in that regime then you're probably not going to do much better than you already do. In that event, though, you might still do a bit better by reading the whole file at once via fread(), and then scanning it for end-of-data via strstr():
size_t nread = fread(vString, 1, fdstat.st_size, fp);
// Handle nread != fdstat.st_size ...
// terminate the buffer as a string
vString[nread] = '\0';
// truncate the string after the end-of-data:
char *eod = strstr(vString, "\n ");
if (eod) {
// terminator found - truncate the string after the newline
eod[1] = '\0';
} // else no terminator found
That scales linearly, so it addresses your asymptotic complexity problem, too, but if the data of interest will often be much shorter than the file, then it will leave you in those cases doing a lot more costly I/O than you need to do. In that event, one alternative would be to read in chunks, as #laissez_faire suggested. Another would be to tweak your original algorithm to track the end of vString so as to use strcpy() instead of strcat() to append each new line. The key part of that version would look something like this:
char *linePtr = NULL;
size_t nread = 0;
size_t len = 0;
*vString = '\0'; // In case the first line is end-of-data
for (char *end = vString; ; end += nread) {
// Check every line
nread = getline(&linePtr, &len, fp);
if (nread < 0) {
// handle eof or error ...
}
// When data ends, the line begins with space (" ")
if (*linePtr == ' ') {
break;
}
strcpy(end, *linePtr);
}
free(linePtr);
Additionally, note that
you do not need to initially zero-fill the memory allocated for *vString, as you're just going to overwrite those zeroes with the data of real interest (and then ignore the rest of the buffer).
You should not cast the return value of malloc()-family functions, including calloc().
Have you tried to read the file using fread and read a bigger chunk of data in each step and then parse the data after reading it? Something like:
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
char *readString(const char *fn)
{
FILE *fp;
char *vString;
struct stat fdstat;
int stat_res;
stat_res = stat(fn, &fdstat);
fp = fopen(fn, "r+b");
if (fp && !stat_res) {
vString = (char *) calloc(fdstat.st_size + 1, sizeof(char));
int newline = 1;
int index = 0;
while (index < fdstat.st_size) {
int len =
fdstat.st_size - index >
4096 ? 4096 : fdstat.st_size - index;
char *buffer = (char *) malloc(len);
int read_len = fread(buffer, 1, len, fp);
int i;
if (newline) {
if (read_len > 0 && buffer[0] == ' ') {
return vString;
}
newline = 0;
}
for (i = 0; i < read_len; ++i) {
if (buffer[i] == '\n') {
if (i + 1 < read_len && buffer[i + 1] == ' ') {
memcpy(vString + index, buffer, i + 1);
return vString;
}
newline = 1;
}
}
memcpy(vString + index, buffer, read_len);
index += read_len;
}
}
return vString;
}
int main(int argc, char **argv)
{
char *str = readString(argv[1]);
printf("%s", str);
free(str);
return EXIT_SUCCESS;
}
I need a version of read line that is memory save. I have this "working" solution. But I'm not sure how it behaves with memory. When I enable free(text) it works for a few lines and then I get an error. So now neither text nor result is ever freed although I malloc text. Is that correct ? And why is that so ?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char* readFromIn()
{
char* text = malloc(1024);
char* result = fgets(text, 1024, stdin);
if (result[strlen(result) - 1] == 10)
result[strlen(result) - 1] = 0;
//free(text);
return result;
}
I have A LOT of short lines to read with this and I also need stdin to be replaceable with a FILE* handle. There is no need for me to realloc text because I have only short lines.
fgets returns a pointer to the string, so after the fgets line, result will be the same memory address as text. Then when you call free (text); you are returning invalid memory.
You should free the memory in the calling function when you have finished with result
You could also avoid the malloc/free stuff by structuring your code to pass a buffer something like this:
void parent_function ()
{
char *buffer[1024];
while (readFromIn(buffer)) {
// Process the contents of buffer
}
}
char *readFromIn(char *buffer)
{
char *result = fgets(buffer, 1024, stdin);
int len;
// fgets returns NULL on error of end of input,
// in which case buffer contents will be undefined
if (result == NULL) {
return NULL;
}
len = strlen (buffer);
if (len == 0) {
return NULL;
}
if (buffer[len - 1] == '\n') {
buffer[len - 1] = 0;
return buffer;
}
Trying to avoid the malloc/free is probably wise if you are dealing with many small, short lived items so that the memory doesn't get fragmented and it should faster as well.
char *fgets(char *s, int size, FILE *stream) reads in at most one less than size characters from stream and stores them into the buffer pointed to by s. Reading stops after an EOF or a newline. If a newline is read, it is stored into the buffer. A terminating null byte ('\0') is stored after the last character in the buffer.
Return Value: returns s on success, and NULL on error or when end of file occurs while no characters have been read.
So there are 2 critical problems with your code:
You don't check the return value of fgets
You want to deallocate the memory, where this string is stored and return a pointer to this memory. Accessing the memory, where such a pointer (dangling pointer) points to, leads to undefined behaviour.
Your function could look like this:
public char* readFromIn() {
char* text = malloc(1024);
if (fgets(text, 1024, stdin) != NULL) {
int textLen = strlen(text);
if (textLen > 0 && text[textLen - 1] == '\n')
text[textLen - 1] == '\0'; // getting rid of newline character
return text;
}
else {
free(text);
return NULL;
}
}
and then caller of this function should be responsible for deallocating the memory that return value of this function points to.
I know you mentioned that the lines are only short, but none of the solutions provided will work for lines greater than 1024 in length. It is for this reason that I provide a solution which will attempt to read entire lines, and resize the buffer when there's not enough space.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MINIMUM_CAPACITY 16
size_t read_line(char **buffer, size_t *capacity) {
char *buf = *buffer;
size_t cap = *capacity, pos = 0;
if (cap < MINIMUM_CAPACITY) { cap = MINIMUM_CAPACITY; }
for (;;) {
buf = realloc(buf, cap);
if (buf == NULL) { return pos; }
*buffer = buf;
*capacity = cap;
if (fgets(buf + pos, cap - pos, stdin) == NULL) {
break;
}
pos += strcspn(buf + pos, "\n");
if (buf[pos] == '\n') {
break;
}
cap *= 2;
}
return pos;
}
int main(void) {
char *line = NULL;
size_t size = 0;
for (size_t end = read_line(&line, &size); line[end] == '\n'; end = read_line(&line, &size)) {
line[end] = '\0'; // trim '\n' off the end
// process contents of buffer here
}
free(line);
return 0;
}
An ideal solution should be able to operate with a fixed buffer of 1 byte. This requires a more comprehensive understanding of the problem, however. Once achieved, adapting such a solution would achieve the most optimal solution.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char *readFromIn(FILE *fp)
{
char text[1024];
size_t len;
if (!fgets(text, sizeof text, fp)) return NULL;
len = strlen(text);
while (len && text[len-1] == '\n') text[--len] = 0;
return strdup(text);
}
Why did no one propose to move the buffer from heap to stack ? This is my solution now:
char input[1024]; // held ready as buffer for fgets
char* readFromIn()
{
char* result = fgets(input, 1024, stdin);
if (result == null)
return "";
if (result[strlen(result) - 1] == '\n')
result[strlen(result) - 1] = 0;
return result;
}