Read access of a file to be shared by multiple threads: pthreads - c

I have to implement an application where user passes multiple words via command line and the application finds count of the word in each line of file. Each word will search the file in its own thread.
So far I have implemented it as single threaded app.
The code looks like:
//Below function reads file line and returns it
char* readLine(FILE* file, char* line)
{
if (file == NULL) {
printf("Error: file pointer is null.");
exit(1);
}
int maximumLineLength = 128;
char *lineBuffer = (char *) malloc(sizeof(char) * maximumLineLength);
if (lineBuffer == NULL) {
printf("Error allocating memory for line buffer.");
exit(1);
}
char ch = getc(file);//Get each character
int count = 0;
//loop for line or EOF
while ((ch != '\n') && (ch != EOF))
{
if (count == maximumLineLength)
{
maximumLineLength += 128;
lineBuffer = realloc(lineBuffer, maximumLineLength);
if (lineBuffer == NULL)
{
printf("Error reallocating space for line buffer.");
exit(1);
}
}
lineBuffer[count] = ch;
count++;
ch = getc(file);
}
lineBuffer[count] = '\0';//Add null character
line = (char *) malloc(sizeof(char) * (count + 1));
strncpy(line, lineBuffer, (count + 1));
free(lineBuffer);
return line;
}
//Below function finds the occurance of
//word in the line
//Need to refine to take into consideration
//scenarios such that {"Am"," am "," am","?Am",".Am"}etc
int findWord(char* line,char* word)
{
int count=0;
int lineLen = strlen(line);
int wordLen = strlen(word);
char* temp= (char *) malloc(sizeof(char) * (lineLen+1));
strcpy(temp,line);
while(true)
{
if( strstr(temp,word) == NULL)
break;
strcpy(temp, strstr(temp,word));
// printf("##%s\n",temp);
strcpy(temp,temp+wordLen+1);
// printf("##%s\n",temp);
count++;
}
//printf("%d\n",count);
free(temp);
return count;
}
//Below function fills the linked list for data structure lineCount
//with word occurance statistics
//line by line and the total
//The number of elements in the list would be number of lines in the
//file
LineCount* findCount(FILE* file, char* word,LineCount** lineCountHead)//Make it multithreaded fn()
{
LineCount* lineHead= NULL;
char* line = NULL;
int lineNumber=1;
int count=0;
if (file == NULL) {
printf("Error: file pointer is null.");
exit(1);
}
while (!feof(file)) {
LineCount* temp=NULL;
line = readLine(file, line);
//printf("%s\n", line);
count=findWord(line,word);
//Critical Section Start
temp=LineCountNode(lineNumber,count);
addToLineCountList(temp,lineCountHead);
//Criticla Section End
lineNumber++;
}
free(line);
return lineHead;
}
So basically I want my calling thread function to be LineCount* findCount(FILE* file, char* word,LineCount** lineCountHead)
My understanding is that, the file will be accessed - only for read purpose by the threads, so no need to take care of synchronization.
Currently I am opening the file as:
pFile = fopen (argv[1],"r");. My question is how do I open in read shared mode ?
I know in C++ there exists a read shared mode. How to achieve this in c?
Also how do I write my function LineCount* findCount(FILE* file, char* word,LineCount** lineCountHead) in the form required by thread call function i.e. the form void* fn(void*)

While in read-only mode there are no issues with the file itself, the IO functions in the standard C library are not designed to be usable from multiple threads in parallel. They are thread-safe (or at least, I think so) but using them correctly from multiple threads is not trivial.
At the lowest level, each FILE structure contains a file position pointer - or the IO functions maintain an OS-provided pointer. Having multiple threads mess with the file cursor position sounds like a good way to make your life more difficult than it should be.
The best approach would be to open your file multiple times - once in each thread. Each thread would then have its own FILE pointer, stream buffer etc. Note that this is not unique to C & POSIX threads - its an inherent issue with using multiple threads.
In any case, I am unsure what you are trying to achieve by using multiple threads. Generally search operations like this are I/O bound - multithreaded accesses to the same file are quite likely to make things worse.
The only case where it might make sense is if you had a huge amount of strings to search for and you had a single I/O thread feeding all other threads through a common buffer. That would distribute the CPU-intensive part, without causing undue I/O...

Related

code in C being killed when reading a 250MB file

I am trying to process a 250MB file using a script in C.
The file is basically a dataset and I want to read just some of the columns and (more importantly) break one of them (which is originally a string) into a sequence of characters.
However, even though I have plenty of RAM available, the code is killed by konsole (using KDE Neon) everytime I run it.
The source is available below:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main() {
FILE *arquivo;
char *line = NULL;
size_t len = 0;
int i = 0;
int j;
int k;
char *vetor[500];
int acertos[45];
FILE *licmat = fopen("licmat.csv", "w");
//creating the header
fprintf(licmat,"CO_CATEGAD,CO_UF_CURSO,ACERTO09,ACERTO10,ACERTO11,ACERTO12,ACERTO13,ACERTO14,ACERTO15,ACERTO16,ACERTO17,ACERTO18,ACERTO19,ACERTO20,ACERTO21,ACERTO22,ACERTO23,ACERTO24,ACERTO25,ACERTO26,ACERTO27,ACERTO28,ACERTO29,ACERTO30,ACERTO31,ACERTO32,ACERTO33,ACERTO34,ACERTO35\n");
if ((arquivo = fopen("MICRODADOS_ENADE_2017.csv", "r")) == NULL) {
printf ("\nError");
exit(0);
}
//reading one line at a time
while (getline(&line, &len, arquivo)) {
char *ptr = strsep(&line,";");
j=0;
//breaking the line into a vector based on ;
while(ptr != NULL)
{
vetor[j]=ptr;
j=j+1;
ptr = strsep(&line,";");
}
//filtering based on content
if (strcmp(vetor[4],"702")==0 && strcmp(vetor[33],"555")==0) {
//copying some info
fprintf(licmat,"%s,%s,",vetor[2],vetor[8]);
//breaking the string (32) into isolated characters
for (k=0;k<27;k=k+1) {
fprintf(licmat,"%c", vetor[32][k]);
if (k<26) {
fprintf(licmat,",");
}
}
fprintf(licmat,"\n");
}
i=i+1;
}
free(line);
fclose(arquivo);
fclose(licmat);
}
The output is perfect up to the point when the script is killed. The output file is just 640KB long and has about 10000 lines only.
What could be the issue?
It looks to me like you're mishandling the memory buffer managed by getline() - which allocates/reallocates as needed - by the use of strsep(), which seems to manipulate that same pointer value.
Once line has been updated to reflect some other element on the line, it's no longer pointing to the start of allocated memory, and then boom the next time getline() needs to do anything with it.
Use a different variable to pass to strsep():
while (getline(&line, &len, arquivo) > 0) { // use ">=" if you want blank lines
char *parseline = line;
char *ptr = strsep(&parseline,";");
// do the same thing later
The key thing here: you're not allowed to muck with the value of line other than to free() it at the end (which you do), and you can't let any other routine do it either.
Edit: updated to reflect getline() returning <0 on error (h/t to #user3121023)

Getting data from a file and printing it out every time a new line starts

Im Completely new to programming but need to get a program running as part of my training. The programs ultimate goal is to read files from a database and then send them to the client who is asking for it.
Currently im just learning how to read strings from a file and write it to a different file. But my problem is that I want to print data out every time i hit a new line.
The data in the file im using is in the following format:
<DESCRIPTION>data,<DESCRIPTION>data,<DESCRIPTION>data etc.
The data is both int and chars.
Since the data is seperated with a "," i was thinking of first puting all "<DESCRIPTION>data" into substrings with the strtok function i managed to find while googling, after that i would scan only for the "DESCRIPTION" part and then put the desired data into an array that I then would print out when reaching the end of the array (end of the line) and then move on to the next line until End of file.
What functions can I use to fix this? Or how do I set up a loop that wont take forever by scanning all chars in the line everytime it wants data? If what im saying and what im doing is 2 different things I again apologize for being a total beginner at programming. I have been prgramming for a week now and this is all I could produce
#include <stdio.h>
#include <ctype.h>
void get9202() {
char
const * str;
const char s = ",";
char * token;
/*open database file*/
FILE * fp = fopen("datafile.dat", "r");
/*create array with all lines of data
I would like it to be able to handle unknown amounts of data.
current file is ~177000 lines of data.*/
int i = 0;
char line[i];
/*checking until end of file*/
while (fgets(line, sizeof(line), fp)) {
/*This part has to be included in the loop somehow but put in here
so that you might get a picture of what im trying to do.*/
while ( * str) {
if (!isspace( * str++))
i++;
else break;
/*not entirely sure how to exit this subloop
to print out the data and go to new line*/
}
/*trying to segment the string into an array of substrings
but dont know when to introduce x*/
token[x] = strtok(str, s);
while (token[x] != NULL) {
printf("%s\n,", token);
}
}
return result;
/* dont know how to return the file to main*/
flclose("datafile.dat");
}
If the data looks like this:
<SYMBOL>9202.T,<SYMSTAT>2,<MSGSTAT>0,<TIME>20:50:40.905246,<SYS_DT>2018/07/19,<SYS_TIM>20:50:40.503,<SYS_TIMU>20:50:40.503236
<SYMBOL>9202.T,<SYMSTAT>2,<MSGSTAT>0,<TIME>20:51:40.000235,<SYS_DT>2018/07/19,<SYS_TIM>20:51:39.598,<SYS_TIMU>20:51:39.598597
the expected file could look like
9202.T,2,0,20:50:40.905246
9202.T,2,0,20:51:40.000235
as the wanted pieces are being selected some will fall away.
Few problems:
Will declare zero length array.
int i=0;
char line[i];
fclose is never executed because of return also fclose needs FILE * as argument.
return result;
/* dont know how to return the file to main*/
flclose("datafile.dat");
Suggestions:
trying to segment the string into an array of substrings but dont
know when to introduce x
Use fgets with fscanf to parse your line since all the lines are identical.
dont know how to return the file to main
Define a structure with needed fields and return it to main.
Example:
typedef struct {
char symbol[50];
char symstat;
char msgstat;
char time[50];
}data;
data *get9202(int *numData) {
int memAllocated = 10;
data *mData = malloc(sizeof(*mData) * memAllocated);
FILE *fp = fopen("datafile.dat", "r");
char buf[3000];
int i = 0;
while (fgets(buf, sizeof buf, fp) != NULL) {
if (i == memAllocated) {
memAllocated *= 2;
void *temp = realloc(mData, sizeof( *mData) * memAllocated);
if (temp != NULL) mData = temp;
else break; //error
}
if (sscanf(buf, "<SYMBOL>%[^,],<SYMSTAT>%c,<MSGSTAT>%c,<TIME>%[^,]",
mData[i].symbol, &mData[i].symstat, &mData[i].msgstat, mData[i].time) == 4) {
i++;
} else {
printf("error\n"); //error
}
}
fclose(fp);
*numData = i;
return mData;
}
int main() {
int len = 0;
data *mData = get9202( &len);
int i = 0;
for (i = 0; i < len; i++)
printf("%s,%c,%c,%s\n", mData[i].symbol, mData[i].symstat, mData[i].msgstat,
mData[i].time);
if (mData) free(mData);
}

Using fgets to read and print multiple lines from .txt file

FILE *fp = fopen("story.txt", "r");
if(fp == NULL){
printf("\nError opening file.\nExiting program.\n");
exit(1);
}
char text[100];
while(fgets(text, 100, fp) != NULL){
printf("%s", text);
}
printf("\n");
fclose(fp);
I'm trying to print the first 100 characters of a text file, including new lines, however when I use the code above it presents some weird behavior. First of all, it only prints the very last line of the text file, which itself is under 100 characters. Also, if I include two print statements in the while loop i.e.
while(fgets(text, 100, fp) != NULL){
printf("%s", text);
printf("%s", text);
}
It prints a lot more than 125 chars of the text file (somewhere in the thousands, it's a big text file), and the contents of said text is a bunch of seemingly random segments from the file in one constant stream, no new lines or anything.
So I guess my question is is there any way to use fgets so that it prints the text in the file, starting from the top, and includes new lines? I eventually have to use this to turn a text file into a character array, so that I can make a new, modified character array based off of that array, which will be printed to a new text file. So if there is a better way to approach that end goal, that would be appreciated.
EDIT: after some discussion in the comments I've realized that the text I am using is just one big block of text with carriage returns and no newlines. I guess at this point my main problem is how to turn this text file with carriage returns into a character array.
If the goal is to read a text file in lines of 100 characters, and to do away with the carriage returns, you can still use fgets() as long as you remember that fgets() will take characters up to and including the next newline, or until one less than the specified number of characters has been read.
The code below reads a line of text, up to BUFFER_SZ-1 characters, increases the memory allocation to hold a new line of text, and copies the line into the allocated space, removing carriage returns and any trailing newlines. Note that the address of the reallocated space is first stored in a temporary pointer. realloc() returns a null pointer in the event of an allocation error, and this step avoids a potential memory leak.
Since the lines are broken at BUFFER_SZ-1 characters, words may be split across lines, and you may want to develop additional logic to handle this. It would be more efficient to reallocate in larger chunks and less frequently, rather than once for every line, as is done here. Also note that it may be useful to open the file in binary mode to more closely parse line endings.
#include <stdio.h>
#include <stdlib.h>
#define BUFFER_SZ 100
int main(void)
{
FILE *fp = fopen("story.txt", "r");
if (fp == NULL) {
fprintf(stderr, "Unable to open file\n");
exit(EXIT_FAILURE);
}
char buffer[BUFFER_SZ];
char (*text)[sizeof buffer] = NULL;
char (*temp)[sizeof buffer] = NULL;
size_t numlines = 0;
while (fgets(buffer, sizeof buffer, fp) != NULL) {
++numlines;
/* Allocate space for next line */
temp = realloc(text, sizeof(*text) * numlines);
if (temp == NULL) {
fprintf(stderr, "Error in realloc()\n");
exit(EXIT_FAILURE);
}
text = temp;
/* Copy buffer to text, removing carriage returns and newlines */
char *c = buffer;
char *line = text[numlines-1];
while (*c != '\n' && *c != '\0') {
if (*c != '\r') {
*line++ = *c;
}
++c;
}
*c = '\0';
}
if (fclose(fp) != 0) {
fprintf(stderr, "Unable to close file\n");
}
for (size_t i = 0; i < numlines; i++) {
printf("%s\n", text[i]);
}
free(text);
return 0;
}
Another option would be to replace the carriage returns with newlines. This may be what OP had in mind. The above program is easily modified to accomplish this. Note that the \n is removed from the printf() statement that displays the results, since newlines are now included in the strings.
...
/* Copy buffer to text, converting carriage returns to newlines */
char *c = buffer;
char *line = text[numlines-1];
while (*c != '\n' && *c != '\0') {
if (*c == '\r') {
*line++ = '\n';
} else {
*line++ = *c;
}
++c;
}
*c = '\0';
}
if (fclose(fp) != 0) {
fprintf(stderr, "Unable to close file\n");
}
for (size_t i = 0; i < numlines; i++) {
printf("%s", text[i]);
}
...
It doesn't copy one line onto the end of another. It simply reuses the buffer you keep passing it. If you want multiple lines stored, copy them to another buffer, and concatenate them. (See: strcat)

Piping log output through a C program for easy log rotation

I'm trying to make it really easy to logrotate some of my apps that log via bash redirection. Basically, I have a C program that reads STDIN into a buffer. It reads this buffer, and whenever it encounters a newline, it will write the output it has gathered to a file.
The difference in this program is that it does not leave the file open. It opens it for appending each time a new line is encountered. This works great with the logrotate utility, but I'm wondering if there's some sort of horrible unforseen issue I'm not accounting for that I'll run into later on.
Is it better just to implement signal handling in this utility and have logrotate send it a SIGHUP? Are there horrible performance penalties to what I'm doing?
So normally where you'd do:
./app >> output.log
With the logger util you do:
./app | ./mylogger output.log
Although I'm too bad in C, I'm not very well versed in its best practices. Any guidance would be greatly appreciated.
Here's the source:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#define BUFSIZE 1024
#define MAX_WRITE_FAILS 3
/**
* outputs the given content to the specified file.
*/
int file_output(char *filename, char *content, size_t content_length)
{
FILE *fp;
fp = fopen(filename, "a");
content[content_length + 1] = '\0';
if(fp == NULL) return errno;
fwrite(content, sizeof(char), content_length, fp);
fclose(fp);
return 0;
}
/**
* Loops over STDIN and whenever it finds a newline, sends the current content
* buffer to the file specified on the command line.
*/
int main(int argc, char *argv[])
{
int i;
char buffer[BUFSIZE];
char *content = malloc(sizeof(char) * BUFSIZE);
size_t content_size = 0;
int content_buf_size = BUFSIZE;
int write_failures = 0;
char *file;
if(argc < 2)
{
fprintf(stderr, "Usage: logger <file>");
exit(1);
}
file = argv[1];
// loop over STDIN
while(fgets(buffer, BUFSIZE, stdin))
{
int output_err;
int buflength = strlen(buffer);
// loop over character for character, searching for newlines and
// appending our buffer to the output content as we go along
for(i = 0; i < buflength; i++)
{
char *old = content;
// check if we have a newline or end of string
if(buffer[i] == '\n' || buffer[i] == '\0' || (i != (buflength - 1) && buffer[i] == '\r' && buffer[i+1] == '\n'))
{
content[content_size] = '\n';
output_err = file_output(file, content, content_size + 1);
if(output_err == 0)
{
// success! reset the content size (ie more or less resets
// the output content string)
content_size = 0;
write_failures = 0;
}
else
{
// write failed, try to keep going. this will preserve our
// newline so that the next newline we encounter will write
// both lines (this AND and the next).
content_size++;
write_failures++;
}
}
if(write_failures >= MAX_WRITE_FAILS)
{
fprintf(stderr, "Failed to write output to file %d times (errno: %d). Quitting.\n", write_failures, output_err);
exit(3);
}
if(buffer[i] != '\n' && buffer[i] != '\r' && buffer[i] != '\0')
{
// copy buffer into content (if it's not a newline/null)
content[content_size] = buffer[i];
content_size++;
}
// check if we're pushing the limits of our content buffer
if(content_size >= content_buf_size - 1)
{
// we need to up the size of our output buffer
content_buf_size += BUFSIZE;
content = (char *)realloc(content, sizeof(char) * content_buf_size);
if(content == NULL)
{
fprintf(stderr, "Failed to reallocate buffer memory.\n");
free(old);
exit(2);
}
}
}
}
return 0;
}
Thanks!
Since my suggestion in the comments turned out to be what you needed, I am adding it as an answer, with more of an explanation.
When you have a logging application which can not be told to close its logfile (usually via SIGHUP), you can use the 'copytruncate' option in your logrotate.conf.
Here is the description from the man page:
Truncate the original log file in place after creating a copy,
instead of moving the old log file and optionally creating a new
one, It can be used when some program can not be told to close
its logfile and thus might continue writing (appending) to the
previous log file forever. Note that there is a very small time
slice between copying the file and truncating it, so some log-
ging data might be lost. When this option is used, the create
option will have no effect, as the old log file stays in place.
Source: http://linuxcommand.org/man_pages/logrotate8.html

Reading text file into an array of lines in C

Using C I would like to read in the contents of a text file in such a way as to have when all is said and done an array of strings with the nth string representing the nth line of the text file. The lines of the file can be arbitrarily long.
What's an elegant way of accomplishing this? I know of some neat tricks to read a text file directly into a single appropriately sized buffer, but breaking it down into lines makes it trickier (at least as far as I can tell).
Thanks very much!
Breaking it down into lines means parsing the text and replacing all the EOL (by EOL I mean \n and \r) characters with 0.
In this way you can actually reuse your buffer and store just the beginning of each line into a separate char * array (all by doing only 2 passes).
In this way you could do one read for the whole file size+2 parses which probably would improve performance.
It's possible to read the number of lines in the file (loop fgets), then create a 2-dimensional array with the first dimension being the number of lines+1. Then, just re-read the file into the array.
You'll need to define the length of the elements, though. Or, do a count for the longest line size.
Example code:
inFile = fopen(FILENAME, "r");
lineCount = 0;
while(inputError != EOF) {
inputError = fscanf(inFile, "%s\n", word);
lineCount++;
}
fclose(inFile);
// Above iterates lineCount++ after the EOF to allow for an array
// that matches the line numbers
char names[lineCount][MAX_LINE];
fopen(FILENAME, "r");
for(i = 1; i < lineCount; i++)
fscanf(inFile, "%s", names[i]);
fclose(inFile);
For C (as opposed to C++), you'd probably wind up using fgets(). However, you might run into issues due to your arbitrary length lines.
Perhaps a Linked List would be the best way to do this?
The compiler won't like having an array with no idea how big to make it. With a Linked List you can have a really large text file, and not worry about allocating enough memory to the array.
Unfortunately, I haven't learned how to do linked lists, but maybe somebody else could help you.
If you have a good way to read the whole file into memory, you are almost there. After you've done that you could scan the file twice. Once to count the lines, and once to set the line pointers and replace '\n' and (and maybe '\r' if the file is read in Windows binary mode) with '\0'. In between scans allocate an array of pointers, now that you know how many you need.
you can use this way
#include <stdlib.h> /* exit, malloc, realloc, free */
#include <stdio.h> /* fopen, fgetc, fputs, fwrite */
struct line_reader {
/* All members are private. */
FILE *f;
char *buf;
size_t siz;
};
/*
* Initializes a line reader _lr_ for the stream _f_.
*/
void
lr_init(struct line_reader *lr, FILE *f)
{
lr->f = f;
lr->buf = NULL;
lr->siz = 0;
}
/*
* Reads the next line. If successful, returns a pointer to the line,
* and sets *len to the number of characters, at least 1. The result is
* _not_ a C string; it has no terminating '\0'. The returned pointer
* remains valid until the next call to next_line() or lr_free() with
* the same _lr_.
*
* next_line() returns NULL at end of file, or if there is an error (on
* the stream, or with memory allocation).
*/
char *
next_line(struct line_reader *lr, size_t *len)
{
size_t newsiz;
int c;
char *newbuf;
*len = 0; /* Start with empty line. */
for (;;) {
c = fgetc(lr->f); /* Read next character. */
if (ferror(lr->f))
return NULL;
if (c == EOF) {
/*
* End of file is also end of last line,
` * unless this last line would be empty.
*/
if (*len == 0)
return NULL;
else
return lr->buf;
} else {
/* Append c to the buffer. */
if (*len == lr->siz) {
/* Need a bigger buffer! */
newsiz = lr->siz + 4096;
newbuf = realloc(lr->buf, newsiz);
if (newbuf == NULL)
return NULL;
lr->buf = newbuf;
lr->siz = newsiz;
}
lr->buf[(*len)++] = c;
/* '\n' is end of line. */
if (c == '\n')
return lr->buf;
}
}
}
/*
* Frees internal memory used by _lr_.
*/
void
lr_free(struct line_reader *lr)
{
free(lr->buf);
lr->buf = NULL;
lr->siz = 0;
}
/*
* Read a file line by line.
* http://rosettacode.org/wiki/Read_a_file_line_by_line
*/
int
main()
{
struct line_reader lr;
FILE *f;
size_t len;
char *line;
f = fopen("foobar.txt", "r");
if (f == NULL) {
perror("foobar.txt");
exit(1);
}
/*
* This loop reads each line.
* Remember that line is not a C string.
* There is no terminating '\0'.
*/
lr_init(&lr, f);
while (line = next_line(&lr, &len)) {
/*
* Do something with line.
*/
fputs("LINE: ", stdout);
fwrite(line, len, 1, stdout);
}
if (!feof(f)) {
perror("next_line");
exit(1);
}
lr_free(&lr);
return 0;
}

Resources