I am creating a program in C that splits a large text file into 10 segments, and then creates 10 threads with each thread generating a word count for each segment. I took the function word_count from this code: https://github.com/prateek-khatri/seaOfC/blob/master/frequencyMultiThread.c. That program works fine for me, but when I tried to use word_count in my own program, it crashes when trying to get the size of the buffer.
It seems like everything is ok in the function getCurrentSegmentWordcount, but when that function calls word_count, it crashes (segmentation fault) at the line printf("sizeof Buff: %d", sizeof(buff));.
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <unistd.h>
#define NUMBER_OF_THREADS 10
//struct taken from reference:
struct return_val{
char wordlist[100][100]; //[chars][lines]
int count[100];
} *arr; //array of words
void *print_hello_world(void * tid)
{
//This function prints the thread’s identifier and then exits.
printf("Hello World. Greetings from thread %d\n", tid);
pthread_exit(NULL);
}
void *word_count(void* num)
{
int *ln = num;
unsigned int line_number = *ln;
//line_number++;
printf("Thread %d\n",line_number);
char cmd_p1[9] = "sed -n '\0";
char cmd_p2[2];
sprintf(cmd_p2,"%d",line_number); //stores string in buffer
char cmd_p3[21] = "p' 'maintainers.txt'\0";
char command[100];
command[0] = '\0';
//char * strcat ( char * destination, const char * source );
//appends a copy of source to destination
strcat(command,cmd_p1);
strcat(command,cmd_p2);
strcat(command,cmd_p3);
usleep(line_number);
char cmd[100] = " | tr [:space:] '\\n' | grep -v '^\\s*$' | sort | uniq -c | sort\0";
strcat(command,cmd);
printf("Command: %s\n",command);
//fflush(stdout);
FILE *in;
in= popen(command, "r"); //read command and pipe into the shell
rewind(in); //set file position to beginning of 'in'
char buff[50];
int counter = 0;
//char * fgets ( char * str, int num, FILE * stream );
//reads chars from stream and stores them as string into buff until all of buffer has been read
printf("before\n");
bool testBool = fgets(buff,sizeof(buff),in);
printf("testBool: %d\n", testBool);
//CRASH HAPPENS HERE:
//buff
printf("sizeof Buff: %d", sizeof(buff));
while(fgets(buff,sizeof(buff),in))
{
printf("fire 0.5");
char c=' ';
int i = 0;
int cnt = atoi(buff); //converts string to int.. buff == # of chars in file?
arr[line_number-1].count[counter] = cnt; //at this point line_number == 1
printf("fire1\n");
while(c!='\0')
{
c=buff[i];
buff[i]=buff[i+6];
i++;
}
int cnnt = 0;
while(c!=' ')
{
c = buff[cnnt];
cnnt++;
}
i=0;
while(c!='\0')
{
c=buff[i];
buff[i]=buff[i+cnnt];
i++;
}
sprintf(arr[line_number-1].wordlist[counter],"%s",buff);
printf("%d %s",arr[line_number-1].count[counter],arr[line_number-1].wordlist[counter]);
counter++;
}
printf("final count: %d", counter);
arr[line_number-1].count[counter] = -1;
fclose(in);
//pthread_exit(NULL); //didn't help to move here from getCurrentSegment...()
return NULL;
}
void *getCurrentSegmentWordcount(void * tid) { //declaring file pointer (value?)
int segment = tid;
segment = segment + 1; //converts to int
printf("segment/thread: %d \n", segment);
char text[1000];
//char buffer[150];
FILE *fp = fopen("words.txt", "r");
if(fp == NULL) {
printf("null file");
}
int i = 0;
long lSize;
char *buffer;
if( !fp ) perror("words.txt"),exit(1);
fseek( fp , 0L , SEEK_END);
lSize = ftell( fp );
rewind( fp );
buffer = calloc( 1, lSize+1 );
if( !buffer ) fclose(fp),fputs("memory alloc fails",stderr),exit(1);
if( 1!=fread( buffer , lSize, 1 , fp) )
fclose(fp),free(buffer),fputs("entire read fails",stderr),exit(1);
//printf(buffer);
char *token = strtok(buffer, "~");
if(segment == 1) {
printf("segment 1: %s", token);
word_count(&segment);
}
if(segment == 2) {
token = strtok(NULL,"~");
printf("segment 2: %s", token);
}
if(segment == 3) {
token = strtok(NULL,"~");
token = strtok(NULL,"~");
printf("segment 3: %s", token);
}
if(segment == 4) {
token = strtok(NULL,"~");
token = strtok(NULL,"~");
token = strtok(NULL,"~");
printf("segment 4: %s", token);
}
fclose(fp);
free(buffer);
//pthread_exit(NULL);//moving to end of word_count()
}
int main(int argc, char *argv[])
{
//The main program creates x threads and then exits.
pthread_t threads[NUMBER_OF_THREADS];
int status, i;
for(i=0; i < NUMBER_OF_THREADS; i++) {
printf("Main here. Creating thread %d\n", i+1);
status = pthread_create(&threads[i], NULL, getCurrentSegmentWordcount, (void * )i);
if (status != 0) {
printf("Oops. pthread create returned error code %d\n", status);
exit(-1);
}
}
sleep(8);
exit(NULL);
}
Output:
Main here. Creating thread 1
Main here. Creating thread 2
segment/thread: 1
Main here. Creating thread 3
segment 1: test(segment 1, handled my thread 1)
Thread 1
Main here. Creating thread 4
Command: sed -n '1p' 'maintainers.txt' | tr [:space:] '\n' | grep -v '^\s*$' | sort | uniq -c | sort
Main here. Creating thread 5
segment/thread: 2
before
segment/thread: 4
Main here. Creating thread 6
segment 4:
test test test test (segment 4, handled by thread 4)
Main here. Creating thread 7
segment 2:
test test (segment 2, handled by thread 2)
Main here. Creating thread 8
Main here. Creating thread 9
Main here. Creating thread 10
segment/thread: 3
segment 3:
test test test (segment 3, handled by thread 3)
segment/thread: 10
segment/thread: 9
segment/thread: 8
segment/thread: 5
segment/thread: 6
segment/thread: 7
testBool: 1
Makefile:20: recipe for target 'all' failed
make: *** [all] Segmentation fault (core dumped)
There are many issues with this code, some have been already mentioned by
user3629249, so I'll try to summarize the errors here.
Passing (void * )i for the argument for the thread is rather ugly. Sure it
works but this is for me sloppy programming, I'd declare an int array and fill
it with the id values and pass a pointer to the locations.
int ids[NUMBER_OF_THREADS];
for(i=0; i < NUMBER_OF_THREADS; i++) {
ids[i] = i+1;
status = pthread_create(&threads[i], NULL, getCurrentSegmentWordcount, ids + i);
...
}
and then in the thread function:
void *getCurrentSegmentWordcount(void * tid) { //declaring file pointer (value?)
int segment = *((int*) tid);
// segment = segment + 1; not needed anymore
...
}
This code is more clean, easier to understand for you and for the code reviewer,
does not relay on ugly unnecessary casts and is more portable.
Same thing with
void *print_hello_world(void *tid)
{
//This function prints the thread’s identifier and then exits.
printf("Hello World. Greetings from thread %d\n", tid);
pthread_exit(NULL);
}
This is uggly, you are trying to pass a pointer as an int. The size of a
pointer may not be the same as the size of an int. Using the same way of
passing a pointer to int (like for getCurrentSegmentWordcount):
void *print_hello_world(void *tid)
{
//This function prints the thread’s identifier and then exits.
printf("Hello World. Greetings from thread %d\n", *((int*) tid));
pthread_exit(NULL);
}
Write error messages to stderr. This FILE buffer is opened for that reason,
that's what people expect from programs to do. When you execute a program, you
can do this:
$ program 2>/tmp/error.log
or this
$ program 2>/dev/null | some_other_tool
so that you can separate the normal output from the error outputs.
And when a system function fails, the errno variable is set to the error code.
You can use perror for a standard error message or if you want a custom one,
use strerror:
pid_t p = fork();
if(p < 0)
{
perror("fork failed");
// or
fprintf(stderr, "Error while executing fork: %s\n", strerror(errno));
return; // or exit or whatever
}
You can write code in one line if you want to enter the C obfuscated contest,
otherwise don't do that. It's hard to read for you, it's hard to read for the
code reviewer/co-worker/superior. You gain nothing from it.
Instead of
if( !buffer ) fclose(fp),fputs("memory alloc fails",stderr),exit(1);
do
if(buffer == NULL)
{
fclose(fp);
fputs("memory alloc fails", stderr);
exit(EXIT_FAILURE); // or exit(your_exit_status)
}
It's easier to read for everyone.
You should always check the return value of functions that return a pointer.
Check the return value of malloc, calloc, realloc, strtok, etc.
if(segment == 2) {
token = strtok(NULL,"~");
printf("segment 2: %s", token);
}
If strtok returns NULL, then the printf line yields undefined behaviour.
See 3.5.3.3 comment 2:
3.5.3.3:
Synopsis
#define __STDC_WANT_LIB_EXT1__ 1
#include <stdio.h>
int printf_s(const char * restrict format, ...);
[...]
2 format shall not be a null pointer. The %n specifier (modified or not by flags, field width, or precision) shall not appear in the string pointed to by format. Any argument to printf_s corresponding to a %s specifier shall not be a null pointer.
[...]
4 The printf_s function is equivalent to the printf function except for the explicit runtime-constraints listed above.
Some libc implementation may forgive you to pass NULL to printf with %s
and print (null), but this is not portable and is undefined behaviour. So you
can only do the printf if token is not NULL.
The word_count function is a little bit horrible, specially how you construct
the commands.
char cmd_p1[9] = "sed -n '\0";
can be rewritten as
char cmd_p1[] = "sed -n '";
This will create a char array with the correct amount of bytes and initialize
it with a valid 0-terminated string, no need to add the '\0' yourself.
The commands that are the same, meaning that they don't need a value from a
variable can be store in a char[] or even in a const char*. Then construct
the whole thing with snprintf and sprintf, less lines, less mistakes:
void *word_count(void* num)
{
...
const char *pipe_cmd = "| tr [:space:] '\\n' | grep -v '^\\s*$' | sort | uniq -c | sort";
const char *format = "sed -n '%dp' 'maintainers.txt' %s";
int cmd_size = snprintf(NULL, 0, format, line_number, pipe_cmd);
char *command = malloc(cmd_size + 1);
if(command == NULL)
return NULL;
sprintf(command, format, line_number, pipe_cmd);
...
FILE *in;
in= popen(command, "r");
free(command);
...
}
Also note that
char cmd_p2[2];
sprintf(cmd_p2,"%d",line_number); //stores string in buffer
will overflow the buffer if the line number is greater than 9.
bool testBool = fgets(buff,sizeof(buff),in);
printf("testBool: %d\n", testBool);
fgets returns a pointer to char, not a bool. The printf will print the
value of a pointer as an integer. A pointer size is not necessarily the same as
an int size, in fact on my system a pointer is 8 bytes long, int is 4 bytes
long. You should do:
if(fgets(buff, sizeof(buff), in))
puts("fgets success");
//CRASH HAPPENS HERE:
//buff
printf("sizeof Buff: %d", sizeof(buff));
It won't crash because of the sizeof. sizeof is evaluated at compile time,
not at run-time.
The sizeof-operator returns a size_t.
%d is not the correct specifier for size_t, %lu is, it should be
printf("sizeof buff: %lu\n", sizeof buff);
It will most probably crash because of all the undefined behaviour before
this point.
arr[line_number-1].count[counter] = cnt;
In your whole code, arr is uninitialized, so you are accessing a value
through an uninitialized pointer. That's undefined behaviour and might lead to a
segfault.
I want to quote user3629249 here:
user3629249 wrote:
the main() function is starting several threads, then immediately exiting. The process of exiting also eliminates the threads Suggest: in main() calling pthread_join() for each thread. in the thread, at the end, call pthread_exit()
Please don't ignore compiler warnings, they are not there to annoy you, they are
there to help you. They are a hint that what you are doing may not be what you
really want. Undefined behaviour, segfaults etc. are often a consequence of
that. So heed warnings of the compiler and when you see one, look at your code,
try to understand it and fix it. If you don't understand the warning, you can
come here and ask a question about it. But having thousand of warning and
ignoring them will lead to headaches and quite franky a lot of wasted time on
your side and ours.
So, please fix all this warnings and details, look at the warning messages of
the compiler and the code might run without problems.
Related
I have to figure out the available space in /mnt/ in my application. I wrote the following code. However, execute_cmd some times returns junk apart from the actual output. For ex: 4.5K(followed by junk). Where am I going wrong? Could some one review and let me know why execute_cmd returns a junk byte at the end? How do I improve the code?
char *execute_cmd(char *cmd)
{
FILE *fp;
char path[100];
int ii = 0;
//char ii = 0;
char *buffer = malloc(1024);
char len = 0;
/* Open the command for reading. */
fp = popen(cmd, "r");
if (fp == NULL) {
printf("Failed to run command\n" );
exit(1);
}
printf("Running command is: %s\n", cmd);
memset(buffer, 0, sizeof(buffer));
do {
len = fread(path, 100, 1, fp); /* Is it okay to use fread? I do not know how many bytes to read as this function is a generic function which can be used for executing any command */
strcat(buffer,path);
printf("Number of bytes is: %d\n", len);
} while (len != 0);
len = strlen(buffer);
printf("Buffer contents are: %s %d\n", buffer,len);
/* close */
pclose(fp);
}
void main()
{
char *buffer = "df -h | grep \"/mnt\" | awk '{ print $4}'"; /* FIXME */
char len;
char units;
float number;
char dummy = 0;
char *avail_space;
avail_space = execute_cmd(buffer);
len = strlen(avail_space);
units = avail_space[len - 1];
printf("Available space is: %s %d %c end here\n", avail_space, len, units);
number = strtof(avail_space, NULL);
printf("Number is: %f\n", number);
}
sizeof(buffer) is sizeof(char*), which is probably 8 (or maybe 4). So your memset only clears a little bit of buffer. But with your use of fread, it's not just buffer that needs to be cleared; it's the temporary path.
Uninitialized local variables like path are not zero-initialised. You could use memset(path, 0, sizeof(path)); to clear it -- here the sizeof works because path really is an array -- but simpler is to initialise it in the declaration: char path[100] = "";.
Since fread does not NUL-terminate what it reads, there might be arbitrary garbage following it, making the strcat Undefined Behaviour. In fact, the strcat is totally unnecessary and a waste of cycles. You know how much data you read (it's in len) so you know exactly where to read the next chunk and you can do so directly without a temporary buffer and without a copy.
For future reference, if you are planning on calling malloc and then using memset to clear the allocated region, you should instead use calloc. That's what it's there for.
I'm trying to read information printed by program A from program B. How can I pass data from A to B using read()?.
code for A
#include <stdio.h>
int main(int argc, char **argv)
{
int i, j;
char instruc_list[11][3] = {"sa", "sb", "ss", "pa", "pb",
"ra", "rb", "rr", "rra", "rrb", "rrr"};
i = 0;
while (i < 11)
{
j = 0;
while (j < 3)
{
printf("%c", instruc_list[i][j]);
j++;
}
i++;
printf("\n");
}
return (0);
}
code for B
int main()
{
char buf[4];
while ((read(0,buf, 4)))
{
printf("%s", buf);
}
printf("\n");
return 0;
}
When I run this two programs, I get the following result.
Use the popen() and pclose() functions defined in stdio.h to pipe output between programs.
Here's an example program of how to print the output of the ls shell command in your program, taken from this link:
FILE *fp;
int status;
char path[PATH_MAX];
fp = popen("ls *", "r");
if (fp == NULL)
/* Handle error */;
while (fgets(path, PATH_MAX, fp) != NULL)
printf("%s", path);
status = pclose(fp);
if (status == -1) {
/* Error reported by pclose() */
...
} else {
/* Use macros described under wait() to inspect `status' in order
to determine success/failure of command executed by popen() */
...
}
For your case, you'd call popen("./A", "r");.
You can use popen() to read the output of program A from program B.
Compile the first program:
gcc a.c -o a
In the program B:
#include <stdio.h>
int main(void)
{
char buf[4];
FILE *fp;
fp = popen("./a", "r");
while( !feof(fp)) {
fscanf(fp, "%s", buf);
printf("%s\n", buf);
}
return 0;
pclose(fp);
}
Now compile and execute the program B:
gcc b.c -o b
me#linux:$ ./b
The output I got is:
sa
sb
ss
pa
pb
ra
rb
rr
rra
rrb
rrr
rrr
In program A, you're not writing the null terminators for the 3-letter strings... and in program B, you're not adding a null char after the characters you read (and haven't initialised buf, so it might not contain one). That's why you're getting garbage between the 3-letter strings you read... printf() is continuing past the characters you read because it hasn't found a null yet.
Also note that read() can return -1 for error, which would still test as true for your while loop. You should at least check that read() returns greater than 0 (rather than just non-zero), if not put in more thorough error handling.
So with some changes to address these issues, program B might become:
int main()
{
char buf[4];
int ret; // ** for the return from read()
while ((ret = read(0,buf, 4)) > 0) // ** read >0 (no error, and bytes read)
{
fwrite(buf, 1, ret, stdout); // ** write the number of chars
// you read to stdout
}
printf("\n");
return 0;
}
As for program A, right now it writes 3 characters for both the 2-letter and the 3-letter strings -- which means it includes the null char for the 2-letter strings but not for the 3-letter strings. With the changes to program B above, you don't need to write the null characters at all... so you could change:
while (j < 3)
to:
while (j < 3 && instruc_list[i][j] != 0)
to stop when the null character is reached (though it's still inefficient to use a printf() call just to write a single char -- perhaps putchar(instruc_list[i][j]); would be better). Or, you could just replace that inner while loop with:
fputs(instruc_list[i], stdout);
...which would then write the string in instruc_list[i] up to but not including the null char, and also change instruc_list[11][3] to instruc_list[11][4] so that it has room for the null char from the 3-letter string literals in the initialiser list.
I'm new in C and already feel like bored on its data types.
#include <stdio.h>
#include <string.h>
int main ()
{
char command[50];
strcpy( command, "ps -x | wc" );
int first = pepen(command);
int two_digit = first;
printf("two = %i, three = %d", two_digit, first);
return(0);
}
I want to get first two numbers 88 of int first.
I have no idea to which kind of integer int first is equal. When I use %c instead of %d terminal reporting that I int first is integer type and using first[0] reporting that it is not object. Then what is the type of first and
how can I print only first two digits white there is space between them?
IF (that's a big if):
You are on a Unix-like system, and
The command you run generates numbers on standard output,
then you can probably use popen() and pclose() to run the command and read the output of the command:
#include <stdio.h>
int main(void)
{
char command[] = "echo 82 509 8274";
FILE *fp = popen(command, "r");
if (fp != 0)
{
int first;
if (fscanf(fp, "%d", &first) == 1)
printf("Got %d from the command\n", first);
pclose(fp);
}
return 0;
}
I've been racking my brain against this program for a couple months now. This was an assignment of a class I took last semester and while I passed, I could never make this one assignment work correctly (Seg Fault). I welcome any help or tips but I greatly appreciate explanations with answers to follow along.
This program is supposed to receive a filename that contains a list of filenames (240 for my example). Those files are located in a folder within the same directory as the list and the program. This program is supposed to take this list and parse it for 4 threads, splitting the filenames evenly for each thread (60 per thread for my example). Each thread then takes this list of 60 file names and opens each file one by one performing a WordCount function on each file. Once the threads complete their tasks, they are supposed to print the results for each file in order with each thread in its own block(i.e. Thread1 Results | Thread2 Results | Thread 3 Results, etc...).
I've debugged quite a bit and know that up until the threads are created everything works as it is supposed to. My problem seems to be during thread launch/execution. I've tried adding mutex to the mix but sadly it hasn't helped. I seem to be missing something or over thinking something as some of my class mates have shown me their much more compact code. Please assist. Thanks!
Here is the Main:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MaxLine 200
#define NUMTHREADS 4
char Line[MaxLine];
pthread_mutex_t Lock = PTHREAD_MUTEX_INITIALIZER;
typedef struct thread {
int id;
char file;
}ThreadData;
/* ThreadFunction will give each thread its processes to execute */
void *threadFunc (void *td) {
ThreadData *data = (ThreadData*)td;
int thread_num=data->id;
char filename=data->file;
printf("thread debug tid: %d and file: %c",thread_num, filename);
pthread_mutex_trylock(&Lock);
FILE *fn = fopen(filename, "r");
if (fn == NULL) {
error("ERROR: Opening file");
return 1;
}
while (fgets(Line, sizeof(Line), fn) != NULL) {
CountWord(thread_num, Line);
}
fclose(fn);
free(data);
pthread_mutex_unlock(&Lock);
pthread_exit(NULL);
}
int main(int argc, char *argv[]){
char buf[20];
int c, i, t, tnum, QUEUETOTAL;
pthread_t thread[NUMTHREADS];
ThreadData td[NUMTHREADS];
if (argc != 2){
fprintf(stderr,"ERROR: Usage must be Countfile filename\n", argv[0]);
exit(0);
}
char const* const filename = argv[1];
FILE* file = fopen(filename, "r");
if ( file == 0 ){
printf( "Could not open file!\n" );
exit(0);
}
/* Count iterations of while loop to divide files among threads. */
while (fgets(Line, sizeof(Line), file)){
QUEUETOTAL++;
}
/* Divide work for threads. */
int thread2taskstart=(QUEUETOTAL/NUMTHREADS); //60
int thread3taskstart=(QUEUETOTAL/NUMTHREADS)*2; //120
int thread4taskstart=(QUEUETOTAL/NUMTHREADS)*3; //180
// QUEUETOTAL = 240
rewind(file);
FILE *tempfile1 = fopen("temp1.txt","w");
for (i=0; i<thread2taskstart; i++) {
// populate tempfile1 with entries 1-60
if(fgets(Line,sizeof(Line),file)!=NULL) {
fputs(Line,tempfile1);
//printf("Debug temp1: %s",Line);
}
}
fclose(tempfile1);
FILE *tempfile2 = fopen("temp2.txt","w");
for (i=thread2taskstart; i<thread3taskstart; i++) {
// populate tempfile2 with entries 60-120
if(fgets(Line,sizeof(Line),file)!=NULL) {
fputs(Line,tempfile2);
//printf("Debug temp2: %s",Line);
}
}
fclose(tempfile2);
FILE *tempfile3 = fopen("temp3.txt","w");
for (i=thread3taskstart; i<thread4taskstart; i++) {
// populate tempfile3 with entries 120-180
if(fgets(Line,sizeof(Line),file)!=NULL) {
fputs(Line,tempfile3);
//printf("Debug temp3: %s",Line);
}
}
fclose(tempfile3);
FILE *tempfile4 = fopen("temp4.txt","w");
for (i=thread4taskstart; i<=QUEUETOTAL; i++) {
// populate tempfile3 with entries 180-240
if(fgets(Line,sizeof(Line),file)!=NULL) {
fputs(Line,tempfile4);
//printf("Debug temp4: %s",Line);
}
}
fclose(tempfile4);
fclose(file);
/* Prepare parameters & launch (4) threads. Wait for threads
to finish & print out results as specified in assignment. */
printf("Counting files …\n");
for(t=0;t<NUMTHREADS;t++){
tnum=t+1;
snprintf(buf, "temp%d.txt", tnum);
printf("debug tnum and array: %d and %s\n",tnum, buf);
td[t].id = tnum;
td[t].file = buf;
// Creates a new thread for each temp file.
pthread_create(&thread[t], NULL, threadFunc, td);
}
// Joins threads.
printf("debug: printing threads \n");
for(t=0;t<NUMTHREADS;t++){
pthread_join(thread[t], NULL);
printf("------------------------- Processes finished for Thread %d ----------------------- \n",t+1);
}
return 0;
}
Here is the Count Function:
#include <stdio.h>
int CountWord(int tinfo, char cfile){
int i;
int ccount = 0;
int wcount = 0;
int lcount = 0;
FILE *fname;
char fn[strlen(cfile) + 18];
sprintf(fn, "./CountingFolder/%s", cfile);
printf("Debug: %s\n", fn);
fname = fopen(fn, "r");
if (fname == NULL) {
error("ERROR: Opening file");
}
while ((i = fgetc(fname)) != EOF){
if (i == '\n') {
lcount++;
}
if (i == '\t' || i == ' '){
wcount++;
}
ccount++;
}
printf("Threadid %d processes %s which has %d characters, %d words and %d lines\n", tinfo, cfile, ccount, wcount, lcount);
fclose(fname);
return 0;
}
1) Probably a typo. But
int CountWord(int tinfo, char cfile){ .. }
should be
int CountWord(int tinfo, char *cfile){ .. }
2) You are passing the same buf to all threads from main(). Data race and undefined behaviour.
3) None of the snprintf() calls take the size argument. Undefined behaviour.
4) Since all threads work on different data, you don't need a lock at all.
5) You didn't allocate td array. So you can't call free(data); in the thread function. Undefined behaviour.
There may be more issues with the code but the segfault is likely because of (3) or (5).
In your code here
snprintf(buf, "temp%d.txt", tnum);
printf("debug tnum and array: %d and %s\n",tnum, buf);
td[t].id = tnum;
td[t].file = buf;
the last line assigns a pointer into the file field of this struct
typedef struct thread {
int id;
char file;
}ThreadData;
Shouldn't it be char *file; ? I don't have thread.h in MSVC so I can't compile it. Surely you have all warnings enabled??
I wrote a simple program that would open a csv file, read it, make a new csv file, and only write some of the columns (I don't want all of the columns and am hoping removing some will make the file more manageable). The file is 1.15GB, but fopen() doesn't have a problem with it. The segmentation fault happens in my while loop shortly after the first progress printf().
I tested on just the first few lines of the csv and the logic below does what I want. The strange section for when index == 0 is due to the last column being in the form (xxx, yyy)\n (the , in a comma separated value file is just ridiculous).
Here is the code, the while loop is the problem:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char** argv) {
long size;
FILE* inF = fopen("allCrimes.csv", "rb");
if (!inF) {
puts("fopen() error");
return 0;
}
fseek(inF, 0, SEEK_END);
size = ftell(inF);
rewind(inF);
printf("In file size = %ld bytes.\n", size);
char* buf = malloc((size+1)*sizeof(char));
if (fread(buf, 1, size, inF) != size) {
puts("fread() error");
return 0;
}
fclose(inF);
buf[size] = '\0';
FILE *outF = fopen("lessColumns.csv", "w");
if (!outF) {
puts("fopen() error");
return 0;
}
int index = 0;
char* currComma = strchr(buf, ',');
fwrite(buf, 1, (int)(currComma-buf), outF);
int progress = 0;
while (currComma != NULL) {
index++;
index = (index%14 == 0) ? 0 : index;
progress++;
if (progress%1000 == 0) printf("%d\n", progress/1000);
int start = (int)(currComma-buf);
currComma = strchr(currComma+1, ',');
if (!currComma) break;
if ((index >= 3 && index <= 10) || index == 13) continue;
int end = (int)(currComma-buf);
int endMinusStart = end-start;
char* newEntry = malloc((endMinusStart+1)*sizeof(char));
strncpy(newEntry, buf+start, endMinusStart);
newEntry[end+1] = '\0';
if (index == 0) {
char* findNewLine = strchr(newEntry, '\n');
int newLinePos = (int)(findNewLine-newEntry);
char* modifiedNewEntry = malloc((strlen(newEntry)-newLinePos+1)*sizeof(char));
strcpy(modifiedNewEntry, newEntry+newLinePos);
fwrite(modifiedNewEntry, 1, strlen(modifiedNewEntry), outF);
}
else fwrite(newEntry, 1, end-start, outF);
}
fclose(outF);
return 0;
}
Edit: It turned out the problem was that the csv file had , in places I was not expecting which caused the logic to fail. I ended up writing a new parser that removes lines with the incorrect number of commas. It removed 243,875 lines (about 4% of the file). I'll post that code instead as it at least reflects some of the comments about free():
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char** argv) {
long size;
FILE* inF = fopen("allCrimes.csv", "rb");
if (!inF) {
puts("fopen() error");
return 0;
}
fseek(inF, 0, SEEK_END);
size = ftell(inF);
rewind(inF);
printf("In file size = %ld bytes.\n", size);
char* buf = malloc((size+1)*sizeof(char));
if (fread(buf, 1, size, inF) != size) {
puts("fread() error");
return 0;
}
fclose(inF);
buf[size] = '\0';
FILE *outF = fopen("uniformCommaCount.csv", "w");
if (!outF) {
puts("fopen() error");
return 0;
}
int numOmitted = 0;
int start = 0;
while (1) {
char* currNewLine = strchr(buf+start, '\n');
if (!currNewLine) {
puts("Done");
break;
}
int end = (int)(currNewLine-buf);
char* entry = malloc((end-start+2)*sizeof(char));
strncpy(entry, buf+start, end-start+1);
entry[end-start+1] = '\0';
int commaCount = 0;
char* commaPointer = entry;
for (; *commaPointer; commaPointer++) if (*commaPointer == ',') commaCount++;
if (commaCount == 14) fwrite(entry, 1, end-start+1, outF);
else numOmitted++;
free(entry);
start = end+1;
}
fclose(outF);
printf("Omitted %d lines\n", numOmitted);
return 0;
}
you're malloc'ing but never freeing. possibly you run out of memomry, one of your mallocs returns NULL, and the subsequent call to str(n)cpy segfaults.
adding free(newEntry);, free(modifiedNewEntry); immediately after the respective fwrite calls should solve your memory shortage.
also note that inside your loop you compute offsets into the buffer buf which contains the whole file. these offsets are held in variables of type int whose maximum value on your system may be too small for the numbers you are handling. also note that adding large ints may result in a negative value which is another possible cause of the segfault (negative offsets into buf take you to some address outside the buffer possibly not even readable).
The malloc(3) function can (and sometimes does) fail.
At least code something like
char* buf = malloc(size+1);
if (!buf) {
fprintf(stderr, "failed to malloc %d bytes - %s\n",
size+1, strerror(errno));
exit (EXIT_FAILURE);
}
And I strongly suggest to clear with memset(buf, 0, size+1) the successful result of a malloc (or otherwise use calloc ....), not only because the following fread could fail (which you are testing) but to ease debugging and reproducibility.
and likewise for every other calls to malloc or calloc (you should always test them against failure)....
Notice that by definition sizeof(char) is always 1. Hence I removed it.
As others pointed out, you have a memory leak because you don't call free appropriately. A tool like valgrind could help.
You need to learn how to use the debugger (e.g. gdb). Don't forget to compile with all warnings and debugging information (e.g. gcc -Wall -g). And improve your code till you get no warnings.
Knowing how to use a debugger is an essential required skill when programming (particularly in C or C++). That debugging skill (and ability to use the debugger) will be useful in every C or C++ program you contribute to.
BTW, you could read your file line by line with getline(3) (which can also fail and you should test that).