extract comma separated strings from txt file in C - c

I need to read several comma-separated strings from a file and store them in an array.
I have the following code, that I developed reading different questions online.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Reads "pathtofile.txt" with fgets, splits each chunk on ',' with strtok,
 * copies every token into Nseq[n] and prints the token with its index.
 */
int main (){
int N = 300;
int L = 1000;
char Nseq[N][L];
FILE *myfile;
char *token;
const char s[2] = ",";
// fgets stores at most sizeof(line) - 1 = 299 characters per call, so a longer
// line is delivered in several pieces and a token that straddles a piece
// boundary is split into two tokens.
char line[300];
char* filename = "pathtofile.txt";
int n = 0;
myfile = fopen(filename, "r");
if (myfile == NULL) {printf("could not open file %s", filename); exit(0);}
while (fgets(line, sizeof(line), myfile) != NULL){
token = strtok(line, s);
while (token != NULL){
// NOTE(review): n is never checked against N, and strcpy does not check the
// token length against L.
strcpy(Nseq[n], token);
// NOTE(review): %u expects an unsigned int, but n is an int.
printf("%s\t%u\n", token, n);
token = strtok(NULL, s);
n++;
}
}
fclose(myfile);
}
my txt file is the following:
1AAAAAAAAAAAAAAAAAAAAAAAAAAAA,2AAAAAAAAAAAAAAAAAAAAAAAAAAAA,3AAAAAAAAAAAAAAAAAAAAAAAAAAAA,4AAAAAAAAAAAAAAAAAAAAAAAAAAAA,5AAAAAAAAAAAAAAAAAAAAAAAAAAAA,6AAAAAAAAAAAAAAAAAAAAAAAAAAAA,7AAAAAAAAAAAAAAAAAAAAAAAAAAAA,8AAAAAAAAAAAAAAAAAAAAAAAAAAAA,9AAAAAAAAAAAAAAAAAAAAAAAAAAAA,10AAAAAAAAAAAAAAAAAAAAAAAAAAAA,11AAAAAAAAAAAAAAAAAAAAAAAAAAAA,12AAAAAAAAAAAAAAAAAAAAAAAAAAAA,13AAAAAAAAAAAAAAAAAAAAAAAAAAAA,14AAAAAAAAAAAAAAAAAAAAAAAAAAAA,15AAAAAAAAAAAAAAAAAAAAAAAAAAAA,16AAAAAAAAAAAAAAAAAAAAAAAAAAAA,17AAAAAAAAAAAAAAAAAAAAAAAAAAAA,18AAAAAAAAAAAAAAAAAAAAAAAAAAAA,19AAAAAAAAAAAAAAAAAAAAAAAAAAAA,20AAAAAAAAAAAAAAAAAAAAAAAAAAAA,21AAAAAAAAAAAAAAAAAAAAAAAAAAAA,22AAAAAAAAAAAAAAAAAAAAAAAAAAAA,23AAAAAAAAAAAAAAAAAAAAAAAAAAAA,24AAAAAAAAAAAAAAAAAAAAAAAAAAAA,25AAAAAAAAAAAAAAAAAAAAAAAAAAAA,26AAAAAAAAAAAAAAAAAAAAAAAAAAAA,27AAAAAAAAAAAAAAAAAAAAAAAAAAAA,28AAAAAAAAAAAAAAAAAAAAAAAAAAAA,29AAAAAAAAAAAAAAAAAAAAAAAAAAAA,30AAAAAAAAAAAAAAAAAAAAAAAAAAAA,
There are 30 strings and no new-line characters.
My issue is that when I run the code, I get the following output:
1AAAAAAAAAAAAAAAAAAAAAAAAAAAA 0
2AAAAAAAAAAAAAAAAAAAAAAAAAAAA 1
3AAAAAAAAAAAAAAAAAAAAAAAAAAAA 2
4AAAAAAAAAAAAAAAAAAAAAAAAAAAA 3
5AAAAAAAAAAAAAAAAAAAAAAAAAAAA 4
6AAAAAAAAAAAAAAAAAAAAAAAAAAAA 5
7AAAAAAAAAAAAAAAAAAAAAAAAAAAA 6
8AAAAAAAAAAAAAAAAAAAAAAAAAAAA 7
9AAAAAAAAAAAAAAAAAAAAAAAAAAAA 8
10AAAAAAAAAAAAAAAAAAAAAAAAAAA 9
A 10
11AAAAAAAAAAAAAAAAAAAAAAAAAAAA 11
12AAAAAAAAAAAAAAAAAAAAAAAAAAAA 12
13AAAAAAAAAAAAAAAAAAAAAAAAAAAA 13
14AAAAAAAAAAAAAAAAAAAAAAAAAAAA 14
15AAAAAAAAAAAAAAAAAAAAAAAAAAAA 15
16AAAAAAAAAAAAAAAAAAAAAAAAAAAA 16
17AAAAAAAAAAAAAAAAAAAAAAAAAAAA 17
18AAAAAAAAAAAAAAAAAAAAAAAAAAAA 18
19AAAAAAAAAAAAAAAAAAAAAAAAAAAA 19
20AAAAAAAAAAAAAAAA 20
AAAAAAAAAAAA 21
21AAAAAAAAAAAAAAAAAAAAAAAAAAAA 22
22AAAAAAAAAAAAAAAAAAAAAAAAAAAA 23
23AAAAAAAAAAAAAAAAAAAAAAAAAAAA 24
24AAAAAAAAAAAAAAAAAAAAAAAAAAAA 25
25AAAAAAAAAAAAAAAAAAAAAAAAAAAA 26
26AAAAAAAAAAAAAAAAAAAAAAAAAAAA 27
27AAAAAAAAAAAAAAAAAAAAAAAAAAAA 28
28AAAAAAAAAAAAAAAAAAAAAAAAAAAA 29
29AAAAAAAAAAAAAAAAAAAAAAAAAAAA 30
30AAAAA 31
AAAAAAAAAAAAAAAAAAAAAAA 32
33
I have tried with different lengths and sooner or later I get these weird splits.
Does someone know why is this happening? thank you!

Your text file is 921 chars in size and is a single line.
Your line buffer is only 300 chars.
So, you're getting truncation.
Also, note that your file has no newline. And, your code didn't handle the case where there is a newline (particularly, if the line ended in ,<newline>).
The simple solution is to increase the size of line so that it is larger than the size of the file (e.g.) char line[10000];
The long term solution is to either read the file char-by-char with (e.g.) fgetc and copy into Nseq[n] and store/print the token after a delimiter.
Or, you can stat the file, and use malloc to allocate a buffer that is the file size.
But, although slightly more advanced, the fastest way [particularly for large files], is to stat the file, mmap it, and then scan the buffer. This will work well on any 64 bit machine, or you could map it in chunks on a 32 bit machine
Here's a version that uses fgetc:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Read "pathtofile.txt" one character at a time, split it on ',' and '\n',
 * store each non-empty token in Nseq[] and print it with its index.
 *
 * Empty tokens (e.g. ",," or ",\n") are skipped.  A final token that is not
 * followed by a delimiter is still stored and printed.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if the file cannot be opened,
 * if there are more than N tokens, or if a token needs more than L - 1
 * characters.
 */
int
main(void)
{
    enum { N = 300, L = 1000 };
    static char Nseq[N][L];     /* static: keeps 300 KB off the stack */
    const char *filename = "pathtofile.txt";
    FILE *myfile;
    size_t len = 0;             /* characters stored so far in Nseq[n] */
    int chr;
    int n = 0;

    myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    for (;;) {
        chr = fgetc(myfile);

        /* EOF is treated as a delimiter so the last token is not lost. */
        if (chr == ',' || chr == '\n' || chr == EOF) {
            if (len > 0) {
                Nseq[n][len] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                len = 0;
            }
            if (chr == EOF)
                break;
            continue;
        }

        /* Bounds checks happen before the write, never after. */
        if (n >= N) {
            fprintf(stderr, "too many tokens (max %d)\n", (int)N);
            fclose(myfile);
            exit(EXIT_FAILURE);
        }
        if (len >= (size_t)(L - 1)) {
            fprintf(stderr, "token %d is longer than %d characters\n",
                    n, (int)(L - 1));
            fclose(myfile);
            exit(EXIT_FAILURE);
        }
        Nseq[n][len++] = (char)chr;
    }

    fclose(myfile);
    return 0;
}
Here's a version that uses malloc:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
/*
 * Read all of "pathtofile.txt" into one heap buffer sized from stat(),
 * split it on ',' and '\n' with strtok, store each token in Nseq[] and print
 * it with its index.
 *
 * Returns EXIT_SUCCESS on success.  Exits/returns EXIT_FAILURE if the file
 * cannot be stat'ed, opened or read, if memory cannot be allocated, if there
 * are more than N tokens, or if a token does not fit in L - 1 characters.
 */
int
main(void)
{
    enum { N = 300, L = 1000 };
    static char Nseq[N][L];     /* static: keeps 300 KB off the stack */
    const char *filename = "pathtofile.txt";
    const char *delims = ",\n"; /* newline is a separator too, not token data */
    struct stat st;
    int status = EXIT_SUCCESS;
    int n = 0;

    if (stat(filename, &st) < 0) {
        fprintf(stderr, "could not stat file %s\n", filename);
        exit(EXIT_FAILURE);
    }

    /* size_t instead of int: the file size need not fit in an int. */
    size_t len = (size_t)st.st_size;
    char *line = malloc(len + 1);   /* +1 for the terminating NUL */
    if (line == NULL) {
        fprintf(stderr, "could not allocate %lu bytes\n",
                (unsigned long)(len + 1));
        exit(EXIT_FAILURE);
    }

    FILE *myfile = fopen(filename, "r");
    if (myfile == NULL) {
        fprintf(stderr, "could not open file %s\n", filename);
        free(line);
        exit(EXIT_FAILURE);
    }

    /* One read for the whole file; fread is not limited to a single line. */
    size_t nread = fread(line, 1, len, myfile);
    if (ferror(myfile)) {
        fprintf(stderr, "could not read file %s\n", filename);
        fclose(myfile);
        free(line);
        exit(EXIT_FAILURE);
    }
    fclose(myfile);
    line[nread] = '\0';

    for (char *token = strtok(line, delims); token != NULL;
         token = strtok(NULL, delims)) {
        if (n >= N) {
            fprintf(stderr, "too many tokens (max %d)\n", (int)N);
            status = EXIT_FAILURE;
            break;
        }
        if (strlen(token) >= (size_t)L) {
            fprintf(stderr, "token %d is longer than %d characters\n",
                    n, (int)(L - 1));
            status = EXIT_FAILURE;
            break;
        }
        strcpy(Nseq[n], token);     /* length verified just above */
        printf("%s\t%d\n", Nseq[n], n);
        n++;
    }

    free(line);
    return status;
}
Here's a version that uses mmap:
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/mman.h>
/*
 * Map "pathtofile.txt" read-only with mmap, scan the mapping, split it on ','
 * and '\n', store each non-empty token in Nseq[] and print it with its index.
 *
 * An empty file is not an error (nothing is printed).  A final token that is
 * not followed by a delimiter is still stored and printed.
 *
 * Returns EXIT_SUCCESS on success.  Exits/returns EXIT_FAILURE if the file
 * cannot be opened, stat'ed or mapped, if there are more than N tokens, or if
 * a token does not fit in L - 1 characters.
 */
int
main(void)
{
    enum { N = 300, L = 1000 };
    static char Nseq[N][L];     /* static: keeps 300 KB off the stack */
    const char *filename = "pathtofile.txt";
    struct stat st;
    int status = EXIT_SUCCESS;
    int n = 0;

    int fd = open(filename, O_RDONLY);
    if (fd < 0) {
        printf("could not open file %s", filename);
        exit(1);
    }

    if (fstat(fd, &st) < 0) {
        printf("could not stat file %s", filename);
        close(fd);
        exit(1);
    }

    size_t len = (size_t)st.st_size;
    if (len == 0) {
        /* mmap rejects a zero-length mapping; an empty file has no tokens. */
        close(fd);
        return EXIT_SUCCESS;
    }

    char *line = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (line == MAP_FAILED) {
        printf("could not mmap file %s", filename);
        close(fd);
        exit(1);
    }

    size_t tok_len = 0;         /* characters stored so far in Nseq[n] */

    /*
     * i == len is a virtual delimiter one past the data: it flushes a last
     * token that has no trailing ',' or '\n'.  The short-circuit below keeps
     * line[len] from ever being read.
     */
    for (size_t i = 0; i <= len; ++i) {
        int is_delim = (i == len) || line[i] == ',' || line[i] == '\n';

        if (is_delim) {
            if (tok_len > 0) {
                Nseq[n][tok_len] = '\0';
                printf("%s\t%d\n", Nseq[n], n);
                ++n;
                tok_len = 0;
            }
            continue;
        }

        /* Bounds checks happen before the write, never after. */
        if (n >= N) {
            fprintf(stderr, "too many tokens (max %d)\n", (int)N);
            status = EXIT_FAILURE;
            break;
        }
        if (tok_len >= (size_t)(L - 1)) {
            fprintf(stderr, "token %d is longer than %d characters\n",
                    n, (int)(L - 1));
            status = EXIT_FAILURE;
            break;
        }
        Nseq[n][tok_len++] = line[i];
    }

    munmap(line, len);
    close(fd);
    return status;
}

Related

How to split a text file into multiple parts in c

What i need to do, is to take a file of n lines, and for every x lines, create a new file with the lines of the original file. An example would be this:
Original File:
stefano
angela
giuseppe
lucrezia
lorenzo
In this case, if x == 2, 3 file would be created, in order:
First file:
stefano
angela
Second FIle:
giuseppe
lucrezia
Third File:
lorenzo
What i've done so far is this:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define N 10
/*
 * Count the '\n' characters between the current position of fp and end of
 * file, then seek fp back to the beginning of the file.
 *
 * Returns the number of newline characters seen (a last line without a
 * trailing newline is therefore not counted).
 */
int getlines(FILE *fp)
{
    int newlines = 0;
    int ch;

    while ((ch = fgetc(fp)) != EOF) {
        if (ch == '\n')
            newlines++;
    }

    fseek(fp, 0 , SEEK_SET);
    return newlines;
}
/* Index of the output file currently being written; shared across the recursive Split calls. */
int ix = 0;
/*
 * Opens output files "temp<i+1>.txt" for i in [ix, *mem) into fpo[], then
 * copies fp character by character into fpo[ix], moving to the next output
 * file every `step` newlines.  When ix reaches *mem it grows the array by one
 * and calls itself again.
 */
void Split(FILE *fp, FILE **fpo, int step, int lines, int *mem)
{
FILE **fpo2 = NULL;
// NOTE(review): filename is never freed in this function.
char * filename = malloc(sizeof(char)*64);
char * ext = ".txt";
// Room for one digit plus the terminator only; a two-digit number such as
// "10" (reached when *mem is 10) does not fit.
char number[2];
for(int i = ix; i < *mem; i++)
{
// NOTE(review): itoa is not part of ISO C; availability depends on the toolchain.
itoa(i+1, number,10);
strcpy(filename, "temp");
strcat(filename, number);
strcat(filename, ext);
if(!(fpo[i] = fopen(filename, "w")))
{
fprintf(stderr, "Error in writing\n");
exit(EXIT_FAILURE);
}
}
// NOTE(review): fgetc returns int; storing it in a char makes the EOF test
// below unreliable.
char ch;
int c = 0;
do{
ch = fgetc(fp);
printf("%c", ch);
if(ch == '\n')
{
c++;
}
// With step == 0 this condition is true on every character.
if(c >= step)
{
c = 0;
ix++;
if(ix >= *mem && (ix*step) <= lines)
{
*mem = *mem + 1;
// realloc may move the array; only the recursive call receives the new
// pointer, while this frame keeps using fpo afterwards.
fpo2 = realloc(fpo, sizeof(FILE*)*(*mem));
Split(fp, fpo2, step, lines, mem);
}
}
// Runs even when ch holds the EOF value, and when ix is not below *mem.
putc(ch, fpo[ix]);
}while(ch != EOF);
}
/*
 * Opens "file.txt", counts its lines, derives the number of lines per output
 * file and hands everything to Split.
 *
 * Note: step is lines/N with integer division, so it is 0 whenever the input
 * has fewer than N lines.
 */
int main()
{
    FILE *input = fopen("file.txt", "r");
    if (input == NULL)
    {
        fprintf(stderr, "Error in opening file\n");
        exit(EXIT_FAILURE);
    }

    int capacity = N;                   /* number of slots in the FILE* array */
    int lines = getlines(input);
    int step = lines/N;                 /* lines per output file */
    FILE **outputs = malloc(sizeof(FILE *)*N);

    Split(input, outputs, step, lines, &capacity);
    exit(EXIT_SUCCESS);
}
I'm stuck with a segmentation fault that I couldn't track down, even after running
gdb myprogram
run
bt
I really appreciate any help.
EDIT:
I've changed some things and now it works, but it creates an additional file that contains strange characters. I need to still adjust some things:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define N 10
/*
 * Return how many '\n' characters remain in fp (from its current position to
 * end of file) and leave fp positioned at the start of the file.
 */
int getlines(FILE *fp)
{
    int count = 0;

    for (int ch = fgetc(fp); ch != EOF; ch = fgetc(fp)) {
        if (ch == '\n') {
            count++;
        }
    }

    fseek(fp, 0 , SEEK_SET);
    return count;
}
/* Index of the output file currently being written; shared across the recursive Split calls. */
int ix = 0;
/*
 * Opens output files "temp<i+1>.txt" for i in [ix, *mem) into fpo[], then
 * copies fp character by character into fpo[ix], moving to the next output
 * file every `step` newlines.  When ix reaches *mem it grows the array by one
 * and calls itself again.
 */
void Split(FILE *fp, FILE **fpo, int step, int lines, int *mem)
{
FILE **fpo2 = NULL;
char * ext = ".txt";
for(int i = ix; i < *mem; i++)
{
// Scratch buffers for building the file name; both are freed at the end of
// each iteration.
char * filename = malloc(sizeof(char)*64);
char * number = malloc(sizeof(char)*64);
// NOTE(review): itoa is not part of ISO C; availability depends on the toolchain.
itoa(i+1, number,10);
strcpy(filename, "temp");
strcat(filename, number);
strcat(filename, ext);
if(!(fpo[i] = fopen(filename, "w")))
{
fprintf(stderr, "Error in writing\n");
exit(EXIT_FAILURE);
}
free(number);
free(filename);
}
// NOTE(review): fgetc returns int; storing it in a char makes the EOF test
// below unreliable.
char ch;
int c = 0;
do{
ch = fgetc(fp);
printf("%c", ch);
if(ch == '\n')
{
c++;
}
// With step == 0 this condition is true on every character.
if(c >= step)
{
c = 0;
ix++;
if(ix >= *mem && ((ix-1)*step) <= lines)
{
*mem = *mem + 1;
// realloc may move the array; only the recursive call receives the new
// pointer, while this frame keeps using fpo afterwards.
fpo2 = realloc(fpo, sizeof(FILE*)*(*mem));
Split(fp, fpo2, step, lines, mem);
}
}
// Runs even when ch holds the EOF value, so the end-of-file marker itself is
// written to an output file.
putc(ch, fpo[ix]);
}while(ch != EOF);
}
/*
 * Program entry: open "file.txt", count its newlines with getlines, compute
 * how many lines go into each output file and let Split do the copying.
 *
 * step is the truncated quotient lines/N, i.e. 0 for inputs shorter than
 * N lines.
 */
int main()
{
    FILE *source;
    FILE **targets;
    int slots;
    int lines;
    int step;

    source = fopen("file.txt", "r");
    if (!source)
    {
        fprintf(stderr, "Error in opening file\n");
        exit(EXIT_FAILURE);
    }

    slots = N;
    lines = getlines(source);
    step = lines/N;
    targets = malloc(sizeof(FILE *)*N);
    Split(source, targets, step, lines, &slots);

    exit(EXIT_SUCCESS);
}
There are a few problems in your code. But first I think you need to fix the most important thing
int step = lines/N;
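Here step is 0 if your input file has fewer than N lines of text. This is because lines and N are both integers, and integer division truncates toward zero. With step equal to 0, the test c >= step in Split is true for every character read, so ix is incremented on every character.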
I won't fix your code, but I'll help you with it. Some changes I
suggest:
Instead of getlines, use getline(3) to read the input line by line
(it is specified by POSIX rather than by ISO C).
fseek(fp, 0 , SEEK_SET) is pointless.
In char * filename = malloc(sizeof(char)*64), note that
both arguments to malloc are constant, and the size is arbitrary.
These days, it's safe to allocate filename buffers statically,
either on the stack or with static: char filename[PATH_MAX].
You'll want to use limits.h to get that constant.
Similarly you have no need to dynamically allocate your FILE
pointers.
Instead of
itoa(i+1, number,10);
strcpy(filename, "temp");
strcat(filename, number);
strcat(filename, ext);
use sprintf(filename, "temp%d%s", i+1, ext)
get familiar with err(3) and friends, for your own convenience.
Finally, your recursive Split is -- how shall we say it? -- a nightmare. Your whole program
should be something like:
open input
while getline input
if nlines % N == 0
create output filename with 1 + n/N
open output
write output
nlines++

Declaring char array results segmentation fault in C programming language

I was just creating simple implementation where I read input.txt file which has one line of code containing two numbers separated by space (e.g. 4 4).
I was trying to separate them by delimiting with space first.
And I was trying to use the first value as a size of char array.
However it keeps causing segmentation fault but I have no idea.
/*
 * Reads the first line of "input.txt", splits it with strtok and converts the
 * pieces with atoi; the first number is also used as the length of charArray.
 * (The snippet is shown without its closing brace.)
 */
int main(int argc, char **argv){
int number;
int i = 0;
char *token;
char buf[100];
int tempNum[2];
// Open file
FILE * fPointer;
// NOTE(review): the result of fopen is not checked before use.
fPointer = fopen("input.txt", "r");
// Read first line
// NOTE(review): the size passed to fgets (1024) is larger than buf (100 bytes).
fgets(buf, 1024, fPointer);
token = strtok(buf, " ");
number = atoi(token);
// Variable-length array whose size comes straight from the file contents.
char charArray[number];
while(token != NULL){
tempNum[i] = atoi(token);
// Subsequent tokens are delimited by "\n" rather than " ".
token = strtok(NULL, "\n");
printf("%d\n", tempNum[i]);
i++;
}
If I comment out "char charArray[number]" it does not cause segmentation fault. If I comment out only the while loop, it does not cause segmentation fault. But I cannot figure out why it is causing the segmentation in the first place. Please help.
Thank you in advance.
Something like this?
#include <stdlib.h>
#include <stdio.h>
#include <stdio.h>
#include <string.h>
#define BUF_SIZE 1024
#define MAX_INPUT 2
/*
 * Read the first line of "input.txt", split it on whitespace and convert up
 * to MAX_INPUT tokens to long with strtol, printing each value (or an error
 * message for a token that is not a complete decimal number).
 *
 * Returns 0 on success, 1 if the file cannot be opened, 2 if no line can be
 * read.
 */
int main(int argc, char **argv) {
    char buf[BUF_SIZE] = { 0 };
    long tempNum[MAX_INPUT] = { 0 };
    /* Newline and carriage return are delimiters too: fgets keeps the line
       ending, which would otherwise make the last token fail to parse. */
    const char *delim = " \t\r\n";

    (void)argc;
    (void)argv;

    // Open file
    FILE *fPointer = fopen("input.txt", "r");
    if (fPointer == NULL) {
        return 1;
    }

    // Read first line
    if (fgets(buf, sizeof buf, fPointer) == NULL) {
        fclose(fPointer);
        return 2;
    }

    // Parse line (standard strtok instead of the non-portable strtok_s)
    char *token = strtok(buf, delim);
    for (int i = 0; i < MAX_INPUT && token != NULL; i++) {
        char *end = NULL;

        tempNum[i] = strtol(token, &end, 10);
        /* Valid only if at least one digit was consumed and nothing is left;
           *end is a char, so it is compared with '\0', not NULL. */
        if (end == token || *end != '\0') {
            printf("error in %s\n", token);
        } else {
            printf("%ld\n", tempNum[i]);    /* %ld matches long */
        }
        token = strtok(NULL, delim);
    }

    fclose(fPointer);
    return 0;
}

adding char into an array and returning

Im new to c and am trying to understand pointers.
here I am opening a file and reading the lines given. Im trying to append these lines into an array and return it from the function. I dont seem to be appending or accessing the array correctly. output[count] = status; gives an error with mismatched char and char *.
Im essentially trying to get an array with a list of words given by a file where each element in the array is a word.
/*
 * Opens the file named by `command`, reads it line by line with fgets, prints
 * each line and tries to record it in `output`.  Returns `output`.
 */
char *fileRead(char *command, char output[255]) {
int count = 0;
char input[255];
char *status;
FILE *file = fopen(command, "r");
if (file == NULL) {
printf("Cannot open file\n");
} else {
do {
// On success fgets returns its first argument, so status points at input,
// which is overwritten by the next read.
status = fgets(input, sizeof(input), file);
if (status != NULL) {
printf("%s", status);
strtok(status, "\n");
// add values into output array
// output is an array of char, so output[count] is a single char while
// status is a char * -- this is the reported type mismatch.
// NOTE(review): count is never checked against the 255-element bound.
output[count] = status;
++count;
}
} while (status);
}
// NOTE(review): reached with file == NULL when fopen failed.
fclose(file);
return output;
}
I access fileRead via:
...
char commandArray[255];
char output[255];
int y = 0;
char *filename = "scriptin.txt";
strcpy(commandArray, fileRead(filename, output));
// read from array and pass into flag function
while (commandArray[y] != NULL) {
n = flagsfunction(flags, commandArray[y], sizeof(buf), flags.position, &desc, &parentrd, right, left, lconn);
y++;
...
Example of Read from file Line by line then storing nonblank lines into an array (array of pointer to char (as char*))
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Local replacement for strdup, supplied because older ISO C standards do not
// provide it.  Returns a malloc'ed copy of str (to be released with free), or
// NULL if the allocation fails.
char *strdup(const char *str){
    size_t size = strlen(str) + 1;   // characters plus the terminating NUL
    char *copy = malloc(size);

    if(copy != NULL)
        memcpy(copy, str, size);
    return copy;
}
// Read up to 255 non-blank lines of `filename` into `output`.
//
// Each stored element is a heap-allocated copy of one line with its trailing
// newline removed; the caller owns the copies and must free() each of them.
// Reading happens in chunks of at most 254 characters, so a longer line is
// stored as several consecutive entries.
//
// Returns the number of entries stored.  Returns 0 if the file cannot be
// opened (an error is reported with perror).  If memory runs out, reading
// stops early and the entries stored so far are returned.
int fileRead(const char *filename, char *output[255]) {
    FILE *file = fopen(filename, "r");
    if (file == NULL) {
        perror("Cannot open file:");
        return 0;
    }

    int count = 0;
    char input[255];
    while (count < 255 && fgets(input, sizeof(input), file)) {
        // Cut the line at its newline; strcspn has no hidden static state,
        // unlike strtok.
        input[strcspn(input, "\n")] = '\0';
        if (input[0] == '\0')
            continue;               // blank line: nothing to store

        char *copy = strdup(input); // Store replica
        if (copy == NULL)
            break;                  // never hand a NULL entry to the caller
        output[count++] = copy;
    }

    fclose(file);
    return count;
}
// Loads the lines of "data.txt" through fileRead, prints each one and
// releases it right after printing.
int main(void){
    char *output[255];      // storage for up to 255 line pointers
    int number_of_read_line = fileRead("data.txt", output);
    int i = 0;

    while(i < number_of_read_line){
        char *current = output[i];

        printf("%s\n", current);
        free(current);      // each line is owned here and no longer needed
        ++i;
    }
    return 0;
}

parsing a file using fgets()

There are probably more issues at hand, but for now my problem is that when I compile and run this like so:
cc -o parser parser.c
./parser
I expect it to open a particular file, read from it, and parse it. However, it seems to expect me to provide input and I have to Ctrl-C to kill it. Am I using fgets wrong? I tried getline() with the same results. I added the puts() to make sure it was reading what I expected and it does. Any help is appreciated.
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
/*
 * Reads /home/edgar/.patfile line by line, strips '#' comments, skips empty
 * lines, echoes each remaining line, splits a copy of it on spaces into tok[]
 * and then looks for the words "match" and "fileinto".
 */
int main(int argc, char *argv[]) {
FILE *fp;
char buf[1024];
char *tmp, *pattern, *dir;
char *skip, *p;
char *tok[5];
char **ap;
size_t sz = 0;
ssize_t len;
int i;
int action = 0; // placeholder
int fileinto = 1; // placeholder
char path[PATH_MAX] = "/home/edgar/.patfile";
fp = fopen(path, "r");
// NOTE(review): execution continues with fp == NULL after this message.
if (fp == NULL)
fprintf(stderr, "fopen failed");
while (fgets(buf, sizeof(buf), fp) != NULL) {
buf[strcspn(buf, "\n")] = '\0';
// strip comments: cut the line at the first '#'
for (skip = buf; *skip; ++skip) {
if (*skip == '#') {
*skip = '\0';
break;
}
}
// skip empty lines
if (strlen(buf) == 0)
continue;
puts(buf); // debug only
// make a copy
tmp = strdup(buf);
// i counts every strsep result, including empty fields that are not kept
// in tok[], so i can be larger than the number of stored tokens.
for (i = 0, ap = tok; ap < &tok[4] && (*ap = strsep(&tmp, " ")) != NULL; i++) {
if (**ap != '\0')
ap++;
}
// Nothing in this loop changes i, so once entered with i >= 0 it never
// terminates -- the program spins here instead of waiting for input.
while (i >= 0) {
// == compares pointer values, not string contents (strcmp would compare
// the text).
if(tok[i] == "match")
pattern = tok[i + 1];
if(tok[i] == "fileinto") {
action = fileinto;
dir = tok[i + 1];
}
}
}
// NOTE(review): strsep advances tmp, so this is no longer the pointer that
// strdup returned; it is also outside the per-line loop.
free(tmp);
fclose(fp);
exit(0);
}

where is the pthread segfault happening?

in my program, I provide a directory which contains text files. Each of the text files contain a few hundred lines in the following format
Username,Password,BloodType,Domain,Number
I then create a thread for each file in the directory which will merge-sort(by number) these lines into the array char* text_lines[6000];
I can't figure out why I'm getting a segmentation fault because I'm getting different output on every run.
Heres my code:
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <sys/types.h>
#include <dirent.h>
#include <string.h>
void store_line(char* line);
void* my_merge_sort(void* file);
char** text_lines;
/*
 * Expects one argument, a directory.  Lists its entries, changes into it and
 * starts one thread per entry (other than "." and "..") running
 * my_merge_sort on the entry name.
 */
int main(int argc, char* argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: ./coolsort <directory>\n");
}
else
{
// Allocates 6000 pointers; the pointers themselves are left uninitialized.
text_lines = malloc(6000 * sizeof(char*));
DIR* the_directory;
int filecount = 0;
struct dirent* directory_files[50];
if((the_directory = opendir(argv[1])) != NULL)
{
//make a list of the files in the directory
// NOTE(review): no check that filecount stays below 50.  The stored
// pointers are whatever readdir returned -- presumably they may be
// invalidated by later readdir calls; confirm against the platform docs.
while((directory_files[filecount++] = readdir(the_directory))) ;
// Undo the increment made for the terminating NULL entry.
filecount--;
//<<<DEBUGGING INFO>
int i;
fprintf(stderr,"there are %i files in %s:\n", filecount, argv[1]);
for(i = 0; i < filecount; i++)
{
fprintf(stderr, "%s\n",directory_files[i]->d_name);
}
char cwd[512];
chdir(argv[1]);
getcwd(cwd, sizeof(cwd));
fprintf(stderr, "the CWD is: %s\n", cwd);
//<DEBUGGING INFO>>>
//lets start some threads
// Sized on the assumption that exactly two entries ("." and "..") are skipped.
pthread_t threads[filecount-2];
int x = 0;
for(i = 0; i < (filecount); i++ )
{
if (!strcmp (directory_files[i]->d_name, "."))
continue;
if (!strcmp (directory_files[i]->d_name, ".."))
continue;
pthread_create(&threads[x++], NULL, my_merge_sort, (void*)directory_files[i]->d_name);
}
//do stuff here
//
// NOTE(review): the threads are never joined and the directory is never
// closed before main returns.
}
else
{
fprintf(stderr, "Failed to open directory: %s\n", argv[1]);
}
}
}
/*
 * Thread entry point.  `file` is the name of a file to open; the whole file
 * is read into one buffer, split into lines with strtok, and every line is
 * passed to store_line.
 */
void* my_merge_sort(void* file)
{
fprintf(stderr, "We got into the function!\n");
// NOTE(review): the result of fopen is not checked before use.
FILE* fp = fopen(file, "r");
char* buffer;
char* line;
char delim[2] = "\n";
int numbytes;
//minimize I/O's by reading the entire file into memory;
fseek(fp, 0L, SEEK_END);
numbytes = ftell(fp);
fseek(fp, 0L, SEEK_SET);
// Exactly numbytes bytes: after fread fills them there is no room left for a
// terminating '\0', which strtok relies on.
buffer = (char*)calloc(numbytes, sizeof(char));
fread(buffer, sizeof(char), numbytes, fp);
fclose(fp);
//now read the buffer by '\n' delimiters
// NOTE(review): strtok keeps its position in hidden shared state and this
// function runs in several threads at once.
line = strtok(buffer, delim);
fprintf(stderr, "Heres the while loop\n");
while(line != NULL)
{
store_line(line);
// Arguments are swapped relative to the usual continuation call
// strtok(NULL, delim): this passes the buffer again and a NULL delimiter.
line = strtok(buffer, NULL);
}
free(buffer);
// NOTE(review): no value is returned although the return type is void*.
}
/*
 * Intended to pull the numeric ID (fifth comma-separated field) out of `line`
 * and copy the line into text_lines[id].
 */
void store_line(char* line)
{
//extract the ID.no, which is the fifth comma-seperated-token.
char delim[] = ",";
char* buff;
int id;
int i;
// The first call replaces the first ',' in line with '\0', so from here on
// line reads as its first field only.
strtok(line, delim);
for(i = 0; i < 3; i++)
{
// Same swapped-argument pattern as in my_merge_sort: passes line again and
// a NULL delimiter instead of strtok(NULL, delim).
strtok(line, NULL);
}
buff = strtok(line, NULL);
// NOTE(review): buff is not checked for NULL before atoi.
id = atoi(buff);
//copy the line to text_lines[id]
// text_lines[id] is one of the pointers malloc'ed in main, none of which is
// ever pointed at storage; strlen(line) bytes also exclude the terminator.
memcpy(text_lines[id], line, strlen(line));
}
edit: I checked to make sure that it would fit into the initial array, and found that the highest ID is only 3000;
Your use of strtok() is wrong:
line = strtok(buffer, NULL);
should be
line = strtok(NULL, delim);
The other strtok() calls (those in store_line) should be fixed in the same way.
The elements of text_lines are uninitialized:
text_lines = malloc(6000 * sizeof(char*));
this allocated 6000 pointers to char, but none of these pointers are initialized.

Resources