I'm trying to make a code, that should read all of the columns, row by row in csv file; then storing them under a struct array, that is called movies in this code. I actually managed to store the data, but just because of picking the parameter of delimater in strtok lines, some of my movies' data has missing;
Example: Row in the file:
Synecdoche, New York - Charlie Kaufman - 2008 - Drama
has to be stored as;
id = 37
name = Synecdoche, New York
directorName = Charlie Kaufman
year = 2008
genre = Drama
but it is stored as;
id = 37
name = Synecdoche
directorName = New York
year = Charlie Kaufman
genre = 2008
I am aware that this is because the string of characters I need to separate contains a comma; but I couldn't find how to solve it. So how can I make the strtok only split the string inside the double quotes?
I don't know if anyone can understand; but still I leave my code like this below;
#include <conio.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
typedef struct movie{
int id;
char name[100];
char directorName[100];
int year;
char genre[30];
} movie;
movie movies[100];
void remove_all_chars(char* str, char c) {
char *pr = str, *pw = str;
while (*pr) {
*pw = *pr++;
pw += (*pw != c);
}
*pw = '\0';
}
void fillMovies(FILE *filePointer)
{
char line[150];
int id = 1;
int arrIndex = 0;
while(!feof(filePointer))
{
fgets(line, sizeof(line), filePointer);
puts(line);
//sleep(1);
int i = 1;
char* value = strtok(line, ",");
struct movie movie = {
id,
"",
"",
0,
""
};
while(value != NULL)
{
//remove_all_chars(value, '\"');
printf("%s ", value);
if(i == 1)
{
//movie.name = value;
strcat(movie.name, value);
//movie.name += value;
i++;
value = strtok(NULL, ",");
continue;
}
if(i == 2)
{
//movie.directorName = value;
strcat(movie.directorName, value);
//movie.directorName += value;
i++;
value = strtok(NULL, ",");
continue;
}
if(i == 3)
{
movie.year = atoi(value);
i++;
value = strtok(NULL, ",");
continue;
}
if(i == 4)
{
//movie.genre = value;
strcat(movie.genre, value);
movies[arrIndex] = movie;
arrIndex++;
id++;
value = strtok(NULL, ",");
break;
}
}
printf("\n");
}
// Close the file
fclose(filePointer);
}
void printMovie(int i)
{
sleep(1);
printf("%d. ", movies[i].id);
printf("%s", movies[i].name);
printf(", ");
printf("%s", movies[i].directorName);
printf(", ");
printf("%d", movies[i].year);
printf(", ");
printf("%s", movies[i].genre);
}
// close, stringleri nasýl eþitleyeceðini bul.
int main()
{
// Buraya kendi dosya pathini lütfen yaz.
FILE* filePointer = fopen("movies.csv", "r");
if (!filePointer)
{
printf("Can't open file\n");
} else {
fillMovies(filePointer);
int i = 0;
while(i < 60){
printMovie(i);
i++;
}
}
return 0;
}
One of the sad truths to CSV files is that they look simple, and promise simplicity, but very quickly become nightmares to read. For any truly non-trivial CSV file, you need to build a state machine.
However, if we can put three significant constraints on your input then we can make life a whole easier:
Each record in your CSV is exactly N fields long
No quoted field will embed the quote character itself
Each field is known to be quoted or unquoted
No field is empty
If that is the case then you really only need a function to read characters from a file until one of a set of delimiters is encountered. That is fortunately very easy.
The other disheartening truth (and this applies to all computer languages) is that user input is really, really hard. Here is a function that crosses all the important ‘t’s and dots the ‘i’s.
int read_delimited( FILE * f, char * s, int n, const char * cs )
//
// Get text from a file.
//
// f File to read
// s Buffer to store characters read from f.
// The resulting buffer will always be null-terminated.
// May not be NULL.
// n Size of buffer. Must be at least 1.
// cs Delimiters.
// May not be NULL. (But it may be empty.)
//
// Read terminates only when EOF or one of the delimiters is read.
// Read does not terminate when the buffer fills up! If your buffer
// is too small the entire field is still read, but only (n-1)
// characters from the file are stored.
//
// Returns the last character read (either EOF or a delimiter).
//
{
int count = 0;
while (true)
{
int c = fgetc( f );
if ((c == EOF) or strchr( cs, c )) break;
if (count < n) s[count++] = c;
}
s[count] = '\0';
return c;
}
You can then use this and a little helper function in a loop to collect all your data for each record.
int skip_chars( FILE * f, const char * cs )
//
// Skips all characters in cs[].
// Returns the character last read (EOF or something not in cs[]).
//
{
int c;
do c = fgetc( f );
while ((c != EOF) and !!strchr( cs, c ));
return c;
}
bool read_movie( FILE * f, movie * m )
{
char s[100];
int c = skip_chars( f, " \t\n" ); // skip whitespace, including newlines
if (c == EOF) return false;
read_delimited( f, s, sizeof(s), "," );
m->id = atoi( s );
skip_chars( f, " \t" ); // skip leading ws
read_delimited( f, m->name, sizeof(m->name), "\"" );
skip_chars( f, " \t," ); // skip ws and trailing ,
read_delimited( f, m->directorName, sizeof(m->directorName), "\"" );
skip_chars( f, " \t," ); // skip trailing ws, trailing comma, leading ws
read_delimited( f, s, sizeof(s), "," );
m->year = atoi( s );
skip_chars( f, " \t" ); // skip leading ws
read_delimited( f, m->genre, sizeof(m->genre), "\n" );
return m->genre[0] != '\0';
}
After that you just need a loop to read all the records:
const int max_movies = 100;
movie movies[max_movies];
int num_movies = 0;
while ((num_movies < max_movies) and read_movie( f, &movies[num_movies] ))
{
num_movies += 1;
}
As you can see, it gets less-than-one-line really fast. But I don’t think you can really make it any simpler.
Another useful option, if it is available to you, is to use a TAB character instead of a comma to separate fields. I assume that you are given to handle a CSV, though, and cannot change that.
Related
I have written a code to read a csv file in c. The file contains data of games and i am supposed to read it and sort it according to the score and print the top 10 rated games. The code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define tablesize 18626
typedef struct
{
char title[200];
char platform[20];
char Score[20];
char release_year[20];
} dict;
void printValues(dict *values)
{
for (int i = 0; i < 100; i++)
{
printf("title->%s,platform->%s,Score->%s,release->%s\n", values[i].title, values[i].platform, values[i].Score, values[i].release_year);
}
}
void sort(dict *values)
{
for (int i = 0; i < tablesize; i++)
{
for (int j = i + 1; j < tablesize; j++)
{
int a = *values[i].Score - '0';
int b = *values[j].Score - '0';
// printf("%d %d\n",values[i].Score,values[j].Score);
if (a < b)
{
dict temp = values[i];
values[i] = values[j];
values[j] = temp;
}
}
}
}
int main()
{
FILE *fp = fopen("t4_ign.csv", "r");
if (!fp)
{
printf("Error");
return 0;
}
char buff[1024];
int row = 0, column = 0;
int count = 0;
dict *values = NULL;
int i = 0;
while (fgets(buff, 1024, fp))
{
column = 0;
row++;
count++;
values = realloc(values, sizeof(dict) * count);
if (NULL == values)
{
perror("realloc");
break;
}
if (row == 1)
{
continue;
}
char *field = strtok(buff, ",");
while (field)
{
if (column == 0)
{
strcpy(values[i].title, field);
}
if (column == 1)
{
strcpy(values[i].platform, field);
}
if (column == 2)
{
strcpy(values[i].Score, field);
}
if (column == 3)
{
strcpy(values[i].release_year, field);
}
field = strtok(NULL, ",");
column++;
}
i++;
}
fclose(fp);
printf("File loaded!\n", fp);
sort(values);
printValues(values);
free(values);
return 0;
}
The problem i am facing is that the CSV file's Title field has commas in it and it thus differentiates the data separated by the commas as different columns which gives an error in loading the data in the struct.
Here are two example lines of the input file. Quotes are used when the title contains commas.
"The Chronicles of Narnia: The Lion, The Witch and The Wardrobe",PlayStation 2,8,2005
The Chronicles of Narnia: Prince Caspian,Wireless,5,2008
Any suggestions? Thanks in advance.
Since quotes are used for the title field when it contains commas, I suggest you check to see if the " has been used. If so, use that delimiter for the first item.
char *field;
if(buff[0] == '"') {
field = strtok(buff, "\"");
}
else {
field = strtok(buff, ",");
}
The first one will leave a comma as the first character of the next field, but the next strtok will filter that off, since it does not allow "empty" fields.
The function strtok does not suit your needs, because it considers the quotation marks as characters like any other. Therefore, when strtok sees a comma, it won't care whether the comma is inside quotation marks or not.
Also, as someone else pointed out in the comments section, another problem with strtok is that it skips empty fields.
Therefore, I do not recommend using strtok for what you want to do.
In order to solve your problem, I recommend that you write your own function that does something very similar to strtok and strsep, but if the first non-whitespace character is a quotation mark, it considers the next quotation mark as the delimiter instead of the next comma. In the code below, I named this function my_strsep.
Here is an example:
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#define NUM_LINES 2
//this function is equivalent to the POSIX function "strsep", except
//that it always uses "," as a delimiter, unless the first
//non-whitespace character is a quotation mark, in which case it //skips the quotation mark and uses the next quotation mark as a
//delimiter, also consuming the next comma
char *my_strsep( char **restrict stringp )
{
char *p = *stringp;
char *start;
char delimiter = ',';
//do nothing if *stringp is
if ( *stringp == NULL )
return NULL;
//skip all whitespace characters
while ( isspace( (unsigned char)*p ) )
p++;
//remember start of field
start = p;
//determine whether this field uses quotation marks
if ( *p == '"' )
{
//set delimiter to quotation mark instead of comma
delimiter = '\"';
//skip the first quotation mark
p++;
}
//remember the start of the string
start = p;
while ( *p != delimiter )
{
if ( *p == '\0' )
{
if ( delimiter == '\"' )
{
fprintf( stderr,
"Warning: Encountered end of string before the "
"second quotation mark!\n"
);
}
//pass information back to calling function
*stringp = NULL;
return start;
}
p++;
}
//overwrite the delimiter with a null character
*p = '\0';
//go past the delimiter
p++;
//skip the comma too, if quotation marks are being used
if ( delimiter == '\"' )
{
//skip all whitespace characters
while ( isspace( (unsigned char)*p ) )
p++;
//skip the comma
if ( *p == ',' )
p++;
}
//pass information back to calling function
*stringp = p;
return start;
}
int main( void )
{
char lines[NUM_LINES][200] = {
"\"The Chronicles of Narnia: The Lion, The Witch and The Wardrobe\",PlayStation 2,8,2005",
"The Chronicles of Narnia: Prince Caspian,Wireless,5,2008"
};
for ( int i = 0; i < NUM_LINES; i++ )
{
char *p, *q;
printf( "Processing line #%d:\n", i + 1 );
p = lines[i];
while ( ( q = my_strsep( &p ) ) != NULL )
{
printf( "Found field: %s\n", q );
}
printf( "\n" );
}
}
This program has the following output:
Processing line #1:
Found field: The Chronicles of Narnia: The Lion, The Witch and The Wardrobe
Found field: PlayStation 2
Found field: 8
Found field: 2005
Processing line #2:
Found field: The Chronicles of Narnia: Prince Caspian
Found field: Wireless
Found field: 5
Found field: 2008
As you can see, the function my_strsep can handle fields both with and without quotation marks.
I am trying to take inputs from the standard input in the form (a,b) (c,d) (e, f) (g,h) and will stop taking input if an empty line is added. I need these inputs tuple by tuple like (a,b) first then I perform some computation with it like add in the binary tree and add (c,d) then (e, f) and so on in the following way:
insert_in_tree(&tree->root,&tree->root,a, b);
I know how to accept integers till empty line is added which i do in the following way:
AVLTree *CreateAVLTree(const char *filename)
{
// put your code here
AVLTree *tree = newAVLTree();
int key, value;
if(strcmp(filename, "stdin") == 0){
char str[1024]={};
printf("Enter your values");
while( fgets(str, 1024, stdin) && strlen(str) && str[0] != '\n' ){
printf("string %s", str);
sscanf(str, "%d, %d", &key, &value);
//int key = atoi(str);
printf("This is key you entered %d\n", key);
printf("This is value you entered %d\n", value);
}
}else{
FILE* file = fopen(filename, "r"); // open a file
if(file == NULL) {
return NULL; // error checking
}
while (fscanf (file, " (%d,%d)", &key, &value) == 2) // check for number of conversions
// space added here ^
{
insert_in_tree_q5(&tree->root,&tree->root, key, value);
//printf("%d,%d\n", key, value);
}
fclose(file);
//node = tree->root;
}
return tree;
}
but i am not sure how use this to solve my problem stated above.
I'm not a fan of using scanf() et.al. to parse data, as a simple scanf("%d,%d") tends to be error prone with differing user input.
My general approach when dealing with known formatting characters (like (, ,, )), is to find them first with strchr(), validate they're somewhat sensible, and only then try to extract the value.
In the code below, I locate the parentheses and comma, then copy out the possibly numeric data in between, before handing it off to strtol() for converting the integer string to a numeric representation.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAX_NUMBER_LEN 32
/*
** Given a string which contains (somewhere) a pair
** of numbers in the form "... (x, y) ...", parse the pair
** into val1 and val2 respectively.
**
** Returns the point at which the input ended successfully
** or NULL on error
*/
const char *parseTuple(const char *input, int *val1, int *val2)
{
char *result = NULL;
char val1_str[ MAX_NUMBER_LEN+1 ] = { '\0' };
char val2_str[ MAX_NUMBER_LEN+1 ] = { '\0' };
// Find the first '('
char *left_paren = strchr( input, '(' );
char *right_paren = strchr( input, ')' );
char *comma = strchr( input, ',' );
// validate the things we found exist, and are in valid positions
if ( left_paren != NULL && right_paren != NULL && comma != NULL && // needed parts exist
left_paren < comma && comma < right_paren ) // in the correct order
{
// val1 source string exists between left_paren+1 and comma-1
int val1_len = comma-1 - left_paren+1 - 1;
if ( val1_len > 0 && val1_len < MAX_NUMBER_LEN )
{
strncpy( val1_str, left_paren+1, val1_len );
val1_str[ val1_len ] = '\0';
}
// val2 source string exists between comma+1 and right_paren-1
int val2_len = right_paren-1 - comma+1 - 1;
if ( val2_len > 0 && val2_len < MAX_NUMBER_LEN )
{
strncpy( val2_str, comma+1, val2_len );
val2_str[ val2_len ] = '\0';
}
// If we extracted some reasonable numbers, try to parse them
if ( val1_str[0] != '\0' && val2_str[0] != '\0' )
{
*val1 = strtol( val1_str, NULL, 10 );
*val2 = strtol( val2_str, NULL, 10 );
// TODO handle errno when string is not a number
// if errono did not indicate a strol() failure
result = right_paren+1; // point to the next input location, so we can call again
}
}
return result;
}
int main(int argc, char **argv)
{
const char *input;
int val1;
int val2;
for (int i=1; i<argc; i++)
{
input = argv[i];
do
{
printf( "From input of: [%s]\n" , input );
input = parseTuple( input, &val1, &val2 );
if ( input != NULL )
printf( " Parsed out: (%3d,%3d)\n", val1, val2 );
} while ( input != NULL && strlen( input ) );
}
return 0;
}
Giving the test-run:
$ ./parse_tuple '(-3, 2)' '(1,1)(11111111111111111111111111111111111111111111111111111111111111111111,0) () (,)' '(' '()' ')' '(,)' '(-12,)' '(123,456)'
From input of: [(-3, 2)]
Parsed out: ( -3, 2)
From input of: [(1,1)(11111111111111111111111111111111111111111111111111111111111111111111,0) () (,)]
Parsed out: ( 1, 1)
From input of: [(11111111111111111111111111111111111111111111111111111111111111111111,0) () (,)]
From input of: [(]
From input of: [()]
From input of: [)]
From input of: [(,)]
From input of: [(-12,)]
From input of: [(123,456)]
Parsed out: (123,456)
OK... first question so please forgive me if it isn't quite understandable the first go.
I am attempting to parse a string input to stdin through a couple of different conditions.
Example input string: move this into "tokens that I need" \n
I would like to parse this into tokens as:
Token 1 = move
Token 2 = this
Token 3 = into
Token 4 = tokens that I need
Where the tokens are by whitespace (easy enough) until a quote is encountered, then everything inside of the open and close quotes is treated as a single token.
I've tried several different methods, but I unfortunately feel that I may be in over my head here so any help would be greatly appreciated.
My latest attempt:
fgets(input, BUFLEN, stdin); //gets the input
input[strlen(input)-1] = '\0';//removes the new line
printf("Input string = %s\n",input);//Just prints it out for me to see
char *token = strtok(input,delim);//Tokenizes the input, which unfortunately does not do what I need. delim is just my string of delimiters which currently only has a " " in it.
I tried to scan through the string one character at a time and then place those characters into arrays so that I could have them as I wanted, but that failed miserably.
The ultimate solution with customized version of my_strtok_r is here. This solution has advantage over solution with non re-entrant: strtok.
my_strtok_r is re-entrant: you can call them from multiple threads simultaneously, or in nested loops, et cetera.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char * my_strtok_r(char *s, const char *delim1, const char *delim2, char **save_ptr)
{
char *end;
size_t s1;
size_t s2;
int delim2found = 0;
if (s == NULL)
s = *save_ptr;
if (*s == '\0'){
*save_ptr = s;
return NULL;
}
s1 = strspn (s, delim1);
s2 = strspn (s, delim2);
if(s2 > s1){
s += s2;
delim2found = 1;
}
else{
s += s1;
}
if (*s == '\0'){
*save_ptr = s;
return NULL;
}
/* Find the end of the token. */
if(delim2found)
end = s + strcspn (s, delim2);
else
end = s + strcspn (s, delim1);
if (*end == '\0') {
*save_ptr = end;
return s;
}
/* Terminate the token and make *save_ptr point past it. */
*end = '\0';
*save_ptr = end + 1;
return s;
}
int main (void)
{
char str[] = " 123 abc \"SPLITTING WORKS\" yes! \"GREAT WE HAVE A SOLUTION\" ! ";
char *d1 = " ";
char *d2 = "\"";
char *token;
char *rest = str;
char array[20][80];
printf ("Splitting string \"%s\" into tokens:\n",str);
size_t nr_of_tokens = 0;
while ((token = my_strtok_r(rest, d1, d2, &rest)))
{
strcpy (array[nr_of_tokens], token);
nr_of_tokens++;
}
for(int i=0; i < nr_of_tokens; i++)
printf ("%s\n",array[i]);
return 0;
}
Test:
Splitting string " 123 abc "SPLITING WORKS" yes! "GREAT WE HAVE A SOLUTION" ! " into tokens:
123
abc
SPLITTING WORKS
yes!
GREAT WE HAVE A SOLUTION
!
This is another solution (fully tested) which you can use. You can mix any number of tokens delimited by white spaces and '\"'. It can be configured to your needs. Extensive explanations are given in the code itself.
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include <ctype.h>
char *get_str_segment(char *output_str, char *input_str, char extDel)
{
/*
Purpose :
To copy to output first segment.
To extract the segment two types of delimiters are used:
1. white space delimiter
2. 'extDel' -
do not put here white space or '\0'!
(typicaly '"' = quote!)
'extDel' allows us to put white spaces inside the segment.
Notice that 'extDel' cannot be embedded inside the segment!
It makes 'extDel' special character which will not be encountered
in the 'output_string'! First appearance of 'extDel' starts new
segment!
Notice that unbalanced 'extDel' will cause coping whole string to
destination from that point!
Return:
Pointer to the first character after the segment
or NULL !!!
we will not allow **empty** segments with unbalanced 'extDel'
if ('extDel' is unbalanced) it has to have at list one character!
It can be white space!
Notice!
"get_str_segment()" on strings filed with white spaces
and empty strings will return *** NULL *** to indicate that
no conclusive segment has been found!
Example:
input_str = " qwerty"123 45" "samuel" G7 "
output_str = ""
// Call:
char *ptr = get_str_segment(output_str,input_str,'"');
Result:
input_str = " qwerty"123 45" "samuel" G7 "
^
|
ptr----------------------.
output_str = "qwerty"
*/
char *s = input_str;
char *d = output_str;
char i = 0;
if(!s) return NULL; // rule #1 our code never brakes!
if(!d) return NULL;
// eliminate white spaces from front of the block
while(1)
{
if ( *s == '\0')
{
*d = '\0' ; // end the output string
return (NULL) ; // return NULL to indicate that no
// coping has been done.
//
//
// "get_str_segment()" on
// strings filed with white spaces
// and empty strings
// will return NULL to indicate that
// no conclusive segment has been found
//
}
if (isspace(*s)) ++s; // move pointer to next char
else break; // break the loop!
}
// we found first non white character!
if( *s != extDel)
{
// copy block up to end of string first white space or extDel
while( ((*s) != '\0') && (!isspace(*s)) && ((*s) != extDel) )
{
*d = *s; // copy segment characters
++s;
++d;
}
*d = '\0' ; // end the destination string
return (s); // return pointer to end of the string ||
// trailing white space ||
// 'extDel' char
}
else // It is 'extDel' character !
{
++s; // skip opening 'extDel'
while( ((*s) != '\0') && ((*s) != extDel) )
{
i=1; // we executed loop at list one time
*d = *s; // copy segment characters till '\0' or extDel
++s;
++d;
}
*d = '\0' ; // end the destination string
if( *s == extDel ) ++s; // skip *closing* 'extDel'
else
{
// unbalanced 'extDel'!
printf("WARNING:get_str_segment: unbalanced '%c' encountered!\n",extDel);
if (i==0) return NULL; // we will not allow
// **empty** unbalanced segments 'extDel'
// if ('extDel' is unbalanced) it has to have at list one character!
// It can be white space!
}
return (s); // return pointer to next char after 'extDel'
// ( It can be '\0')
// if it is '\0' next pass will return 'NULL'!
}
}
int parse_line_to_table(int firstDim, int secondDim, char *table, char * line, char separator)
{
// Purpose:
// Parse 'line' to 'table'
// Function returns: number of segments
// 'table' has to be passed from outside
char* p;
int i;
if(!table) return (-1);
// parse segments to 'table':
if(line)
{
p = line; // A necessary initialization!
for(i=0; i<firstDim; i++)
{
p = get_str_segment( table+i*secondDim , p , separator );
if(p==NULL) break;
}
}
else
return (-1);
// debug only
// for(int j=0; j<i; j++) { printf(" i=%d %s",j, table+j*secondDim ); }
// printf("\n");
return (i); // notice that i is post incremented
}
int main(void)
{
char table[20][80];
char *line = "move this into \"tokens that I need\"";
int ret = parse_line_to_table(20, 80, table, line, '\"');
for(int i = 0; i < ret; i++ )
printf("%s\n",table[i]);
return 0;
}
Output:
move
this
into
tokens that I need
I have a String like this:
"00:00:00 000~00:02:00 0000|~00:01:00 0000;00:01:00 0000~",
I want to get each of the items like "00:00:00 000".
My idea is that first, split the string by ";", then split by "|", and finally split by "~".
But the problem is that I can't get it if it's null, such like "00:01:00 0000~", the part after "~", I wanna get it and set a default value to it then store it somewhere else, but the code doesn't work. What is the problem?
Here is my code:
int main(int argc, char *argv[])
{
char *str1, *str2, *str3, *str4, *token, *subtoken, *subt1, *subt2;
char *saveptr1, *saveptr2, *saveptr3;
int j;
for (j = 1, str1 = argv[1]; ; j++, str1 = NULL) {
token = strtok_r(str1, ";", &saveptr1);
if (token == NULL)
break;
printf("%d: %s\n", j, token);
int flag1 = 1;
for (str2 = token; ; str2 = NULL) {
subtoken = strtok_r(str2, "|", &saveptr2);
if (subtoken == NULL)
break;
printf(" %d: --> %s\n", flag1++, subtoken);
int flag2 = 1;
for(str3 = subtoken; ; str3 = NULL) {
subt1 = strtok_r(str3, "~", &saveptr3);
if(subt1 == NULL) {
break;
}
printf(" %d: --> %s\n",flag2++, subt1);
}
}
}
exit(EXIT_SUCCESS);
} /* main */
You can simplify your algorithm if you first make all delimiters uniform. First replace all occurrences of , and | with ~, then the parsing will be easier. You can do this externally via sed or vim or programmatically in your C code. Then you should be able to get the 'NULL' problem easily. (Personally, I prefer not to use strtok as it modifies the original string).
It is indeed easier to just write a custom parser in this case.
The version below allocates new strings, If allocating new memory is not desired, change the add_string method to instead just point to start, and set start[len] to 0.
static int add_string( char **into, const char *start, int len )
{
if( len<1 ) return 0;
if( (*into = strndup( start, len )) )
return 1;
return 0;
}
static int is_delimeter( char x )
{
static const char delimeters[] = { 0, '~', ',', '|',';' };
int i;
for( i=0; i<sizeof(delimeters); i++ )
if( x == delimeters[i] )
return 1;
return 0;
}
static char **split( const char *data )
{
char **res = malloc(sizeof(char *)*(strlen(data)/2+1));
char **cur = res;
int last_delimeter = 0, i;
do {
if( is_delimeter( data[i] ) )
{
if( add_string( cur, data+last_delimeter,i-last_delimeter) )
cur++;
last_delimeter = i+1;
}
} while( data[i++] );
*cur = NULL;
return res;
}
An example usage of the method:
int main()
{
const char test[] = "00:00:00 000~00:02:00 0000|~00:01:00 0000;00:01:00 0000~";
char **split_test = split( test );
int i = 0;
while( split_test[i] )
{
fprintf( stderr, "%2d: %s\n", i, split_test[i] );
free( split_test[i] );
i++;
}
free( split_test );
return 0;
}
Instead of splitting the string, it might be more suitable to come up with a simple finite state machine that parses the string. Fortunately, your tokens seem to have an upper limit on their length, which makes things a lot easier:
Iterate over the string and distinguish four different states:
current character is not a delimiter, but previous character was (start of token)
current character is a delimiter and previous character wasn't (end of token)
current and previous character are both not delimiters (store them in temporary buffer)
current and previous character are both delimiters (ignore them, read next character)
It should be possible to come up with a very short (10 lines?) and concise piece of code that parses the string as specified.
Im trying to take a user input string and parse is into an array called char *entire_line[100]; where each word is put at a different index of the array but if a part of the string is encapsulated by a quote, that should be put in a single index.
So if I have
char buffer[1024]={0,};
fgets(buffer, 1024, stdin);
example input: "word filename.txt "this is a string that shoudl take up one index in an output array";
tokenizer=strtok(buffer," ");//break up by spaces
do{
if(strchr(tokenizer,'"')){//check is a word starts with a "
is_string=YES;
entire_line[i]=tokenizer;// if so, put that word into current index
tokenizer=strtok(NULL,"\""); //should get rest of string until end "
strcat(entire_line[i],tokenizer); //append the two together, ill take care of the missing space once i figure out this issue
}
entire_line[i]=tokenizer;
i++;
}while((tokenizer=strtok(NULL," \n"))!=NULL);
This clearly isn't working and only gets close if the double quote encapsulated string is at the end of the input string
but i could have
input: word "this is text that will be user entered" filename.txt
Been trying to figure this out for a while, always get stuck somewhere.
thanks
The strtok function is a terrible way to tokenize in C, except for one (admittedly common) case: simple whitespace-separated words. (Even then it's still not great due to lack of re-entrance and recursion ability, which is why we invented strsep for BSD way back when.)
Your best bet in this case is to build your own simple state-machine:
char *p;
int c;
enum states { DULL, IN_WORD, IN_STRING } state = DULL;
for (p = buffer; *p != '\0'; p++) {
c = (unsigned char) *p; /* convert to unsigned char for is* functions */
switch (state) {
case DULL: /* not in a word, not in a double quoted string */
if (isspace(c)) {
/* still not in a word, so ignore this char */
continue;
}
/* not a space -- if it's a double quote we go to IN_STRING, else to IN_WORD */
if (c == '"') {
state = IN_STRING;
start_of_word = p + 1; /* word starts at *next* char, not this one */
continue;
}
state = IN_WORD;
start_of_word = p; /* word starts here */
continue;
case IN_STRING:
/* we're in a double quoted string, so keep going until we hit a close " */
if (c == '"') {
/* word goes from start_of_word to p-1 */
... do something with the word ...
state = DULL; /* back to "not in word, not in string" state */
}
continue; /* either still IN_STRING or we handled the end above */
case IN_WORD:
/* we're in a word, so keep going until we get to a space */
if (isspace(c)) {
/* word goes from start_of_word to p-1 */
... do something with the word ...
state = DULL; /* back to "not in word, not in string" state */
}
continue; /* either still IN_WORD or we handled the end above */
}
}
Note that this does not account for the possibility of a double quote inside a word, e.g.:
"some text in quotes" plus four simple words p"lus something strange"
Work through the state machine above and you will see that "some text in quotes" turns into a single token (that ignores the double quotes), but p"lus is also a single token (that includes the quote), something is a single token, and strange" is a token. Whether you want this, or how you want to handle it, is up to you. For more complex but thorough lexical tokenization, you may want to use a code-building tool like flex.
Also, when the for loop exits, if state is not DULL, you need to handle the final word (I left this out of the code above) and decide what to do if state is IN_STRING (meaning there was no close-double-quote).
Torek's parts of parsing code are excellent but require little more work to use.
For my own purpose, I finished c function.
Here I share my work that is based on Torek's code.
#include <stdio.h>
#include <string.h>
#include <ctype.h>
size_t split(char *buffer, char *argv[], size_t argv_size)
{
char *p, *start_of_word;
int c;
enum states { DULL, IN_WORD, IN_STRING } state = DULL;
size_t argc = 0;
for (p = buffer; argc < argv_size && *p != '\0'; p++) {
c = (unsigned char) *p;
switch (state) {
case DULL:
if (isspace(c)) {
continue;
}
if (c == '"') {
state = IN_STRING;
start_of_word = p + 1;
continue;
}
state = IN_WORD;
start_of_word = p;
continue;
case IN_STRING:
if (c == '"') {
*p = 0;
argv[argc++] = start_of_word;
state = DULL;
}
continue;
case IN_WORD:
if (isspace(c)) {
*p = 0;
argv[argc++] = start_of_word;
state = DULL;
}
continue;
}
}
if (state != DULL && argc < argv_size)
argv[argc++] = start_of_word;
return argc;
}
void test_split(const char *s)
{
char buf[1024];
size_t i, argc;
char *argv[20];
strcpy(buf, s);
argc = split(buf, argv, 20);
printf("input: '%s'\n", s);
for (i = 0; i < argc; i++)
printf("[%u] '%s'\n", i, argv[i]);
}
int main(int ac, char *av[])
{
test_split("\"some text in quotes\" plus four simple words p\"lus something strange\"");
return 0;
}
See program output:
input: '"some text in quotes" plus four simple words p"lus something strange"'
[0] 'some text in quotes'
[1] 'plus'
[2] 'four'
[3] 'simple'
[4] 'words'
[5] 'p"lus'
[6] 'something'
[7] 'strange"'
I wrote a qtok function some time ago that reads quoted words from a string. It's not a state machine and it doesn't make you an array but it's trivial to put the resulting tokens into one. It also handles escaped quotes and trailing and leading spaces:
#include <stdio.h>
#include <ctype.h>
#include <assert.h>
// Strips backslashes from quotes
char *unescapeToken(char *token)
{
char *in = token;
char *out = token;
while (*in)
{
assert(in >= out);
if ((in[0] == '\\') && (in[1] == '"'))
{
*out = in[1];
out++;
in += 2;
}
else
{
*out = *in;
out++;
in++;
}
}
*out = 0;
return token;
}
// Returns the end of the token, without chaning it.
char *qtok(char *str, char **next)
{
char *current = str;
char *start = str;
int isQuoted = 0;
// Eat beginning whitespace.
while (*current && isspace(*current)) current++;
start = current;
if (*current == '"')
{
isQuoted = 1;
// Quoted token
current++; // Skip the beginning quote.
start = current;
for (;;)
{
// Go till we find a quote or the end of string.
while (*current && (*current != '"')) current++;
if (!*current)
{
// Reached the end of the string.
goto finalize;
}
if (*(current - 1) == '\\')
{
// Escaped quote keep going.
current++;
continue;
}
// Reached the ending quote.
goto finalize;
}
}
// Not quoted so run till we see a space.
while (*current && !isspace(*current)) current++;
finalize:
if (*current)
{
// Close token if not closed already.
*current = 0;
current++;
// Eat trailing whitespace.
while (*current && isspace(*current)) current++;
}
*next = current;
return isQuoted ? unescapeToken(start) : start;
}
int main()
{
char text[] = " \"some text in quotes\" plus four simple words p\"lus something strange\" \"Then some quoted \\\"words\\\", and backslashes: \\ \\ \" Escapes only work insi\\\"de q\\\"uoted strings\\\" ";
char *pText = text;
printf("Original: '%s'\n", text);
while (*pText)
{
printf("'%s'\n", qtok(pText, &pText));
}
}
Outputs:
Original: ' "some text in quotes" plus four simple words p"lus something strange" "Then some quoted \"words\", and backslashes: \ \ " Escapes only work insi\"de q\"uoted strings\" '
'some text in quotes'
'plus'
'four'
'simple'
'words'
'p"lus'
'something'
'strange"'
'Then some quoted "words", and backslashes: \ \ '
'Escapes'
'only'
'work'
'insi\"de'
'q\"uoted'
'strings\"'
I think the answer to your question is actually fairly simple, but I'm taking on an assumption where it seems the other responses have taken a different one. I'm assuming that you want any quoted block of text to be separated out on its own regardless of spacing with the rest of the text being separated by spaces.
So given the example:
"some text in quotes" plus four simple words p"lus something strange"
The output would be:
[0] some text in quotes
[1] plus
[2] four
[3] simple
[4] words
[5] p
[6] lus something strange
Given that this is the case, only a simple bit of code is required, and no complex machines. You would first check if there is a leading quote for the first character and if so tick a flag and remove the character. As well as removing any quotes at the end of the string. Then tokenize the string based on quotation marks. Then tokenize every other of the strings obtained previously by spaces. Tokenize starting with the first string obtained if there was no leading quote, or the second string obtained if there was a leading quote. Then each of the remaining strings from the first part will be added to an array of strings interspersed with the strings from the second part added in place of the strings they were tokenized from. In this way you can get the result listed above. In code this would look like:
#include<string.h>
#include<stdlib.h>
char ** parser(char * input, char delim, char delim2){
char ** output;
char ** quotes;
char * line = input;
int flag = 0;
if(strlen(input) > 0 && input[0] == delim){
flag = 1;
line = input + 1;
}
int i = 0;
char * pch = strchr(line, delim);
while(pch != NULL){
i++;
pch = strchr(pch+1, delim);
}
quotes = (char **) malloc(sizeof(char *)*i+1);
char * token = strtok(input, delim);
int n = 0;
while(token != NULL){
quotes[n] = strdup(token);
token = strtok(NULL, delim);
n++;
}
if(delim2 != NULL){
int j = 0, k = 0, l = 0;
for(n = 0; n < i+1; n++){
if(flag & n % 2 == 1 || !flag & n % 2 == 0){
char ** new = parser(delim2, NULL);
l = sizeof(new)/sizeof(char *);
for(k = 0; k < l; k++){
output[j] = new[k];
j++;
}
for(k = l; k > -1; k--){
free(new[n]);
}
free(new);
} else {
output[j] = quotes[n];
j++;
}
}
for(n = i; n > -1; n--){
free(quotes[n]);
}
free(quotes);
} else {
return quotes;
}
return output;
}
int main(){
char * input;
char ** result = parser(input, '\"', ' ');
return 0;
}
(May not be perfect, I haven't tested it)