I'm trying to take a user input string and parse it into an array declared as char *entire_line[100];, where each word is stored at a different index of the array. However, if part of the string is enclosed in double quotes, that whole quoted part should be stored in a single index.
So if I have
char buffer[1024]={0,};
fgets(buffer, 1024, stdin);
example input: "word filename.txt "this is a string that shoudl take up one index in an output array";
tokenizer=strtok(buffer," ");//break up by spaces
do{
if(strchr(tokenizer,'"')){//check is a word starts with a "
is_string=YES;
entire_line[i]=tokenizer;// if so, put that word into current index
tokenizer=strtok(NULL,"\""); //should get rest of string until end "
strcat(entire_line[i],tokenizer); //append the two together, ill take care of the missing space once i figure out this issue
}
entire_line[i]=tokenizer;
i++;
}while((tokenizer=strtok(NULL," \n"))!=NULL);
This clearly isn't working and only gets close if the double quote encapsulated string is at the end of the input string
but i could have
input: word "this is text that will be user entered" filename.txt
Been trying to figure this out for a while, always get stuck somewhere.
thanks
The strtok function is a terrible way to tokenize in C, except for one (admittedly common) case: simple whitespace-separated words. (Even then it's still not great due to lack of re-entrance and recursion ability, which is why we invented strsep for BSD way back when.)
Your best bet in this case is to build your own simple state-machine:
char *p;
int c;
enum states { DULL, IN_WORD, IN_STRING } state = DULL;
for (p = buffer; *p != '\0'; p++) {
c = (unsigned char) *p; /* convert to unsigned char for is* functions */
switch (state) {
case DULL: /* not in a word, not in a double quoted string */
if (isspace(c)) {
/* still not in a word, so ignore this char */
continue;
}
/* not a space -- if it's a double quote we go to IN_STRING, else to IN_WORD */
if (c == '"') {
state = IN_STRING;
start_of_word = p + 1; /* word starts at *next* char, not this one */
continue;
}
state = IN_WORD;
start_of_word = p; /* word starts here */
continue;
case IN_STRING:
/* we're in a double quoted string, so keep going until we hit a close " */
if (c == '"') {
/* word goes from start_of_word to p-1 */
... do something with the word ...
state = DULL; /* back to "not in word, not in string" state */
}
continue; /* either still IN_STRING or we handled the end above */
case IN_WORD:
/* we're in a word, so keep going until we get to a space */
if (isspace(c)) {
/* word goes from start_of_word to p-1 */
... do something with the word ...
state = DULL; /* back to "not in word, not in string" state */
}
continue; /* either still IN_WORD or we handled the end above */
}
}
Note that this does not account for the possibility of a double quote inside a word, e.g.:
"some text in quotes" plus four simple words p"lus something strange"
Work through the state machine above and you will see that "some text in quotes" turns into a single token (that ignores the double quotes), but p"lus is also a single token (that includes the quote), something is a single token, and strange" is a token. Whether you want this, or how you want to handle it, is up to you. For more complex but thorough lexical tokenization, you may want to use a code-building tool like flex.
Also, when the for loop exits, if state is not DULL, you need to handle the final word (I left this out of the code above) and decide what to do if state is IN_STRING (meaning there was no close-double-quote).
Torek's parsing code is excellent, but it needs a little more work before it can be used directly.
For my own purposes, I turned it into a complete C function.
Here I share my work, which is based on Torek's code.
#include <stdio.h>
#include <string.h>
#include <ctype.h>
/*
 * Splits buffer in place into whitespace-separated tokens, treating text that
 * starts with a double quote (when not already inside a word) as one token
 * running up to the next double quote.
 *
 * buffer     - NUL-terminated, writable; token ends are overwritten with '\0'.
 * argv       - receives pointers into buffer, one per token.
 * argv_size  - capacity of argv; scanning stops once it is full.
 *
 * Returns the number of tokens stored. A word or quoted string that is still
 * open at the end of the input is stored as a final token.
 */
size_t split(char *buffer, char *argv[], size_t argv_size)
{
    enum { OUTSIDE, WORD, QUOTED } mode = OUTSIDE;
    char *token = NULL;
    char *cur = buffer;
    size_t count = 0;

    while (count < argv_size && *cur != '\0') {
        int ch = (unsigned char) *cur;

        if (mode == OUTSIDE) {
            if (!isspace(ch)) {
                if (ch == '"') {
                    /* the token starts after the opening quote */
                    mode = QUOTED;
                    token = cur + 1;
                } else {
                    mode = WORD;
                    token = cur;
                }
            }
        } else {
            /* a quoted token ends at '"', a plain word at whitespace */
            int at_end = (mode == QUOTED) ? (ch == '"') : (isspace(ch) != 0);

            if (at_end) {
                *cur = 0;
                argv[count++] = token;
                mode = OUTSIDE;
            }
        }
        cur++;
    }

    /* flush a token that ran into the end of the input */
    if (mode != OUTSIDE && count < argv_size)
        argv[count++] = token;

    return count;
}
/*
 * Copies s into a local writable buffer, splits it with split() and prints
 * the input followed by each token with its index.
 *
 * Input longer than the local buffer is truncated rather than overflowing it.
 */
void test_split(const char *s)
{
    char buf[1024];
    char *argv[20];
    size_t argc;
    size_t i;

    /* bounded copy: split() needs a writable string and s is const */
    snprintf(buf, sizeof buf, "%s", s);

    argc = split(buf, argv, sizeof argv / sizeof argv[0]);

    printf("input: '%s'\n", s);
    for (i = 0; i < argc; i++)
        printf("[%zu] '%s'\n", i, argv[i]);   /* %zu is the size_t conversion */
}
/* Demo driver: runs test_split() on one sample line. ac and av are not used. */
int main(int ac, char *av[])
{
/* the sample mixes a quoted phrase, plain words and a quote embedded in a word */
test_split("\"some text in quotes\" plus four simple words p\"lus something strange\"");
return 0;
}
See program output:
input: '"some text in quotes" plus four simple words p"lus something strange"'
[0] 'some text in quotes'
[1] 'plus'
[2] 'four'
[3] 'simple'
[4] 'words'
[5] 'p"lus'
[6] 'something'
[7] 'strange"'
I wrote a qtok function some time ago that reads quoted words from a string. It's not a state machine and it doesn't make you an array but it's trivial to put the resulting tokens into one. It also handles escaped quotes and trailing and leading spaces:
#include <stdio.h>
#include <ctype.h>
#include <assert.h>
// Rewrites token in place so that every backslash-quote pair (\") becomes a
// bare quote. Backslashes not followed by a quote are kept. Returns token.
char *unescapeToken(char *token)
{
    char *src = token;   // read position
    char *dst = token;   // write position, never ahead of src

    for (; *src != '\0'; dst++)
    {
        assert(src >= dst);
        if (src[0] == '\\' && src[1] == '"')
        {
            // drop the backslash, keep the quote
            *dst = '"';
            src += 2;
        }
        else
        {
            *dst = *src;
            src++;
        }
    }
    *dst = 0;
    return token;
}
// Extracts the next token from str and returns a pointer to its start.
//
// Leading whitespace is skipped. If the token begins with a double quote it
// runs to the next quote that is not directly preceded by a backslash, and
// the result is passed through unescapeToken(); otherwise it runs to the next
// whitespace character. The character ending the token is overwritten with
// '\0' (str is modified), any whitespace after it is skipped, and *next is
// set to where the following call should resume.
char *qtok(char *str, char **next)
{
    char *current = str;
    char *start;
    int isQuoted = 0;

    // Eat beginning whitespace. The casts keep isspace() defined for chars
    // with negative values.
    while (*current && isspace((unsigned char)*current)) current++;
    start = current;

    if (*current == '"')
    {
        isQuoted = 1;
        current++;          // Skip the beginning quote.
        start = current;
        // Advance to the closing quote or the end of the string. current - 1
        // is always valid here: at worst it is the opening quote.
        while (*current && (*current != '"' || *(current - 1) == '\\')) current++;
    }
    else
    {
        // Not quoted, so run till we see a space.
        while (*current && !isspace((unsigned char)*current)) current++;
    }

    if (*current)
    {
        // Close the token if it did not already end at the terminator.
        *current = 0;
        current++;
        // Eat trailing whitespace.
        while (*current && isspace((unsigned char)*current)) current++;
    }
    *next = current;
    return isQuoted ? unescapeToken(start) : start;
}
// Demo driver: prints the sample text, then every token qtok() extracts from it.
int main()
{
char text[] = " \"some text in quotes\" plus four simple words p\"lus something strange\" \"Then some quoted \\\"words\\\", and backslashes: \\ \\ \" Escapes only work insi\\\"de q\\\"uoted strings\\\" ";
char *pText = text;
printf("Original: '%s'\n", text);
// qtok() stores the resume position back into pText on every call
while (*pText)
{
printf("'%s'\n", qtok(pText, &pText));
}
}
Outputs:
Original: ' "some text in quotes" plus four simple words p"lus something strange" "Then some quoted \"words\", and backslashes: \ \ " Escapes only work insi\"de q\"uoted strings\" '
'some text in quotes'
'plus'
'four'
'simple'
'words'
'p"lus'
'something'
'strange"'
'Then some quoted "words", and backslashes: \ \ '
'Escapes'
'only'
'work'
'insi\"de'
'q\"uoted'
'strings\"'
I think the answer to your question is actually fairly simple, but I'm taking on an assumption where it seems the other responses have taken a different one. I'm assuming that you want any quoted block of text to be separated out on its own regardless of spacing with the rest of the text being separated by spaces.
So given the example:
"some text in quotes" plus four simple words p"lus something strange"
The output would be:
[0] some text in quotes
[1] plus
[2] four
[3] simple
[4] words
[5] p
[6] lus something strange
Given that this is the case, only a simple bit of code is required, and no complex machines. You would first check if there is a leading quote for the first character and if so tick a flag and remove the character. As well as removing any quotes at the end of the string. Then tokenize the string based on quotation marks. Then tokenize every other of the strings obtained previously by spaces. Tokenize starting with the first string obtained if there was no leading quote, or the second string obtained if there was a leading quote. Then each of the remaining strings from the first part will be added to an array of strings interspersed with the strings from the second part added in place of the strings they were tokenized from. In this way you can get the result listed above. In code this would look like:
#include<string.h>
#include<stdlib.h>
/*
 * Appends a heap copy of the len bytes at text to *list, growing the array so
 * that one slot always remains for the terminating NULL.
 * Returns 0 on success, -1 on allocation failure (*list stays valid).
 */
static int parser_push(char ***list, size_t *count, size_t *cap,
                       const char *text, size_t len)
{
    char *copy;

    if (*count + 1 >= *cap) {
        size_t new_cap = *cap * 2;
        char **grown = realloc(*list, new_cap * sizeof **list);

        if (grown == NULL)
            return -1;
        *list = grown;
        *cap = new_cap;
    }

    copy = malloc(len + 1);
    if (copy == NULL)
        return -1;
    memcpy(copy, text, len);
    copy[len] = '\0';
    (*list)[(*count)++] = copy;
    return 0;
}

/*
 * Splits input into tokens.
 *
 * Every occurrence of delim toggles "quoted" mode. Text between a pair of
 * delim characters becomes one token, delim2 characters included. Text
 * outside such a pair is further split on delim2. Passing '\0' for delim2
 * disables that second split, so the text is only cut at delim.
 * Empty tokens are not produced and input is not modified.
 *
 * Returns a NULL-terminated array of heap-allocated strings, or NULL if input
 * is NULL or an allocation fails. The caller frees each string and then the
 * array itself.
 */
char ** parser(char * input, char delim, char delim2){
    size_t cap = 8;
    size_t count = 0;
    char **output;
    const char *p;
    int quoted = 0;

    if (input == NULL)
        return NULL;

    output = malloc(cap * sizeof *output);
    if (output == NULL)
        return NULL;

    p = input;
    while (*p != '\0') {
        const char *start;

        if (*p == delim) {              /* opening or closing quote */
            quoted = !quoted;
            p++;
            continue;
        }
        if (!quoted && delim2 != '\0' && *p == delim2) {   /* separator run */
            p++;
            continue;
        }

        /* scan one token: up to the next quote, or next separator if unquoted */
        start = p;
        while (*p != '\0' && *p != delim
               && (quoted || delim2 == '\0' || *p != delim2))
            p++;

        if (parser_push(&output, &count, &cap, start, (size_t)(p - start)) != 0) {
            while (count > 0)
                free(output[--count]);
            free(output);
            return NULL;
        }
    }

    output[count] = NULL;
    return output;
}
/* Demo driver: calls parser() on a sample line. */
int main(){
    /* a real, writable string: the original passed an uninitialized pointer */
    char input[] = "\"some text in quotes\" plus four simple words p\"lus something strange\"";
    char ** result = parser(input, '\"', ' ');

    /* NOTE(review): result is neither inspected nor released here; how to
       walk and free it depends on parser()'s contract - confirm before
       extending this demo. */
    (void) result;
    return 0;
}
(May not be perfect, I haven't tested it)
Related
I have written a code to read a csv file in c. The file contains data of games and i am supposed to read it and sort it according to the score and print the top 10 rated games. The code is as follows:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#define tablesize 18626
/* One record of the games table; every column is kept as text. */
typedef struct
{
char title[200];        /* filled from CSV column 0 */
char platform[20];      /* filled from CSV column 1 */
char Score[20];         /* filled from CSV column 2; read by sort() */
char release_year[20];  /* filled from CSV column 3 */
} dict;
/*
 * Prints the first 100 records of values, one per line.
 * The caller must supply at least 100 entries.
 */
void printValues(dict *values)
{
    const dict *row = values;
    const dict *stop = values + 100;

    while (row != stop)
    {
        printf("title->%s,platform->%s,Score->%s,release->%s\n",
               row->title, row->platform, row->Score, row->release_year);
        ++row;
    }
}
/*
 * Sorts values in place by descending Score.
 *
 * Score is parsed as a whole number with strtod(), so "10" ranks above "8"
 * and fractional scores are honoured. Comparing only the first character
 * ranked "10" as 1. Text that does not parse as a number counts as 0.
 *
 * NOTE(review): exactly tablesize entries are sorted, so the caller must
 * supply at least that many initialized records - confirm against the loader.
 */
void sort(dict *values)
{
    for (int i = 0; i < tablesize; i++)
    {
        /* score currently held at position i; refreshed after every swap */
        double best = strtod(values[i].Score, NULL);

        for (int j = i + 1; j < tablesize; j++)
        {
            double candidate = strtod(values[j].Score, NULL);

            if (best < candidate)
            {
                dict temp = values[i];
                values[i] = values[j];
                values[j] = temp;
                best = candidate;
            }
        }
    }
}
/*
 * Loads t4_ign.csv into a growing array of dict records (the first line is
 * treated as a header and skipped), then sorts and prints them.
 *
 * A title wrapped in double quotes may contain commas: the first field is
 * then cut at the closing quote instead of at the first comma.
 */
int main()
{
    FILE *fp = fopen("t4_ign.csv", "r");
    if (!fp)
    {
        printf("Error");
        return 0;
    }

    char buff[1024];
    int row = 0;
    int count = 0;
    dict *values = NULL;
    int i = 0;

    while (fgets(buff, sizeof buff, fp))
    {
        row++;
        count++;

        /* grow through a temporary so the old block is not lost on failure */
        dict *grown = realloc(values, sizeof(dict) * count);
        if (grown == NULL)
        {
            perror("realloc");
            free(values);
            fclose(fp);
            return 1;
        }
        values = grown;

        if (row == 1)
        {
            continue;   /* header line */
        }

        /* start from an empty record so short lines leave empty strings */
        memset(&values[i], 0, sizeof values[i]);

        /* quoted title: cut at the closing quote; otherwise at the comma.
           \r and \n are delimiters too, so the last field loses its newline */
        char *field = strtok(buff, buff[0] == '"' ? "\"" : ",\r\n");
        int column = 0;
        while (field)
        {
            /* snprintf truncates over-long fields instead of overflowing */
            switch (column)
            {
            case 0:
                snprintf(values[i].title, sizeof values[i].title, "%s", field);
                break;
            case 1:
                snprintf(values[i].platform, sizeof values[i].platform, "%s", field);
                break;
            case 2:
                snprintf(values[i].Score, sizeof values[i].Score, "%s", field);
                break;
            case 3:
                snprintf(values[i].release_year, sizeof values[i].release_year, "%s", field);
                break;
            default:
                break;  /* extra columns are ignored */
            }
            field = strtok(NULL, ",\r\n");
            column++;
        }
        i++;
    }
    fclose(fp);
    printf("File loaded!\n");

    /* NOTE(review): sort() and printValues() take no element count, so they
       rely on the file providing as many records as they iterate over -
       confirm against the data file. */
    sort(values);
    printValues(values);
    free(values);
    return 0;
}
The problem i am facing is that the CSV file's Title field has commas in it and it thus differentiates the data separated by the commas as different columns which gives an error in loading the data in the struct.
Here are two example lines of the input file. Quotes are used when the title contains commas.
"The Chronicles of Narnia: The Lion, The Witch and The Wardrobe",PlayStation 2,8,2005
The Chronicles of Narnia: Prince Caspian,Wireless,5,2008
Any suggestions? Thanks in advance.
Since quotes are used for the title field when it contains commas, I suggest you check to see if the " has been used. If so, use that delimiter for the first item.
char *field;
if(buff[0] == '"') {
field = strtok(buff, "\"");
}
else {
field = strtok(buff, ",");
}
The first one will leave a comma as the first character of the next field, but the next strtok will filter that off, since it does not allow "empty" fields.
The function strtok does not suit your needs, because it considers the quotation marks as characters like any other. Therefore, when strtok sees a comma, it won't care whether the comma is inside quotation marks or not.
Also, as someone else pointed out in the comments section, another problem with strtok is that it skips empty fields.
Therefore, I do not recommend using strtok for what you want to do.
In order to solve your problem, I recommend that you write your own function that does something very similar to strtok and strsep, but if the first non-whitespace character is a quotation mark, it considers the next quotation mark as the delimiter instead of the next comma. In the code below, I named this function my_strsep.
Here is an example:
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#define NUM_LINES 2
//Works like the POSIX function "strsep" with "," as the only delimiter,
//with one exception: if the first non-whitespace character of the field
//is a quotation mark, that mark is skipped, the field runs to the next
//quotation mark, and the comma following it is consumed as well.
//
//On return *stringp points past the field, or is NULL when the field
//was the last one. Returns NULL if *stringp is NULL on entry.
char *my_strsep( char **restrict stringp )
{
    char *cursor = *stringp;
    char *field;
    int quoted;

    //nothing left to split
    if ( cursor == NULL )
        return NULL;

    //skip leading whitespace
    while ( isspace( (unsigned char)*cursor ) )
        cursor++;

    //a leading quotation mark switches the terminator and is not part
    //of the field
    quoted = ( *cursor == '"' );
    if ( quoted )
        cursor++;

    field = cursor;

    //scan for the terminator
    for ( ;; cursor++ )
    {
        if ( *cursor == '\0' )
        {
            if ( quoted )
            {
                fprintf( stderr,
                    "Warning: Encountered end of string before the "
                    "second quotation mark!\n"
                );
            }

            //this was the last field
            *stringp = NULL;
            return field;
        }

        if ( *cursor == ( quoted ? '"' : ',' ) )
            break;
    }

    //cut the field and step past the terminator
    *cursor++ = '\0';

    //after a closing quotation mark, also swallow whitespace and the comma
    if ( quoted )
    {
        while ( isspace( (unsigned char)*cursor ) )
            cursor++;

        if ( *cursor == ',' )
            cursor++;
    }

    *stringp = cursor;
    return field;
}
//Demo driver: splits each sample line with my_strsep and prints the fields.
int main( void )
{
//writable copies of the sample lines; my_strsep modifies its input
char lines[NUM_LINES][200] = {
"\"The Chronicles of Narnia: The Lion, The Witch and The Wardrobe\",PlayStation 2,8,2005",
"The Chronicles of Narnia: Prince Caspian,Wireless,5,2008"
};
for ( int i = 0; i < NUM_LINES; i++ )
{
char *p, *q;
printf( "Processing line #%d:\n", i + 1 );
p = lines[i];
//my_strsep advances p and returns NULL once p has become NULL
while ( ( q = my_strsep( &p ) ) != NULL )
{
printf( "Found field: %s\n", q );
}
printf( "\n" );
}
}
This program has the following output:
Processing line #1:
Found field: The Chronicles of Narnia: The Lion, The Witch and The Wardrobe
Found field: PlayStation 2
Found field: 8
Found field: 2005
Processing line #2:
Found field: The Chronicles of Narnia: Prince Caspian
Found field: Wireless
Found field: 5
Found field: 2008
As you can see, the function my_strsep can handle fields both with and without quotation marks.
I have a function and input like this:
"{ \'Carl Weber Maria von\', \'weber\', 2 }\n"
"{ \'Carl-Maria von Weber\', \'weber\', 4 }\n"
"{ \'Chuck Norris\', \'norrischuck\', 100 }";
It creates output like this:
Chuck Norris
,
norrischuck
,
100
}{
How can I make sure that characters such as {, } and , are not stored in my fields? So far I have not managed to modify this function to achieve the desired result. Thank you in advance for any answers. The function:
/* True for the punctuation that separates fields in a record: '{', '}', ','. */
static int is_record_separator(char c)
{
    return c == '{' || c == '}' || c == ',';
}

/*
 * Reads text from fp up to the first double quote (or end of input) and
 * splits it into tokens stored in a local table.
 *
 * Tokens are separated by whitespace and by the record punctuation '{', '}'
 * and ','; none of those characters is stored. Text inside single or double
 * quotes is kept as one token without the quotes.
 *
 * Returns the number of tokens found, or 0 if nothing could be read.
 *
 * NOTE(review): l is not used and the token table is local to this call, so
 * the parsed tokens still have to be copied into the student list.
 * NOTE(review): the token table occupies about 1 MB of stack.
 */
int fillPoints (FILE *fp, TSTUDENTLIST *l)
{
    char a[2000];
    char *src = a;
    int count = 1000;
    char output[1000][1000];
    int i = 0;

    (void) l;

    /* the field width keeps fscanf inside a[]; on failure a[] is unset */
    if (fscanf(fp, "%1999[^\"]", src) != 1)
        return 0;

    while (i < count) {
        const char *start;
        int len;

        /* skip whitespace and record punctuation between tokens */
        while (isspace((unsigned char)*src) || is_record_separator(*src))
            src++;
        if (*src == '\0')
            break;

        if (*src == '\'' || *src == '\"') {
            /* quoted token: runs to the matching quote, quotes excluded */
            char quote[2] = { *src, '\0' };

            start = ++src;
            len = (int) strcspn(src, quote);
            src += len;
            if (*src == quote[0])
                src++;
        } else {
            /* bare token: ends at whitespace or record punctuation */
            start = src;
            len = (int) strcspn(src, " \t\f\v\r\n{},");
            src += len;
        }
        snprintf(output[i], sizeof(output[i]), "%.*s", len, start);
        i++;
    }
    return i;
}
For simplicity I suggest you do it in multiple passes over each line, where each pass copies part of the line into temporary array.
For example an initial pass to remove a trailing semi-colon if one exists (for this you don't actually need to copy). Then a pass to copy all but the opening and closing double-quotes ". Then one pass for the braces. And one pass for the back-slashes (and the n in \n). And a last pass for the single quotes.
All that should leave you with something like "Chuck Norris, norrischuck, 100". And this can be fed to strtok to "tokenize" on the comma, and you simply call it twice to get the three separate strings "Chuck Norris", "norrischuck", and "100". The last you could pass to strtoul to convert to an integer.
You can of course combine all the passes into a single pass once you get the long multi-pass solution working.
When I say you make a "pass" over the input, I mean you iterate over the string, copying all but the unwanted characters to a new temporary array.
For example:
// Previous pass puts its output in pass_1_output
// Pass to remove double-quotes
char pass_2_output[1000] = { 0 };  // Zero-initialize, which is the string terminator
// The bound on out always leaves the final zero byte in place as terminator
for (size_t in = 0, out = 0;
     pass_1_output[in] != '\0' && out < sizeof pass_2_output - 1;
     ++in)
{
    if (pass_1_output[in] != '"')
    {
        // Not a double-quote, copy the input to the output
        pass_2_output[out++] = pass_1_output[in];
    }
}
// After the above loop, pass_2_output will contain the same contents as
// pass_1_output, *except* any double-quotes
OK... first question so please forgive me if it isn't quite understandable the first go.
I am attempting to parse a string input to stdin through a couple of different conditions.
Example input string: move this into "tokens that I need" \n
I would like to parse this into tokens as:
Token 1 = move
Token 2 = this
Token 3 = into
Token 4 = tokens that I need
Where the tokens are by whitespace (easy enough) until a quote is encountered, then everything inside of the open and close quotes is treated as a single token.
I've tried several different methods, but I unfortunately feel that I may be in over my head here so any help would be greatly appreciated.
My latest attempt:
fgets(input, BUFLEN, stdin); //gets the input
input[strlen(input)-1] = '\0';//removes the new line
printf("Input string = %s\n",input);//Just prints it out for me to see
char *token = strtok(input,delim);//Tokenizes the input, which unfortunately does not do what I need. delim is just my string of delimiters which currently only has a " " in it.
I tried to scan through the string one character at a time and then place those characters into arrays so that I could have them as I wanted, but that failed miserably.
The ultimate solution with customized version of my_strtok_r is here. This solution has advantage over solution with non re-entrant: strtok.
my_strtok_r is re-entrant: you can call them from multiple threads simultaneously, or in nested loops, et cetera.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Re-entrant tokenizer with two delimiter sets.
 *
 * s         - string to scan, or NULL to continue from *save_ptr.
 * delim1    - ordinary separators (e.g. " ").
 * delim2    - grouping characters (e.g. "\""): a token that starts with one
 *             of them runs to the next delim2 character, so it may contain
 *             delim1 characters.
 * save_ptr  - receives the position at which the next call resumes.
 *
 * The scanned string is modified: the character ending each token is
 * overwritten with '\0'. Returns the token, or NULL when none remain.
 */
char * my_strtok_r(char *s, const char *delim1, const char *delim2, char **save_ptr)
{
    char *end;
    size_t quote_run;
    int delim2found = 0;

    if (s == NULL)
        s = *save_ptr;

    if (*s == '\0') {
        *save_ptr = s;
        return NULL;
    }

    /* Skip ordinary separators first and only then look for a grouping
       character. Comparing the two leading runs measured from the same
       position missed a quote that followed one or more separators. */
    s += strspn(s, delim1);
    quote_run = strspn(s, delim2);
    if (quote_run > 0) {
        s += quote_run;
        delim2found = 1;
    }

    if (*s == '\0') {
        *save_ptr = s;
        return NULL;
    }

    /* Find the end of the token. */
    end = s + strcspn(s, delim2found ? delim2 : delim1);

    if (*end == '\0') {
        *save_ptr = end;
        return s;
    }

    /* Terminate the token and make *save_ptr point past it. */
    *end = '\0';
    *save_ptr = end + 1;
    return s;
}
/* Demo driver: tokenizes a sample string with my_strtok_r and prints the tokens. */
int main (void)
{
char str[] = " 123 abc \"SPLITTING WORKS\" yes! \"GREAT WE HAVE A SOLUTION\" ! ";
char *d1 = " ";
char *d2 = "\"";
char *token;
char *rest = str;
/* room for 20 tokens of up to 79 characters; the loop below does not check either limit */
char array[20][80];
printf ("Splitting string \"%s\" into tokens:\n",str);
size_t nr_of_tokens = 0;
/* rest is both the scan position passed in and the resume position written back */
while ((token = my_strtok_r(rest, d1, d2, &rest)))
{
strcpy (array[nr_of_tokens], token);
nr_of_tokens++;
}
for(int i=0; i < nr_of_tokens; i++)
printf ("%s\n",array[i]);
return 0;
}
Test:
Splitting string " 123 abc "SPLITING WORKS" yes! "GREAT WE HAVE A SOLUTION" ! " into tokens:
123
abc
SPLITTING WORKS
yes!
GREAT WE HAVE A SOLUTION
!
This is another solution (fully tested) which you can use. You can mix any number of tokens delimited by white spaces and '\"'. It can be configured to your needs. Extensive explanations are given in the code itself.
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include <ctype.h>
/*
 * Copies the first segment of input_str into output_str.
 *
 * Leading whitespace is skipped. Two kinds of segment are recognised:
 *   1. A plain segment: runs up to the next whitespace character, the next
 *      extDel, or the end of the string.
 *   2. A delimited segment: starts at extDel (typically '"') and runs up to
 *      the next extDel. It may contain whitespace; the two extDel characters
 *      are not copied.
 *
 * extDel must not be whitespace or '\0'. It cannot appear inside a segment:
 * its first appearance always starts a new segment.
 *
 * output_str must be large enough for the segment plus its terminator; no
 * length check is made here.
 *
 * Returns a pointer to the first character after the segment (after the
 * closing extDel for a delimited segment), or NULL when
 *   - input_str or output_str is NULL,
 *   - input_str is empty or contains only whitespace (output_str is then
 *     set to the empty string), or
 *   - an opening extDel is followed directly by the end of the string.
 * If the closing extDel is missing, a warning is printed and everything up
 * to the end of the string is copied.
 *
 * Example:
 *   input_str = " qwerty\"123 45\" \"samuel\" G7 ", extDel = '"'
 *   -> output_str = "qwerty", return value points at the first '"'.
 */
char *get_str_segment(char *output_str, char *input_str, char extDel)
{
    char *s = input_str;
    char *d = output_str;

    if (!s || !d)
        return NULL;

    /* Skip leading whitespace. The casts keep isspace() defined for chars
       with negative values. */
    while (isspace((unsigned char)*s))
        ++s;

    if (*s == '\0')
    {
        /* nothing but whitespace: report "no segment" */
        *d = '\0';
        return NULL;
    }

    if (*s != extDel)
    {
        /* plain segment: copy up to end of string, whitespace or extDel */
        while (*s != '\0' && !isspace((unsigned char)*s) && *s != extDel)
            *d++ = *s++;
        *d = '\0';
        return s;
    }

    /* delimited segment */
    ++s;                                /* skip the opening extDel */
    while (*s != '\0' && *s != extDel)
        *d++ = *s++;
    *d = '\0';

    if (*s == extDel)
        return s + 1;                   /* skip the closing extDel */

    /* unbalanced extDel */
    printf("WARNING:get_str_segment: unbalanced '%c' encountered!\n",extDel);

    /* an unbalanced segment must hold at least one character to count */
    return (d == output_str) ? NULL : s;
}
/*
 * Splits line into segments with get_str_segment() and stores them in table.
 *
 * table is addressed as firstDim rows of secondDim bytes each: segment k is
 * written at table + k * secondDim. At most firstDim segments are stored.
 *
 * Returns the number of segments stored, or -1 if table or line is NULL.
 */
int parse_line_to_table(int firstDim, int secondDim, char *table, char * line, char separator)
{
    int filled = 0;
    char *cursor = line;

    if (table == NULL || line == NULL)
        return -1;

    while (filled < firstDim)
    {
        cursor = get_str_segment(table + filled * secondDim, cursor, separator);
        if (cursor == NULL)
            break;          /* no further segment; this row is not counted */
        filled++;
    }

    return filled;
}
/* Demo driver: splits one sample line into a 20 x 80 table and prints the rows. */
int main(void)
{
    char table[20][80];
    char line[] = "move this into \"tokens that I need\"";

    /* parse_line_to_table takes a flat char *, so pass the address of the
       first byte; passing table itself has type char (*)[80] */
    int ret = parse_line_to_table(20, 80, &table[0][0], line, '\"');

    for(int i = 0; i < ret; i++ )
        printf("%s\n",table[i]);

    return 0;
}
Output:
move
this
into
tokens that I need
I have a string of 80 chars (line from .txt file)
Somewhere at the end of the string I have numbers or strings or chars and "," (comma) between them. I need to delete these spaces around "," so I will be able to get them by strtok().
Any ideas ?
For example :
String : " name: today 12 ,r ,ab, 5 , seven"<br>
I need : " name: today 12,r,ab,5,seven"
You can apply this algorithm ::
Find the element , in this case a space.
Replace the element with an element of your choice, in this case an empty character.
This function might come handy for replacing any character to a string. You might add the char *replace function as a snippet and use it later for similar purposes.
/*
 * Returns a newly allocated copy of the_input_string in which every
 * occurrence of the_character is replaced by replacing_string (which may be
 * empty, removing the character).
 *
 * The caller owns the result and must free() it. Returns NULL if memory
 * cannot be allocated or the result size would overflow.
 */
char *replace(const char *the_input_string, char the_character,
              const char *replacing_string)
{
    const char *t;
    size_t count = 0;
    size_t in_len = strlen(the_input_string);
    size_t rlen = strlen(replacing_string);
    size_t out_len;
    char *res;
    char *ptr;

    for (t = the_input_string; *t; t++)
        count += (*t == the_character);

    /* each hit drops one character and adds rlen; check before multiplying */
    out_len = in_len - count;
    if (rlen != 0 && count > ((size_t)-1 - out_len - 1) / rlen)
        return NULL;
    out_len += rlen * count;

    res = malloc(out_len + 1);
    if (res == NULL)
        return NULL;

    ptr = res;
    for (t = the_input_string; *t; t++)
    {
        if (*t == the_character)
        {
            memcpy(ptr, replacing_string, rlen);
            ptr += rlen;
        }
        else
            *ptr++ = *t;
    }
    *ptr = 0;
    return res;
}
Driver Program ::
/* Driver: removes every space from a sample string via replace() and prints the result. */
int main(int argc, char const *argv[])
{
/* replace() returns malloc'd memory; it is not freed before the program exits */
const char *s = replace("name: today 12 ,r ,ab, 5 , seven", ' ', "");
printf("%s\n", s);
return 0;
}
Please refer to this link: the code there is very similar, but prefer the version above, because the solution given there may produce errors or warnings.
Because the resulting string will be shorter then the original string, you can do the replacement in place: When you find a comma, copy it and skip the following space. To treat the space before the comma, keep track of the first space after the last non-space character and skip that, too if necessary:
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
/*
 * Removes, in place, the whitespace on either side of every comma in str,
 * together with all leading and trailing whitespace. Whitespace elsewhere
 * in the string is kept.
 */
void remspc(char *str)
{
    size_t rd = 0;      /* next character to read */
    size_t wr = 0;      /* next position to write */
    size_t keep = 0;    /* position just after the last non-space written */

    /* skip leading white space */
    while (isspace((unsigned char) str[rd]))
        rd++;

    for (;;) {
        char ch = str[rd];

        if (ch == '\0')
            break;
        rd++;

        if (ch == ',') {
            /* back up over any space written before the comma */
            str[keep] = ch;
            keep = keep + 1;
            wr = keep;

            /* skip space after the comma */
            while (isspace((unsigned char) str[rd]))
                rd++;
        } else {
            str[wr] = ch;
            if (!isspace((unsigned char) ch))
                keep = wr + 1;
            wr++;
        }
    }

    /* cutting at keep also drops trailing space */
    str[keep] = '\0';
}
/* Demo driver: runs remspc() on a sample string and prints the result. */
int main(void)
{
/* an array, not a string literal, because remspc() edits the text in place */
char str[] = " name: today 12, r ,ab, , 5 , seven";
remspc(str);
puts(str);
return 0;
}
This solution will run commas that are separated by white space together, which may lead to problems with strtok, because it will consider stretches of commas as a single delimiter.
You may give this a try!
Replace with your code where necessary.
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Demo: prints the sample line with every whitespace character removed. */
int main()
{
    char line[] = "name: today 12 ,r ,ab, 5 , seven";
    size_t length = strlen(line);
    char line2[sizeof line];    /* can never need more room than the input */
    size_t kept = 0;            /* number of characters written to line2 */

    /* A separate write index packs the kept characters together; writing
       at the read index left uninitialized gaps that were then printed. */
    for (size_t i = 0; i < length; i++) {
        if (!isspace((unsigned char) line[i])) {
            line2[kept++] = line[i];
        }
    }
    line2[kept] = '\0';

    fputs(line2, stdout);
    return 0;
}
This question already has answers here:
How do I trim leading/trailing whitespace in a standard way?
(40 answers)
Closed 5 years ago.
Briefly:
I'm after the equivalent of .NET's String.Trim in C using the win32 and standard C api (compiling with MSVC2008 so I have access to all the C++ stuff if needed, but I am just trying to trim a char*).
Given that there is strchr, strtok, and all manner of other string functions, surely there should be a trim function, or one that can be repurposed...
Thanks
There is no standard library function to do this, but it's not too hard to roll your own. There is an existing question on SO about doing this that was answered with source code.
This made me want to write my own - I didn't like the ones that had been provided. Seems to me there should be 3 functions.
/*
 * Returns a pointer to the first non-whitespace character of s.
 * s itself is not modified; the result points into s.
 */
char *ltrim(char *s)
{
    /* the cast keeps isspace() defined for chars with negative values */
    while(isspace((unsigned char)*s)) s++;
    return s;
}
/*
 * Removes trailing whitespace from s in place and returns s.
 * Empty and all-whitespace strings become "" without reading before s.
 */
char *rtrim(char *s)
{
    char *back = s + strlen(s);

    /* stop at the start of the buffer instead of stepping in front of it */
    while(back > s && isspace((unsigned char)back[-1])) back--;
    *back = '\0';
    return s;
}
/*
 * Trims both ends of s: skips the leading whitespace, then cuts the trailing
 * whitespace. The result points into s at its first non-whitespace character.
 */
char *trim(char *s)
{
    char *body = ltrim(s);

    return rtrim(body);
}
You can use the standard isspace() function in ctype.h to achieve this. Simply compare the beginning and end characters of your character array until both ends no longer have spaces.
"spaces" include:
' ' (0x20) space (SPC)
'\t' (0x09) horizontal tab (TAB)
'\n' (0x0a) newline (LF)
'\v' (0x0b) vertical tab (VT)
'\f' (0x0c) feed (FF)
'\r' (0x0d) carriage return (CR)
although there is no function which will do all of the work for you, you will have to roll your own solution to compare each side of the given character array repeatedly until no spaces remain.
Edit:
Since you have access to C++, Boost has a trim implementation waiting for you to make your life a lot easier.
Surprised to see such implementations. I usually do trim like this:
/*
 * Removes trailing whitespace from s in place and returns s.
 * Leading whitespace is left untouched. Returns NULL if s is NULL.
 */
char *trim(char *s) {
    size_t len;

    if (!s)
        return NULL;        // handle NULL string

    // Walk back by length, so no pointer in front of s is ever formed.
    len = strlen(s);
    while (len > 0 && isspace((unsigned char)s[len - 1]))
        len--;
    s[len] = '\0';
    return s;
}
It is fast and reliable - serves me many years.
/*
 * Removes whitespace on both sides of s in place, then prints the trimmed
 * string followed by a newline.
 */
void trim (char *s)
{
    char *first = s;
    size_t len;

    /* skip left side white spaces */
    while (isspace ((unsigned char) *first)) first++;

    /* drop right side white spaces, never scanning in front of first */
    len = strlen (first);
    while (len > 0 && isspace ((unsigned char) first[len - 1])) len--;

    /* shift the kept text to the start so the caller sees the left trim too */
    memmove (s, first, len);
    s[len] = '\0';

    printf ("%s\n", s);
}
#include "stdafx.h"
#include <string.h>
#include <ctype.h>
char* trim(char* input);
/* Test driver: runs trim() over ten sample strings and prints each result with a length. */
/* NOTE(review): strlen() returns size_t but is printed with %d here. */
/* NOTE(review): trim() and strlen() are arguments of the same printf call, so
   the order in which they are evaluated is not specified by the language. */
int _tmain(int argc, _TCHAR* argv[])
{
char sz1[]=" MQRFH ";
char sz2[]=" MQRFH";
char sz3[]=" MQR FH";
char sz4[]="MQRFH ";
char sz5[]="MQRFH";
char sz6[]="M";
char sz7[]="M ";
char sz8[]=" M";
char sz9[]="";
char sz10[]=" ";
printf("sz1:[%s] %d\n",trim(sz1), strlen(sz1));
printf("sz2:[%s] %d\n",trim(sz2), strlen(sz2));
printf("sz3:[%s] %d\n",trim(sz3), strlen(sz3));
printf("sz4:[%s] %d\n",trim(sz4), strlen(sz4));
printf("sz5:[%s] %d\n",trim(sz5), strlen(sz5));
printf("sz6:[%s] %d\n",trim(sz6), strlen(sz6));
printf("sz7:[%s] %d\n",trim(sz7), strlen(sz7));
printf("sz8:[%s] %d\n",trim(sz8), strlen(sz8));
printf("sz9:[%s] %d\n",trim(sz9), strlen(sz9));
printf("sz10:[%s] %d\n",trim(sz10), strlen(sz10));
return 0;
}
/* Skips leading whitespace: returns the address of the first non-space character in s. */
char *ltrim(char *s)
{
    /* isspace() needs an unsigned char value; plain char may be negative */
    while (isspace((unsigned char)*s))
        s++;
    return s;
}
/*
 * Cuts trailing whitespace off s in place and returns s.
 * An all-whitespace string becomes "", the same as an empty one; the scan
 * never goes below index 0.
 */
char *rtrim(char *s)
{
    size_t len = strlen(s);

    while (len > 0 && isspace((unsigned char)s[len - 1]))
        len--;
    s[len] = '\0';
    return s;
}
/* Trims whitespace from both ends of input; the result points into the same buffer. */
char *trim(char *s)
{
    char *trimmed_left = ltrim(s);
    char *result = rtrim(trimmed_left);

    return result;
}
Output:
sz1:[MQRFH] 9
sz2:[MQRFH] 6
sz3:[MQR FH] 8
sz4:[MQRFH] 7
sz5:[MQRFH] 5
sz6:[M] 1
sz7:[M] 2
sz8:[M] 2
sz9:[] 0
sz10:[] 8
I like it when the return value always equals the argument. This way, if the string array has been allocated with malloc(), it can safely be free() again.
/*
 * Remove leading whitespaces.
 * The remaining text is shifted to the start of s, so the return value is
 * always s itself. NULL and empty strings are returned unchanged.
 */
char *ltrim(char *const s)
{
    if(s && *s) {
        const char *cur = s;

        /* cast: isspace() is only defined for unsigned char values and EOF */
        while(isspace((unsigned char)*cur))
            ++cur;

        /* the regions overlap, hence memmove; +1 carries the terminator */
        if(cur != s)
            memmove(s, cur, strlen(cur) + 1);
    }

    return s;
}
/*
 * Remove trailing whitespaces.
 * s is cut in place and returned. NULL and empty strings are returned
 * unchanged.
 */
char *rtrim(char *const s)
{
    if(s && *s) {
        size_t len = strlen(s);

        /* cast: isspace() is only defined for unsigned char values and EOF */
        while(len > 0 && isspace((unsigned char)s[len - 1]))
            --len;

        s[len] = '\0';
    }

    return s;
}
/*
 * Remove leading and trailing whitespaces.
 * The trailing side is trimmed first and the leading side second; both steps
 * hand back s, which is also this function's return value.
 */
char *trim(char *const s)
{
    return ltrim(rtrim(s));
}
/*
 * Remove leading ' ' characters from str in place.
 *
 * Only the space character is stripped (tabs and newlines are kept).
 * The text is shifted within the caller's buffer, so no scratch copy is
 * needed and the length of the input is not limited by a local array.
 */
void ltrim(char str[PATH_MAX])
{
    size_t lead = 0;

    while (str[lead] == ' ')
        lead++;

    /* Source and destination overlap, hence memmove; +1 copies the '\0'. */
    if (lead > 0)
        memmove(str, str + lead, strlen(str + lead) + 1);
}
/*
 * Trim whitespace from both ends of str in place.
 *
 * The right side is cut first by writing a new terminator; the left side is
 * then removed by shifting the remaining text to the start of the buffer.
 */
static inline void ut_trim(char * str) {
    char * start = str;
    char * end = str + strlen(str);    /* one past the last kept character */

    /*
     * Trim right. Looking at end[-1] keeps `end` inside the array even when
     * the whole string is whitespace (no pointer before str is formed).
     */
    while (end > start && isspace((unsigned char)end[-1]))
        end--;
    *end = '\0';

    /* Trim left; stops at the terminator at the latest. */
    while (isspace((unsigned char)*start))
        start++;

    if (start != str)                  /* leading whitespace was skipped */
        memmove(str, start, (size_t)(end - start) + 1);
}
How about this... It only requires one iteration over the string (it doesn't use strlen, which would iterate over the string a second time). When the function returns, you get a pointer to the start of the trimmed string, which is null-terminated. Leading whitespace is skipped (up to the first non-space character), and all trailing whitespace after the last non-space character is cut off.
/*
 * Whitespace test used by trim(): true for space, newline, tab, form feed
 * and carriage return; false for everything else (including '\0').
 * Defined ahead of trim() so that it is declared before its first use.
 */
bool isSpace(char c) {
    switch (c) {
    case ' ':
    case '\n':
    case '\t':
    case '\f':
    case '\r':
        return true;
    default:
        return false;
    }
}

/*
 * Trim leading and trailing whitespace in a single pass over the string.
 *
 * Leading whitespace is skipped (not moved), so the returned pointer points
 * into `input` and may differ from it. Trailing whitespace is removed by
 * writing '\0' directly after the last non-space character.
 */
char* trim(char* input) {
    char* start = input;
    while (isSpace(*start)) { //trim left
        start++;
    }

    /*
     * `end` is the position just past the last non-space character seen.
     * The current character is tested BEFORE advancing, so the terminator
     * itself never counts as a non-space character.
     */
    char* end = start;
    for (char* ptr = start; *ptr != '\0'; ptr++) { //trim right
        if (!isSpace(*ptr)) {
            end = ptr + 1;
        }
    }
    *end = '\0'; //terminate the trimmed string with a null
    return start;
}
/*
 * Strip repeated occurrences of the character `ch` from the ends of szStr,
 * in place.
 *
 * iMode: 0 = both ends, 1 = left only, 2 = right only
 *        (any value other than 1 or 2 behaves like 0).
 *
 * Returns szStr, or NULL when szStr is NULL. The work is done directly in
 * the caller's buffer, so the input length is not limited by a fixed-size
 * temporary copy.
 */
char* Trim(char* szStr,const char ch, int iMode)
{
    if (szStr == NULL)
        return NULL;

    size_t iLen = strlen(szStr);
    size_t iHead = 0;       /* index of the first character to keep */
    size_t iTail = iLen;    /* one past the last character to keep  */

    if (iMode != 2) {
        while (iHead < iLen && szStr[iHead] == ch)
            ++iHead;
    }
    if (iMode != 1) {
        while (iTail > iHead && szStr[iTail - 1] == ch)
            --iTail;
    }

    /* Overlapping move of the kept middle part to the front. */
    if (iHead > 0)
        memmove(szStr, szStr + iHead, iTail - iHead);
    szStr[iTail - iHead] = '\0';
    return szStr;
}
Here's my implementation, behaving like the built-in string functions in libc (that is, it expects a c-string, it modifies it and returns it to the caller).
It trims leading spaces & shifts the remaining chars to the left, as it parses the string from left to right. It then marks a new end of string and starts parsing it backwards, replacing trailing spaces with '\0's until it finds either a non-space char or the start of the string. I believe those are the minimum possible iterations for this particular task.
// ----------------------------------------------------------------------------
// trim leading & trailing whitespace from string s (return modified string s)
// alg:
// - skip leading whitespace, via cp1
// - shift remaining *cp1's to the left, via cp2
// - mark a new end of string
// - replace trailing whitespace with '\0', via cp2
// - return the trimmed s
//
// note: cp2 always stays inside s (it is never moved in front of s[0]), and
// every character is converted to unsigned char before isspace(), since the
// <ctype.h> functions are undefined for negative char values.
//
char *s_trim(char *s)
{
    char *cp1;                          // for parsing the whole s
    char *cp2;                          // for shifting & padding

    // skip leading whitespace, via cp1
    for (cp1 = s; isspace((unsigned char)*cp1); cp1++)
        ;

    // shift left remaining chars, via cp2
    for (cp2 = s; *cp1; cp1++, cp2++)
        *cp2 = *cp1;
    *cp2 = 0;                           // mark new end of string for s

    // replace trailing whitespace with '\0'; cp2 is one past the char tested
    while (cp2 > s && isspace((unsigned char)cp2[-1]))
        *--cp2 = 0;                     // pad with '\0's

    return s;
}
Not the best way but it works
// Returns a newly allocated copy of str with ALL whitespace characters
// removed (leading, trailing and embedded). str itself is not modified.
//
// The result is allocated with new[]; the caller owns it and must release
// it with delete[].
char* Trim(char* str)
{
    size_t len = strlen(str);
    // +1: room for the terminator even when no character is dropped.
    char* buff = new char[len + 1];
    size_t out = 0;

    // Test the terminator before reading a character, so an empty input
    // is handled without touching memory past its end.
    for (const char* p = str; *p != '\0'; ++p) {
        // Cast: <ctype.h> functions are undefined for negative char values.
        if (!isspace((unsigned char)*p))
            buff[out++] = *p;
    }
    buff[out] = '\0';
    return buff;
}
/*
 * Remove every whitespace character (leading, trailing and embedded) from
 * str, in place.
 *
 * Single pass: `src` reads each character once and `dst` marks where the
 * next kept character goes, instead of shifting the whole tail left for
 * every whitespace character found. Nothing is written until the first
 * whitespace character has been dropped, so a string without whitespace is
 * left untouched.
 */
void inPlaceStrTrim(char* str) {
    const char *src = str;
    char *dst = str;

    for (; *src != '\0'; src++) {
        /* Cast: <ctype.h> functions are undefined for negative char values. */
        if (isspace((unsigned char)*src)) {
            continue;
        }
        if (dst != src) {
            *dst = *src;
        }
        dst++;
    }
    if (dst != src) {
        *dst = '\0';
    }
}
Easiest thing to do is a simple loop. I'm going to assume that you want the trimmed string returned in place.
/*
 * Remove every whitespace character (leading, trailing and embedded) from
 * s, in place, and return s.
 *
 * The kept characters are compacted towards the front of the same buffer
 * and a terminator is written right after them, so no temporary allocation
 * is needed and no stale tail of the old string remains.
 */
char *
strTrim(char * s){
    size_t ix;          /* read position  */
    size_t jx = 0;      /* write position */

    for (ix = 0; s[ix] != '\0'; ix++) {
        /* Cast: <ctype.h> functions are undefined for negative char values. */
        if (!isspace((unsigned char)s[ix]))
            s[jx++] = s[ix];
    }
    s[jx] = '\0';
    return s;           /* modifies s in place *and* returns it */
}
This gets rid of embedded blanks too; if String.Trim doesn't, then it needs a bit more logic.