program adds on a string seemingly out of nowhere

program adds on a string seemingly out of nowhere - arrays

I'm writing a code that reads in an input file from stdin, and outputs the exact same content (to stdout) except that replaces any words found in the "dictionary" in the following way, in the exact order:
if the word exactly as it is stored as a key in dictionary appears, then find the corresponding value pair and print that out instead.
if the word that's capitalized properly (eg., Thomas, with capital first letter and lowercase for everything else) is a valid key in the dictionary, print out the corresponding value pair instead
if the lowercased version is a valid key, print out its corresponding value
if there are no matches, just print things out as they are.
(All non-alphabetical characters are just printed out "normally".)
A problem I've been having though is that when I'm doing (2), somehow a character ('U') gets tagged on to the end of the "string" or copy2 array when I'm testing "IPSUM" (all cap).
For instance, see this output:
My outputs are in the lines with "<", and the ">" indicate what should've been. Based on the order I'm checking things, since IPSUM is not in the dictionary (see the end of this post for contents of dictionary), it goes to (2) where IPSUM should become Ipsum, and it should print out the corresponding value of Ipsum. But instead I get IpsumU, and so the dictionary doesn't recognize the word. But I'm not sure where the 'U' is coming from, since the input is exactly
IPSUM (all cap).
Could anyone help me figure out what might be wrong with my code?
//for reference:
typedef struct HashBucketEntry {
void *key;
void *data;
struct HashBucketEntry *next;
} HashBucketEntry;
typedef struct HashTable {
int size;
unsigned int (*hashFunction)(void *);
int (*equalFunction)(void*, void*);
HashBucketEntry **buckets;
} HashTable;
//We have a Hashtable *dictionary.
void processInput() {
//char c;
int c;
int i = 0;
//char * word = (char *) malloc(60 * sizeof(char));
char word[60];
while (c = getchar()) {
if (isalpha(c)) {
word[i] = c;
i++;
} else {
word[i] = '\0';
if (word[0] != '\0') {
//char * copy = (char *) malloc(60 * sizeof(char));
char copy[60];
strcpy(copy, word);
unsigned int location = (dictionary->hashFunction)(copy) % (dictionary->size);
char * word_in_dict;
if (dictionary->buckets[location] != NULL) {
word_in_dict = (char *) dictionary->buckets[location]->data;
} else {
word_in_dict = NULL;
}
char copy2[60];
copy2[0] = toupper(copy[0]);
for(int i = 1; copy[i]; i++){
copy2[i] = tolower(copy[i]);
}
unsigned int location2 = (dictionary->hashFunction)(copy2) % (dictionary->size);
char * word_in_dict2;
if (dictionary->buckets[location2] != NULL) { //somehow this is NULL when IPSUM, even though copy2 has correct string
word_in_dict2 = (char *) dictionary->buckets[location2]->data;
} else {
word_in_dict2 = NULL;
}
char copy3[60];
for(int i = 0; copy[i]; i++){
copy3[i] = tolower(copy[i]);
}
unsigned int location3 = (dictionary->hashFunction)(copy3) % (dictionary->size);
char * word_in_dict3;
if (dictionary->buckets[location3] != NULL) {
word_in_dict3 = (char *) dictionary->buckets[location3]->data;
} else {
word_in_dict3 = NULL;
}
if (word_in_dict != NULL) {
fprintf(stdout, "%s", word_in_dict);
} else if (word_in_dict2 != NULL) {
fprintf(stdout, "%s", word_in_dict2);
} else if (word_in_dict3 != NULL) {
fprintf(stdout, "%s", word_in_dict3);
} else {
//fprintf(stdout, "%s", copy);
printf("%s", copy);
}
putchar(c);
i = 0;
} else if (c != EOF) {
putchar(c);
} else {
break;
}
}
}
}
The dictionary contains only these entries:
ipsum i%##!
fubar fubar
IpSum XXXXX24
Ipsum YYYYY211
Any help would be really appreciated!
Update in response to the answer:
I changed the code for copy2 to this:
for(j = 1; j < strlen(copy); j++) {
if (j < sizeof(copy2)) {
copy2[j] = tolower(copy[j]);
}
}
(and did a similar thing to copy3). The second case works, but now the third case fails; things only seem to work if I change the second case but not the third case. Does anyone know why this is the case?

Your code to creaty modified copies of your input strings, e.g.
char copy2[60];
copy2[0] = toupper(copy[0]);
for(int i = 1; copy[i]; i++){
copy2[i] = tolower(copy[i]);
}
does not copy the terminating '\0'. As automatic variables are not implicitly initialized, the corresponding memory may contain any data (from prevous loop cycles or from unrelated code) which may appear as trailing characters. You must append a '\0' character after the last character of your string.
This error may result in out-of-bounds access to the array when you access it as a string if there is no '\0' within the array bounds. (undefined behavior)
Your code itself might result in out-of-bounds access if the input string is too long. You should add a check to prevent access to array elements at i >= sizeof(copy2).
I suggest something like this:
char copy2[60];
copy2[0] = toupper(copy[0]);
/* avoid reading past the end of an empty string */
if(copy[0]) {
for(int i = 1; copy[i] && (i < sizeof(copy)-1); i++){
copy2[i] = tolower(copy[i]);
}
/* variable i will already be incremented here */
copy2[i] = '\0';
}
Edit as a response to a question in a comment:
You cannot combine strcpy and tolower, but you can copy the string first and modify the characters in-place afterwards.
Example:
char copy2[60];
if(strlen(copy) y sizeof(copy2)) {
strcpy(copy2, copy);
copy2[0] = toupper(copy2[0]);
if(copy[0]) {
/* The length has been checked before, no need to check again here */
for(int i = 1; copy[i]; i++) {
copy2[i] = tolower(copy2[i]);
}
/* the string is already terminated */
}
} else {
/* string too long, handle error */
}
or with truncating instead of reporting an error
char copy2[60];
strncpy(copy2, copy, sizeof(copy)-1);
copy2[sizeof(copy2)-1] = '\0';
copy2[0] = toupper(copy2[0]);
if(copy[0]) {
/* A long string would have been truncated before, no need to check the length here */
for(int i = 1; copy[i]; i++) {
copy2[i] = tolower(copy2[i]);
}
/* the string is already terminated */
}

Related

How to get ASCII code for characters from a text file?

Update, Hello guys Thank you all for the help, my initial approach was wrong and I did not use ASCII codes at all.
Sorry for the late replay I had a half-day off today and made a new post for the complete code
there is no errors but the prgram is not working proberly ( this is an update of old post )
I wrote the program, and it is working with no errors But it is not giving me the results I wanted
My only problem is when I read a character how to check its ASCII and store it.
#include <stdio.h>
#include <string.h>
int main()
{
char dictionary[300];
char ch, temp1, temp2;
FILE *test;
test=fopen("HW2.txt","r");
for(int i=0;i<2000;i+=1)
{ ch=fgetc(test);
printf("%c",ch);
}
}

If we are talking about plain ASCII, values goes from 0 to 127, your table shoud look like:
int dictionary[128] = {0};
Regarding your question:
how to check its ASCII and store it
Consider a char being a tiny int, they are interchangeable and you don't need any conversion.
fgetc wants an int in order to handle EOF, and trying to read 2000 characters from a file containing less than 2000 bytes can have very bad consequences, to read the whole file:
int c;
while ((c = fgetc(test)) != EOF)
{
if ((c > 0) && (c < 128))
{
dictionary[c]++;
}
}
for (int i = 1; i < 128; i++)
{
if (dictionary[i] > 0)
{
printf("%c appeared %d times\n", i, dictionary[i]);
}
}
EDIT:
Rereading, I see that you want to store words, not chars, ok, then it's a bit more difficult but nothing terrible, do not limit yourself to 300 words, use dynamic memory:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
// A struct to hold the words and the
// number of times it appears
struct words
{
size_t count;
char *word;
};
int main(void)
{
FILE *file;
file = fopen("HW2.txt", "r");
// Always check the result of fopen
if (file == NULL)
{
perror("fopen");
exit(EXIT_FAILURE);
}
struct words *words = NULL;
size_t nwords = 0;
char *word = NULL;
size_t nchars = 1;
size_t i;
int c;
// while there is text to scan
while ((c = fgetc(file)) != EOF)
{
if (isspace(c))
{
if (word != NULL)
{
// Search the word in the table
for (i = 0; i < nwords; i++)
{
// Found, increment the counter
if (strcmp(word, words[i].word) == 0)
{
words[i].count++;
free(word);
break;
}
}
// Not found, add the word to the table
if (i == nwords)
{
struct words *temp;
temp = realloc(words, sizeof(*temp) * (nwords + 1));
if (temp == NULL)
{
perror("realloc");
exit(EXIT_FAILURE);
}
words = temp;
words[nwords].word = word;
words[nwords].count = 1;
nwords++;
}
// Prepare the next word
word = NULL;
nchars = 1;
}
}
else
{
char *temp;
temp = realloc(word, nchars + 1);
if (temp == NULL)
{
perror("realloc");
exit(EXIT_FAILURE);
}
word = temp;
word[nchars - 1] = (char)c;
word[nchars++] = '\0';
}
}
for (i = 0; i < nwords; i++)
{
printf("%s appeared %zu times\n", words[i].word, words[i].count);
free(words[i].word);
}
free(words);
fclose(file);
return 0;
}

In C, characters are, essentially, their ASCII code (or rather, their char or unsigned char value). So once you read a character, you have its ASCII code already.
However, fgetc() doesn't always return the character it read for you; it may fail, for which reason it returns an int, not an unsigned char, which will be -1 in case of failure.
So:
You need to define an int variable to take the result of fgetc().
If it's not EOF, you can cast the result back into a unsigned char. That's your character, and it's ASCII value, at the same time.
PS - I'm ignoring non-ASCII characters, non-Latin languages etc. (But C mostly ignores them in its basic standard library functions too.)

Implementing a C function that splits a string on a given character and returns an array of strings after the split (along with length of array)

I'm trying to implement a C function that takes a string and then breaks that string on a certain character and returns back an array of strings after the split along with the size of that array. I'm using a data structure for this since returning a 2D array (the array of strings after the split) and its length is not possible. My code is given below:
struct charArr {
char *arr[10000];
int size;
};
struct charArr *stringSplitter(char *str, char c) {
struct charArr *splitString = (struct charArr *)malloc(sizeof(struct charArr));
if (splitString == NULL) {
fprintf(stderr, "malloc failed\n");
exit(1);
}
splitString->size = 0;
int i = 0;
int j = 0;
while (str[i] != '\0') {
if (str[i] == c) {
splitString->arr[splitString->size][j] = '\0';
(splitString->size)++;
j = 0;
i++;
while (str[i] == c) { /* this loop is to ignore continuous occurrences of the character c */
i++;
}
} else {
splitString->arr[splitString->size][j] = str[i];
i++;
j++;
}
}
splitString->arr[splitString->size][j] = '\0';
return splitString;
}
int main(int argc, char *argv[]) {
// take input from command line
if (argc == 1) {
//buffer to store lines
size_t buffer_size = 128;
char *buffer = malloc(buffer_size * sizeof(char));
if (buffer == NULL) {
fprintf(stderr, "malloc failed\n");
exit(1);
}
// loop continuously till user exits by ctrl+c
while (1) {
printf("Enter Input> ");
getline(&buffer, &buffer_size, stdin);
char *str = strdup(buffer);
struct charArr *splitString = stringSplitter(str, '&');
for (int i = 0; i<splitString->size; i++) {
printf("%s ", splitString->arr[i]);
}
}
}
return 0;
}
On running the code on a simple input like (the input is continuously taken from the command line):
Enter Input> this & that
I expect the output to be:
this that
But, I'm getting the error:
Segmentation fault (core dumped)
If the input is as shown below (i.e; continuous occurrences of the splitting character):
Enter Input> this &&& that
then also the output must be:
this that
Edit: I'm trying to extend this to split a string on multiple delimiters as well (in one go), so instead of char c in the above function, if char *c is passed which is a string of delimiters (example c = " \t\n" to remove all white spaces from given string), then also it should work as expected and return an array of strings after the split and length of array.
For example, if input is (multiple spaces, tabs and newline):
Enter Input> this that
Then the array returned (which is a part of the returned structure) must be of size 2 and only contain the 2 strings - "this" and "that".

Here's a rewrite of your function with the corrections that you need for proper allocation of each found string using strdup():
You can find my modifications preceded with comments that start 'Previously':
struct charArr* stringSplitter(char *str, char c){
struct charArr* splitString = (struct charArr*)malloc(sizeof(struct charArr));
char buffer[ MAX_BUFF ] ;
if(splitString == NULL){
fprintf(stderr, "malloc failed\n");
exit(1);
}
splitString->size = 0;
int i=0;
int j=0;
while(str[i] != '\0'){
if(str[i] == c){
//Previously: splitString->arr[splitString->size][j] = '\0';
splitString->arr[splitString->size] = strndup( buffer , j );
(splitString->size)++;
j = 0;
i++;
while(str[i] == c){ /* this loop is to ignore continuous occurrences of the character c */
i++;
}
} else {
// Previously: splitString->arr[splitString->size][j] = str[i];
buffer[j] = str[i];
i++;
j++;
}
}
//Previously: splitString->arr[splitString->size][j] = '\0';
splitString->arr[splitString->size++] = strndup( buffer , j );
return splitString;
}

It's been a long time since I wrote any C so I thought this would be a challenge. Here's a rewrite of the stringSplitter function.
struct charArr* stringSplitter(char *str, char c){
struct charArr* splitString = (struct charArr*)malloc(sizeof(struct charArr));
if(splitString == NULL){
fprintf(stderr, "malloc failed\n");
exit(1);
}
splitString->size = 0;
char sep[2];
sep[0] = c;
sep[1] = (char) 0;
char* next;
while( (next = strtok( str, sep )) )
{
str = NULL;
splitString->arr[ splitString->size++ ] = next;
}
return splitString;
}
Above, I'm simply using strtok. Take a look at the manpage for strtok() to see it's nuances.

Parsing and data overwriting issues in C using custom strtok

I'm reading in a .csv file, which I then need to parse into tokens. I tried using strtok(), but that unfortunately cannot return null fields (which my data is fulll of). So I went with a home-made version of strtok that I found, strtok_single, which returns the correct values that I need.
The data is input into my array correctly; but there is something wrong because before the initilization loops finish, the data gets overwritten. I've tried print statements and analyzing the problem but I just can't figure out what's wrong. Any insight at all would be helpful.
Here is the homemade strtok function I'm using:
char* strtok_single(char* str, char const* delims) {
static char* src = NULL;
char* p, *ret = 0;
if (str != NULL)
src = str;
if (src == NULL)
return NULL;
if ((p = strpbrk(src, delims)) != NULL) {
*p = 0;
ret = src;
src = ++p;
}
return ret;
}
Here is my code:
int main() {
int numLines = 0;
int ch, i, j;
char tmp[1024];
char* field;
char line[1024];
FILE* fp = fopen("filename.csv", "r");
// count number of lines in file
while ((ch = fgetc(fp)) != EOF) {
if (ch == '\n')
numLines++;
}
fclose(fp);
// Allocate memory for each line in file
char*** activity = malloc(numLines * sizeof(char**));
for (i = 0; i < numLines; i++) {
activity[i] = malloc(42 * sizeof(char*));
for (j = 0; j < 42; j++) {
activity[i][j] = malloc(100 * sizeof(char));
}
}
// read activity file and initilize activity matrix
FILE* stream = fopen("filename.csv", "r");
i = 0;
while (fgets(line, 1024, stream)) {
j = 0;
int newlineLoc = strcspn(line, "\n");
line[newlineLoc] = ',';
strcpy(tmp, line);
field = strtok_single(tmp, ",");
while (field != NULL) {
for (j = 0; j < 42; j++) {
activity[i][j] = field;
field = strtok_single(NULL, ",");
// when I print activity[i][j] here, the values are correct
}
// when I print activity[i][j] here, the values are correct for the
// first iteration
// and then get overwritten by partial data from the next line
}
i++;
} // close while
fclose(stream);
// by the time I get to here my matrix is full of garbage
// some more code that prints the array and frees memory
} // close main

activity[i][j] = field;
When the loops finish, each activity[i][j] points to somewhere in tmp, which is overwritten in each loop. Instead, since you pre-allocate space in each activity[i][j], you should just copy the contents of the string to that:
strcpy(activity[i][j], field);
Being careful of buffer overflow (i.e. if field is more than 99 characters).
Also, the sizeof(char) is superfluous since it's always 1 by definition.

Your line "activity[i][j] = field;" is backwards - you want the pointer assigned to the malloc'd memory.

Does a string contains a word from a list

I have a string and an array of keywords. If the string contains one of the keywords on the list, I want to check if the keyword is the only element of this string. If it's not, I want to return an error. Last thing, the string will always end with \n.
My keyword array is the following:
const char * keywordsTable[] =
{
"INIT",
"BEGIN",
"END",
"ROUTINES",
"ENDROUTINES",
"ENDWHEN",
"WHEN",
"WHILE"
};
For instance if my string is "BEGIN\n", everything is fine. If my string is "BEGIN FOO\n" or "FOO BEGIN\n", I have to return an error. Finally if my string is "BEGINFOO\n", everything is fine. (error code is 1, else it's 0)
I've tried something (I don't know how to proceed):
int CheckKeyword(char * str)
{
int nKeywords = sizeof(keywordsTable) / sizeof(keywordsTable[0]);
char * strTok = NULL;
char * keywrdWithLF = malloc(20);
// I don't want to check for the last two keywords nor the first
for (int i = 1; i < nKeywords - 2; i++)
{
strcpy_s(keywrdWithLF, 20, keywordsTable[i]);
strcat_s(keywrdWithLF, 20, "\n");
strTok = strstr(str, keywrdWithLF);
// If my string contains a keyword
if (strTok != NULL)
{
// If the string contains other characters... and I'm stuck
if (strcmp(str, keywrdWithLF))
{
}
else
{
free(keywrdWithLF);
return 1;
}
}
}
free(keywrdWithLF);
return 0;
}
Thank you in advance (please don't complain bout my indent style, I have to use Whitesmith indent) !

int CheckKeyword(char * str)
{
int nKeywords = sizeof(keywordsTable) / sizeof(keywordsTable[0]);
char * strTok = NULL;
for (int i = 1; i < nKeywords - 2; i++)
{
if(NULL!=(strTok = strstr(str, keywordsTable[i])))
{
int len = strlen(keywordsTable[i]);
if(strTok == str)
{
if(str[len]==' ' || str[len]=='\t')
return 1;
}
else
{
if((strTok[-1]==' ' || strTok[-1]=='\t') && isspace(strTok[len]))//isspace in <ctype.h>
return 1;
}
}
}
return 0;
}

Perhaps another method?
int CheckKeyword(char * str)
{
int rCode=0;
int nKeywords = sizeof(keywordsTable) / sizeof(keywordsTable[0]);
char *keyword;
char *cp = keywordsTable;
I assume that since str is defined as "char * str" and not "const char * str", it is OK to modify the input string. Hence, why not just eliminate the '\n' problem from the equation?
/* Elininate the newline character from the end of the string. */
if((cp = strchr(str, '\n'))
*cp = \0;
// I don't want to check for the last two keywords nor the first.
nKeywords -= 3;
++keyword;
/* Loop through the keywords. */
while(nKeywords)
{
// "I want to check if the keyword is the only element of this string."
// "If it's not, I want to return an error."
if((cp=strstr(str, keyword))
{
/* Check for stuff prior to the keyword. */
if(cp != str)
rCode=1;
/* Check for stuff after the keyword. */
// Finally if my string is "BEGINFOO\n", everything is fine.
if(' ' == str[strlen[keyword])
rCode=1;
if(strcmp(cp, keyword))
rCode=1
break;
}
++keyword;
--nKeywords;
}
return(rCode);
}

Parse string into array based on spaces or "double quotes strings"

Im trying to take a user input string and parse is into an array called char *entire_line[100]; where each word is put at a different index of the array but if a part of the string is encapsulated by a quote, that should be put in a single index.
So if I have
char buffer[1024]={0,};
fgets(buffer, 1024, stdin);
example input: "word filename.txt "this is a string that shoudl take up one index in an output array";
tokenizer=strtok(buffer," ");//break up by spaces
do{
if(strchr(tokenizer,'"')){//check is a word starts with a "
is_string=YES;
entire_line[i]=tokenizer;// if so, put that word into current index
tokenizer=strtok(NULL,"\""); //should get rest of string until end "
strcat(entire_line[i],tokenizer); //append the two together, ill take care of the missing space once i figure out this issue
}
entire_line[i]=tokenizer;
i++;
}while((tokenizer=strtok(NULL," \n"))!=NULL);
This clearly isn't working and only gets close if the double quote encapsulated string is at the end of the input string
but i could have
input: word "this is text that will be user entered" filename.txt
Been trying to figure this out for a while, always get stuck somewhere.
thanks

The strtok function is a terrible way to tokenize in C, except for one (admittedly common) case: simple whitespace-separated words. (Even then it's still not great due to lack of re-entrance and recursion ability, which is why we invented strsep for BSD way back when.)
Your best bet in this case is to build your own simple state-machine:
char *p;
int c;
enum states { DULL, IN_WORD, IN_STRING } state = DULL;
for (p = buffer; *p != '\0'; p++) {
c = (unsigned char) *p; /* convert to unsigned char for is* functions */
switch (state) {
case DULL: /* not in a word, not in a double quoted string */
if (isspace(c)) {
/* still not in a word, so ignore this char */
continue;
}
/* not a space -- if it's a double quote we go to IN_STRING, else to IN_WORD */
if (c == '"') {
state = IN_STRING;
start_of_word = p + 1; /* word starts at *next* char, not this one */
continue;
}
state = IN_WORD;
start_of_word = p; /* word starts here */
continue;
case IN_STRING:
/* we're in a double quoted string, so keep going until we hit a close " */
if (c == '"') {
/* word goes from start_of_word to p-1 */
... do something with the word ...
state = DULL; /* back to "not in word, not in string" state */
}
continue; /* either still IN_STRING or we handled the end above */
case IN_WORD:
/* we're in a word, so keep going until we get to a space */
if (isspace(c)) {
/* word goes from start_of_word to p-1 */
... do something with the word ...
state = DULL; /* back to "not in word, not in string" state */
}
continue; /* either still IN_WORD or we handled the end above */
}
}
Note that this does not account for the possibility of a double quote inside a word, e.g.:
"some text in quotes" plus four simple words p"lus something strange"
Work through the state machine above and you will see that "some text in quotes" turns into a single token (that ignores the double quotes), but p"lus is also a single token (that includes the quote), something is a single token, and strange" is a token. Whether you want this, or how you want to handle it, is up to you. For more complex but thorough lexical tokenization, you may want to use a code-building tool like flex.
Also, when the for loop exits, if state is not DULL, you need to handle the final word (I left this out of the code above) and decide what to do if state is IN_STRING (meaning there was no close-double-quote).

Torek's parts of parsing code are excellent but require little more work to use.
For my own purpose, I finished c function.
Here I share my work that is based on Torek's code.
#include <stdio.h>
#include <string.h>
#include <ctype.h>
size_t split(char *buffer, char *argv[], size_t argv_size)
{
char *p, *start_of_word;
int c;
enum states { DULL, IN_WORD, IN_STRING } state = DULL;
size_t argc = 0;
for (p = buffer; argc < argv_size && *p != '\0'; p++) {
c = (unsigned char) *p;
switch (state) {
case DULL:
if (isspace(c)) {
continue;
}
if (c == '"') {
state = IN_STRING;
start_of_word = p + 1;
continue;
}
state = IN_WORD;
start_of_word = p;
continue;
case IN_STRING:
if (c == '"') {
*p = 0;
argv[argc++] = start_of_word;
state = DULL;
}
continue;
case IN_WORD:
if (isspace(c)) {
*p = 0;
argv[argc++] = start_of_word;
state = DULL;
}
continue;
}
}
if (state != DULL && argc < argv_size)
argv[argc++] = start_of_word;
return argc;
}
void test_split(const char *s)
{
char buf[1024];
size_t i, argc;
char *argv[20];
strcpy(buf, s);
argc = split(buf, argv, 20);
printf("input: '%s'\n", s);
for (i = 0; i < argc; i++)
printf("[%u] '%s'\n", i, argv[i]);
}
int main(int ac, char *av[])
{
test_split("\"some text in quotes\" plus four simple words p\"lus something strange\"");
return 0;
}
See program output:
input: '"some text in quotes" plus four simple words p"lus something strange"'
[0] 'some text in quotes'
[1] 'plus'
[2] 'four'
[3] 'simple'
[4] 'words'
[5] 'p"lus'
[6] 'something'
[7] 'strange"'

I wrote a qtok function some time ago that reads quoted words from a string. It's not a state machine and it doesn't make you an array but it's trivial to put the resulting tokens into one. It also handles escaped quotes and trailing and leading spaces:
#include <stdio.h>
#include <ctype.h>
#include <assert.h>
// Strips backslashes from quotes
char *unescapeToken(char *token)
{
char *in = token;
char *out = token;
while (*in)
{
assert(in >= out);
if ((in[0] == '\\') && (in[1] == '"'))
{
*out = in[1];
out++;
in += 2;
}
else
{
*out = *in;
out++;
in++;
}
}
*out = 0;
return token;
}
// Returns the end of the token, without chaning it.
char *qtok(char *str, char **next)
{
char *current = str;
char *start = str;
int isQuoted = 0;
// Eat beginning whitespace.
while (*current && isspace(*current)) current++;
start = current;
if (*current == '"')
{
isQuoted = 1;
// Quoted token
current++; // Skip the beginning quote.
start = current;
for (;;)
{
// Go till we find a quote or the end of string.
while (*current && (*current != '"')) current++;
if (!*current)
{
// Reached the end of the string.
goto finalize;
}
if (*(current - 1) == '\\')
{
// Escaped quote keep going.
current++;
continue;
}
// Reached the ending quote.
goto finalize;
}
}
// Not quoted so run till we see a space.
while (*current && !isspace(*current)) current++;
finalize:
if (*current)
{
// Close token if not closed already.
*current = 0;
current++;
// Eat trailing whitespace.
while (*current && isspace(*current)) current++;
}
*next = current;
return isQuoted ? unescapeToken(start) : start;
}
int main()
{
char text[] = " \"some text in quotes\" plus four simple words p\"lus something strange\" \"Then some quoted \\\"words\\\", and backslashes: \\ \\ \" Escapes only work insi\\\"de q\\\"uoted strings\\\" ";
char *pText = text;
printf("Original: '%s'\n", text);
while (*pText)
{
printf("'%s'\n", qtok(pText, &pText));
}
}
Outputs:
Original: ' "some text in quotes" plus four simple words p"lus something strange" "Then some quoted \"words\", and backslashes: \ \ " Escapes only work insi\"de q\"uoted strings\" '
'some text in quotes'
'plus'
'four'
'simple'
'words'
'p"lus'
'something'
'strange"'
'Then some quoted "words", and backslashes: \ \ '
'Escapes'
'only'
'work'
'insi\"de'
'q\"uoted'
'strings\"'

I think the answer to your question is actually fairly simple, but I'm taking on an assumption where it seems the other responses have taken a different one. I'm assuming that you want any quoted block of text to be separated out on its own regardless of spacing with the rest of the text being separated by spaces.
So given the example:
"some text in quotes" plus four simple words p"lus something strange"
The output would be:
[0] some text in quotes
[1] plus
[2] four
[3] simple
[4] words
[5] p
[6] lus something strange
Given that this is the case, only a simple bit of code is required, and no complex machines. You would first check if there is a leading quote for the first character and if so tick a flag and remove the character. As well as removing any quotes at the end of the string. Then tokenize the string based on quotation marks. Then tokenize every other of the strings obtained previously by spaces. Tokenize starting with the first string obtained if there was no leading quote, or the second string obtained if there was a leading quote. Then each of the remaining strings from the first part will be added to an array of strings interspersed with the strings from the second part added in place of the strings they were tokenized from. In this way you can get the result listed above. In code this would look like:
#include<string.h>
#include<stdlib.h>
char ** parser(char * input, char delim, char delim2){
char ** output;
char ** quotes;
char * line = input;
int flag = 0;
if(strlen(input) > 0 && input[0] == delim){
flag = 1;
line = input + 1;
}
int i = 0;
char * pch = strchr(line, delim);
while(pch != NULL){
i++;
pch = strchr(pch+1, delim);
}
quotes = (char **) malloc(sizeof(char *)*i+1);
char * token = strtok(input, delim);
int n = 0;
while(token != NULL){
quotes[n] = strdup(token);
token = strtok(NULL, delim);
n++;
}
if(delim2 != NULL){
int j = 0, k = 0, l = 0;
for(n = 0; n < i+1; n++){
if(flag & n % 2 == 1 || !flag & n % 2 == 0){
char ** new = parser(delim2, NULL);
l = sizeof(new)/sizeof(char *);
for(k = 0; k < l; k++){
output[j] = new[k];
j++;
}
for(k = l; k > -1; k--){
free(new[n]);
}
free(new);
} else {
output[j] = quotes[n];
j++;
}
}
for(n = i; n > -1; n--){
free(quotes[n]);
}
free(quotes);
} else {
return quotes;
}
return output;
}
int main(){
char * input;
char ** result = parser(input, '\"', ' ');
return 0;
}
(May not be perfect, I haven't tested it)

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

program adds on a string seemingly out of nowhere - arrays

Related

How to get ASCII code for characters from a text file?

Implementing a C function that splits a string on a given character and returns an array of strings after the split (along with length of array)

Parsing and data overwriting issues in C using custom strtok

Does a string contains a word from a list

Parse string into array based on spaces or "double quotes strings"

Categories

Resources