Split a string into token in C - c

struct string {
size_t length;
size_t allocated;
char* data;
};
string* strCreate(char* str) {...}
string* strSubstring(string* str, size_t pos, size_t len) {...}
char* strGet(string* str) {
return str->data;
}
size_t findFrist(string* str, char what, size_t pos) {
for(size_t i = pos; i < str->length; ++i) {
if(str->data[i] == what)
return i;
}
return -1;
}
string** strTokenizer(string* str) {
string** res;
res = malloc(sizeof(char*)*90); //*90 (token num)
for(int i=0; i<90; i++) //i<90 (token num)
res[i] = malloc(sizeof(char)*100); //*100 (a token lenght)
size_t first = 0;
size_t i = 0;
while(first < str->length) {
int second = findFrist(str,' ',first);
if(second == - 1)
second = str->length;
string* token = strSubstring(str,first, second - first);
if(*strGet(token) != ' ')
res[i] = token;
first = second + 1;
++i;
}
return res;
}
int main() {
string* fe = strCreate("A string \ tof");
string** r = strTokenizer(fe);
for(int i = 0; i < 4; ++i) {
printf("%s",strGet(r[i]));
}
return 0;
}
I want to create a string tokenizer. When I want to print the string in the main function it not print the \. The other thing is that how to allocate string** res in proper way. When I allocate for the sizeof(char*) the only way that I see is loop through the string, but I was just wondering is this possible to allocate without going through the string 2 times in the tokenizer function.
I do not want to use strtok
typedef struct string string in the .h file

When compiling the code using GCC 10.2.0 with options:
gcc -O3 -g -std=c11 -Wall -Wextra -Werror …
I get the message:
bs83.c:85:44: error: unknown escape sequence: '\040' [-Werror]
85 | string *fe = strCreate("A string \ tof");
As I noted in the comments, the string "A string \ tof" is malformed — your compiler should be warning you about it. If you want a backslash in a string, you write (for example), "A string \\ tof". Backslash-space has no defined meaning; it is probably being interpreted as a single space. Maybe you should print the argument to strCreate() to validate this.
You also need typedef struct string string; near the top of the code for it to compile with a C compiler (and the headers).
Here is a fixed version of your code — because of the compilation options I use, the functions need to be either static or declared before being defined; I make them static since the only reason to make them non-static (IMO) is if they're accessed from another source file, and then they'd be declared in a header. That probably applies to your code — there is mention of a header.
Here's fixed code that works:
/* SO 6537-9584 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct string string;
struct string
{
size_t length;
size_t allocated;
char *data;
};
static
string *strCreate(char *str)
{
string *s = malloc(sizeof(string));
s->length = strlen(str);
s->allocated = s->length * 2;
s->data = (char *)malloc(s->allocated + 1);
memcpy(s->data, str, s->length);
s->data[s->length] = '\0';
return s;
}
static
string *strSubstring(string *str, size_t pos, size_t len)
{
string *s = malloc(sizeof(string));
s->length = len;
s->allocated = s->length * 2;
s->data = (char *)malloc(s->allocated + 1);
memcpy(s->data, &str->data[pos], s->length);
s->data[s->length] = '\0';
return s;
}
static
char *strGet(string *str)
{
return str->data;
}
static
size_t findFirst(string *str, char what, size_t pos)
{
for (size_t i = pos; i < str->length; ++i)
{
if (str->data[i] == what)
return i;
}
return -1;
}
static
string **strTokenizer(string *str)
{
string **res;
res = malloc(sizeof(char *) * 90); // *90 (token num)
for (int i = 0; i < 90; i++) // i<90 (token num)
res[i] = malloc(sizeof(char) * 100); // *100 (a token length)
size_t first = 0;
size_t i = 0;
while (first < str->length)
{
int second = findFirst(str, ' ', first);
if (second == -1)
second = str->length;
string *token = strSubstring(str, first, second - first);
if (*strGet(token) != ' ')
res[i] = token;
first = second + 1;
++i;
}
return res;
}
int main(void)
{
string *fe = strCreate("A string \\ tof");
string **r = strTokenizer(fe);
for (int i = 0; i < 4; ++i)
{
printf("[[%s]]\n", strGet(r[i]));
}
return 0;
}
The output is:
[[A]]
[[string]]
[[\]]
[[tof]]
Note the use of [[ and ]] around the printed strings (and the newlines). It makes it easier to identify trailing blanks and embedded carriage return characters and various other mishaps.

Related

Is there an easy way to remove specific chars from a char*?

char * deleteChars = "\"\'.“”‘’?:;-,—*($%)! \t\n\x0A\r"
I have this and i'm trying to remove any of these from a given char*. I'm not sure how I would go about comparing a char* to it.
For example if the char* is equal to "hello," how would I go about removing that comma with my deleteChars?
So far I have
void removeChar(char*p, char*delim){
char*holder = p;
while(*p){
if(!(*p==*delim++)){
*holder++=*p;
p++;
}
}
*holder = '\0';
A simple one-by-one approach:
You can use strchr to decide if the character is present in the deletion set. You then assign back into the buffer at the next unassigned position, only if not a filtered character.
It might be easier to understand this using two indices, instead of using pointer arithmetic.
#include <stdio.h>
#include <string.h>
void remove_characters(char *from, const char *set)
{
size_t i = 0, j = 0;
while (from[i]) {
if (!strchr(set, from[i]))
from[j++] = from[i];
i++;
}
from[j] = 0;
}
int main(void) {
const char *del = "\"\'.“”‘’?:;-,—*($%)! \t\n\x0A\r";
char buf[] = "hello, world!";
remove_characters(buf, del);
puts(buf);
}
stdout:
hello world
If you've several delimiters/characters to ignore, it's better to use a look-up table.
void remove_chars (char* str, const char* delims)
{
if (!str || !delims) return;
char* ans = str;
int dlt[256] = {0};
while (*delims)
dlt[(unsigned char)*delims++] = 1;
while (*str) {
if (dlt[(unsigned char)*str])
++str; // skip it
else //if (str != ans)
*ans++ = *str++;
}
*ans = '\0';
}
You could do a double loop, but depending on what you want to treat, it might not be ideal. And since you are FOR SURE shrinking the string you don't need to malloc (provided it was already malloced). I'd initialize a table like this.
#include <string.h>
...
char del[256];
memset(del, 0, 256 * sizeof(char));
for (int i = 0; deleteChars[i]; i++) del[deleteChars[i]] = 1;
Then in a function:
void delChars(char *del, char *string) {
int i, offset;
for (i = 0, offset = 0; string[i]; i++) {
string[i - offset] = string[i];
if (del[string[i]]) offset++;
}
string[i - offset] = 0;
}
This will not work on string literals (that you initialize with char* x = "") though because you'd end up writing in program memory, and probably segfault. I'm sure you can tweak it if that's your need. (Just do something like char *newString = malloc(strlen(string) + 1); newString[i - offset] = string[i])
Apply strchr(delim, p[i]) to each element in p[].
Let us take advantage that strchr(delim, 0) always returns a non-NULL pointer to eliminate the the null character test for every interrelation.
void removeChar(char *p, char *delim) {
size_t out = 0;
for (size_t in; /* empty */; in++) {
// p[in] in the delim set?
if (strchr(delim, p[in])) {
if (p[in] == '\0') {
break;
}
} else {
p[out++] = p[in];
}
}
p[out] = '\0';
}
Variation on #Oka good answer.
it is better way - return the string without needless characters
#include <string.h>
char * remove_chars(char * str, const char * delim) {
for ( char * p = strpbrk(str, delim); p; p = strpbrk(p, delim) )
memmove(p, p + 1, strlen(p));
return str;
}

What's wrong with my function? I have function that finds all common chars and concatenates them into one string

I have function that finds all common chars and concatenates into one string.
char* commonString(char* p1,char* p2)
{
char* res = "";
for (int k=0;k<strlen(p1);k++)
{
for (int h=0;h<strlen(p2);h++)
{
if (p1[k] == p2[h])
{
strcat(res,&p1[k]);
}
}
}
return res;
}
What's wrong with it? Can you review and help to fix it?
Example of I/O:
Example 00
Input: "padinton" && "paqefwtdjetyiytjneytjoeyjnejeyj"
Output:
Return Value: "padinto"
P.S. I also have function that removes all duplicated chars except the first ocurrence of it from strings.
This function works after removing them
The two main problems in your code are that you are not allocating space for the resulting string and you are using the strcat function inappropriately. Below is a brief implementation of what you are trying to achieve.
#include <stdlib.h>
#include <string.h>
char *commonString(char* p1,char* p2)
{
const size_t lenp1 = strlen(p1);
char *res = malloc(lenp1 + 1);
size_t j = 0;
for (size_t i = 0; i < lenp1; ++i)
if (strchr(p2, p1[i]))
res[j++] = p1[i];
res[j] = 0;
return res;
}
Important Note: The pointer returned by the malloc function must be checked against NULL before being dereferenced. It is omitted here for brevity.
There are so many issues in your code.
Not allocating memory,
Modifying string literals
returning local variables
etc etc.
Your function is also inefficient. You call strlen on every iteration, call strcat (which is very expensive) just to add 1 char.
This function does what you want with or without the duplicates.
#include <stdlib.h>
#include <stdio.h>
char *mystrnchr(const char *str, const char ch, size_t size)
{
char *result = NULL;
while(size--)
{
if(*str == ch)
{
result = (char *)str;
break;
}
str++;
}
return result;
}
char *mystrchr(const char *str, const char ch)
{
char *result = NULL;
while(*str)
{
if(*str == ch)
{
result = (char *)str;
break;
}
str++;
}
return result;
}
char* commonString(char *buff, const char* p1, const char* p2, int duplicates)
{
size_t size = 0;
char p1c;
while((p1c = *p1++))
{
if(!duplicates)
{
if(mystrnchr(buff, p1c, size))
{
continue;
}
}
if(mystrchr(p2, p1c))
{
buff[size++] = p1c;
}
}
buff[size] = 0;
return buff;
}
int main()
{
char result[23];
char *str1 = "paaaadiiiiinton";
char *str2 = "paqefwtdjetyiytjneytjoeyjnejeyj";
printf("%s\n", commonString(result, str1, str2, 0));
printf("%s\n", commonString(result, str1, str2, 1));
}
You can experiment with it yourself here: https://godbolt.org/z/qMnsfa
Here's a solution along those lines:
#include <stdio.h>
#include <string.h>
#define MAX_LENGTH 512
void removeDup(char *result, char *string)
{
for (int i = 0; i < strlen(string); i++)
{
char C[2] = { string[i], '\0' };
if (strstr(result, C) == NULL)
strcat(result, C);
}
}
char *commonString(char *p1, char *p2)
{
char r[MAX_LENGTH] = { };
for (int i = 0; i < strlen(p1); i++)
for (int j = 0; j < strlen(p2); j++)
if (p1[i] == p2[j])
strcat(r, &p1[i]);
static char res[MAX_LENGTH] = { };
removeDup(res, r);
return res;
}
int main()
{
printf("%s\n", commonString("padinton", "paqefwtdjetyiytjneytjoeyjnejeyj"));
return 0;
}
$ cc string.c -o string && ./string
padinto

Return argument doesn't work — gives me weird error

This is a simple program that should create a substring from a string, then it should return the substring as something that can be printed out.
It's actually an exercise and only the substring function can be changed. The problem is that I can't find a return type that doesn't spark all kinds of warnings and errors.
How should I change the return type?
static void panic(const char *serror)
{
printf("%s", serror);
exit(1);
}
static void *xmalloc(size_t size)
{
void *ptr;
if (size == 0)
panic("Size is 0!\n");
ptr = malloc(size);
if (!ptr)
panic("No mem left!\n");
return ptr;
}
static char *substring(const char *str, off_t pos, size_t len)
{
char out [len];
int index;
for(index = 0; index < (pos + len); index++)
{
if(index >= pos && index < (pos + len))
{
out[index - pos] = str[index];
}
}
return out;
}
int main(int argc, char **argv)
{
char *foo = "Nicht\n";
char *bar = substring(foo, 2, 3);
printf("%s", bar);
free(bar);
return 0;
}
You invoked two undefine behavior by
dereferencing a pointer bar that points at already vanished local variable.
passing a non-NULL pointer which doesn't point at buffer allocated via malloc(), calloc() or realloc().
Also note that
You have to terminate the string by adding null character.
Your loop is not efficient.
corrected code:
static char *substring(const char *str, off_t pos, size_t len)
{
char *out = xmalloc(len + 1);
int index;
for(index = pos; index < (pos + len); index++)
{
out[index - pos] = str[index];
}
out[len] = '\0';
return out;
}

C reference gone after for loop

i got a problem with my C code.
int split(char* source, char*** target, char* splitChar) {
int i;
int currentLength;
int splitCharPosition;
char* currentSubstring = source;
int splitCount = charcount(source, splitChar) + 1;
*target = (char**) malloc(splitCount * sizeof(char**));
for(i=0;i<splitCount;i++) {
splitCharPosition = indexOf(currentSubstring, splitChar);
substring(currentSubstring, target[i], 0, splitCharPosition);
currentLength = strlen(currentSubstring);
substring(currentSubstring, &currentSubstring, splitCharPosition + 1, curr entLength-splitCharPosition);
}
return splitCount;
}
The problem is that if I use the Debugger, the pointer to splitChar is set to 0x0 after the first run of the for loop.
Does anybody know why it is set to 0x0?
EDIT:
int indexOf(char* source, char* template) {
int i;
int j;
int index;
for (i = 0; source[i]; i++) {
index = i;
for (j = 0; template[j]; j++) {
if (source[i + j] != template[j]) {
index = -1;
break;
}
}
if (index != -1) {
return index;
}
}
return -1;
}
EDIT2:
int charcount(char* source, const char* countChar) {
int i;
int count = 0;
for(i=0;source[i];i++) {
if(source[i] == countChar[0]) {
count++;
}
}
return count;
}
EDIT3:
char* substring(char* source, char** target, int start, int length) {
*target = (char*) malloc(length + 1);
strncpy(*target, source + start, length);
target[length] = '\0';
return *target;
}
EDIT4:
I just noticed that if I add
char* sndfpgjps = splitChar;
to my split() code it does not delete the reference. Anyone know why?
This line:-
substring(currentSubstring, &currentSubstring, splitCharPosition + 1, curr entLength-splitCharPosition);
... will cause a memory leak, as well as being incredibly inefficient. The old substring is left dangling. and never freed.
It would be much better to write
currentSubString += splitCharPosition + 1;
I don't think that's the problem, but it's a problem.
Also, as you're using C library functions like strlen(), why aren't you using strtok or better yet, strtok_r?
I have some reservations about the code, but this works cleanly under valgrind (no leaks, no abuse). I've left the sub-functions largely unchanged except that constant strings are marked constant. The code in split() has been simplified. As I noted in a comment, I suggest writing the main split() function so that you have a local char **string_list; which you allocate and fill. Then, when you're about to return, you assign *target = string_list;. This will make it easier for you to understand what's going on. Triple indirection is nasty. You can justify it here (just), but minimize the time you spend working with triple pointers. The revision adopts that strategy.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
extern int split(const char *source, char ***target, const char *splitStr);
static int
indexOf(const char *source, const char *template)
{
int i;
int j;
int index;
for (i = 0; source[i]; i++)
{
index = i;
for (j = 0; template[j]; j++)
{
if (source[i + j] != template[j])
{
index = -1;
break;
}
}
if (index != -1)
return index;
}
return -1;
}
static int
charcount(const char *source, const char *countChar)
{
int count = 0;
for (int i = 0; source[i]; i++)
{
if (source[i] == countChar[0])
count++;
}
return count;
}
static char *
substring(const char *source, int start, int length)
{
char *target = (char *)malloc(length + 1);
if (target != 0)
{
memmove(target, source + start, length);
target[length] = '\0';
}
return target;
}
int
split(const char *source, char ***target, const char *splitStr)
{
int splitCount = charcount(source, splitStr) + 1;
char **result = (char **)malloc(splitCount * sizeof(*result));
if (result == 0)
return -1;
int splitLength = strlen(splitStr);
char **next = result;
const char *currentSubstring = source;
for (int i = 0; i < splitCount; i++)
{
int splitCharPosition = indexOf(currentSubstring, splitStr);
if (splitCharPosition < 0)
break;
*next++ = substring(currentSubstring, 0, splitCharPosition);
currentSubstring += splitCharPosition + splitLength;
}
*next++ = substring(currentSubstring, 0, strlen(currentSubstring));
*target = result;
return (next - result); /* Actual number of strings */
}
static void print_list(int nstrings, char **strings)
{
for (int i = 0; i < nstrings; i++)
{
if (strings[i] != 0)
printf("%d: <<%s>>\n", i, strings[i]);
}
}
static void free_list(int nstrings, char **strings)
{
for (int i = 0; i < nstrings; i++)
free(strings[i]);
free(strings);
}
int main(void)
{
const char source[] = "This is a string; it is really!";
char **strings;
int nstrings;
nstrings = split(source, &strings, " ");
printf("Splitting: <<%s>> on <<%s>>\n", source, " ");
print_list(nstrings, strings);
free_list(nstrings, strings);
nstrings = split(source, &strings, "is");
printf("Splitting: <<%s>> on <<%s>>\n", source, "is");
print_list(nstrings, strings);
free_list(nstrings, strings);
return 0;
}
Note that in the second example, charcount() returns 6 but there are only 4 strings. This caused a late adjustment to the source code. (You could realloc() the result so it is exactly the right size, but it probably isn't worth worrying about unless the discrepancy is really marked — say 'more than 10 entries'.) The error handling is not perfect; it doesn't access invalid memory after failure to allocate, but it doesn't stop trying to allocate, either. Nor does it report failures to allocate individual strings — it does for failure to allocate the array of pointers.
I'd probably avoid the triple pointer by creating a structure:
typedef struct StringList
{
size_t nstrings;
char **strings;
} StringList;
You can then pass a pointer to one of these into split(), and into the utility functions such as free_list() and print_list(). The free_list() function would then modify the structure so that both elements are zeroed after the data pointed at by the structure is freed.
I'd also be tempted to use a different implementation of indexOf():
int indexOf(const char *haystack, const char *needle)
{
const char *pos = strstr(haystack, needle);
if (pos != 0)
return (pos - haystack);
return -1;
}
I do not know what substring does, nor what signature it has, but in the line
substring(currentSubstring, target[i], 0, splitCharPosition);
target[i] is only defined for i==0. I believe you wanted to write
substring(currentSubstring, (*target)[i], 0, splitCharPosition);
See if your debugger also supports data breakpoints, i.e. break if some place in memory is modified. Then place one at the actual address of splitChar, and another at the address it points to. (Since you didn't specify whether the pointer is null or points to nil.) See where it breaks. It may be that it is a completely unrelated place; that would indicate a buffer overflow.
Also, you could make at least splitChar a pointer to const. You don't actually want to modify it, right? Better idea, make it a char, not a pointer, since its name suggests that there is only one character on which you split, not a string.
The first call to substring does not look correct:
substring(currentSubstring, target[i], 0, splitCharPosition);
I suspect it should be something like the following where it indexes the actual memory that was allocated:
substring(currentSubstring, &((*target)[i]), 0, splitCharPosition);
You first need to get the value that target points at (*target) and then index off of that and pass the address of that array location.

string array in c

How do you use a string array as a parameter in C? If I were to write a function with signature:
Guess i didnt explain myself very well... I'll post the code that i'm trying to get to work.
int format_parameters(char* str) {
char local_str[201] = "";
int i = 0;
int j = 0;
int flip = 0;
while(str[i]) {
if((str[i] == '"') && (flip == 0)) flip = 1;//Sentence allowed
else if((str[i] == '"') && (flip == 1)) flip = 0;//Sentence not allowed
if(flip == 1) append_char(local_str, str[i]);
//check if space
else if(flip == 0) {
int c = str[i];
if(!isspace(c)) append_char(local_str, str[i]);
else {
if((strlen(local_str) > 0) && (j < 4)) {
//local-str copied to param[j] here
//printf("j = %d %s\n",j,local_str);
local_str[0] = '\0';
j++;
}
}
}
i++;
}
//Add \0 to param
return flip;
}//end format_parameters
void append_char(char* str, char c) {
int len = strlen(str);
str[len] = c;
str[len+1] = '\0';
}//end append_char
int main() {
char str[200];
//str filled with stuff...
int x = format_parameters(str);
}
There should be a second (and third?) parameter in format_parameterssignature, a char* param[5] which should be readable from main.
Does this work?
#include <string.h>
#include <assert.h>
#include <stdio.h>
int format_parameters(char *str, char *param[], size_t nparam)
{
char **next = param;
char **end = param + nparam;
char *data = str;
assert(str != 0 && param != 0 && nparam != 0);
while (next < end && *data != '\0')
{
*next++ = data;
data = strchr(data, ' '); // Choose your own splitting criterion
if (data == 0)
break;
*data++ = '\0';
}
return(next - param);
}
int main(void)
{
char str[] = "a b c d";
char *param[5];
int nvals = format_parameters(str, param, 5);
int i;
for (i = 0; i < nvals; i++)
printf("Param %d: <<%s>>\n", i+1, param[i]);
return 0;
}
The return value is the number of parameters found. If you pass an empty string, that would be 0. Beware leading, trailing and repeated blanks; the code works - but maybe not as you want it to.
This is entirely about memory allocation.
If you allocate static memory for param before the function is called, the memory will exist in that scope.
Otherwise take a look at dynamic allocation, it will exist until you tell it to go away.
You have to create the char* param[] array outside the function and just pass it as a parameter:
int paramCount = countParameters(str); // you have to create this function
char* param[] = malloc(paramCount * sizeof(char*));
format_parameters(str, param);
and inside the function:
int format_parameters(char* str, char* param[])
{
int currentParamIndex = 0;
..........
//TODO: check if currentParamIndex < paramCount
char* currentParam = str + currentParamStart; // currentParamStart is the start index of the parameter in the str containing all parameters
param[currentParamIndex] = currentParam;
currentParamIndex++;
.............
}
And in order to write safe code you have to pass also the paramCount as a parameter to format_parameters so the function will not access an element out of the bounds of the array.
Or maybe you should just use getopt?
As Jonatahan pointed out, you need more parameters:
int format_parameters(char* strInput, char* paramOutput[], size_t cbMaxParams );
// return value is actual number of parameter strings in paramOutput
paramOutput is an array of pointers. So the caller has to provide an array of pointers and the called function has to allocate memory for the strings and set the pointers in the array:
// main:
#define SIZE 20
char * params[SIZE];
int result = format_parameters( szInput, params, SIZE );
// after use go through params and free all pointers
// function:
int format_parameters(char* strInput, char* paramOutput[], size_t cbMaxParams )
{
// ...
for( size_t i=0; (i<cbMaxParams) && (!noMoreParams); i++ )
{
// ...
paramOutput[i] = (char *)malloc( xxxx );
// ...
}
// ...
}

Resources