Free, invalid pointer - c

I have a program that splits strings based on a delimiter. I also have two other functions: one that prints the returned array and another that frees it.
My program prints the array and returns an error when the free array method is called. Below is the full code.
#include "stringsplit.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
/* Split string by another string, return split parts + NULL in array.
*
* Parameters:
* str: the string to split
* split: the string to split str with
*
* Returns:
* A dynamically reserved array of dynamically reserved string parts.
*
* For example called with "Test string split" and " ",
* returns ["Test", "string", "split", NULL].
* Or called with "Another - test" and " - ",
* returns ["Another", "test", NULL].
*/
/*
 * Count the tokens in `string` that are separated by spaces or tabs.
 *
 * strtok() writes into the buffer it scans, so the counting is done on a
 * heap copy that is released before returning. The delimiter set is
 * hard-coded to " \t".
 *
 * NOTE(review): the malloc() result is handed to strcpy() without a NULL
 * check.
 */
unsigned long int getNofTokens(const char *string) {
char *stringCopy;
unsigned long int stringLength;
unsigned long int count = 0;
/* The cast narrows size_t to unsigned int before it is widened again. */
stringLength = (unsigned)strlen(string);
/* +1 leaves room for the terminating '\0' copied by strcpy(). */
stringCopy = malloc((stringLength + 1) * sizeof(char));
strcpy(stringCopy, string);
/* The first call hands strtok() the buffer; later calls pass NULL to continue. */
if (strtok(stringCopy, " \t") != NULL) {
count++;
while (strtok(NULL, " \t") != NULL)
count++;
}
free(stringCopy);
return count;
}
/*
 * Split `str` on the delimiter characters in `split` and return the tokens
 * in a heap-allocated array of pointers.
 *
 * NOTE(review): as written, this block has several problems a caller should
 * know about:
 *  - the token count comes from getNofTokens(), which always splits on
 *    space/tab, not on `split`;
 *  - no NULL entry is stored after the last token;
 *  - the stored pointers point into the single `tmp` buffer rather than
 *    into separately allocated strings.
 */
char **split_string(const char *str, const char *split) {
unsigned long int count = getNofTokens(str);
char **result;
/* Precedence: this requests (sizeof(char *) * count) + 1 bytes, i.e. one
 * extra byte rather than one extra pointer slot. */
result = malloc(sizeof(char *) * count + 1);
/* strlen(str) bytes leaves no room for the '\0' that strcpy() writes. */
char *tmp = malloc(sizeof(char) * strlen(str));
strcpy(tmp, str);
char *token = strtok(tmp, split);
int idx = 0;
while (token != NULL) {
/* Each entry aliases a position inside tmp. */
result[idx++] = token;
token = strtok(NULL, split);
}
return result;
}
/* Print each string of a NULL-terminated array on its own line. */
void print_split_string(char **split_string) {
    int pos = 0;

    while (split_string[pos] != NULL) {
        printf("%s\n", split_string[pos]);
        pos++;
    }
}
/*
 * Release every string of a NULL-terminated array, then the array itself.
 * Each entry is passed to free() as-is.
 */
void free_split_string(char **split_string) {
    char **slot = split_string;

    while (*slot != NULL) {
        free(*slot);
        slot++;
    }
    free(split_string);
}
Also, do I need to explicitly add \0 at the end of the array or does strtok add it automatically?

There are some problems in your code:
[Major] the function getNofTokens() does not take the separator string as an argument, it counts the number of words separated by blanks, potentially returning an inconsistent count to its caller.
[Major] the size allocated in result = malloc(sizeof(char *) * count + 1); is incorrect: it should be:
result = malloc(sizeof(char *) * (count + 1));
Storing the trailing NULL pointer will write beyond the end of the allocated space.
[Major] storing the said NULL terminator at the end of the array is indeed necessary, as the block of memory returned by malloc() is uninitialized.
[Major] the copy of the string allocated and parsed by split_string cannot be safely freed because the pointer tmp is not saved anywhere. The pointer to the first token will be different from tmp in 2 cases: if the string contains only delimiters (no token found) or if the string starts with a delimiter (the initial delimiters will be skipped). In order to simplify the code and make it reliable, each token could be duplicated and tmp should be freed. In fact your free_split_string() function relies on this behavior. With the current implementation, the behavior is undefined.
[Minor] you use unsigned long and int inconsistently for strings lengths and array index variables. For consistency, you should use size_t for both.
[Remark] you should allocate string copies with strdup(). If this POSIX standard function is not available on your system, write a simple implementation.
[Major] you never test for memory allocation failure. This is OK for testing purposes and throw away code, but such potential failures should always be accounted for in production code.
[Remark] strtok() is a tricky function to use: it modifies the source string and keeps a hidden static state that makes it non-reentrant. You should avoid using this function although in this particular case it performs correctly, but if the caller of split_string or getNofTokens relied on this hidden state being preserved, it would get unexpected behavior.
Here is a modified version:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stringsplit.h"
/* Split string by another string, return split parts + NULL in array.
*
* Parameters:
* str: the string to split
* split: the string to split str with
*
* Returns:
* A dynamically reserved array of dynamically reserved string parts.
*
* For example called with "Test string split" and " ",
* returns ["Test", "string", "split", NULL].
* Or called with "Another - test" and " - ",
* returns ["Another", "test", NULL].
*/
/*
 * Count the tokens of `string` when it is cut at any character of `split`.
 *
 * strtok() overwrites delimiters, so the scan runs on a private copy that
 * is released before returning. Runs of delimiters separate one pair of
 * tokens; leading and trailing delimiters produce no token.
 *
 * Returns 0 when the working copy cannot be allocated.
 */
size_t getNofTokens(const char *string, const char *split) {
    size_t count = 0;
    char *tmp = strdup(string);

    /* Passing NULL as the first strtok() argument would continue whatever
     * scan was active before, so bail out instead. */
    if (tmp == NULL) {
        return 0;
    }
    for (char *tok = strtok(tmp, split); tok != NULL; tok = strtok(NULL, split)) {
        count++;
    }
    free(tmp);
    return count;
}
/*
 * Split `str` at any character of `split`.
 *
 * Returns a heap-allocated, NULL-terminated array of heap-allocated token
 * copies (release with free_split_string()), or NULL when an allocation
 * fails. `str` itself is not modified; strtok() works on a private copy.
 */
char **split_string(const char *str, const char *split) {
    size_t count = getNofTokens(str, split);
    char **result = malloc(sizeof(*result) * (count + 1));
    char *tmp = strdup(str);
    size_t idx = 0;

    if (result == NULL || tmp == NULL) {
        goto fail;
    }
    /* idx < count keeps the writes inside the array even if the two scans
     * were ever to disagree. */
    for (char *token = strtok(tmp, split);
         token != NULL && idx < count;
         token = strtok(NULL, split)) {
        result[idx] = strdup(token);
        if (result[idx] == NULL) {
            goto fail;
        }
        idx++;
    }
    result[idx] = NULL;
    free(tmp);
    return result;

fail:
    if (result != NULL) {
        while (idx > 0) {
            free(result[--idx]);
        }
    }
    free(result);
    free(tmp);
    return NULL;
}
/* Write every entry of the NULL-terminated array, one per line. */
void print_split_string(char **split_string) {
    for (char **entry = split_string; *entry != NULL; entry++) {
        printf("%s\n", *entry);
    }
}
/* Free each string up to the NULL sentinel, then the pointer array. */
void free_split_string(char **split_string) {
    size_t n = 0;

    while (split_string[n] != NULL) {
        free(split_string[n++]);
    }
    free(split_string);
}
Here is an alternative without strtok() and without intermediary allocations:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stringsplit.h"
/*
 * Count the tokens of `str` when it is cut at any character of `split`.
 *
 * Nothing is allocated and `str` is only read. Runs of delimiters count as
 * one separator; leading and trailing delimiters produce no token.
 */
size_t getNofTokens(const char *str, const char *split) {
    size_t count = 0;
    const char *p = str;

    for (;;) {
        p += strspn(p, split);              /* skip delimiters */
        size_t len = strcspn(p, split);     /* measure the token */
        if (len == 0) {
            break;                          /* only the terminator is left */
        }
        count++;
        p += len;
    }
    return count;
}
/*
 * Split `str` at any character of `split` without modifying it.
 *
 * Returns a heap-allocated, NULL-terminated array of heap-allocated token
 * copies (release with free_split_string()), or NULL when an allocation
 * fails.
 */
char **split_string(const char *str, const char *split) {
    size_t count = getNofTokens(str, split);
    char **result = malloc(sizeof(*result) * (count + 1));
    size_t pos = 0;
    size_t idx = 0;

    if (result == NULL) {
        return NULL;
    }
    while (idx < count) {
        pos += strspn(str + pos, split);            /* skip delimiters */
        size_t len = strcspn(str + pos, split);     /* measure the token */
        if (len == 0) {
            break;
        }
        result[idx] = strndup(str + pos, len);
        if (result[idx] == NULL) {
            /* Undo the partial result so nothing leaks. */
            while (idx > 0) {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        pos += len;
        idx++;
    }
    result[idx] = NULL;
    return result;
}
/* Emit the strings of a NULL-terminated array, each followed by a newline. */
void print_split_string(char **split_string) {
    char **cursor = split_string;

    while (*cursor != NULL) {
        printf("%s\n", *cursor);
        ++cursor;
    }
}
/* Dispose of a NULL-terminated string array: the strings first, then the array. */
void free_split_string(char **split_string) {
    for (char **entry = split_string; *entry != NULL; ++entry) {
        free(*entry);
    }
    free(split_string);
}
EDIT After re-reading the specification in your comment, there seems to be some potential confusion as to the semantics of the split argument:
if split is a set of delimiters, the above code does the job. And the examples will be split as expected.
if split is an actual string to match explicitly, the above code only works by coincidence on the examples given in the comment.
To implement the latter semantics, you should use strstr() to search for the split substring in both getNofTokens and split_string.
Here is an example:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stringsplit.h"
/* Split string by another string, return split parts + NULL in array.
*
* Parameters:
* str: the string to split
* split: the string to split str with
*
* Returns:
* A dynamically reserved array of dynamically reserved string parts.
*
* For example called with "Test string split" and " ",
* returns ["Test", "string", "split", NULL].
* Or called with "Another - test" and " - ",
* returns ["Another", "test", NULL].
*/
/*
 * Count the pieces `str` falls into when cut at every occurrence of the
 * whole string `split`.
 *
 * A non-empty `split` always yields at least one piece; adjacent, leading
 * or trailing separators yield empty pieces. An empty `split` means "one
 * piece per character", so the count is strlen(str).
 */
size_t getNofTokens(const char *str, const char *split) {
    size_t len = strlen(split);
    size_t count = 1;

    if (len == 0) {
        return strlen(str);
    }
    for (const char *p = str; (p = strstr(p, split)) != NULL; p += len) {
        count++;
    }
    return count;
}

/*
 * Split `str` at every occurrence of the whole string `split`.
 *
 * Returns a heap-allocated, NULL-terminated array of heap-allocated pieces
 * (release with free_split_string()), or NULL when an allocation fails.
 * `str` is not modified.
 */
char **split_string(const char *str, const char *split) {
    size_t count = getNofTokens(str, split);
    size_t len = strlen(split);
    char **result = malloc(sizeof(*result) * (count + 1));
    const char *p = str;
    size_t idx;

    if (result == NULL) {
        return NULL;
    }
    for (idx = 0; idx < count; idx++) {
        const char *q;

        if (len == 0) {
            /* Empty separator: one character per piece. count equals
             * strlen(str), so p never sits on the terminator here. */
            q = p + 1;
        } else {
            /* A separator found exactly at p means an empty piece; it must
             * not be widened to swallow the separator itself. */
            q = strstr(p, split);
            if (q == NULL) {
                q = p + strlen(p);
            }
        }
        result[idx] = strndup(p, (size_t)(q - p));
        if (result[idx] == NULL) {
            while (idx > 0) {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        /* Step over the separator, but never past the terminator. */
        p = (*q != '\0') ? q + len : q;
    }
    result[idx] = NULL;
    return result;
}
/* Print the array's strings line by line until the NULL sentinel. */
void print_split_string(char **split_string) {
    size_t at = 0;

    for (;;) {
        const char *line = split_string[at++];
        if (line == NULL) {
            break;
        }
        printf("%s\n", line);
    }
}
/* Free every string before the NULL sentinel and then the array that held them. */
void free_split_string(char **split_string) {
    char **walker = split_string;

    for (; *walker != NULL; walker++) {
        char *owned = *walker;
        free(owned);
    }
    free(split_string);
}

When debugging, take note of values that you got from malloc, strdup, etc. Let's call these values "the active set". It's just a name, so that we can refer to them. You get a pointer from those functions, you mentally add it to the active set. When you call free, you can only pass values from the active set, and after free returns, you mentally remove them from the set. Any other use of free is invalid and a bug.
You can easily find this out by putting breakpoints after all memory allocations, so that you can write down the pointer values, and then breakpoints on all frees, so that you can see if one of those pointer values got passed to free - since, again, to do otherwise is to misuse free.
This can be done also using "printf" debugging. Like this:
char *buf = malloc(...); // or strdup, or ...
fprintf(stderr, "+++ Alloc %8p\n", buf);
And then whenever you have free, do it again:
fprintf(stderr, "--- Free %8p\n", ptr);
free(ptr);
In the output of the program, you must be able to match every +++ with ---. If you see any --- with a value that wasn't earlier listed with a +++, there's your problem: that's the buggy invocation of free :)
I suggest using fprintf(stderr, ... instead of printf(..., since the former is typically unbuffered, so if your program crashes, you won't miss any output. printf is buffered on some architectures (and not buffered on others - so much for consistency).

Related

How do I split the string that fgets() returns into words? [duplicate]

How do I write a function to split and return an array for a string with delimiters in the C programming language?
char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');
You can use the strtok() function to split a string (and specify the delimiter to use). Note that strtok() will modify the string passed into it. If the original string is required elsewhere make a copy of it and pass the copy to strtok().
EDIT:
Example (note it does not handle consecutive delimiters, "JAN,,,FEB,MAR" for example):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/*
 * Split a_str at every a_delim character.
 *
 * a_str is tokenised in place with strtok(), so it must be writable and is
 * modified. Runs of delimiters are treated as one separator, and leading or
 * trailing delimiters produce no token.
 *
 * Returns a heap-allocated, NULL-terminated array of heap-allocated token
 * copies; the caller frees each entry and then the array. Returns NULL when
 * a_str is NULL or an allocation fails.
 */
char** str_split(char* a_str, const char a_delim)
{
    char delim[2] = { a_delim, '\0' };
    char** result;
    size_t count = 0;
    size_t idx = 0;
    int in_token = 0;

    if (a_str == NULL)
    {
        return NULL;
    }

    /* Count token starts, which is exactly what strtok() will hand back. */
    for (const char* p = a_str; *p != '\0'; p++)
    {
        if (*p == a_delim)
        {
            in_token = 0;
        }
        else if (!in_token)
        {
            in_token = 1;
            count++;
        }
    }

    /* One extra slot for the terminating NULL entry. */
    result = malloc(sizeof(*result) * (count + 1));
    if (result == NULL)
    {
        return NULL;
    }

    for (char* token = strtok(a_str, delim); token != NULL; token = strtok(NULL, delim))
    {
        assert(idx < count);
        result[idx] = strdup(token);
        if (result[idx] == NULL)
        {
            while (idx > 0)
            {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        idx++;
    }
    assert(idx == count);
    result[idx] = NULL;

    return result;
}
/* Demo: split a comma-separated month list, print each token and free it. */
int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    printf("months=[%s]\n\n", months);

    char** tokens = str_split(months, ',');
    if (!tokens)
    {
        return 0;
    }

    /* Walk to the NULL sentinel, releasing each token once it is printed. */
    for (char** cur = tokens; *cur; cur++)
    {
        printf("month=[%s]\n", *cur);
        free(*cur);
    }
    printf("\n");
    free(tokens);

    return 0;
}
Output:
$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]
month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]
I think strsep is still the best tool for this:
while ((token = strsep(&str, ","))) my_fn(token);
That is literally one line that splits a string.
The extra parentheses are a stylistic element to indicate that we're intentionally testing the result of an assignment, not an equality operator ==.
For that pattern to work, token and str both have type char *. If you started with a string literal, then you'd want to make a copy of it first:
// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;
tofree = str = strdup(my_str_literal); // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);
If two delimiters appear together in str, you'll get a token value that's the empty string. The value of str is modified in that each delimiter encountered is overwritten with a zero byte - another good reason to copy the string being parsed first.
In a comment, someone suggested that strtok is better than strsep because strtok is more portable. Ubuntu and Mac OS X have strsep; it's safe to guess that other unixy systems do as well. Windows lacks strsep, but it has strpbrk, which enables this short and sweet strsep replacement:
/*
 * Minimal strsep() replacement built on strpbrk().
 *
 * Returns the token starting at *stringp, or NULL when *stringp is NULL.
 * The first delimiter found is overwritten with '\0' and *stringp is moved
 * just past it; when no delimiter is left, *stringp becomes NULL.
 */
char *strsep(char **stringp, const char *delim) {
    char *start = *stringp;
    char *hit;

    if (start == NULL) {
        return NULL;
    }

    hit = strpbrk(start, delim);
    if (hit != NULL) {
        *hit = '\0';
        *stringp = hit + 1;
    } else {
        *stringp = NULL;
    }
    return start;
}
Here is a good explanation of strsep vs strtok. The pros and cons may be judged subjectively; however, I think it's a telling sign that strsep was designed as a replacement for strtok.
String tokenizer this code should put you in the right direction.
/*
 * Demo: print each word of a sentence on its own line using strtok().
 *
 * The same delimiter set (space and comma) is used for every call. The
 * original keypress pause used getch(), which is not a standard C function
 * and was called without a declaration, so it is omitted.
 */
int main(void) {
    char st[] = "Where there is will, there is a way.";
    const char *delims = " ,";

    /* st must be a writable array: strtok() overwrites delimiters in it. */
    for (char *ch = strtok(st, delims); ch != NULL; ch = strtok(NULL, delims)) {
        printf("%s\n", ch);
    }
    return 0;
}
Method below will do all the job (memory allocation, counting the length) for you. More information and description can be found here - Implementation of Java String.split() method to split C string
/*
 * Split str at every occurrence of the character c.
 *
 * Adjacent, leading or trailing separators produce empty strings, so the
 * result always holds (number of separators + 1) entries.
 *
 * On success *arr receives a heap-allocated array of heap-allocated,
 * NUL-terminated strings and the number of entries is returned; the caller
 * frees each entry and then the array. On allocation failure everything
 * already allocated is released, *arr is set to NULL and -1 is returned.
 */
int split (const char *str, char c, char ***arr)
{
    const char *p;
    const char *start;
    char **out;
    int count = 1;
    int i = 0;

    for (p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    out = malloc(sizeof *out * (size_t) count);
    if (out == NULL)
    {
        *arr = NULL;
        return -1;
    }

    start = str;
    for (p = str; ; p++)
    {
        size_t len;
        size_t k;

        if (*p != c && *p != '\0')
            continue;

        /* p sits on a separator or on the terminator: close this token. */
        len = (size_t) (p - start);
        out[i] = malloc(len + 1);
        if (out[i] == NULL)
        {
            while (i > 0)
                free(out[--i]);
            free(out);
            *arr = NULL;
            return -1;
        }
        for (k = 0; k < len; k++)
            out[i][k] = start[k];
        out[i][len] = '\0';   /* every token, including the last, is terminated */
        i++;

        if (*p == '\0')
            break;
        start = p + 1;
    }

    *arr = out;
    return count;
}
How to use it:
/* Demo: split a sentence on spaces, print the pieces, then release them. */
int main (int argc, char ** argv)
{
    int i;
    const char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    (void) argc;
    (void) argv;

    c = split(s, ' ', &arr);
    printf("found %d tokens.\n", c);
    for (i = 0; i < c; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* split() hands ownership of every string and of the array to us. */
    for (i = 0; i < c; i++)
        free(arr[i]);
    free(arr);
    return 0;
}
Here is my two cents:
/*
 * Split txt at every occurrence of delim.
 *
 * Adjacent, leading or trailing delimiters produce empty strings, so the
 * result always holds (number of delimiters + 1) entries. txt is only read.
 *
 * On success *tokens receives a heap-allocated array of heap-allocated,
 * NUL-terminated strings and the entry count is returned; the caller frees
 * each entry and then the array. On allocation failure everything already
 * allocated is released, *tokens is set to NULL and -1 is returned.
 */
int split (const char *txt, char delim, char ***tokens)
{
    const char *start;
    char **arr;
    int count = 1;
    int i;

    for (start = txt; *start != '\0'; start++)
        if (*start == delim) count += 1;

    arr = calloc ((size_t) count, sizeof *arr);
    if (arr == NULL)
    {
        *tokens = NULL;
        return -1;
    }

    start = txt;
    for (i = 0; i < count; i++)
    {
        const char *end = start;
        size_t len, k;

        while (*end != '\0' && *end != delim) end++;
        len = (size_t) (end - start);

        /* len characters plus the terminator that calloc already zeroed. */
        arr[i] = calloc (len + 1, sizeof **arr);
        if (arr[i] == NULL)
        {
            while (i > 0) free (arr[--i]);
            free (arr);
            *tokens = NULL;
            return -1;
        }
        for (k = 0; k < len; k++) arr[i][k] = start[k];

        /* Step over the delimiter, but never past the terminator. */
        start = (*end != '\0') ? end + 1 : end;
    }

    *tokens = arr;
    return count;
}
Usage:
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
/* freeing tokens */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
In the above example, there would be a way to return an array of null terminated strings (like you want) in place in the string. It would not make it possible to pass a literal string though, as it would have to be modified by the function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Split str in place at every occurrence of delim.
 *
 * Each delimiter in str is overwritten with '\0' and the returned array
 * points INTO str, so str must be writable and must outlive the array.
 * Adjacent, leading or trailing delimiters produce empty strings.
 *
 * Returns a heap-allocated array of (delimiters + 1) pointers followed by a
 * NULL sentinel; the caller frees only the array. If numSplits is not NULL
 * it receives the number of strings. When str is NULL, delim is '\0' or the
 * allocation fails, NULL is returned and *numSplits is set to -1.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;

    if ( ( str != NULL ) && ( delim != '\0' ) )
    {
        size_t tokens = 1;
        const char* scan;

        /* Pre-calculate number of elements; safe for an empty string. */
        for ( scan = str; *scan != '\0'; scan++ )
        {
            if ( *scan == delim )
            {
                tokens++;
            }
        }

        /* One slot per string plus one for the NULL sentinel. */
        ret = malloc( ( tokens + 1 ) * sizeof( *ret ) );
        if ( ret != NULL )
        {
            char* c;

            retLen = 0;
            ret[retLen++] = str;
            for ( c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    *c = '\0';
                    ret[retLen++] = c + 1;
                }
            }
            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }

    return ret;
}
/* Demo: copy a constant month list into writable memory and split the copy. */
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char* strCpy;
    char** split;
    int num;
    int i;

    (void) argc;
    (void) argv;

    /* +1 for the terminator that strcpy() writes. */
    strCpy = malloc( ( strlen( str ) + 1 ) * sizeof( *strCpy ) );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );

    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );

        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    /* The strings live inside strCpy, so only the array and the copy are freed. */
    free( split );
    free( strCpy );

    return 0;
}
There is probably a neater way to do it, but you get the idea.
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/**
 * Splits str on delim in place and dynamically allocates an array of
 * pointers into it.
 *
 * Returns 0 on success and -1 when the array cannot be allocated (str is
 * left untouched in that case). On success *array receives the array and
 * *length its size: 0 for an empty string (then *array is NULL), otherwise
 * the number of delimiters + 1, so a string without delim gives 1.
 *
 * Empty fields are stored as NULL rather than as empty strings, so
 * "foo,,bar" gives { "foo", NULL, "bar" } and a trailing delimiter as in
 * "foo," gives { "foo", NULL }. Callers must check each entry before use.
 *
 * Modifies str (every delimiter becomes '\0'), so make a copy if that is a
 * problem. The entries point into str; free only the array.
 */
int split( char * str, char delim, char ***array, int *length ) {
    char **res = NULL;
    char *tok;
    int count = 0;
    int k;

    if( *str != '\0' ) {
        count = 1;
        /* A '\0' delimiter can never separate anything; searching for it
         * would only ever find the terminator. */
        if( delim != '\0' ) {
            const char *p;
            for( p = str; (p = strchr(p, delim)) != NULL; p++ ) {
                count++;
            }
        }

        // allocate dynamic array (zeroed, so empty fields are already NULL)
        res = calloc( (size_t)count, sizeof(char *) );
        if( !res ) return -1;

        tok = str;
        for( k = 0; k < count; k++ ) {
            char *end = (delim != '\0') ? strchr(tok, delim) : NULL;
            if( end != NULL ) *end = 0;     // Null terminate at the delimiter
            if( *tok ) res[k] = tok;        // Record non-empty fields only
            if( end == NULL ) break;        // That was the last field
            tok = end + 1;                  // Start of next field
        }
    }

    *array = res;
    *length = count;

    return 0;
}
/* Writable sample input; split() overwrites its delimiters. */
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";

/* Demo: split the sample string and print every field. */
int main() {
    char **res = NULL;
    int k=0;
    int count =0;
    int rc;

    rc = split( str, ',', &res, &count );
    if( rc ) {
        printf("Error: %s errno: %d \n", strerror(errno), errno);
        return 1;   /* res and count are not valid after a failure */
    }

    printf("count: %d\n", count );
    for( k=0; k<count; k++ ) {
        /* Empty fields are reported as NULL entries; never hand those to %s. */
        printf("str: %s\n", res[k] ? res[k] : "(null)");
    }

    free(res );
    return 0;
}
I think the following solution is ideal:
Doesn't destroy the source string
Re-entrant - i.e., you can safely call it from anywhere in one or more threads
Portable
Handles multiple separators correctly
Fast and efficient
Explanation of the code:
Define a structure token to store the address and lengths of the tokens
Allocate enough memory for these in the worst case, which is when
str is made up entirely of separators so there are strlen(str) + 1
tokens, all of them empty strings
Scan str recording the address and length of every token
Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
Allocate, copy, and add the tokens using the start and length
information - use memcpy as it's faster than strcpy and we know
the lengths
Free the token address and length array
Return the array of tokens
/* Location of one token inside the source string. */
typedef struct {
    const char *start;
    size_t len;
} token;

/*
 * Split str at every occurrence of sep without modifying it.
 *
 * Adjacent, leading or trailing separators produce empty strings, so the
 * result holds (number of separators + 1) strings followed by a NULL
 * sentinel. Returns a heap-allocated array of heap-allocated strings (the
 * caller frees each string and then the array), or NULL when an allocation
 * fails.
 */
char **split(const char *str, char sep)
{
    char **array = NULL;
    size_t start = 0, stop, toks = 0, t;
    /* Worst case: every character is a separator -> strlen(str) + 1 tokens. */
    token *tokens = malloc((strlen(str) + 1) * sizeof *tokens);

    if (tokens == NULL) {
        return NULL;
    }

    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;

    array = malloc((toks + 1) * sizeof *array);
    if (array == NULL) {
        goto done;
    }
    for (t = 0; t < toks; t++) {
        /* Calloc makes it nul-terminated */
        char *copy = calloc(tokens[t].len + 1, 1);
        if (copy == NULL) {
            while (t > 0) {
                free(array[--t]);
            }
            free(array);
            array = NULL;
            goto done;
        }
        memcpy(copy, tokens[t].start, tokens[t].len);
        array[t] = copy;
    }
    /* Add a sentinel */
    array[t] = NULL;

done:
    free(tokens);
    return array;
}
Note malloc checking omitted for brevity.
In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.
My version:
/*
 * Split str at runs of `delimeter` and store copies of the pieces in *args.
 *
 * Leading delimiters are skipped and consecutive delimiters act as one
 * separator. At least one entry is always produced (an empty string when
 * there is nothing else). str is only read. Returns the number of entries;
 * each entry and the array itself are obtained from malloc().
 *
 * NOTE(review): malloc() results are used unchecked.
 */
int split(char* str, const char delimeter, char*** args) {
    char *cursor = str;
    int pieces = 1;

    while (*cursor == delimeter) {
        cursor++;
    }

    /* A new piece starts wherever a delimiter is followed by a character
     * that is neither a delimiter nor the terminator. */
    for (char *scan = cursor; *scan != '\0'; scan++) {
        if (scan[1] == delimeter && scan[2] != delimeter && scan[2] != 0) {
            pieces++;
        }
    }

    char **out = malloc(sizeof(char*) * pieces);
    *args = out;

    for (int n = 0; n < pieces; n++) {
        char *begin = cursor;

        while (*cursor != 0 && *cursor != delimeter) {
            cursor++;
        }

        int chars = (int)(cursor - begin);
        out[n] = malloc(sizeof(char) * (chars + 1));
        memcpy(out[n], begin, sizeof(char) * chars);
        out[n][chars] = 0;

        while (*cursor == delimeter) {
            cursor++;
        }
    }
    return pieces;
}
This function takes a char* string and splits it by the deliminator. There can be multiple deliminators in a row. Note that the function modifies the original string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls, so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the C++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory-optimized way shown below is O(2N).
The function:
// Split src_str in place at runs of `deliminator`.
//
// Every deliminator in src_str is overwritten with 0 and the returned array
// points INTO src_str, so src_str must be writable and must outlive the
// array. Empty pieces are dropped.
//
// num_sub_str receives the number of non-empty pieces found. Returns a
// malloc()ed array of that many pointers followed by a NULL sentinel (free
// only the array), or 0 when no piece was found or the allocation failed.
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
  //replace deliminator's with zeros and count how many
  //sub strings with length >= 1 exist
  num_sub_str = 0;
  char *cursor = src_str;
  bool after_delim = true;
  for(; *cursor; ++cursor){
    if(*cursor == deliminator){
      *cursor = 0;
      after_delim = true;
    }
    else if(after_delim){ //found first character of a new string
      ++num_sub_str;
      after_delim = false;
    }
  }
  printf("Start - found %zu sub strings\n", num_sub_str);
  if(num_sub_str == 0){
    printf("str_split() - no substrings were found\n");
    return(0);
  }

  //one pointer per sub string plus one pointer for the NULL sentinel
  char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
  if(!sub_strings)
    return(0);

  //cursor stopped on the original terminator; zeros before it are cut points
  const char *src_str_terminator = cursor;
  bool after_null = true;
  size_t idx = 0;
  for(cursor = src_str; cursor < src_str_terminator; ++cursor){
    if(!*cursor) //found a NULL
      after_null = true;
    else if(after_null){
      sub_strings[idx++] = cursor;
      after_null = false;
    }
  }
  sub_strings[num_sub_str] = NULL;

  return(sub_strings);
}
How to use it:
char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
char *str = strdup(months);
size_t num_sub_str;
char **sub_strings = str_split(str, ',', num_sub_str);
char *endptr;
if(sub_strings){
for(int i = 0; sub_strings[i]; i++)
printf("[%s]\n", sub_strings[i]);
}
free(sub_strings);
free(str);
This optimized method create (or update an existing) array of pointers in *result and returns the number of elements in *count.
Use "max" to indicate the maximum number of strings you expect (when you specify an existing array, or for any other reason); otherwise set it to 0.
To compare against a list of delimiters, define delim as a char* and replace the line:
if (str[i]==delim) {
with the two following lines:
char *c=delim; while(*c && *c!=str[i]) c++;
if (*c) {
Enjoy
#include <stdlib.h>
#include <string.h>
/*
 * Split the first len bytes of str in place at every occurrence of delim.
 *
 * Each delimiter found is overwritten with 0, so str must be writable. The
 * resulting pointers all point into str.
 *
 * *result selects the mode:
 *  - non-NULL: it is used as the output array and filled during the scan;
 *    nothing is allocated. The only bound applied to the writes is max.
 *  - NULL: an array of *count pointers is allocated with malloc() after the
 *    scan and stored in *result.
 *
 * *count is set to the number of strings (always at least 1). When max is
 * non-zero the scan stops as soon as *count reaches max, leaving the rest
 * of str unsplit inside the last string.
 *
 * Returns the array, or NULL when the allocation fails.
 */
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
size_t i;
char **_result;
// there is at least one string returned
*count=1;
_result= *result;
// when the result array is specified, fill it during the first pass
if (_result) {
_result[0]=str;
}
// scan the string for delimiter, up to specified length
for (i=0; i<len; ++i) {
// to compare against a list of delimiters,
// define delim as a string and replace
// the next line:
// if (str[i]==delim) {
//
// with the two following lines:
// char *c=delim; while(*c && *c!=str[i]) c++;
// if (*c) {
//
if (str[i]==delim) {
// replace delimiter with zero
str[i]=0;
// when result array is specified, fill it during the first pass
if (_result) {
_result[*count]=str+i+1;
}
// increment count for each separator found
++(*count);
// if max is specified, dont go further
if (max && *count==max) {
break;
}
}
}
// when result array is specified, we are done here
if (_result) {
return _result;
}
// else allocate memory for result
// and fill the result array
*result=malloc((*count)*sizeof(char*));
if (!*result) {
return NULL;
}
_result=*result;
// add first string to result
_result[0]=str;
// if theres more strings
for (i=1; i<*count; ++i) {
// find next string: skip to the zero written over the delimiter,
// then step one past it
while(*str) ++str;
++str;
// add next string to result
_result[i]=str;
}
return _result;
}
Usage example:
#include <stdio.h>
/*
 * Demo of both modes of split(): a caller-supplied array limited to six
 * entries, and an array allocated by split() itself.
 */
int main(int argc, char **argv) {
    const char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* split() cuts its input in place, so each call gets its own copy;
     * the copies are kept so they can be released afterwards. */
    char *copy=strdup(str);
    char *copy2=strdup(str);
    char **result=malloc(6*sizeof(char*));
    char **result2=0;
    unsigned long count=0;
    unsigned long count2=0;
    unsigned long i;

    (void)argc;
    (void)argv;

    if (!copy || !copy2 || !result) {
        free(copy);
        free(copy2);
        free(result);
        return 1;
    }

    split(copy,strlen(str),',',&result,&count,6);
    split(copy2,strlen(str),',',&result2,&count2,0);

    if (result)
        for (i=0; i<count; ++i) {
            printf("%s\n",result[i]);
        }

    printf("\n");

    if (result2)
        for (i=0; i<count2; ++i) {
            printf("%s\n", result2[i]);
        }

    /* The arrays only point into the copies: free arrays, then copies. */
    free(result);
    free(result2);
    free(copy);
    free(copy2);
    return 0;
}
Below is my strtok() implementation from zString library.
zstring_strtok() differs from standard library's strtok() in the way it treats consecutive delimiters.
Just have a look at the code below; you should get an idea of how it works (I tried to use as many comments as I could).
/*
 * strtok()-like tokenizer that reports consecutive delimiters.
 *
 * Only the first character of delim is used as the delimiter. Pass the
 * string on the first call and 0 on later calls to continue; the resume
 * position is kept in a function-level static.
 *
 * Returns:
 *  - 0 when delim is 0, or when str is 0 and there is nothing to resume;
 *  - str unchanged when it contains no delimiter (the scan is then over);
 *  - delim itself when str starts with the delimiter;
 *  - otherwise str, cut at the first delimiter by writing '\0' over it, so
 *    str must be writable.
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *resume = 0;    /* where the next call continues */
    char *hit;

    if (delim == 0)
        return 0;

    if (str == 0) {
        str = resume;
        if (str == 0)
            return 0;
    }

    /* advance to the first delimiter or to the terminator */
    for (hit = str; *hit != '\0' && *hit != delim[0]; hit++)
        ;

    /* no delimiter left: hand back the remainder and finish */
    if (*hit == '\0') {
        resume = 0;
        return str;
    }

    /* delimiter right at the start: report the delimiter itself */
    if (hit == str) {
        resume = str + 1;
        return (char *)delim;
    }

    /* cut the token and remember where the rest begins */
    *hit = '\0';
    resume = hit + 1;
    return str;
}
Below is an example usage...
Example Usage
/* The buffer is a writable array because zstring_strtok() cuts it in place. */
char str[] = "A,B,,,C";
printf("1 %s\n",zstring_strtok(str,","));
printf("2 %s\n",zstring_strtok(NULL,","));
printf("3 %s\n",zstring_strtok(NULL,","));
printf("4 %s\n",zstring_strtok(NULL,","));
printf("5 %s\n",zstring_strtok(NULL,","));
printf("6 %s\n",zstring_strtok(NULL,","));
Example Output
1 A
2 B
3 ,
4 ,
5 C
6 (null)
The library can be downloaded from Github
https://github.com/fnoyanisi/zString
This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.
This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)
/*
 * Split `original` at every occurrence of the whole string `delimiter`.
 *
 * Adjacent, leading or trailing delimiters produce empty strings, so a
 * successful call yields (occurrences + 1) strings. `original` is only
 * read.
 *
 * Outputs on success:
 *   *buffer        heap-allocated array of heap-allocated strings
 *   *numStrings    number of strings
 *   *stringLengths heap-allocated array holding the length of each string
 * The caller frees each string, the string array and the length array.
 *
 * If the delimiter is empty or longer than `original`, or an allocation
 * fails, *buffer and *stringLengths are set to NULL and *numStrings to 0.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);
    char **strings;
    int *lengths;
    const char *start;
    int count = 1;

    *buffer = (void *)0;
    *numStrings = 0;
    *stringLengths = (void *)0;

    /* An empty delimiter matches everywhere and cannot separate anything. */
    if(ld == 0 || ld > lo){
        return;
    }

    for(const char *p = original; (p = strstr(p, delimiter)) != NULL; p += ld){
        count++;
    }

    strings = malloc(sizeof(*strings) * (size_t)count);
    lengths = malloc(sizeof(*lengths) * (size_t)count);
    if(strings == NULL || lengths == NULL){
        free(strings);
        free(lengths);
        return;
    }

    start = original;
    for(int i = 0; i < count; i++){
        const char *end = strstr(start, delimiter);
        size_t len;

        if(end == NULL){
            end = original + lo;    /* last piece runs to the terminator */
        }
        len = (size_t)(end - start);

        strings[i] = malloc(len + 1);
        if(strings[i] == NULL){
            while(i > 0){
                free(strings[--i]);
            }
            free(strings);
            free(lengths);
            return;
        }
        memcpy(strings[i], start, len);
        strings[i][len] = 0;
        lengths[i] = (int)len;

        /* Only step over a delimiter that was actually found. */
        if(i + 1 < count){
            start = end + ld;
        }
    }

    *buffer = strings;
    *numStrings = count;
    *stringLengths = lengths;
}
Sample code:
/* Demo: split a string on a multi-character delimiter and print the pieces. */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer;
    int numStrings;
    int * stringLengths;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
    }

    /* splitString() hands over every string and both arrays. */
    for(int i = 0;i < numStrings;i++){
        free(buffer[i]);
    }
    free(buffer);
    free(stringLengths);
    return 0;
}
Libraries:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
Try use this.
/*
 * Split str at any character of delim.
 *
 * Runs of delimiters act as one separator and leading or trailing
 * delimiters produce no token (the same rules as strtok()). str is only
 * read.
 *
 * Returns a heap-allocated, NULL-terminated array of heap-allocated tokens
 * (the caller frees each token and then the array), or NULL when an
 * allocation fails. A string without tokens yields an array holding only
 * the NULL sentinel.
 */
char** strsplit(char* str, const char* delim){
    char** res = NULL;
    size_t n = 0;
    const char* p = str;

    for(;;){
        p += strspn(p, delim);              /* skip delimiters */
        size_t len = strcspn(p, delim);     /* measure the token */
        if(len == 0){
            break;
        }

        /* Room for this token plus the NULL sentinel that follows it. */
        char** grown = realloc(res, (n + 2) * sizeof(*grown));
        if(grown == NULL){
            goto fail;
        }
        res = grown;

        res[n] = malloc(len + 1);
        if(res[n] == NULL){
            goto fail;
        }
        memcpy(res[n], p, len);
        res[n][len] = '\0';
        n++;
        p += len;
    }

    if(res == NULL){
        res = malloc(sizeof(*res));
        if(res == NULL){
            return NULL;
        }
    }
    res[n] = NULL;
    return res;

fail:
    for(size_t i = 0; i < n; i++){
        free(res[i]);
    }
    free(res);
    return NULL;
}
Explode & implode - initial string remains intact, dynamic memory allocation
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One token: the address of its first character and its length. */
typedef struct
{
    uintptr_t ptr;
    int size;
} token_t;

/*
 * Locate the tokens of str (slen characters long) that are separated by
 * the first character of delimiter. str is not modified; the tokens are
 * recorded as address/length pairs pointing into it and are therefore not
 * NUL-terminated.
 *
 * On success *tokens receives a heap-allocated array (free it with free())
 * and the number of tokens is returned. Returns -1, leaving *tokens
 * untouched, when str contains no delimiter or the allocation fails.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i, seps = 0, cur = 0;
    token_t *out;

    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            seps++;
        }
    }

    if(seps == 0)
    {
        return -1;
    }

    out = calloc((size_t)seps + 1, sizeof(*out));
    if(out == NULL)
    {
        return -1;
    }

    out[0].ptr = (uintptr_t)str;
    for(i = 0; i <= slen; i++)
    {
        /* i == slen closes the final token without reading str[slen]. */
        if((i == slen) || (str[i] == *delimiter))
        {
            out[cur].size = (int)(&str[i] - (char*)out[cur].ptr);
            if(i < slen)
            {
                cur++;
                out[cur].ptr = (uintptr_t)&str[i + 1];
            }
        }
    }

    *tokens = out;
    return (seps + 1);
}

/*
 * Join `size` tokens into one string, separated by the first character of
 * delimiter.
 *
 * Returns a heap-allocated, NUL-terminated string (free it with free()),
 * an empty string when size is not positive, or NULL when the allocation
 * fails.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    size_t len = 0, pos = 0;
    char *str;
    int i;

    if(size <= 0)
    {
        return calloc(1, sizeof(char));
    }

    /* Every token is followed by one byte: a separator, or for the last
     * token the terminator. */
    for(i = 0; i < size; i++)
    {
        len += (size_t)tokens[i].size + 1;
    }

    str = malloc(len);
    if(str == NULL)
    {
        return NULL;
    }

    for(i = 0; i < size; i++)
    {
        memcpy(&str[pos], (const void*)tokens[i].ptr, (size_t)tokens[i].size);
        pos += (size_t)tokens[i].size;
        str[pos++] = *delimiter;
    }

    /* Turn the separator written after the last token into the terminator. */
    str[pos - 1] = '\0';

    return str;
}
Usage:
/*
 * Demo: explode a comma-separated month list, implode it again and print
 * every token together with its length.
 */
int main(int argc, char **argv)
{
    int i, c;
    char *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* Start as NULL so the free() calls below are valid even when
     * explode() fails and never assigns them. */
    token_t *tokens = NULL;
    char *imp = NULL;

    (void)argc;
    (void)argv;

    printf("%s\n", exp);
    if((c = explode(exp, (int)strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }
        for(i = 0; i < c; i++)
        {
            /* tokens are not nul-terminated: print exactly .size bytes */
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }
    free((void*)tokens);
    free((void*)imp);
    return 0;
}
If you are willing to use an external library, I can't recommend bstrlib enough. It takes a little extra setup, but is easier to use in the long run.
For example, split the string below, one first creates a bstring with the bfromcstr() call. (A bstring is a wrapper around a char buffer).
Next, split the string on commas, saving the result in a struct bstrList, which has fields qty and an array entry, which is an array of bstrings.
bstrlib has many other functions to operate on bstrings
Easy as pie...
#include "bstrlib.h"
#include <stdio.h>
/*
 * Demo: split "Hello,World,sak" on ',' with bstrlib and print each piece.
 * Everything bstrlib allocates (the bstring, the list and each C-string
 * produced by bstr2cstr) is released before returning.
 */
int main() {
    int i;
    char *tmp = "Hello,World,sak";
    bstring bstr = bfromcstr(tmp);
    struct bstrList *blist;

    if (bstr == NULL) {
        return 1;
    }
    blist = bsplit(bstr, ',');
    if (blist == NULL) {
        bdestroy(bstr);
        return 1;
    }

    printf("num %d\n", blist->qty);
    for(i=0;i<blist->qty;i++) {
        /* bstr2cstr() returns a fresh allocation on every call */
        char *cstr = bstr2cstr(blist->entry[i], '_');
        if (cstr == NULL) {
            continue;
        }
        printf("%d: %s\n", i, cstr);
        bcstrfree(cstr);
    }

    bstrListDestroy(blist);
    bdestroy(bstr);
    return 0;
}
Late to the party I know, but here's 2 more functions to play with and probably further adjust to your needs (source code at the bottom of the post)
See also the Implementation Notes, further below, to decide which function suits your needs better.
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h> // C99
// Tokenize destructively: the source string is modified in place and the
// returned NULL-terminated array points directly into it.
// Returns NULL on error; on success the caller releases the array with a
// single free().
char **str_toksarray_alloc(
char **strp, /* InOut: pointer to the source non-constant c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
bool keepnulls /* false ignores empty tokens, true includes them */
);
// Tokenize non-destructively: the source string is left untouched and the
// returned NULL-terminated array holds dynamically allocated copies of the
// tokens. Returns NULL on error; delim, ntoks and keepnulls have the same
// meaning as for str_toksarray_alloc().
char **str_toksarray_alloc2(
const char *str, /* the source c-string */
const char *delim,
size_t *ntoks,
bool keepnulls
);
Usage Notes
Their prototypes are almost identical, except for the source-string (strp and str, respectively).
strp (pointer to string) is the address of an already allocated, non-constant c-string, to be tokenized in-place. str is a c-string which is not altered (it can even be a string-literal). By c-string I mean a nul-terminated buffer of chars. The rest of the arguments are the same for both functions.
To parse all available tokens, mute ntoks (meaning set it to 0 before passing it to any of the functions or pass it as a NULL pointer). Else the functions parse up to *ntoks tokens, or until there are no more tokens (whichever comes first). In any case, when ntoks is non-NULL it gets updated with the count of successfully parsed tokens.
Note also that a non-muted ntoks determines how many pointers will be allocated. Thus if the source string contains say 10 tokens and we set ntoks to 1000, we'll end up with 990 needlessly allocated pointers. On the other hand, if the source-string contains say 1000 tokens but we only need the first 10, setting ntoks to 10 sounds like a much wiser choice.
Both functions allocate and return an array of char-pointers, but str_toksarray_alloc() makes them point to the tokens in the modified source-string itself, while str_toksarray_alloc2() makes them point to dynamically allocated copies of the tokens (that 2 at the end of its name indicates the 2-levels of allocation).
The returned array is appended with a NULL sentinel pointer, which is not taken into account in the passed-back value of ntoks (put otherwise, when non-NULL, ntoks passes-back to the caller the length of the returned array, not its 1st level size).
When keepnulls is set to true, the resulting tokens are similar to what we'd expect from the strsep() function. Mostly meaning that consecutive delimiters in the source-string produce empty tokens (nulls), and if delim is an empty c-string or none of its contained delimiter-chars were found in the source string, the result is just 1 token: the source string. Contrary to strsep(), empty tokens can be ignored by setting keepnulls to false.
Failed calls of the functions can be identified by checking their return value against NULL, or by checking the passed-back value of ntoks against 0 (provided ntoks was non-NULL). I suggest always checking against failure before attempting to access the returned array, because the functions include sanity checks which can postpone otherwise immediate crashes (for example, passing a NULL pointer as the source string).
On success, the caller should free the array when they're done with it.
For str_toksarray_alloc(), a simple free() is enough. For str_toksarray_alloc2() a loop is involved, due to the 2nd level of allocation. The NULL sentinel (or the passed-back value of a non-NULL ntoks) makes this trivial, but I'm also providing a toksarray_free2() function below, for all the lazy bees out there :)
Simplified examples using both functions follow.
Prep:
// Shared setup for the two usage examples.
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,"; // each of these chars acts as a delimiter
bool keepnulls = true; // keep empty tokens
size_t ntoks = 0; // 0 = "muted": parse all available tokens
str_toksarray_alloc():
// destructive (use copy of src)
char *scopy = strdup( src );
if (!scopy) { ... }; // handle strdup failure

printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks ); // %zu is the conversion for size_t
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
// the tokens live inside scopy, so only these two allocations exist
free( scopy );
free( arrtoks );
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
*/
str_toksarray_alloc2():
// non-destructive
keepnulls = false; // reject empty tokens
ntoks = 0; // mute again: it still holds the count from the previous call

printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks ); // %zu is the conversion for size_t
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
toksarray_free2( arrtoks ); // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks ); // non-dangling arrtoks
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/
Implementation Notes
Both functions use strsep() for the tokenization which makes them thread-safe, but it's not a standard function. If not provided, you can always use an open-source implementation (like GNU's or Apple's for example). Same goes for the function strdup() which is used in str_toksarray_alloc2() (its implementation is trivial but again here's GNU's and Apple's for example).
A side-effect of using strsep() in str_toksarray_alloc() is that the starting pointer of the source-string keeps moving to the next token in every step of the parsing loop. This means that the caller won't be able to free the parsed string, unless they had saved the starting address to an extra pointer. We save them the hassle, by doing that locally in the function, using the strpSaved pointer. str_toksarray_alloc2() is not affected by this, because it doesn't touch the source-string.
A main difference between the 2 functions is that str_toksarray_alloc() does not allocate memory for the found tokens. It rather allocates space just for the array pointers and sets them pointing directly into the source-string. This works because strsep() nul-terminates the found tokens in-place. This dependency can complicate your supporting code, but with big strings it can also make a big difference in performance. If preserving the source-string is not important, it can make a big difference in memory footprint too.
On the other hand, str_toksarray_alloc2() allocates and returns a self sustained array of dynamically allocated copies of the tokens, without further dependencies. It does so firstly by creating the array from a local duplicate of the source-string, and secondly by duplicating the actual tokens contents into the array. This is a lot slower and leaves a much bigger memory footprint compared to str_toksarray_alloc(), but it has no further dependencies, and sets no special requirements for the nature of the source-string. This makes it easier to write simpler (hence better maintainable) supporting code.
Another difference between the 2 functions is the 1st level of allocation (the array pointers) when ntoks is muted. They both parse all available tokens, but they take quite different approaches. str_toksarray_alloc() uses alloc-ahead with an initial size of 16 (char-pointers), doubling it on demand in the parsing loop. str_toksarray_alloc2() makes a 1st pass counting all available tokens, then it allocates that many char-pointers just once. That 1st pass is done with a helper function str_toksfound() which uses the standard functions strpbrk() and strchr(). I'm providing the source-code of that function too, further below.
Which approach is better is really up to you to decide, depending on the needs of your project. Feel free to adjust the code of each function to either approach and take it from there.
I'd say that on average and for really big strings alloc-ahead is much faster, especially when the initial size and grow factor are fine tuned on a per-case basis (making them function parameters for example). Saving that extra pass with all those strchr()'s and strpbrk()'s can make a difference there. However, with relatively small strings which is pretty much the norm, allocing-ahead just a bunch of char-pointers is just an overkill. It doesn't hurt but it does clutter the code for no good reason in this case. Anyway, feel free to choose whichever suits you best.
Same goes for these 2 functions. I'd say in most cases str_toksarray_alloc2() is much simpler to cope with, since memory and performance are rarely an issue with small to medium strings. If you have to deal with huge strings, then consider using str_toksarray_alloc() (though in those cases you should roll a specialized string parsing function, close to the needs of your project and the specs of your input).
Oh boy, I think that was a bit more than just 2 cents (lol).
Anyway, here is the code of the 2 functions and the helper ones (I've removed most of their description comments, since I've covered pretty much everything already).
Source Code
str_toksarray_alloc():
// ----------------------------------------
// In-place tokenizer for a nul-terminated, writable source-string.
//
// *strp is cut into tokens with strsep(); the returned array (malloc'd,
// terminated by a NULL sentinel) stores pointers into that same buffer.
// A non-NULL, non-zero *ntoks caps the number of tokens; otherwise all
// tokens are parsed, starting with 16 slots and doubling on demand.
// On return *strp is back at its initial value and, if ntoks is non-NULL,
// *ntoks holds the number of stored tokens (0 on failure).
// Returns NULL on invalid arguments or allocation failure.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    // reject NULL arguments and an empty source-string
    if ( !strp || !*strp || !**strp || !delim ) {
        if ( ntoks ) {
            *ntoks = 0;
        }
        return NULL;
    }

    char *const origin = *strp;                     // strsep() moves *strp
    const bool capped = (ntoks != NULL && *ntoks != 0);
    size_t capacity = capped ? *ntoks : 16;
    size_t used = 0;

    // one extra slot for the NULL sentinel
    char **slots = malloc( (capacity + 1) * sizeof *slots );

    if ( slots != NULL ) {
        for (char *piece; (piece = strsep(strp, delim)) != NULL; ) {
            if ( !keepnulls && *piece == '\0' ) {
                continue;                           // drop empty token
            }
            if ( used == capacity ) {
                if ( capped ) {
                    break;                          // caller's limit reached
                }
                capacity *= 2;
                char **wider = realloc( slots, (capacity + 1) * sizeof *wider );
                if ( wider == NULL ) {
                    free( slots );
                    slots = NULL;
                    break;
                }
                slots = wider;
            }
            slots[used++] = piece;
        }
        *strp = origin;                             // hand the buffer back
    }

    if ( slots == NULL ) {
        if ( ntoks ) {
            *ntoks = 0;
        }
        return NULL;
    }

    slots[used] = NULL;
    if ( ntoks ) {
        *ntoks = used;
    }
    return slots;
}
str_toksarray_alloc2():
// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error (NULL/empty str, NULL delim, no
// tokens to return, or allocation failure).
// If ntoks is non-NULL it receives the number of returned tokens (0 on
// error); a non-zero *ntoks on entry caps the number of tokens parsed.
// The caller frees every token and then the array.
// The 2 at the end of the name means 2-levels of allocation.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    char **toksarr = NULL;
    char *copy = NULL;      // start of the private duplicate; the only pointer valid for free()
    char *cursor = NULL;    // moving read position handed to strsep()
    char *tok;
    size_t wanted;
    size_t i = 0;           // # of actually parsed tokens

    // sanity checks
    if ( !str || !*str || !delim ) {
        goto failed;
    }

    // make a copy of str to work with
    copy = strdup( str );
    if ( !copy ) {
        goto failed;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    wanted = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( wanted == 0 ) {    // str_tokscount() found nothing to return
        goto failed;
    }
    if ( wanted > SIZE_MAX / sizeof(*toksarr) - 1 ) {   // size would overflow
        goto failed;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    toksarr = malloc( (wanted+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto failed;
    }

    // Parse tokens and duplicate them into the array. strsep() advances
    // cursor (and sets it to NULL after the last token), which is why the
    // start of the copy is kept separately in `copy`.
    cursor = copy;
    while ( i < wanted && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        toksarr[i] = strdup( tok );
        if ( !toksarr[i] ) {
            goto failed;
        }
        i++;
    }

    toksarr[i] = NULL;      // NULL sentinel
    free( copy );           // release the local copy of the source-string
    if (ntoks) *ntoks = i;  // pass to caller the # of parsed tokens
    return toksarr;

    // cleanup before failing
failed:
    if ( toksarr ) {
        while ( i > 0 ) {
            free( toksarr[--i] );
        }
        free( toksarr );
    }
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}
str_tokscount() - helper function, used by str_toksarray_alloc2():
// ----------------------------------------
// Count the tokens strsep() would produce for the nul-terminated string str
// when split on any of the chars in the nul-terminated string delim.
//
// With keepnulls == true every token counts, empty ones included; with
// keepnulls == false the empty tokens are subtracted from the total.
// A string that contains no delimiter (or an empty delim) counts as one
// token. The result is 0 when str or delim is NULL or str is empty.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    if ( str == NULL || *str == '\0' || delim == NULL ) {
        return 0;
    }

    size_t total = 1;       // the text before the first delimiter
    // a leading delimiter means the first token is empty
    size_t empties = (strchr(delim, *str) != NULL) ? 1 : 0;

    const char *hit = strpbrk( str, delim );
    while ( hit != NULL ) {
        const char *next = hit + 1;     // first char of the following token
        total++;
        // the following token is empty when it starts on another delimiter
        // or on the terminator (strchr also matches the '\0' of delim)
        if ( strchr(delim, *next) != NULL ) {
            empties++;
        }
        hit = strpbrk( next, delim );
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}
toksarray_free2() - use it on the array returned by str_toksarray_alloc2():
// ----------------------------------------
// Release an array of char-pointers that ends with a NULL sentinel, where
// every pointer before the sentinel owns its own allocation
// (e.g.: toksarr[0] = tok1, ..., toksarr[len] = NULL).
// Each element is freed first, then the array itself (2 levels).
// A NULL toksarr is accepted and ignored.
// Always returns NULL, so callers may write: arr = toksarray_free2( arr );
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr == NULL ) {
        return NULL;
    }

    for (size_t k = 0; toksarr[k] != NULL; k++) {   // stop at the sentinel
        free( toksarr[k] );
    }
    free( toksarr );

    return NULL;
}
Both strtok() and strsep() modify the input string. We can write a function to split the string based on delimiters using strspn() and strpbrk().
Algorithm:
If the input string is not empty, go to step 2 else return null.
Skip separator, if any at the start of string, and record start position of word (using strspn() for this), call it start.
Find next separator position (or end of string if no more separator exists) from the current start found in previous step (using strpbrk() for this), call it end.
Allocate memory and copy string from start to end in that memory.
Return token.
Advantage:
Thread safe.
Handles multiple delimiters.
Portable.
Doesn't modify the input string, as strtok() and strsep() do.
Implementation:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * alloc_str duplicates the characters in the half-open range [start, end)
 * into a newly malloc'ed, nul-terminated buffer and returns it.
 * NULL is returned when either pointer is NULL or the range is empty.
 * If malloc fails, a message is written to stderr and the program exits.
 */
static char * alloc_str (const char * start, const char * end) {
    if (start == NULL || end == NULL) {
        return NULL;
    }
    if (end <= start) {
        return NULL;
    }

    size_t span = (size_t)(end - start);
    char * copy = malloc (span + 1);
    if (copy == NULL) {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    memcpy (copy, start, span);
    copy[span] = '\0';
    return copy;
}
/*
 * str_split extracts the next token from *p_str: the longest run of
 * characters that contains none of the characters in sep, after skipping
 * any leading separator characters.
 *
 * Parameters:
 * p_str : Address of the pointer to the text still to be split. On return
 *         it points just past the extracted token.
 * sep   : Set of characters that separate tokens.
 *
 * Behaviour is undefined if sep is not a pointer to a null-terminated string.
 *
 * Return :
 * A dynamically allocated copy of the token (the caller frees it), or NULL
 * when *p_str is NULL, is an empty string, or holds only separators.
 */
char * str_split (char ** p_str, const char * sep) {
    if (*p_str == NULL || **p_str == '\0') {
        return NULL;
    }

    // first character that is not a separator
    char * begin = *p_str + strspn (*p_str, sep);

    // next separator, or the terminating nul when none is left
    char * finish = begin + strcspn (begin, sep);

    char * token = alloc_str (begin, finish);
    *p_str = finish;
    return token;
}
/*==================================================*/
/*==================================================*/
/*
* Just a helper function
*/
void token_helper (char * in_str, const char * delim) {
printf ("\nInput string : ");
if (in_str) printf ("\"%s\"\n", in_str);
else printf ("NULL\n");
if (delim) printf ("Delimiter : \"%s\"\n", delim);
char * ptr = in_str;
char * token = NULL;
printf ("Tokens:\n");
while ((token = str_split(&ptr, delim)) != NULL) {
printf ("-> %s\n", token);
/* You can assign this token to a pointer of an array of pointers
* and return that array of pointers from this function.
* Since, this is for demonstration purpose, I am
* freeing the allocated memory now.
*/
free (token);
}
}
/*
 * Driver: runs token_helper() over a fixed table of inputs/delimiter sets.
 */
int main (void) {
    /* test cases, executed in order; a NULL text is passed through as NULL */
    static const struct {
        const char * text;
        const char * delim;
    } cases[] = {
        { "hello world!", " " },
        { " hello world,friend of mine!", " ," },
        { "Another string", "-!" },
        { " one more -- string !", "- !" },
        { "", " " },
        { NULL, "" },
        { "hi", " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp" },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC", "," },
    };
    char string[100];

    for (size_t k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (cases[k].text == NULL) {
            token_helper (NULL, cases[k].delim);
            continue;
        }
        /* token_helper() takes a non-const buffer, so copy the literal */
        strcpy (string, cases[k].text);
        token_helper (string, cases[k].delim);
    }

    return 0;
}
Output:
# ./a.out
Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!
Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!
Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string
Input string : " one more -- string !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string
Input string : ""
Delimiter : " "
Tokens:
Input string : NULL
Delimiter : ""
Tokens:
Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi
Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give
-> a
-> a a
-> u
-> of
-> ro
-> er
-> offee in a
-> o
-> er
-> offee
-> u
-> .
Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC
My approach is to scan the string and make the result pointers point to the first character and to every character that follows a delimiter, while overwriting each delimiter in the string with '\0'.
First make a copy of the original string (since it's constant), then count the delimiters by scanning it and pass that count back through the pointer parameter len. After that, point the first result pointer at the start of the copy, then scan the copy: whenever a delimiter is encountered, overwrite it with '\0' so that the previous result string is terminated, and point the next result pointer at the character that follows.
/*
 * Split a_str at every occurrence of a_delim without modifying a_str.
 *
 * a_str   : nul-terminated string to split
 * a_delim : the single delimiter character
 * len     : receives the number of delimiters found; the returned array
 *           therefore holds *len + 1 strings (empty strings included)
 *
 * Returns a malloc'd array of pointers into one malloc'd copy of a_str, or
 * NULL (with *len set to 0) on a NULL argument or allocation failure.
 * To release the result: free(results[0]) and then free(results).
 */
char** split(char* a_str, const char a_delim, int* len){
    if (len == NULL) return NULL;
    *len = 0;
    if (a_str == NULL) return NULL;

    /* +1: the copy needs room for the terminating '\0' as well */
    size_t length = strlen(a_str);
    char* s = malloc(length + 1);
    if (s == NULL) return NULL;
    memcpy(s, a_str, length + 1);

    int count = 0;
    for (const char* tmp = a_str; *tmp != '\0'; tmp++){
        if (*tmp == a_delim) count += 1;
    }

    /* count delimiters separate count + 1 strings */
    char** results = malloc(((size_t)count + 1) * sizeof *results);
    if (results == NULL){
        free(s);
        return NULL;
    }

    *len = count;
    results[0] = s;
    int i = 1;
    for (; *s != '\0'; s++){
        if (*s == a_delim){
            *s = '\0';              /* terminate the previous string */
            results[i++] = s + 1;   /* the next one starts right after */
        }
    }
    return results;
}
My code (tested):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split str in place into the tokens separated by any character of delim
 * (strtok semantics: str is modified, empty tokens are skipped).
 *
 * str    : writable, nul-terminated string; the tokens point into it
 * delim  : set of delimiter characters
 * array  : receives a malloc'd array of token pointers (NULL when there
 *          are no tokens); the caller releases it with one free()
 * length : receives the number of tokens
 *
 * Returns 1 on success, 0 on a NULL argument or allocation failure (in
 * which case *array is NULL and *length is 0).
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
    char **res = NULL;
    char *token;
    int i = 0;

    if (array == NULL || length == NULL)
        return 0;
    *array = NULL;
    *length = 0;
    if (str == NULL || delim == NULL)
        return 0;

    for (token = strtok(str, delim); token != NULL; token = strtok(NULL, delim))
    {
        /* grow through a temporary so the old block survives a failure */
        char **grown = realloc(res, ((size_t)i + 1) * sizeof *res);
        if (grown == NULL) {
            free(res);
            return 0;
        }
        res = grown;
        res[i++] = token;
    }

    *array = res;
    *length = i;
    return 1;
}
/* Demo: split a month list with dtmsplit() and print every token. */
int main()
{
    int i;
    char **arr = NULL;
    int count = 0;
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    if (!dtmsplit(str, ",", &arr, &count)) {
        fprintf(stderr, "dtmsplit failed\n");
        return 1;
    }

    printf("Found %d tokens.\n", count);
    for (i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* the tokens live inside str; only the pointer array was allocated */
    free(arr);
    return(0);
}
Result:
Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC
Two issues surrounding this question are memory management and thread safety. As you can see from the numerous posts,
this isn't an easy task to accomplish seamlessly in C. I desired a solution that is:
Thread safe. (strtok is not thread safe)
Does not employ malloc or any of its derivatives (to avoid memory management issues)
Checks array bounds on the individual fields (to avoid segment faults on unknown data)
Works with multi-byte field separators (utf-8)
ignores extra fields in the input
provides soft error routine for invalid field lengths
The solution I came up with meets all of these criteria. It's probably a little more work to set up
than some other solutions posted here, but I think that in practice, the extra work is worth
it in order to avoid the common pitfalls of other solutions.
#include <stdio.h>
#include <string.h>
/* One caller-owned destination buffer for a split field. */
struct splitFieldType {
    char *field;    /* where the field text is written */
    int maxLength;  /* capacity of field in bytes, terminator included */
};
typedef struct splitFieldType splitField;

/*
 * Copy len bytes starting at start into dst, truncating to the buffer's
 * capacity. softError (if non-NULL) is told about every truncation.
 */
static void strsplitStore(const splitField *dst, int fieldNumber, const char *start, size_t len, void (*softError)(int fieldNumber,int expected,int actual)) {
    size_t capacity;

    if (dst->field == NULL || dst->maxLength <= 0) {
        /* nowhere to store even the terminator */
        if (softError != NULL) {
            softError(fieldNumber, 0, (int)len);
        }
        return;
    }

    capacity = (size_t)dst->maxLength - 1;  /* keep one byte for '\0' */
    if (len > capacity) {
        if (softError != NULL) {
            softError(fieldNumber, (int)capacity, (int)len);
        }
        len = capacity;
    }
    memcpy(dst->field, start, len);
    dst->field[len] = '\0';
}

/*
 * Split input on fieldSeparator (which may be several bytes long) into at
 * most `expected` caller-provided buffers. No memory is allocated and no
 * static state is used.
 *
 * fields         : array of at least `expected` destination buffers
 * expected       : maximum number of fields to extract; extra input fields
 *                  are ignored
 * input          : nul-terminated text to split (not modified)
 * fieldSeparator : nul-terminated separator string
 * softError      : optional callback (may be NULL) invoked as
 *                  softError(fieldIndex, maxStorableBytes, actualBytes)
 *                  whenever a field is truncated to fit its buffer
 *
 * Every extracted field, the last one included, is truncated to fit and is
 * always nul-terminated.
 *
 * Returns the number of fields written (0 on a NULL argument).
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual)) {
    int i;
    size_t fieldSeparatorLen;
    const char *tNext, *tLast=input;

    if (fields == NULL || input == NULL || fieldSeparator == NULL) {
        return 0;
    }
    fieldSeparatorLen=strlen(fieldSeparator);

    /* fields that are followed by a separator */
    for (i=0; i<expected && (tNext=strstr(tLast, fieldSeparator))!=NULL; ++i) {
        strsplitStore(&fields[i], i, tLast, (size_t)(tNext-tLast), softError);
        tLast=tNext+fieldSeparatorLen;
    }

    /* the remainder after the last separator is a field as well */
    if (i<expected) {
        strsplitStore(&fields[i], i, tLast, strlen(tLast), softError);
        return i+1;
    }
    return i;
}
/*
 * Soft-error callback for the month example: reports on stderr that a
 * field did not have the expected size. fieldNumber is zero-based and is
 * printed one-based.
 */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    const int humanIndex = fieldNumber + 1;

    fprintf(stderr,
            "monthSplit: input field #%d is %d bytes, expected %d bytes\n",
            humanIndex, actual, expected);
}
/*
 * Demo: split a comma-separated month list into twelve fixed-size buffers
 * that live inside one struct, then print them through the field table and
 * through the struct directly.
 */
int main() {
    const char *input="JAN,FEB,MAR,APRI,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";
    const char *fieldSeparator=",";

    /* storage: 3 characters + terminator per month */
    struct monthFieldsType {
        char field1[4];
        char field2[4];
        char field3[4];
        char field4[4];
        char field5[4];
        char field6[4];
        char field7[4];
        char field8[4];
        char field9[4];
        char field10[4];
        char field11[4];
        char field12[4];
    } monthFields;

    /* one (buffer, capacity) entry per struct member, in order */
    splitField inputFields[12] = {
        {monthFields.field1, sizeof monthFields.field1},
        {monthFields.field2, sizeof monthFields.field2},
        {monthFields.field3, sizeof monthFields.field3},
        {monthFields.field4, sizeof monthFields.field4},
        {monthFields.field5, sizeof monthFields.field5},
        {monthFields.field6, sizeof monthFields.field6},
        {monthFields.field7, sizeof monthFields.field7},
        {monthFields.field8, sizeof monthFields.field8},
        {monthFields.field9, sizeof monthFields.field9},
        {monthFields.field10, sizeof monthFields.field10},
        {monthFields.field11, sizeof monthFields.field11},
        {monthFields.field12, sizeof monthFields.field12}
    };
    const int expected = sizeof inputFields / sizeof inputFields[0];

    printf("input data: %s\n", input);
    printf("expecting %d fields\n",expected);

    const int ct = strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);
    if (ct != expected) {
        printf("string split %d fields, expected %d\n", ct,expected);
    }

    int i = 0;
    while (i < expected) {
        printf("field %d: %s\n", i+1, inputFields[i].field);
        ++i;
    }
    printf("\n");

    /* the same data is reachable without going through inputFields */
    printf("Direct structure access, field 10: %s", monthFields.field10);
}
Below is an example compile and output. Note that in my example, I purposefully spelled out "APRIL" so that you can see how the soft error works.
$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC
Direct structure access, field 10: OCT
Enjoy!
Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.
The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.
As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".
The function below is commented in-line and a short main() was added splitting the months. The initial number of pointers allocated was set at 2 to force three reallocations while tokenizing the input string:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NPTRS 2 /* initial number of pointers to allocate (must be > 0) */

/* Split src into copies of its tokens; any character in delim separates
 * tokens and runs of delimiters count as one.
 *
 * Returns a malloc'd pointer array whose last used entry is a sentinel
 * NULL, with every token in its own malloc'd buffer, or NULL if the
 * initial pointer block cannot be allocated. The pointer block doubles
 * whenever only the sentinel slot is left. If an allocation fails while
 * tokenizing, the tokens collected so far are returned.
 */
char **strsplit (const char *src, const char *delim)
{
    size_t used = 0;                /* tokens stored so far */
    size_t cap = NPTRS;             /* pointers currently allocated */
    char **out = malloc (cap * sizeof *out);

    if (out == NULL) {
        perror ("malloc-dest");
        return NULL;
    }
    out[0] = NULL;                  /* empty result: sentinel only */

    /* skip leading delimiters, then alternate token / delimiter runs */
    const char *cur = src + strspn (src, delim);
    while (*cur != '\0') {
        size_t n = strcspn (cur, delim);    /* length of this token */

        if (used == cap - 1) {      /* only the sentinel slot is free */
            void *grown = realloc (out, 2 * cap * sizeof *out);
            if (grown == NULL) {
                perror ("realloc-dest");
                break;              /* out is still valid and terminated */
            }
            out = grown;
            cap *= 2;
        }

        out[used] = malloc (n + 1);
        if (out[used] == NULL) {    /* slot is NULL, i.e. the sentinel */
            perror ("malloc-dest[i]");
            break;
        }
        memcpy (out[used], cur, n);
        out[used][n] = '\0';
        out[++used] = NULL;         /* keep the array terminated */

        cur += n;
        cur += strspn (cur, delim); /* skip the delimiter run */
    }

    return out;
}
/* Demo: split a month list on ',' and print one token per line. */
int main (void) {
    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (str, ",");    /* NULL-terminated token array */

    if (tokens != NULL) {
        for (size_t k = 0; tokens[k] != NULL; k++) {
            puts (tokens[k]);
            free (tokens[k]);               /* each token is its own allocation */
        }
        free (tokens);                      /* then the pointer block */
    }
}
Example Use/Output
$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC
Let me know if you have any further questions.
#include <cstring>
#include <cstdio>
// Print every space-separated word of a sentence on its own line.
int main()
{
    // strtok() writes into its argument, hence a modifiable array
    char buf[] = "This is Luke Skywalker here!";

    char* word = strtok( buf, " ");
    while( word != nullptr) {
        puts( word);
        word = strtok( nullptr, " ");   // continue in the same buffer
    }
}
Outputs
This
is
Luke
Skywalker
here!
Came across this looking for a simple solution.
I am fascinated by all of the options but dissatisfied for my own use case/taste (which may be terrible).
I have created a somewhat unique solution that aims to clearly behave for its user, not re-allocate any memory, and be human readable + with comments.
Uploaded to gist.github here: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093
Example:
#include "./strutils.c"
// Fill in the two caller-provided fields: the text to split and the
// delimiter string that separates the pieces.
struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";
// Populates info.splitStrings and info.splitStringsCount.
str_split_begin(&info);
char * substr;
for (int i=0; i<info.splitStringsCount; i++) {
substr = info.splitStrings[i];
printf("substring: '%s'\n", substr);
}
// Must be called once the results are no longer needed; presumably it
// releases what str_split_begin allocated (its body is not shown here).
str_split_end(&info);
Output:
$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'
Full source of strutils.c
#ifndef STRUTILS_C
#define STRUTILS_C 1
#ifndef str
#define str char *
#endif
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
/* Input and output of one split operation.
 * The caller fills in source and delimiter, then calls str_split_begin,
 * which fills in the remaining fields.
 */
struct str_split_info {
/* The string to be split.
* Provided by the caller of str_split_begin.
*/
str source;
/* The string that cuts the source string; occurrences of this string
* do not appear in the resulting substrings.
* Provided by the caller of str_split_begin.
*/
str delimiter;
/* Array of the substrings found between delimiters.
* Allocated and filled by str_split_begin.
* Must be released by calling str_split_end.
*/
str * splitStrings;
/* Array with the length of each entry of splitStrings.
* Allocated and filled by str_split_begin.
* Must be released by calling str_split_end.
*/
int * splitStringsLengths;
/* Number of entries in splitStrings (and splitStringsLengths).
* Set by str_split_begin.
*/
int splitStringsCount;
};
#define str_split_infop struct str_split_info *
/* Split info->source on every occurrence of info->delimiter.
 *
 * Inputs (set by the caller): info->source and info->delimiter, both
 * NUL-terminated. Neither string is modified.
 *
 * Outputs (set here): info->splitStrings holds one freshly allocated,
 * NUL-terminated copy of each substring, info->splitStringsLengths holds the
 * length of each substring and info->splitStringsCount the number of entries
 * (delimiter occurrences + 1; a leading or trailing delimiter yields an empty
 * substring). An empty delimiter never matches, so the result is a single
 * copy of the source.
 *
 * If an allocation fails, everything allocated so far is released, both
 * arrays are set to NULL and splitStringsCount is 0.
 *
 * The caller is responsible only for calling str_split_end
 * when finished with the results in 'info'.
 */
void str_split_begin (str_split_infop info) {
    const char *source = info->source;
    const char *delimiter = info->delimiter;
    const size_t delimiterLength = strlen(delimiter);
    const char *cursor;
    const char *match;
    int count = 1;
    int index;

    info->splitStrings = NULL;
    info->splitStringsLengths = NULL;
    info->splitStringsCount = 0;

    /* first pass: count occurrences so each array is allocated only once */
    if (delimiterLength > 0) {
        for (cursor = source;
             (match = strstr(cursor, delimiter)) != NULL;
             cursor = match + delimiterLength) {
            count++;
        }
    }

    /* calloc zero-fills, so the failure path can free every slot safely */
    info->splitStrings = calloc((size_t) count, sizeof *info->splitStrings);
    info->splitStringsLengths = calloc((size_t) count, sizeof *info->splitStringsLengths);
    if (info->splitStrings == NULL || info->splitStringsLengths == NULL) {
        goto fail;
    }

    /* second pass: copy every substring, including the one after the last delimiter */
    cursor = source;
    for (index = 0; index < count; index++) {
        size_t length;

        /* only the last substring has no delimiter behind it */
        match = (index + 1 < count) ? strstr(cursor, delimiter) : NULL;
        length = (match != NULL) ? (size_t) (match - cursor) : strlen(cursor);

        /* +1 for the terminating NUL */
        info->splitStrings[index] = malloc(length + 1);
        if (info->splitStrings[index] == NULL) {
            goto fail;
        }
        memcpy(info->splitStrings[index], cursor, length);
        info->splitStrings[index][length] = '\0';
        info->splitStringsLengths[index] = (int) length;

        if (match != NULL) {
            cursor = match + delimiterLength;
        }
    }

    info->splitStringsCount = count;
    return;

fail:
    if (info->splitStrings != NULL) {
        for (index = 0; index < count; index++) {
            free(info->splitStrings[index]);
        }
    }
    free(info->splitStrings);
    free(info->splitStringsLengths);
    info->splitStrings = NULL;
    info->splitStringsLengths = NULL;
    info->splitStringsCount = 0;
}
/* Report how many substrings 'info' currently holds. */
int str_split_count (str_split_infop info) {
    const int substringCount = info->splitStringsCount;

    return substringCount;
}
/* Copy every split substring into the caller's buffers.
 *
 * out must provide one writable buffer per substring, each large enough for
 * the corresponding string plus its terminating NUL; no bounds are checked.
 */
void str_split_get (str_split_infop info, str * out) {
    int index = 0;

    while (index < info->splitStringsCount) {
        strcpy(out[index], info->splitStrings[index]);
        index++;
    }
}
/* Release everything str_split_begin allocated in 'info'.
 *
 * Does nothing when the count is not positive or the string array is NULL.
 * Afterwards splitStringsCount is 0, so a repeated call is a no-op.
 */
void str_split_end (str_split_infop info) {
    int index;

    if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
        return;
    }

    /* the individual substrings first ... */
    index = 0;
    while (index < info->splitStringsCount) {
        free(info->splitStrings[index]);
        index++;
    }

    /* ... then the two arrays themselves */
    free(info->splitStrings);
    free(info->splitStringsLengths);
    info->splitStringsCount = 0;
}
/* Smoke test: split a sentence on single spaces, walk the results, clean up. */
void str_split_test () {
    struct str_split_info info;
    int visited;

    info.source = "hello world this is a test";
    info.delimiter = " ";
    str_split_begin (&info);

    /* Walk the substrings (info.splitStrings[visited]); they are only valid
     * until str_split_end is called. */
    visited = 0;
    while (visited < info.splitStringsCount) {
        visited++;
    }

    str_split_end(&info);
}
#endif
I tried to make a very simple one. I am also showing example in the main().
#include <stdio.h>
#include <string.h>
/* Tokenize inputArr in place with strtok and store a pointer to each token
 * in outputArr, in order.
 *
 * inputArr is modified (delimiters become NUL bytes). outputArr must have
 * room for every token; no terminating entry is written after the last one.
 */
void split(char* inputArr, char** outputArr, char* delim) {
    char **slot = outputArr;
    char *piece = strtok(inputArr, delim);

    while (piece != NULL) {
        *slot++ = piece;
        piece = strtok(NULL, delim);
    }
}
/* Read the file named by argv[1] line by line, split each line on spaces and
 * print up to the first four tokens of every line.
 *
 * Returns 1 if the file cannot be opened, 0 otherwise.
 */
int main(int argc, char **argv){
    /* check for proper arguments */
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {
        enum { BUFFER_LEN = 100, MAX_TOKENS = 100, SHOWN_TOKENS = 4 };
        char buffer[BUFFER_LEN];
        char *splitArr[MAX_TOKENS];
        FILE *myScriptFile;
        int i;

        printf("\n");
        /*---------main code starts here----------*/
        myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            perror(argv[1]);
            return 1;
        }
        /* read txt file and split into array like java split() */
        while(fgets(buffer, BUFFER_LEN, myScriptFile) != NULL){
            /* split() writes no terminator, so clear the slots first; this
             * keeps tokens of a previous line from leaking into this one */
            for (i = 0; i < MAX_TOKENS; i++) {
                splitArr[i] = NULL;
            }
            split(buffer, splitArr, " ");
            /* a line may hold fewer tokens than we want to show */
            for (i = 0; i < SHOWN_TOKENS && splitArr[i] != NULL; i++) {
                printf("Index %d String: %s\n", i, splitArr[i]);
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}
Assume a .txt file has
Hello this is test
Hello2 this is test2
running it with a .txt file as a parameter would give
Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test
Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

How split word in .csv by comma? [duplicate]

How do I write a function to split and return an array for a string with delimiters in the C programming language?
char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');
You can use the strtok() function to split a string (and specify the delimiter to use). Note that strtok() will modify the string passed into it. If the original string is required elsewhere make a copy of it and pass the copy to strtok().
EDIT:
Example (note it does not handle consecutive delimiters, "JAN,,,FEB,MAR" for example):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/* Split a_str on a_delim and return a NULL-terminated array of token copies.
 *
 * a_str is tokenized with strtok, so it is modified (delimiters are replaced
 * by NUL bytes) and empty fields are skipped: leading, trailing and
 * consecutive delimiters produce no tokens.
 *
 * Returns NULL if a_str is NULL or an allocation fails. Otherwise the caller
 * owns the result and must free() every token and then the array itself.
 */
char** str_split(char* a_str, const char a_delim)
{
    char** result;
    size_t count = 0;
    size_t idx = 0;
    const char* scan;
    char* token;
    int in_token = 0;
    char delim[2];

    if (a_str == NULL)
    {
        return NULL;
    }

    delim[0] = a_delim;
    delim[1] = 0;

    /* Count tokens the same way strtok will find them: every run of
       non-delimiter characters is one token. */
    for (scan = a_str; *scan; scan++)
    {
        if (*scan == a_delim)
        {
            in_token = 0;
        }
        else if (!in_token)
        {
            in_token = 1;
            count++;
        }
    }

    /* One extra slot for the terminating NULL so the caller knows where the
       list of returned strings ends. */
    result = malloc(sizeof(*result) * (count + 1));
    if (result == NULL)
    {
        return NULL;
    }

    for (token = strtok(a_str, delim); token; token = strtok(NULL, delim))
    {
        size_t size = strlen(token) + 1;

        assert(idx < count);
        result[idx] = malloc(size);
        if (result[idx] == NULL)
        {
            /* Undo the partial result instead of returning a broken list. */
            while (idx > 0)
            {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        memcpy(result[idx], token, size);
        idx++;
    }

    assert(idx == count);
    result[idx] = NULL;
    return result;
}
/* Demo: split a comma-separated list of month names, print every token and
 * release each one as soon as it has been printed. */
int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char** tokens;
    char** cursor;

    printf("months=[%s]\n\n", months);

    tokens = str_split(months, ',');
    if (!tokens)
    {
        return 0;
    }

    /* The list ends at the first NULL entry. */
    cursor = tokens;
    while (*cursor)
    {
        printf("month=[%s]\n", *cursor);
        free(*cursor);
        cursor++;
    }
    printf("\n");
    free(tokens);

    return 0;
}
Output:
$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]
month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]
I think strsep is still the best tool for this:
while ((token = strsep(&str, ","))) my_fn(token);
That is literally one line that splits a string.
The extra parentheses are a stylistic element to indicate that we're intentionally testing the result of an assignment, not an equality operator ==.
For that pattern to work, token and str both have type char *. If you started with a string literal, then you'd want to make a copy of it first:
// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;
tofree = str = strdup(my_str_literal); // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);
If two delimiters appear together in str, you'll get a token value that's the empty string. The value of str is modified in that each delimiter encountered is overwritten with a zero byte - another good reason to copy the string being parsed first.
In a comment, someone suggested that strtok is better than strsep because strtok is more portable. Ubuntu and Mac OS X have strsep; it's safe to guess that other unixy systems do as well. Windows lacks strsep, but it has strpbrk, which enables this short and sweet strsep replacement:
/* Drop-in strsep replacement built on strpbrk.
 *
 * Returns the token starting at *stringp, or NULL when *stringp is NULL.
 * The first character of the token that appears in delim is overwritten
 * with NUL and *stringp is advanced just past it; if no delimiter is left,
 * *stringp becomes NULL so the next call ends the iteration.
 */
char *strsep(char **stringp, const char *delim) {
    char *token = *stringp;
    char *hit;

    if (token == NULL) {
        return NULL;
    }

    hit = strpbrk(token, delim);
    if (hit == NULL) {
        *stringp = NULL;
    } else {
        *hit = '\0';
        *stringp = hit + 1;
    }
    return token;
}
Here is a good explanation of strsep vs strtok. The pros and cons may be judged subjectively; however, I think it's a telling sign that strsep was designed as a replacement for strtok.
A string tokenizer: this code should point you in the right direction.
/* Print every word of the sentence on its own line, treating spaces and
 * commas as separators, then wait for input before exiting. */
int main(void) {
    char st[] ="Where there is will, there is a way.";
    /* use the same delimiter set for the first and all later strtok calls */
    const char *delims = " ,";
    char *ch;

    for (ch = strtok(st, delims); ch != NULL; ch = strtok(NULL, delims)) {
        printf("%s\n", ch);
    }
    /* standard replacement for the non-portable getch() */
    getchar();
    return 0;
}
Method below will do all the job (memory allocation, counting the length) for you. More information and description can be found here - Implementation of Java String.split() method to split C string
/* Split str on every occurrence of c, like Java's String.split().
 *
 * On return *arr points to a newly allocated array of newly allocated,
 * NUL-terminated token copies; consecutive, leading or trailing delimiters
 * yield empty strings. str itself is not modified.
 *
 * Returns the number of tokens (delimiter count + 1, so at least 1).
 * The process exits with status 1 if an allocation fails.
 * The caller must free() every (*arr)[i] and then *arr.
 */
int split (const char *str, char c, char ***arr)
{
    int count = 1;
    int i;
    const char *p;
    const char *start;

    /* one token more than there are delimiters */
    for (p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    *arr = malloc(sizeof **arr * (size_t) count);
    if (*arr == NULL)
        exit(1);

    start = str;
    for (i = 0; i < count; i++)
    {
        const char *end = start;
        size_t len;

        while (*end != '\0' && *end != c)
            end++;
        len = (size_t) (end - start);

        (*arr)[i] = malloc(len + 1);
        if ((*arr)[i] == NULL)
            exit(1);
        memcpy((*arr)[i], start, len);
        /* every token is terminated, including the last one */
        (*arr)[i][len] = '\0';

        /* step over the delimiter that ended this token, if any */
        if (*end != '\0')
            start = end + 1;
        else
            start = end;
    }
    return count;
}
How to use it:
/* Demo: split a sentence on spaces, print the tokens, release the memory. */
int main (int argc, char ** argv)
{
    int i;
    char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    (void) argc;
    (void) argv;

    c = split(s, ' ', &arr);
    printf("found %d tokens.\n", c);
    for (i = 0; i < c; i++)
    {
        printf("string #%d: %s\n", i, arr[i]);
        /* split() allocates every token separately */
        free(arr[i]);
    }
    free(arr);
    return 0;
}
Here is my two cents:
/* Split txt on every occurrence of delim.
 *
 * On success *tokens receives a newly allocated array of newly allocated,
 * NUL-terminated token copies (empty fields become empty strings) and the
 * number of tokens, which is at least 1, is returned. txt is not modified.
 *
 * If an allocation fails, everything is released, *tokens is set to NULL and
 * 0 is returned.
 *
 * The caller must free() every (*tokens)[i] and then *tokens.
 */
int split (const char *txt, char delim, char ***tokens)
{
    int count = 1;
    int i;
    const char *p;
    char **arr;

    *tokens = NULL;

    for (p = txt; *p != '\0'; p++)
    {
        if (*p == delim) count += 1;
    }

    arr = calloc ((size_t) count, sizeof *arr);
    if (arr == NULL) return 0;

    p = txt;
    for (i = 0; i < count; i++)
    {
        size_t len = 0;

        while (p[len] != '\0' && p[len] != delim) len++;

        /* calloc zero-fills, which also terminates the copy */
        arr[i] = calloc (len + 1, sizeof (char));
        if (arr[i] == NULL)
        {
            while (i > 0) free (arr[--i]);
            free (arr);
            return 0;
        }
        memcpy (arr[i], p, len);

        p += len;
        if (*p != '\0') p++;   /* skip the delimiter */
    }

    *tokens = arr;
    return count;
}
Usage:
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
/* freeing tokens */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
In the above example, there would be a way to return an array of null terminated strings (like you want) in place in the string. It would not make it possible to pass a literal string though, as it would have to be modified by the function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/* Split str in place on delim and return an array of pointers into str.
 *
 * Every delimiter in str is overwritten with NUL. The returned array holds
 * one pointer per field (delimiter count + 1, empty fields included) and is
 * followed by a NULL sentinel. Only the array itself must be free()d; the
 * fields live inside str.
 *
 * If numSplits is not NULL it receives the number of fields, or -1 on error.
 * Returns NULL if str is NULL, delim is '\0', or the allocation fails.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;
    char* c;

    if ( ( str != NULL ) &&
         ( delim != '\0' ) )
    {
        size_t delims = 0;

        /* Pre-calculate number of elements */
        for ( c = str; *c != '\0'; c++ )
        {
            if ( *c == delim )
            {
                delims++;
            }
        }

        /* delims + 1 fields, plus one slot for the NULL sentinel */
        ret = malloc( ( delims + 2 ) * sizeof( *ret ) );
        if ( ret != NULL )
        {
            retLen = 0;
            ret[retLen++] = str;
            for ( c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    *c = '\0';
                    ret[retLen++] = &c[1];
                }
            }
            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }

    return ret;
}
/* Demo: copy a string literal into writable memory, split the copy in place
 * and print the fields. */
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char* strCpy;
    char** split;
    int num;
    int i;

    (void) argc;
    (void) argv;

    /* +1: strcpy also writes the terminating NUL */
    strCpy = malloc( ( strlen( str ) + 1 ) * sizeof( *strCpy ) );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );
    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );
        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    free( split );
    free( strCpy );
    return 0;
}
There is probably a neater way to do it, but you get the idea.
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/**
 * Splits str on delim in place and dynamically allocates an array of
 * pointers to the fields.
 *
 * Returns 0 on success and -1 if the pointer array cannot be allocated
 * (check errno); str is left untouched in that case.
 *
 * On success *array receives the pointer array and *length the number of
 * entries: 0 for an empty string, 1 if no delim was found, otherwise the
 * number of delimiters plus one. Release the array with free(); the entries
 * point into str and must not be freed individually.
 *
 * An empty field is stored as NULL because it is valueless, so "foo,,bar"
 * yields { "foo", NULL, "bar" } and a trailing delim as in "foo," yields the
 * two-entry array { "foo", NULL }.
 *
 * Modifies str (every delim is overwritten with NUL), so make a copy if this
 * is a problem.
 */
int split( char * str, char delim, char ***array, int *length ) {
    char *p;
    char *start;
    char **res;
    int count;
    int k = 0;

    // Count the fields first, without touching str yet
    count = ( *str != '\0' ) ? 1 : 0;
    for( p = str; *p != '\0'; p++ ) {
        if( *p == delim ) count++;
    }

    // allocate dynamic array; never request zero bytes, calloc may then
    // legitimately return NULL and look like a failure
    res = calloc( count > 0 ? (size_t)count : 1, sizeof(char *) );
    if( !res ) return -1;

    start = str;
    for( p = str; count > 0; p++ ) {
        int at_end = ( *p == '\0' );
        if( !at_end && *p != delim ) continue;
        *p = '\0';                          // Null terminate the field
        if( *start != '\0' ) res[k] = start; // Empty fields stay NULL
        k++;
        start = p + 1;                      // Start of next field
        if( at_end ) break;
    }

    *array = res;
    *length = count;
    return 0;
}
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";

/* Demo: split the month list in place and print every field. */
int main() {
    char **res = NULL;
    int k=0;
    int count =0;
    int rc;

    rc = split( str, ',', &res, &count );
    if( rc ) {
        printf("Error: %s errno: %d \n", strerror(errno), errno);
        /* res and count are not valid after a failure */
        return 1;
    }
    printf("count: %d\n", count );
    for( k=0; k<count; k++ ) {
        /* split() reports empty fields as NULL; never hand NULL to %s */
        printf("str: %s\n", res[k] ? res[k] : "(empty)");
    }
    free(res );
    return 0;
}
I think the following solution is ideal:
Doesn't destroy the source string
Re-entrant - i.e., you can safely call it from anywhere in one or more threads
Portable
Handles multiple separators correctly
Fast and efficient
Explanation of the code:
Define a structure token to store the address and lengths of the tokens
Allocate enough memory for these in the worst case, which is when
str is made up entirely of separators so there are strlen(str) + 1
tokens, all of them empty strings
Scan str recording the address and length of every token
Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
Allocate, copy, and add the tokens using the start and length
information - use memcpy as it's faster than strcpy and we know
the lengths
Free the token address and length array
Return the array of tokens
/* Start address and length of one token inside the source string. */
typedef struct {
    const char *start;
    size_t len;
} token;

/* Split str on every occurrence of sep without modifying str.
 *
 * Returns a newly allocated, NULL-terminated array of newly allocated token
 * copies; empty fields (leading, trailing or consecutive separators) become
 * empty strings, so the array always holds at least one token.
 * Returns NULL if an allocation fails.
 *
 * The caller must free() every token and then the array.
 */
char **split(const char *str, char sep)
{
    char **array;
    size_t start = 0, stop, toks = 0, t;
    /* Worst case: str consists only of separators, i.e. strlen(str) + 1
       (empty) tokens. */
    token *tokens = malloc((strlen(str) + 1) * sizeof(token));

    if (tokens == NULL) {
        return NULL;
    }

    /* Record where every token starts and how long it is */
    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;

    /* One extra slot for the NULL sentinel */
    array = malloc((toks + 1) * sizeof(char*));
    if (array != NULL) {
        for (t = 0; t < toks; t++) {
            /* Calloc makes it nul-terminated */
            char *copy = calloc(tokens[t].len + 1, 1);
            if (copy == NULL) {
                /* Roll back so the caller never sees a partial array */
                while (t > 0) {
                    free(array[--t]);
                }
                free(array);
                array = NULL;
                break;
            }
            memcpy(copy, tokens[t].start, tokens[t].len);
            array[t] = copy;
        }
        if (array != NULL) {
            /* Add a sentinel */
            array[toks] = NULL;
        }
    }
    free(tokens);
    return array;
}
Note malloc checking omitted for brevity.
In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.
My version:
/* Split str on runs of delimeter; empty fields are skipped.
 *
 * *args receives a newly allocated array of newly allocated, NUL-terminated
 * token copies and the token count is returned. The count is never below 1:
 * a string without any token yields a single empty string. str is only read.
 * Allocation results are not checked.
 */
int split(char* str, const char delimeter, char*** args) {
    char* cursor = str;
    char* probe;
    int total = 1;
    int slot;

    /* ignore leading delimiters */
    while (*cursor == delimeter) cursor++;

    /* a new token starts wherever a delimiter is directly followed by a
       character that is neither a delimiter nor the end of the string */
    for (probe = cursor; *probe != 0; probe++) {
        if (probe[1] == delimeter && probe[2] != delimeter && probe[2] != 0) {
            total++;
        }
    }

    *args = malloc(sizeof(char*) * total);

    for (slot = 0; slot < total; slot++) {
        char* begin = cursor;
        int bytes;

        while (*cursor != delimeter && *cursor != 0) cursor++;
        bytes = (cursor - begin + 1);   /* token length plus terminator */

        (*args)[slot] = malloc(sizeof(char) * bytes);
        memcpy((*args)[slot], begin, sizeof(char) * (bytes - 1));
        (*args)[slot][bytes - 1] = 0;

        /* skip the delimiter run in front of the next token */
        while (*cursor == delimeter) cursor++;
    }
    return total;
}
This function takes a char* string and splits it by the delimiter. There can be multiple delimiters in a row. Note that the function modifies the original string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls, so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the c++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory-optimized way shown below is O(2N).
The function:
// Split src_str in place on runs of deliminator.
//
// Every deliminator in src_str is overwritten with NUL. num_sub_str receives
// the number of non-empty substrings found. Returns a malloc'ed,
// NULL-terminated array of pointers into src_str, or 0 if no substring was
// found or the allocation failed. Only the array must be free()d; the
// substrings live inside src_str.
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
  //replace deliminator's with zeros and count how many
  //sub strings with length >= 1 exist
  num_sub_str = 0;
  char *src_str_tmp = src_str;
  bool found_delim = true;
  while(*src_str_tmp){
    if(*src_str_tmp == deliminator){
      *src_str_tmp = 0;
      found_delim = true;
    }
    else if(found_delim){ //found first character of a new string
      num_sub_str++;
      found_delim = false;
      //sub_str_vec.push_back(src_str_tmp); //for c++
    }
    src_str_tmp++;
  }
  //%zu is the conversion that matches size_t
  printf("Start - found %zu sub strings\n", num_sub_str);
  if(num_sub_str == 0){
    printf("str_split() - no substrings were found\n");
    return(0);
  }

  //if you want to use a c++ vector and push onto it, the rest of this function
  //can be omitted (obviously modifying input parameters to take a vector, etc)

  //one pointer per substring plus one for the NULL sentinel
  char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
  if(!sub_strings)
    return(0);
  const char *src_str_terminator = src_str_tmp;
  src_str_tmp = src_str;
  bool found_null = true;
  size_t idx = 0;
  while(src_str_tmp < src_str_terminator){
    if(!*src_str_tmp) //found a NUL left by the first pass
      found_null = true;
    else if(found_null){
      sub_strings[idx++] = src_str_tmp;
      found_null = false;
    }
    src_str_tmp++;
  }
  sub_strings[num_sub_str] = NULL;

  return(sub_strings);
}
How to use it:
char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
char *str = strdup(months);
size_t num_sub_str;
char **sub_strings = str_split(str, ',', num_sub_str);
char *endptr;
if(sub_strings){
for(int i = 0; sub_strings[i]; i++)
printf("[%s]\n", sub_strings[i]);
}
free(sub_strings);
free(str);
This optimized method create (or update an existing) array of pointers in *result and returns the number of elements in *count.
Use "max" to indicate the maximum number of strings you expect (when you pass an existing array, or for any other reason); otherwise set it to 0.
To compare against a list of delimiters, define delim as a char* and replace the line:
if (str[i]==delim) {
with the two following lines:
char *c=delim; while(*c && *c!=str[i]) c++;
if (*c) {
Enjoy
#include <stdlib.h>
#include <string.h>
/* Split the first len bytes of str in place on delim.
 *
 * Every delimiter that is split on is overwritten with NUL. *count receives
 * the number of strings, which is at least 1.
 *
 * If *result is non-NULL it is used as the output array and must have room
 * for every string that can be produced (max entries when max is non-zero).
 * Otherwise an array of *count pointers is malloc'ed, stored in *result and
 * must be free()d by the caller.
 *
 * max, when non-zero, is the largest number of strings to produce; the last
 * string then keeps the unsplit remainder of the input.
 *
 * Returns the output array, or NULL if the allocation fails. The strings
 * themselves point into str.
 */
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
    size_t i;
    char **out = *result;

    // there is at least one string returned
    *count = 1;

    // when the result array is specified, fill it during the first pass
    if (out) {
        out[0] = str;
    }

    // scan the string for delimiter, up to specified length
    for (i = 0; i < len; ++i) {
        // to compare against a list of delimiters,
        // define delim as a string and replace
        // the next line:
        //     if (str[i] != delim) {
        //
        // with the two following lines:
        //     char *c=delim; while(*c && *c!=str[i]) c++;
        //     if (!*c) {
        //
        if (str[i] != delim) {
            continue;
        }

        // if max is specified and already reached, do not split any further;
        // checking before the write keeps max == 1 inside the caller's array
        if (max && *count >= max) {
            break;
        }

        // replace delimiter with zero
        str[i] = 0;

        // when result array is specified, fill it during the first pass
        if (out) {
            out[*count] = str + i + 1;
        }

        // increment count for each separator found
        ++(*count);
    }

    // when result array is specified, we are done here
    if (out) {
        return out;
    }

    // else allocate memory for result
    // and fill the result array
    out = malloc((*count) * sizeof(char*));
    *result = out;
    if (!out) {
        return NULL;
    }

    // add first string to result
    out[0] = str;

    // if theres more strings
    for (i = 1; i < *count; ++i) {
        // find next string: step over the current one and its terminator
        while (*str) ++str;
        ++str;

        // add next string to result
        out[i] = str;
    }

    return out;
}
Usage example:
#include <stdio.h>
/* Demo: split the month list once into a caller-supplied array limited to
 * six entries and once into an array allocated by split(). */
int main(int argc, char **argv) {
    const char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* split() works in place, so each call gets its own writable copy */
    char *copy=strdup(str);
    char *copy2=strdup(str);
    char **result=malloc(6*sizeof(char*));
    char **result2=0;
    unsigned long count=0;
    unsigned long count2=0;
    unsigned long i;

    (void)argc;
    (void)argv;

    if (!copy || !copy2 || !result) {
        free(result);
        free(copy2);
        free(copy);
        return 1;
    }

    split(copy,strlen(copy),',',&result,&count,6);
    split(copy2,strlen(copy2),',',&result2,&count2,0);

    if (result)
        for (i=0; i<count; ++i) {
            printf("%s\n",result[i]);
        }

    printf("\n");

    if (result2)
        for (i=0; i<count2; ++i) {
            printf("%s\n", result2[i]);
        }

    /* the arrays only point into the copies, so free both layers */
    free(result2);
    free(result);
    free(copy2);
    free(copy);
    return 0;
}
Below is my strtok() implementation from zString library.
zstring_strtok() differs from standard library's strtok() in the way it treats consecutive delimiters.
Just have a look at the code below; I am sure you will get an idea of how it works (I tried to use as many comments as I could).
/* strtok variant that reports consecutive delimiters instead of skipping them.
 *
 * Pass the string on the first call and 0 on later calls to continue with
 * the remainder saved in a static pointer. Only delim[0] is used as the
 * delimiter character.
 *
 * Returns 0 if delim is 0 or nothing is left to scan. If the remainder
 * contains no delimiter it is returned as the last token. If the remainder
 * starts with the delimiter, delim itself is returned and one character is
 * consumed. Otherwise the delimiter is overwritten with NUL (so str must be
 * writable) and the token in front of it is returned.
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *static_str=0;      /* where the next call resumes */
    char *cut;                      /* first delimiter inside str  */

    if (delim==0)
        return 0;

    if (str == 0) {
        /* continuing: nothing left means the iteration is over */
        if (static_str == 0)
            return 0;
        str = static_str;
    }

    /* walk to the first delimiter or to the end of the string */
    for (cut = str; *cut != '\0' && *cut != delim[0]; cut++)
        ;

    /* no delimiter left: the remainder is the final token */
    if (*cut == '\0') {
        static_str = 0;
        return str;
    }

    /* the string starts with a delimiter: hand back the delimiter itself */
    if (cut == str) {
        static_str = (str + 1);
        return (char *)delim;
    }

    /* terminate the token and remember what follows the delimiter */
    *cut = '\0';
    static_str = cut + 1;

    return str;
}
Below is an example usage...
Example Usage
/* str must be a writable array: zstring_strtok overwrites delimiters in it */
char str[] = "A,B,,,C";
printf("1 %s\n",zstring_strtok(str,","));
printf("2 %s\n",zstring_strtok(NULL,","));
printf("3 %s\n",zstring_strtok(NULL,","));
printf("4 %s\n",zstring_strtok(NULL,","));
printf("5 %s\n",zstring_strtok(NULL,","));
printf("6 %s\n",zstring_strtok(NULL,","));
Example Output
1 A
2 B
3 ,
4 ,
5 C
6 (null)
The library can be downloaded from Github
https://github.com/fnoyanisi/zString
This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.
This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)
/* Split original on every occurrence of the (possibly multi-character)
 * delimiter.
 *
 * On success *buffer receives a newly allocated array of newly allocated,
 * NUL-terminated substrings, *stringLengths a newly allocated array with the
 * length of each substring and *numStrings the number of substrings
 * (delimiter occurrences + 1; a leading or trailing delimiter yields an
 * empty substring). An empty delimiter never matches, so the result is a
 * single copy of original.
 *
 * If the delimiter is longer than original, or an allocation fails, *buffer
 * and *stringLengths are set to NULL and *numStrings to 0.
 *
 * The caller must free() every (*buffer)[i], then *buffer and
 * *stringLengths. original is not modified.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);
    const char *cursor;
    const char *hit;
    char **strings = NULL;
    int *lengths = NULL;
    int count = 1;
    int index;

    *buffer = NULL;
    *numStrings = 0;
    *stringLengths = NULL;

    if(ld > lo){
        return;
    }

    /* first pass: count the delimiters so each array is allocated once */
    if(ld > 0){
        for(cursor = original; (hit = strstr(cursor, delimiter)) != NULL; cursor = hit + ld){
            count++;
        }
    }

    /* calloc zero-fills, so the failure path can free every slot safely */
    strings = calloc((size_t) count, sizeof *strings);
    lengths = calloc((size_t) count, sizeof *lengths);
    if(strings == NULL || lengths == NULL){
        goto fail;
    }

    /* second pass: copy each substring */
    cursor = original;
    for(index = 0; index < count; index++){
        size_t length;

        /* only the last substring has no delimiter behind it */
        hit = (index + 1 < count) ? strstr(cursor, delimiter) : NULL;
        length = (hit != NULL) ? (size_t) (hit - cursor) : strlen(cursor);

        strings[index] = malloc(length + 1);
        if(strings[index] == NULL){
            goto fail;
        }
        memcpy(strings[index], cursor, length);
        strings[index][length] = 0;
        lengths[index] = (int) length;

        if(hit != NULL){
            cursor = hit + ld;
        }
    }

    *buffer = strings;
    *stringLengths = lengths;
    *numStrings = count;
    return;

fail:
    if(strings != NULL){
        for(index = 0; index < count; index++){
            free(strings[index]);
        }
    }
    free(strings);
    free(lengths);
}
Sample code:
/* Demo: split a string on a multi-character delimiter, print the parts and
 * release everything splitString allocated. */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer;
    int numStrings;
    int * stringLengths;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);
    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
        free(buffer[i]);
    }
    /* both are NULL when nothing was produced; free(NULL) is a no-op */
    free(buffer);
    free(stringLengths);
    return 0;
}
Libraries:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
Try use this.
/* Split str on any of the characters in delim without modifying str.
 *
 * Tokenizes a private copy of str with strtok, so empty fields are skipped.
 * Returns a newly allocated, NULL-terminated array of newly allocated token
 * copies (only the NULL entry when there is no token), or NULL if an
 * argument is NULL or an allocation fails.
 *
 * The caller must free() every token and then the array.
 */
char** strsplit(char* str, const char* delim){
    char** res;
    char* aux;
    char* part;
    size_t n = 0;
    size_t size;

    if(str == NULL || delim == NULL){
        return NULL;
    }

    size = strlen(str) + 1;
    aux = malloc(size);
    res = malloc(sizeof *res);      /* room for the NULL sentinel */
    if(aux == NULL || res == NULL){
        goto fail;
    }
    memcpy(aux, str, size);

    for(part = strtok(aux, delim); part != NULL; part = strtok(NULL, delim)){
        size_t partSize = strlen(part) + 1;
        /* n tokens so far + the new one + the sentinel */
        char** grown = realloc(res, (n + 2) * sizeof *res);

        if(grown == NULL){
            goto fail;
        }
        res = grown;

        res[n] = malloc(partSize);
        if(res[n] == NULL){
            goto fail;
        }
        memcpy(res[n], part, partSize);
        n++;
    }

    res[n] = NULL;
    free(aux);
    return res;

fail:
    while(n > 0){
        free(res[--n]);
    }
    free(res);
    free(aux);
    return NULL;
}
Explode & implode - initial string remains intact, dynamic memory allocation
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One token located by explode(): a view into the source string, not a copy. */
typedef struct
{
/* address of the token's first character inside the source string */
uintptr_t ptr;
/* number of characters in the token (no terminating NUL is counted) */
int size;
} token_t;
/* Locate the tokens of str (slen characters long) separated by the first
 * character of delimiter. str is not modified and nothing is copied: every
 * token_t refers to a range inside str.
 *
 * On success *tokens receives a calloc'ed array that the caller must free()
 * and the number of tokens (separators + 1) is returned.
 * Returns -1 and sets *tokens to NULL if str contains no separator or the
 * allocation fails.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i;
    int separators = 0;
    int current = 0;
    token_t *list;

    /* callers can free(*tokens) unconditionally */
    *tokens = NULL;

    /* only the slen characters of the string itself can be separators */
    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            separators++;
        }
    }
    if(separators == 0)
    {
        return -1;
    }

    list = (token_t*)calloc((size_t)separators + 1, sizeof(token_t));
    if(list == NULL)
    {
        return -1;
    }

    list[0].ptr = (uintptr_t)str;
    for(i = 0; i <= slen; i++)
    {
        /* a token ends at every separator and at the end of the string */
        if((i == slen) || (str[i] == *delimiter))
        {
            list[current].size = (int)((uintptr_t)&(str[i]) - list[current].ptr);
            if(i < slen)
            {
                current++;
                list[current].ptr = (uintptr_t)&(str[i + 1]);
            }
        }
    }

    *tokens = list;
    return (separators + 1);
}
/* Join size tokens into one newly allocated string, separating them with
 * the first character of delimiter.
 *
 * Returns the joined, NUL-terminated string (an empty string when size is
 * not positive), or NULL if the allocation fails. The caller must free()
 * the result.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    int i;
    size_t len = 0;
    char *str;

    /* every token is followed by one separator; the last of those slots
       holds the terminating NUL instead */
    for(i = 0; i < size; i++)
    {
        len += (size_t)tokens[i].size + 1;
    }
    if(len == 0)
    {
        return (char*)calloc(1, sizeof(char));
    }

    str = (char*)calloc(len, sizeof(char));
    if(str == NULL)
    {
        return NULL;
    }

    len = 0;
    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[len], (const void*)tokens[i].ptr, (size_t)tokens[i].size);
        len += (size_t)tokens[i].size;
        str[(len++)] = *delimiter;
    }
    /* replace the trailing separator with the terminator */
    str[len - 1] = '\0';
    return str;
}
Usage:
/* Demo: explode a comma-separated list, implode it again and print both the
 * rebuilt string and every token with its length. */
int main(int argc, char **argv)
{
    int i, c;
    char *source = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* initialised so the frees below are safe when explode() fails */
    token_t *tokens = NULL;
    char *imp = NULL;

    (void)argc;
    (void)argv;

    printf("%s\n", source);
    if((c = explode(source, strlen(source), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }
        for(i = 0; i < c; i++)
        {
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }
    free((void*)tokens);
    free((void*)imp);
    return 0;
}
If you are willing to use an external library, I can't recommend bstrlib enough. It takes a little extra setup, but is easier to use in the long run.
For example, split the string below, one first creates a bstring with the bfromcstr() call. (A bstring is a wrapper around a char buffer).
Next, split the string on commas, saving the result in a struct bstrList, which has fields qty and an array entry, which is an array of bstrings.
bstrlib has many other functions to operate on bstrings
Easy as pie...
#include "bstrlib.h"
#include <stdio.h>
/* Demo: split a comma-separated C string with bstrlib and print each part. */
int main() {
    int i;
    char *tmp = "Hello,World,sak";
    bstring bstr = bfromcstr(tmp);
    struct bstrList *blist = bsplit(bstr, ',');

    if (blist == NULL) {
        bdestroy(bstr);
        return 1;
    }

    printf("num %d\n", blist->qty);
    for(i=0;i<blist->qty;i++) {
        /* bstr2cstr allocates a new C string for every call */
        char *text = bstr2cstr(blist->entry[i], '_');
        printf("%d: %s\n", i, text ? text : "");
        bcstrfree(text);
    }

    bstrListDestroy(blist);
    bdestroy(bstr);
    return 0;
}
Late to the party I know, but here's 2 more functions to play with and probably further adjust to your needs (source code at the bottom of the post)
See also the Implementation Notes, further below, to decide which function suits your needs better.
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h> // C99
// tokenize destructively: the source string is tokenized in place and the
// returned pointers point into it.
// Returns an allocated array of token pointers followed by a NULL sentinel,
// or NULL on failure. A single free() of the array releases the result.
char **str_toksarray_alloc(
char **strp, /* InOut: pointer to the source non-constant c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
bool keepnulls /* false ignores empty tokens, true includes them */
);
// tokenize non-destructively: the source string is left unaltered and the
// returned pointers point to dynamically allocated copies of the tokens.
// Returns an allocated array of token pointers followed by a NULL sentinel,
// or NULL on failure. Every token and then the array must be freed
// (toksarray_free2() does both).
char **str_toksarray_alloc2(
const char *str, /* the source c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
bool keepnulls /* false ignores empty tokens, true includes them */
);
Usage Notes
Their prototypes are almost identical, except for the source-string (strp and str, respectively).
strp (pointer to string) is the address of an already allocated, non-constant c-string, to be tokenized in-place. str is a c-string which is not altered (it can even be a string-literal). By c-string I mean a nul-terminated buffer of chars. The rest of the arguments are the same for both functions.
To parse all available tokens, mute ntoks (meaning set it to 0 before passing it to any of the functions or pass it as a NULL pointer). Else the functions parse up to *ntoks tokens, or until there are no more tokens (whichever comes first). In any case, when ntoks is non-NULL it gets updated with the count of successfully parsed tokens.
Note also that a non-muted ntoks determines how many pointers will be allocated. Thus if the source string contains say 10 tokens and we set ntoks to 1000, we'll end up with 990 needlessly allocated pointers. On the other hand, if the source-string contains say 1000 tokens but we only need the first 10, setting ntoks to 10 sounds like a much wiser choice.
Both functions allocate and return an array of char-pointers, but str_toksarray_alloc() makes them point to the tokens in the modified source-string itself, while str_toksarray_alloc2() makes them point to dynamically allocated copies of the tokens (that 2 at the end of its name indicates the 2-levels of allocation).
The returned array is appended with a NULL sentinel pointer, which is not taken into account in the passed-back value of ntoks (put otherwise, when non-NULL, ntoks passes-back to the caller the length of the returned array, not its 1st level size).
When keepnulls is set to true, the resulting tokens are similar to what we'd expect from the strsep() function. Mostly meaning that consecutive delimiters in the source-string produce empty tokens (nulls), and if delim is an empty c-string or none of its contained delimiter-chars were found in the source string, the result is just 1 token: the source string. Contrary to strsep(), empty tokens can be ignored by setting keepnulls to false.
Failed calls of the functions can be identified by checking their return value against NULL, or by checking the passed-back value of ntoks against 0 (provided ntoks was non-NULL). I suggest always checking against failure before attempting to access the returned array, because the functions include sanity checks which can postpone otherwise immediate crashes (for example, passing a NULL pointer as the source string).
On success, the caller should free the array when they're done with it.
For str_toksarray_alloc(), a simple free() is enough. For str_toksarray_alloc2() a loop is involved, due to the 2nd level of allocation. The NULL sentinel (or the passed-back value of a non-NULL ntoks) makes this trivial, but I'm also providing a toksarray_free2() function below, for all the lazy bees out there :)
Simplified examples using both functions follow.
Prep:
// Inputs shared by the two usage examples that follow.
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
// Every character of delim acts as a separate delimiter.
const char *delim = ";,";
// true keeps empty tokens; false drops them.
bool keepnulls = true;
// 0 means "muted": no token limit is requested by the caller.
size_t ntoks = 0;
str_toksarray_alloc():
// destructive (use copy of src)
char *scopy = strdup( src );
if (!scopy) { ... }; // handle strdup failure
printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
// ntoks is a size_t, so it must be printed with %zu (%lu is a type
// mismatch on platforms where size_t is not unsigned long)
printf( "%zu tokens read\n", ntoks );
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
// the tokens point into scopy, so release the string and the array together
free( scopy );
free( arrtoks );
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
*/
str_toksarray_alloc2():
// non-destructive
keepnulls = false; // reject empty tokens
printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
// ntoks is a size_t, so it must be printed with %zu (%lu is a type
// mismatch on platforms where size_t is not unsigned long)
printf( "%zu tokens read\n", ntoks );
if ( arrtoks ) {
    for (int i=0; arrtoks[i]; i++) {
        printf( "%d: %s\n", i, arrtoks[i] );
    }
}
toksarray_free2( arrtoks ); // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks ); // non-dangling arrtoks
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/
Implementation Notes
Both functions use strsep() for the tokenization which makes them thread-safe, but it's not a standard function. If not provided, you can always use an open-source implementation (like GNU's or Apple's for example). Same goes for the function strdup() which is used in str_toksarray_alloc2() (its implementation is trivial but again here's GNU's and Apple's for example).
A side-effect of using strsep() in str_toksarray_alloc() is that the starting pointer of the source-string keeps moving to the next token in every step of the parsing loop. This means that the caller won't be able to free the parsed string, unless they had saved the starting address to an extra pointer. We save them the hassle, by doing that locally in the function, using the strpSaved pointer. str_toksarray_alloc2() is not affected by this, because it doesn't touch the source-string.
A main difference between the 2 functions is that str_toksarray_alloc() does not allocate memory for the found tokens. It rather allocates space just for the array pointers and sets them pointing directly into the source-string. This works because strsep() nul-terminates the found tokens in-place. This dependency can complicate your supporting code, but with big strings it can also make a big difference in performance. If preserving the source-string is not important, it can make a big difference in memory footprint too.
On the other hand, str_toksarray_alloc2() allocates and returns a self sustained array of dynamically allocated copies of the tokens, without further dependencies. It does so firstly by creating the array from a local duplicate of the source-string, and secondly by duplicating the actual tokens contents into the array. This is a lot slower and leaves a much bigger memory footprint compared to str_toksarray_alloc(), but it has no further dependencies, and sets no special requirements for the nature of the source-string. This makes it easier to write simpler (hence better maintainable) supporting code.
Another difference between the 2 functions is the 1st level of allocation (the array pointers) when ntoks is muted. They both parse all available tokens, but they take quite different approaches. str_toksarray_alloc() uses alloc-ahead with an initial size of 16 (char-pointers), doubling it on demand in the parsing loop. str_toksarray_alloc2() makes a 1st pass counting all available tokens, then it allocates that many char-pointers just once. That 1st pass is done with a helper function str_toksfound() which uses the standard functions strpbrk() and strchr(). I'm providing the source-code of that function too, further below.
Which approach is better is really up to you to decide, depending on the needs of your project. Feel free to adjust the code of each function to either approach and take it from there.
I'd say that on average and for really big strings alloc-ahead is much faster, especially when the initial size and grow factor are fine tuned on a per-case basis (making them function parameters for example). Saving that extra pass with all those strchr()'s and strpbrk()'s can make a difference there. However, with relatively small strings which is pretty much the norm, allocing-ahead just a bunch of char-pointers is just an overkill. It doesn't hurt but it does clutter the code for no good reason in this case. Anyway, feel free to choose whichever suits you best.
Same goes for these 2 functions. I'd say in most cases str_toksarray_alloc2() is much simpler to cope with, since memory and performance are rarely an issue with small to medium strings. If you have to deal with huge strings, then consider using str_toksarray_alloc() (though in those cases you should roll a specialized string parsing function, close to the needs of your project and the specs of your input).
Oh boy, I think that was a bit more than just 2 cents (lol).
Anyway, here is the code of the 2 functions and the helper ones (I've removed most of their description comments, since I've covered pretty much everything already).
Source Code
str_toksarray_alloc():
// ----------------------------------------
// Tokenize destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of char-pointers
// each pointing to each token found in the source-string, or NULL on error.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    /*
     * strp      in/out: address of the string to cut up in place; *strp is
     *           put back to its starting value before returning
     * delim     set of delimiter characters
     * ntoks     optional; non-zero on entry caps the number of tokens,
     *           on exit receives the number of tokens stored (0 on failure)
     * keepnulls when false, empty tokens are dropped
     *
     * Returns a malloc'ed, NULL-terminated array of pointers into *strp,
     * or NULL on bad arguments / allocation failure.
     */
    char **result = NULL;
    size_t count = 0;

    if ( strp != NULL && *strp != NULL && **strp != '\0' && delim != NULL ) {
        char *const origin = *strp;                           /* strsep() moves *strp */
        const bool limited = (ntoks != NULL && *ntoks != 0);  /* caller gave a cap? */
        size_t capacity = limited ? *ntoks : 16;              /* slots, sentinel excluded */

        result = malloc( (capacity + 1) * sizeof(*result) );
        if ( result != NULL ) {
            for (;;) {
                char *piece = strsep( strp, delim );
                if ( piece == NULL ) {
                    break;                          /* source exhausted */
                }
                if ( !keepnulls && piece[0] == '\0' ) {
                    continue;                       /* drop empty token */
                }
                if ( count == capacity ) {
                    if ( limited ) {
                        break;                      /* requested cap reached */
                    }
                    /* no cap: double the array and carry on */
                    capacity *= 2;
                    char **grown = realloc( result, (capacity + 1) * sizeof(*grown) );
                    if ( grown == NULL ) {
                        free( result );
                        result = NULL;
                        count = 0;
                        break;
                    }
                    result = grown;
                }
                result[count++] = piece;
            }
            if ( result != NULL ) {
                result[count] = NULL;               /* NULL sentinel */
            }
            *strp = origin;                         /* hand back the start address */
        }
    }

    if ( ntoks != NULL ) {
        *ntoks = count;
    }
    return result;
}
str_toksarray_alloc2():
// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error.
// The 2 at the end of the name means 2-levels of allocation.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    /*
     * str       source string (left untouched)
     * delim     set of delimiter characters
     * ntoks     optional; non-zero on entry caps the number of tokens,
     *           on exit receives the number of tokens stored (0 on failure)
     * keepnulls when false, empty tokens are dropped
     *
     * Returns a malloc'ed, NULL-terminated array of malloc'ed token copies
     * (release it with toksarray_free2()), or NULL on error.
     */
    char **toksarr = NULL;
    size_t i = 0;               // # of actually parsed tokens

    // sanity checks
    if ( !str || !*str || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // make a copy of str to work with
    char *copy = strdup( str );
    if ( !copy ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }
    // strsep() advances the pointer it is given (and leaves it NULL once the
    // string is exhausted), so parse through a separate cursor and keep
    // `copy` intact: it is the only address that may be passed to free().
    char *cursor = copy;

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    size_t _ntoks = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( _ntoks == 0 ) {        // str_tokscount() failed
        goto fail;
    }
    // guard the size computation below against wrap-around
    if ( _ntoks > SIZE_MAX / sizeof(*toksarr) - 1 ) {
        goto fail;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    toksarr = malloc( (_ntoks+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto fail;
    }

    // Parse str tokens and duplicate them into the array
    char *tok;
    while ( i < _ntoks && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto fail;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL;          // NULL sentinel
    free( copy );               // release the local copy of the source-string
    if (ntoks) *ntoks = i;      // pass to caller the # of parsed tokens
    return toksarr;

    // cleanup before failing (toksarr may still be NULL and i may be 0)
fail:
    for (size_t idx=0; idx < i; idx++) {
        free( toksarr[idx] );
    }
    free( toksarr );
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}
str_tokscount() - helper function, used by str_toksarray_alloc2():
// ----------------------------------------
// Return the count of tokens present in a nul-terminated source-string (str),
// based on the delimiting chars contained in a 2nd nul-terminated string (delim).
// If the boolean argument is false, empty tokens are excluded.
//
// To stay consistent with the behavior of strsep(), the function returns 1 if
// delim is an empty string or none of its delimiters is found in str (in those
// cases the source-string is considered a single token).
// 0 is returned when str or delim are passed as NULL pointers, or when str is
// passed as an empty string.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    // nothing to count for NULL arguments or an empty source-string
    if ( str == NULL || *str == '\0' || delim == NULL ) {
        return 0;
    }

    size_t total = 1;       // a string without delimiters is still 1 token
    size_t empties = 0;     // tokens of zero length
    const char *cursor = str;

    // A token is empty when the char it starts at is a delimiter or the
    // terminating nul (strchr() also matches the terminator of delim).
    if ( strchr(delim, *cursor) != NULL ) {
        empties = 1;
    }

    const char *hit;
    while ( (hit = strpbrk(cursor, delim)) != NULL ) {
        cursor = hit + 1;   // the next token starts right after the delimiter
        total++;
        if ( strchr(delim, *cursor) != NULL ) {
            empties++;
        }
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}
toksarray_free2() - use it on the array returned by str_toksarray_alloc2():
// ----------------------------------------
// Free a dynamically allocated, NULL terminated, array of char-pointers
// with each such pointer pointing to its own dynamically allocated data.
// Return NULL, so the caller has the choice of assigning it back to the
// dangling pointer. The 2 at the end of the name means 2-levels of deallocation.
//
// NULL terminated array means ending with a NULL sentinel.
// e.g.: toksarr[0] = tok1, ..., toksarr[len] = NULL
//
char **toksarray_free2( char **toksarr )
{
    // Nothing to release for a NULL array.
    if ( toksarr == NULL ) {
        return NULL;
    }

    // Release every element up to (not including) the NULL sentinel,
    // then the array of pointers itself.
    for (size_t k = 0; toksarr[k] != NULL; k++) {
        free( toksarr[k] );
    }
    free( toksarr );

    // Always NULL, so callers can write: arr = toksarray_free2( arr );
    return NULL;
}
Both strtok() and strsep() modify the input string. We can write a function to split the string based on delimiters using strspn() and strpbrk().
Algorithm:
If the input string is not empty, go to step 2 else return null.
Skip separator, if any at the start of string, and record start position of word (using strspn() for this), call it start.
Find next separator position (or end of string if no more separator exists) from the current start found in previous step (using strpbrk() for this), call it end.
Allocate memory and copy string from start to end in that memory.
Return token.
Advantage:
Thread safe.
Handles multiple delimiters.
Portable.
Doesn't modify the input string, unlike strtok() and strsep().
Implementation:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
* alloc_str function allocates memory and copy substring
* to allocated memory.
*/
static char * alloc_str (const char * start, const char * end) {
    /* An absent or empty range [start, end) yields no string. */
    if (start == NULL || end == NULL || end <= start) {
        return NULL;
    }

    const size_t span = (size_t) (end - start);
    char * copy = malloc (span + 1);    /* +1 for the terminator */

    /* Out of memory is treated as fatal. */
    if (copy == NULL) {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    memcpy (copy, start, span);
    copy[span] = '\0';
    return copy;
}
/*
* str_split function returns the next token which is sequences of contiguous
* characters separated by any of the characters that are part of delimiters.
*
* Parameters:
* p_str : Address of pointer to the string that you want to split.
* sep : A set of characters that delimit the pieces in the string.
*
* Behaviour is undefined if sep is not a pointer to a null-terminated string.
*
* Return :
* Returns the pointer to dynamically allocated memory where the token is copied.
* If p_str is NULL or empty string, NULL is returned.
*/
char * str_split (char ** p_str, const char * sep) {
    /*
     * Returns the next token of *p_str as a freshly allocated string (the
     * caller frees it) and advances *p_str to just past that token.
     * Returns NULL when p_str or *p_str is NULL, when *p_str is empty, or
     * when only separator characters remain.
     */

    // The outer pointer is checked as well: the documented contract
    // promises NULL for a NULL p_str instead of dereferencing it.
    if (!p_str || !*p_str || !**p_str) {
        return NULL;
    }

    // skip separator
    *p_str += strspn (*p_str, sep);

    // find separator
    char * p_end = strpbrk (*p_str, sep);
    // strpbrk() returns null pointer if no such character
    // exists in the input string which is part of sep argument.
    if (!p_end) {
        p_end = *p_str + strlen (*p_str);
    }

    // alloc_str() yields NULL for an empty range (nothing left but separators)
    char * token = alloc_str (*p_str, p_end);
    *p_str = p_end;
    return token;
}
/*==================================================*/
/*==================================================*/
/*
* Just a helper function
*/
void token_helper (char * in_str, const char * delim) {
printf ("\nInput string : ");
if (in_str) printf ("\"%s\"\n", in_str);
else printf ("NULL\n");
if (delim) printf ("Delimiter : \"%s\"\n", delim);
char * ptr = in_str;
char * token = NULL;
printf ("Tokens:\n");
while ((token = str_split(&ptr, delim)) != NULL) {
printf ("-> %s\n", token);
/* You can assign this token to a pointer of an array of pointers
* and return that array of pointers from this function.
* Since, this is for demonstration purpose, I am
* freeing the allocated memory now.
*/
free (token);
}
}
/*
* Driver function
*/
int main (void) {
    /* Test cases as (input, delimiters) pairs; a NULL input is passed
     * through to token_helper() as-is.
     */
    static const struct {
        const char * text;
        const char * seps;
    } cases[] = {
        { "hello world!", " " },
        { " hello world,friend of mine!", " ," },
        { "Another string", "-!" },
        { " one more -- string !", "- !" },
        { "", " " },
        { NULL, "" },
        { "hi", " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp" },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC", "," },
    };

    char string[100];   /* writable scratch copy handed to token_helper() */

    for (size_t k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (cases[k].text == NULL) {
            token_helper (NULL, cases[k].seps);
            continue;
        }
        strcpy (string, cases[k].text);
        token_helper (string, cases[k].seps);
    }
    return 0;
}
Output:
# ./a.out
Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!
Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!
Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string
Input string : " one more -- string !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string
Input string : ""
Delimiter : " "
Tokens:
Input string : NULL
Delimiter : ""
Tokens:
Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi
Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give
-> a
-> a a
-> u
-> of
-> ro
-> er
-> offee in a
-> o
-> er
-> offee
-> u
-> .
Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC
My approach is to scan the string and let the result pointers point to the first character and to every character that follows a delimiter, while overwriting each occurrence of the delimiter in the string with '\0'.
First make a copy of the original string (since it's constant), then scan it to get the number of splits and pass that number back through the pointer parameter len. After that, point the first result pointer at the start of the copy, then scan the copy: whenever a delimiter is encountered, overwrite it with '\0' so that the previous result string is terminated, and point the next result pointer at the character that follows.
/*
 * Split a_str on every occurrence of a_delim without modifying a_str.
 *
 * Parameters:
 *   a_str   : nul-terminated string to split
 *   a_delim : the delimiter character
 *   len     : receives the number of substrings stored in the result
 *             (number of delimiters + 1), or 0 on failure
 *
 * Returns a malloc'ed array of *len pointers. They all point into one
 * private, malloc'ed copy of a_str whose start is results[0], so the
 * caller releases everything with: free(results[0]); free(results);
 * Returns NULL when a_str or len is NULL or when memory runs out.
 */
char** split(char* a_str, const char a_delim, int* len){
    if (a_str == NULL || len == NULL) {
        if (len != NULL) *len = 0;
        return NULL;
    }

    /* n delimiters always produce n + 1 substrings (possibly empty ones) */
    const size_t length = strlen(a_str);
    int pieces = 1;
    for (const char *scan = a_str; *scan != '\0'; scan++) {
        if (*scan == a_delim) pieces += 1;
    }

    /* +1: the copy needs room for the terminating '\0' as well */
    char *copy = malloc(length + 1);
    char **results = malloc((size_t) pieces * sizeof *results);
    if (copy == NULL || results == NULL) {
        free(copy);
        free(results);
        *len = 0;
        return NULL;
    }
    memcpy(copy, a_str, length + 1);

    /* cut the copy in place and record where each substring starts */
    int next = 1;
    results[0] = copy;
    for (char *s = copy; *s != '\0'; s++) {
        if (*s == a_delim) {
            *s = '\0';
            results[next++] = s + 1;
        }
    }

    *len = pieces;
    return results;
}
My code (tested):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split str in place with strtok() and collect the token pointers.
 *
 * Parameters:
 *   str    : writable, nul-terminated string; strtok() overwrites the
 *            delimiters it finds with '\0'
 *   delim  : set of delimiter characters
 *   array  : receives a malloc'ed array of pointers into str (NULL when
 *            there are no tokens); the caller frees the array only
 *   length : receives the number of tokens
 *
 * Returns 1 on success, 0 on invalid arguments or allocation failure (in
 * which case *array is NULL and *length is 0).
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
    if (array == NULL || length == NULL) {
        return 0;
    }
    *array = NULL;
    *length = 0;
    if (str == NULL || delim == NULL) {
        return 0;
    }

    char **res = NULL;
    int i = 0;
    for (char *token = strtok(str, delim); token != NULL; token = strtok(NULL, delim)) {
        /* grow through a temporary so the old block is not lost on failure */
        char **grown = realloc(res, ((size_t) i + 1) * sizeof *grown);
        if (grown == NULL) {
            free(res);
            return 0;
        }
        res = grown;
        res[i++] = token;
    }

    *array = res;
    *length = i;
    return 1;
}
/* Demo: split a comma-separated list of months and print each token. */
int main(void)
{
    char **arr = NULL;
    int count = 0;
    /* must be a writable array: dtmsplit() cuts the string in place */
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    if (!dtmsplit(str, ",", &arr, &count)) {
        fprintf(stderr, "dtmsplit failed\n");
        return 1;
    }

    printf("Found %d tokens.\n", count);
    for (int i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* the tokens live inside str; only the pointer array was allocated */
    free(arr);
    return(0);
}
Result:
Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC
Two issues surrounding this question are memory management and thread safety. As you can see from the numerous posts,
this isn't an easy task to accomplish seamlessly in C. I desired a solution that is:
Thread safe. (strtok is not thread safe)
Does not employ malloc or any of its derivatives (to avoid memory management issues)
Checks array bounds on the individual fields (to avoid segment faults on unknown data)
Works with multi-byte field separators (utf-8)
ignores extra fields in the input
provides soft error routine for invalid field lengths
The solution I came up with meets all of these criteria. It's probably a little more work to set up
than some other solutions posted here, but I think that in practice, the extra work is worth
it in order to avoid the common pitfalls of other solutions.
#include <stdio.h>
#include <string.h>
/* One destination field: a caller-owned buffer and its capacity. */
struct splitFieldType {
    char *field;    /* buffer that receives the nul-terminated field text */
    int maxLength;  /* size of that buffer in bytes, terminator included */
};
typedef struct splitFieldType splitField;

/* Copy len bytes of text into dest, truncating to what fits in the buffer
 * (and reporting the truncation through softError, if one was given). */
static void splitStoreField(const splitField *dest, int fieldNumber, const char *text, size_t len, void (*softError)(int fieldNumber,int expected,int actual)) {
    if (dest->field == NULL || dest->maxLength <= 0) {
        /* no room at all, not even for the terminator */
        if (softError) softError(fieldNumber, 0, (int)len);
        return;
    }
    size_t room = (size_t)dest->maxLength - 1;  /* keep one byte for '\0' */
    if (len > room) {
        if (softError) softError(fieldNumber, (int)room, (int)len);
        len = room;
    }
    memcpy(dest->field, text, len);
    dest->field[len] = '\0';
}

/*
 * Split input on every occurrence of fieldSeparator (a string, so it may be
 * longer than one byte) into at most `expected` fixed-size fields.
 *
 * Parameters:
 *   fields         : array of at least `expected` destination fields
 *   expected       : number of fields wanted; extra input fields are ignored
 *   input          : nul-terminated text to split (not modified)
 *   fieldSeparator : nul-terminated separator string
 *   softError      : called as softError(fieldIndex, maxBytesThatFit,
 *                    actualBytes) whenever a field has to be truncated;
 *                    may be NULL when no notification is wanted
 *
 * Every stored field is nul-terminated and never exceeds its buffer: a field
 * that is too long is truncated. This includes the last field of the input
 * (the one that is not followed by a separator).
 *
 * Returns the number of fields stored.
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual)) {
    int i;
    size_t fieldSeparatorLen = strlen(fieldSeparator);
    const char *tNext, *tLast = input;

    /* fields that are followed by a separator */
    for (i = 0; i < expected && (tNext = strstr(tLast, fieldSeparator)) != NULL; ++i) {
        splitStoreField(&fields[i], i, tLast, (size_t)(tNext - tLast), softError);
        tLast = tNext + fieldSeparatorLen;
    }

    /* the remainder of the input is the last field, if one is still wanted */
    if (i < expected) {
        splitStoreField(&fields[i], i, tLast, strlen(tLast), softError);
        return i + 1;
    }
    return i;
}
/* Soft-error callback for the month example: reports on stderr which
 * field (1-based) did not have the expected size. */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    const int humanIndex = fieldNumber + 1;     /* callers count from 0 */

    fprintf(stderr,
            "monthSplit: input field #%d is %d bytes, expected %d bytes\n",
            humanIndex, actual, expected);
}
/* Example: split a list of month names into twelve fixed 3-character fields. */
int main() {
    const char *fieldSeparator=",";
    /* "APRIL" is deliberately too long for its 4-byte field so that the
     * soft-error callback fires; FOO and BAR are extra fields that get
     * ignored. */
    const char *input="JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";
    /* destination storage: 3 characters + terminator per month */
    struct monthFieldsType {
        char field1[4];
        char field2[4];
        char field3[4];
        char field4[4];
        char field5[4];
        char field6[4];
        char field7[4];
        char field8[4];
        char field9[4];
        char field10[4];
        char field11[4];
        char field12[4];
    } monthFields;
    /* tell strsplit() where each field lives and how big it is */
    splitField inputFields[12] = {
        {monthFields.field1, sizeof(monthFields.field1)},
        {monthFields.field2, sizeof(monthFields.field2)},
        {monthFields.field3, sizeof(monthFields.field3)},
        {monthFields.field4, sizeof(monthFields.field4)},
        {monthFields.field5, sizeof(monthFields.field5)},
        {monthFields.field6, sizeof(monthFields.field6)},
        {monthFields.field7, sizeof(monthFields.field7)},
        {monthFields.field8, sizeof(monthFields.field8)},
        {monthFields.field9, sizeof(monthFields.field9)},
        {monthFields.field10, sizeof(monthFields.field10)},
        {monthFields.field11, sizeof(monthFields.field11)},
        {monthFields.field12, sizeof(monthFields.field12)}
    };
    int expected=sizeof(inputFields)/sizeof(splitField);
    printf("input data: %s\n", input);
    printf("expecting %d fields\n",expected);
    int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);
    if (ct!=expected) {
        printf("string split %d fields, expected %d\n", ct,expected);
    }
    for (int i=0;i<expected;++i) {
        printf("field %d: %s\n",i+1,inputFields[i].field);
    }
    printf("\n");
    /* the same data is reachable through the struct members directly */
    printf("Direct structure access, field 10: %s", monthFields.field10);
    return 0;
}
Below is an example compile and output. Note that in my example, I purposefully spelled out "APRIL" so that you can see how the soft error works.
$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC
Direct structure access, field 10: OCT
Enjoy!
Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.
The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.
As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".
The function below is commented in-line and a short main() was added to split the months. The initial number of pointers allocated was set to 2 to force three reallocations while tokenizing the input string:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NPTRS 2 /* initial number of pointers to allocate (must be > 0) */

/* Tokenize src on any character found in delim; runs of delimiters count
 * as a single one, so no empty tokens are produced.
 *
 * Returns an allocated array of allocated token copies, ending with a
 * sentinel NULL, or NULL when the initial block of pointers cannot be
 * allocated. The pointer block doubles whenever it is full. If a later
 * allocation fails, the tokens collected so far are returned.
 */
char **strsplit (const char *src, const char *delim)
{
    int used = 0, capacity = NPTRS;     /* tokens stored, pointers allocated */
    char **dest = malloc (capacity * sizeof *dest);

    if (dest == NULL) {
        perror ("malloc-dest");
        return NULL;
    }
    dest[0] = NULL;                     /* an empty result is just the sentinel */

    const char *cursor = src;
    for (;;) {
        cursor += strspn (cursor, delim);           /* step over delimiters */
        if (*cursor == '\0')                        /* nothing left to split */
            break;

        size_t len = strcspn (cursor, delim);       /* length of this token */

        /* always keep one free slot for the sentinel */
        if (used == capacity - 1) {
            void *tmp = realloc (dest, 2 * capacity * sizeof *dest);
            if (!tmp) {
                perror ("realloc-dest");
                break;                  /* dest is still valid and terminated */
            }
            dest = tmp;
            capacity *= 2;
        }

        /* a failed malloc leaves NULL in the slot, which is the sentinel */
        if (!(dest[used] = malloc (len + 1))) {
            perror ("malloc-dest[i]");
            break;
        }
        memcpy (dest[used], cursor, len);
        dest[used][len] = 0;
        used++;
        dest[used] = NULL;              /* move the sentinel along */

        cursor += len;
    }

    return dest;
}
int main (void) {
    /* ",,," in the middle shows that runs of delimiters are collapsed */
    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (str, ",");

    if (tokens) {
        /* walk up to the sentinel NULL, releasing each string once printed */
        for (size_t k = 0; tokens[k] != NULL; k++) {
            puts (tokens[k]);
            free (tokens[k]);
        }
        free (tokens);                  /* then the block of pointers */
    }
}
Example Use/Output
$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC
Let me know if you have any further questions.
#include <cstring>
#include <cstdio>
int main()
{
    // strtok() writes into the buffer, so it must be a modifiable array
    char buf[] = "This is Luke Skywalker here!";

    // The first call names the buffer; later calls pass nullptr to continue.
    char* tok = strtok( buf, " ");
    while( tok != nullptr) {
        puts( tok);
        tok = strtok( nullptr, " ");
    }
}
Outputs
This
is
Luke
Skywalker
here!
Came across this looking for a simple solution.
I am fascinated by all of the options but dissatisfied for my own use case/taste (which may be terrible).
I have created a somewhat unique solution that aims to clearly behave for its user, not re-allocate any memory, and be human readable + with comments.
Uploaded to gist.github here: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093
Example:
#include "./strutils.c"
// Describe the split: source and delimiter are provided by the caller.
struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";
// Allocates info.splitStrings and sets info.splitStringsCount.
str_split_begin(&info);
char * substr;
// Walk over the substrings produced by the split.
for (int i=0; i<info.splitStringsCount; i++) {
substr = info.splitStrings[i];
printf("substring: '%s'\n", substr);
}
// Releases what str_split_begin() allocated.
str_split_end(&info);
Output:
$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'
Full source of strutils.c
#ifndef STRUTILS_C
#define STRUTILS_C 1
#ifndef str
#define str char *
#endif
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
/* Input and output of one split operation (see str_split_begin). */
struct str_split_info {
/* The string to be split
* Provided by caller of str_split_begin function
*/
str source;
/* The string that cuts the source string; all occurrences of
* this string will be removed from the source string
* Provided by caller of str_split_begin function
*/
str delimiter;
/* Array of strings split by delimiter
* Provided and allocated by str_split_begin function
* Must be released with the str_split_end function
*/
str * splitStrings;
/* Array of string lengths split by delimiter
* Provided and allocated by str_split_begin function
* Must be released with the str_split_end function
*/
int * splitStringsLengths;
/* Number of strings split by delimiter contained in splitStrings
* Provided by str_split_begin function
*/
int splitStringsCount;
};
#define str_split_infop struct str_split_info *
/* Split a string by a delimiting string
*
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
void str_split_begin (str_split_infop info) {
  /*
   * Reads info->source and info->delimiter; fills info->splitStrings
   * (nul-terminated copies of the pieces), info->splitStringsLengths (the
   * length of each piece, terminator excluded) and info->splitStringsCount.
   *
   * Delimiters are matched left to right without overlapping. N matches
   * give N + 1 pieces, so leading, trailing and adjacent delimiters produce
   * empty pieces. An empty delimiter yields the whole source as one piece.
   *
   * If info, source or delimiter is NULL, or if memory runs out, the result
   * is left empty (count 0, both arrays NULL).
   */
  if (info == NULL) {
    return;
  }

  //start from a well-defined empty result
  info->splitStrings = NULL;
  info->splitStringsLengths = NULL;
  info->splitStringsCount = 0;
  if (info->source == NULL || info->delimiter == NULL) {
    return;
  }

  const char * source = info->source;
  const char * delimiter = info->delimiter;
  size_t delimiterLength = strlen(delimiter);

  //first pass, simply count occurrences so we can allocate only once
  //(strstr with an empty needle matches everywhere, so skip it then)
  int count = 1;
  if (delimiterLength > 0) {
    const char * hit = strstr(source, delimiter);
    while (hit != NULL) {
      count++;
      hit = strstr(hit + delimiterLength, delimiter);
    }
  }

  //allocate both arrays since we know the count
  char ** strings = calloc((size_t) count, sizeof *strings);
  int * lengths = calloc((size_t) count, sizeof *lengths);
  if (strings == NULL || lengths == NULL) {
    free(strings);
    free(lengths);
    return;
  }

  //second pass, copy each piece
  const char * begin = source;
  for (int index = 0; index < count; index++) {
    //every piece but the last ends where the next delimiter starts;
    //the last one runs to the end of the source
    const char * end = (index + 1 < count)
      ? strstr(begin, delimiter)
      : begin + strlen(begin);
    size_t length = (size_t) (end - begin);

    //+1 for trailing null for c-string
    char * piece = malloc(length + 1);
    if (piece == NULL) {
      //undo the pieces copied so far and report an empty result
      while (index > 0) {
        free(strings[--index]);
      }
      free(strings);
      free(lengths);
      return;
    }
    memcpy(piece, begin, length);
    piece[length] = '\0';

    strings[index] = piece;
    lengths[index] = (int) length;

    //next piece starts right after the delimiter we stopped at
    if (index + 1 < count) {
      begin = end + delimiterLength;
    }
  }

  info->splitStrings = strings;
  info->splitStringsLengths = lengths;
  info->splitStringsCount = count;
}
/* Return the splitStringsCount field of info, i.e. the number of
 * substrings recorded by the split.
 */
int str_split_count (str_split_infop info) {
return info->splitStringsCount;
}
/* Copy every split substring into the caller's buffers: out[i] receives a
 * strcpy of info->splitStrings[i], so each out[i] must already point to
 * enough writable space.
 */
void str_split_get (str_split_infop info, str * out) {
  int index = 0;
  while (index < info->splitStringsCount) {
    strcpy(out[index], info->splitStrings[index]);
    index++;
  }
}
/* Release everything recorded in info by a split and reset the count to 0.
 * Does nothing when there is no result to release.
 */
void str_split_end (str_split_infop info) {
  //nothing was split, so nothing to release
  if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
    return;
  }

  //each substring first
  int index = 0;
  while (index < info->splitStringsCount) {
    free(info->splitStrings[index]);
    index++;
  }

  //then the two arrays
  free(info->splitStrings);
  free(info->splitStringsLengths);

  info->splitStringsCount = 0;
}
/* Minimal walk-through of the API: begin, iterate, end. */
void str_split_test () {
  struct str_split_info info;

  info.source = "hello world this is a test";
  info.delimiter = " ";

  str_split_begin (&info);

  //visit each substring; they are no longer valid after str_split_end
  int index = 0;
  while (index < info.splitStringsCount) {
    // info.splitStrings[index];
    index++;
  }

  str_split_end(&info);
}
#endif
I tried to make a very simple one. I am also showing example in the main().
#include <stdio.h>
#include <string.h>
/* Cut inputArr in place with strtok() and store a pointer to each token
 * in outputArr, in order. outputArr must have room for every token; slots
 * after the last token are left untouched.
 */
void split(char* inputArr, char** outputArr, char* delim) {
    char **slot = outputArr;
    char *piece = strtok(inputArr, delim);

    while (piece != NULL) {
        *slot++ = piece;
        piece = strtok(NULL, delim);
    }
}
/* Read the text file named by argv[1] line by line and print the first
 * four space-separated words of every line.
 */
int main(int argc, char **argv){
    /* check for proper arguments */
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {
        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            /* without this check fgets() would be handed a NULL stream */
            perror(argv[1]);
            return 1;
        }
        /* read txt file and split into array like java split() */
        char buffer[100];
        while(fgets(buffer, sizeof buffer, myScriptFile) != NULL){
            /* fgets() keeps the line break; drop it so that it does not
             * end up inside the last word */
            buffer[strcspn(buffer, "\r\n")] = '\0';
            /* start every line with empty slots, so that a short line
             * cannot show words left over from the previous one */
            char *splitArr[100] = { 0 };
            split(buffer, splitArr, " ");
            for (int i = 0; i < 4; i++) {
                printf("Index %d String: %s\n", i,
                       splitArr[i] != NULL ? splitArr[i] : "(none)");
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}
Assume a .txt file has
Hello this is test
Hello2 this is test2
running it with a .txt file as a parameter would give
Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test
Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

Reading a string, getting token, returning char* to next token [duplicate]

How do I write a function to split and return an array for a string with delimiters in the C programming language?
char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');
You can use the strtok() function to split a string (and specify the delimiter to use). Note that strtok() will modify the string passed into it. If the original string is required elsewhere make a copy of it and pass the copy to strtok().
EDIT:
Example (note it does not handle consecutive delimiters, "JAN,,,FEB,MAR" for example):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/*
 * Split a_str on a_delim and return a NULL-terminated array of heap-allocated
 * token copies.
 *
 * a_str   - writable, NUL-terminated string; it is modified by strtok().
 * a_delim - single delimiter character.
 *
 * Empty tokens (leading, trailing or consecutive delimiters) are skipped, as
 * strtok() does, so "JAN,,FEB," yields {"JAN", "FEB", NULL} and "" yields
 * {NULL}.
 *
 * Returns NULL if a_str is NULL or an allocation fails. The caller frees each
 * element and then the array itself.
 */
char** str_split(char* a_str, const char a_delim)
{
    if (a_str == NULL) {
        return NULL;
    }

    char delim[2] = { a_delim, '\0' };

    /* Upper bound on slots: one token more than there are delimiters, plus
     * one for the terminating NULL. strtok() may yield fewer tokens. */
    size_t capacity = 2;
    for (const char *p = a_str; *p != '\0'; p++) {
        if (*p == a_delim) {
            capacity++;
        }
    }

    char **result = malloc(sizeof *result * capacity);
    if (result == NULL) {
        return NULL;
    }

    size_t idx = 0;
    for (char *token = strtok(a_str, delim); token != NULL; token = strtok(NULL, delim)) {
        size_t bytes = strlen(token) + 1;
        char *copy = malloc(bytes);
        if (copy == NULL) {
            /* Undo everything allocated so far. */
            while (idx > 0) {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        memcpy(copy, token, bytes);
        result[idx++] = copy;
    }
    result[idx] = NULL;   /* sentinel: marks the end of the list */

    return result;
}
/*
 * Demo: split a comma-separated list of month names with str_split(), print
 * every token and release each one, then the array.
 */
int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    printf("months=[%s]\n\n", months);

    char **tokens = str_split(months, ',');
    if (tokens == NULL) {
        return 0;
    }

    /* Walk the NULL-terminated list, freeing each token once printed. */
    for (char **cur = tokens; *cur != NULL; cur++) {
        printf("month=[%s]\n", *cur);
        free(*cur);
    }
    printf("\n");
    free(tokens);

    return 0;
}
Output:
$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]
month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]
I think strsep is still the best tool for this:
while ((token = strsep(&str, ","))) my_fn(token);
That is literally one line that splits a string.
The extra parentheses are a stylistic element to indicate that we're intentionally testing the result of an assignment, not an equality operator ==.
For that pattern to work, token and str both have type char *. If you started with a string literal, then you'd want to make a copy of it first:
// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;
tofree = str = strdup(my_str_literal); // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);
If two delimiters appear together in str, you'll get a token value that's the empty string. The value of str is modified in that each delimiter encountered is overwritten with a zero byte - another good reason to copy the string being parsed first.
In a comment, someone suggested that strtok is better than strsep because strtok is more portable. Ubuntu and Mac OS X have strsep; it's safe to guess that other unixy systems do as well. Windows lacks strsep, but it has strpbrk, which enables this short and sweet strsep replacement:
/*
 * Portable stand-in for strsep().
 *
 * Returns the token starting at *stringp, or NULL when *stringp is NULL.
 * The first character of the token that appears in delim is overwritten with
 * '\0' and *stringp is advanced just past it; if no delimiter is found,
 * *stringp becomes NULL so the next call returns NULL.
 * Adjacent delimiters therefore produce empty-string tokens.
 */
char *strsep(char **stringp, const char *delim) {
    char *begin = *stringp;
    if (begin == NULL) {
        return NULL;
    }

    char *hit = strpbrk(begin, delim);
    if (hit == NULL) {
        *stringp = NULL;        /* this was the last token */
    } else {
        *hit = '\0';            /* terminate the token in place */
        *stringp = hit + 1;     /* resume after the delimiter */
    }
    return begin;
}
Here is a good explanation of strsep vs strtok. The pros and cons may be judged subjectively; however, I think it's a telling sign that strsep was designed as a replacement for strtok.
String tokenizer this code should put you in the right direction.
/*
 * Demo: print every word of a sentence, splitting on spaces and commas.
 *
 * The same delimiter set is used for every strtok() call, and the
 * non-standard getch() is replaced with the standard getchar() to keep the
 * "wait for a key" pause portable.
 */
int main(void) {
    char st[] ="Where there is will, there is a way.";
    const char *separators = " ,";

    for (char *ch = strtok(st, separators); ch != NULL; ch = strtok(NULL, separators)) {
        printf("%s\n", ch);
    }
    getchar();   /* pause until the user presses Enter */
    return 0;
}
Method below will do all the job (memory allocation, counting the length) for you. More information and description can be found here - Implementation of Java String.split() method to split C string
/*
 * Split str on every occurrence of c, in the manner of Java's String.split().
 *
 * str - NUL-terminated source string; it is not modified.
 * c   - delimiter character.
 * arr - out: receives a newly allocated array of `count` heap-allocated,
 *       NUL-terminated tokens. Adjacent delimiters produce empty tokens.
 *
 * Returns the number of tokens (always >= 1). As in the original, the
 * process exits with status 1 if an allocation fails. The caller frees each
 * token and then the array.
 */
int split (const char *str, char c, char ***arr)
{
    /* One token more than there are delimiters. */
    int count = 1;
    for (const char *p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    *arr = malloc(sizeof **arr * (size_t) count);
    if (*arr == NULL)
        exit(1);

    const char *start = str;
    for (int i = 0; i < count; i++)
    {
        /* Measure the current token. */
        size_t len = 0;
        while (start[len] != '\0' && start[len] != c)
            len++;

        char *copy = malloc(len + 1);
        if (copy == NULL)
            exit(1);
        for (size_t k = 0; k < len; k++)
            copy[k] = start[k];
        copy[len] = '\0';   /* every token, including the last, is terminated */
        (*arr)[i] = copy;

        /* Step over the delimiter, but never past the end of the string. */
        start += len;
        if (*start != '\0')
            start++;
    }
    return count;
}
How to use it:
/*
 * Demo: split a sentence on spaces, print the tokens, then release them.
 */
int main (int argc, char ** argv)
{
    (void) argc;
    (void) argv;

    int i;
    const char *s = "Hello, this is a test module for the string splitting.";
    int c = 0;
    char **arr = NULL;

    c = split(s, ' ', &arr);
    printf("found %d tokens.\n", c);
    for (i = 0; i < c; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* split() hands ownership of every token and of the array to the caller. */
    for (i = 0; i < c; i++)
        free(arr[i]);
    free(arr);
    return 0;
}
Here is my two cents:
/*
 * Split txt on every occurrence of delim.
 *
 * txt    - NUL-terminated source string; it is not modified.
 * delim  - delimiter character.
 * tokens - out: receives a newly allocated array of heap-allocated,
 *          NUL-terminated token copies (empty tokens are kept). Set to NULL
 *          on failure.
 *
 * Returns the number of tokens (>= 1), or -1 if an allocation fails.
 * The caller frees each token and then the array.
 */
int split (const char *txt, char delim, char ***tokens)
{
    int count = 1;
    for (const char *p = txt; *p != '\0'; p++)
        if (*p == delim) count += 1;

    char **arr = malloc ((size_t) count * sizeof *arr);
    if (arr == NULL)
    {
        *tokens = NULL;
        return -1;
    }

    const char *start = txt;
    for (int i = 0; i < count; i++)
    {
        /* Measure the token, then allocate exactly len + 1 chars. */
        size_t len = 0;
        while (start[len] != '\0' && start[len] != delim) len++;

        arr[i] = calloc (len + 1, sizeof (char));
        if (arr[i] == NULL)
        {
            while (i > 0) free (arr[--i]);
            free (arr);
            *tokens = NULL;
            return -1;
        }
        for (size_t k = 0; k < len; k++) arr[i][k] = start[k];

        /* Step over the delimiter, but never past the terminator. */
        start += len;
        if (*start != '\0') start++;
    }

    *tokens = arr;
    return count;
}
Usage:
/* Usage sketch: split a comma-separated list, print it, then release it. */
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
/* split() stores the token array in `tokens` and returns how many it made */
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
/* freeing tokens: each element first, then the array that held them */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
In the above example, there would be a way to return an array of null terminated strings (like you want) in place in the string. It would not make it possible to pass a literal string though, as it would have to be modified by the function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Split str in place on delim.
 *
 * str       - writable, NUL-terminated string; each delimiter is overwritten
 *             with '\0' and the returned pointers point into this buffer.
 * delim     - delimiter character; '\0' is rejected.
 * numSplits - optional out: number of tokens, or -1 on error.
 *
 * Returns a malloc'ed array of numSplits pointers followed by a NULL
 * sentinel, or NULL when str is NULL, delim is '\0' or the allocation fails.
 * The caller frees only the returned array, not the individual tokens.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;

    if ( ( str != NULL ) && ( delim != '\0' ) )
    {
        /* Pre-calculate number of elements: one more than the delimiters.
         * A for loop (not do/while) so the empty string is never read past
         * its terminator. */
        int tokens = 1;
        for ( const char* c = str; *c != '\0'; c++ )
        {
            if ( *c == delim )
            {
                tokens++;
            }
        }

        /* +1 so the NULL sentinel has its own slot. */
        ret = malloc( ( (size_t)tokens + 1 ) * sizeof( *ret ) );
        if ( ret != NULL )
        {
            retLen = 0;
            ret[retLen++] = str;
            for ( char* c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    *c = '\0';
                    ret[retLen++] = &c[1];
                }
            }
            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }
    return ret;
}
/*
 * Demo: copy a string literal into writable memory, split it in place with
 * str_split() and print the pieces.
 */
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char* strCpy;
    char** split;
    int num;
    int i;

    (void)argc;
    (void)argv;

    /* +1 for the terminating '\0' that strcpy() also writes. */
    strCpy = malloc( ( strlen( str ) + 1 ) * sizeof( *strCpy ) );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );
    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );
        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    /* The tokens live inside strCpy, so only these two blocks are freed. */
    free( split );
    free( strCpy );
    return 0;
}
There is probably a neater way to do it, but you get the idea.
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/**
 * Splits str on delim in place and dynamically allocates an array of
 * pointers to the pieces.
 *
 * Returns 0 on success and -1 on allocation failure (check errno). The
 * number of entries is stored in *length and the array in *array:
 *   - ""         -> 0 entries
 *   - "foo"      -> 1 entry  { "foo" }
 *   - "foo,,bar" -> 3 entries { "foo", NULL, "bar" }
 *   - "foo,"     -> 2 entries { "foo", NULL }
 * An empty piece is represented by a NULL entry since it is valueless, so
 * callers must check each entry before using it.
 *
 * Modifies str (every delimiter becomes '\0'), so make a copy if this is a
 * problem. The entries point into str; the caller frees only the array.
 */
int split( char * str, char delim, char ***array, int *length ) {
    /* Remember the real end before NULs are written into the string. */
    char *end = str + strlen(str);
    char **res;
    int count = 0;
    int k;

    if( end != str ) {
        /* A non-empty string has one more piece than it has delimiters. */
        count = 1;
        for( char *p = str; p < end; p++ ) {
            if( *p == delim ) {
                *p = 0;     /* Null terminate at the delimiter. */
                count++;
            }
        }
    }

    /* Allocate at least one slot: calloc(…, 0) may legitimately return NULL. */
    res = calloc( count > 0 ? (size_t)count : 1, sizeof *res );
    if( !res ) return -1;

    char *p = str;
    for( k=0; k<count; k++ ){
        if( *p ) res[k] = p;    /* Non-empty piece; empty ones stay NULL */
        p += strlen(p) + 1;     /* Start of the next piece */
    }

    *array = res;
    *length = count;
    return 0;
}
/* Sample input; note the trailing delimiter. */
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";

/*
 * Demo: split the month list in place and print every entry.
 * Returns 1 if split() reports an error, 0 otherwise.
 */
int main() {
    char **res = NULL;   /* stays NULL if split() fails, so free() is safe */
    int count = 0;
    int rc;

    rc = split( str, ',', &res, &count );
    if( rc ) {
        printf("Error: %s errno: %d \n", strerror(errno), errno);
        return 1;
    }
    printf("count: %d\n", count );
    for( int k=0; k<count; k++ ) {
        /* Empty pieces are stored as NULL; never hand NULL to %s. */
        printf("str: %s\n", res[k] ? res[k] : "(null)");
    }
    free(res );
    return 0;
}
I think the following solution is ideal:
Doesn't destroy the source string
Re-entrant - i.e., you can safely call it from anywhere in one or more threads
Portable
Handles multiple separators correctly
Fast and efficient
Explanation of the code:
Define a structure token to store the address and lengths of the tokens
Allocate enough memory for these in the worst case, which is when
str is made up entirely of separators so there are strlen(str) + 1
tokens, all of them empty strings
Scan str recording the address and length of every token
Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
Allocate, copy, and add the tokens using the start and length
information - use memcpy as it's faster than strcpy and we know
the lengths
Free the token address and length array
Return the array of tokens
/* Location of one token inside the source string. */
typedef struct {
    const char *start;   /* first character of the token */
    size_t len;          /* number of characters, terminator excluded */
} token;

/*
 * Split str on every occurrence of sep without modifying str.
 *
 * Returns a malloc'ed, NULL-terminated array of heap-allocated token copies;
 * adjacent separators yield empty strings, so "a,,b" gives
 * {"a", "", "b", NULL} and "" gives {"", NULL}.
 * Returns NULL if any allocation fails (nothing is leaked in that case).
 * The caller frees each token and then the array.
 */
char **split(const char *str, char sep)
{
    size_t start = 0, stop, toks = 0, t;

    /* Worst case: every character is a separator -> strlen(str) + 1 tokens. */
    token *tokens = malloc((strlen(str) + 1) * sizeof *tokens);
    if (tokens == NULL) {
        return NULL;
    }

    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;

    char **array = malloc((toks + 1) * sizeof *array);
    if (array == NULL) {
        free(tokens);
        return NULL;
    }

    for (t = 0; t < toks; t++) {
        /* Calloc makes it nul-terminated */
        char *copy = calloc(tokens[t].len + 1, 1);
        if (copy == NULL) {
            while (t > 0) {
                free(array[--t]);
            }
            free(array);
            free(tokens);
            return NULL;
        }
        memcpy(copy, tokens[t].start, tokens[t].len);
        array[t] = copy;
    }
    /* Add a sentinel */
    array[toks] = NULL;

    free(tokens);
    return array;
}
Note malloc checking omitted for brevity.
In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.
My version:
/*
 * Split str on runs of delimeter and store heap-allocated copies of the
 * pieces in a newly allocated array written to *args.
 *
 * Leading, trailing and repeated delimiters are skipped, so they never
 * produce empty pieces. The source string is not modified.
 * Returns the number of pieces (at least 1: an input with no pieces yields a
 * single empty string). The caller frees each piece and then the array.
 */
int split(char* str, const char delimeter, char*** args) {
    /* Skip leading delimiters. */
    char *cursor = str;
    while (*cursor == delimeter) cursor++;

    /* Count pieces: a delimiter followed by a regular character starts one. */
    int cnt = 1;
    for (char *scan = cursor; *scan != 0; ) {
        scan++;
        if (*scan == delimeter && scan[1] != delimeter && scan[1] != 0) cnt++;
    }

    char **out = malloc(sizeof(char*) * cnt);
    *args = out;

    for (int i = 0; i < cnt; i++) {
        char *begin = cursor;
        while (*cursor != delimeter && *cursor != 0) cursor++;

        int n = (int)(cursor - begin);          /* characters in this piece */
        out[i] = malloc(sizeof(char) * (n + 1));
        memcpy(out[i], begin, sizeof(char) * n);
        out[i][n] = 0;

        /* Skip the delimiter run before the next piece. */
        while (*cursor == delimeter) cursor++;
    }
    return cnt;
}
This function takes a char* string and splits it by the delimiter. There can be multiple delimiters in a row. Note that the function modifies the original string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls, so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the C++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to a single pass over the string (O(N)), whereas the memory-optimized way shown below makes two passes (still O(N), but roughly twice the work).
The function:
// Split src_str in place on deliminator.
//
// src_str      - writable, NUL-terminated string; every delimiter is
//                overwritten with '\0' and the returned pointers point into it.
// deliminator  - delimiter character; runs of it never produce empty pieces.
// num_sub_str  - out: number of non-empty sub strings found (0 on failure).
//
// Returns a malloc'ed array of num_sub_str pointers followed by a NULL
// sentinel, or 0 when there are no sub strings or the allocation fails.
// The caller frees only the returned array.
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
    //replace deliminator's with zeros and count how many
    //sub strings with length >= 1 exist
    num_sub_str = 0;
    char *src_str_tmp = src_str;
    bool found_delim = true;
    while(*src_str_tmp){
        if(*src_str_tmp == deliminator){
            *src_str_tmp = 0;
            found_delim = true;
        }
        else if(found_delim){ //found first character of a new string
            num_sub_str++;
            found_delim = false;
        }
        src_str_tmp++;
    }
    // %zu is the matching conversion for size_t
    printf("Start - found %zu sub strings\n", num_sub_str);
    if(num_sub_str == 0){
        printf("str_split() - no substrings were found\n");
        return(0);
    }

    // One extra *pointer* (not one extra byte) for the NULL sentinel.
    char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
    if(!sub_strings){
        num_sub_str = 0;
        return(0);
    }

    // Second pass: record the first character after every run of NULs.
    const char *src_str_terminator = src_str_tmp;
    bool found_null = true;
    size_t idx = 0;
    for(src_str_tmp = src_str; src_str_tmp < src_str_terminator; src_str_tmp++){
        if(!*src_str_tmp) //found a NULL
            found_null = true;
        else if(found_null){
            sub_strings[idx++] = src_str_tmp;
            found_null = false;
        }
    }
    sub_strings[num_sub_str] = NULL;
    return(sub_strings);
}
How to use it:
// Usage sketch for str_split(): work on a heap copy because the function
// overwrites the delimiters in the string it is given.
char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
char *str = strdup(months);
size_t num_sub_str;
// num_sub_str is passed by reference and receives the number of pieces
char **sub_strings = str_split(str, ',', num_sub_str);
// NOTE(review): endptr is not used anywhere in this snippet
char *endptr;
if(sub_strings){
// the array ends with a NULL entry, so iterate until it is reached
for(int i = 0; sub_strings[i]; i++)
printf("[%s]\n", sub_strings[i]);
}
// the pieces point into str, so only the array and the copy are freed
free(sub_strings);
free(str);
This optimized method create (or update an existing) array of pointers in *result and returns the number of elements in *count.
Use "max" to indicate the maximum number of strings you expect (when you pass in an existing array, or for any other reason); otherwise set it to 0.
To compare against a list of delimiters, define delim as a char* and replace the line:
if (str[i]==delim) {
with the two following lines:
char *c=delim; while(*c && *c!=str[i]) c++;
if (*c) {
Enjoy
#include <stdlib.h>
#include <string.h>
/*
 * Split str in place on delim, creating or filling an array of pointers.
 *
 * str    - writable buffer; each processed delimiter is overwritten with 0.
 * len    - number of characters of str to scan.
 * delim  - delimiter character.
 * result - in/out: if *result is non-NULL it is filled directly and must be
 *          large enough (max entries when max != 0); otherwise a new array
 *          is malloc'ed and stored in *result.
 * count  - out: number of strings produced (always >= 1).
 * max    - maximum number of strings to produce, 0 for no limit. Once the
 *          limit is reached the rest of the input stays in the last string.
 *
 * Returns *result, or NULL if the array could not be allocated.
 * The entries point into str; only a malloc'ed array needs to be freed.
 */
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
    size_t i;
    char **_result;
    // there is at least one string returned
    *count=1;
    _result= *result;
    // when the result array is specified, fill it during the first pass
    if (_result) {
        _result[0]=str;
    }
    // scan the string for delimiter, up to specified length
    for (i=0; i<len; ++i) {
        // to compare against a list of delimiters,
        // define delim as a string and replace
        // the next line:
        //     if (str[i]==delim) {
        //
        // with the two following lines:
        //     char *c=delim; while(*c && *c!=str[i]) c++;
        //     if (*c) {
        //
        if (str[i]==delim) {
            // if max is specified and already reached, dont go further.
            // Checked BEFORE touching the array so that max==1 is honoured
            // and a max-sized caller array is never overrun.
            if (max && *count>=max) {
                break;
            }
            // replace delimiter with zero
            str[i]=0;
            // when result array is specified, fill it during the first pass
            if (_result) {
                _result[*count]=str+i+1;
            }
            // increment count for each separator found
            ++(*count);
        }
    }
    // when result array is specified, we are done here
    if (_result) {
        return _result;
    }
    // else allocate memory for result
    // and fill the result array
    *result=malloc((*count)*sizeof(char*));
    if (!*result) {
        return NULL;
    }
    _result=*result;
    // add first string to result
    _result[0]=str;
    // if theres more strings
    for (i=1; i<*count; ++i) {
        // find next string: skip to the zero written above, then past it
        while(*str) ++str;
        ++str;
        // add next string to result
        _result[i]=str;
    }
    return _result;
}
Usage example:
#include <stdio.h>
/*
 * Demo of split(): once with a caller-supplied array limited to 6 entries,
 * once letting split() allocate the array with no limit.
 * Returns 1 if the working buffers cannot be allocated, 0 otherwise.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* split() modifies its input, so each call gets its own copy; the copies
     * are kept so that they can be freed afterwards. */
    char *copy=strdup(str);
    char *copy2=strdup(str);
    char **result=malloc(6*sizeof(char*));
    char **result2=0;
    unsigned long count=0;
    unsigned long count2=0;
    unsigned long i;

    if (!copy || !copy2 || !result) {
        free(copy);
        free(copy2);
        free(result);
        return 1;
    }

    split(copy,strlen(str),',',&result,&count,6);
    split(copy2,strlen(str),',',&result2,&count2,0);
    if (result)
        for (i=0; i<count; ++i) {
            printf("%s\n",result[i]);
        }
    printf("\n");
    if (result2)
        for (i=0; i<count2; ++i) {
            printf("%s\n", result2[i]);
        }

    /* The entries point into copy/copy2; free arrays and buffers only. */
    free(result);
    free(result2);
    free(copy);
    free(copy2);
    return 0;
}
Below is my strtok() implementation from zString library.
zstring_strtok() differs from standard library's strtok() in the way it treats consecutive delimiters.
Just have a look at the code below,sure that you will get an idea about how it works (I tried to use as many comments as I could)
/*
 * strtok()-like tokenizer that reports consecutive delimiters.
 *
 * str   - string to tokenize on the first call, NULL to continue with the
 *         position saved by the previous call. Must be writable.
 * delim - delimiter string; only its first character is compared.
 *
 * Returns:
 *   - NULL when delim is NULL or there is nothing left to scan;
 *   - delim itself when the current position starts with the delimiter;
 *   - otherwise the current token, terminated in place at the delimiter.
 * The scan position is kept in a function-local static pointer.
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *resume = 0;    /* where the next call continues */

    if (delim == 0)
        return 0;
    if (str == 0) {
        if (resume == 0)
            return 0;
        str = resume;
    }

    /* Find the first delimiter, or the end of the string. */
    char *hit = str;
    while (*hit != '\0' && *hit != delim[0])
        hit++;

    /* No delimiter left: hand back the remainder and stop. */
    if (*hit == '\0') {
        resume = 0;
        return str;
    }

    /* Delimiter right at the start: report it as a token of its own. */
    if (hit == str) {
        resume = str + 1;
        return (char *)delim;
    }

    /* Terminate the token in place and remember what follows it. */
    *hit = '\0';
    resume = hit + 1;
    return str;
}
Below is an example usage...
Example Usage
/* The buffer is named str; the first call must pass it (not an undeclared s). */
char str[] = "A,B,,,C";
printf("1 %s\n",zstring_strtok(str,","));
printf("2 %s\n",zstring_strtok(NULL,","));
printf("3 %s\n",zstring_strtok(NULL,","));
printf("4 %s\n",zstring_strtok(NULL,","));
printf("5 %s\n",zstring_strtok(NULL,","));
/* The sixth call returns NULL; printing it as "(null)" relies on the C library. */
printf("6 %s\n",zstring_strtok(NULL,","));
Example Output
1 A
2 B
3 ,
4 ,
5 C
6 (null)
The library can be downloaded from Github
https://github.com/fnoyanisi/zString
This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.
This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)
/*
 * Split original on every non-overlapping occurrence of the (possibly
 * multi-character) delimiter, scanning left to right.
 *
 * original      - NUL-terminated source string; it is not modified.
 * delimiter     - NUL-terminated delimiter; an empty delimiter never matches,
 *                 so the whole input is returned as one string.
 * buffer        - out: malloc'ed array of *numStrings heap-allocated,
 *                 NUL-terminated pieces.
 * numStrings    - out: number of pieces.
 * stringLengths - out: malloc'ed array with the length of every piece.
 *
 * A delimiter at the very start or end yields an empty piece there.
 * If the delimiter is longer than original, or an allocation fails, *buffer
 * and *stringLengths are set to NULL and *numStrings to 0.
 * The caller frees every piece, then *buffer and *stringLengths.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);

    *buffer = (void *)0;
    *numStrings = 0;
    *stringLengths = (void *)0;
    if(ld > lo){
        return;
    }

    /* Pass 1: count the pieces (one more than there are delimiters). */
    int count = 1;
    if(ld > 0){
        for(const char *hit = strstr(original, delimiter); hit != NULL; hit = strstr(hit + ld, delimiter)){
            count++;
        }
    }

    char **parts = malloc(sizeof *parts * (size_t)count);
    int *lengths = malloc(sizeof *lengths * (size_t)count);
    if(parts == NULL || lengths == NULL){
        free(parts);
        free(lengths);
        return;
    }

    /* Pass 2: copy each piece into its own buffer. */
    const char *start = original;
    for(int i = 0;i < count;i++){
        const char *end = (ld > 0) ? strstr(start, delimiter) : NULL;
        if(end == NULL){
            end = original + lo;    /* last piece runs to the end */
        }
        const size_t len = (size_t)(end - start);

        parts[i] = malloc(len + 1);
        if(parts[i] == NULL){
            while(i > 0){
                free(parts[--i]);
            }
            free(parts);
            free(lengths);
            return;
        }
        memcpy(parts[i], start, len);
        parts[i][len] = 0;
        lengths[i] = (int)len;

        /* Only step over a delimiter that is really there. */
        if(i + 1 < count){
            start = end + ld;
        }
    }

    *buffer = parts;
    *numStrings = count;
    *stringLengths = lengths;
}
Sample code:
/*
 * Demo: split a string on the multi-character delimiter " DELIM ", print
 * the pieces and release everything splitString() allocated.
 */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer = NULL;
    int numStrings = 0;
    int * stringLengths = NULL;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);
    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
        free(buffer[i]);    /* each piece is its own allocation */
    }
    free(buffer);
    free(stringLengths);
    return 0;
}
Libraries:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
Try use this.
/* Heap-allocate a copy of s; returns NULL if the allocation fails. */
static char *strsplit_copy(const char *s){
    size_t bytes = strlen(s) + 1;
    char *copy = malloc(bytes);
    if (copy != NULL) {
        memcpy(copy, s, bytes);
    }
    return copy;
}

/*
 * Split str on any of the characters in delim without modifying str.
 *
 * Returns a malloc'ed, NULL-terminated array of heap-allocated tokens. Empty
 * tokens are skipped, as strtok() does, so "" yields {NULL}.
 * Returns NULL if an allocation fails (nothing is leaked in that case).
 * The caller frees each token and then the array.
 */
char** strsplit(char* str, const char* delim){
    char** res = NULL;
    size_t n = 0;

    /* strtok() writes into its input, so tokenize a private copy. */
    char* aux = strsplit_copy(str);
    if (aux == NULL) {
        return NULL;
    }

    char* part = strtok(aux, delim);
    for (;;) {
        /* Grow by one slot: room for this token or for the final NULL.
         * Keep the old pointer until realloc() is known to have succeeded. */
        char** grown = realloc(res, (n + 1) * sizeof *grown);
        if (grown == NULL) {
            goto fail;
        }
        res = grown;

        if (part == NULL) {
            res[n] = NULL;      /* sentinel */
            break;
        }
        res[n] = strsplit_copy(part);
        if (res[n] == NULL) {
            goto fail;
        }
        n++;
        part = strtok(NULL, delim);
    }

    free(aux);
    return res;

fail:
    while (n > 0) {
        free(res[--n]);
    }
    free(res);
    free(aux);
    return NULL;
}
Explode & implode - initial string remains intact, dynamic memory allocation
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* One token of an exploded string: it refers to the source, it is not a copy. */
typedef struct
{
    uintptr_t ptr;   /* address of the token's first character */
    int size;        /* number of characters in the token */
} token_t;

/*
 * Locate the tokens of str that are separated by the first character of
 * delimiter. str is not modified; the tokens refer to its memory.
 *
 * str       - source string.
 * slen      - length of str, terminator excluded.
 * delimiter - only its first character is used.
 * tokens    - out: calloc'ed array of token descriptors (NULL on failure).
 *
 * Returns the number of tokens, or -1 when str contains no delimiter or the
 * allocation fails. The caller frees *tokens.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i = 0, c1 = 0, c2 = 0;

    *tokens = NULL;

    for(i = 0; i <= slen; i++)
    {
        if(str[i] == *delimiter)
        {
            c1++;
        }
    }
    if(c1 == 0)
    {
        return -1;
    }

    *tokens = (token_t*)calloc((size_t)c1 + 1, sizeof(token_t));
    if(*tokens == NULL)
    {
        return -1;
    }

    ((*tokens)[c2]).ptr = (uintptr_t)str;
    for(i = 0; i <= slen; i++)
    {
        if((str[i] == *delimiter) || (i == slen))
        {
            /* Close the current token at this position ... */
            ((*tokens)[c2]).size = (int)((uintptr_t)&(str[i]) - (uintptr_t)(((*tokens)[c2]).ptr));
            if(i < slen)
            {
                /* ... and start the next one after the delimiter. */
                c2++;
                ((*tokens)[c2]).ptr = (uintptr_t)&(str[i + 1]);
            }
        }
    }
    return (c1 + 1);
}

/*
 * Join `size` tokens into a newly allocated string, separated by the first
 * character of delimiter.
 *
 * Returns the joined string (an empty string when size <= 0), or NULL if the
 * allocation fails. The caller frees the result.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    int i;
    size_t total = 0;
    size_t len = 0;
    char *str;

    /* Each token needs its characters plus one byte: a separator, or the
     * terminator for the last one. */
    for(i = 0; i < size; i++)
    {
        total += (size_t)tokens[i].size + 1;
    }
    if(total == 0)
    {
        return (char*)calloc(1, sizeof(char));
    }

    str = (char*)calloc(total, sizeof(char));
    if(str == NULL)
    {
        return NULL;
    }

    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[len], (void*)tokens[i].ptr, (size_t)tokens[i].size);
        len += (size_t)tokens[i].size;
        str[(len++)] = *delimiter;
    }
    str[len - 1] = '\0';   /* the last separator becomes the terminator */
    return str;
}
Usage:
/*
 * Demo: explode a month list into tokens, implode it back into one string
 * and print both the joined string and every token.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    int i, c;
    char *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* Initialised so the free() calls below are safe when explode() fails. */
    token_t *tokens = NULL;
    char *imp = NULL;

    printf("%s\n", exp);
    if((c = explode(exp, strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }
        for(i = 0; i < c; i++)
        {
            /* Tokens are not NUL-terminated, so print with an explicit length. */
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }
    free((void*)tokens);
    free((void*)imp);
    return 0;
}
If you are willing to use an external library, I can't recommend bstrlib enough. It takes a little extra setup, but is easier to use in the long run.
For example, split the string below, one first creates a bstring with the bfromcstr() call. (A bstring is a wrapper around a char buffer).
Next, split the string on commas, saving the result in a struct bstrList, which has fields qty and an array entry, which is an array of bstrings.
bstrlib has many other functions to operate on bstrings
Easy as pie...
#include "bstrlib.h"
#include <stdio.h>
/* Demo: split a comma-separated C string with bstrlib and print each piece. */
int main() {
int i;
char *tmp = "Hello,World,sak";
/* wrap the C string in a bstring */
bstring bstr = bfromcstr(tmp);
/* split on ','; the list exposes the piece count (qty) and the pieces (entry) */
struct bstrList *blist = bsplit(bstr, ',');
printf("num %d\n", blist->qty);
for(i=0;i<blist->qty;i++) {
/* NOTE(review): the result of bstr2cstr() is not kept here - confirm in the
 * bstrlib docs whether it must be released by the caller */
printf("%d: %s\n", i, bstr2cstr(blist->entry[i], '_'));
}
/* NOTE(review): bstr and blist are never released in this snippet - confirm
 * which bstrlib calls are needed for that */
}
Late to the party I know, but here's 2 more functions to play with and probably further adjust to your needs (source code at the bottom of the post)
See also the Implementation Notes, further below, to decide which function suits your needs better.
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h> // C99
// Tokenize destructively: the returned pointers refer to tokens inside the
// (modified) source string itself. The returned array ends with a NULL entry.
char **str_toksarray_alloc(
char **strp, /* InOut: pointer to the source non-constant c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
bool keepnulls /* false ignores empty tokens, true includes them */
);
// Tokenize non-destructively: the source string is left untouched and the
// returned pointers refer to dynamically allocated copies of the tokens.
char **str_toksarray_alloc2(
const char *str, /* the source c-string */
const char *delim,
size_t *ntoks,
bool keepnulls
);
Usage Notes
Their prototypes are almost identical, except for the source-string (strp and str, respectively).
strp (pointer to string) is the address of an already allocated, non-constant c-string, to be tokenized in-place. str is a c-string which is not altered (it can even be a string-literal). By c-string I mean a nul-terminated buffer of chars. The rest of the arguments are the same for both functions.
To parse all available tokens, mute ntoks (meaning set it to 0 before passing it to any of the functions or pass it as a NULL pointer). Else the functions parse up to *ntoks tokens, or until there are no more tokens (whichever comes first). In any case, when ntoks is non-NULL it gets updated with the count of successfully parsed tokens.
Note also that a non-muted ntoks determines how many pointers will be allocated. Thus if the source string contains say 10 tokens and we set ntoks to 1000, we'll end up with 990 needlessly allocated pointers. On the other hand, if the source-string contains say 1000 tokens but we only need the first 10, setting ntoks to 10 sounds like a much wiser choice.
Both functions allocate and return an array of char-pointers, but str_toksarray_alloc() makes them point to the tokens in the modified source-string itself, while str_toksarray_alloc2() makes them point to dynamically allocated copies of the tokens (that 2 at the end of its name indicates the 2-levels of allocation).
The returned array is appended with a NULL sentinel pointer, which is not taken into account in the passed-back value of ntoks (put otherwise, when non-NULL, ntoks passes-back to the caller the length of the returned array, not its 1st level size).
When keepnulls is set to true, the resulting tokens are similar to what we'd expect from the strsep() function. Mostly meaning that consecutive delimiters in the source-string produce empty tokens (nulls), and if delim is an empty c-string or none of its contained delimiter-chars were found in the source string, the result is just 1 token: the source string. Contrary to strsep(), empty tokens can be ignored by setting keepnulls to false.
Failed calls of the functions can be identified by checking their return value against NULL, or by checking the passed-back value of ntoks against 0 (provided ntoks was non-NULL). I suggest always checking against failure before attempting to access the returned array, because the functions include sanity checks which can postpone otherwise immediate crashes (for example, passing a NULL pointer as the source string).
On success, the caller should free the array when they're done with it.
For str_toksarray_alloc(), a simple free() is enough. For str_toksarray_alloc2() a loop is involved, due to the 2nd level of allocation. The NULL sentinel (or the passed-back value of a non-NULL ntoks) makes this trivial, but I'm also providing a toksarray_free2() function below, for all the lazy bees out there :)
Simplified examples using both functions follow.
Prep:
/* Shared setup for the two examples that follow. */
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,"; /* either ';' or ',' separates tokens */
bool keepnulls = true; /* keep empty tokens */
size_t ntoks = 0; /* 0 means: parse all available tokens */
str_toksarray_alloc():
// destructive (use copy of src)
char *scopy = strdup( src );
if (!scopy) { ... }; // handle strdup failure
printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
// ntoks is a size_t, so %zu (not %lu) is the matching conversion
printf( "%zu tokens read\n", ntoks );
if ( arrtoks ) {
for (int i=0; arrtoks[i]; i++) {
printf( "%d: %s\n", i, arrtoks[i] );
}
}
// the tokens point into scopy, so free the copy and the pointer array only
free( scopy );
free( arrtoks );
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
*/
str_toksarray_alloc2():
// non-destructive
keepnulls = false; // reject empty tokens
printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
// ntoks is a size_t, so %zu (not %lu) is the matching conversion
printf( "%zu tokens read\n", ntoks );
if ( arrtoks ) {
for (int i=0; arrtoks[i]; i++) {
printf( "%d: %s\n", i, arrtoks[i] );
}
}
toksarray_free2( arrtoks ); // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks ); // non-dangling artoks
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/
Implementation Notes
Both functions use strsep() for the tokenization which makes them thread-safe, but it's not a standard function. If not provided, you can always use an open-source implementation (like GNU's or Apple's for example). Same goes for the function strdup() which is used in str_toksarray_alloc2() (its implementation is trivial but again here's GNU's and Apple's for example).
A side-effect of using strsep() in str_toksarray_alloc() is that the starting pointer of the source-string keeps moving to the next token in every step of the parsing loop. This means that the caller won't be able to free the parsed string, unless they had saved the starting address to an extra pointer. We save them the hassle, by doing that locally in the function, using the strpSaved pointer. str_toksarray_alloc2() is not affected by this, because it doesn't touch the source-string.
A main difference between the 2 functions is that str_toksarray_alloc() does not allocate memory for the found tokens. It rather allocates space just for the array pointers and sets them pointing directly into the source-string. This works because strsep() nul-terminates the found tokens in-place. This dependency can complicate your supporting code, but with big strings it can also make a big difference in performance. If preserving the source-string is not important, it can make a big difference in memory footprint too.
On the other hand, str_toksarray_alloc2() allocates and returns a self sustained array of dynamically allocated copies of the tokens, without further dependencies. It does so firstly by creating the array from a local duplicate of the source-string, and secondly by duplicating the actual tokens contents into the array. This is a lot slower and leaves a much bigger memory footprint compared to str_toksarray_alloc(), but it has no further dependencies, and sets no special requirements for the nature of the source-string. This makes it easier to write simpler (hence better maintainable) supporting code.
Another difference between the 2 functions is the 1st level of allocation (the array pointers) when ntoks is muted. They both parse all available tokens, but they take quite different approaches. str_toksarray_alloc() uses alloc-ahead with an initial size of 16 (char-pointers), doubling it on demand in the parsing loop. str_toksarray_alloc2() makes a 1st pass counting all available tokens, then it allocates that many char-pointers just once. That 1st pass is done with a helper function str_toksfound() which uses the standard functions strpbrk() and strchr(). I'm providing the source-code of that function too, further below.
Which approach is better is really up to you to decide, depending on the needs of your project. Feel free to adjust the code of each function to either approach and take it from there.
I'd say that on average and for really big strings alloc-ahead is much faster, especially when the initial size and grow factor are fine tuned on a per-case basis (making them function parameters for example). Saving that extra pass with all those strchr()'s and strpbrk()'s can make a difference there. However, with relatively small strings which is pretty much the norm, allocing-ahead just a bunch of char-pointers is just an overkill. It doesn't hurt but it does clutter the code for no good reason in this case. Anyway, feel free to choose whichever suits you best.
Same goes for these 2 functions. I'd say in most cases str_toksarray_alloc2() is much simpler to cope with, since memory and performance are rarely an issue with small to medium strings. If you have to deal with huge strings, then consider using str_toksarray_alloc() (though in those cases you should roll a specialized string parsing function, close to the needs of your project and the specs of your input).
Oh boy, I think that was a bit more than just 2 cents (lol).
Anyway, here is the code of the 2 functions and the helper ones (I've removed most of their description comments, since I've covered pretty much everything already).
Source Code
str_toksarray_alloc():
// ----------------------------------------
// Destructively tokenize the nul-terminated string *strp with strsep().
//
// strp      - in/out: address of the writable source string; delimiters are
//             overwritten in place, and *strp holds its initial value again
//             when the function returns.
// delim     - set of delimiter characters.
// ntoks     - in/out, optional: a non-zero *ntoks caps the number of tokens;
//             on return it holds the number of tokens stored (0 on failure).
// keepnulls - true keeps empty tokens, false drops them.
//
// Returns a malloc'ed array of pointers into the source string followed by a
// NULL sentinel, or NULL on invalid arguments or allocation failure.
// The caller frees only the returned array.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    // sanity checks
    if ( strp == NULL || *strp == NULL || **strp == '\0' || delim == NULL ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    char *const origin = *strp;                           // strsep() moves *strp
    const bool limited = (ntoks != NULL && *ntoks != 0);  // caller set a cap?
    size_t capacity = limited ? *ntoks : 16;              // slots, sentinel excluded

    // array of char-pointers (+1 for the NULL sentinel)
    char **toksarr = malloc( (capacity + 1) * sizeof(*toksarr) );
    if ( toksarr == NULL ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    size_t found = 0;   // tokens stored so far
    for (char *tok = strsep(strp, delim); tok != NULL; tok = strsep(strp, delim)) {
        // drop empty tokens unless the caller wants them
        if ( !keepnulls && *tok == '\0' ) {
            continue;
        }

        if ( found == capacity ) {
            // the caller's cap has been reached: stop here
            if ( limited ) {
                *ntoks = found;
                break;
            }
            // no cap: double the array and carry on
            capacity *= 2;
            char **grown = realloc( toksarr, (capacity + 1) * sizeof(*grown) );
            if ( grown == NULL ) {
                *strp = origin;
                free( toksarr );
                if (ntoks) *ntoks = 0;
                return NULL;
            }
            toksarr = grown;
        }

        toksarr[found++] = tok;
    }

    toksarr[found] = NULL;          // NULL sentinel
    *strp = origin;                 // give the caller their pointer back
    if (ntoks) *ntoks = found;      // report how many tokens were stored
    return toksarr;
}
str_toksarray_alloc2():
// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error.
// The 2 at the end of the name means 2-levels of allocation.
//
// str       : string to split (left untouched).
// delim     : set of delimiter characters.
// ntoks     : in  - if non-NULL and non-zero, the maximum number of tokens
//                   to return; otherwise str_tokscount() decides.
//             out - if non-NULL, receives the number of returned tokens
//                   (0 on failure).
// keepnulls : when false, empty tokens are dropped.
//
// The caller releases the result with toksarray_free2().
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    char   **toksarr = NULL;
    char    *copy    = NULL;   // start of the working copy; the ONLY pointer ever freed
    char    *cursor  = NULL;   // advanced by strsep(); never freed
    size_t   limit   = 0;
    size_t   i       = 0;      // # of actually parsed tokens
    char    *tok;

    // sanity checks
    if ( !str || !*str || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // make a copy of str to work with
    copy = strdup( str );
    if ( !copy ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }
    // strsep() rewrites the pointer it is given (to the rest of the string,
    // or to NULL once exhausted), so hand it a separate cursor and keep
    // `copy` intact for free().
    cursor = copy;

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    limit = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( limit == 0 ) {   // nothing to return
        goto fail_free_str;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    toksarr = malloc( (limit + 1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto fail_free_str;
    }

    // Parse tokens and duplicate them into the array
    while ( i < limit && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto fail_free_arr;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL;       // NULL sentinel
    free( copy );            // release the local copy of the source-string
    if (ntoks) *ntoks = i;   // pass to caller the # of parsed tokens
    return toksarr;

    // cleanup before failing
fail_free_arr:
    for (size_t idx = 0; idx < i; idx++) {
        free( toksarr[idx] );
    }
    free( toksarr );
fail_free_str:
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}
str_tokscount() - helper function, used by str_toksarray_alloc2():
// ----------------------------------------
// Count the tokens of a nul-terminated string (str), where any character of
// the nul-terminated string delim acts as a separator.
//
// keepnulls : true  - empty tokens are counted too
//             false - empty tokens are left out of the count
//
// Like strsep(), a string without any delimiter (or an empty delim) counts
// as one token. Returns 0 if str or delim is NULL, or str is empty.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    if ( str == NULL || delim == NULL || *str == '\0' ) {
        return 0;
    }

    size_t total   = 1;   // the string itself is always at least one token
    // a leading delimiter means the first token is empty
    size_t empties = (strchr(delim, *str) != NULL) ? 1 : 0;

    const char *hit = strpbrk(str, delim);
    while ( hit != NULL ) {
        const char *next = hit + 1;   // first char of the following token
        total++;
        // The following token is empty when it starts with a delimiter, or
        // with the terminator (strchr() also matches the '\0' of delim).
        if ( strchr(delim, *next) != NULL ) {
            empties++;
        }
        hit = strpbrk(next, delim);
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}
toksarray_free2() - use it on the array returned by str_toksarray_alloc2():
// ----------------------------------------
// Release a NULL-terminated array of char-pointers together with the
// dynamically allocated string each element points to (2 levels of
// deallocation, hence the 2 in the name).
//
// Always returns NULL so the caller can write
//     arr = toksarray_free2(arr);
// and avoid keeping a dangling pointer. Passing NULL is a no-op.
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr == NULL ) {
        return NULL;
    }
    // elements first: stop at the NULL sentinel
    for ( size_t k = 0; toksarr[k] != NULL; k++ ) {
        free( toksarr[k] );
    }
    // then the array itself
    free( toksarr );
    return NULL;
}
Both strtok() and strsep() modify the input string. We can write a function to split the string based on delimiters using strspn() and strpbrk().
Algorithm:
If the input string is not empty, go to step 2 else return null.
Skip separator, if any at the start of string, and record start position of word (using strspn() for this), call it start.
Find next separator position (or end of string if no more separator exists) from the current start found in previous step (using strpbrk() for this), call it end.
Allocate memory and copy string from start to end in that memory.
Return token.
Advantage:
Thread safe.
Handles multiple delimiters.
Portable.
Doesn't modify the input string, unlike strtok() and strsep().
Implementation:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * alloc_str: return a freshly allocated, nul-terminated copy of the
 * characters in the half-open range [start, end).
 *
 * Returns NULL when either pointer is NULL or the range is empty/inverted.
 * If the allocation fails, a message is written to stderr and the program
 * exits with EXIT_FAILURE. The caller owns (and must free) the result.
 */
static char * alloc_str (const char * start, const char * end) {
    if (start == NULL || end == NULL) {
        return NULL;
    }
    if (end <= start) {
        return NULL;
    }

    size_t span = (size_t) (end - start);
    char * copy = malloc (span + 1);      /* +1 for the terminator */
    if (copy == NULL) {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    memcpy (copy, start, span);
    copy[span] = '\0';
    return copy;
}
/*
 * str_split function returns the next token, i.e. the next sequence of
 * contiguous characters that contains none of the characters in sep.
 *
 * Parameters:
 * p_str : Address of the pointer to the string that you want to split.
 *         On return the pointer is advanced to the first character after
 *         the returned token, so the function can be called in a loop.
 *         The string itself is never modified.
 * sep   : A set of characters that delimit the pieces in the string.
 *
 * Behaviour is undefined if sep is not a pointer to a null-terminated string.
 *
 * Return :
 * A pointer to dynamically allocated memory holding a copy of the token;
 * the caller must free() it.
 * NULL if p_str is NULL, *p_str is NULL or an empty string, or only
 * separators remain.
 */
char * str_split (char ** p_str, const char * sep) {
    /* The contract above promises NULL for a NULL p_str, so test the
     * pointer itself before dereferencing it. */
    if (p_str == NULL || *p_str == NULL || **p_str == '\0') {
        return NULL;
    }

    /* skip leading separators */
    char * start = *p_str + strspn (*p_str, sep);

    /* the token ends at the next separator, or at the end of the string
     * when strpbrk() finds none */
    char * end = strpbrk (start, sep);
    if (end == NULL) {
        end = start + strlen (start);
    }

    *p_str = end;
    /* alloc_str() returns NULL for an empty range (only separators left) */
    return alloc_str (start, end);
}
/*==================================================*/
/*==================================================*/
/*
* Just a helper function
*/
void token_helper (char * in_str, const char * delim) {
printf ("\nInput string : ");
if (in_str) printf ("\"%s\"\n", in_str);
else printf ("NULL\n");
if (delim) printf ("Delimiter : \"%s\"\n", delim);
char * ptr = in_str;
char * token = NULL;
printf ("Tokens:\n");
while ((token = str_split(&ptr, delim)) != NULL) {
printf ("-> %s\n", token);
/* You can assign this token to a pointer of an array of pointers
* and return that array of pointers from this function.
* Since, this is for demonstration purpose, I am
* freeing the allocated memory now.
*/
free (token);
}
}
/*
 * Driver function: feeds a fixed list of (input, delimiters) pairs to
 * token_helper().
 */
int main (void) {
    /* test cases, in the order they are run; a NULL text is passed
     * through as a NULL input string */
    static const struct {
        const char * text;
        const char * seps;
    } cases[] = {
        { "hello world!", " " },
        { " hello world,friend of mine!", " ," },
        { "Another string", "-!" },
        { " one more -- string !", "- !" },
        { "", " " },
        { NULL, "" },
        { "hi", " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp" },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC", "," },
    };
    char string[100];   /* writable copy handed to token_helper() */

    for (size_t k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (cases[k].text == NULL) {
            token_helper (NULL, cases[k].seps);
            continue;
        }
        strcpy (string, cases[k].text);
        token_helper (string, cases[k].seps);
    }
    return 0;
}
Output:
# ./a.out
Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!
Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!
Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string
Input string : " one more -- string !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string
Input string : ""
Delimiter : " "
Tokens:
Input string : NULL
Delimiter : ""
Tokens:
Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi
Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give
-> a
-> a a
-> u
-> of
-> ro
-> er
-> offee in a
-> o
-> er
-> offee
-> u
-> .
Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC
My approach is to scan the string and let the pointers point to every character after the delimiters (and the first character), at the same time overwriting each occurrence of the delimiter in the string with '\0'.
First make a copy of original string(since it's constant), then get the number of splits by scan it pass it to pointer parameter len. After that, point the first result pointer to the copy string pointer, then scan the copy string: once encounter a deliminator, assign it to '\0' thus the previous result string is terminated, and point the next result string pointer to the next character pointer.
/*
 * Split a_str at every occurrence of a_delim.
 *
 * The input string is not modified: the function works on a private copy in
 * which each delimiter is replaced by '\0'.
 *
 * a_str   : nul-terminated string to split.
 * a_delim : single delimiter character.
 * len     : if non-NULL, receives the number of strings in the returned
 *           array (number of delimiters + 1), or 0 on failure.
 *
 * Returns a malloc'ed array of pointers into ONE malloc'ed copy of the
 * string, or NULL on bad input / out of memory. Empty pieces (consecutive,
 * leading or trailing delimiters) are returned as empty strings.
 * To release: free(result[0]); free(result);
 */
char** split(char* a_str, const char a_delim, int* len){
    if (len != NULL) {
        *len = 0;
    }
    if (a_str == NULL) {
        return NULL;
    }

    /* n delimiters produce n + 1 pieces */
    int pieces = 1;
    for (const char *scan = a_str; *scan != '\0'; ++scan) {
        if (*scan == a_delim) {
            pieces += 1;
        }
    }

    size_t length = strlen(a_str);
    char *copy = malloc(length + 1);                        /* +1: terminator */
    char **results = malloc((size_t)pieces * sizeof *results);
    if (copy == NULL || results == NULL) {
        free(copy);
        free(results);
        return NULL;
    }
    memcpy(copy, a_str, length + 1);

    /* cut the copy in place and record where each piece starts */
    int next = 0;
    results[next++] = copy;
    for (char *s = copy; *s != '\0'; ++s) {
        if (*s == a_delim) {
            *s = '\0';
            results[next++] = s + 1;
        }
    }

    if (len != NULL) {
        *len = pieces;
    }
    return results;
}
My code (tested):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split str in place (via strtok) on any character of delim.
 *
 * str    : writable, nul-terminated string; delimiters are overwritten.
 * delim  : set of delimiter characters.
 * array  : receives a malloc'ed array of pointers INTO str (NULL when no
 *          token was found or on failure). Release with free(*array) only.
 * length : receives the number of tokens (0 on failure).
 *
 * Returns 1 on success, 0 on invalid arguments or allocation failure.
 * Consecutive delimiters are treated as one (strtok semantics).
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
    if (str == NULL || delim == NULL || array == NULL || length == NULL) {
        return 0;
    }

    char **res = NULL;   /* realloc(NULL, n) behaves like malloc(n) */
    int count = 0;

    for (char *token = strtok(str, delim);
         token != NULL;
         token = strtok(NULL, delim)) {
        /* grow through a temporary so the block is not lost on failure */
        char **grown = realloc(res, (size_t)(count + 1) * sizeof *grown);
        if (grown == NULL) {
            free(res);
            *array = NULL;
            *length = 0;
            return 0;
        }
        res = grown;
        res[count++] = token;
    }

    *array = res;
    *length = count;
    return 1;
}
/*
 * Demo: split a comma-separated list of months and print each token.
 */
int main()
{
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **arr = NULL;
    int count = 0;

    /* act on the status instead of storing it in an unused variable */
    if (!dtmsplit(str, ",", &arr, &count)) {
        fprintf(stderr, "dtmsplit failed\n");
        return 1;
    }

    printf("Found %d tokens.\n", count);
    for (int i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* the tokens point into str; only the pointer array is heap memory */
    free(arr);
    return(0);
}
Result:
Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC
Two issues surrounding this question are memory management and thread safety. As you can see from the numerous posts,
this isn't an easy task to accomplish seamlessly in C. I desired a solution that is:
Thread safe. (strtok is not thread safe)
Does not employ malloc or any of its derivatives (to avoid memory management issues)
Checks array bounds on the individual fields (to avoid segment faults on unknown data)
Works with multi-byte field separators (utf-8)
ignores extra fields in the input
provides soft error routine for invalid field lengths
The solution I came up meets all of these criteria. It's probably a little more work to setup
than some other solutions posted here, but I think that in practice, the extra work is worth
it in order to avoid the common pitfalls of other solutions.
#include <stdio.h>
#include <string.h>
/* One caller-provided output field: a buffer and its total size in bytes
 * (the terminating '\0' included). */
struct splitFieldType {
    char *field;
    int maxLength;
};
typedef struct splitFieldType splitField;

/* Copy the first len bytes of src into dst->field, truncating to the
 * buffer size (and reporting it through softError) when they do not fit. */
static void strsplit_store(splitField *dst, int index, const char *src, size_t len,
                           void (*softError)(int fieldNumber,int expected,int actual)) {
    if (dst->field == NULL || dst->maxLength <= 0) {
        return;   /* nowhere to store anything, not even a terminator */
    }
    size_t room = (size_t)dst->maxLength - 1;   /* payload bytes that fit */
    if (len > room) {
        if (softError != NULL) {
            softError(index, (int)room, (int)len);
        }
        len = room;
    }
    memcpy(dst->field, src, len);
    dst->field[len] = '\0';
}

/*
 * Split input on the (possibly multi-byte) string fieldSeparator into at
 * most `expected` caller-provided buffers. No memory is allocated.
 *
 * fields         : array of at least `expected` output descriptors.
 * expected       : number of fields to fill; extra input fields are ignored.
 * input          : nul-terminated string to split (not modified).
 * fieldSeparator : nul-terminated separator string.
 * softError      : called as softError(fieldIndex, maxPayload, actualLength)
 *                  whenever a field is too long for its buffer; the field is
 *                  then truncated to maxPayload bytes. May be NULL.
 *
 * Returns the number of fields written (0 on invalid arguments).
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual)) {
    if (fields == NULL || input == NULL || fieldSeparator == NULL || expected <= 0) {
        return 0;
    }

    size_t fieldSeparatorLen = strlen(fieldSeparator);
    const char *tLast = input;
    const char *tNext;
    int i;

    /* every field that is followed by a separator */
    for (i = 0; i < expected && (tNext = strstr(tLast, fieldSeparator)) != NULL; ++i) {
        strsplit_store(&fields[i], i, tLast, (size_t)(tNext - tLast), softError);
        tLast = tNext + fieldSeparatorLen;
    }

    /* The remainder after the last separator is one more field. It gets the
     * same bounds check and truncation as the others: a remainder of exactly
     * maxLength bytes would otherwise overflow the buffer by its terminator. */
    if (i < expected) {
        strsplit_store(&fields[i], i, tLast, strlen(tLast), softError);
        return i + 1;
    }
    return i;
}
/* Soft-error callback for the month example: report on stderr which input
 * field (printed 1-based) had an unexpected length. */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    const int humanIndex = fieldNumber + 1;   /* fieldNumber is 0-based */

    fprintf(stderr,
            "monthSplit: input field #%d is %d bytes, expected %d bytes\n",
            humanIndex, actual, expected);
}
/*
 * Demo: split a list of month names into twelve fixed-size buffers.
 */
int main() {
    const char *fieldSeparator=",";
    /* "APRIL" is deliberately too long for its 4-byte buffer so that the
     * soft-error callback fires; FOO and BAR are extra fields that get
     * ignored. */
    const char *input="JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";

    /* storage for the fields: 3 characters + terminator each */
    struct monthFieldsType {
        char field1[4];
        char field2[4];
        char field3[4];
        char field4[4];
        char field5[4];
        char field6[4];
        char field7[4];
        char field8[4];
        char field9[4];
        char field10[4];
        char field11[4];
        char field12[4];
    } monthFields;

    /* descriptors telling strsplit() where each field goes and how big it is */
    splitField inputFields[12] = {
        {monthFields.field1, sizeof(monthFields.field1)},
        {monthFields.field2, sizeof(monthFields.field2)},
        {monthFields.field3, sizeof(monthFields.field3)},
        {monthFields.field4, sizeof(monthFields.field4)},
        {monthFields.field5, sizeof(monthFields.field5)},
        {monthFields.field6, sizeof(monthFields.field6)},
        {monthFields.field7, sizeof(monthFields.field7)},
        {monthFields.field8, sizeof(monthFields.field8)},
        {monthFields.field9, sizeof(monthFields.field9)},
        {monthFields.field10, sizeof(monthFields.field10)},
        {monthFields.field11, sizeof(monthFields.field11)},
        {monthFields.field12, sizeof(monthFields.field12)}
    };
    int expected=sizeof(inputFields)/sizeof(splitField);

    printf("input data: %s\n", input);
    printf("expecting %d fields\n",expected);

    int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);
    if (ct!=expected) {
        printf("string split %d fields, expected %d\n", ct,expected);
    }
    for (int i=0;i<expected;++i) {
        printf("field %d: %s\n",i+1,inputFields[i].field);
    }
    printf("\n");
    printf("Direct structure access, field 10: %s", monthFields.field10);
    return 0;
}
Below is an example compile and output. Note that in my example, I purposefully spelled out "APRIL" so that you can see how the soft error works.
$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC
Direct structure access, field 10: OCT
Enjoy!
Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.
The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.
As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".
The function below is commented in-line and a short main() was added splitting the months. The initial number of pointers allocated was set at 2 to force three reallocations while tokenizing the input string:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NPTRS 2 /* initial number of pointers to allocate (must be > 0) */
/* Split src into freshly allocated copies of its tokens.
 *
 * Any character of delim separates tokens; runs of delimiters count as one,
 * so no empty token is ever produced. The returned pointer array always ends
 * with a NULL sentinel and is doubled in size whenever it fills up.
 *
 * Returns NULL only if the initial pointer block cannot be allocated. If a
 * later allocation fails, the tokens collected so far are returned.
 * The caller frees every token and then the array.
 */
char **strsplit (const char *src, const char *delim)
{
    int used = 0;          /* tokens stored so far */
    int cap = NPTRS;       /* pointers currently allocated */
    char **out = malloc (cap * sizeof *out);

    if (out == NULL) {
        perror ("malloc-dest");
        return NULL;
    }
    out[0] = NULL;         /* empty list: sentinel only */

    const char *cur = src;
    while (*cur != '\0') {
        cur += strspn (cur, delim);              /* skip a run of delimiters */
        size_t len = strcspn (cur, delim);       /* length of the next token */
        if (len == 0)                            /* only the end is left */
            break;

        if (used == cap - 1) {                   /* one slot left: the sentinel's */
            void *bigger = realloc (out, 2 * cap * sizeof *out);
            if (bigger == NULL) {
                perror ("realloc-dest");
                break;                           /* keep what we already have */
            }
            out = bigger;
            cap *= 2;
        }

        out[used] = malloc (len + 1);
        if (out[used] == NULL) {                 /* slot now doubles as sentinel */
            perror ("malloc-dest[i]");
            break;
        }
        memcpy (out[used], cur, len);
        out[used][len] = 0;
        used++;
        out[used] = NULL;                        /* move the sentinel */

        cur += len;
    }
    return out;
}
/* Demo: split a list of months (with a run of empty fields) and print the
 * tokens, releasing each one as soon as it has been printed. */
int main (void) {
    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (str, ",");

    if (tokens == NULL)                          /* initial allocation failed */
        return 0;

    for (size_t k = 0; tokens[k] != NULL; k++) { /* stop at the sentinel */
        puts (tokens[k]);
        free (tokens[k]);                        /* each string is allocated */
    }
    free (tokens);                               /* and so is the array */
    return 0;
}
Example Use/Output
$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC
Let me know if you have any further questions.
#include <cstring>
#include <cstdio>
// Print each space-separated word of a sentence on its own line.
// strtok() cuts the buffer in place, which is why buf is a writable array.
int main()
{
    char buf[] = "This is Luke Skywalker here!";

    char* word = strtok( buf, " ");
    while( word != nullptr) {
        puts( word);
        word = strtok( nullptr, " ");   // continue in the same buffer
    }
}
Outputs
This
is
Luke
Skywalker
here!
Came across this looking for a simple solution.
I am fascinated by all of the options but dissatisfied for my own use case/taste (which may be terrible).
I have created a somewhat unique solution that aims to clearly behave for its user, not re-allocate any memory, and be human readable + with comments.
Uploaded to gist.github here: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093
Example:
#include "./strutils.c"
// Usage fragment (meant to live inside a function body).
// Fill in the two input fields of the info struct...
struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";
// ...let str_split_begin() allocate and fill splitStrings / splitStringsCount...
str_split_begin(&info);
char * substr;
// ...walk the resulting substrings...
for (int i=0; i<info.splitStringsCount; i++) {
substr = info.splitStrings[i];
printf("substring: '%s'\n", substr);
}
// ...and release everything str_split_begin() allocated.
str_split_end(&info);
Output:
$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'
Full source of strutils.c
#ifndef STRUTILS_C
#define STRUTILS_C 1
#ifndef str
#define str char *
#endif
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
/* Input and output of one split operation (see str_split_begin / str_split_end). */
struct str_split_info {
/* The string to be split.
* Provided by the caller of str_split_begin.
*/
str source;
/* The string that cuts the source string; all occurrences of
* this string are removed from the results.
* Provided by the caller of str_split_begin.
*/
str delimiter;
/* Array of the substrings found between delimiters.
* Allocated and filled by str_split_begin;
* must be released with str_split_end.
*/
str * splitStrings;
/* Array with the length of each substring in splitStrings.
* Allocated and filled by str_split_begin;
* must be released with str_split_end.
*/
int * splitStringsLengths;
/* Number of entries in splitStrings.
* Set by str_split_begin.
*/
int splitStringsCount;
};
#define str_split_infop struct str_split_info *
/* Split a string by a delimiting string
*
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
/* Split info->source at every occurrence of the string info->delimiter.
 *
 * On return:
 *   info->splitStrings        - malloc'ed array of malloc'ed, nul-terminated
 *                               substrings (empty strings are kept)
 *   info->splitStringsLengths - malloc'ed array with strlen() of each one
 *   info->splitStringsCount   - number of substrings (occurrences + 1)
 *
 * An empty delimiter never matches, so the whole source is returned as a
 * single substring. If source or delimiter is NULL, or memory runs out,
 * the count is 0 and both arrays are NULL.
 *
 * The caller is responsible only for calling str_split_end when finished
 * with the results in 'info'.
 */
void str_split_begin (str_split_infop info) {
  info->splitStrings = NULL;
  info->splitStringsLengths = NULL;
  info->splitStringsCount = 0;

  if (info->source == NULL || info->delimiter == NULL) {
    return;
  }
  const char *source = info->source;
  const char *delimiter = info->delimiter;
  const size_t delimiterLength = strlen(delimiter);

  //first pass, count occurrences so each array is allocated only once;
  //strstr finds every non-overlapping match, including one that starts
  //inside a failed partial match (e.g. "ab" in "aab")
  int pieceCount = 1;
  if (delimiterLength > 0) {
    for (const char *hit = strstr(source, delimiter);
         hit != NULL;
         hit = strstr(hit + delimiterLength, delimiter)) {
      pieceCount++;
    }
  }

  str *pieces = malloc(sizeof *pieces * (size_t) pieceCount);
  int *lengths = malloc(sizeof *lengths * (size_t) pieceCount);
  if (pieces == NULL || lengths == NULL) {
    free(pieces);
    free(lengths);
    return;
  }

  //second pass, copy each substring out of the source
  const char *begin = source;
  for (int index = 0; index < pieceCount; index++) {
    const bool isLast = (index == pieceCount - 1);
    //the last piece runs to the end of the source, every other piece
    //runs up to the next delimiter (which the first pass proved exists)
    const char *end = isLast ? begin + strlen(begin) : strstr(begin, delimiter);
    const size_t length = (size_t) (end - begin);

    //+1 for trailing null for c-string
    char *piece = malloc(length + 1);
    if (piece == NULL) {
      //undo everything allocated so far
      while (index > 0) {
        free(pieces[--index]);
      }
      free(pieces);
      free(lengths);
      return;
    }
    memcpy(piece, begin, length);
    piece[length] = '\0';

    pieces[index] = piece;
    lengths[index] = (int) length;

    if (!isLast) {
      //next substring starts right after the current delimiter
      begin = end + delimiterLength;
    }
  }

  info->splitStrings = pieces;
  info->splitStringsLengths = lengths;
  info->splitStringsCount = pieceCount;
}
/* Return the number of substrings currently recorded in info. */
int str_split_count (str_split_infop info) {
  const int count = info->splitStringsCount;
  return count;
}
/* Copy every substring of info into the caller's buffers: out[i] receives
 * splitStrings[i] via strcpy, so each out[i] must already point to enough
 * writable space. */
void str_split_get (str_split_infop info, str * out) {
  int index = 0;
  while (index < info->splitStringsCount) {
    strcpy(out[index], info->splitStrings[index]);
    index++;
  }
}
/* Release what a split stored in info: each substring, the substring array
 * and the lengths array; then reset the count to 0.
 * Does nothing when the count is not positive or the array is NULL. */
void str_split_end (str_split_infop info) {
  if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
    return;
  }

  //free each string allocated
  int index = 0;
  while (index < info->splitStringsCount) {
    free(info->splitStrings[index]);
    index++;
  }

  //then the two arrays themselves
  free(info->splitStrings);
  free(info->splitStringsLengths);

  info->splitStringsCount = 0;
}
/* Smoke test: split a sentence on single spaces, visit every substring
 * (without doing anything with it) and release the results. */
void str_split_test () {
  struct str_split_info info;

  info.source = "hello world this is a test";
  info.delimiter = " ";
  str_split_begin (&info);

  //iterate thru split substrings
  //NOTE: the substrings are gone after str_split_end
  int index = 0;
  while (index < info.splitStringsCount) {
    // info.splitStrings[index];
    index++;
  }

  str_split_end(&info);
}
#endif
I tried to make a very simple one. I am also showing example in the main().
#include <stdio.h>
#include <string.h>
/* Cut inputArr in place (strtok) on any character of delim and store a
 * pointer to each token in consecutive slots of outputArr.
 * No terminator is written after the last token and no bound is checked:
 * outputArr must have room for every token. */
void split(char* inputArr, char** outputArr, char* delim) {
    char **slot = outputArr;
    char *piece = strtok(inputArr, delim);

    while (piece != NULL) {
        *slot++ = piece;
        piece = strtok(NULL, delim);
    }
}
/*
 * Read the text file named by the single command-line argument and, for
 * every line, print its first (up to) four space-separated tokens.
 */
int main(int argc, char **argv){
    enum { BUFFER_LEN = 100, MAX_TOKENS = 100, SHOWN_TOKENS = 4 };

    /* check for proper arguments */
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {
        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            /* fgets() on a NULL stream is undefined: report and stop */
            perror(argv[1]);
            return 1;
        }
        /* read txt file and split into array like java split() */
        char buffer[BUFFER_LEN];
        /* a BUFFER_LEN-byte line cannot hold more than MAX_TOKENS tokens */
        char *splitArr[MAX_TOKENS];
        while(fgets(buffer, BUFFER_LEN, myScriptFile) != NULL){
            /* split() writes no terminator, so clear the slots we are about
             * to print; otherwise a short line would show uninitialised or
             * stale pointers from the previous line */
            for (int k = 0; k < SHOWN_TOKENS; k++) {
                splitArr[k] = NULL;
            }
            split(buffer, splitArr, " ");
            for (int k = 0; k < SHOWN_TOKENS && splitArr[k] != NULL; k++) {
                printf("Index %d String: %s\n", k, splitArr[k]);
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}
Assume a .txt file has
Hello this is test
Hello2 this is test2
running it with a .txt file as a parameter would give
Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test
Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

split 2D char array into two parts in C [duplicate]

How do I write a function to split and return an array for a string with delimiters in the C programming language?
char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');
You can use the strtok() function to split a string (and specify the delimiter to use). Note that strtok() will modify the string passed into it. If the original string is required elsewhere make a copy of it and pass the copy to strtok().
EDIT:
Example (note it does not handle consecutive delimiters, "JAN,,,FEB,MAR" for example):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/*
 * Split a_str on the character a_delim using strtok().
 *
 * a_str   : writable, nul-terminated string. strtok() overwrites the
 *           delimiters in it, so pass a copy if the original is needed.
 * a_delim : delimiter character.
 *
 * Returns a malloc'ed array of malloc'ed token copies, terminated by a NULL
 * pointer so the caller knows where the list ends; NULL if a_str is NULL or
 * memory runs out. Leading, trailing and consecutive delimiters produce no
 * empty tokens (strtok semantics). The caller frees each token and then the
 * array.
 */
char** str_split(char* a_str, const char a_delim)
{
    if (a_str == NULL)
    {
        return NULL;
    }

    char delim[2];
    delim[0] = a_delim;
    delim[1] = 0;

    /* Count the tokens exactly as strtok() will produce them: a token
       starts at every non-delimiter character that begins the string or
       follows a delimiter. */
    size_t count = 0;
    int inside = 0;
    for (const char* p = a_str; *p != '\0'; p++)
    {
        if (*p == a_delim)
        {
            inside = 0;
        }
        else if (!inside)
        {
            inside = 1;
            count++;
        }
    }

    /* +1 for the terminating NULL pointer. */
    char** result = malloc(sizeof(char*) * (count + 1));
    if (result == NULL)
    {
        return NULL;
    }

    size_t idx = 0;
    for (char* token = strtok(a_str, delim);
         token != NULL && idx < count;
         token = strtok(NULL, delim))
    {
        size_t len = strlen(token);
        char* copy = malloc(len + 1);
        if (copy == NULL)
        {
            /* Undo the partial result instead of returning a broken list. */
            while (idx > 0)
            {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        memcpy(copy, token, len + 1);
        result[idx++] = copy;
    }
    result[idx] = NULL;

    return result;
}
/* Demo: print the month list, split it on ',' and print every token,
 * freeing each token and finally the token array. */
int main()
{
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    printf("months=[%s]\n\n", months);

    char** tokens = str_split(months, ',');
    if (tokens == NULL)
    {
        return 0;
    }

    /* the list ends at the first NULL pointer */
    for (char** cursor = tokens; *cursor != NULL; ++cursor)
    {
        printf("month=[%s]\n", *cursor);
        free(*cursor);
    }
    printf("\n");
    free(tokens);

    return 0;
}
Output:
$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]
month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]
I think strsep is still the best tool for this:
while ((token = strsep(&str, ","))) my_fn(token);
That is literally one line that splits a string.
The extra parentheses are a stylistic element to indicate that we're intentionally testing the result of an assignment, not an equality operator ==.
For that pattern to work, token and str both have type char *. If you started with a string literal, then you'd want to make a copy of it first:
// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;
tofree = str = strdup(my_str_literal); // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);
If two delimiters appear together in str, you'll get a token value that's the empty string. The value of str is modified in that each delimiter encountered is overwritten with a zero byte - another good reason to copy the string being parsed first.
In a comment, someone suggested that strtok is better than strsep because strtok is more portable. Ubuntu and Mac OS X have strsep; it's safe to guess that other unixy systems do as well. Windows lacks strsep, but it has strpbrk which enables this short and sweet strsep replacement:
/* Replacement for strsep() built on strpbrk().
 *
 * Returns the token that starts at *stringp and overwrites the delimiter
 * that ends it with '\0'. *stringp is then moved just past that delimiter,
 * or set to NULL when the token was the last one.
 * Returns NULL once *stringp is NULL. Adjacent delimiters yield empty tokens. */
char *strsep(char **stringp, const char *delim) {
    char *head = *stringp;
    if (head == NULL) {
        return NULL;
    }

    char *cut = strpbrk(head, delim);   /* first delimiter, if any */
    if (cut != NULL) {
        *cut++ = '\0';                  /* end the token, step past it */
    }
    *stringp = cut;
    return head;
}
Here is a good explanation of strsep vs strtok. The pros and cons may be judged subjectively; however, I think it's a telling sign that strsep was designed as a replacement for strtok.
String tokenizer this code should put you in the right direction.
/* Print the words of a sentence one per line. The first strtok() call
 * splits on spaces only; every later call splits on spaces and commas.
 * getch() is called once after the last word, before returning. */
int main(void) {
    char st[] ="Where there is will, there is a way.";

    for (char *word = strtok(st, " ");
         word != NULL;
         word = strtok(NULL, " ,")) {
        printf("%s\n", word);
    }

    getch();
    return 0;
}
Method below will do all the job (memory allocation, counting the length) for you. More information and description can be found here - Implementation of Java String.split() method to split C string
/*
 * Split str at every occurrence of the character c.
 *
 * str : nul-terminated string to split (not modified).
 * c   : delimiter character.
 * arr : receives a malloc'ed array of malloc'ed, nul-terminated tokens.
 *       Empty tokens (leading, trailing or consecutive delimiters) are
 *       kept as empty strings.
 *
 * Returns the number of tokens (number of delimiters + 1).
 * The program exits with status 1 if memory cannot be allocated.
 * The caller frees each token and then the array.
 */
int split (const char *str, char c, char ***arr)
{
    int count = 1;
    const char *p;

    /* n delimiters separate n + 1 tokens */
    for (p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    *arr = (char**) malloc(sizeof(char*) * (size_t) count);
    if (*arr == NULL)
        exit(1);

    const char *start = str;
    for (int i = 0; i < count; i++)
    {
        /* the token ends at the next delimiter or at the end of the string */
        const char *end = start;
        while (*end != '\0' && *end != c)
            end++;

        size_t token_len = (size_t) (end - start);
        char *t = (char*) malloc(token_len + 1);   /* +1 for the terminator */
        if (t == NULL)
            exit(1);

        for (size_t k = 0; k < token_len; k++)
            t[k] = start[k];
        /* every token is terminated, the last one included */
        t[token_len] = '\0';
        (*arr)[i] = t;

        if (*end != '\0')
            start = end + 1;   /* step over the delimiter */
    }

    return count;
}
How to use it:
/* Demo: split a sentence on spaces, then print the token count and every
 * token with its index. */
int main (int argc, char ** argv)
{
    char *s = "Hello, this is a test module for the string splitting.";
    char **arr = NULL;
    int c = split(s, ' ', &arr);

    printf("found %d tokens.\n", c);

    int i = 0;
    while (i < c)
    {
        printf("string #%d: %s\n", i, arr[i]);
        i++;
    }
    return 0;
}
Here is my two cents:
/*
 * Split txt at every occurrence of delim.
 *
 * txt    : nul-terminated string to split (not modified).
 * delim  : delimiter character.
 * tokens : receives a malloc'ed array of allocated, nul-terminated tokens;
 *          empty tokens are kept as empty strings. Set to NULL on failure.
 *
 * Returns the number of tokens (delimiters + 1), or 0 if memory could not
 * be allocated. The caller frees each token and then the array.
 */
int split (const char *txt, char delim, char ***tokens)
{
    int count = 1;
    const char *p;

    /* n delimiters separate n + 1 tokens */
    for (p = txt; *p != '\0'; p++)
        if (*p == delim) count += 1;

    char **arr = malloc ((size_t) count * sizeof *arr);
    if (arr == NULL)
    {
        *tokens = NULL;
        return 0;
    }

    const char *start = txt;
    for (int i = 0; i < count; i++)
    {
        /* the token ends at the next delimiter or at the end of the text */
        const char *end = start;
        while (*end != '\0' && *end != delim) end++;

        size_t len = (size_t) (end - start);
        /* one byte per character plus the terminator; calloc zero-fills,
           so the terminator is already in place */
        char *token = calloc (len + 1, sizeof (char));
        if (token == NULL)
        {
            while (i > 0) free (arr[--i]);
            free (arr);
            *tokens = NULL;
            return 0;
        }
        for (size_t k = 0; k < len; k++) token[k] = start[k];
        arr[i] = token;

        if (*end != '\0') start = end + 1;   /* step over the delimiter */
    }

    *tokens = arr;
    return count;
}
Usage:
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
/* freeing tokens */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
In the above example, there would be a way to return an array of null terminated strings (like you want) in place in the string. It would not make it possible to pass a literal string though, as it would have to be modified by the function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Split str in place on delim.
 *
 * Every delimiter in str is overwritten with '\0' and the returned array
 * holds pointers into str, so str must be writable and must outlive the
 * array. The array has one extra slot holding a NULL sentinel.
 *
 * Parameters:
 *   str       - writable NUL-terminated string (may be empty)
 *   delim     - delimiter character, must not be '\0'
 *   numSplits - optional out: number of tokens, or -1 on failure
 *
 * Returns a malloc'ed array the caller releases with a single free(), or
 * NULL when str is NULL, delim is '\0', or the allocation fails.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;

    if ( ( str != NULL ) && ( delim != '\0' ) )
    {
        /* Pre-calculate the number of delimiters; a plain for loop never
           steps past the terminator, even for an empty string. */
        size_t delims = 0;
        for ( const char* s = str; *s != '\0'; s++ )
        {
            if ( *s == delim )
            {
                delims++;
            }
        }

        /* tokens = delims + 1, plus one slot for the NULL sentinel */
        ret = malloc( ( delims + 2 ) * sizeof( *ret ) );
        if ( ret != NULL )
        {
            retLen = 0;
            ret[retLen++] = str;
            for ( char* c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    *c = '\0';
                    ret[retLen++] = &c[1];
                }
            }
            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }
    return ret;
}
/* Demo for str_split(): split a writable copy of a literal and print the parts. */
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char* strCpy;
    char** split;
    int num;
    int i;

    (void) argc;
    (void) argv;

    /* +1 for the terminating NUL that strcpy() also writes */
    strCpy = malloc( ( strlen( str ) + 1 ) * sizeof( *strCpy ) );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );
    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );
        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    /* The array only points into strCpy, so one free() each is enough. */
    free( split );
    free( strCpy );
    return 0;
}
There is probably a neater way to do it, but you get the idea.
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/**
 * Splits str in place on delim and builds a dynamically allocated array of
 * pointers into str.
 *
 * Returns 0 on success and -1 on error (check errno). On success *array
 * receives the pointer array and *length its size:
 *   - 0 (and *array == NULL) for an empty string,
 *   - 1 if no delim was found,
 *   - otherwise the number of delimiters + 1.
 *
 * Empty tokens are stored as NULL entries, so "foo,,bar" yields
 *   { "foo", NULL, "bar" }
 * and a trailing delimiter as in "foo," yields { "foo", NULL }.
 * Callers must therefore check an entry for NULL before using it.
 *
 * Modifies str (delimiters become '\0'), so make a copy if this is a
 * problem. The entries point into str; release the array with one free().
 * str is left untouched when the function fails.
 */
int split( char * str, char delim, char ***array, int *length ) {
    char *p;
    char **res;
    int count;
    int k;

    if( !str || !array || !length ) {
        errno = EINVAL;
        return -1;
    }

    if( *str == '\0' ) {            // nothing to split
        *array = NULL;
        *length = 0;
        return 0;
    }

    // Count tokens: one more than the number of delimiters. A '\0' delimiter
    // can never occur inside the string, so it means "no delimiter".
    count = 1;
    if( delim != '\0' ) {
        for( p = strchr(str, delim); p != NULL; p = strchr(p + 1, delim) ) {
            count++;
        }
    }

    // allocate dynamic array (zeroed, so skipped entries stay NULL)
    res = calloc( (size_t)count, sizeof *res );
    if( !res ) return -1;

    p = str;
    for( k=0; k<count; k++ ){
        // Every token but the last is followed by a delimiter.
        char *end = (k + 1 < count) ? strchr(p, delim) : NULL;
        if( end ) *end = '\0';      // Null terminate at the delimiter
        if( *p ) res[k] = p;        // Non-empty token: record its start
        if( end ) p = end + 1;      // Start of next token
    }

    *array = res;
    *length = count;
    return 0;
}
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";
/* Demo for split(): print every entry of the split month list. */
int main() {
    char **res = NULL;
    int count = 0;
    int rc;

    rc = split( str, ',', &res, &count );
    if( rc ) {
        printf("Error: %s errno: %d \n", strerror(errno), errno);
        return 1;                   // res is not valid after a failure
    }

    printf("count: %d\n", count );
    for( int k=0; k<count; k++ ) {
        // split() stores empty tokens as NULL; never hand NULL to %s
        printf("str: %s\n", res[k] ? res[k] : "(null)");
    }

    free(res );
    return 0;
}
I think the following solution is ideal:
Doesn't destroy the source string
Re-entrant - i.e., you can safely call it from anywhere in one or more threads
Portable
Handles multiple separators correctly
Fast and efficient
Explanation of the code:
Define a structure token to store the address and lengths of the tokens
Allocate enough memory for these in the worst case, which is when
str is made up entirely of separators so there are strlen(str) + 1
tokens, all of them empty strings
Scan str recording the address and length of every token
Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
Allocate, copy, and add the tokens using the start and length
information - use memcpy as it's faster than strcpy and we know
the lengths
Free the token address and length array
Return the array of tokens
/* Start address and length of one token inside the source string. */
typedef struct {
    const char *start;
    size_t len;
} token;

/*
 * Split str on every occurrence of sep without modifying str.
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed token copies
 * (empty tokens are kept, so the array has separators + 1 entries), or NULL
 * if any allocation fails. The caller frees every entry and then the array.
 */
char **split(const char *str, char sep)
{
    char **array = NULL;
    size_t start = 0, stop, toks = 0, t;

    /* Worst case: str consists only of separators -> strlen(str) + 1 tokens */
    token *tokens = malloc((strlen(str) + 1) * sizeof *tokens);
    if (tokens == NULL)
        return NULL;

    for (stop = 0; str[stop]; stop++) {
        if (str[stop] == sep) {
            tokens[toks].start = str + start;
            tokens[toks].len = stop - start;
            toks++;
            start = stop + 1;
        }
    }
    /* Mop up the last token */
    tokens[toks].start = str + start;
    tokens[toks].len = stop - start;
    toks++;

    array = malloc((toks + 1) * sizeof *array);
    if (array != NULL) {
        for (t = 0; t < toks; t++) {
            /* Calloc makes it nul-terminated */
            char *copy = calloc(tokens[t].len + 1, 1);
            if (copy == NULL) {
                /* Undo the copies made so far and report failure */
                while (t > 0)
                    free(array[--t]);
                free(array);
                free(tokens);
                return NULL;
            }
            memcpy(copy, tokens[t].start, tokens[t].len);
            array[t] = copy;
        }
        /* Add a sentinel */
        array[t] = NULL;
    }

    free(tokens);
    return array;
}
Note malloc checking omitted for brevity.
In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.
My version:
/*
 * Copy the delimiter-separated pieces of str into a newly allocated array.
 *
 * Leading delimiters are skipped and a run of delimiters counts as a single
 * separator. *args receives a malloc'ed array of malloc'ed NUL-terminated
 * strings; the return value is the number of entries, which is at least 1:
 * a string without any piece yields one empty string.
 */
int split(char* str, const char delimeter, char*** args) {
    char* cursor = str;
    while (*cursor == delimeter) cursor++;

    /* A new piece starts wherever a delimiter is followed by a character
       that is neither a delimiter nor the terminator. */
    int pieces = 1;
    for (const char* scan = cursor; *scan != 0; ) {
        ++scan;
        if (scan[0] == delimeter && scan[1] != delimeter && scan[1] != 0)
            pieces++;
    }

    (*args) = malloc(sizeof(char*) * pieces);

    int index = 0;
    while (index < pieces) {
        const char* begin = cursor;
        while (*cursor != delimeter && *cursor != 0) cursor++;

        int size = (int)(cursor - begin) + 1;   /* characters plus terminator */
        char* piece = malloc(sizeof(char) * size);
        (*args)[index] = piece;
        memcpy(piece, begin, sizeof(char) * (size - 1));
        piece[size - 1] = 0;

        while (*cursor == delimeter) cursor++;  /* swallow the separator run */
        index++;
    }
    return pieces;
}
This function takes a char* string and splits it by the delimiter. There can be multiple delimiters in a row. Note that the function modifies the original string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls, so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the C++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory-optimized way shown below is O(2N).
The function:
// Split src_str in place on deliminator.
//
// Every delimiter is overwritten with '\0'; empty pieces (leading, trailing
// or repeated delimiters) are skipped. On return num_sub_str holds the number
// of sub strings found.
//
// Returns a malloc'ed array of num_sub_str pointers into src_str followed by
// a NULL sentinel (release it with one free()), or 0 when no sub string was
// found or the allocation failed (num_sub_str is reset to 0 in that case).
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
  //replace deliminator's with zeros and count how many
  //sub strings with length >= 1 exist
  num_sub_str = 0;
  char *src_str_tmp = src_str;
  bool found_delim = true;
  while(*src_str_tmp){
    if(*src_str_tmp == deliminator){
      *src_str_tmp = 0;
      found_delim = true;
    }
    else if(found_delim){ //found first character of a new string
      num_sub_str++;
      found_delim = false;
      //sub_str_vec.push_back(src_str_tmp); //for c++
    }
    src_str_tmp++;
  }
  printf("Start - found %zu sub strings\n", num_sub_str); //%zu matches size_t
  if(num_sub_str == 0){
    printf("str_split() - no substrings were found\n");
    return(0);
  }

  //if you want to use a c++ vector and push onto it, the rest of this function
  //can be omitted (obviously modifying input parameters to take a vector, etc)

  //one extra *pointer* (not one extra byte) for the NULL sentinel
  char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
  if(!sub_strings){
    num_sub_str = 0;
    return(0);
  }
  const char *src_str_terminator = src_str_tmp;
  src_str_tmp = src_str;
  bool found_null = true;
  size_t idx = 0;
  while(src_str_tmp < src_str_terminator){
    if(!*src_str_tmp) //found a NULL
      found_null = true;
    else if(found_null){
      sub_strings[idx++] = src_str_tmp;
      //printf("sub_string_%d: [%s]\n", idx-1, sub_strings[idx-1]);
      found_null = false;
    }
    src_str_tmp++;
  }
  sub_strings[num_sub_str] = NULL;

  return(sub_strings);
}
How to use it:
char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
char *str = strdup(months);
size_t num_sub_str;
char **sub_strings = str_split(str, ',', num_sub_str);
char *endptr;
if(sub_strings){
for(int i = 0; sub_strings[i]; i++)
printf("[%s]\n", sub_strings[i]);
}
free(sub_strings);
free(str);
This optimized method create (or update an existing) array of pointers in *result and returns the number of elements in *count.
Use "max" to indicate the maximum number of strings you expect (when you specify an existing array, or for any other reason); otherwise set it to 0.
To compare against a list of delimiters, define delim as a char* and replace the line:
if (str[i]==delim) {
with the two following lines:
char *c=delim; while(*c && *c!=str[i]) c++;
if (*c) {
Enjoy
#include <stdlib.h>
#include <string.h>
/*
 * Split str in place on delim, scanning at most len characters.
 *
 * Each delimiter found is replaced by '\0'. *count receives the number of
 * strings (always at least 1). If max is non-zero, scanning stops as soon as
 * *count reaches max, so the last string keeps the unsplit remainder.
 *
 * If *result is non-NULL on entry it is used as the output array and filled
 * while scanning; otherwise an array of *count pointers is malloc'ed, stored
 * in *result and filled afterwards. All entries point into str.
 *
 * Returns the array, or NULL if the allocation fails.
 */
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
  char **out = *result;
  const int caller_array = (out != NULL);
  size_t pos;

  // the whole input is always the first string
  *count = 1;
  if (caller_array) {
    out[0] = str;
  }

  // To compare against a list of delimiters, define delim as a string and
  // replace the test
  //     str[pos] != delim
  // with a scan over that list (skip the character when none of them matches).
  for (pos = 0; pos < len; ++pos) {
    if (str[pos] != delim) {
      continue;
    }
    // cut the string at the delimiter
    str[pos] = 0;
    // a caller-supplied array is filled on the fly
    if (caller_array) {
      out[*count] = str + pos + 1;
    }
    ++(*count);
    // honour the optional limit
    if (max && *count == max) {
      break;
    }
  }

  if (caller_array) {
    return out;
  }

  // no array was supplied: allocate one sized to what was found
  *result = malloc((*count) * sizeof(char*));
  out = *result;
  if (!out) {
    return NULL;
  }

  // walk the '\0'-separated pieces and record where each one starts
  char *piece = str;
  out[0] = piece;
  for (pos = 1; pos < *count; ++pos) {
    while (*piece) {
      ++piece;
    }
    ++piece;
    out[pos] = piece;
  }
  return out;
}
Usage example:
#include <stdio.h>
/*
 * Demo for split(): once with a caller-supplied array limited to 6 entries,
 * once with an array allocated by split() itself.
 */
int main(int argc, char **argv) {
  const char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
  // split() modifies its input, so each call works on its own copy; the
  // copies are kept so they can be freed once the results were printed
  char *copy=strdup(str);
  char *copy2=strdup(str);
  char **result=malloc(6*sizeof(char*));
  char **result2=0;
  unsigned long count;
  unsigned long count2;
  unsigned long i;

  (void)argc;
  (void)argv;

  if (!copy || !copy2 || !result) {
    free(copy);
    free(copy2);
    free(result);
    return 1;
  }

  split(copy,strlen(str),',',&result,&count,6);
  split(copy2,strlen(str),',',&result2,&count2,0);

  if (result)
  for (i=0; i<count; ++i) {
    printf("%s\n",result[i]);
  }

  printf("\n");

  if (result2)
  for (i=0; i<count2; ++i) {
    printf("%s\n", result2[i]);
  }

  // the arrays only point into the copies: free arrays and copies once each
  free(result);
  free(result2);
  free(copy);
  free(copy2);
  return 0;
}
Below is my strtok() implementation from zString library.
zstring_strtok() differs from standard library's strtok() in the way it treats consecutive delimiters.
Just have a look at the code below; I am sure you will get an idea of how it works (I tried to use as many comments as I could).
/*
 * strtok()-like tokenizer that reports consecutive delimiters.
 *
 * Only the first character of delim is used as the delimiter. Pass the
 * string on the first call and NULL on the following calls; the position is
 * kept in a static variable between calls.
 *
 * Returns:
 *   - NULL if delim is NULL, or if str is NULL and nothing is left to scan
 *   - delim itself when the current position starts with the delimiter
 *   - otherwise the current token, terminated in place; str must therefore
 *     be writable (a char[] rather than a string literal)
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *resume = 0;    /* where the next call continues */
    int pos = 0;

    if (delim == 0)
        return 0;

    if (str == 0) {
        if (resume == 0)
            return 0;           /* no more characters left */
        str = resume;
    }

    /* look for the first occurrence of the delimiter character */
    while (str[pos] != '\0' && str[pos] != delim[0])
        pos++;

    /* no delimiter in the rest of the string: it is the final token */
    if (str[pos] == '\0') {
        resume = 0;
        return str;
    }

    /* delimiter right at the start: report the delimiter itself */
    if (pos == 0) {
        resume = str + 1;
        return (char *)delim;
    }

    /* cut the token off and remember where the remainder begins */
    str[pos] = '\0';
    resume = str + pos + 1;
    return str;
}
Below is an example usage...
Example Usage
char str[] = "A,B,,,C";
/* The first call passes the buffer itself; later calls pass NULL to continue. */
printf("1 %s\n",zstring_strtok(str,","));
printf("2 %s\n",zstring_strtok(NULL,","));
printf("3 %s\n",zstring_strtok(NULL,","));
printf("4 %s\n",zstring_strtok(NULL,","));
printf("5 %s\n",zstring_strtok(NULL,","));
printf("6 %s\n",zstring_strtok(NULL,","));
Example Output
1 A
2 B
3 ,
4 ,
5 C
6 (null)
The library can be downloaded from Github
https://github.com/fnoyanisi/zString
This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.
This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)
/*
 * Split original on every non-overlapping occurrence of the (possibly
 * multi-character) delimiter, scanning from left to right.
 *
 * Outputs:
 *   *buffer        - malloc'ed array of *numStrings malloc'ed, NUL-terminated
 *                    pieces (empty pieces are kept)
 *   *numStrings    - number of pieces
 *   *stringLengths - malloc'ed array with the length of each piece
 * The caller frees every piece, the piece array and the length array.
 *
 * If the delimiter is longer than original, or an allocation fails, *buffer
 * and *stringLengths are set to NULL and *numStrings to 0. An empty
 * delimiter never matches, so the result is original as a single piece.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);

    *buffer = NULL;
    *numStrings = 0;
    *stringLengths = NULL;

    if(ld > lo){
        return;
    }

    /* Pass 1: count the pieces (one more than the number of delimiters). */
    int count = 1;
    if(ld > 0){
        for(const char *hit = strstr(original, delimiter); hit != NULL; hit = strstr(hit + ld, delimiter)){
            count++;
        }
    }

    int *lengths = malloc(sizeof *lengths * (size_t)count);
    char **pieces = calloc((size_t)count, sizeof *pieces);
    if(lengths == NULL || pieces == NULL){
        free(lengths);
        free(pieces);
        return;
    }

    /* Pass 2: copy each piece; the search runs exactly as in pass 1, so
       every piece but the last is guaranteed to be followed by a delimiter. */
    const char *start = original;
    for(int i = 0;i < count;i++){
        const char *end = (i + 1 < count) ? strstr(start, delimiter) : original + lo;
        const size_t len = (size_t)(end - start);

        pieces[i] = malloc(len + 1);
        if(pieces[i] == NULL){
            while(i-- > 0){
                free(pieces[i]);
            }
            free(pieces);
            free(lengths);
            return;
        }
        memcpy(pieces[i], start, len);
        pieces[i][len] = 0;
        lengths[i] = (int)len;

        if(i + 1 < count){
            start = end + ld;
        }
    }

    *buffer = pieces;
    *numStrings = count;
    *stringLengths = lengths;
}
Sample code:
/* Demo for splitString(): print every piece, then release all allocations. */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    char **buffer = NULL;
    int numStrings = 0;
    int * stringLengths = NULL;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    for(int i = 0;i < numStrings;i++){
        printf("String: %s\n", buffer[i]);
    }

    /* Every piece, the piece array and the length array belong to the caller. */
    for(int i = 0;i < numStrings;i++){
        free(buffer[i]);
    }
    free(buffer);
    free(stringLengths);
    return 0;
}
Libraries:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
Try using this.
/*
 * Split str into the tokens separated by any character of delim.
 *
 * Tokens are found the way strtok() finds them: runs of delimiter characters
 * count as one separator and empty tokens are never produced. str itself is
 * not modified.
 *
 * Returns a malloc'ed, NULL-terminated array of malloc'ed token copies (just
 * the NULL sentinel when str holds no token), or NULL if str or delim is
 * NULL or an allocation fails. The caller frees every token and the array.
 */
char** strsplit(char* str, const char* delim){
    char** res = NULL;
    size_t count = 0;

    if (str == NULL || delim == NULL)
        return NULL;

    const char* cur = str;
    for (;;) {
        cur += strspn(cur, delim);          /* skip the separator run */
        if (*cur == '\0')
            break;
        size_t len = strcspn(cur, delim);   /* length of this token */

        /* room for the tokens so far, this one, and the sentinel */
        char** grown = realloc(res, (count + 2) * sizeof *grown);
        if (grown == NULL)
            goto fail;
        res = grown;

        char* part = malloc(len + 1);
        if (part == NULL)
            goto fail;
        memcpy(part, cur, len);
        part[len] = '\0';
        res[count++] = part;

        cur += len;
    }

    if (res == NULL) {                      /* no token at all */
        res = malloc(sizeof *res);
        if (res == NULL)
            return NULL;
    }
    res[count] = NULL;
    return res;

fail:
    while (count > 0)
        free(res[--count]);
    free(res);
    return NULL;
}
Explode & implode - initial string remains intact, dynamic memory allocation
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One token of an exploded string: the address of its first character inside
 * the source string (stored as an integer) and its length in characters.
 * Tokens are NOT NUL-terminated; print them with "%.*s".
 */
typedef struct
{
    uintptr_t ptr;
    int size;
} token_t;

/*
 * Describe the pieces of str that are separated by the first character of
 * delimiter, without modifying or copying str.
 *
 * Parameters:
 *   str       - string to scan
 *   slen      - number of characters of str to scan
 *   delimiter - only delimiter[0] is used
 *   tokens    - out: calloc'ed array of token_t pointing into str; release it
 *               with free(). Set to NULL when the function fails.
 *
 * Returns the number of tokens (delimiters found + 1), or -1 when no
 * delimiter occurs in the first slen characters or the allocation fails.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i, c1 = 0, c2 = 0;

    *tokens = NULL;

    /* Only the slen characters of the string are scanned; index slen is the
       terminator (or lies outside the range) and never counts as delimiter. */
    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            c1++;
        }
    }

    if(c1 == 0)
    {
        return -1;
    }

    *tokens = calloc((size_t)c1 + 1, sizeof **tokens);
    if(*tokens == NULL)
    {
        return -1;
    }

    (*tokens)[c2].ptr = (uintptr_t)str;
    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            /* close the current token and open the next one */
            (*tokens)[c2].size = (int)(&str[i] - (char*)(*tokens)[c2].ptr);
            c2++;
            (*tokens)[c2].ptr = (uintptr_t)&str[i + 1];
        }
    }
    /* the last token runs up to the end of the scanned range */
    (*tokens)[c2].size = (int)(&str[slen] - (char*)(*tokens)[c2].ptr);

    return (c1 + 1);
}

/*
 * Join size tokens into one newly allocated NUL-terminated string, putting
 * the first character of delimiter between neighbouring tokens.
 *
 * Returns the string (the caller frees it), an empty string when size <= 0,
 * or NULL if the allocation fails.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    int i;
    size_t len = 0, pos = 0;
    char *str;

    if(size <= 0)
    {
        return calloc(1, sizeof(char));
    }

    /* every token is followed by one delimiter; the last of these slots
       becomes the terminator */
    for(i = 0; i < size; i++)
    {
        len += (size_t)tokens[i].size + 1;
    }

    str = calloc(len, sizeof(char));
    if(str == NULL)
    {
        return NULL;
    }

    for(i = 0; i < size; i++)
    {
        memcpy(&str[pos], (const void*)tokens[i].ptr, (size_t)tokens[i].size);
        pos += (size_t)tokens[i].size;
        str[pos++] = *delimiter;
    }

    str[pos - 1] = '\0';

    return str;
}
Usage:
/* Demo: explode a month list, implode it again and print every token. */
int main(int argc, char **argv)
{
    int i, c;
    char *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* start as NULL so the free() calls below are safe when explode() fails */
    token_t *tokens = NULL;
    char *imp = NULL;

    (void)argc;
    (void)argv;

    printf("%s\n", exp);

    if((c = explode(exp, (int)strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }

        for(i = 0; i < c; i++)
        {
            /* tokens are not NUL-terminated, hence the explicit precision */
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }

    free((void*)tokens);
    free((void*)imp);

    return 0;
}
If you are willing to use an external library, I can't recommend bstrlib enough. It takes a little extra setup, but is easier to use in the long run.
For example, split the string below, one first creates a bstring with the bfromcstr() call. (A bstring is a wrapper around a char buffer).
Next, split the string on commas, saving the result in a struct bstrList, which has fields qty and an array entry, which is an array of bstrings.
bstrlib has many other functions to operate on bstrings
Easy as pie...
#include "bstrlib.h"
#include <stdio.h>
/* Demo: split a C string on ',' with bstrlib and print every part. */
int main() {
    int i;
    char *tmp = "Hello,World,sak";
    bstring bstr = bfromcstr(tmp);
    struct bstrList *blist = (bstr != NULL) ? bsplit(bstr, ',') : NULL;

    if (blist == NULL) {
        bdestroy(bstr);
        return 1;
    }

    printf("num %d\n", blist->qty);
    for(i=0;i<blist->qty;i++) {
        /* bstr2cstr() returns a freshly allocated C string that must be
           handed back to bcstrfree() */
        char *cstr = bstr2cstr(blist->entry[i], '_');
        printf("%d: %s\n", i, cstr ? cstr : "(null)");
        bcstrfree(cstr);
    }

    bstrListDestroy(blist);
    bdestroy(bstr);
    return 0;
}
Late to the party I know, but here's 2 more functions to play with and probably further adjust to your needs (source code at the bottom of the post)
See also the Implementation Notes, further below, to decide which function suits your needs better.
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h> // C99
// tokenize destructively
char **str_toksarray_alloc(
char **strp, /* InOut: pointer to the source non-constant c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: # of tokens to parse/parsed (NULL or *ntoks==0 for all tokens) */
bool keepnulls /* false ignores empty tokens, true includes them */
);
// tokenize non-destructively
char **str_toksarray_alloc2(
const char *str, /* the source c-string */
const char *delim,
size_t *ntoks,
bool keepnulls
);
Usage Notes
Their prototypes are almost identical, except for the source-string (strp and str, respectively).
strp (pointer to string) is the address of an already allocated, non-constant c-string, to be tokenized in-place. str is a c-string which is not altered (it can even be a string-literal). By c-string I mean a nul-terminated buffer of chars. The rest of the arguments are the same for both functions.
To parse all available tokens, mute ntoks (meaning set it to 0 before passing it to any of the functions or pass it as a NULL pointer). Else the functions parse up to *ntoks tokens, or until there are no more tokens (whichever comes first). In any case, when ntoks is non-NULL it gets updated with the count of successfully parsed tokens.
Note also that a non-muted ntoks determines how many pointers will be allocated. Thus if the source string contains say 10 tokens and we set ntoks to 1000, we'll end up with 990 needlessly allocated pointers. On the other hand, if the source-string contains say 1000 tokens but we only need the first 10, setting ntoks to 10 sounds like a much wiser choice.
Both functions allocate and return an array of char-pointers, but str_toksarray_alloc() makes them point to the tokens in the modified source-string itself, while str_toksarray_alloc2() makes them point to dynamically allocated copies of the tokens (that 2 at the end of its name indicates the 2-levels of allocation).
The returned array is appended with a NULL sentinel pointer, which is not taken into account in the passed-back value of ntoks (put otherwise, when non-NULL, ntoks passes-back to the caller the length of the returned array, not its 1st level size).
When keepnulls is set to true, the resulting tokens are similar to what we'd expect from the strsep() function. Mostly meaning that consecutive delimiters in the source-string produce empty tokens (nulls), and if delim is an empty c-string or none of its contained delimiter-chars were found in the source string, the result is just 1 token: the source string. Contrary to strsep(), empty tokens can be ignored by setting keepnulls to false.
Failed calls of the functions can be identified by checking their return value against NULL, or by checking the passed-back value of ntoks against 0 (provided ntoks was non-NULL). I suggest always checking against failure before attempting to access the returned array, because the functions include sanity checks which can postpone otherwise immediate crashes (for example, passing a NULL pointer as the source string).
On success, the caller should free the array when they're done with it.
For str_toksarray_alloc(), a simple free() is enough. For str_toksarray_alloc2() a loop is involved, due to the 2nd level of allocation. The NULL sentinel (or the passed-back value of a non-NULL ntoks) makes this trivial, but I'm also providing a toksarray_free2() function below, for all the lazy bees out there :)
Simplified examples using both functions follow.
Prep:
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
const char *delim = ";,";
bool keepnulls = true;
size_t ntoks = 0;
str_toksarray_alloc():
// destructive (use copy of src)
char *scopy = strdup( src );
if (!scopy) { ... }; // handle strdup failure
printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks ); // ntoks is a size_t: %zu is its conversion
if ( arrtoks ) {
for (int i=0; arrtoks[i]; i++) {
printf( "%d: %s\n", i, arrtoks[i] );
}
}
free( scopy );
free( arrtoks );
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
*/
str_toksarray_alloc2():
// non-destructive
keepnulls = false; // reject empty tokens
printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
printf( "%zu tokens read\n", ntoks ); // ntoks is a size_t: %zu is its conversion
if ( arrtoks ) {
for (int i=0; arrtoks[i]; i++) {
printf( "%d: %s\n", i, arrtoks[i] );
}
}
toksarray_free2( arrtoks ); // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks ); // non-dangling artoks
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/
Implementation Notes
Both functions use strsep() for the tokenization which makes them thread-safe, but it's not a standard function. If not provided, you can always use an open-source implementation (like GNU's or Apple's for example). Same goes for the function strdup() which is used in str_toksarray_alloc2() (its implementation is trivial but again here's GNU's and Apple's for example).
A side-effect of using strsep() in str_toksarray_alloc() is that the starting pointer of the source-string keeps moving to the next token in every step of the parsing loop. This means that the caller won't be able to free the parsed string, unless they had saved the starting address to an extra pointer. We save them the hassle, by doing that locally in the function, using the strpSaved pointer. str_toksarray_alloc2() is not affected by this, because it doesn't touch the source-string.
A main difference between the 2 functions is that str_toksarray_alloc() does not allocate memory for the found tokens. It rather allocates space just for the array pointers and sets them pointing directly into the source-string. This works because strsep() nul-terminates the found tokens in-place. This dependency can complicate your supporting code, but with big strings it can also make a big difference in performance. If preserving the source-string is not important, it can make a big difference in memory footprint too.
On the other hand, str_toksarray_alloc2() allocates and returns a self sustained array of dynamically allocated copies of the tokens, without further dependencies. It does so firstly by creating the array from a local duplicate of the source-string, and secondly by duplicating the actual tokens contents into the array. This is a lot slower and leaves a much bigger memory footprint compared to str_toksarray_alloc(), but it has no further dependencies, and sets no special requirements for the nature of the source-string. This makes it easier to write simpler (hence better maintainable) supporting code.
Another difference between the 2 functions is the 1st level of allocation (the array pointers) when ntoks is muted. They both parse all available tokens, but they take quite different approaches. str_toksarray_alloc() uses alloc-ahead with an initial size of 16 (char-pointers), doubling it on demand in the parsing loop. str_toksarray_alloc2() makes a 1st pass counting all available tokens, then it allocates that many char-pointers just once. That 1st pass is done with a helper function str_toksfound() which uses the standard functions strpbrk() and strchr(). I'm providing the source-code of that function too, further below.
Which approach is better is really up to you to decide, depending on the needs of your project. Feel free to adjust the code of each function to either approach and take it from there.
I'd say that on average and for really big strings alloc-ahead is much faster, especially when the initial size and grow factor are fine tuned on a per-case basis (making them function parameters for example). Saving that extra pass with all those strchr()'s and strpbrk()'s can make a difference there. However, with relatively small strings which is pretty much the norm, allocing-ahead just a bunch of char-pointers is just an overkill. It doesn't hurt but it does clutter the code for no good reason in this case. Anyway, feel free to choose whichever suits you best.
Same goes for these 2 functions. I'd say in most cases str_toksarray_alloc2() is much simpler to cope with, since memory and performance are rarely an issue with small to medium strings. If you have to deal with huge strings, then consider using str_toksarray_alloc() (though in those cases you should roll a specialized string parsing function, close to the needs of your project and the specs of your input).
Oh boy, I think that was a bit more than just 2 cents (lol).
Anyway, here is the code of the 2 functions and the helper ones (I've removed most of their description comments, since I've covered pretty much everything already).
Source Code
str_toksarray_alloc():
// ----------------------------------------
// Destructive tokenizer for a nul-terminated, writable source-string.
//
// strp     : address of the pointer to the source-string; strsep() cuts the
//            tokens in place, and the pointer is put back to its initial
//            value before the function returns
// delim    : nul-terminated set of delimiter characters
// ntoks    : NULL or *ntoks == 0 -> parse all tokens; otherwise parse at most
//            *ntoks tokens. When non-NULL it receives the number of tokens
//            stored (0 on failure)
// keepnulls: true keeps empty tokens, false drops them
//
// Returns a malloc'ed array of pointers into the source-string, closed by a
// NULL sentinel, or NULL on error (bad arguments, empty source-string, or
// allocation failure).
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    if ( !strp || !*strp || !**strp || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    char *const origin = *strp;                       // strsep() moves *strp
    const bool limited = (ntoks != NULL && *ntoks != 0);
    size_t capacity = limited ? *ntoks : 16;          // slots, sentinel excluded

    char **arr = malloc( (capacity + 1) * sizeof(*arr) );
    if ( arr == NULL ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    size_t used = 0;
    for (char *piece = strsep(strp, delim); piece != NULL; piece = strsep(strp, delim)) {
        if ( !keepnulls && *piece == '\0' ) {
            continue;                                 // drop empty token
        }
        if ( used == capacity ) {
            if ( limited ) {
                break;                                // requested amount reached
            }
            // unlimited mode: double the array and carry on
            capacity *= 2;
            char **grown = realloc( arr, (capacity + 1) * sizeof(*grown) );
            if ( grown == NULL ) {
                *strp = origin;
                free( arr );
                if (ntoks) *ntoks = 0;
                return NULL;
            }
            arr = grown;
        }
        arr[used++] = piece;
    }

    arr[used] = NULL;               // NULL sentinel
    *strp = origin;                 // hand the initial pointer back
    if (ntoks) *ntoks = used;
    return arr;
}
str_toksarray_alloc2():
// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error (bad arguments, empty source-string,
// no token to return, or allocation failure).
// When ntoks is non-NULL it receives the number of tokens stored (0 on
// error); a non-zero *ntoks on entry limits how many tokens are parsed.
// The 2 at the end of the name means 2-levels of allocation.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    char **toksarr = NULL;
    char *copy = NULL;      // local duplicate of str; this is what gets freed
    char *cursor = NULL;    // moving parse position handed to strsep()
    char *tok = NULL;
    size_t limit = 0;
    size_t i = 0;           // # of actually parsed tokens

    // sanity checks
    if ( !str || !*str || !delim ) {
        goto failed;
    }

    // make a copy of str to work with
    copy = strdup( str );
    if ( !copy ) {
        goto failed;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    limit = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( limit == 0 ) {     // str_tokscount() found nothing to return
        goto failed;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    toksarr = malloc( (limit+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto failed;
    }

    // Parse the copy and duplicate its tokens into the array.
    // strsep() advances (and finally NULLs) the pointer it is given, so it
    // works on cursor; copy keeps the address that free() needs.
    cursor = copy;
    while ( i < limit && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        toksarr[i] = strdup( tok );
        if ( !toksarr[i] ) {
            goto failed;
        }
        i++;
    }
    toksarr[i] = NULL;      // NULL sentinel

    free( copy );           // release the local copy of the source-string
    if (ntoks) *ntoks = i;  // pass to caller the # of parsed tokens
    return toksarr;

    // cleanup before failing
failed:
    if ( toksarr ) {
        while ( i > 0 ) {
            free( toksarr[--i] );
        }
        free( toksarr );
    }
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}
str_tokscount() - helper function, used by str_toksarr_alloc2():
// ----------------------------------------
// Count the tokens of a nul-terminated source-string (str) that are separated
// by any of the characters of a 2nd nul-terminated string (delim).
//
// keepnulls == true  : empty tokens are counted too
// keepnulls == false : empty tokens are left out of the count
//
// A source-string without any delimiter (or an empty delim) counts as one
// token, in line with strsep(). The result is 0 when str or delim is NULL or
// when str is an empty string.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    if ( str == NULL || *str == '\0' || delim == NULL ) {
        return 0;
    }

    size_t total = 1;       // the source-string itself is at least one token
    // a leading delimiter means the very first token is empty
    size_t empties = (strchr(delim, *str) != NULL) ? 1 : 0;

    const char *cursor = str;
    const char *hit;
    while ( (hit = strpbrk(cursor, delim)) != NULL ) {
        cursor = hit + 1;
        // the token starting here is empty when it begins with a delimiter
        // or with the terminator (strchr() also matches the '\0' of delim)
        if ( strchr(delim, *cursor) != NULL ) {
            empties++;
        }
        total++;
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}
toksarray_free2() - use it on the array returned by str_toksarr_alloc2():
// ----------------------------------------
// Release a NULL terminated array of char-pointers together with the
// dynamically allocated string behind each pointer (2 levels, hence the 2 in
// the name). The array must end with a NULL sentinel:
//   toksarr[0] = tok1, ..., toksarr[len] = NULL
//
// A NULL toksarr is accepted and ignored. The function always returns NULL,
// so the result can be assigned back to the now dangling pointer.
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr == NULL ) {
        return NULL;
    }

    for ( size_t k = 0; toksarr[k] != NULL; k++ ) {   // stop at the sentinel
        free( toksarr[k] );
    }
    free( toksarr );

    return NULL;
}
Both strtok() and strsep() modify the input string. We can write a function to split the string based on delimiters using strspn() and strpbrk().
Algorithm:
If the input string is not empty, go to step 2 else return null.
Skip separator, if any at the start of string, and record start position of word (using strspn() for this), call it start.
Find next separator position (or end of string if no more separator exists) from the current start found in previous step (using strpbrk() for this), call it end.
Allocate memory and copy string from start to end in that memory.
Return token.
Advantage:
Thread safe.
Handles multiple delimiters.
Portable.
Doesn't modify the input string, unlike strtok() and strsep().
Implementation:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
* alloc_str function allocates memory and copy substring
* to allocated memory.
*/
static char * alloc_str (const char * start, const char * end) {
    /* An empty or inverted range yields no token. */
    if (start == NULL || end == NULL || end <= start) {
        return NULL;
    }

    size_t length = (size_t)(end - start);
    char * copy = malloc (length + 1);   /* +1 for the terminating nul */
    if (copy == NULL) {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    memcpy (copy, start, length);
    copy[length] = '\0';
    return copy;
}
/*
* str_split function returns the next token which is sequences of contiguous
* characters separated by any of the characters that are part of delimiters.
*
* Parameters:
* p_str : Address of pointer to the string that you want to split.
* sep : A set of characters that delimit the pieces in the string.
*
* Behaviour is undefined if sep is not a pointer to a null-terminated string.
*
* Return :
* Returns the pointer to dynamically allocated memory where the token is copied.
* If p_str is NULL or empty string, NULL is returned.
*/
char * str_split (char ** p_str, const char * sep) {
    /* Guard the handle itself as well as the string it points to: the
     * documented contract is "NULL in, NULL out", so a NULL p_str (or a
     * NULL separator set) must not be dereferenced. */
    if (p_str == NULL || *p_str == NULL || sep == NULL) {
        return NULL;
    }
    /* An exhausted (empty) string yields no further tokens and is left as is. */
    if (**p_str == '\0') {
        return NULL;
    }

    /* Skip any leading separators to find the start of the token. */
    char * start = *p_str + strspn (*p_str, sep);

    /* The token ends at the next separator, or at the end of the string
     * when strpbrk() finds none. */
    char * end = strpbrk (start, sep);
    if (end == NULL) {
        end = start + strlen (start);
    }

    /* alloc_str() returns NULL for an empty range, i.e. when only
     * separators were left in the string. */
    char * token = alloc_str (start, end);

    /* Advance the caller's cursor so the next call resumes after this token. */
    *p_str = end;
    return token;
}
/*==================================================*/
/*==================================================*/
/*
* Just a helper function
*/
void token_helper (char * in_str, const char * delim) {
    /* Echo the inputs so each test case is self-describing. */
    printf ("\nInput string : ");
    if (in_str == NULL) {
        printf ("NULL\n");
    } else {
        printf ("\"%s\"\n", in_str);
    }
    if (delim != NULL) {
        printf ("Delimiter : \"%s\"\n", delim);
    }
    printf ("Tokens:\n");

    /* str_split() advances `cursor` on every call and returns NULL once
     * no token is left. */
    char * cursor = in_str;
    for (;;) {
        char * piece = str_split (&cursor, delim);
        if (piece == NULL) {
            break;
        }
        printf ("-> %s\n", piece);
        /* A real caller could store the token in an array of pointers;
         * this demo only prints it, so release it right away. */
        free (piece);
    }
}
/*
* Driver function
*/
int main (void) {
    /* Test cases, run in order. A NULL text is passed through as NULL;
     * every other text is first copied into a writable buffer. */
    static const struct {
        const char * text;
        const char * delim;
    } cases[] = {
        { "hello world!", " " },
        { " hello world,friend of mine!", " ," },
        { "Another string", "-!" },
        { " one more -- string !", "- !" },
        { "", " " },
        { NULL, "" },
        { "hi", " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp" },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC", "," },
    };
    char string[100];

    for (size_t k = 0; k < sizeof cases / sizeof cases[0]; k++) {
        if (cases[k].text == NULL) {
            token_helper (NULL, cases[k].delim);
            continue;
        }
        strcpy (string, cases[k].text);
        token_helper (string, cases[k].delim);
    }
    return 0;
}
Output:
# ./a.out
Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!
Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!
Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string
Input string : " one more -- string !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string
Input string : ""
Delimiter : " "
Tokens:
Input string : NULL
Delimiter : ""
Tokens:
Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi
Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give
-> a
-> a a
-> u
-> of
-> ro
-> er
-> offee in a
-> o
-> er
-> offee
-> u
-> .
Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC
My approach is to scan the string and let the result pointers point to the first character and to every character that follows a delimiter, while overwriting each occurrence of the delimiter in the string with '\0'.
First make a copy of the original string (since it's constant), then count the number of splits by scanning it and pass that count back through the pointer parameter len. After that, point the first result pointer at the start of the copy, then scan the copy: whenever a delimiter is encountered, overwrite it with '\0' so that the previous result string is terminated, and point the next result pointer at the following character.
/*
 * Split a_str at every occurrence of a_delim.
 *
 * The source string is not modified: the pieces live inside one private copy
 * of it, in which every delimiter has been overwritten with '\0'.
 *
 * Parameters:
 *   a_str   : nul-terminated string to split.
 *   a_delim : delimiter character.
 *   len     : receives the number of delimiters found (0 on failure).
 *             The returned array holds *len + 1 strings.
 *
 * Returns the array of strings, or NULL if memory could not be allocated.
 * Ownership: results[0] points at the start of the private copy, so the
 * caller releases everything with free(results[0]); free(results);
 */
char** split(char* a_str, const char a_delim, int* len){
    /* +1 for the terminating nul, which strcpy() also writes. */
    char* copy = malloc(strlen(a_str) + 1);
    if (copy == NULL) {
        if (len != NULL) *len = 0;
        return NULL;
    }
    strcpy(copy, a_str);

    int count = 0;
    for (const char* p = copy; *p != '\0'; p++) {
        if (*p == a_delim) count += 1;
    }

    /* N delimiters separate N + 1 pieces, so N + 1 slots are needed. */
    char** results = malloc(((size_t)count + 1) * sizeof *results);
    if (results == NULL) {
        free(copy);
        if (len != NULL) *len = 0;
        return NULL;
    }

    int i = 0;
    results[i++] = copy;
    for (char* s = copy; *s != '\0'; s++) {
        if (*s == a_delim) {
            *s = '\0';             /* terminate the previous piece */
            results[i++] = s + 1;  /* next piece starts right after it */
        }
    }

    if (len != NULL) *len = count;
    return results;
}
My code (tested):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split str in place with strtok() and collect pointers to the tokens.
 *
 * Parameters:
 *   str    : writable string; strtok() overwrites its delimiters with '\0'.
 *   delim  : set of delimiter characters.
 *   array  : receives a malloc'ed array of pointers into str
 *            (NULL when no token was found or on failure).
 *   length : receives the number of tokens (0 on failure).
 *
 * Returns 1 on success, 0 if memory could not be allocated.
 * The caller frees *array only; the tokens themselves live inside str.
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
    char **res = NULL;
    int i = 0;

    for (char *token = strtok(str, delim); token != NULL; token = strtok(NULL, delim)) {
        /* Grow through a temporary so the old block is not lost on failure. */
        char **grown = realloc(res, ((size_t)i + 1) * sizeof *grown);
        if (grown == NULL) {
            free(res);
            *array = NULL;
            *length = 0;
            return 0;
        }
        res = grown;
        res[i++] = token;
    }

    *array = res;
    *length = i;
    return 1;
}
int main()
{
    char **arr = NULL;
    int count = 0;
    /* dtmsplit() tokenizes in place, so the input must be a writable array. */
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    /* Use the status that was previously stored and then ignored. */
    if (!dtmsplit(str, ",", &arr, &count)) {
        fprintf(stderr, "dtmsplit failed\n");
        return 1;
    }

    printf("Found %d tokens.\n", count);
    for (int i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* Only the pointer array is heap-allocated; the tokens live in str. */
    free(arr);
    return(0);
}
Result:
Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC
Two issues surrounding this question are memory management and thread safety. As you can see from the numerous posts,
this isn't an easy task to accomplish seamlessly in C. I desired a solution that is:
Thread safe. (strtok is not thread safe)
Does not employ malloc or any of its derivatives (to avoid memory management issues)
Checks array bounds on the individual fields (to avoid segment faults on unknown data)
Works with multi-byte field separators (utf-8)
ignores extra fields in the input
provides soft error routine for invalid field lengths
The solution I came up meets all of these criteria. It's probably a little more work to setup
than some other solutions posted here, but I think that in practice, the extra work is worth
it in order to avoid the common pitfalls of other solutions.
#include <stdio.h>
#include <string.h>
/* Describes one caller-owned output buffer for strsplit(). */
struct splitFieldType {
    char *field;    /* destination buffer */
    int maxLength;  /* total size of the buffer in bytes, including the nul */
};
typedef struct splitFieldType splitField;

/*
 * Copy len bytes of src into one field, truncating to the field's capacity
 * (maxLength - 1 bytes plus the terminating nul). When truncation happens,
 * softError (if not NULL) is told the field index, the capacity and the
 * actual length. A field whose maxLength is not positive is left untouched.
 */
static void storeField(splitField *dst, int fieldNumber, const char *src, size_t len,
                       void (*softError)(int fieldNumber,int expected,int actual)) {
    if (dst->maxLength <= 0) {
        if (softError) softError(fieldNumber, 0, (int)len);
        return;
    }
    size_t room = (size_t)dst->maxLength - 1;
    if (len > room) {
        if (softError) softError(fieldNumber, (int)room, (int)len);
        len = room;
    }
    memcpy(dst->field, src, len);
    dst->field[len] = 0;
}

/*
 * Split input on fieldSeparator (a string, possibly several bytes long) into
 * at most `expected` caller-provided buffers. No memory is allocated and the
 * input is not modified.
 *
 * Parameters:
 *   fields         : array of at least `expected` buffer descriptors.
 *   expected       : number of fields to fill; extra input fields are ignored.
 *   input          : nul-terminated string to split.
 *   fieldSeparator : nul-terminated separator string.
 *   softError      : called for every field that had to be truncated;
 *                    may be NULL to ignore truncation.
 *
 * Returns the number of fields written.
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual)) {
    size_t fieldSeparatorLen = strlen(fieldSeparator);
    const char *tNext;
    const char *tLast = input;
    int i;

    /* Every field that is followed by a separator. */
    for (i = 0; i < expected && (tNext = strstr(tLast, fieldSeparator)) != NULL; ++i) {
        storeField(&fields[i], i, tLast, (size_t)(tNext - tLast), softError);
        tLast = tNext + fieldSeparatorLen;
    }

    /* The remainder after the last separator is one more field. It goes
     * through the same bounded copy as the others, so a remainder of
     * exactly maxLength bytes can no longer overflow the buffer. */
    if (i < expected) {
        storeField(&fields[i], i, tLast, strlen(tLast), softError);
        return i + 1;
    }
    return i;
}
/* Soft-error callback for the month example: reports an over-long field on
 * stderr, using a 1-based field number. */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    int ordinal = fieldNumber + 1;
    fprintf(stderr,
            "monthSplit: input field #%d is %d bytes, expected %d bytes\n",
            ordinal, actual, expected);
}
int main() {
    const char *fieldSeparator=",";
    /* The 4th field is deliberately spelled out as "APRIL" (5 bytes) so the
     * soft-error callback fires; "FOO" and "BAR" are extra fields that are
     * ignored because only 12 are expected. */
    const char *input="JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";
    /* Fixed-size storage for the parsed fields: 3 characters plus the nul. */
    struct monthFieldsType {
        char field1[4];
        char field2[4];
        char field3[4];
        char field4[4];
        char field5[4];
        char field6[4];
        char field7[4];
        char field8[4];
        char field9[4];
        char field10[4];
        char field11[4];
        char field12[4];
    } monthFields;
    /* One descriptor per buffer: where to write and how large it is. */
    splitField inputFields[12] = {
        {monthFields.field1, sizeof(monthFields.field1)},
        {monthFields.field2, sizeof(monthFields.field2)},
        {monthFields.field3, sizeof(monthFields.field3)},
        {monthFields.field4, sizeof(monthFields.field4)},
        {monthFields.field5, sizeof(monthFields.field5)},
        {monthFields.field6, sizeof(monthFields.field6)},
        {monthFields.field7, sizeof(monthFields.field7)},
        {monthFields.field8, sizeof(monthFields.field8)},
        {monthFields.field9, sizeof(monthFields.field9)},
        {monthFields.field10, sizeof(monthFields.field10)},
        {monthFields.field11, sizeof(monthFields.field11)},
        {monthFields.field12, sizeof(monthFields.field12)}
    };
    int expected=sizeof(inputFields)/sizeof(splitField);
    printf("input data: %s\n", input);
    printf("expecting %d fields\n",expected);
    int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);
    if (ct!=expected) {
        printf("string split %d fields, expected %d\n", ct,expected);
    }
    for (int i=0;i<expected;++i) {
        printf("field %d: %s\n",i+1,inputFields[i].field);
    }
    printf("\n");
    /* The descriptors point into monthFields, so the parsed values can also
     * be read directly from the structure. */
    printf("Direct structure access, field 10: %s", monthFields.field10);
}
Below is an example compile and output. Note that in my example, I purposefully spelled out "APRIL" so that you can see how the soft error works.
$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC
Direct structure access, field 10: OCT
Enjoy!
Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.
The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.
As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".
The function below is commented in-line and a short main() was added that splits the months. The initial number of pointers allocated is set to 2 to force three reallocations while tokenizing the input string:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NPTRS 2 /* initial number of pointers to allocate (must be > 0) */
/* Tokenize src on any character in delim; runs of delimiters count as one.
 * Returns an allocated array of allocated strings that ends with a NULL
 * sentinel, or NULL if the initial block of pointers cannot be allocated.
 * The pointer block is doubled whenever it fills up. If a later allocation
 * fails, the tokens collected so far are returned, still NULL-terminated.
 */
char **strsplit (const char *src, const char *delim)
{
    size_t used = 0;        /* tokens stored so far */
    size_t cap = NPTRS;     /* pointers currently allocated */
    char **dest = malloc (cap * sizeof *dest);

    if (dest == NULL) {
        perror ("malloc-dest");
        return NULL;
    }
    dest[0] = NULL;         /* an empty result is just the sentinel */

    const char *cur = src;
    for (;;) {
        cur += strspn (cur, delim);         /* skip a run of delimiters */
        if (*cur == '\0')                   /* nothing left to tokenize */
            break;

        size_t len = strcspn (cur, delim);  /* length of the next token */

        /* Always keep one spare slot for the sentinel. */
        if (used == cap - 1) {
            void *tmp = realloc (dest, 2 * cap * sizeof *dest);
            if (tmp == NULL) {
                perror ("realloc-dest");
                break;                      /* original dest still valid */
            }
            dest = tmp;
            cap *= 2;
        }

        dest[used] = malloc (len + 1);
        if (dest[used] == NULL) {           /* slot is NULL: acts as sentinel */
            perror ("malloc-dest[i]");
            break;
        }
        memcpy (dest[used], cur, len);
        dest[used][len] = 0;
        used++;
        dest[used] = NULL;                  /* move the sentinel along */

        cur += len;
    }

    return dest;
}
int main (void) {
    char *str = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (str, ",");    /* split string into tokens */

    if (tokens == NULL)
        return 0;

    /* Print each token, then release it; stop at the NULL sentinel. */
    for (size_t k = 0; tokens[k] != NULL; k++) {
        puts (tokens[k]);
        free (tokens[k]);
    }
    free (tokens);                          /* finally the pointer block */
}
Example Use/Output
$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC
Let me know if you have any further questions.
#include <cstring>
#include <cstdio>
int main()
{
    // strtok() writes into the buffer, so it must be a writable array.
    char buf[] = "This is Luke Skywalker here!";

    char* tok = strtok( buf, " ");
    while( tok != nullptr) {
        puts( tok);
        // A null first argument continues with the same buffer.
        tok = strtok( nullptr, " ");
    }
}
Outputs
This
is
Luke
Skywalker
here!
Came across this looking for a simple solution.
I am fascinated by all of the options but dissatisfied for my own use case/taste (which may be terrible).
I have created a somewhat unique solution that aims to clearly behave for its user, not re-allocate any memory, and be human readable + with comments.
Uploaded to gist.github here: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093
Example:
#include "./strutils.c"
// Describe the split: the caller supplies the source and delimiter strings.
struct str_split_info info;
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";
// Perform the split; this fills info.splitStrings and info.splitStringsCount.
str_split_begin(&info);
char * substr;
// Walk the resulting substrings.
for (int i=0; i<info.splitStringsCount; i++) {
substr = info.splitStrings[i];
printf("substring: '%s'\n", substr);
}
// Release everything str_split_begin allocated.
str_split_end(&info);
Output:
$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'
Full source of strutils.c
#ifndef STRUTILS_C
#define STRUTILS_C 1
#ifndef str
#define str char *
#endif
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
/* Inputs and results of one split operation. The caller fills in `source`
 * and `delimiter`; str_split_begin fills in the remaining members.
 */
struct str_split_info {
/* The string to be split.
 * Provided by the caller of str_split_begin.
 */
str source;
/* The string that cuts the source string; the occurrences of this string
 * are left out of the resulting substrings.
 * Provided by the caller of str_split_begin.
 */
str delimiter;
/* Array of the substrings obtained by splitting on the delimiter.
 * Allocated by str_split_begin; must be released with str_split_end.
 */
str * splitStrings;
/* Array holding the length of each substring in splitStrings.
 * Allocated by str_split_begin; must be released with str_split_end.
 */
int * splitStringsLengths;
/* Number of substrings contained in splitStrings.
 * Set by str_split_begin.
 */
int splitStringsCount;
};
#define str_split_infop struct str_split_info *
/* Split a string by a delimiting string
*
* The caller is responsible only for calling str_split_end
* when finished with the results in 'info'
*/
void str_split_begin (str_split_infop info) {
  /* Splits info->source on every occurrence of info->delimiter.
   *
   * On return info->splitStrings holds info->splitStringsCount separately
   * allocated, nul-terminated substrings and info->splitStringsLengths holds
   * their lengths. An empty delimiter never matches, so the whole source is
   * returned as a single substring. If memory cannot be allocated, the count
   * is set to 0 and both arrays to NULL, which makes str_split_end a no-op.
   */
  const char * source = info->source;
  const char * delimiter = info->delimiter;
  size_t delimiterLength = strlen(delimiter);

  info->splitStringsCount = 0;
  info->splitStrings = NULL;
  info->splitStringsLengths = NULL;

  //first pass, simply count occurrences so we can allocate only once;
  //strstr finds each real match, including ones that begin inside a
  //partial match that a char-by-char comparison would skip over
  int count = 1;
  if (delimiterLength > 0) {
    const char * hit = strstr(source, delimiter);
    while (hit != NULL) {
      count++;
      hit = strstr(hit + delimiterLength, delimiter);
    }
  }

  //allocate both arrays now that the count is known
  char ** strings = malloc(sizeof *strings * (size_t) count);
  int * lengths = malloc(sizeof *lengths * (size_t) count);
  if (strings == NULL || lengths == NULL) {
    free(strings);
    free(lengths);
    return;
  }

  //second pass, copy out every substring, the last one included
  const char * begin = source;
  for (int index = 0; index < count; index++) {
    bool isLast = (index + 1 == count);
    //the last substring runs to the end of the source, every other one
    //ends where the next delimiter starts
    const char * end = isLast ? begin + strlen(begin) : strstr(begin, delimiter);
    size_t length = (size_t) (end - begin);

    //+1 for trailing null for c-string
    char * copy = malloc(length + 1);
    if (copy == NULL) {
      //undo the partial result so nothing leaks
      while (index > 0) {
        free(strings[--index]);
      }
      free(strings);
      free(lengths);
      return;
    }
    memcpy(copy, begin, length);
    copy[length] = 0x00;

    strings[index] = copy;
    lengths[index] = (int) length;

    if (!isLast) {
      //next substring starts right after the current delimiter
      begin = end + delimiterLength;
    }
  }

  info->splitStrings = strings;
  info->splitStringsLengths = lengths;
  info->splitStringsCount = count;
}
/* Returns the number of substrings recorded in info. */
int str_split_count (str_split_infop info) {
  int count = info->splitStringsCount;
  return count;
}
/* Copies every substring of info into the matching buffer of out.
 * No bounds are checked here: each out[i] must be large enough for
 * info->splitStrings[i] including its trailing nul.
 */
void str_split_get (str_split_infop info, str * out) {
  int index = 0;
  while (index < info->splitStringsCount) {
    strcpy(out[index], info->splitStrings[index]);
    index++;
  }
}
/* Releases what a split stored in info: every substring, the substring
 * array and the lengths array. Does nothing when the count is not positive
 * or the substring array is NULL. Resets the count to 0 afterwards.
 */
void str_split_end (str_split_infop info) {
  if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
    return;
  }

  //free each string allocated
  int index = 0;
  while (index < info->splitStringsCount) {
    free(info->splitStrings[index]);
    index++;
  }

  //then the two arrays themselves
  free(info->splitStrings);
  free(info->splitStringsLengths);

  info->splitStringsCount = 0;
}
/* Smoke test: split a sentence on single spaces, walk the results, clean up. */
void str_split_test () {
  struct str_split_info info;
  info.source = "hello world this is a test";
  info.delimiter = " ";

  str_split_begin (&info);

  //visit each split substring; the results are only valid until
  //str_split_end is called
  int index = 0;
  while (index < info.splitStringsCount) {
    // info.splitStrings[index];
    index++;
  }

  str_split_end(&info);
}
#endif
I tried to make a very simple one. I am also showing example in the main().
#include <stdio.h>
#include <string.h>
/* Tokenize inputArr in place with strtok() and store a pointer to each token
 * in consecutive slots of outputArr. No terminator is written after the last
 * token and no bound is checked, so outputArr must have room for every token.
 */
void split(char* inputArr, char** outputArr, char* delim) {
    char **slot = outputArr;
    char *piece = strtok(inputArr, delim);
    while (piece != NULL) {
        *slot++ = piece;
        piece = strtok(NULL, delim);
    }
}
int main(int argc, char **argv){
    /* check for proper arguments */
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {
        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            /* Without this check fgets() would be handed a NULL stream. */
            perror(argv[1]);
            return 1;
        }
        /* read txt file and split into array like java split() */
        char buffer[100];
        /* A 100-byte line holds at most 50 space-separated tokens. */
        char *splitArr[100];
        while(fgets(buffer, (int)sizeof buffer, myScriptFile) != NULL){
            /* Clear the slots so a short line cannot show stale or
             * uninitialized pointers from a previous iteration. */
            for (int k = 0; k < 100; k++) {
                splitArr[k] = NULL;
            }
            split(buffer, splitArr, " ");
            for (int k = 0; k < 4; k++) {
                printf("Index %d String: %s\n", k, splitArr[k] ? splitArr[k] : "(null)");
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}
Assume a .txt file has
Hello this is test
Hello2 this is test2
running it with a .txt file as a parameter would give
Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test
Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

Easiest way to parse colon delimited paths [duplicate]

How do I write a function to split and return an array for a string with delimiters in the C programming language?
char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
str_split(str,',');
You can use the strtok() function to split a string (and specify the delimiter to use). Note that strtok() will modify the string passed into it. If the original string is required elsewhere make a copy of it and pass the copy to strtok().
EDIT:
Example (note it does not handle consecutive delimiters, "JAN,,,FEB,MAR" for example):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
/*
 * Split a_str on a_delim and return a NULL-terminated array of newly
 * allocated copies of the tokens.
 *
 * a_str is tokenized with strtok(), so it is modified (delimiters are
 * overwritten with '\0') and must be writable. As with strtok(), leading,
 * trailing and consecutive delimiters produce no empty tokens.
 *
 * Returns NULL if a_str is NULL or memory cannot be allocated. The caller
 * frees every string and then the array itself.
 */
char** str_split(char* a_str, const char a_delim)
{
    if (a_str == NULL)
    {
        return NULL;
    }

    char delim[2];
    delim[0] = a_delim;
    delim[1] = 0;

    /* Upper bound on the slots needed: one token more than there are
       delimiters, plus the terminating NULL. strtok() may yield fewer
       tokens (it collapses delimiter runs) but never more. */
    size_t capacity = 2;
    for (const char* p = a_str; *p; p++)
    {
        if (a_delim == *p)
        {
            capacity++;
        }
    }

    char** result = malloc(sizeof(char*) * capacity);
    if (result == NULL)
    {
        return NULL;
    }

    size_t idx = 0;
    for (char* token = strtok(a_str, delim); token; token = strtok(NULL, delim))
    {
        size_t size = strlen(token) + 1;
        char* copy = malloc(size);
        if (copy == NULL)
        {
            /* Undo the partial result so nothing leaks. */
            while (idx > 0)
            {
                free(result[--idx]);
            }
            free(result);
            return NULL;
        }
        memcpy(copy, token, size);
        result[idx++] = copy;
    }

    /* Terminate after the tokens actually found. */
    result[idx] = NULL;
    return result;
}
int main()
{
    /* str_split() tokenizes in place, so use a writable array. */
    char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    printf("months=[%s]\n\n", months);

    char** tokens = str_split(months, ',');
    if (tokens != NULL)
    {
        /* Print and release each string up to the NULL terminator. */
        for (char** cursor = tokens; *cursor != NULL; cursor++)
        {
            printf("month=[%s]\n", *cursor);
            free(*cursor);
        }
        printf("\n");
        free(tokens);
    }
    return 0;
}
Output:
$ ./main.exe
months=[JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC]
month=[JAN]
month=[FEB]
month=[MAR]
month=[APR]
month=[MAY]
month=[JUN]
month=[JUL]
month=[AUG]
month=[SEP]
month=[OCT]
month=[NOV]
month=[DEC]
I think strsep is still the best tool for this:
while ((token = strsep(&str, ","))) my_fn(token);
That is literally one line that splits a string.
The extra parentheses are a stylistic element to indicate that we're intentionally testing the result of an assignment, not an equality operator ==.
For that pattern to work, token and str both have type char *. If you started with a string literal, then you'd want to make a copy of it first:
// More general pattern:
const char *my_str_literal = "JAN,FEB,MAR";
char *token, *str, *tofree;
tofree = str = strdup(my_str_literal); // We own str's memory now.
while ((token = strsep(&str, ","))) my_fn(token);
free(tofree);
If two delimiters appear together in str, you'll get a token value that's the empty string. The value of str is modified in that each delimiter encountered is overwritten with a zero byte - another good reason to copy the string being parsed first.
In a comment, someone suggested that strtok is better than strsep because strtok is more portable. Ubuntu and Mac OS X have strsep; it's safe to guess that other unixy systems do as well. Windows lacks strsep, but it has strpbrk, which enables this short and sweet strsep replacement:
/* Replacement for strsep() built on strpbrk().
 * Returns the token that starts at *stringp (NULL once *stringp is NULL).
 * The first delimiter after it is overwritten with '\0' and *stringp is
 * moved just past it; when no delimiter is left, *stringp becomes NULL.
 */
char *strsep(char **stringp, const char *delim) {
    char *token = *stringp;
    if (token == NULL) {
        return NULL;
    }

    char *cut = strpbrk(token, delim);
    if (cut == NULL) {
        *stringp = NULL;       /* this was the last token */
    } else {
        *cut = '\0';           /* terminate the token */
        *stringp = cut + 1;    /* resume after the delimiter */
    }
    return token;
}
Here is a good explanation of strsep vs strtok. The pros and cons may be judged subjectively; however, I think it's a telling sign that strsep was designed as a replacement for strtok.
String tokenizer this code should put you in the right direction.
int main(void) {
    /* strtok() modifies the string, so it must be a writable array. */
    char st[] ="Where there is will, there is a way.";

    /* The first call splits on spaces only; later calls also treat ','
       as a delimiter. */
    for (char *ch = strtok(st, " "); ch != NULL; ch = strtok(NULL, " ,")) {
        printf("%s\n", ch);
    }

    getch();
    return 0;
}
Method below will do all the job (memory allocation, counting the length) for you. More information and description can be found here - Implementation of Java String.split() method to split C string
/*
 * Split str at every occurrence of c into newly allocated strings.
 *
 * Parameters:
 *   str : nul-terminated string to split; it is not modified.
 *   c   : delimiter character.
 *   arr : receives a newly allocated array of `count` newly allocated,
 *         nul-terminated strings. Empty pieces (between adjacent
 *         delimiters, or at either end) are returned as empty strings.
 *
 * Returns the number of strings stored in *arr (always at least 1).
 * Terminates the process with exit(1) if memory cannot be allocated.
 * The caller frees each string and then the array.
 */
int split (const char *str, char c, char ***arr)
{
    /* N delimiters separate N + 1 pieces. */
    int count = 1;
    for (const char *p = str; *p != '\0'; p++)
    {
        if (*p == c)
            count++;
    }

    *arr = (char**) malloc(sizeof(char*) * (size_t) count);
    if (*arr == NULL)
        exit(1);

    const char *start = str;
    for (int i = 0; i < count; i++)
    {
        /* Measure the piece that begins at start. */
        size_t len = 0;
        while (start[len] != '\0' && start[len] != c)
            len++;

        char *piece = (char*) malloc(len + 1);
        if (piece == NULL)
            exit(1);

        for (size_t k = 0; k < len; k++)
            piece[k] = start[k];
        /* Every piece is terminated, the last one included. */
        piece[len] = '\0';
        (*arr)[i] = piece;

        /* Step over the delimiter, but never past the end of str. */
        start += len;
        if (*start != '\0')
            start++;
    }
    return count;
}
How to use it:
int main (int argc, char ** argv)
{
    char *s = "Hello, this is a test module for the string splitting.";
    char **arr = NULL;

    /* split() allocates the array and reports how many pieces it holds. */
    int c = split(s, ' ', &arr);
    printf("found %d tokens.\n", c);

    int i = 0;
    while (i < c)
    {
        printf("string #%d: %s\n", i, arr[i]);
        i++;
    }
    return 0;
}
Here is my two cents:
/*
 * Split txt at every occurrence of delim into newly allocated strings.
 *
 * Parameters:
 *   txt    : nul-terminated string to split; it is not modified.
 *   delim  : delimiter character.
 *   tokens : receives a newly allocated array of newly allocated,
 *            nul-terminated strings (NULL on failure). Empty pieces are
 *            returned as empty strings.
 *
 * Returns the number of strings in *tokens (at least 1), or 0 if memory
 * could not be allocated. The caller frees each string, then the array.
 */
int split (const char *txt, char delim, char ***tokens)
{
    /* N delimiters separate N + 1 pieces. */
    int count = 1;
    for (const char *p = txt; *p != '\0'; p++)
        if (*p == delim) count += 1;

    /* calloc leaves unused slots NULL, which keeps the cleanup simple. */
    char **arr = calloc ((size_t) count, sizeof *arr);
    if (arr == NULL)
    {
        *tokens = NULL;
        return 0;
    }

    const char *start = txt;
    for (int i = 0; i < count; i++)
    {
        size_t len = 0;
        while (start[len] != '\0' && start[len] != delim) len++;

        /* len characters plus the terminating nul, sized in chars. */
        char *piece = calloc (len + 1, sizeof *piece);
        if (piece == NULL)
        {
            while (i > 0) free (arr[--i]);
            free (arr);
            *tokens = NULL;
            return 0;
        }
        for (size_t k = 0; k < len; k++) piece[k] = start[k];
        arr[i] = piece;

        /* Step over the delimiter, but never past the end of txt. */
        start += len;
        if (*start != '\0') start++;
    }

    *tokens = arr;
    return count;
}
Usage:
char **tokens;
int count, i;
const char *str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
/* split() allocates the array and returns how many strings it holds */
count = split (str, ',', &tokens);
for (i = 0; i < count; i++) printf ("%s\n", tokens[i]);
/* freeing tokens: each string first, then the array that held them */
for (i = 0; i < count; i++) free (tokens[i]);
free (tokens);
In the above example, there would be a way to return an array of null terminated strings (like you want) in place in the string. It would not make it possible to pass a literal string though, as it would have to be modified by the function:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
 * Split str in place at every occurrence of delim.
 *
 * Each delimiter in str is overwritten with '\0' and the returned array
 * points at the start of every piece inside str, so str must be writable
 * and must outlive the array.
 *
 * Parameters:
 *   str       : string to split (modified).
 *   delim     : delimiter character; must not be '\0'.
 *   numSplits : if not NULL, receives the number of pieces, or -1 when
 *               nothing was split (bad argument or allocation failure).
 *
 * Returns an array of *numSplits pointers followed by a NULL sentinel, or
 * NULL on failure. The caller frees the array only.
 */
char** str_split( char* str, char delim, int* numSplits )
{
    char** ret = NULL;
    int retLen = -1;

    /* A NULL string or a nul delimiter cannot be split. */
    if ( ( str != NULL ) &&
         ( delim != '\0' ) )
    {
        /* Pre-calculate number of delimiters; an empty string simply has none */
        int delims = 0;
        for ( const char* c = str; *c != '\0'; c++ )
        {
            if ( *c == delim )
            {
                delims++;
            }
        }

        /* delims + 1 pieces, plus one slot for the NULL sentinel */
        ret = malloc( ( (size_t)delims + 2 ) * sizeof( *ret ) );
        if ( ret != NULL )
        {
            retLen = 0;
            ret[retLen++] = str;
            for ( char* c = str; *c != '\0'; c++ )
            {
                if ( *c == delim )
                {
                    ret[retLen++] = &c[1];
                    *c = '\0';
                }
            }
            ret[retLen] = NULL;
        }
    }

    if ( numSplits != NULL )
    {
        *numSplits = retLen;
    }
    return ret;
}
int main( int argc, char* argv[] )
{
    const char* str = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char** split;
    int num;
    int i;

    (void)argc;
    (void)argv;

    /* str_split() writes into its argument, so work on a private copy.
       +1 makes room for the terminating nul that strcpy() also writes. */
    char* strCpy = malloc( strlen( str ) + 1 );
    if ( strCpy == NULL )
    {
        puts( "out of memory" );
        return 1;
    }
    strcpy( strCpy, str );

    split = str_split( strCpy, ',', &num );
    if ( split == NULL )
    {
        puts( "str_split returned NULL" );
    }
    else
    {
        printf( "%i Results: \n", num );
        for ( i = 0; i < num; i++ )
        {
            puts( split[i] );
        }
    }

    /* The pieces point into strCpy, so only these two blocks are freed. */
    free( split );
    free( strCpy );
    return 0;
}
There is probably a neater way to do it, but you get the idea.
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
/**
 * Splits str on delim in place and dynamically allocates an array of
 * pointers to the pieces.
 *
 * Returns 0 on success and -1 if the array could not be allocated (str is
 * left unmodified in that case). On success *array receives the array and
 * *length its size, which is 0 for an empty string (*array is then NULL),
 * 1 if no delim was found, and otherwise the number of delims plus one.
 *
 * Empty pieces are stored as NULL entries: "foo,,bar" gives
 *    { "foo", NULL, "bar" }
 * and a trailing delim as in "foo," gives a two-entry array whose second
 * entry is NULL. Callers must therefore check each entry before using it.
 *
 * Modifies str (every delim is overwritten with a nul), so make a copy if
 * this is a problem. The entries point into str; the caller frees only the
 * array.
 */
int split( char * str, char delim, char ***array, int *length ) {
    char *p;
    char **res;
    int count;
    int k;

    *array = NULL;
    *length = 0;

    // An empty string has no pieces at all.
    if( *str == 0 ) return 0;

    // Count the pieces first, without touching str: one more than the
    // number of delims. A nul delim can never match inside the string.
    count = 1;
    if( delim != 0 ) {
        for( p = strchr(str, delim); p != NULL; p = strchr(p + 1, delim) ) {
            count++;
        }
    }

    // allocate dynamic array; calloc leaves every entry NULL
    res = calloc( (size_t)count, sizeof(char *) );
    if( !res ) return -1;

    p = str;
    for( k=0; k<count; k++ ){
        char *end = ( k + 1 < count ) ? strchr(p, delim) : NULL;
        if( end ) *end = 0;      // Null terminate at the deliminator
        if( *p ) res[k] = p;     // Non-empty piece: record its start
        if( end ) p = end + 1;   // Start of next piece
    }

    *array = res;
    *length = count;
    return 0;
}
/* Writable input: split() overwrites the delimiters in place. */
char str[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,";
int main() {
    char **res = NULL;   /* stays NULL if split() fails, so it is never freed uninitialized */
    int count = 0;

    int rc = split( str, ',', &res, &count );
    if( rc ) {
        printf("Error: %s errno: %d \n", strerror(errno), errno);
        return 1;
    }

    printf("count: %d\n", count );
    for( int k=0; k<count; k++ ) {
        /* split() stores NULL for empty pieces; never hand NULL to %s. */
        printf("str: %s\n", res[k] ? res[k] : "(null)");
    }

    /* The entries point into str; only the array itself was allocated. */
    free(res );
    return 0;
}
I think the following solution is ideal:
Doesn't destroy the source string
Re-entrant - i.e., you can safely call it from anywhere in one or more threads
Portable
Handles multiple separators correctly
Fast and efficient
Explanation of the code:
Define a structure token to store the address and lengths of the tokens
Allocate enough memory for these in the worst case, which is when
str is made up entirely of separators so there are strlen(str) + 1
tokens, all of them empty strings
Scan str recording the address and length of every token
Use this to allocate the output array of the correct size, including an extra space for a NULL sentinel value
Allocate, copy, and add the tokens using the start and length
information - use memcpy as it's faster than strcpy and we know
the lengths
Free the token address and length array
Return the array of tokens
/* Where one token starts inside the source string and how long it is. */
typedef struct {
    const char *start;
    size_t len;
} token;

/*
 * Split str at every occurrence of sep without modifying it.
 * Returns a newly allocated array of newly allocated strings, terminated
 * by a NULL sentinel. Empty pieces are returned as empty strings.
 * Allocation results are not checked.
 */
char **split(const char *str, char sep)
{
    /* Worst case: every character is a separator, giving strlen + 1 pieces. */
    token *spans = malloc((strlen(str) + 1) * sizeof(token));
    unsigned int nspans = 0;

    /* Pass 1: record the start and length of every piece. */
    const char *begin = str;
    const char *scan = str;
    while (*scan != '\0') {
        if (*scan == sep) {
            spans[nspans].start = begin;
            spans[nspans].len = (size_t)(scan - begin);
            nspans++;
            begin = scan + 1;
        }
        scan++;
    }
    /* The piece after the last separator (possibly empty). */
    spans[nspans].start = begin;
    spans[nspans].len = (size_t)(scan - begin);
    nspans++;

    /* Pass 2: copy each piece; one extra slot holds the sentinel. */
    char **array = malloc((nspans + 1) * sizeof(char*));
    for (unsigned int i = 0; i < nspans; i++) {
        /* calloc zero-fills, so the copy is already nul-terminated. */
        char *copy = calloc(spans[i].len + 1, 1);
        memcpy(copy, spans[i].start, spans[i].len);
        array[i] = copy;
    }
    array[nspans] = NULL;

    free(spans);
    return array;
}
Note malloc checking omitted for brevity.
In general, I wouldn't return an array of char * pointers from a split function like this as it places a lot of responsibility on the caller to free them correctly. An interface I prefer is to allow the caller to pass a callback function and call this for every token, as I have described here: Split a String in C.
My version:
/*
 * Split `str` on `delimeter`, treating runs of delimiters as one and
 * ignoring leading/trailing delimiters.  `str` itself is only read.
 *
 * On return *args points to a malloc'ed array of malloc'ed,
 * NUL-terminated copies of the tokens.  There is no sentinel; use the
 * return value as the element count.  At least one (possibly empty)
 * string is always produced.
 *
 * Returns the number of strings stored in *args.
 */
int split(char* str, const char delimeter, char*** args) {
    /* Skip delimiters in front of the first token. */
    char* cursor = str;
    while (*cursor == delimeter) cursor++;

    /* One token to start with, plus one for every delimiter that is
     * directly followed by the first character of another token. */
    int cnt = 1;
    for (char* scan = cursor; *scan != 0; ) {
        scan++;
        if (*scan == delimeter && scan[1] != delimeter && scan[1] != 0) cnt++;
    }

    *args = malloc(sizeof(char*) * cnt);
    for (int i = 0; i < cnt; i++) {
        char* begin = cursor;
        while (*cursor != delimeter && *cursor != 0) cursor++;

        /* len counts the token characters plus the terminating NUL. */
        int len = (int)(cursor - begin) + 1;
        char* piece = malloc(sizeof(char) * len);
        (*args)[i] = piece;
        memcpy(piece, begin, sizeof(char) * (len - 1));
        piece[len - 1] = 0;

        /* Step over the delimiter run that ended this token. */
        while (*cursor == delimeter) cursor++;
    }
    return cnt;
}
This function takes a char* string and splits it by the delimiter. There can be multiple delimiters in a row. Note that the function modifies the original string. You must make a copy of the original string first if you need the original to stay unaltered. This function doesn't use any cstring function calls so it might be a little faster than others. If you don't care about memory allocation, you can allocate sub_strings at the top of the function with size strlen(src_str)/2 and (like the c++ "version" mentioned) skip the bottom half of the function. If you do this, the function is reduced to O(N), but the memory optimized way shown below is O(2N).
The function:
// Split src_str IN PLACE on `deliminator`.
//
// Every delimiter in src_str is overwritten with '\0'; the returned
// pointers refer directly into src_str, so src_str must outlive them.
// Runs of delimiters, and leading/trailing delimiters, produce no empty
// sub strings.
//
// Parameters:
//   src_str      writable, NUL-terminated string to split (modified)
//   deliminator  the separator character
//   num_sub_str  out: number of non-empty sub strings found
//
// Returns a malloc'ed array of num_sub_str pointers followed by a NULL
// sentinel, or 0 when no sub string exists or the allocation fails.
// The caller frees only the array (not the individual strings).
char** str_split(char *src_str, const char deliminator, size_t &num_sub_str){
    //replace deliminator's with zeros and count how many
    //sub strings with length >= 1 exist
    num_sub_str = 0;
    char *src_str_tmp = src_str;
    bool found_delim = true;
    while(*src_str_tmp){
        if(*src_str_tmp == deliminator){
            *src_str_tmp = 0;
            found_delim = true;
        }
        else if(found_delim){ //found first character of a new string
            num_sub_str++;
            found_delim = false;
            //sub_str_vec.push_back(src_str_tmp); //for c++
        }
        src_str_tmp++;
    }
    // num_sub_str is a size_t, so it needs %zu rather than %d
    printf("Start - found %zu sub strings\n", num_sub_str);
    if(num_sub_str == 0){
        printf("str_split() - no substrings were found\n");
        return(0);
    }

    //if you want to use a c++ vector and push onto it, the rest of this function
    //can be omitted (obviously modifying input parameters to take a vector, etc)

    // One extra POINTER (not one extra byte) is needed for the NULL sentinel.
    char **sub_strings = (char **)malloc(sizeof(char*) * (num_sub_str + 1));
    if(!sub_strings)
        return(0);

    // src_str_tmp stopped on the original terminator; the string now
    // contains embedded NULs, so that address is the only reliable end.
    const char *src_str_terminator = src_str_tmp;
    src_str_tmp = src_str;
    bool found_null = true;
    size_t idx = 0;
    while(src_str_tmp < src_str_terminator){
        if(!*src_str_tmp) //found a NULL
            found_null = true;
        else if(found_null){ //first character after a NUL starts a sub string
            sub_strings[idx++] = src_str_tmp;
            found_null = false;
        }
        src_str_tmp++;
    }
    sub_strings[num_sub_str] = NULL;

    return(sub_strings);
}
How to use it:
char months[] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
// str_split() overwrites the delimiters, so hand it a private copy
char *str = strdup(months);
size_t num_sub_str = 0;
// skip the split entirely when the copy could not be allocated
char **sub_strings = str ? str_split(str, ',', num_sub_str) : NULL;
if(sub_strings){
    for(size_t i = 0; sub_strings[i]; i++)
        printf("[%s]\n", sub_strings[i]);
}
// the sub strings point into str, so only the array and the copy are freed
free(sub_strings);
free(str);
This optimized method create (or update an existing) array of pointers in *result and returns the number of elements in *count.
Use "max" to indicate the maximum number of strings you expect (when you specify an existing array, or for any other reason); otherwise set it to 0.
To compare against a list of delimiters, define delim as a char* and replace the line:
if (str[i]==delim) {
with the two following lines:
char *c=delim; while(*c && *c!=str[i]) c++;
if (*c) {
Enjoy
#include <stdlib.h>
#include <string.h>
/*
 * Split `str` in place on `delim`.
 *
 * Each delimiter found within the first `len` characters is overwritten
 * with '\0'; the result entries point into `str`, so `str` must outlive
 * them and must be writable.
 *
 * Parameters:
 *   str     writable string to split (modified)
 *   len     number of characters of str to scan
 *   delim   separator character
 *   result  in/out: if *result is non-NULL it is used as the output array
 *           (the caller guarantees room for `max` entries); if *result is
 *           NULL an array of *count entries is malloc'ed and stored there
 *   count   out: number of strings produced (always at least 1)
 *   max     upper bound on the number of strings, or 0 for no limit; once
 *           reached, the rest of str is left unsplit in the last entry
 *
 * Returns *result, or NULL when the array could not be allocated.
 */
char **split(char *str, size_t len, char delim, char ***result, unsigned long *count, unsigned long max) {
    size_t i;
    char **out;

    // there is at least one string returned
    *count=1;

    out= *result;

    // when the result array is specified, fill it during the first pass
    if (out) {
        out[0]=str;
    }

    // scan the string for delimiter, up to specified length
    for (i=0; i<len; ++i) {

        // if max is specified, dont go further once it is reached; testing
        // before the delimiter is consumed also honours max==1, which would
        // otherwise write past a one-entry caller-supplied array
        if (max && *count>=max) {
            break;
        }

        // to compare against a list of delimiters,
        // define delim as a string and replace
        // the next line:
        //     if (str[i]==delim) {
        //
        // with the two following lines:
        //     char *c=delim; while(*c && *c!=str[i]) c++;
        //     if (*c) {
        //
        if (str[i]==delim) {

            // replace delimiter with zero
            str[i]=0;

            // when result array is specified, fill it during the first pass
            if (out) {
                out[*count]=str+i+1;
            }

            // increment count for each separator found
            ++(*count);
        }
    }

    // when result array is specified, we are done here
    if (out) {
        return out;
    }

    // else allocate memory for result
    // and fill the result array
    *result=malloc((*count)*sizeof(char*));
    if (!*result) {
        return NULL;
    }
    out=*result;

    // add first string to result
    out[0]=str;

    // if theres more strings
    for (i=1; i<*count; ++i) {

        // find next string: it starts right after the next NUL
        while(*str) ++str;
        ++str;

        // add next string to result
        out[i]=str;
    }

    return out;
}
Usage example:
#include <stdio.h>
/*
 * Usage example: split the same month list twice, once into a
 * caller-provided array limited to 6 entries and once into an array
 * allocated by split().
 */
int main(int argc, char **argv) {
    char *str="JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* split() writes into its input, so each call gets its own copy;
     * the copies are kept so they can be released afterwards */
    char *copy=strdup(str);
    char *copy2=strdup(str);
    char **result=malloc(6*sizeof(char*));
    char **result2=0;
    unsigned long count=0;
    unsigned long count2=0;
    unsigned long i;

    (void)argc;
    (void)argv;

    if (!copy || !copy2 || !result) {
        free(result);
        free(copy2);
        free(copy);
        return 1;
    }

    split(copy,strlen(str),',',&result,&count,6);
    split(copy2,strlen(str),',',&result2,&count2,0);

    if (result)
        for (i=0; i<count; ++i) {
            printf("%s\n",result[i]);
        }

    printf("\n");

    if (result2)
        for (i=0; i<count2; ++i) {
            printf("%s\n", result2[i]);
        }

    /* the arrays only hold pointers into the copies */
    free(result2);
    free(result);
    free(copy2);
    free(copy);

    return 0;
}
Below is my strtok() implementation from zString library.
zstring_strtok() differs from standard library's strtok() in the way it treats consecutive delimiters.
Just have a look at the code below; I'm sure you will get an idea of how it works (I tried to use as many comments as I could).
/*
 * strtok()-like tokenizer that reports consecutive delimiters.
 *
 * Only the first character of `delim` is used as the delimiter.  Pass the
 * string on the first call and NULL (0) afterwards to continue with the
 * text remembered from the previous call.  The string is modified: the
 * delimiter that ends a token is overwritten with '\0'.
 *
 * Returns:
 *   - the next token;
 *   - `delim` itself when the remaining text starts with the delimiter;
 *   - the remaining text when it contains no delimiter (this ends the
 *     sequence);
 *   - 0 when `delim` is 0 or there is nothing left to continue with.
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *resume = 0; /* where the next continuation call starts */
    int pos = 0;

    if (delim == 0)
        return 0;

    /* continuation call: pick up the remembered position, if any */
    if (str == 0) {
        if (resume == 0)
            return 0;
        str = resume;
    }

    /* advance to the first delimiter or to the end of the string */
    while (str[pos] != '\0' && str[pos] != delim[0])
        pos++;

    /* no delimiter left: hand back the rest and end the sequence */
    if (str[pos] == '\0') {
        resume = 0;
        return str;
    }

    /* text begins with the delimiter: report the delimiter itself */
    if (pos == 0) {
        resume = str + 1;
        return (char *)delim;
    }

    /* cut the token off (needs a writable buffer) and remember the rest */
    str[pos] = '\0';
    resume = str + pos + 1;
    return str;
}
Below is an example usage...
Example Usage
/* must be an array, not a string literal: zstring_strtok() writes into it */
char str[] = "A,B,,,C";
/* first call passes the buffer itself; later calls pass NULL to continue */
printf("1 %s\n",zstring_strtok(str,","));
printf("2 %s\n",zstring_strtok(NULL,","));
printf("3 %s\n",zstring_strtok(NULL,","));
printf("4 %s\n",zstring_strtok(NULL,","));
printf("5 %s\n",zstring_strtok(NULL,","));
printf("6 %s\n",zstring_strtok(NULL,","));
Example Output
1 A
2 B
3 ,
4 ,
5 C
6 (null)
The library can be downloaded from Github
https://github.com/fnoyanisi/zString
This is a string splitting function that can handle multi-character delimiters. Note that if the delimiter is longer than the string that is being split, then buffer and stringLengths will be set to (void *) 0, and numStrings will be set to 0.
This algorithm has been tested, and works. (Disclaimer: It has not been tested for non-ASCII strings, and it assumes that the caller gave valid parameters)
/*
 * Split `original` on every non-overlapping occurrence of the
 * (possibly multi-character) `delimiter`, scanning left to right.
 * `original` is not modified.
 *
 * Outputs:
 *   *buffer         malloc'ed array of *numStrings malloc'ed,
 *                   NUL-terminated pieces (empty pieces are kept)
 *   *numStrings     number of pieces
 *   *stringLengths  malloc'ed array holding the length of each piece
 *
 * If the delimiter is longer than the string, or an allocation fails,
 * *buffer and *stringLengths are set to a null pointer and *numStrings
 * to 0.  An empty delimiter yields a single piece equal to `original`.
 *
 * The caller frees every (*buffer)[i], then *buffer and *stringLengths.
 */
void splitString(const char *original, const char *delimiter, char ** * buffer, int * numStrings, int * * stringLengths){
    const size_t lo = strlen(original);
    const size_t ld = strlen(delimiter);

    *buffer = (void *)0;
    *numStrings = 0;
    *stringLengths = (void *)0;

    if(ld > lo){
        return;
    }

    /* Pass 1: n delimiters separate n + 1 pieces. */
    int count = 1;
    if(ld > 0){
        for(const char *hit = strstr(original, delimiter); hit != NULL; hit = strstr(hit + ld, delimiter)){
            count++;
        }
    }

    /* calloc so a partially filled array can be released safely */
    char **pieces = calloc((size_t)count, sizeof *pieces);
    int *lengths = malloc((size_t)count * sizeof *lengths);
    if(pieces == NULL || lengths == NULL){
        free(pieces);
        free(lengths);
        return;
    }

    /* Pass 2: copy each piece.  The last piece runs to the end of the
     * string; every earlier one ends at the next delimiter, which pass 1
     * has already shown to exist. */
    const char *start = original;
    for(int k = 0; k < count; k++){
        const int last = (k + 1 == count);
        const char *end = last ? original + lo : strstr(start, delimiter);
        const size_t len = (size_t)(end - start);

        pieces[k] = malloc(len + 1);
        if(pieces[k] == NULL){
            while(k-- > 0){
                free(pieces[k]);
            }
            free(pieces);
            free(lengths);
            return;
        }
        memcpy(pieces[k], start, len);
        pieces[k][len] = '\0';
        lengths[k] = (int)len;

        if(!last){
            start = end + ld;
        }
    }

    *buffer = pieces;
    *numStrings = count;
    *stringLengths = lengths;
}
Sample code:
/* Sample driver: split a string on " DELIM " and print each piece. */
int main(){
    const char *string = "STRING-1 DELIM string-2 DELIM sTrInG-3";
    /* start from empty results so nothing is read or freed uninitialised */
    char **buffer = NULL;
    int numStrings = 0;
    int * stringLengths = NULL;

    splitString(string, " DELIM ", &buffer, &numStrings, &stringLengths);

    /* a null buffer means the split produced nothing to print or free */
    if(buffer != NULL){
        for(int i = 0;i < numStrings;i++){
            printf("String: %s\n", buffer[i]);
            /* each piece is its own allocation */
            free(buffer[i]);
        }
    }
    free(buffer);
    free(stringLengths);
    return 0;
}
Libraries:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
Try use this.
/*
 * Split `str` into tokens separated by any character of `delim`.
 *
 * Runs of delimiters count as one and leading/trailing delimiters are
 * ignored (the same tokens strtok() would produce).  `str` is not
 * modified.
 *
 * Returns a malloc'ed array of malloc'ed token copies terminated by a
 * NULL entry (just the NULL entry when there are no tokens), or NULL if
 * an allocation fails.  The caller frees each token and then the array.
 */
char** strsplit(char* str, const char* delim){
    char** res = NULL;
    size_t n = 0;
    const char* p = str;

    for (;;) {
        /* skip the delimiters in front of the next token, then measure it */
        p += strspn(p, delim);
        size_t len = strcspn(p, delim);

        /* grow by one slot: it receives either the token or the sentinel */
        char** grown = realloc(res, (n + 1) * sizeof *grown);
        if (!grown)
            goto fail;
        res = grown;

        /* a zero-length token here means the end of the string was reached */
        if (len == 0) {
            res[n] = NULL;
            return res;
        }

        char* part = malloc(len + 1);
        if (!part)
            goto fail;
        memcpy(part, p, len);
        part[len] = '\0';
        res[n++] = part;
        p += len;
    }

fail:
    for (size_t i = 0; i < n; i++)
        free(res[i]);
    free(res);
    return NULL;
}
Explode & implode - initial string remains intact, dynamic memory allocation
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
/* A token is a view into the exploded string: nothing is copied. */
typedef struct
{
    uintptr_t ptr; /* address of the token's first character */
    int size;      /* number of characters in the token */
} token_t;

/*
 * Describe the pieces of `str` separated by the first character of
 * `delimiter`.  `str` is left intact; the tokens point into it and are
 * not NUL-terminated, so always use them together with their size.
 *
 * Parameters:
 *   str        string to explode
 *   slen       number of characters of str to consider
 *   delimiter  only its first character is used
 *   tokens     out: calloc'ed array of tokens (caller frees it); set to
 *              NULL whenever -1 is returned
 *
 * Returns the number of tokens, or -1 if no delimiter occurs in the
 * first slen characters or the array cannot be allocated.
 */
int explode(char *str, int slen, const char *delimiter, token_t **tokens)
{
    int i = 0, ndelims = 0, cur = 0;

    /* callers may free *tokens unconditionally, so never leave it unset */
    *tokens = NULL;

    for(i = 0; i < slen; i++)
    {
        if(str[i] == *delimiter)
        {
            ndelims++;
        }
    }

    if(ndelims == 0)
    {
        return -1;
    }

    *tokens = (token_t*)calloc((size_t)ndelims + 1, sizeof(token_t));
    if(*tokens == NULL)
    {
        return -1;
    }

    ((*tokens)[cur]).ptr = (uintptr_t)str;

    for(i = 0; i <= slen; i++)
    {
        /* a token ends at each delimiter and at the end of the input */
        if((i == slen) || (str[i] == *delimiter))
        {
            ((*tokens)[cur]).size = (int)((uintptr_t)&(str[i]) - (uintptr_t)(((*tokens)[cur]).ptr));

            if(i < slen)
            {
                cur++;
                ((*tokens)[cur]).ptr = (uintptr_t)&(str[i + 1]);
            }
        }
    }

    return (ndelims + 1);
}

/*
 * Join `size` tokens into one new string, placing the first character
 * of `delimiter` between consecutive tokens.
 *
 * Returns a calloc'ed, NUL-terminated string the caller must free (an
 * empty string when size <= 0), or NULL if the allocation fails.
 */
char* implode(token_t *tokens, int size, const char *delimiter)
{
    int i;
    size_t total = 0, pos = 0;
    char *str;

    if(size <= 0)
    {
        return (char*)calloc(1, sizeof(char));
    }

    /* each token needs its characters plus one byte: a delimiter after
     * it, or the terminating NUL after the last one */
    for(i = 0; i < size; i++)
    {
        total += (size_t)tokens[i].size + 1;
    }

    str = (char*)calloc(total, sizeof(char));
    if(str == NULL)
    {
        return NULL;
    }

    for(i = 0; i < size; i++)
    {
        memcpy((void*)&str[pos], (void*)tokens[i].ptr, (size_t)tokens[i].size);
        pos += (size_t)tokens[i].size;
        str[pos++] = *delimiter;
    }

    /* the delimiter written after the last token becomes the terminator */
    str[pos - 1] = '\0';

    return str;
}
Usage:
/*
 * Usage example: explode a month list into tokens, implode it back and
 * print each token with its length.
 */
int main(int argc, char **argv)
{
    int i, c;
    char *exp = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    /* start from NULL so the free() calls below are safe even when
     * explode() fails and nothing was allocated */
    token_t *tokens = NULL;
    char *imp = NULL;

    (void)argc;
    (void)argv;

    printf("%s\n", exp);

    if((c = explode(exp, strlen(exp), ",", &tokens)) > 0)
    {
        imp = implode(tokens, c, ",");
        if(imp != NULL)
        {
            printf("%s\n", imp);
        }

        for(i = 0; i < c; i++)
        {
            /* tokens are not NUL-terminated: print with an explicit length */
            printf("%.*s, %d\n", tokens[i].size, (char*)tokens[i].ptr, tokens[i].size);
        }
    }

    free((void*)tokens);
    free((void*)imp);

    return 0;
}
If you are willing to use an external library, I can't recommend bstrlib enough. It takes a little extra setup, but is easier to use in the long run.
For example, split the string below, one first creates a bstring with the bfromcstr() call. (A bstring is a wrapper around a char buffer).
Next, split the string on commas, saving the result in a struct bstrList, which has fields qty and an array entry, which is an array of bstrings.
bstrlib has many other functions to operate on bstrings
Easy as pie...
#include "bstrlib.h"
#include <stdio.h>
/*
 * Split "Hello,World,sak" on ',' with bstrlib and print each entry.
 * Returns 0 on success, 1 if the string could not be created or split.
 */
int main() {
    int i;
    char *tmp = "Hello,World,sak";
    bstring bstr = bfromcstr(tmp);
    struct bstrList *blist = bsplit(bstr, ',');

    if (blist == NULL) {
        bdestroy(bstr);
        return 1;
    }

    printf("num %d\n", blist->qty);
    for(i=0;i<blist->qty;i++) {
        /* bstr2cstr() allocates a fresh C string for every call, so it
         * has to be released once it has been printed */
        char *cstr = bstr2cstr(blist->entry[i], '_');
        if (cstr == NULL) {
            continue;
        }
        printf("%d: %s\n", i, cstr);
        bcstrfree(cstr);
    }

    /* release the list (and its entries) and the source bstring */
    bstrListDestroy(blist);
    bdestroy(bstr);
    return 0;
}
Late to the party I know, but here's 2 more functions to play with and probably further adjust to your needs (source code at the bottom of the post)
See also the Implementation Notes, further below, to decide which function suits your needs better.
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdbool.h> // C99
// Tokenize destructively: the source string is cut up in place and the
// returned, NULL-terminated array of char-pointers points straight into it.
// Returns NULL on failure. On success the caller releases the array with a
// single free(); the tokens themselves are owned by the source string.
char **str_toksarray_alloc(
char **strp, /* InOut: address of the pointer to the writable, nul-terminated source c-string */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: max # of tokens to parse on entry (NULL or *ntoks==0 for all tokens); # of tokens actually parsed on return, not counting the NULL sentinel */
bool keepnulls /* false ignores empty tokens, true includes them */
);
// Tokenize non-destructively: the source string is left untouched and the
// returned, NULL-terminated array holds dynamically allocated copies of the
// tokens (two levels of allocation). Returns NULL on failure. On success the
// caller frees every token and then the array, e.g. with toksarray_free2().
char **str_toksarray_alloc2(
const char *str, /* the source c-string (may be a string literal) */
const char *delim, /* c-string containing the delimiting chars */
size_t *ntoks, /* InOut: max # of tokens to parse on entry (NULL or *ntoks==0 for all tokens); # of tokens actually parsed on return */
bool keepnulls /* false ignores empty tokens, true includes them */
);
Usage Notes
Their prototypes are almost identical, except for the source-string (strp and str, respectively).
strp (pointer to string) is the address of an already allocated, non-constant c-string, to be tokenized in-place. str is a c-string which is not altered (it can even be a string-literal). By c-string I mean a nul-terminated buffer of chars. The rest of the arguments are the same for both functions.
To parse all available tokens, mute ntoks (meaning set it to 0 before passing it to any of the functions or pass it as a NULL pointer). Else the functions parse up to *ntoks tokens, or until there are no more tokens (whichever comes first). In any case, when ntoks is non-NULL it gets updated with the count of successfully parsed tokens.
Note also that a non-muted ntoks determines how many pointers will be allocated. Thus if the source string contains say 10 tokens and we set ntoks to 1000, we'll end up with 990 needlessly allocated pointers. On the other hand, if the source-string contains say 1000 tokens but we only need the first 10, setting ntoks to 10 sounds like a much wiser choice.
Both functions allocate and return an array of char-pointers, but str_toksarray_alloc() makes them point to the tokens in the modified source-string itself, while str_toksarray_alloc2() makes them point to dynamically allocated copies of the tokens (that 2 at the end of its name indicates the 2-levels of allocation).
The returned array is appended with a NULL sentinel pointer, which is not taken into account in the passed-back value of ntoks (put otherwise, when non-NULL, ntoks passes-back to the caller the length of the returned array, not its 1st level size).
When keepnulls is set to true, the resulting tokens are similar to what we'd expect from the strsep() function. Mostly meaning that consecutive delimiters in the source-string produce empty tokens (nulls), and if delim is an empty c-string or none of its contained delimiter-chars were found in the source string, the result is just 1 token: the source string. Contrary to strsep(), empty tokens can be ignored by setting keepnulls to false.
Failed calls of the functions can be identified by checking their return value against NULL, or by checking the passed-back value of ntoks against 0 (provided ntoks was non-NULL). I suggest always checking against failure before attempting to access the returned array, because the functions include sanity checks which can postpone otherwise immediate crashes (for example, passing a NULL pointer as the source string).
On success, the caller should free the array when they're done with it.
For str_toksarray_alloc(), a simple free() is enough. For str_toksarray_alloc2() a loop is involved, due to the 2nd level of allocation. The NULL sentinel (or the passed-back value of a non-NULL ntoks) makes this trivial, but I'm also providing a toksarray_free2() function below, for all the lazy bees out there :)
Simplified examples using both functions follow.
Prep:
// source text shared by both examples; it is never modified directly
const char *src = ";b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to";
// every character in this string acts as a delimiter
const char *delim = ";,";
// true: consecutive delimiters produce empty tokens
bool keepnulls = true;
// 0 "mutes" ntoks, i.e. asks for all available tokens
size_t ntoks = 0;
str_toksarray_alloc():
// destructive (use copy of src)
char *scopy = strdup( src );
if (!scopy) { ... }; // handle strdup failure
printf( "%s\n", src );
char **arrtoks = str_toksarray_alloc( &scopy, delim, &ntoks, keepnulls );
// ntoks is a size_t, which is printed with %zu (not %lu)
printf( "%zu tokens read\n", ntoks );
if ( arrtoks ) {
for (int i=0; arrtoks[i]; i++) {
printf( "%d: %s\n", i, arrtoks[i] );
}
}
// the tokens live inside scopy, so only scopy and the array are freed
free( scopy );
free( arrtoks );
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
11 tokens read
0:
1: b
2: test
3: Tèst
4:
5:
6: cd
7: ελληνικά
8: nørmälize
9:
10: string to
*/
str_toksarray_alloc2():
// non-destructive
keepnulls = false; // reject empty tokens
ntoks = 0; // mute again: a value left over from an earlier call would act as a token limit
printf( "%s\n", src );
arrtoks = str_toksarray_alloc2( src, delim, &ntoks, keepnulls );
// ntoks is a size_t, which is printed with %zu (not %lu)
printf( "%zu tokens read\n", ntoks );
if ( arrtoks ) {
for (int i=0; arrtoks[i]; i++) {
printf( "%d: %s\n", i, arrtoks[i] );
}
}
toksarray_free2( arrtoks ); // dangling arrtoks
// or: arrtoks = toksarray_free2( arrtoks ); // non-dangling artoks
/* OUTPUT
;b,test,Tèst,;;cd;ελληνικά,nørmälize,;string to
7 tokens read
0: b
1: test
2: Tèst
3: cd
4: ελληνικά
5: nørmälize
6: string to
*/
Implementation Notes
Both functions use strsep() for the tokenization which makes them thread-safe, but it's not a standard function. If not provided, you can always use an open-source implementation (like GNU's or Apple's for example). Same goes for the function strdup() which is used in str_toksarray_alloc2() (its implementation is trivial but again here's GNU's and Apple's for example).
A side-effect of using strsep() in str_toksarray_alloc() is that the starting pointer of the source-string keeps moving to the next token in every step of the parsing loop. This means that the caller won't be able to free the parsed string, unless they had saved the starting address to an extra pointer. We save them the hassle, by doing that locally in the function, using the strpSaved pointer. str_toksarray_alloc2() is not affected by this, because it doesn't touch the source-string.
A main difference between the 2 functions is that str_toksarray_alloc() does not allocate memory for the found tokens. It rather allocates space just for the array pointers and sets them pointing directly into the source-string. This works because strsep() nul-terminates the found tokens in-place. This dependency can complicate your supporting code, but with big strings it can also make a big difference in performance. If preserving the source-string is not important, it can make a big difference in memory footprint too.
On the other hand, str_toksarray_alloc2() allocates and returns a self sustained array of dynamically allocated copies of the tokens, without further dependencies. It does so firstly by creating the array from a local duplicate of the source-string, and secondly by duplicating the actual tokens contents into the array. This is a lot slower and leaves a much bigger memory footprint compared to str_toksarray_alloc(), but it has no further dependencies, and sets no special requirements for the nature of the source-string. This makes it easier to write simpler (hence better maintainable) supporting code.
Another difference between the 2 functions is the 1st level of allocation (the array pointers) when ntoks is muted. They both parse all available tokens, but they take quite different approaches. str_toksarray_alloc() uses alloc-ahead with an initial size of 16 (char-pointers), doubling it on demand in the parsing loop. str_toksarray_alloc2() makes a 1st pass counting all available tokens, then it allocates that many char-pointers just once. That 1st pass is done with a helper function str_toksfound() which uses the standard functions strpbrk() and strchr(). I'm providing the source-code of that function too, further below.
Which approach is better is really up to you to decide, depending on the needs of your project. Feel free to adjust the code of each function to either approach and take it from there.
I'd say that on average and for really big strings alloc-ahead is much faster, especially when the initial size and grow factor are fine tuned on a per-case basis (making them function parameters for example). Saving that extra pass with all those strchr()'s and strpbrk()'s can make a difference there. However, with relatively small strings which is pretty much the norm, allocing-ahead just a bunch of char-pointers is just an overkill. It doesn't hurt but it does clutter the code for no good reason in this case. Anyway, feel free to choose whichever suits you best.
Same goes for these 2 functions. I'd say in most cases str_toksarray_alloc2() is much simpler to cope with, since memory and performance are rarely an issue with small to medium strings. If you have to deal with huge strings, then consider using str_toksarray_alloc() (though in those cases you should roll a specialized string parsing function, close to the needs of your project and the specs of your input).
Oh boy, I think that was a bit more than just 2 cents (lol).
Anyway, here is the code of the 2 functions and the helper ones (I've removed most of their description comments, since I've covered pretty much everything already).
Source Code
str_toksarray_alloc():
// ----------------------------------------
// Destructively tokenize the nul-terminated string *strp with strsep().
//
// strp      : address of the pointer to the writable source string; the
//             string is cut up in place, *strp itself is restored before
//             returning
// delim     : set of delimiting characters
// ntoks     : optional; a non-zero *ntoks limits how many tokens are
//             parsed; on return it holds the number of tokens stored
//             (0 on failure)
// keepnulls : when false, empty tokens are skipped
//
// Returns a malloc'ed array of pointers into the source string, closed by
// a NULL sentinel, or NULL on invalid arguments / allocation failure.
//
char **str_toksarray_alloc(char **strp, const char *delim, size_t *ntoks, bool keepnulls)
{
    char **arr = NULL;
    size_t used = 0;                    // tokens stored so far

    if ( strp && *strp && **strp && delim ) {
        char *const origin = *strp;     // strsep() moves *strp; remember the start
        const bool limited = (ntoks != NULL && *ntoks != 0);
        size_t cap = limited ? *ntoks : 16;   // unlimited: start with 16 slots

        // +1 slot for the NULL sentinel
        arr = malloc( (cap + 1) * sizeof(*arr) );
        if ( arr ) {
            for (char *piece; (piece = strsep(strp, delim)) != NULL; ) {
                if ( !keepnulls && *piece == '\0' ) {
                    continue;           // empty token not wanted
                }
                if ( used == cap ) {
                    if ( limited ) {
                        break;          // caller's limit reached
                    }
                    // unlimited and out of room: double the array
                    cap *= 2;
                    char **bigger = realloc( arr, (cap + 1) * sizeof(*bigger) );
                    if ( !bigger ) {
                        free( arr );
                        arr = NULL;
                        break;
                    }
                    arr = bigger;
                }
                arr[used++] = piece;
            }
            *strp = origin;             // give the caller its pointer back
        }
    }

    if ( !arr ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    arr[used] = NULL;                   // NULL sentinel
    if (ntoks) *ntoks = used;
    return arr;
}
str_toksarray_alloc2():
// ----------------------------------------
// Tokenize non-destructively a nul-terminated source-string.
// Return a dynamically allocated, NULL terminated array of dynamically
// allocated and nul-terminated string copies of each token found in the
// source-string. Return NULL on error.
// The 2 at the end of the name means 2-levels of allocation.
//
// str       : the source string (not modified)
// delim     : set of delimiting characters
// ntoks     : optional; a non-zero *ntoks limits how many tokens are
//             parsed, otherwise str_tokscount() decides; on return it
//             holds the number of tokens stored (0 on failure)
// keepnulls : when false, empty tokens are skipped
//
// The caller frees every token and then the array.
//
char **str_toksarray_alloc2( const char *str, const char *delim, size_t *ntoks, bool keepnulls )
{
    // sanity checks
    if ( !str || !*str || !delim ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // make a copy of str to work with; `copy` stays at the start of the
    // allocation so that it is always the pointer handed to free()
    char *copy = strdup( str );
    if ( !copy ) {
        if (ntoks) *ntoks = 0;
        return NULL;
    }

    // if ntoks is muted we'll allocate str_tokscount() tokens, else *ntoks
    size_t _ntoks = (ntoks && *ntoks) ? *ntoks : str_tokscount(copy, delim, keepnulls);
    if ( _ntoks == 0 ) { // str_tokscount() failed
        goto fail_free_str;
    }

    // alloc the array of strings (+1 for an extra NULL sentinel)
    char **toksarr = malloc( (_ntoks+1) * sizeof(*toksarr) );
    if ( !toksarr ) {
        goto fail_free_str;
    }

    // Parse str tokens and duplicate them into the array
    size_t i = 0; // # of actually parsed tokens
    char *tok;
    // strsep() advances its argument past every token and finally sets it
    // to NULL, so it must work on a separate cursor: freeing the advanced
    // pointer would either leak the copy or free an interior address
    char *cursor = copy;
    while ( i < _ntoks && (tok = strsep(&cursor, delim)) ) {
        // if requested, skip empty tokens
        if ( *tok == '\0' && !keepnulls ) {
            continue;
        }
        // duplicate current token into the array
        char *tmptok = strdup( tok );
        if ( !tmptok ) {
            goto fail_free_arr;
        }
        toksarr[i++] = tmptok;
    }
    toksarr[i] = NULL; // NULL sentinel

    free( copy ); // release the local copy of the source-string
    if (ntoks) *ntoks = i; // pass to caller the # of parsed tokens
    return toksarr;

    // cleanup before failing
fail_free_arr:
    for (size_t idx=0; idx < i; idx++) {
        free( toksarr[idx] );
    }
    free( toksarr );

fail_free_str:
    free( copy );
    if (ntoks) *ntoks = 0;
    return NULL;
}
str_tokscount() - helper function, used by str_toksarr_alloc2():
// ----------------------------------------
// Count the tokens of the nul-terminated string str, where any character
// of the nul-terminated string delim separates two tokens.
//
// keepnulls selects whether empty tokens (at the very start, between two
// adjacent delimiters, or after a trailing delimiter) are included in the
// count.
//
// Mirrors strsep(): an empty delim, or a str without any delimiter, counts
// as exactly 1 token. Returns 0 when str or delim is NULL or str is empty.
//
size_t str_tokscount( const char *str, const char *delim, bool keepnulls )
{
    if ( str == NULL || delim == NULL || *str == '\0' ) {
        return 0;
    }

    size_t total = 1;   // the text before the first delimiter is a token
    // a leading delimiter means that first token is empty
    size_t empties = (strchr(delim, *str) != NULL) ? 1 : 0;

    const char *hit = strpbrk(str, delim);
    while ( hit != NULL ) {
        const char *next = hit + 1;     // first char of the following token
        total++;
        // another delimiter, or the terminator (strchr() also matches the
        // '\0' of delim), right here makes that token empty
        if ( strchr(delim, *next) != NULL ) {
            empties++;
        }
        hit = strpbrk(next, delim);
    }

    if ( keepnulls ) {
        return total;
    }
    return total - empties;
}
toksarray_free2() - use it on the array returned by str_toksarr_alloc2():
// ----------------------------------------
// Release an array of char-pointers that ends with a NULL sentinel,
// together with the string each pointer owns (2 levels of deallocation).
// A NULL array is accepted and ignored.
//
// Always returns NULL so the caller can write
//     arr = toksarray_free2( arr );
// and avoid keeping a dangling pointer.
//
char **toksarray_free2( char **toksarr )
{
    if ( toksarr == NULL ) {
        return NULL;
    }

    // free each string up to (not including) the sentinel
    for (size_t k = 0; toksarr[k] != NULL; k++) {
        free( toksarr[k] );
    }
    free( toksarr );

    return NULL;
}
Both strtok() and strsep() modify the input string. We can write a function to split the string based on delimiters using strspn() and strpbrk().
Algorithm:
If the input string is not empty, go to step 2 else return null.
Skip separator, if any at the start of string, and record start position of word (using strspn() for this), call it start.
Find next separator position (or end of string if no more separator exists) from the current start found in previous step (using strpbrk() for this), call it end.
Allocate memory and copy string from start to end in that memory.
Return token.
Advantage:
Thread safe.
Handles multiple delimiters.
Portable.
Doesn't modify the input string, like strtok() and strsep() does.
Implementation:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * alloc_str returns a freshly allocated, NUL-terminated copy of the
 * characters in the half-open range [start, end).
 *
 * NULL is returned when either pointer is NULL or the range is empty or
 * reversed. If memory cannot be obtained, a message is written to stderr
 * and the program exits with EXIT_FAILURE.
 */
static char * alloc_str (const char * start, const char * end) {
    if (start == NULL || end == NULL || start >= end) {
        return NULL;
    }

    const size_t n = (size_t) (end - start);
    char * copy = malloc (n + 1);

    if (copy == NULL) {
        fprintf (stderr, "Failed to allocate memory\n");
        exit (EXIT_FAILURE);
    }

    memcpy (copy, start, n);
    copy[n] = '\0';
    return copy;
}
/*
 * str_split function returns the next token which is sequences of contiguous
 * characters separated by any of the characters that are part of delimiters.
 *
 * Parameters:
 * p_str : Address of pointer to the string that you want to split.
 *         On return *p_str is advanced to the end of the returned token,
 *         so calling str_split() repeatedly walks through the string.
 *         The string itself is never modified.
 * sep : A set of characters that delimit the pieces in the string.
 *
 * Behaviour is undefined if sep is not a pointer to a null-terminated string.
 *
 * Return :
 * Returns the pointer to dynamically allocated memory where the token is copied;
 * the caller must free() it.
 * If p_str or *p_str is NULL, or the string is empty or holds only
 * separators, NULL is returned.
 */
char * str_split (char ** p_str, const char * sep) {
    char * token = NULL;

    /* check p_str itself before dereferencing it */
    if (p_str && *p_str && **p_str) {
        char * p_end;

        // skip separator
        *p_str += strspn(*p_str, sep);
        p_end = *p_str;

        // find separator
        p_end = strpbrk (p_end, sep);

        // strpbrk() returns null pointer if no such character
        // exists in the input string which is part of sep argument.
        if (!p_end) {
            p_end = *p_str + strlen (*p_str);
        }

        // alloc_str() yields NULL for an empty range, which is how a
        // string that ends in separators terminates the sequence
        token = alloc_str (*p_str, p_end);
        *p_str = p_end;
    }

    return token;
}
/*==================================================*/
/*==================================================*/
/*
* Just a helper function
*/
void token_helper (char * in_str, const char * delim) {
printf ("\nInput string : ");
if (in_str) printf ("\"%s\"\n", in_str);
else printf ("NULL\n");
if (delim) printf ("Delimiter : \"%s\"\n", delim);
char * ptr = in_str;
char * token = NULL;
printf ("Tokens:\n");
while ((token = str_split(&ptr, delim)) != NULL) {
printf ("-> %s\n", token);
/* You can assign this token to a pointer of an array of pointers
* and return that array of pointers from this function.
* Since, this is for demonstration purpose, I am
* freeing the allocated memory now.
*/
free (token);
}
}
/*
 * Driver function: runs token_helper() over a fixed list of
 * (input, delimiters) pairs.
 */
int main (void) {
    /* test cases, in the order they are run; a NULL text is passed to
     * token_helper() as-is instead of being copied into the buffer */
    static const struct {
        const char * text;
        const char * delim;
    } cases[] = {
        { "hello world!", " " },
        { " hello world,friend of mine!", " ," },
        { "Another string", "-!" },
        { " one more -- string !", "- !" },
        { "", " " },
        { NULL, "" },
        { "hi", " -$" },
        { "Give papa a cup of proper coffee in a copper coffee cup.", "cp" },
        { "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC", "," },
    };
    char string[100];

    for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        if (cases[i].text == NULL) {
            token_helper (NULL, cases[i].delim);
            continue;
        }
        /* every input goes through the same writable buffer */
        strcpy (string, cases[i].text);
        token_helper (string, cases[i].delim);
    }

    return 0;
}
Output:
# ./a.out
Input string : "hello world!"
Delimiter : " "
Tokens:
-> hello
-> world!
Input string : " hello world,friend of mine!"
Delimiter : " ,"
Tokens:
-> hello
-> world
-> friend
-> of
-> mine!
Input string : "Another string"
Delimiter : "-!"
Tokens:
-> Another string
Input string : " one more -- string !"
Delimiter : "- !"
Tokens:
-> one
-> more
-> string
Input string : ""
Delimiter : " "
Tokens:
Input string : NULL
Delimiter : ""
Tokens:
Input string : "hi"
Delimiter : " -$"
Tokens:
-> hi
Input string : "Give papa a cup of proper coffee in a copper coffee cup."
Delimiter : "cp"
Tokens:
-> Give
-> a
-> a a
-> u
-> of
-> ro
-> er
-> offee in a
-> o
-> er
-> offee
-> u
-> .
Input string : "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC"
Delimiter : ","
Tokens:
-> JAN
-> FEB
-> MAR
-> APR
-> MAY
-> JUN
-> JUL
-> AUG
-> SEP
-> OCT
-> NOV
-> DEC
My approach is to scan the string and make the result pointers point to the first character and to every character that follows a delimiter, while overwriting each delimiter in the string with '\0'.
First make a copy of the original string (since it's constant), then count the splits by scanning it and pass that count back through the pointer parameter len. After that, point the first result pointer at the copy, then scan the copy: whenever a delimiter is encountered, overwrite it with '\0' so that the previous result string is terminated, and point the next result pointer at the character that follows.
/*
 * Split a_str on a_delim without modifying a_str.
 *
 * The string is copied once; each delimiter in the copy is overwritten
 * with '\0' and the result pointers refer into that single copy. Empty
 * pieces are kept, so n delimiters always give n + 1 strings.
 *
 * Parameters:
 *   a_str    string to split
 *   a_delim  separator character
 *   len      out: number of strings in the returned array (0 on failure)
 *
 * Returns a malloc'ed array of *len pointers, or NULL if an allocation
 * fails. results[0] is the start of the copy, so the caller releases
 * everything with free(results[0]) followed by free(results).
 */
char** split(char* a_str, const char a_delim, int* len){
    /* + 1 for the terminating NUL, which is copied along with the text */
    size_t size = strlen(a_str) + 1;
    char* s = malloc(size);
    if (s == NULL){
        *len = 0;
        return NULL;
    }
    memcpy(s, a_str, size);

    /* n delimiters separate n + 1 strings */
    int count = 1;
    for (const char* p = s; *p != '\0'; p++){
        if (*p == a_delim) count += 1;
    }

    char** results = malloc((size_t)count * sizeof *results);
    if (results == NULL){
        free(s);
        *len = 0;
        return NULL;
    }

    results[0] = s;
    int i = 1;
    for (char* p = s; *p != '\0'; p++){
        if (*p == a_delim){
            /* end the previous string; the next one starts right after */
            *p = '\0';
            results[i++] = p + 1;
        }
    }

    *len = count;
    return results;
}
My code (tested):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Tokenize str in place with strtok and collect the token pointers.
 *
 * Parameters:
 *   str    - writable string; strtok overwrites delimiters with '\0'.
 *   delim  - set of delimiter characters passed to strtok.
 *   array  - receives a malloc'd array of pointers into str (NULL when
 *            there are no tokens or on failure). Free it with free();
 *            the tokens themselves belong to str.
 *   length - receives the number of tokens stored in *array.
 *
 * Returns 1 on success, 0 if growing the array failed.
 */
int dtmsplit(char *str, const char *delim, char ***array, int *length ) {
    int used = 0;
    char **res = NULL;   /* realloc(NULL, n) behaves like malloc(n) */

    for (char *token = strtok(str, delim); token != NULL; token = strtok(NULL, delim)) {
        /* grow through a temporary so res is not lost if realloc fails */
        char **grown = (char **) realloc(res, (size_t)(used + 1) * sizeof(char *));
        if (grown == NULL) {
            free(res);
            *array = NULL;
            *length = 0;
            return 0;
        }
        res = grown;
        res[used++] = token;
    }

    *array = res;
    *length = used;
    return 1;
}
/* Demo: split a comma-separated month list and print each token. */
int main()
{
    char **arr = NULL;
    int count = 0;
    /* must be a writable array: dtmsplit tokenizes it in place */
    char str[80] = "JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC";

    if (!dtmsplit(str, ",", &arr, &count)) {
        fprintf(stderr, "dtmsplit failed\n");
        return 1;
    }

    printf("Found %d tokens.\n", count);
    for (int i = 0; i < count; i++)
        printf("string #%d: %s\n", i, arr[i]);

    /* only the pointer array was allocated; the tokens point into str */
    free(arr);
    return(0);
}
Result:
Found 12 tokens.
string #0: JAN
string #1: FEB
string #2: MAR
string #3: APR
string #4: MAY
string #5: JUN
string #6: JUL
string #7: AUG
string #8: SEP
string #9: OCT
string #10: NOV
string #11: DEC
Two issues surrounding this question are memory management and thread safety. As you can see from the numerous posts,
this isn't an easy task to accomplish seamlessly in C. I desired a solution that is:
Thread safe. (strtok is not thread safe)
Does not employ malloc or any of its derivatives (to avoid memory management issues)
Checks array bounds on the individual fields (to avoid segment faults on unknown data)
Works with multi-byte field separators (utf-8)
ignores extra fields in the input
provides soft error routine for invalid field lengths
The solution I came up with meets all of these criteria. It's probably a little more work to set up
than some other solutions posted here, but I think that in practice, the extra work is worth
it in order to avoid the common pitfalls of other solutions.
#include <stdio.h>
#include <string.h>
/* One caller-owned destination buffer for a split field. */
struct splitFieldType {
    char *field;     /* buffer that receives the NUL-terminated field text */
    int maxLength;   /* size of the buffer in bytes, including the NUL     */
};
typedef struct splitFieldType splitField;

/* Copy len bytes starting at start into slot, truncating to the buffer
 * capacity (maxLength - 1) and reporting the truncation via softError. */
static void strsplitStoreField(splitField *slot, int fieldNumber, const char *start, size_t len,
                               void (*softError)(int fieldNumber,int expected,int actual)) {
    if (slot->maxLength <= 0) {
        /* no room even for the terminator: nothing can be stored */
        if (softError != NULL) {
            softError(fieldNumber, 0, (int)len);
        }
        return;
    }
    size_t capacity = (size_t)slot->maxLength - 1;
    if (len > capacity) {
        if (softError != NULL) {
            softError(fieldNumber, (int)capacity, (int)len);
        }
        len = capacity;
    }
    memcpy(slot->field, start, len);
    slot->field[len] = '\0';
}

/* Split input on fieldSeparator into at most `expected` caller-provided buffers.
 *
 * Parameters:
 *   fields         - array of at least `expected` destination descriptors.
 *   expected       - maximum number of fields to extract; input beyond
 *                    that many fields is ignored.
 *   input          - NUL-terminated string to split; it is only read.
 *   fieldSeparator - separator string (may be longer than one byte).
 *   softError      - called as softError(index, capacity, actualLength)
 *                    whenever a field does not fit and is truncated;
 *                    may be NULL to ignore truncation.
 *
 * Returns the number of fields written. Every written field is
 * NUL-terminated and never exceeds its buffer, including the last one.
 */
int strsplit(splitField *fields, int expected, const char *input, const char *fieldSeparator, void (*softError)(int fieldNumber,int expected,int actual)) {
    size_t separatorLen = strlen(fieldSeparator);
    const char *cursor = input;
    const char *sep = NULL;
    int i;

    /* fields that are followed by a separator */
    for (i = 0; i < expected && (sep = strstr(cursor, fieldSeparator)) != NULL; ++i) {
        strsplitStoreField(&fields[i], i, cursor, (size_t)(sep - cursor), softError);
        cursor = sep + separatorLen;
    }

    /* the remainder after the last separator is the final field */
    if (i < expected) {
        strsplitStoreField(&fields[i], i, cursor, strlen(cursor), softError);
        return i + 1;
    }
    return i;
}
/* Soft-error callback for the month example: reports on stderr that a
 * field had an unexpected length. fieldNumber is zero-based and is
 * printed one-based. */
void monthSplitSoftError(int fieldNumber, int expected, int actual) {
    const int displayNumber = fieldNumber + 1;

    fprintf(stderr,
            "monthSplit: input field #%d is %d bytes, expected %d bytes\n",
            displayNumber, actual, expected);
}
/* Demo: split a month list into twelve fixed-size 4-byte fields. */
int main() {
    const char *fieldSeparator=",";
    /* "APRIL" is deliberately too long for its 4-byte field so that the
     * soft-error callback fires; FOO and BAR are extra fields to ignore. */
    const char *input="JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR";
    /* zero-initialised so fields that never get written print as empty */
    struct monthFieldsType {
        char field1[4];
        char field2[4];
        char field3[4];
        char field4[4];
        char field5[4];
        char field6[4];
        char field7[4];
        char field8[4];
        char field9[4];
        char field10[4];
        char field11[4];
        char field12[4];
    } monthFields = {{0}};
    /* pair each destination buffer with its size for bounds checking */
    splitField inputFields[12] = {
        {monthFields.field1, sizeof(monthFields.field1)},
        {monthFields.field2, sizeof(monthFields.field2)},
        {monthFields.field3, sizeof(monthFields.field3)},
        {monthFields.field4, sizeof(monthFields.field4)},
        {monthFields.field5, sizeof(monthFields.field5)},
        {monthFields.field6, sizeof(monthFields.field6)},
        {monthFields.field7, sizeof(monthFields.field7)},
        {monthFields.field8, sizeof(monthFields.field8)},
        {monthFields.field9, sizeof(monthFields.field9)},
        {monthFields.field10, sizeof(monthFields.field10)},
        {monthFields.field11, sizeof(monthFields.field11)},
        {monthFields.field12, sizeof(monthFields.field12)}
    };
    int expected=sizeof(inputFields)/sizeof(splitField);
    printf("input data: %s\n", input);
    printf("expecting %d fields\n",expected);
    int ct=strsplit(inputFields, expected, input, fieldSeparator, monthSplitSoftError);
    if (ct!=expected) {
        printf("string split %d fields, expected %d\n", ct,expected);
    }
    for (int i=0;i<expected;++i) {
        printf("field %d: %s\n",i+1,inputFields[i].field);
    }
    printf("\n");
    /* the split wrote straight into the struct, so it can be read directly */
    printf("Direct structure access, field 10: %s", monthFields.field10);
    return 0;
}
Below is an example compile and output. Note that in my example, I purposefully spelled out "APRIL" so that you can see how the soft error works.
$ gcc strsplitExample.c && ./a.out
input data: JAN,FEB,MAR,APRIL,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,FOO,BAR
expecting 12 fields
monthSplit: input field #4 is 5 bytes, expected 3 bytes
field 1: JAN
field 2: FEB
field 3: MAR
field 4: APR
field 5: MAY
field 6: JUN
field 7: JUL
field 8: AUG
field 9: SEP
field 10: OCT
field 11: NOV
field 12: DEC
Direct structure access, field 10: OCT
Enjoy!
Here is another implementation that will operate safely to tokenize a string-literal matching the prototype requested in the question returning an allocated pointer-to-pointer to char (e.g. char **). The delimiter string can contain multiple characters, and the input string can contain any number of tokens. All allocations and reallocations are handled by malloc or realloc without POSIX strdup.
The initial number of pointers allocated is controlled by the NPTRS constant and the only limitation is that it be greater than zero. The char ** returned contains a sentinel NULL after the last token similar to *argv[] and in the form usable by execv, execvp and execve.
As with strtok() multiple sequential delimiters are treated as a single delimiter, so "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC" will be parsed as if only a single ',' separates "MAY,JUN".
The function below is commented in-line and a short main() was added that splits the months. The initial number of pointers allocated was set to 2 to force three reallocations while tokenizing the input string:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define NPTRS 2 /* initial number of pointers to allocate (must be > 0) */
/* Tokenize src on any character found in delim.
 *
 * Each token is copied into its own allocation and the returned
 * pointer array always ends with a sentinel NULL. Runs of delimiters
 * count as a single separator, so no empty tokens are produced.
 *
 * Returns the allocated pointer array, or NULL if the initial block of
 * pointers cannot be allocated. If a later allocation fails, the tokens
 * collected so far are returned (still NULL-terminated). The pointer
 * array doubles in size whenever it runs out of room.
 */
char **strsplit (const char *src, const char *delim)
{
    size_t used = 0;            /* tokens stored so far */
    size_t capacity = NPTRS;    /* pointers currently allocated */
    const char *word = NULL;    /* start of current token, NULL between tokens */
    char **list = malloc (capacity * sizeof *list);

    if (list == NULL) {
        perror ("malloc-dest");
        return NULL;
    }
    list[0] = NULL;             /* empty list is just the sentinel */

    for (const char *cur = src; ; cur++) {
        const int at_end = (*cur == '\0');

        /* ordinary character: remember where the token began */
        if (!at_end && strchr (delim, *cur) == NULL) {
            if (word == NULL)
                word = cur;
            continue;
        }

        /* delimiter or end of input: close the token in progress, if any */
        if (word != NULL) {
            const size_t len = (size_t)(cur - word);

            /* keep one slot free for the sentinel */
            if (used == capacity - 1) {
                char **bigger = realloc (list, 2 * capacity * sizeof *list);
                if (bigger == NULL) {
                    perror ("realloc-dest");
                    break;      /* list is still valid and terminated */
                }
                list = bigger;
                capacity *= 2;
            }

            list[used] = malloc (len + 1);
            if (list[used] == NULL) {
                perror ("malloc-dest[i]");
                break;          /* the failed slot doubles as the sentinel */
            }
            memcpy (list[used], word, len);
            list[used][len] = '\0';
            list[++used] = NULL;
            word = NULL;
        }

        if (at_end)
            break;
    }

    return list;
}
/* Demo: split the month list, print each token, and release all memory. */
int main (void) {
    const char *months = "JAN,FEB,MAR,APR,MAY,,,JUN,JUL,AUG,SEP,OCT,NOV,DEC";
    char **tokens = strsplit (months, ",");

    if (tokens == NULL)         /* initial allocation failed: nothing to show */
        return 0;

    /* walk up to the sentinel NULL, freeing each string after printing it */
    for (size_t i = 0; tokens[i] != NULL; i++) {
        puts (tokens[i]);
        free (tokens[i]);
    }
    free (tokens);              /* finally the pointer array itself */
    return 0;
}
Example Use/Output
$ ./bin/splitinput
JAN
FEB
MAR
APR
MAY
JUN
JUL
AUG
SEP
OCT
NOV
DEC
Let me know if you have any further questions.
#include <cstring>
#include <cstdio>
// Prints each space-separated word of a writable buffer on its own line.
int main()
{
    // strtok modifies its input, so the text must live in a char array
    char buf[] = "This is Luke Skywalker here!";

    char* tok = strtok( buf, " ");
    while( tok != nullptr) {
        puts( tok);
        tok = strtok( nullptr, " ");   // nullptr continues with the same buffer
    }
}
Outputs
This
is
Luke
Skywalker
here!
Came across this looking for a simple solution.
I am fascinated by all of the options but dissatisfied for my own use case/taste (which may be terrible).
I have created a somewhat unique solution that aims to clearly behave for its user, not re-allocate any memory, and be human readable + with comments.
Uploaded to gist.github here: https://gist.github.com/RepComm/1e89f7611733ce0e75c8476d5ef66093
Example:
#include "./strutils.c"
//usage fragment: fill in source and delimiter, split, read the results, clean up
struct str_split_info info;
//the source starts with the delimiter, so the first substring printed is empty
info.source = " SPLIT ME hello SPLIT ME world SPLIT ME whats SPLIT ME going SPLIT ME on SPLIT ME today";
info.delimiter = " SPLIT ME ";
//allocates info.splitStrings and sets info.splitStringsCount
str_split_begin(&info);
char * substr;
for (int i=0; i<info.splitStringsCount; i++) {
substr = info.splitStrings[i];
printf("substring: '%s'\n", substr);
}
//frees everything str_split_begin allocated
str_split_end(&info);
Output:
$ ./test
substring: ''
substring: 'hello'
substring: 'world'
substring: 'whats'
substring: 'going'
substring: 'on'
substring: 'today'
Full source of strutils.c
#ifndef STRUTILS_C
#define STRUTILS_C 1
#ifndef str
#define str char *
#endif
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
/* Input and output state shared by str_split_begin and str_split_end. */
struct str_split_info {
/* The string to be split.
* Set by the caller before calling str_split_begin, which only reads it.
*/
str source;
/* The string that cuts the source string; occurrences of it are
* left out of the resulting substrings.
* Set by the caller before calling str_split_begin.
*/
str delimiter;
/* Array of substrings produced by the split.
* Allocated and filled by str_split_begin.
* Must be released with str_split_end.
*/
str * splitStrings;
/* Array holding the length of each substring.
* Allocated and filled by str_split_begin.
* Must be released with str_split_end.
*/
int * splitStringsLengths;
/* Number of entries in splitStrings.
* Set by str_split_begin and reset to 0 by str_split_end.
*/
int splitStringsCount;
};
#define str_split_infop struct str_split_info *
/* Split a string by a delimiting string
 *
 * Reads info->source and info->delimiter and fills in
 * info->splitStrings, info->splitStringsLengths and
 * info->splitStringsCount. Each substring is a separately allocated,
 * NUL-terminated copy; empty substrings are kept, so a source with n
 * non-overlapping delimiter occurrences yields n + 1 substrings.
 * An empty delimiter never matches and yields one copy of the source.
 *
 * If an allocation fails nothing is leaked and the result is left
 * empty (count 0, both arrays NULL).
 *
 * The caller is responsible only for calling str_split_end
 * when finished with the results in 'info'
 */
void str_split_begin (str_split_infop info) {
  //start from a well-defined empty result so every exit path is safe
  info->splitStrings = NULL;
  info->splitStringsLengths = NULL;
  info->splitStringsCount = 0;

  const char * source = info->source;
  const char * delimiter = info->delimiter;
  size_t delimiterLength = strlen(delimiter);

  //first pass, simply count occurrences so we can allocate only once
  //strstr restarts correctly after partial matches
  int count = 1;
  if (delimiterLength > 0) {
    const char * hit = strstr(source, delimiter);
    while (hit != NULL) {
      count++;
      hit = strstr(hit + delimiterLength, delimiter);
    }
  }

  //allocate arrays since we know the count
  str * strings = (str *) malloc(sizeof (str) * count);
  int * lengths = (int *) malloc(sizeof (int) * count);
  if (strings == NULL || lengths == NULL) {
    free(strings);
    free(lengths);
    return;
  }

  //second pass, fill the arrays
  const char * begin = source;
  for (int index = 0; index < count; index++) {
    //every piece but the last ends at the next delimiter; the last one
    //runs to the end of the source
    bool isLast = (index + 1 == count);
    const char * end = isLast ? begin + strlen(begin) : strstr(begin, delimiter);
    size_t length = (size_t) (end - begin);

    //+1 for trailing null for c-string
    strings[index] = (str) malloc(length + 1);
    if (strings[index] == NULL) {
      //undo the pieces allocated so far and report an empty result
      while (index-- > 0) {
        free(strings[index]);
      }
      free(strings);
      free(lengths);
      return;
    }
    memcpy(strings[index], begin, length);
    strings[index][length] = '\0';
    lengths[index] = (int) length;

    //next substring starts right after the current delimiter
    if (!isLast) {
      begin = end + delimiterLength;
    }
  }

  info->splitStrings = strings;
  info->splitStringsLengths = lengths;
  info->splitStringsCount = count;
}
/* Returns the number of substrings currently recorded in 'info'. */
int str_split_count (str_split_infop info) {
  const int count = info->splitStringsCount;
  return count;
}
/* Copies every substring in 'info' into the caller's buffers.
 *
 * out[i] receives a strcpy of info->splitStrings[i]; the copy is not
 * bounded, so each out[i] must already be large enough.
 */
void str_split_get (str_split_infop info, str * out) {
  int index = 0;
  while (index < info->splitStringsCount) {
    strcpy(out[index], info->splitStrings[index]);
    index++;
  }
}
/* Releases everything str_split_begin allocated in 'info'.
 *
 * Does nothing when the count is not positive or the string array is
 * NULL. Afterwards the count is 0; the array pointers are left as is.
 */
void str_split_end (str_split_infop info) {
  //nothing recorded, nothing to release
  if (info->splitStringsCount <= 0 || info->splitStrings == NULL) {
    return;
  }

  //release each substring, front to back
  int remaining = info->splitStringsCount;
  str * cursor = info->splitStrings;
  while (remaining-- > 0) {
    free(*cursor++);
  }

  //then the two arrays themselves
  free(info->splitStrings);
  free(info->splitStringsLengths);

  info->splitStringsCount = 0;
}
/* Smoke test: split a sentence on spaces, walk the results, clean up. */
void str_split_test () {
  struct str_split_info info;
  info.source = "hello world this is a test";
  info.delimiter = " ";

  str_split_begin (&info);

  //visit each substring slot; the results are gone after str_split_end
  int index = 0;
  while (index < info.splitStringsCount) {
    // info.splitStrings[index];
    index++;
  }

  str_split_end(&info);
}
#endif
I tried to make a very simple one. I am also showing an example in main().
#include <stdio.h>
#include <string.h>
/* Tokenize inputArr in place with strtok and store a pointer to each
 * token in consecutive slots of outputArr.
 *
 * inputArr is modified. No terminator or count is written to
 * outputArr and no bound is checked, so the caller must supply enough
 * slots and know how many tokens to expect.
 */
void split(char* inputArr, char** outputArr, char* delim) {
    char **slot = outputArr;
    char *piece = strtok(inputArr, delim);

    while (piece != NULL) {
        *slot++ = piece;
        piece = strtok(NULL, delim);
    }
}
/* Reads the text file named by argv[1] line by line, splits each line
 * on spaces and prints its first four tokens.
 * Returns 1 if the file cannot be opened, 0 otherwise. */
int main(int argc, char **argv){
    enum { BUFFER_LEN = 100, MAX_TOKENS = 100, SHOWN_TOKENS = 4 };

    /* check for proper arguments */
    if(argc != 2){
        printf("One Argument Expected\n");
    } else {
        printf("\n");
        /*---------main code starts here----------*/
        FILE * myScriptFile = fopen(argv[1], "r");
        if (myScriptFile == NULL) {
            /* fgets on a NULL stream is undefined; report and stop */
            perror(argv[1]);
            return 1;
        }
        /* read txt file and split into array like java split() */
        char buffer[BUFFER_LEN];
        char *splitArr[MAX_TOKENS];
        while(fgets(buffer, sizeof buffer, myScriptFile) != NULL){
            /* fgets keeps the newline; drop it so it is not part of the last token */
            buffer[strcspn(buffer, "\n")] = '\0';
            /* split() writes only as many slots as there are tokens, so
             * clear the ones printed below to avoid reading stale pointers */
            for (int i = 0; i < SHOWN_TOKENS; i++) {
                splitArr[i] = NULL;
            }
            split(buffer, splitArr, " ");
            for (int i = 0; i < SHOWN_TOKENS; i++) {
                printf("Index %d String: %s\n", i,
                       splitArr[i] != NULL ? splitArr[i] : "(null)");
            }
        }
        fclose(myScriptFile);
    }
    printf("\nProgram-Script Ended\n");
    return 0;
}
Assume a .txt file has
Hello this is test
Hello2 this is test2
running it with a .txt file as a parameter would give
Index 0 String: Hello
Index 1 String: this
Index 2 String: is
Index 3 String: test
Index 0 String: Hello2
Index 1 String: this
Index 2 String: is
Index 3 String: test2

Resources