string tokenizer function error segmented core - c

I'm trying to create a tokenizer for a dynamic string input but I'm getting the error "segmented fault(core dump)' when I try to run it.Here's the code for the function and an execution example
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int main()
{int num_words = 0;
char **words = NULL;
char **tokenize_string(char *str,int num_words)
{
num_words = 0;
// reinitialize -> 0 for each read
char **words = NULL; // words array of strings that will be returned
char *word = strtok(str, " ");
while (word != NULL)
{
words = (char **)realloc(words, sizeof(char *) * (num_words + 1)); // dynamically reallocate the array of strings for each iteration : size of pointer to character (string) * number of words tokenized
words[num_words] = (char *)malloc(sizeof(char) * (strlen(word) + 1));
// allocate size of tokenized string
strcpy(words[num_words], word);
num_words++;
word = strtok(NULL, " ");
}
// Append a null character at the end of the array
words = (char **)realloc(words, sizeof(char *) * (num_words + 1));
words[num_words] = NULL;
return words;
}
words = tokenize_string("ls wc tr qx",num_words);
printf("%d",num_words);
}

Related

Function to split a string and return every word in the string as an array of strings

I am trying to create a function that will accept a string, and return an array of words in the string. Here is my attempt:
#include "main.h"
/**
* str_split - Splits a string
* #str: The string that will be splited
*
* Return: On success, it returns the new array
* of strings. On failure, it returns NULL
*/
char **str_split(char *str)
{
char *piece, **str_arr = NULL, *str_cpy;
int number_of_words = 0, i;
if (str == NULL)
{
return (NULL);
}
str_cpy = str;
piece = strtok(str_cpy, " ");
while (piece != NULL)
{
if ((*piece) == '\n')
{
piece = strtok(NULL, " ");
continue;
}
number_of_words++;
piece = strtok(NULL, " ");
}
str_arr = (char **)malloc(sizeof(char *) * number_of_words);
piece = strtok(str, " ");
for (i = 0; piece != NULL; i++)
{
if ((*piece) == '\n')
{
piece = strtok(NULL, " ");
continue;
}
str_arr[i] = (char *)malloc(sizeof(char) * (strlen(piece) + 1));
strcpy(str_arr[i], piece);
piece = strtok(NULL, " ");
}
return (str_arr);
}
Once I compile my file, I should be getting:
Hello
World
But I am getting:
Hello
Why is this happening? I have tried to dynamically allocate memory for the new string array, by going through the copy of the original string and keeping track of the number of words. Is this happening because the space allocated for the array of strings is not enough?
The code seems fine overall, with just some issues:
You tried to copy str, as strtok modifies it while parsing.
This is the right approach. However, the following line is wrong:
str_cpy = str;
This is not a copy of strings, it is only copying the address of the string. You can use strdup function here.
Also, you need to return the number of words counted otherwise the caller will not know how many were parsed.
Finally, be careful when you define the string to be passed to this function. If you call it with:
char **arr = str_split ("Hello World", &nwords);
Or even with:
char *str = "Hello World";
char **arr = str_split (str, &nwords);
The program will crash as str here is read-only (see this).
Taking care of these, the program should work with:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/**
* str_split - Splits a string
* #str: The string that will be splited
*
* Return: On success, it returns the new array
* of strings. On failure, it returns NULL
*/
char **str_split(char *str, int *number_of_words)
{
char *piece, **str_arr = NULL, *str_cpy = NULL;
int i = 0;
if (str == NULL)
{
return (NULL);
}
str_cpy = strdup (str);
piece = strtok(str_cpy, " ");
while (piece != NULL)
{
if ((*piece) == '\n')
{
piece = strtok(NULL, " ");
continue;
}
(*number_of_words)++;
piece = strtok(NULL, " ");
}
str_arr = (char **)malloc(sizeof(char *) * (*number_of_words));
piece = strtok(str, " ");
for (i = 0; piece != NULL; i++)
{
if ((*piece) == '\n')
{
piece = strtok(NULL, " ");
continue;
}
str_arr[i] = (char *)malloc(sizeof(char) * (strlen(piece) + 1));
strcpy(str_arr[i], piece);
piece = strtok(NULL, " ");
}
if (str_cpy)
free (str_cpy);
return (str_arr);
}
int main ()
{
int nwords = 0;
char str[] = "Hello World";
char **arr = str_split (str, &nwords);
for (int i = 0; i < nwords; i++) {
printf ("word %d: %s\n", i, arr[i]);
}
// Needs to free allocated memory...
}
Testing:
$ gcc main.c && ./a.out
word 0: Hello
word 1: World

Extracting the first two words in a sentence in C without pointers

I am getting used to writing eBPF code as of now and want to avoid using pointers in my BPF text due to how difficult it is to get a correct output out of it. Using strtok() seems to be out of the question due to all of the example codes requiring pointers. I also want to expand it to CSV files in the future since this is a means of practice for me. I was able to find another user's code here but it gives me an error with the BCC terminal due to the one pointer.
char str[256];
bpf_probe_read_user(&str, sizeof(str), (void *)PT_REGS_RC(ctx));
char token[] = strtok(str, ",");
char input[] ="first second third forth";
char delimiter[] = " ";
char firstWord, *secondWord, *remainder, *context;
int inputLength = strlen(input);
char *inputCopy = (char*) calloc(inputLength + 1, sizeof(char));
strncpy(inputCopy, input, inputLength);
str = strtok_r (inputCopy, delimiter, &context);
secondWord = strtok_r (NULL, delimiter, &context);
remainder = context;
getchar();
free(inputCopy);
Pointers are powerful, and you wont be able to avoid them for very long. The time you invest in learning them is definitively worth it.
Here is an example:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
Extracts the word with the index "n" in the string "str".
Words are delimited by a blank space or the end of the string.
}*/
char *getWord(char *str, int n)
{
int words = 0;
int length = 0;
int beginIndex = 0;
int endIndex = 0;
char currentchar;
while ((currentchar = str[endIndex++]) != '\0')
{
if (currentchar == ' ')
{
if (n == words)
break;
if (length > 0)
words++;
length = 0;
beginIndex = endIndex;
continue;
}
length++;
}
if (n == words)
{
char *result = malloc(sizeof(char) * length + 1);
if (result == NULL)
{
printf("Error while allocating memory!\n");
exit(1);
}
memcpy(result, str + beginIndex, length);
result[length] = '\0';
return result;
}else
return NULL;
}
You can easily use the function:
int main(int argc, char *argv[])
{
char string[] = "Pointers are cool!";
char *word = getWord(string, 2);
printf("The third word is: '%s'\n", word);
free(word); //Don't forget to de-allocate the memory!
return 0;
}

pointer being realloc'd was not allocated?

Maybe it's a stupid question, but I get stuck here for a while.
Let's say freq_tostring() converts a word frequency freq into string, and freq_intostream() appends that string to the end of a stream.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
typedef struct {
char *word; // null-terminated
int freq;
} freq;
/**
* Constructor
*/
void new_freq(freq *fq, const char *word, const int freq) {
fq->word = (char *)malloc((strlen(word) + 1) * sizeof(char)); // +1 for null-terminator
strcpy(fq->word, word);
fq->freq = freq;
}
/**
* Free memory
*/
void dispose_freq(void *fq) {
freq *p = (freq *)fq;
free(p->word);
p->word = NULL;
}
/**
* snprintf() will terminate the string with a null character, unless buf_size is zero.
*/
char *freq_tostring(freq *fq) {
size_t wordlen = strlen(fq->word);
char *buffer = (char *)malloc(wordlen + 16); // maximum integer has 10 digits
snprintf(buffer, wordlen + 16, "[%s, %d]\n", fq->word, fq->freq);
return buffer;
}
/**
* Append the string of freq to the end of stream.
*/
void freq_intostream(void *elem, void *stream) {
freq *fq = (freq *)elem;
char *str = *(char **)stream;
size_t strsize = strlen(str);
// printf("Stream = \"%s\", length = %lu\n", str, strsize);
char *word = freq_tostring(fq);
size_t wordsize = strlen(word);
// printf("Element = \"%s\"%lu\n", word, wordsize);
char *temp = (char *)realloc(str, strsize + wordsize + 1);
strcpy(temp + strsize, word);
temp[strsize + wordsize] = '\0';
// printf("After strcpy(): \"%s\"\n", temp);
str = temp;
free(word);
}
int main(void) {
freq apple, banana, kiwi;
new_freq(&apple, "apple", 3);
new_freq(&banana, "banana", 2);
new_freq(&kiwi, "kiwi", 5);
char *buffer = (char *)malloc(1);
buffer[0] = '\0';
freq_intostream(&apple, &buffer);
freq_intostream(&banana, &buffer);
freq_intostream(&kiwi, &buffer);
assert(strlen(buffer) == 33);
assert(strcmp(buffer, "[apple, 3]\n[banana, 2]\n[kiwi, 5]\n") == 0);
dispose_freq(&apple);
dispose_freq(&banana);
dispose_freq(&kiwi);
free(buffer);
}
The weird thing is, when I run 10 times, it gives me about 9 pointer being realloc'd was not allocated, but maybe in 1~2 cases, everything is ok.
If I comment out the printf(), it shows that before appending the third element kiwi, the stream is empty, and that could be why realloc is failed. But I'm sure that I pass a pointer of char * stream to the freq_intostream() function, which is a char ** for sure. I can't find out what's the problem, anyone can help?
You've done the equivalent of i = j; i = 3; when you wanted j = 3;. Obviously, these don't do the same thing. Have a close look at the marked line in this funciton:
/**
* Append the string of freq to the end of stream.
*/
void freq_intostream(void *elem, void *stream) {
freq *fq = (freq *)elem;
char *str = *(char **)stream;
size_t strsize = strlen(str);
// printf("Stream = \"%s\", length = %lu\n", str, strsize);
char *word = freq_tostring(fq);
size_t wordsize = strlen(word);
// printf("Element = \"%s\"%lu\n", word, wordsize);
char *temp = (char *)realloc(str, strsize + wordsize + 1);
strcpy(temp + strsize, word);
temp[strsize + wordsize] = '\0';
// printf("After strcpy(): \"%s\"\n", temp);
str = temp; // OOPS!!
free(word);
}
You change the value of str, but str is a local to this function and its value is thrown away as soon as the function ends.
You wanted: *(char**)stream = temp; to change the value the caller passed you a pointer to.
This code would be much simpler if you get rid of all the casts. If elem were of type char **, you could just do *elem = temp; and the code would be much easier to understand.

Free, invalid pointer

I have a program, that splits strings based on the delimiter. I have also, 2 other functions, one that prints the returned array and another that frees the array.
My program prints the array and returns an error when the free array method is called. Below is the full code.
#include "stringsplit.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
/* Split string by another string, return split parts + NULL in array.
*
* Parameters:
* str: the string to split
* split: the string to split str with
*
* Returns:
* A dynamically reserved array of dynamically reserved string parts.
*
* For example called with "Test string split" and " ",
* returns ["Test", "string", "split", NULL].
* Or called with "Another - test" and " - ",
* returns ["Another", "test", NULL].
*/
unsigned long int getNofTokens(const char *string) {
char *stringCopy;
unsigned long int stringLength;
unsigned long int count = 0;
stringLength = (unsigned)strlen(string);
stringCopy = malloc((stringLength + 1) * sizeof(char));
strcpy(stringCopy, string);
if (strtok(stringCopy, " \t") != NULL) {
count++;
while (strtok(NULL, " \t") != NULL)
count++;
}
free(stringCopy);
return count;
}
char **split_string(const char *str, const char *split) {
unsigned long int count = getNofTokens(str);
char **result;
result = malloc(sizeof(char *) * count + 1);
char *tmp = malloc(sizeof(char) * strlen(str));
strcpy(tmp, str);
char *token = strtok(tmp, split);
int idx = 0;
while (token != NULL) {
result[idx++] = token;
token = strtok(NULL, split);
}
return result;
}
void print_split_string(char **split_string) {
for (int i = 0; split_string[i] != NULL; i++) {
printf("%s\n", split_string[i]);
}
}
void free_split_string(char **split_string) {
for (int i = 0; split_string[i] != NULL; i++) {
char *currentPointer = split_string[i];
free(currentPointer);
}
free(split_string);
}
Also, do I need to explicitly add \0 at the end of the array or does strtok add it automatically?
There are some problems in your code:
[Major] the function getNofTokens() does not take the separator string as an argument, it counts the number of words separated by blanks, potentially returning an inconsistent count to its caller.
[Major] the size allocated in result = malloc(sizeof(char *) * count + 1); is incorrect: it should be:
result = malloc(sizeof(char *) * (count + 1));
Storing the trailing NULL pointer will write beyond the end of the allocated space.
[Major] storing the said NULL terminator at the end of the array is indeed necessary, as the block of memory returned by malloc() is uninitialized.
[Major] the copy of the string allocated and parsed by split_string cannot be safely freed because the pointer tmp is not saved anywhere. The pointer to the first token will be different from tmp in 2 cases: if the string contains only delimiters (no token found) or if the string starts with a delimiter (the initial delimiters will be skipped). In order to simplify the code and make it reliable, each token could be duplicated and tmp should be freed. In fact your free_split_string() function relies on this behavior. With the current implementation, the behavior is undefined.
[Minor] you use unsigned long and int inconsistently for strings lengths and array index variables. For consistency, you should use size_t for both.
[Remark] you should allocate string copies with strdup(). If this POSIX standard function is not available on your system, write a simple implementation.
[Major] you never test for memory allocation failure. This is OK for testing purposes and throw away code, but such potential failures should always be accounted for in production code.
[Remark] strtok() is a tricky function to use: it modifies the source string and keeps a hidden static state that makes it non-reentrant. You should avoid using this function although in this particular case it performs correctly, but if the caller of split_string or getNofTokens relied on this hidden state being preserved, it would get unexpected behavior.
Here is a modified version:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stringsplit.h"
/* Split string by another string, return split parts + NULL in array.
*
* Parameters:
* str: the string to split
* split: the string to split str with
*
* Returns:
* A dynamically reserved array of dynamically reserved string parts.
*
* For example called with "Test string split" and " ",
* returns ["Test", "string", "split", NULL].
* Or called with "Another - test" and " - ",
* returns ["Another", "test", NULL].
*/
size_t getNofTokens(const char *string, const char *split) {
char *tmp = strdup(string);
size_t count = 0;
if (strtok(tmp, split) != NULL) {
count++;
while (strtok(NULL, split) != NULL)
count++;
}
free(tmp);
return count;
}
char **split_string(const char *str, const char *split) {
size_t count = getNofTokens(str, split);
char **result = malloc(sizeof(*result) * (count + 1));
char *tmp = strdup(str);
char *token = strtok(tmp, split);
size_t idx = 0;
while (token != NULL && idx < count) {
result[idx++] = strdup(token);
token = strtok(NULL, split);
}
result[idx] = NULL;
free(tmp);
return result;
}
void print_split_string(char **split_string) {
for (size_t i = 0; split_string[i] != NULL; i++) {
printf("%s\n", split_string[i]);
}
}
void free_split_string(char **split_string) {
for (size_t i = 0; split_string[i] != NULL; i++) {
free(split_string[i]);
}
free(split_string);
}
Here is an alternative without strtok() and without intermediary allocations:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stringsplit.h"
size_t getNofTokens(const char *str, const char *split) {
size_t count = 0;
size_t pos = 0, len;
for (pos = 0;; pos += len) {
pos += strspn(str + pos, split); // skip delimiters
len = strcspn(str + pos, split); // parse token
if (len == '\0')
break;
count++;
}
return count;
}
char **split_string(const char *str, const char *split) {
size_t count = getNofTokens(str, split);
char **result = malloc(sizeof(*result) * (count + 1));
size_t pos, len, idx;
for (pos = 0, idx = 0; idx < count; pos += len, idx++) {
pos += strspn(str + pos, split); // skip delimiters
len = strcspn(str + pos, split); // parse token
if (len == '\0')
break;
result[idx] = strndup(str + pos, len);
}
result[idx] = NULL;
return result;
}
void print_split_string(char **split_string) {
for (size_t i = 0; split_string[i] != NULL; i++) {
printf("%s\n", split_string[i]);
}
}
void free_split_string(char **split_string) {
for (size_t i = 0; split_string[i] != NULL; i++) {
free(split_string[i]);
}
free(split_string);
}
EDIT After re-reading the specification in your comment, there seems to be some potential confusion as to the semantics of the split argument:
if split is a set of delimiters, the above code does the job. And the examples will be split as expected.
if split is an actual string to match explicitly, the above code only works by coincidence on the examples given in the comment.
To implement the latter semantics, you should use strstr() to search for the split substring in both getNofTokens and split_string.
Here is an example:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "stringsplit.h"
/* Split string by another string, return split parts + NULL in array.
*
* Parameters:
* str: the string to split
* split: the string to split str with
*
* Returns:
* A dynamically reserved array of dynamically reserved string parts.
*
* For example called with "Test string split" and " ",
* returns ["Test", "string", "split", NULL].
* Or called with "Another - test" and " - ",
* returns ["Another", "test", NULL].
*/
size_t getNofTokens(const char *str, const char *split) {
const char *p;
size_t count = 1;
size_t len = strlen(split);
if (len == 0)
return strlen(str);
for (p = str; (p = strstr(p, split)) != NULL; p += len)
count++;
return count;
}
char **split_string(const char *str, const char *split) {
size_t count = getNofTokens(str, split);
char **result = malloc(sizeof(*result) * (count + 1));
size_t len = strlen(split);
size_t idx;
const char *p = str;
for (idx = 0; idx < count; idx++) {
const char *q = strstr(p, split);
if (q == NULL) {
q = p + strlen(p);
} else
if (q == p && *q != '\0') {
q++;
}
result[idx] = strndup(p, q - p);
p = q + len;
}
result[idx] = NULL;
return result;
}
void print_split_string(char **split_string) {
for (size_t i = 0; split_string[i] != NULL; i++) {
printf("%s\n", split_string[i]);
}
}
void free_split_string(char **split_string) {
for (size_t i = 0; split_string[i] != NULL; i++) {
free(split_string[i]);
}
free(split_string);
}
When debugging, take note of values that you got from malloc, strdup, etc. Let's call these values "the active set". It's just a name, so that we can refer to them. You get a pointer from those functions, you mentally add it to the active set. When you call free, you can only pass values from the active set, and after free returns, you mentally remove them from the set. Any other use of free is invalid and a bug.
You can easily find this out by putting breakpoints after all memory allocations, so that you can write down the pointer values, and then breakpoints on all frees, so that you can see if one of those pointer values got passed to free - since, again, to do otherwise is to misuse free.
This can be done also using "printf" debugging. Like this:
char *buf = malloc(...); // or strdup, or ...
fprintf(stderr, "+++ Alloc %8p\n", buf);
And then whenever you have free, do it again:
fprintf(stderr, "--- Free %8p\n", ptr);
free(ptr);
In the output of the program, you must be able to match every +++ with ---. If you see any --- with a value that wasn't earlier listed with a +++, there's your problem: that's the buggy invocation of free :)
I suggest using fprintf(stderr, ... instead of printf(..., since the former is typically unbuffered, so if your program crashes, you won't miss any output. printf is buffered on some architectures (and not buffered on others - so much for consistency).

C build string char by char with known MAX length

I'm trying to add characters to a string one by one. I have something like this:
void doline(char *line, char *buffer, char** tokens){
}
and i am calling it like:
char *line = malloc(1025 * sizeof(char *));
fgets(line, 1024, stdin);
int linelength = strlen(line);
if (line[linelength - 1] == '\n'){
line[linelength - 1] = '\0';
}
char ** tokens = (char **) malloc(strlen(line) * sizeof(char *));
char *emptybuffer = malloc(strlen(line) * sizeof(char *));
parseline(line, emptybuffer, tokens);
So doline will go through line and tokenize it based on various conditions and place fragments of it into tokens. I am building the temp string in the variable buffer To do this, I need to go through line character by character.
I am currently doing:
buffer[strlen(buffer)] = line[i];
And then at the end of the loop:
*buffer++ = '\0';
But this is the result:
printf("Working on line: '%s' %d\n", line, strlen(line));
Outputs: Working on line: 'test' 4
But by the end of the function the buffer is:
*buffer++ = '\0';
printf("Buffer at the very end: '%s' %d\n", buffer, strlen(buffer));
Outputs: Buffer at the very end: 'test' 7
So the output is showing that the string is getting messed up. What's the best way to build this string character by character? Are my string manipulations correct?
Any help would be much appreciated!
Thanks!
There were some basic problems so I re-written the program.
#include <stdio.h>
#include <stdlib.h>
#define str_len 180
void tokenize(char *str, char **tokens)
{
int length = 0, index = 0;
int i = 0;
int str_i;
int tok_i;
while(str[length]) {
if (str[length] == ' ') {
/* this charecter is a space, so skip it! */
length++;
index++;
tokens[i] = malloc(sizeof(char) * index);
tok_i = 0;
for (str_i=length-index ; str_i<length; str_i++) {
tokens[i][tok_i] = str[str_i];
tok_i++;
}
tokens[i][tok_i] = '\0';
i++;
index = 0;
}
length++;
index++;
}
/* copy the last word in the string */
tokens[i] = malloc(sizeof(char) * index);
tok_i = 0;
for (str_i=length-index ; str_i<length; str_i++) {
tokens[i][tok_i] = str[str_i];
tok_i++;
}
tokens[i][tok_i] = '\0';
tokens[i++] = NULL;
return;
}
int main()
{
char *str = malloc(str_len * sizeof(char));
char **tokens = malloc(100 * sizeof(char *));
int i = 0;
if (str == NULL || tokens == NULL)
return 1;
gets(str);
printf("input string: %s\n", str);
tokenize(str, tokens);
while(tokens[i] != NULL) {
printf("%d - %s \n", i, tokens[i]);
i++;
}
while(tokens[i])
free(tokens[i]);
free(tokens);
free(str);
return 0;
}
It is compiled and executed as follows:
$ gcc -ggdb -Wall prog.c
$ ./a.out
this is a test string... hello world!!
input string: this is a test string... hello world!!
0 - this
1 - is
2 - a
3 - test
4 - string...
5 - hello
6 - world!!
$
There were few basic assumptions:
the length of the incoming string is assumed to a constant. This can be done dynamically - please check this - How to read a line from the console in C?.
The length of the tokens array is also assumed to be a constant. This can also be changed. I will leave that to you to find out how!
Hope this helps!

Resources