strtok and memory leaks - c

I wrote a simple url parser using strtok(). here's the code
#include <stdio.h>
#include <stdlib.h>
typedef struct {
char *protocol;
char *host;
int port;
char *path;
} aUrl;
/*
 * Splits url into protocol, host, port and path and stores them in ret.
 *
 * Tokenising is done on a _strdup() copy of url, so url itself is not
 * modified. ret->protocol and ret->host point into that copy; ret->path
 * points into a second _strdup() copy. Neither copy is freed here.
 *
 * ret->port is left unassigned when the URL carries no port and the
 * protocol is neither "http" nor "https".
 */
void parse_url(char *url, aUrl *ret) {
printf("Parsing %s\n", url);
char *tmp = (char *)_strdup(url);
//char *protocol, *host, *port, *path;
int len = 0;
// protocol is now, for example, "http:" or "https:"
ret->protocol = (char *) strtok(tmp, "/");
// the +2 accounts for the two '/' characters that follow the protocol
len = strlen(ret->protocol) + 2;
ret->host = (char *) strtok(NULL, "/");
len += strlen(ret->host);
//printf("char at %d => %c", len, url[len]);
// url[len] is the first character after the host[:port] part
ret->path = (char *)_strdup(&url[len]);
// drop the fragment: keep only what precedes the first '#'
ret->path = (char *) strtok(ret->path, "#");
// remove the trailing ':' from the protocol
ret->protocol = (char *) strtok(ret->protocol, ":");
// host is now, for example, address.com:8080
//tmp = (char *)_strdup(host);
//strtok(tmp, ":");
ret->host = (char *) strtok(ret->host, ":");
// tmp is reused here for the port text (NULL when there is none)
tmp = (char *) strtok(NULL, ":");
if(tmp == NULL) {
if(strcmp(ret->protocol, "http") == 0) {
ret->port = 80;
} else if(strcmp(ret->protocol, "https") == 0) {
ret->port = 443;
}
} else {
ret->port = atoi(tmp);
}
//host = (char *) strtok(NULL, "/");
}
/*
*
*/
/* Demo: parses one fixed URL and prints each component of the result. */
int main(int argc, char** argv) {
printf("hello moto\n");
aUrl myUrl;
/* parse_url() works on its own copy, so passing a string literal is fine */
parse_url("http://teste.com/Teste/asdf#coisa", &myUrl);
printf("protocol is %s\nhost is %s\nport is %d\npath is %s\n", myUrl.protocol, myUrl.host, myUrl.port, myUrl.path);
return (EXIT_SUCCESS);
}
As you can see, I use strtok() a lot so I can "slice" the url. I don't need to support urls different than http or https so the way it's done solves all of my problems.
My concern is (this is running on an embedded device) - Am I wasting memory ?
When I write something like
ret->protocol = (char *) strtok(tmp, "/");
And then later call
ret->protocol = (char *) strtok(ret->protocol, ":");
Does the memory my first ret->protocol pointer referred to remain allocated? I thought that maybe I should assign the result of the first call to a tmp pointer, call strtok again to point ret->protocol at the right portion of the string (the second call), and then free(tmp).
What should be the best way to use strtok ?

To answer your question directly, strtok only returns a pointer to a location inside the string you give it as input-- it doesn't allocate new memory for you, so shouldn't need to call free on any of the pointers it gives you back in return.
For what it's worth, you could also look into "strchr" and "strstr", which are nondestructive ways of searching for single characters or sequences within strings.
Also note that your memory allocation is problematic here-- you're using strdup() to allocate a new string inside your parse function, and then you're assigning fragments of that memory block to fields of "ret". Your caller will thus be responsible for free'ing the strdup'd string, but since you're only passing that string back implicitly inside ret, the caller needs to know magically what pointer to pass to free. (Probably ret->protocol, but maybe not, depending on how the input looks.)

strtok modifies the string in place, replacing the delimiter that ends each token with a NUL character ('\0'). Since strings in C are NUL-terminated, it now appears that your original pointer is pointing to a shorter string, even though the original string is still there and still occupies the same amount of memory (but with delimiters replaced by NUL). The end of the string, I think, contains a double-NUL.
The short answer is this: Keep a pointer to the beginning of your string buffer, and have another pointer that is your "current" pointer into the string as you parse it. When you use strtok or iterate over the string in other ways you update the "current" pointer but leave the beginning pointer alone. When you're finished, free() the beginning pointer. No memory leaked.

Do you know you can continue parsing the string using NULL as first parameter of strtok?
First call:
char* token = strtok(string, delimiters);
Then:
token = strtok(NULL, other_delimiters);
This allow you to simplify your code:
/*
 * Splits a URL of the form "scheme://host/path#fragment" into its parts.
 *
 * url: writable, NUL-terminated string. strtok() overwrites delimiters in
 *      it, so a string literal must not be passed.
 * ret: ret->protocol, ret->host and ret->path must already point to
 *      writable buffers; strlen(url) + 3 bytes each is always enough.
 *      ret->port is not touched.
 *
 * On success ret->protocol holds the scheme followed by "//" (for example
 * "http://"), ret->host the host[:port] text, and ret->path the path
 * without its leading '/' and without the fragment.
 *
 * Returns 0 on success, -1 if an argument is NULL or a part is missing.
 */
int parse_url(char *url, aUrl *ret)
{
    if (url == NULL || ret == NULL)
        return -1;

    // get protocol: the first token is e.g. "http:"
    char *token = strtok(url, "/");
    if (token == NULL)
        return -1;
    strcpy(ret->protocol, token);
    strcat(ret->protocol, "//");

    // get host: strtok() treats a run of delimiters as a single separator,
    // so the "//" after the scheme needs no separate call to be skipped
    token = strtok(NULL, "/");
    if (token == NULL)
        return -1;
    strcpy(ret->host, token);

    // get path: everything up to the fragment marker
    token = strtok(NULL, "#");
    if (token == NULL)
        return -1;
    strcpy(ret->path, token);

    // ...
    return 0;
}
Note that I added a return value so the caller can tell whether parsing succeeded.

Thanks for sharing your code! I ran it inside valgrind and fixed two memory leaks generated by strdup functions.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * A parsed URL.
 *
 * protocol owns one heap buffer, and host points into that same buffer.
 * path owns a second heap buffer. Release both with free_url().
 * After a failed parse every pointer is NULL and port is 0.
 */
typedef struct {
    char *protocol;
    char *host;
    int port;
    char *path;
} URL;

/* Returns a NUL-terminated heap copy of the first len bytes of src, or NULL. */
static char *copy_range(const char *src, size_t len) {
    char *dst = malloc(len + 1);
    if (dst != NULL) {
        memcpy(dst, src, len);
        dst[len] = '\0';
    }
    return dst;
}

/*
 * Parses "scheme://host[:port][/path][#fragment]" into *ret.
 *
 * url: NUL-terminated input; it is only read, never modified.
 * ret: receives the result and must not be NULL. Every field is written
 *      on every call, so the struct never holds indeterminate values.
 *
 * On success:
 *   protocol - scheme without the ':' (e.g. "http")
 *   host     - host name without the port
 *   port     - explicit port if it is a number in 0..65535; 0 if an
 *              explicit port is present but invalid; otherwise 80 for
 *              "http", 443 for "https" and 0 for any other scheme
 *   path     - everything after the host[:port] up to, but excluding, the
 *              first '#' (so a query string stays part of it); "" if absent
 *
 * If url is NULL, contains no "://", or memory runs out, all pointers are
 * set to NULL and port to 0. Bracketed IPv6 hosts and "user@" prefixes are
 * not handled specially.
 */
void parse_url(char *url, URL *ret) {
    ret->protocol = NULL;
    ret->host = NULL;
    ret->port = 0;
    ret->path = NULL;

    if (url == NULL) {
        return;
    }

    char *buf = copy_range(url, strlen(url));
    if (buf == NULL) {
        return;
    }

    char *scheme_end = strstr(buf, "://");
    if (scheme_end == NULL) {
        free(buf);
        return;
    }

    char *authority = scheme_end + 3;
    char *rest = authority + strcspn(authority, "/?#");

    /* Copy the path before the working buffer is cut at `rest`. */
    char *path = copy_range(rest, strcspn(rest, "#"));
    if (path == NULL) {
        free(buf);
        return;
    }

    *scheme_end = '\0';
    *rest = '\0';

    ret->protocol = buf;
    ret->host = authority;
    ret->path = path;

    if (strcmp(buf, "http") == 0) {
        ret->port = 80;
    } else if (strcmp(buf, "https") == 0) {
        ret->port = 443;
    }

    char *colon = strchr(authority, ':');
    if (colon != NULL) {
        *colon = '\0';
        /* "host:" with nothing after the colon keeps the scheme default. */
        if (colon[1] != '\0') {
            char *end = NULL;
            long value = strtol(colon + 1, &end, 10);
            ret->port = (*end == '\0' && value >= 0 && value <= 65535)
                            ? (int) value
                            : 0;
        }
    }
}

/*
 * Releases the buffers owned by *url and clears its pointers, so calling
 * it twice, or on the result of a failed parse, is harmless.
 */
void free_url(URL *url) {
    if (url == NULL) {
        return;
    }
    free(url->path);
    free(url->protocol); /* host lives inside this buffer */
    url->path = NULL;
    url->protocol = NULL;
    url->host = NULL;
}
/* Demo driver: parse one URL, print its parts, then release the buffers. */
int main(int argc, char** argv) {
    URL parsed;

    parse_url("http://example.com:3000/Teste/asdf#coisa", &parsed);

    printf("protocol: %s\nhost: %s\nport: %d\npath: %s\n",
           parsed.protocol,
           parsed.host,
           parsed.port,
           parsed.path);

    free_url(&parsed);
    return EXIT_SUCCESS;
}

Related

Conditional jump or move depends on uninitialised value(s) in C

I have this function that return a parsed_url structure that looks like this
typedef struct url_parser_url {
char *protocol;
char *host;
int port;
char *path;
char *query_string;
int host_exists;
} url_parser_url_t;
/* Caller side: allocate the result struct, fill it in, then read one field. */
url_parser_url_t *parsed_url;
parsed_url = (url_parser_url_t *) malloc(sizeof(url_parser_url_t));
/* second argument true: parse_url() also looks the host up */
parse_url(address, true, parsed_url);
printf("parsed_url->path = %s\n", parsed_url->path);
The parse_url function looks like
/*
 * Splits url into protocol, host, port, path and query string and stores
 * malloc'd copies of them in *parsed_url.
 *
 * url:         input string; a private copy is tokenised, url is not modified.
 * verify_host: when true, the host is looked up with gethostbyname() and
 *              host_exists is set to 1 or 0; otherwise host_exists is -1.
 * parsed_url:  receives the result. The strings stored in it are allocated
 *              here and are not freed by this function.
 *
 * Always returns 0.
 */
int parse_url(char *url, bool verify_host, url_parser_url_t *parsed_url) {
char *local_url = (char *) malloc(sizeof(char) * (strlen(url) + 1));
char *token;
char *token_host;
char *host_port;
char *token_ptr;
char *host_token_ptr;
char *path = NULL;
strcpy(local_url, url);
/* protocol: everything before the first ':' */
token = strtok_r(local_url, ":", &token_ptr);
parsed_url->protocol = (char *) malloc(sizeof(char) * strlen(token) + 1);
strcpy(parsed_url->protocol, token);
/* host[:port]: the next '/'-delimited token (leading '/' are skipped) */
token = strtok_r(NULL, "/", &token_ptr);
if (token) {
host_port = (char *) malloc(sizeof(char) * (strlen(token) + 1));
strcpy(host_port, token);
} else {
/* no host part: continue with an empty string */
host_port = (char *) malloc(sizeof(char) * 1);
strcpy(host_port, "");
}
/* host_port is tokenised separately, with its own strtok_r state */
token_host = strtok_r(host_port, ":", &host_token_ptr);
if (token_host) {
parsed_url->host = (char *) malloc(
sizeof(char) * strlen(token_host) + 1);
strcpy(parsed_url->host, token_host);
if (verify_host) {
struct hostent *host;
host = gethostbyname(parsed_url->host);
if (host != NULL) {
parsed_url->host_exists = 1;
} else {
parsed_url->host_exists = 0;
}
} else {
parsed_url->host_exists = -1;
}
} else {
parsed_url->host_exists = -1;
parsed_url->host = NULL;
}
/* optional port after the host; 0 when absent */
token_host = strtok_r(NULL, ":", &host_token_ptr);
if (token_host)
parsed_url->port = atoi(token_host);
else
parsed_url->port = 0;
/* anything after a second ':' is treated as a programming error */
token_host = strtok_r(NULL, ":", &host_token_ptr);
assert(token_host == NULL);
/* path: the rest of the URL up to '?', stored with a leading '/' */
token = strtok_r(NULL, "?", &token_ptr);
parsed_url->path = NULL;
if (token) {
/* path is NULL here, so this realloc() behaves like malloc() */
path = (char *) realloc(path, sizeof(char) * (strlen(token) + 2));
strcpy(path, "/");
strcat(path, token);
parsed_url->path = (char *) malloc(sizeof(char) * strlen(path) + 1);
/* NOTE: with n == strlen(path), strncpy() copies no terminating '\0',
   and malloc() does not zero the buffer, so the copy is unterminated */
strncpy(parsed_url->path, path, strlen(path));
free(path);
} else {
parsed_url->path = (char *) malloc(sizeof(char) * 2);
strcpy(parsed_url->path, "/");
}
/* query string: the text after the first '?' */
token = strtok_r(NULL, "?", &token_ptr);
if (token) {
parsed_url->query_string = (char *) malloc(
sizeof(char) * (strlen(token) + 1));
/* NOTE: same as for the path, no terminating '\0' is copied here */
strncpy(parsed_url->query_string, token, strlen(token));
} else {
parsed_url->query_string = NULL;
}
/* a second '?' is treated as a programming error */
token = strtok_r(NULL, "?", &token_ptr);
assert(token == NULL);
/* release the two working copies */
free(local_url);
free(host_port);
return 0;
}
The problem is when I call the function parse_url and then I use the parsed_url->path member it throws me this segmentation fault
==16647== Conditional jump or move depends on uninitialised value(s)
Can anyone explain to me what is happening and why? Thank you.
And there it is. Although it is supposed to be a safer alternative to strcpy(), the strncpy() has a nasty wart. The standard says:
The strncpy function copies not more than n characters (characters that follow a null character are not copied) from the array pointed to by s2 to the array pointed to by s1.
(C2011 7.24.2.4/2), and note 308 clarifies that
Thus, if there is no null character in the first n characters of the array pointed to by s2, the result will not be null-terminated.
The code that is ultimately responsible for your particular valgrind complaint is this:
strncpy(parsed_url->path, path, strlen(path));
Since by definition there cannot be a null character within the first strlen(path) characters of path, that strncpy() reliably fails to ensure that the copy is null-terminated. You have at least one other instance of the same problem in your code.
Since you seem to take sufficient care to ensure that enough space is available, one solution would be to switch from strncpy() to strcpy(). That would also be more efficient, because you would avoid duplicate calls to strlen().
As I noted in comments, however, if you're willing to rely on POSIX's strdup(), then that's cleaner than strlen() + malloc() + str[n]cpy(), and has the same semantics (you take responsibility for freeing the memory allocated for the copy). You wouldn't have even had the opportunity to make these errors if you had made your copies that way.

Segfault while accessing memory malloc'd in function

I'm trying to write a function that takes in a path (char *) and splits it into an array of strings based around the '/' delimiter. Simplified code below :
/*
 * Splits path on '/' and builds a NULL-terminated array holding a malloc'd
 * copy of every component. Returns the number of components.
 *
 * strtok() overwrites the '/' characters in path with '\0'.
 *
 * NOTE: out is passed by value, so every assignment to it below changes
 * only this function's local copy; the caller's pointer is never updated.
 */
int split_path(char * path, char ** out) {
out = NULL;
char * token = strtok(path, "/");
int count = 0;
while(token) {
/* grow the array by one slot, then store a copy of the token in it */
out = realloc(out, sizeof(char*) * (++count));
out[count-1] = malloc(sizeof(char) * strlen(token)+1);
strcpy(out[count-1], token);
fprintf(stderr, "%s\n", out[count-1]);
token = strtok(NULL, "/");
}
/* one more slot for the terminating NULL entry */
out = realloc(out, sizeof(char*) * (count+1));
out[count] = NULL;
return count;
}
/* Demo: splits a fixed path and prints the count and the first component. */
int main(int argc, char * argv[]) {
/* an array, not a literal, because split_path() modifies it */
char path[] = "/home/pirates/are/cool/yeah";
/* never initialised; split_path() receives a copy of this value */
char ** out;
int count = split_path(path, out);
fprintf(stdout, "count: %d\n", count);
fprintf(stderr, "1st: %s\n", out[0]); // segfaults here
return 0;
}
All of the print statements in the split_path function print perfectly, the output looks like this :
count: 1, string: home
count: 2, string: pirates
count: 3, string: are
count: 4, string: cool
count: 5, string: yeah
count: 5
1st: ./a.out
[1] 5676 segmentation fault (core dumped) ./a.out
But for some reason when I get back to the main function the double-char-array is no longer valid. I thought that it might be because it was pointing to memory declared in that split_path function but I'm doing strcpy to get the strings into it so it shouldn't be pointing back to memory that is local to that function. Any help is greatly appreciated.
You are mismanaging the out parameter. The out variable in main() is never assigned a valid memory address, hence the segfault. Assigning to the out parameter inside split_path() never updates the out variable in main(). You need to pass the address of the variable to split_path() so that it can update the variable and the caller can access the memory it points to.
Also note that strtok() modifies the string it is parsing, so you should make a copy and then parse the copy so the original does not get destroyed. Otherwise, consider using strchr() instead of strtok().
Try something more like this instead:
/*
 * Splits path on '/' into a NULL-terminated array of heap-allocated strings.
 *
 * path: NUL-terminated string. It is only read: components are located
 *       with strspn()/strcspn(), so no working copy is needed and the
 *       caller's string is left intact.
 * out:  receives the array. The caller frees every element and then the
 *       array itself.
 *
 * Each component is also echoed to stderr, one per line.
 *
 * Returns the number of components. A path without components yields 0 and
 * an array that holds only the NULL terminator. Returns -1 if path or out
 * is NULL or an allocation fails; in that case *out (when out is non-NULL)
 * is NULL and nothing is leaked.
 */
int split_path(char * path, char *** out) {
    if (!out) {
        return -1;
    }
    *out = NULL;
    if (!path) {
        return -1;
    }

    char ** list = NULL;
    int count = 0;
    const char * p = path + strspn(path, "/");

    while (*p != '\0') {
        size_t len = strcspn(p, "/");

        /* +2: one slot for this component, one for the NULL terminator */
        char ** grown = realloc(list, sizeof *list * ((size_t) count + 2));
        if (!grown) {
            goto fail;
        }
        list = grown;

        list[count] = malloc(len + 1);
        if (!list[count]) {
            goto fail;
        }
        memcpy(list[count], p, len);
        list[count][len] = '\0';
        fprintf(stderr, "%s\n", list[count]);
        count++;

        p += len;
        p += strspn(p, "/"); /* a run of '/' counts as one separator */
    }

    if (!list) {
        /* no components at all: still hand back a terminated array */
        list = malloc(sizeof *list);
        if (!list) {
            return -1;
        }
    }
    list[count] = NULL;
    *out = list;
    return count;

fail:
    while (count > 0) {
        free(list[--count]);
    }
    free(list);
    return -1;
}
/* Demo: splits a fixed path, prints the count and the first component. */
int main(int argc, char * argv[]) {
    char path[] = "/home/pirates/are/cool/yeah";
    char ** out = NULL;
    int count = split_path(path, &out);

    fprintf(stdout, "count: %d\n", count);
    /* out[0] is a component only when at least one was found */
    if (count > 0) {
        fprintf(stderr, "1st: %s\n", out[0]);
    }

    /* every component was allocated separately; free them before the array */
    for (int i = 0; i < count; i++) {
        free(out[i]);
    }
    free(out);
    return 0;
}
And don't forget error handling. I've left it out of this example for brevity, but you should not leave it out of your real code.

Programming a Shell in C, Disappearing char** When Passed to Function

I've only found a few threads like this, and none with information that I am able to make any sense of. I'm programming a shell in C and I feel like it should be easy but my C programming is not so fresh. I'm having issues with passing a double pointer and the contents disappearing
I feel I am on the right track, and it sounds like it has something to do with initialization, but I've tried a few things, setting pointers to NULL just to be sure. Thanks.
void runProgram (char **cLine);
char **parse(char *str);
/*
*
*/
/* Shell loop: reads a line, splits it into arguments and passes them on. */
int main(int argc, char** argv)
{
/* line buffer; getline() allocates and grows it */
char *cin = NULL;
/* capacity of cin as maintained by getline() */
ssize_t buffer = 0;
char **tempArgs = NULL;
printf(">");
while(1)
{
getline(&cin, &buffer, stdin);
tempArgs = parse(cin); //malloc, parse, and return
printf("passing %s", tempArgs[0]); //works just fine here, can see the string
runProgram(tempArgs); //enter this function and array is lost
}
return (EXIT_SUCCESS);
}
/*
 * Splits str on spaces and returns a malloc'd, NULL-terminated array of
 * pointers to the pieces.
 *
 * NOTE: the pieces live in the local array tokens, so every pointer stored
 * in args refers to storage that ends when this function returns.
 * NOTE: malloc(256) reserves 256 bytes, not room for 256 pointers.
 */
char** parse( char* str )
{
char *token = NULL;
/* working copy of str; strtok() cuts it into pieces in place */
char tokens[256];
char** args = malloc( 256 );
int i = 0;
strcpy( tokens, str );
args[i] = strtok( tokens, " " );
while( args[i] )
{
i++;
args[i] = strtok(NULL, " ");
}
/* the loop ends with args[i] already NULL; this restates it */
args[i] = NULL;
return args;
}
Visible in main up until this function call
/* Placeholder: main() passes it the argument array returned by parse(). */
void runProgram (char **cLine)
{
//function that calls fork and execvp
}
The simplest fix is not to use tokens at all in the parse() function:
/*
 * Shell loop: prompts, reads one line per iteration, splits it with
 * parse() and passes the arguments to runProgram(). Ends at end of input.
 */
int main(void)
{
    char *buffer = NULL;
    size_t buflen = 0;

    printf("> ");
    fflush(stdout); /* the prompt has no newline, so push it out explicitly */
    while (getline(&buffer, &buflen, stdin) != -1)
    {
        char **tempArgs = parse(buffer);
        if (tempArgs == NULL)
        {
            fprintf(stderr, "parse failed\n");
            break;
        }
        /* a line without any token leaves tempArgs[0] NULL: nothing to run */
        if (tempArgs[0] != NULL)
        {
            printf("passing %s", tempArgs[0]);
            runProgram(tempArgs);
        }
        printf("> ");
        fflush(stdout);
        free(tempArgs);     // Free the space allocated by parse()
    }
    free(buffer);           // Free the space allocated by getline()
    return (EXIT_SUCCESS);
}
/*
 * Splits str on spaces into a NULL-terminated argument array.
 *
 * str: writable, NUL-terminated string. strtok() cuts it in place and the
 *      returned pointers refer into it, so str must outlive the array.
 *      Only ' ' is a delimiter; a trailing newline stays part of the last
 *      token.
 *
 * Returns a malloc'd array with at most MAX_ARGS - 1 tokens followed by a
 * NULL entry (extra tokens are ignored), or NULL if str is NULL or the
 * allocation fails. The caller frees the array with free().
 */
char **parse(char *str)
{
    enum { MAX_ARGS = 256 };

    if (str == NULL)
        return NULL;

    /* room for MAX_ARGS pointers, not MAX_ARGS bytes */
    char **args = malloc(MAX_ARGS * sizeof *args);
    if (args == NULL)
        return NULL;

    int i = 0;
    char *tok = strtok(str, " ");
    /* keep one slot free for the terminating NULL */
    while (tok != NULL && i < MAX_ARGS - 1)
    {
        args[i++] = tok;
        tok = strtok(NULL, " ");
    }
    args[i] = NULL;
    return args;
}
Note that when the loop terminates, the array is already null terminated, so the extra assignment wasn't necessary (but it is better to be safe than sorry).

Strange behavior of String tokenizer in C

I have written the following program to resolve a path to several directory names
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * Copies path into a local buffer, stores the first '/'-separated component
 * in *name and returns strtok_r()'s saved position.
 * Returns NULL when path consists only of '/' characters (or is empty).
 *
 * NOTE: both *name and the returned pointer refer into the local array s,
 * whose storage ends when this function returns.
 */
char *
tokenizer(char *path, char **name){
char s[300];
char *buffer;
/* +1 so the terminating '\0' is copied as well */
memcpy(s, path, strlen(path)+1);
printf("%s\n",s); // PROBLEM
int i=0;
/* count the leading '/' characters */
while(s[i] == '/'){
i++;
}
if (i == strlen(path)){
return NULL;
}
*name = strtok_r(s, "/", &buffer);
return buffer;
}
/* Reads one word from stdin and prints the components tokenizer() yields. */
int main(void){
char str[300];
char *token, *p;
scanf("%s",str);
p = tokenizer(str, &token);
if (p != NULL)
printf("%s\n",token);
else
printf("Nothing left\n");
/* feed the returned position back in until tokenizer() reports the end */
while((p=tokenizer(p, &token)) != NULL){
printf("%s\n",token);
}
}
Output of the above program
Input: a/b/c
Output: a/b/c
a/b/c
a
b/c
b
c
c
If I comment the line labelled PROBLEM
Input: a/b/c
Output: Some garbage value
Can somebody explain me the reason for this strange behavior?
Note:
I have realised that s is a stack-allocated variable and that it ceases to exist once control is back in main(), but why does the program work when I use printf()?
In addition to what geekasaur says:
strtok_r's 3rd parameter is used incorrectly, in two ways:
1. It should be initialized to NULL before the first call.
2. It shouldn't be used in any way (you return it to the caller). It should only be passed to another strtok_r call.
You are returning a pointer into a stack-allocated string (buffer points into s); s's memory ceases to be meaningful after tokenize returns.
You cannot do this
char s[300];
char *buffer;
...
*name = strtok_r(s, "/", &buffer);
return buffer;
Here buffer is a pointer to a s[300] position. s[300] is a function local variable allocated on the stack when the function is called and destroyed when the function returns.
So you are not returning a valid pointer, you cannot use that pointer out of the function.
Along with the observations that you're returning a pointer to a local variable, I think it's worth noting that your tokenizer is almost 100% pointless.
Most of what your tokenizer does is skip across any leading / characters before calling strtok_r -- but you're passing '/' as the delimiter character to strtok_r, which will automatically skip across any leading delimiter characters on it own.
Rather simpler code suffices to print out the components of a path without the delimiters:
/* an array rather than a literal: strtok_r() writes '\0' over each '/' */
char path[] = "a/b/c";
/* position saved by strtok_r() between calls */
char *pos = NULL;
char *component = strtok_r(path, "/", &pos);
while (NULL != component) {
printf("%s\n", component);
/* NULL as first argument: continue where the previous call stopped */
component = strtok_r(NULL, "/", &pos);
}
Try this:
/*
 * Returns the next '/'-terminated component as a freshly allocated string.
 *
 * path: string to start scanning, or NULL to continue after the component
 *       returned by the previous call. The string is only read, but it
 *       must stay valid while scanning continues.
 * name: if non-NULL, *name receives the same pointer that is returned.
 *
 * A component ends at the first '/' or at the end of the string, so two
 * adjacent '/' produce an empty component.
 *
 * Returns NULL when nothing is left to scan, when no scan has been started,
 * or when memory runs out; *name is left untouched in those cases. The
 * caller owns every returned buffer and releases it with free().
 * The scan position is kept in a static variable, so only one string can
 * be scanned at a time.
 */
char*
token(char * path, char ** name){
    static char * obuffer = NULL;
    char * p = (path != NULL) ? path : obuffer;

    /* check before allocating so that the "nothing left" case leaks nothing */
    if(p == NULL || *p == '\0') return NULL;

    /* size the buffer from the component itself instead of a fixed guess */
    size_t len = strcspn(p, "/");
    char * buffer = malloc(len + 1);
    if(!buffer) return NULL;
    memcpy(buffer, p, len);
    buffer[len] = '\0';

    p += len;
    if(*p == '/') p++; /* step over the separator */
    obuffer = p;

    if(name != NULL) *name = buffer;
    return buffer;
}
/* Demo: prints every component of a fixed path, one per line. */
int main(void)
{
    char * s = "foo/baa/hehehe/";
    char * name = NULL;
    char * t = token(s, &name);
    while(t) {
        printf("%s\n", name);
        free(t); /* token() returns a heap buffer for every component */
        t = token(NULL, &name);
    }
    return 0;
}
the output:
foo
baa
hehehe
But you are basically "reinventing the wheel" of strtok() function..

getting the end of an web address in c?

say I pass an argument www.bbc.co.uk/news/world-us-canada-11893886
I need to separate www.bbc.co.uk from /news/world-us-canada-11893886 for a HTTP GET
I have tried using strtok and strcat but I come across weird splits at runtime.
I can get www.bbc.co.uk just fine using strtok( host, "/");
I have tried using a combination of strtok and strcat to try and get all the rest of the string from the first "/" but i get an output like this...
request: da-11893886
tempString: news/world!
host: www.bbc.co.uk
Path: news/world!da-11893886
If you look at this output, the strangest part is that it always cuts out the middle section.
In this case, the "-us-cana"
the section of the code is attached below
// testing purposes
printf("argv[1]: %s\n", argv[1] );
host = malloc(sizeof(argv[1]));
strcpy(host, argv[1]);
host = strtok(host, "/");
// get the request
request = malloc(sizeof(argv[1]) + sizeof(char)*6);
char *tok, *tempString;
tempString = malloc(sizeof(argv[1]));
tok = strtok( NULL, "\0");
while( tok ) {
strcpy(tempString, tok);
printf("request: %s\n", request);
request = strcat(tempString, request);
tok = strtok(NULL, "\0");
}
printf("host: %s\n", host);
printf("Path: %s\n", request);
Thanks for looking over this.
Any direction or even a link to a site where I can figure out how to do this would be much appreciated.
Here's some code that does more than you want. Note that this modifies the original string - you may want to make copies instead:
/*
 * Cuts request in place into protocol, address and path.
 *
 * request:  writable string; a '\0' is written over the start of "://" and
 *           over the first '/' after the address.
 * protocol: receives request itself when "://" is present, otherwise NULL.
 * addr:     receives the text after "://", or request when there is none.
 * path:     receives the text after the first '/' of the address (the '/'
 *           itself is dropped), or NULL when the address has no '/'.
 *
 * All three results point into request; nothing is allocated.
 */
void split_request(char *request, char **protocol, char **addr, char **path)
{
    char *scheme_mark = strstr(request, "://");

    if (scheme_mark != NULL) {
        *protocol = request;
        *addr = scheme_mark + 3;
        *scheme_mark = '\0';
    } else {
        *protocol = NULL;
        *addr = request;
    }

    char *first_slash = strchr(*addr, '/');

    if (first_slash != NULL) {
        *path = first_slash + 1;
        *first_slash = '\0';
    } else {
        *path = NULL;
    }
}
Please excuse any typos/obvious errors. I'm typing this in a hurry as I have work to do :P
It should get you started though.
I have modified your code to work the way you are expecting
/*
 * Splits argv[1] ("host/path...") at its first '/' and prints both halves.
 *
 * One working copy of the argument is made; host and request both point
 * into it, so a single free() releases everything.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "usage: prog host/path\n");
        return 1;
    }

    printf("argv[1]: %s\n", argv[1] );

    /* +1 for the terminating '\0', which strlen() does not count */
    size_t len = strlen(argv[1]);
    char *copy = malloc(len + 1);
    if (copy == NULL) {
        perror("malloc");
        return 1;
    }
    memcpy(copy, argv[1], len + 1);

    /* like strtok(), ignore any '/' in front of the host */
    char *host = copy + strspn(copy, "/");

    /* everything after the first '/' is the request; empty if there is none */
    char *request = "";
    char *slash = strchr(host, '/');
    if (slash != NULL) {
        *slash = '\0';
        request = slash + 1;
    }

    /* strlen() returns size_t, which needs %zu */
    printf("sizeof(tok) %zu\n", strlen(request));
    printf("host: %s\n", host);
    printf("Path: %s\n", request);

    free(copy);
    return 0;
}
~
Output
./tmp www.bbc.co.uk/news/world-us-canada-11893886/tmp.htmlargv[1]: www.bbc.co.uk/news/world-us-canada-11893886/tmp.html
sizeof(tok) 38
host: www.bbc.co.uk
Path: news/world-us-canada-11893886/tmp.html
bash-2.03$
~
Use strrchr() to find the last occurrence of '/' from the rear. You will then have a pointer to the start of 'the end of the web address' if you add one to that returned pointer.
Update
Assuming your URL does not start with http://, this ought to work:
#include <stdio.h>
#include <string.h>
/*
 * Demo: derives three strings from a fixed URL and prints them.
 *   host    - the part before the first '/'
 *   path    - from the first '/' to the end
 *   request - from the last '/' to the end
 */
int main(void)
{
    char url[] = "www.bbc.co.uk/news/world-us-canada-11893886";
    char host[100];
    char path[100];
    char request[100];
    int cnt;

    /* last '/' onwards */
    char *last_slash = strrchr(url, '/');
    strcpy(request, last_slash);

    /* copy everything, then cut the copy at the first '/' */
    strcpy(host, url);
    cnt = strcspn(url, "/");
    host[cnt] = '\0';

    /* first '/' onwards */
    strcpy(path, url + cnt);

    printf("host: %s\npath: %s\nrequest: %s\n", host, path, request);
    return 0;
}
Output
$ ./a.out
host: www.bbc.co.uk
path: /news/world-us-canada-11893886
request: /world-us-canada-11893886
strrchr() returns the LAST instance of the character. He wants the FIRST instance after any http:// string.
The answer is simple:
char *address_start = strchr(in_string+8, '/');
If it's non-NULL, then it points at the first / of the path.
Why +8? Because "https://" is 8 characters long and even if there is no "http://" at the beginning, no IP or web address is less than 8 characters. Even "a.b.c.d" is 7 characters long and I don't believe an IPv4 dotted numerical notation has any legal public address with all single digits. I might be wrong though. Might be worth validating the string to check it's long enough first.
Anyway, you can always pre-validate the string to see if it begins with "http" or not to determine the offset to start searching at.

Resources