getting the end of an web address in c? - c

say I pass an argument www.bbc.co.uk/news/world-us-canada-11893886
I need to separate www.bbc.co.uk from /news/world-us-canada-11893886 for a HTTP GET
I have tried using strtok and strcat but I come across weird splits at runtime.
I can get www.bbc.co.uk just fine using strtok( host, "/");
I have tried using a combination of strtok and strcat to try and get all the rest of the string from the first "/" but i get an output like this...
request: da-11893886
tempString: news/world!
host: www.bbc.co.uk
Path: news/world!da-11893886
If you look at this output, the strangest part is that it always cuts out the middle section.
In this case, the "-us-cana"
the section of the code is attached below
// testing purposes
printf("argv[1]: %s\n", argv[1] );
// NOTE(review): assuming argv is main's `char *argv[]`, sizeof(argv[1]) is the
// size of a pointer, not the length of the string, so the buffers below are
// not sized to hold the argument that strcpy() copies into them.
host = malloc(sizeof(argv[1]));
strcpy(host, argv[1]);
// strtok() overwrites the first '/' with '\0' and returns the start of the copy.
host = strtok(host, "/");
// get the request
request = malloc(sizeof(argv[1]) + sizeof(char)*6);
char *tok, *tempString;
tempString = malloc(sizeof(argv[1]));
// "\0" is an empty delimiter string, so this call yields everything that is
// left after the host in a single token.
tok = strtok( NULL, "\0");
while( tok ) {
strcpy(tempString, tok);
// `request` has not been written to before its first use here.
printf("request: %s\n", request);
// strcat() appends `request` onto `tempString` and returns `tempString`, so
// from here on `request` and `tempString` refer to the same buffer.
request = strcat(tempString, request);
tok = strtok(NULL, "\0");
}
printf("host: %s\n", host);
printf("Path: %s\n", request);
Thanks for looking over this.
Any direction or even a link to a site where I can figure out how to do this would be much appreciated.

Here's some code that does more than you want. Note that this modifies the original string - you may want to make copies instead:
/*
 * Split a request string of the form [protocol://]address[/path] in place.
 *
 * request:  writable, NUL-terminated string; separators inside it are
 *           overwritten with '\0', so the outputs all point into it.
 * protocol: receives the text before "://", or NULL when there is none.
 * addr:     receives the text between the protocol and the first '/'.
 * path:     receives the text after that '/', or NULL when there is none.
 */
void split_request(char *request, char **protocol, char **addr, char **path)
{
    char *scheme_sep = strstr(request, "://");
    char *slash;

    if (scheme_sep != NULL) {
        *protocol = request;
        *addr = scheme_sep + 3;      /* step over "://" */
        *scheme_sep = '\0';
    } else {
        *protocol = NULL;
        *addr = request;
    }

    slash = strchr(*addr, '/');
    if (slash != NULL) {
        *path = slash + 1;
        *slash = '\0';               /* terminates the address */
    } else {
        *path = NULL;
    }
}
Please excuse any typos/obvious errors. I'm typing this in a hurry as I have work to do :P
It should get you started though.

I have modified your code to work the way you are expecting
/*
 * Split argv[1] ("host/path...") into the host and the path that follows the
 * first '/', and print both.
 *
 * Returns 0 on success, 1 when no argument was given or memory ran out.
 */
int main(int argc, char *argv[])
{
    char *buffer;
    char *host;
    char *request;

    if (argc < 2) {
        fprintf(stderr, "usage: %s host/path\n", argc > 0 ? argv[0] : "prog");
        return 1;
    }

    printf("argv[1]: %s\n", argv[1] );

    /* strlen() does not count the terminator, so reserve one extra byte. */
    buffer = malloc(strlen(argv[1]) + 1);
    if (buffer == NULL) {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    strcpy(buffer, argv[1]);

    /* The first call cuts the copy at the first '/'.  The second call uses an
     * empty delimiter set, so it returns the whole remainder (or NULL when
     * nothing follows the host); no loop or second buffer is needed. */
    host = strtok(buffer, "/");
    request = strtok(NULL, "");

    printf("sizeof(tok) %zu\n", request != NULL ? strlen(request) : (size_t)0);
    printf("host: %s\n", host != NULL ? host : "");
    printf("Path: %s\n", request != NULL ? request : "");

    /* host and request both point into buffer, so one free() releases all. */
    free(buffer);
    return 0;
}
~
Output
./tmp www.bbc.co.uk/news/world-us-canada-11893886/tmp.html
argv[1]: www.bbc.co.uk/news/world-us-canada-11893886/tmp.html
sizeof(tok) 38
host: www.bbc.co.uk
Path: news/world-us-canada-11893886/tmp.html
bash-2.03$
~

Use strrchr() to find the last occurrence of '/' from the rear. You will then have a pointer to the start of 'the end of the web address' if you add one to that returned pointer.
Update
Assuming your URL does not start with http://, this ought to work:
#include <stdio.h>
#include <string.h>
/*
 * Demo: derive three views of a hard-coded URL and print them.
 *   host    - everything before the first '/'
 *   path    - everything from the first '/' onwards
 *   request - everything from the last '/' onwards
 */
int main(void)
{
    char url[] = "www.bbc.co.uk/news/world-us-canada-11893886";
    char host[100];
    char path[100];
    char request[100];
    const char *last_slash = strrchr(url, '/');
    int host_len = strcspn(url, "/");   /* index of the first '/' */

    strcpy(request, last_slash);

    strcpy(host, url);
    host[host_len] = '\0';              /* cut the copy at the first '/' */

    strcpy(path, url + host_len);

    printf("host: %s\npath: %s\nrequest: %s\n", host, path, request);
    return 0;
}
Output
$ ./a.out
host: www.bbc.co.uk
path: /news/world-us-canada-11893886
request: /world-us-canada-11893886

strrchr() returns the LAST instance of the character. He wants the FIRST instance after any http:// string.
The answer is simple:
char *address_start = strchr(in_string+8, '/');
If it is non-NULL, it points at the first '/' of the path.
Why +8? Because "https://" is 8 characters long and even if there is no "http://" at the beginning, no IP or web address is less than 8 characters. Even "a.b.c.d" is 7 characters long and I don't believe an IPv4 dotted numerical notation has any legal public address with all single digits. I might be wrong though. Might be worth validating the string to check it's long enough first.
Anyway, you can always pre-validate the string to see if it begins with "http" or not to determine the offset to start searching at.

Related

Segfault while accessing memory malloc'd in function

I'm trying to write a function that takes in a path (char *) and splits it into an array of strings based around the '/' delimiter. Simplified code below :
/*
 * Tokenise `path` on '/' and copy each component into a growing array that is
 * terminated with a NULL entry. Returns the number of components.
 *
 * `out` is received by value: every assignment to it below changes only this
 * function's local copy, so the array built here is never seen by the caller.
 */
int split_path(char * path, char ** out) {
out = NULL;
/* strtok() writes '\0' over the delimiters inside `path`. */
char * token = strtok(path, "/");
int count = 0;
while(token) {
/* Grow the array by one slot; the realloc() result is not checked. */
out = realloc(out, sizeof(char*) * (++count));
out[count-1] = malloc(sizeof(char) * strlen(token)+1);
strcpy(out[count-1], token);
fprintf(stderr, "%s\n", out[count-1]);
token = strtok(NULL, "/");
}
/* One more slot for the terminating NULL entry. */
out = realloc(out, sizeof(char*) * (count+1));
out[count] = NULL;
return count;
}
/* Driver for split_path(): splits a fixed path and prints the first part. */
int main(int argc, char * argv[]) {
char path[] = "/home/pirates/are/cool/yeah";
/* `out` is not initialised here and is passed to split_path() by value. */
char ** out;
int count = split_path(path, out);
fprintf(stdout, "count: %d\n", count);
fprintf(stderr, "1st: %s\n", out[0]); // segfaults here
return 0;
}
All of the print statements in the split_path function print perfectly, the output looks like this :
count: 1, string: home
count: 2, string: pirates
count: 3, string: are
count: 4, string: cool
count: 5, string: yeah
count: 5
1st: ./a.out
[1] 5676 segmentation fault (core dumped) ./a.out
But for some reason when I get back to the main function the double-char-array is no longer valid. I thought that it might be because it was pointing to memory declared in that split_path function but I'm doing strcpy to get the strings into it so it shouldn't be pointing back to memory that is local to that function. Any help is greatly appreciated.
You are mismanaging the out parameter. The out variable in main() is never assigned a valid memory address, hence the segfault. The out parameter in split_path() is a copy, so assigning to it never updates the out variable in main(). You need to pass the address of the variable to split_path() so that it can update the variable and main() can access the memory it then points to.
Also note that strtok() modifies the string it is parsing, so you should make a copy and then parse the copy so the original does not get destroyed. Otherwise, consider using strchr() instead of strtok().
Try something more like this instead:
/*
 * Split `path` on '/' into a NULL-terminated array of heap-allocated strings.
 *
 * path: NUL-terminated string to split. It is only read, never modified;
 *       runs of '/' and leading/trailing '/' produce no empty components.
 * out:  receives the array. On success (*out)[count] is NULL.
 *
 * Returns the number of components, or -1 when an argument is NULL or an
 * allocation fails; in that case *out is NULL and nothing is leaked.
 * The caller owns the result: free() every element, then the array.
 * Each component is also echoed to stderr, one per line.
 */
int split_path(char * path, char *** out) {
    char **parts = NULL;
    const char *cursor;
    int count = 0;

    if (out == NULL) {
        return -1;
    }
    *out = NULL;
    if (path == NULL) {
        return -1;
    }

    /* Scan with strspn/strcspn instead of strtok so the input stays intact
     * and no temporary copy is needed. */
    cursor = path + strspn(path, "/");
    while (*cursor != '\0') {
        size_t len = strcspn(cursor, "/");
        /* +2: one slot for this component, one for the final NULL entry. */
        char **grown = realloc(parts, sizeof *parts * ((size_t)count + 2));

        if (grown == NULL) {
            goto fail;
        }
        parts = grown;

        parts[count] = malloc(len + 1);
        if (parts[count] == NULL) {
            goto fail;
        }
        memcpy(parts[count], cursor, len);
        parts[count][len] = '\0';
        fprintf(stderr, "%s\n", parts[count]);
        count++;

        cursor += len;
        cursor += strspn(cursor, "/");
    }

    if (parts == NULL) {
        /* No components at all: still hand back an array holding just NULL. */
        parts = malloc(sizeof *parts);
        if (parts == NULL) {
            return -1;
        }
    }
    parts[count] = NULL;
    *out = parts;
    return count;

fail:
    while (count > 0) {
        free(parts[--count]);
    }
    free(parts);
    return -1;
}
/*
 * Driver for split_path(): splits a fixed path, prints the component count
 * and the first component, then releases everything split_path() allocated.
 */
int main(int argc, char * argv[]) {
    char path[] = "/home/pirates/are/cool/yeah";
    char ** out = NULL;
    int count;
    int i;

    (void)argc;
    (void)argv;

    count = split_path(path, &out);
    if (count < 0) {
        fprintf(stderr, "split_path failed\n");
        return 1;
    }

    fprintf(stdout, "count: %d\n", count);
    if (count > 0) {
        fprintf(stderr, "1st: %s\n", out[0]);
    }

    /* Each element was allocated separately, so free them before the array. */
    for (i = 0; i < count; i++) {
        free(out[i]);
    }
    free (out);
    return 0;
}
And don't forget error handling. I've left it out of this example for brevity, but you should not leave it out of your real code.

Parse url path of GET request

I'm new to C and I've been working on this task for about 7 hours now - please don't say I didn't try.
I want to parse the path of a self-written webserver in C. Let's say I call
http://localhost:8080/hello/this/is/a/test.html
then the browser gets
GET /hello/this/is/a/test.html HTTP/1.1
I want to parse /hello/this/is/a/test.html, so the complete string between "GET " (note the white space after GET) and the first white space after /../../..html.
What I tried so far:
/* Excerpt of the server's main(); the "..." lines stand for code not shown,
 * including the declaration and filling of `buf`. */
int main() {
...
/* Block-scope prototype for the parser defined after main(). */
char * getPathOfGetRequest(char *);
char *pathname = getPathOfGetRequest(buf);
/* Prints the raw request, a blank line, then the extracted path. */
printf("%s\n\n%s", buf, pathname);
...
}
/*
 * Tries to return the second space-separated token of the request in `buf`,
 * i.e. the path that follows "GET ".
 */
char * getPathOfGetRequest(char *buf) {
char *startingGet = "GET ";
/* Local working copy; it ceases to exist when this function returns. */
char buf_cpy[BUFLEN];
/* `buf` is a pointer here, so sizeof(buf) is the size of a pointer, not the
 * length of the request. */
memcpy(buf_cpy, buf, sizeof(buf));
char *urlpath = malloc(1000);
char *path = malloc(1000);
/* Overwrites the pointer returned by malloc() above. */
urlpath = strstr(buf_cpy, startingGet);
char delimiter[] = " ";
/* Overwrites the second malloc() pointer as well; first token is "GET". */
path = strtok(urlpath, delimiter);
/* Second token: the path. It points into buf_cpy. */
path = strtok(NULL, delimiter);
return path;
}
The pathname always only has 4 correct chars and may or may not be filled with other unrelated chars, like /hell32984cn)/$"§$. I guess it has something to do with strlen(startingGet), but I can't see the relationship between it. Where is my mistake?
Question code with commentary:
char * getPathOfGetRequest(char *buf) {
char *startingGet = "GET ";
char buf_cpy[BUFLEN];
memcpy(buf_cpy, buf, sizeof(buf));
The above memcpy will likely only copy 4 bytes from buf to buf_cpy.
This is due to buf being a pointer to a char.
sizeof(buf) is the size of a pointer (likely: 4).
Perhaps, instead of using 'sizeof(buf)', it would have been better to use 'strlen(buf) + 1' so that the whole string, including its terminator, is copied.
char *urlpath = malloc(1000);
char *path = malloc(1000);
urlpath = strstr(buf_cpy, startingGet);
Perhaps the questioner is not clear on why urlpath was allocated 1000 bytes of memory. In any case, the above assignment will cause that 1000 bytes to be leaked, and defeats the purpose of the 'urlpath=malloc(1000)'.
The actual effect of the above statements is urlpath = buf_cpy;, as strstr() will return the position of the beginning of 'GET ' in the buf_copy.
char delimiter[] = " ";
path = strtok(urlpath, delimiter);
Likewise, the above assignment will cause the 1000 bytes allocated to path to be leaked, and defeats the purpose of the 'path=malloc(1000)' above.
path = strtok(NULL, delimiter);
return path;
}
An alternative coding:
/*
 * Extract the request path from an HTTP request line such as
 * "GET /hello/this/is/a/test.html HTTP/1.1".
 *
 * buf: NUL-terminated request text; it is not modified.
 *
 * Returns a newly allocated copy of the text between "GET " and the next
 * white-space character (or the end of the string). The caller must free()
 * it. Returns NULL, after printing a message to stderr, when buf is NULL,
 * does not start with "GET ", or memory cannot be allocated.
 */
char *getPathOfGetRequest(const char *buf)
{
    static const char method[] = "GET ";
    const size_t methodLen = sizeof method - 1;
    const char *start;
    const char *end;
    char *path;
    size_t pathLen;

    /* Verify that there is a 'GET ' at the beginning of the string. */
    if (buf == NULL || strncmp(buf, method, methodLen) != 0) {
        fprintf(stderr, "Parse error: 'GET ' is missing.\n");
        return NULL;
    }

    /* The path begins at the first character beyond the 'GET '. */
    start = buf + methodLen;

    /* Advance to the first white-space character. The cast matters: passing
     * a negative char value (any byte >= 0x80 where char is signed) to
     * isspace() is undefined behaviour. */
    end = start;
    while (*end != '\0' && !isspace((unsigned char)*end)) {
        ++end;
    }

    /* Allocate room for the path plus the string terminator. */
    pathLen = (size_t)(end - start);
    path = malloc(pathLen + 1);
    if (path == NULL) {
        fprintf(stderr, "malloc() failed. \n");
        return NULL;
    }

    memcpy(path, start, pathLen);
    path[pathLen] = '\0';
    return path;
}
And, finally, if 'strtok()' must be used:
/*
 * strtok()-based variant: return a copy of the second space-separated token
 * of the request in `buf` (for "GET /x HTTP/1.1" that is "/x").
 *
 * buf: writable, NUL-terminated request text. strtok() overwrites the spaces
 *      it finds with '\0', so the buffer is altered by this call.
 *
 * Returns a newly allocated string the caller must free(), or NULL when buf
 * is NULL, has fewer than two tokens, or memory cannot be allocated.
 * Uses strtok()'s hidden static state, so it is not reentrant.
 */
char *getPathOfGetRequest(char *buf)
{
    char *token;
    char *path;
    size_t len;

    /* strtok(NULL, ...) would resume some earlier tokenisation. */
    if (buf == NULL) {
        return NULL;
    }

    /* First token is the method; it is only skipped, not checked. */
    if (strtok(buf, " ") == NULL) {
        return NULL;
    }

    token = strtok(NULL, " ");
    if (token == NULL) {
        return NULL;
    }

    /* malloc + memcpy rather than strdup(), which is POSIX and only became
     * part of ISO C with C23. */
    len = strlen(token);
    path = malloc(len + 1);
    if (path != NULL) {
        memcpy(path, token, len + 1);
    }
    return(path);
}

string pointers in C indexing

If I have:
char *tokenPtr = "testingpointerindex"
and I want to access everything after the 4th character, how would I go about that? I tried :
char *tokenPtr = "testingpointerindex";
char *host = tokenPtr + 4;
printf("%s\n",host);
return host;
It's just an excerpt, but I hope it gives enough information. I get a bus error.
Thanks
EDIT:
The full code
/*
 * Walk the CR/LF-separated lines of `buf` looking for one that starts with
 * "Host", then print and return the text 7 characters into that line.
 * strtok() is used, so `buf` is modified in place.
 */
char * getHost(char *buf){
char *tokenPtr;
tokenPtr = strtok(buf, "\r\n" );
printf("got token\n");
while ( tokenPtr != NULL ) {
/* Compares from the very first character of the line, so a line with
 * anything in front of "Host" does not match. */
if(strncmp(tokenPtr,"Host",4) == 0){
break;
}
else{
tokenPtr = strtok( NULL, "\r\n" );
}
}
/* tokenPtr is NULL here when no line matched; that case is not handled. */
char *host = tokenPtr + 7;
printf("%s\n",host);
return host;
}
/* Driver: runs getHost() on a fixed request. */
int main(int argc, char *argv[])
{
/* `msg` points at a string literal; getHost() tokenises its argument with
 * strtok(), which writes into the string it is given. */
char *msg = "GET /index.html HTTP/1.1\r\n Host: www.google.com\r\n\r\n";
getHost(msg);
}
The above code works fine.
However, there's one thing to mention: string literals (e.g. "testingpointerindex") are non-modifiable in C. Therefore you should use const char *, not char *.
Change:
char *tokenPtr = "testingpointerindex";
to
static char tokenPtr[] = "testingpointerindex";
In your example, tokenPtr is a string literal and string literal are non-modifiable.
The static specifier is required in the second example if you plan to return a pointer to an element of the array as automatic variables are discarded at the end of a function.
Note: I've updated this answer to conform to the new code posted by the OP's update
In the following code, there are a couple problems:
while ( tokenPtr != NULL ) {
if(strncmp(tokenPtr,"Host",4) == 0){
break;
}
else{
tokenPtr = strtok( NULL, "\r\n" );
}
}
char *host = tokenPtr + 7;
The first problem is that there is a possibility that you could exit the while-loop because tokenPtr is NULL ... you don't guard for that possibility. Secondly, you assume that tokenPtr is pointing to a string of at least length 8 after it's been returned from strtok, but that's not necessarily true either (It should be true in your example code, but in working code it might not be true).
Finally, strtok modifies the string it processes, and you're passing it a pointer to a string-literal, which is stored in a read-only memory segment, and should not be modified. You should call strdup to create your string, knowing that you'll have to call free on the returned pointer at some point. So for instance:
/*
 * Driver: runs getHost() on a heap copy of a fixed request so that the
 * tokeniser has writable memory to work on.
 */
int main(void)
{
    char *msg = strdup("GET /index.html HTTP/1.1\r\n Host: www.google.com\r\n\r\n");

    /* strdup() returns NULL when memory is exhausted. */
    if (msg == NULL) {
        return 1;
    }

    getHost(msg);
    free(msg);
    return 0;
}
Fix:
/*
 * Find the "Host:" header in an HTTP request and return its value.
 *
 * buf: writable, NUL-terminated request text. It is tokenised in place with
 *      strtok(), which overwrites the CR/LF line breaks with '\0'.
 *
 * Returns a pointer into buf at the first non-blank character after "Host:"
 * (also printed to stdout), or NULL when buf is NULL or no such header
 * exists. The match is case-sensitive.
 */
char * getHost(char *buf){
    char *line;

    if (buf == NULL) {
        return NULL;
    }

    line = strtok(buf, "\r\n" );
    printf("got token\n");

    while (line != NULL) {
        /* Header lines may carry leading blanks, so skip them before
         * comparing instead of relying on a fixed offset. */
        char *field = line + strspn(line, " \t");

        if (strncmp(field, "Host:", 5) == 0) {
            char *host = field + 5;

            host += strspn(host, " \t");
            printf("%s\n", host);
            return host;
        }
        line = strtok(NULL, "\r\n");
    }

    return NULL;
}
/* Driver: runs getHost() on a fixed request held in a writable array. */
int main(int argc, char *argv[])
{
    char msg[100] = "GET /index.html HTTP/1.1\r\n Host: www.google.com\r\n\r\n";

    (void)argc;
    (void)argv;

    getHost(msg);
    return 0;
}
Find out what was wrong ;)

Want to free my pointer token after strtok

I have extracted the "meaning" part of my code (and also replace some line to simplify it).
I have 2 dynamic pointers, one for the current line (extracted from a file) and a second for the current token.
Following this question, Free/delete strtok_r pointer before processing complete string?
I wrote this :
/*
 * Copies one "file line" into a heap buffer, tokenises it on '/', prints the
 * second token and the (now truncated) line, then frees both allocations via
 * the saved free* pointers.
 */
int main(void) {
int n = 455;
char *tok2, *freetok2;
char *line, *freeline;
line = freeline = malloc(n*sizeof(*line));
/* This block is never written to: tok2 is re-pointed into `line` below and
 * only freetok2 still refers to it. */
tok2 = freetok2 = malloc(n*sizeof(*tok2));
/* content of the file) */
const char* file_reading = "coucou/gniagnia/puet/";
/* reading from the file */
strcpy(line, file_reading);
/* First token; strtok() writes '\0' over the first '/' inside `line`. */
strtok(line, "/");
/* get the second token of the line */
tok2 = strtok(NULL, "/");
fprintf(stdout, "%s \n", tok2); // print gniagnia
fprintf(stdout, "%s \n", line); // print coucou
/* free error */
/* tok2 now points into the middle of line's block, not at an allocation. */
//free(tok2);
/* worked, but maybe don't free "everything ?" */
//free(line);
free(freetok2);
free(freeline);
return 0;
}
But in the end, I'm not sure what is correct and what is not, and I find this solution not so elegant (because it uses two "save" variables).
Is that correct ? Is there some ways to improve it ?
Thanks
Edit: changed my code for this, (and it will handle all the lines of the file)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
 * For each sample "file line": duplicate it, tokenise the copy on '/', print
 * the second token and then the first, and release the copy.
 *
 * Returns 0 on success, 1 when a line cannot be duplicated.
 */
int main(void) {
    /* content of the file */
    const char *file_lines[] = {
        "coucou/gniagnia/puet/",
        "blabla/dadada/"
    };
    size_t i;

    for (i = 0; i < sizeof file_lines / sizeof file_lines[0]; i++) {
        char *tok2;
        /* reading from the file */
        char *line = strdup(file_lines[i]);

        if (line == NULL) {
            perror("strdup");
            return 1;
        }

        strtok(line, "/");
        /* get the second token of the line */
        tok2 = strtok(NULL, "/");
        printf("%s \n", tok2 != NULL ? tok2 : "");
        printf("%s \n", line);

        /* tok2 points into `line`, so this one free() covers both; doing it
         * every iteration keeps earlier copies from being leaked. */
        free(line);
    }
    return 0;
}
You're not actually using the memory pointed by freetok2, you don't need to malloc anything, thus you don't need the freetok2 variable.
Saying free(line) or free(freeline) is the same in your code so you don't need the freeline at all.
Another problem is this: malloc(n*sizeof(*line));. You might as well be saying: malloc(n); because sizeof(char) is always 1. But best of all would be:
line = malloc(strlen(file_reading) + 1);
strcpy(line, file_reading);
The code should be modified as follows:
/*
 * Copies one "file line" into a buffer sized exactly for it, tokenises the
 * copy on '/', and prints the second token followed by the first.
 *
 * Returns 0 on success, 1 when the buffer cannot be allocated.
 */
int main(void) {
    /* content of the file */
    const char* file_reading = "coucou/gniagnia/puet/";
    char *tok2;
    /* Exactly the string plus its terminator, instead of a guessed size. */
    char *line = malloc(strlen(file_reading) + 1);

    if (line == NULL) {
        return 1;
    }

    /* reading from the file */
    strcpy(line, file_reading);
    strtok(line, "/");
    /* get the second token of the line */
    tok2 = strtok(NULL, "/");
    fprintf(stdout, "%s \n", tok2 != NULL ? tok2 : ""); // print gniagnia
    fprintf(stdout, "%s \n", line); // print coucou

    /* tok2 points into `line`; freeing `line` is all that is needed. */
    free(line);
    return 0;
}

strtok and memory leaks

I wrote a simple url parser using strtok(). here's the code
#include <stdio.h>
#include <stdlib.h>
typedef struct {
char *protocol;
char *host;
int port;
char *path;
} aUrl;
/*
 * Split `url` into protocol, host, port and path and store them in `ret`.
 * `url` itself is only read; the tokenising is done on duplicates.
 * protocol and host point into one duplicate, path into a second one.
 */
void parse_url(char *url, aUrl *ret) {
printf("Parsing %s\n", url);
/* NOTE(review): _strdup looks like the Microsoft CRT spelling of strdup - confirm. */
char *tmp = (char *)_strdup(url);
//char *protocol, *host, *port, *path;
int len = 0;
// protocol is now, for example, "http:" or "https:"
ret->protocol = (char *) strtok(tmp, "/");
/* +2 for the two '/' characters that separate protocol and host. */
len = strlen(ret->protocol) + 2;
ret->host = (char *) strtok(NULL, "/");
/* len is now the offset in `url` of the first character after the host. */
len += strlen(ret->host);
//printf("char at %d => %c", len, url[len]);
ret->path = (char *)_strdup(&url[len]);
/* Cut the path at the first '#'. */
ret->path = (char *) strtok(ret->path, "#");
/* Cut the trailing ':' off the protocol. */
ret->protocol = (char *) strtok(ret->protocol, ":");
// host is now, for example, "address.com:8080"
//tmp = (char *)_strdup(host);
//strtok(tmp, ":");
ret->host = (char *) strtok(ret->host, ":");
/* `tmp` is reused for the port text; the duplicate it used to point at is
 * still reachable through ret->protocol. */
tmp = (char *) strtok(NULL, ":");
if(tmp == NULL) {
/* No explicit port: pick the default for the protocol. ret->port is left
 * untouched for any protocol other than http or https. */
if(strcmp(ret->protocol, "http") == 0) {
ret->port = 80;
} else if(strcmp(ret->protocol, "https") == 0) {
ret->port = 443;
}
} else {
ret->port = atoi(tmp);
}
//host = (char *) strtok(NULL, "/");
}
/*
*
*/
/* Parses one fixed URL and prints each component that parse_url() found. */
int main(int argc, char** argv) {
    aUrl parsed;

    printf("hello moto\n");

    parse_url("http://teste.com/Teste/asdf#coisa", &parsed);
    printf("protocol is %s\nhost is %s\nport is %d\npath is %s\n",
           parsed.protocol,
           parsed.host,
           parsed.port,
           parsed.path);

    return EXIT_SUCCESS;
}
As you can see, I use strtok() a lot so I can "slice" the url. I don't need to support urls different than http or https so the way it's done solves all of my problems.
My concern is (this is running on an embedded device) - Am I wasting memory ?
When I write something like
ret->protocol = (char *) strtok(tmp, "/");
And then later call
ret->protocol = (char *) strtok(ret->protocol, ":");
Does the memory that ret->protocol first pointed to remain allocated? I thought that maybe I should store the result of the first call in a tmp pointer, call strtok again so that ret->protocol points to the right portion of the string (the second call), and then free(tmp).
What should be the best way to use strtok ?
To answer your question directly, strtok only returns a pointer to a location inside the string you give it as input-- it doesn't allocate new memory for you, so shouldn't need to call free on any of the pointers it gives you back in return.
For what it's worth, you could also look into "strchr" and "strstr", which are nondestructive ways of searching for single characters or sequences within strings.
Also note that your memory allocation is problematic here-- you're using strdup() to allocate a new string inside your parse function, and then you're assigning fragments of that memory block to fields of "ret". Your caller will thus be responsible for free'ing the strdup'd string, but since you're only passing that string back implicitly inside ret, the caller needs to know magically what pointer to pass to free. (Probably ret->protocol, but maybe not, depending on how the input looks.)
strtok modifies the string in place, replacing the delimiter that ends each token with a NUL character ('\0'). Since strings in C are NUL-terminated, it now appears that your original pointer is pointing to a shorter string, even though the original string is still there and still occupies the same amount of memory (but with some characters replaced with NUL). The end of the string, I think, contains a double NUL.
The short answer is this: Keep a pointer to the beginning of your string buffer, and have another pointer that is your "current" pointer into the string as you parse it. When you use strtok or iterate over the string in other ways you update the "current" pointer but leave the beginning pointer alone. When you're finished, free() the beginning pointer. No memory leaked.
Do you know you can continue parsing the string using NULL as first parameter of strtok?
First call:
char* token = strtok(string, delimiters);
Then:
token = strtok(NULL, other_delimiters);
This allow you to simplify your code:
/* Copy the n bytes starting at s into a new NUL-terminated string.
 * Returns NULL when memory cannot be allocated. */
static char *copy_span(const char *s, size_t n)
{
    char *dup = malloc(n + 1);

    if (dup != NULL) {
        memcpy(dup, s, n);
        dup[n] = '\0';
    }
    return dup;
}

/*
 * Split a URL of the form protocol://host/path[#fragment].
 *
 * url: NUL-terminated URL; it is only read, so a string literal is fine.
 * ret: on success receives three newly allocated strings:
 *        protocol - the scheme including "://", e.g. "http://"
 *        host     - the text between "://" and the next '/'
 *        path     - the text after that '/', up to (not including) '#'
 *      The caller must free() all three. ret->port is not touched.
 *
 * Returns 0 on success. Returns -1, leaving *ret unmodified, when an
 * argument is NULL, when the protocol, host or path is missing or empty,
 * or when memory cannot be allocated.
 */
int parse_url(char *url, aUrl *ret)
{
    const char *sep;
    const char *host;
    const char *path;
    size_t host_len;
    size_t path_len;
    char *protocol_copy;
    char *host_copy;
    char *path_copy;

    if (url == NULL || ret == NULL)
        return -1;

    //get protocol
    /* Locate "://" explicitly: strtok() treats the two slashes as a single
     * delimiter run, which shifts every later token by one. */
    sep = strstr(url, "://");
    if (sep == NULL || sep == url)
        return -1;

    //get host
    host = sep + 3;
    host_len = strcspn(host, "/");
    if (host_len == 0 || host[host_len] == '\0')
        return -1;

    // get path
    path = host + host_len + 1;
    path_len = strcspn(path, "#");
    if (path_len == 0)
        return -1;

    /* The fields of *ret are plain pointers with no storage behind them,
     * so each piece gets its own allocation. */
    protocol_copy = copy_span(url, (size_t)(sep - url) + 3);
    host_copy = copy_span(host, host_len);
    path_copy = copy_span(path, path_len);
    if (protocol_copy == NULL || host_copy == NULL || path_copy == NULL) {
        free(protocol_copy);
        free(host_copy);
        free(path_copy);
        return -1;
    }

    ret->protocol = protocol_copy;
    ret->host = host_copy;
    ret->path = path_copy;
    // ...
    return 0;
}
As you can see, I added a return value so the caller can tell whether parsing succeeded.
Thanks for sharing your code! I ran it inside valgrind and fixed two memory leaks generated by strdup functions.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Components of a parsed URL.
 *
 * Ownership: `protocol` and `path` are separate heap allocations released by
 * free_url(). `host` points into the allocation that `protocol` starts, so
 * it must never be freed on its own.
 */
typedef struct {
    char *protocol;  /* scheme without "://", e.g. "http" */
    char *host;      /* host name without any ":port" suffix */
    int port;        /* explicit port, else 80 (http) / 443 (https), else 0 */
    char *path;      /* text after the host up to '#'; "" when there is none */
} URL;

/*
 * Parse a URL of the form scheme://host[:port][/path][#fragment].
 *
 * url: NUL-terminated URL; it is only read.
 * ret: receives the components. On any failure (NULL url, no "scheme://"
 *      prefix, out of memory) protocol, host and path are NULL and port
 *      is 0, so free_url() is always safe to call afterwards.
 *
 * An explicit port that is not a number in 1..65535 is reported as 0.
 */
void parse_url(char *url, URL *ret) {
    const char *sep;
    const char *rest;
    char *copy;
    char *host;
    char *colon;
    size_t url_len;
    size_t scheme_len;
    size_t host_len;
    size_t path_len;

    if (ret == NULL) {
        return;
    }
    ret->protocol = NULL;
    ret->host = NULL;
    ret->path = NULL;
    ret->port = 0;

    if (url == NULL) {
        return;
    }

    sep = strstr(url, "://");
    if (sep == NULL || sep == url) {
        return;
    }
    scheme_len = (size_t)(sep - url);

    /* One writable copy holds both the scheme and the host. */
    url_len = strlen(url);
    copy = malloc(url_len + 1);
    if (copy == NULL) {
        return;
    }
    memcpy(copy, url, url_len + 1);

    copy[scheme_len] = '\0';
    host = copy + scheme_len + 3;          /* step over "://" */
    host_len = strcspn(host, "/?#");

    /* Take the path from the untouched input, before the copy is cut. */
    rest = url + scheme_len + 3 + host_len;
    path_len = strcspn(rest, "#");
    ret->path = malloc(path_len + 1);
    if (ret->path == NULL) {
        free(copy);
        return;
    }
    memcpy(ret->path, rest, path_len);
    ret->path[path_len] = '\0';

    host[host_len] = '\0';
    colon = strchr(host, ':');
    if (colon != NULL) {
        *colon = '\0';
    }

    ret->protocol = copy;
    ret->host = host;

    if (strcmp(ret->protocol, "http") == 0) {
        ret->port = 80;
    } else if (strcmp(ret->protocol, "https") == 0) {
        ret->port = 443;
    }

    if (colon != NULL && colon[1] != '\0') {
        char *end;
        long value = strtol(colon + 1, &end, 10);

        ret->port = (*end == '\0' && value > 0 && value <= 65535) ? (int)value : 0;
    }
}

/*
 * Release everything parse_url() allocated and reset the pointers so that a
 * second call, or a later accidental use, is harmless.
 */
void free_url(URL *url) {
    if (url == NULL) {
        return;
    }
    free(url->path);
    free(url->protocol);   /* also releases the memory host points into */
    url->path = NULL;
    url->protocol = NULL;
    url->host = NULL;
}
/* Parses one fixed URL, prints its components and releases them again. */
int main(int argc, char** argv) {
    URL parsed;

    parse_url("http://example.com:3000/Teste/asdf#coisa", &parsed);

    printf("protocol: %s\nhost: %s\nport: %d\npath: %s\n",
           parsed.protocol,
           parsed.host,
           parsed.port,
           parsed.path);

    free_url(&parsed);
    return EXIT_SUCCESS;
}

Resources