split html by tags - c

I want to split a html page into pieces by a tag delimiter: like <img or <div>.
I tried the following code but it doesn't work:
/*
 * Reads up to MAXBUFLEN bytes of source.html into `source`, then tries to
 * split the buffer on "<img" with strtok and prints the fourth piece.
 */
char source[MAXBUFLEN + 1];
FILE *fp = fopen("source.html", "r");
if (fp != NULL)
{
size_t newLen = fread(source, sizeof(char), MAXBUFLEN, fp);
if (newLen == 0) {
fputs("Error reading file", stderr);
} else {
/* NOTE(review): ++newLen pre-increments, so the terminator is stored at
   index newLen + 1; the byte right after the data is left unwritten and a
   full read of MAXBUFLEN bytes indexes past the end of `source`. */
source[++newLen] = '\0'; /* Just to be safe. */
}
}
/* NOTE(review): reached even when fopen returned NULL. */
fclose(fp);
//not working
char* strArray[10];
int i = 0;
/* strtok treats its second argument as a set of single-character
   delimiters ('<', 'i', 'm', 'g'), not as the word "<img". */
char *token = strtok(source, "<img");
while(token != NULL)
{
/* NOTE(review): the strArray entries are never pointed at any storage
   before strcpy writes through them, and i is not bounded by 10. */
strcpy(strArray[i++], token);
token = strtok(NULL, "<img");
}
printf("%s\n", strArray[3]);
What am I doing wrong? Is there any other method I can use except strtok?

As Daren has already posted, strtok() doesn't do what you want. You can use
char *ptr = strstr( source, "<img" );
instead to find the first tag, and then
ptr = strstr(ptr+4, "<img" ); // search starts direcly behind the previous "<img"
// maybe you can find a better offset
for the next occurrences.
Besides, your line
strcpy(strArray[i++], token);
would crash because you have no memory allocated to the pointer.

/*
 * Reentrant tokenizer that splits on a whole word instead of on a set of
 * single characters.
 *
 * str   - buffer to start tokenizing, or NULL to continue with the position
 *         saved in *store. The buffer is modified: each separator found is
 *         overwritten with '\0' at its first byte.
 * word  - separator string. A NULL or empty separator never matches, so the
 *         remaining text is returned as a single final token.
 * store - caller-owned cursor; holds the resume position between calls and
 *         is set to NULL once the last token has been handed out.
 *
 * Returns the next token (possibly empty when two separators are adjacent),
 * or NULL when nothing is left.
 */
char *strtokByWord_r(char *str, const char *word, char **store){
    if (str != NULL) {
        *store = str;
    }
    if (*store == NULL) {
        return NULL;
    }

    char *token = *store;

    /* strstr() matches an empty needle at offset 0; following that match
     * would clobber the first byte and never advance, so skip the search. */
    char *hit = NULL;
    if (word != NULL && *word != '\0') {
        hit = strstr(token, word);
    }

    if (hit != NULL) {
        *hit = '\0';
        *store = hit + strlen(word);   /* resume right after the separator */
    } else {
        *store = NULL;                 /* this was the final token */
    }
    return token;
}
/*
 * Convenience wrapper around strtokByWord_r() that keeps the resume
 * position in a function-local static, so callers use it like strtok():
 * pass the buffer on the first call and NULL afterwards.
 */
char *strtokByWord(char *str, const char *word){
    static char *resume_at = NULL;   /* shared by every caller of this function */
    char *token = strtokByWord_r(str, word, &resume_at);

    return token;
}
replace
char *token = strtok(source, "<img");
...
token = strtok(NULL, "<img");
to
char *token = strtokByWord(source, "<img");
...
token = strtokByWord(NULL, "<img");

The second argument to strtok is a list of delimiter characters. Each of these will be used to split the string into tokens. I don't think it does what you think it does...
If you want to go and parse an html file into tokens, you could look into lex...
What is your desired output? Do you have a test case for your input?
Your code should produce the following:
input:
<html><img src="test.png"/></html>
output:
""
"ht"
"l>"
" src=\"test.pn"
"\"/>"
"/ht"
"l>"
I somehow don't think that is what you want...

Related

Trying to split text into lines, and then tabs. Get only first line

I have a char array (buf) that consists of multiple lines, and each line is split up by multiple tabs. I want to separate this. I use the following code for this:
/*
 * Walks `buf` line by line with strtok("\r\n"), copying each line into
 * `copy` so that the tabs inside it could be split separately.
 */
char copy[4096];
char* split_request = strtok(buf, "\r\n");
/* NOTE(review): this copy happens before split_request is checked for NULL. */
strcpy(copy, split_request);
while(split_request != NULL) {
if (strchr(copy, '\t') != NULL) {
printf("We have a tab");
/* NOTE(review): the disabled call below passes the character '\t' where
   strtok expects a delimiter string ("\t"). It would also restart strtok on
   `copy`, so the later strtok(NULL, "\r\n") would continue inside `copy`
   rather than `buf`. */
//If I uncomment this line I get an assertion error
//char* temp = strtok(copy, '\t');
}
/* NOTE(review): the line itself is used as the printf format string. */
printf(split_request);
split_request = strtok(NULL, "\r\n");
if (split_request != NULL) {
strcpy(copy, split_request);
}
printf("\n");
}
If I uncomment that one line of code, only the first line is processed. In addition, it is printed 5 times, and each time one tabbed column disappears. It feels like despite the strcpy, the original string is still affected...
I was experimenting with an alternative approach to your problem. I have used strtok_r for separating lines and each line is processed using strtok for tabs. The code is given below.
/*
 * Prints every tab-separated field of singleLine on its own line.
 * Lines that contain no tab at all are left untouched and print nothing.
 * The buffer is tokenized in place with strtok(), so tabs are replaced by
 * '\0' bytes.
 */
void lineParser(char *singleLine){
    const char separators[] = "\t";
    char *field;

    /* Guard clause: nothing to do for a line without any tab. */
    if (strchr(singleLine, '\t') == NULL) {
        return;
    }

    for (field = strtok(singleLine, separators);
         field != NULL;
         field = strtok(NULL, separators)) {
        printf("%s\n", field);
    }
}
/*
 * Demo driver: splits buf into lines with strtok_r() and hands each line to
 * lineParser(), which splits it on tabs with plain strtok(). The two
 * tokenizers keep separate state (strtok_r uses `save`), so the inner
 * split does not disturb the outer one.
 */
int main()
{
    char buf[] = "Stack\tOverFlow\r\nStack\tExchange\r\n";
    const char newline[] = "\r\n";
    char *save = NULL;   /* strtok_r resume position for the line split */
    char *line;

    /* Split on the CR/LF set; every token is one complete line. */
    for (line = strtok_r(buf, newline, &save);
         line != NULL;
         line = strtok_r(NULL, newline, &save)) {
        lineParser(line);
    }
    return 0;
}

strtok c multiple chars as one delimiter

Is it possible to use multiple chars as one delimiter?
I would like a string as separator for another string.
/* Text to split; note that it points at a string literal. */
char * input = "inputvalue1SEPARATORSTRINGinputvalue2SEPARATORSTRINGinputvalue2";
/* NOTE(review): the buffer is only sizeof(char*) bytes and its contents are
   never initialised before strcat appends to it. */
char * output = malloc(sizeof(char*));
/* Intended as one multi-character separator; strtok instead treats every
   character of this string as an individual delimiter. */
char * delim = "SEPARATORSTRING";
/*
 * Tokenizes `input` with strtok and appends each token plus a newline to
 * `output`, which is returned.
 */
char * example()
{
/* NOTE(review): strtok writes into its first argument, which here is a
   string literal. */
char * ptr = strtok(input, delim);
while (ptr != NULL)
{
/* NOTE(review): `ptrvar` is not declared anywhere in this snippet;
   presumably `ptr` was meant. */
output = strcat(output, ptrvar);
output = strcat(output, "\n");
ptr = strtok(NULL, delim);
}
return output;
}
Return value printed with printf:
inputvalue1
inputvalue2
inputvalue3
No, according to the manual page for strtok():
The delim argument specifies a set of bytes that delimit the tokens in the parsed string.
If you want to use a multi-byte string as delimiter, there is no built-in function that behaves like strtok(). You would have to use strstr() instead to find occurrences of the delimiter string in the input, and advance manually.
Here's an example from this answer:
/*
 * strtok()-like tokenizer whose delimiter is a whole string rather than a
 * set of characters.
 *
 * input     - buffer to start tokenizing, or NULL to continue with the
 *             buffer from the previous call. The buffer is modified: the
 *             first byte of every delimiter found is overwritten with '\0'.
 * delimiter - separator string. A NULL or empty delimiter never matches,
 *             so the remaining text comes back as one final token.
 *
 * Returns the next token (empty when two delimiters are adjacent), or NULL
 * once the input is exhausted. The resume position lives in a static
 * variable, so the function is neither reentrant nor thread-safe.
 */
char *multi_tok(char *input, char *delimiter) {
    static char *string;

    if (input != NULL) {
        string = input;
    }
    if (string == NULL) {
        return NULL;
    }

    char *token = string;

    /* strstr() reports an empty needle at offset 0, which would truncate
     * the token and never advance; treat that case as "no delimiter". */
    char *end = NULL;
    if (delimiter != NULL && *delimiter != '\0') {
        end = strstr(token, delimiter);
    }

    if (end == NULL) {
        string = NULL;                      /* last token: stop next time */
        return token;
    }

    *end = '\0';
    string = end + strlen(delimiter);       /* resume after the delimiter */
    return token;
}

Parsing input in C [duplicate]

I have been trying to tokenize a string using SPACE as delimiter but it doesn't work. Does any one have suggestion on why it doesn't work?
Edit: tokenizing using:
strtok(string, " ");
The code is like the following
pch = strtok (str," ");
while (pch != NULL)
{
printf ("%s\n",pch);
pch = strtok (NULL, " ");
}
Do it like this:
char s[256];
strcpy(s, "one two three");
char* token = strtok(s, " ");
while (token) {
printf("token: %s\n", token);
token = strtok(NULL, " ");
}
Note: strtok modifies the string it's tokenising, so it cannot be a const char*.
Here's an example of strtok usage; keep in mind that strtok is destructive of its input string (and therefore can't ever be used on a string constant):
char *p = strtok(str, " ");
while(p != NULL) {
printf("%s\n", p);
p = strtok(NULL, " ");
}
Basically the thing to note is that passing a NULL as the first parameter to strtok tells it to get the next token from the string it was previously tokenizing.
strtok can be very dangerous. It is not thread safe. Its intended use is to be called over and over in a loop, passing in the output from the previous call. The strtok function has an internal variable that stores the state of the strtok call. This state is not unique to each thread - it is global. If any other code uses strtok in another thread, you get problems. Not the kind of problems you want to track down either!
I'd recommend looking for a regex implementation, or using sscanf to pull apart the string.
Try this:
char strprint[256];
char text[256];
int consumed = 0;   /* characters sscanf read for the current word (%n) */
strcpy(text, "My string to test");
/* Pull one whitespace-delimited word at a time off the front of `text`.
 * %255s bounds the write into strprint; %n reports how far sscanf got, so
 * the unread remainder can be shifted to the front of the buffer. The loop
 * ends when no further word can be read. */
while ( sscanf( text, "%255s%n", strprint, &consumed ) == 1 ) {
    printf("token: %s\n", strprint);
    /* Overlapping copy, hence memmove; the +1 carries the terminator. */
    memmove(text, text + consumed, strlen(text + consumed) + 1);
}
Note: The 'text' string is destroyed as it's separated. This may not be the preferred behaviour =)
You can simplify the code by introducing an extra variable.
#include <string.h>
#include <stdio.h>
/*
 * Demo: tokenizes a space-delimited string with strtok(), using one extra
 * pointer so that a single strtok() call site serves both the first call
 * (buffer) and all later calls (NULL).
 */
int main()
{
    char buffer[100];
    char *next_input = buffer;   /* buffer on the first pass, NULL afterwards */
    char *word = NULL;

    strcpy(buffer, "a space delimited string");

    for (;;) {
        word = strtok(next_input, " ");
        if (word == NULL) {
            break;
        }
        next_input = NULL;
        printf(":%s:\n", word);
    }
    return 0;
}
I've made some string functions in order to split values, using as few pointers as I could because this code is intended to run on PIC18F processors. Those processors do not handle pointers very well when you have little free RAM available:
#include <stdio.h>
#include <string.h>
char POSTREQ[255] = "pwd=123456&apply=Apply&d1=88&d2=100&pwr=1&mpx=Internal&stmo=Stereo&proc=Processor&cmp=Compressor&ip1=192&ip2=168&ip3=10&ip4=131&gw1=192&gw2=168&gw3=10&gw4=192&pt=80&lic=&A=A";
/*
 * Returns the index of the first occurrence of C located strictly after
 * position Start, or -1 when the end of the string is reached first.
 * The character at Start itself is never examined, which lets a caller pass
 * the previous hit to find the next one.
 */
int findchar(char *string, int Start, char C) {
    int pos;

    for (pos = Start; string[pos] != 0; ) {
        ++pos;                       /* step first: the search begins after Start */
        if (string[pos] == C) {
            return pos;
        }
    }
    return -1;
}
/*
 * Returns the index of the Times-th occurrence of C in string, as located by
 * repeated findchar() calls that each resume from the previous hit.
 * Returns -1 as soon as one of the searches fails, and 0 when Times is not
 * positive (no search is performed).
 */
int findcharn(char *string, int Times, char C) {
    int last_hit = 0;   /* result of the most recent search */
    int resume = 0;     /* where the next search starts from */
    int round;

    for (round = 0; round < Times; ++round) {
        last_hit = findchar(string, resume, C);
        if (last_hit < 0) {
            return -1;
        }
        if (last_hit > 0) {
            resume = last_hit;
        }
    }
    return last_hit;
}
/*
 * Copies the characters in[start + 1] .. in[end] into out and terminates
 * the result.
 *
 * in    - source text.
 * out   - destination; must have room for (end - start) characters plus
 *         the terminator.
 * start - index just before the first character to copy.
 * end   - index of the last character to copy.
 *
 * When end is smaller than start (for example because a search for one of
 * the bounds failed and produced a negative index) the result is the empty
 * string; without this clamp the terminator would be written in front of
 * the out buffer.
 */
void mid(char *in, char *out, int start, int end) {
    int i = 0;
    int size = end - start;

    if (size < 0) {
        size = 0;
    }
    for (i = 0; i < size; i++) {
        out[i] = in[start + i + 1];
    }
    out[size] = 0;
}
/*
 * Extracts the index-th value of POSTREQ into out: the text after the
 * index-th '=' up to the character before the index-th '&'.
 */
void getvalue(char *out, int index) {
    int value_start = findcharn(POSTREQ, index, '=');
    int value_end = findcharn(POSTREQ, index, '&') - 1;

    mid(POSTREQ, out, value_start, value_end);
}
/*
 * Demo: extracts the first value of POSTREQ ("123456") and prints it.
 * Declared as int main(void), the form standard C requires for hosted
 * programs; the unused n_pwd buffer was dropped.
 */
int main(void) {
    char n_d1[7];   /* six value characters plus the terminator */

    getvalue(n_d1, 1);
    printf("Value: %s\n", n_d1);
    return 0;
}
When reading the strtok documentation, I see you need to pass in a NULL pointer after the first "initializing" call. Maybe you didn't do that. Just a guess of course.
Here is another strtok() implementation, which has the ability to recognize consecutive delimiters (standard library's strtok() does not have this)
The function is a part of BSD licensed string library, called zString. You are more than welcome to contribute :)
https://github.com/fnoyanisi/zString
/*
 * strtok()-style tokenizer that reports consecutive delimiters instead of
 * skipping them. Only the first character of delim is used as delimiter.
 *
 * str   - buffer to start on, or 0 to continue after the previous token.
 *         The buffer is modified in place, so it must be writable.
 * delim - delimiter string; 0 makes the function return 0.
 *
 * Returns:
 *   - 0 when delim is 0 or there is nothing left to scan;
 *   - delim itself when the scan position starts with the delimiter;
 *   - otherwise the text up to the next delimiter, or the whole remainder
 *     when no delimiter is left.
 * The resume position is kept in a static variable between calls.
 */
char *zstring_strtok(char *str, const char *delim) {
    static char *static_str = 0;   /* where the next call resumes */
    char *cursor;

    if (delim == 0)
        return 0;

    if (str == 0) {
        if (static_str == 0)
            return 0;              /* nothing left from earlier calls */
        str = static_str;
    }

    /* Walk forward until the delimiter or the end of the string. */
    cursor = str;
    while (*cursor != '\0' && *cursor != delim[0])
        cursor++;

    /* No delimiter left: hand back the remainder and stop. */
    if (*cursor == '\0') {
        static_str = 0;
        return str;
    }

    /* The delimiter is the very first character: report the delimiter
     * itself and resume one character further on. */
    if (cursor == str) {
        static_str = str + 1;
        return (char *)delim;
    }

    /* Cut the token off at the delimiter and remember what follows. */
    *cursor = '\0';
    static_str = cursor + 1;
    return str;
}
As mentioned in previous posts, since strtok(), or the one I implemented above, relies on a static char* variable to preserve the location of the last delimiter between consecutive calls, extra care should be taken when dealing with multi-threaded applications.
/*
 * Returns 1 when c is not one of the characters in delim, 0 when it is.
 * The terminating '\0' is never treated as a delimiter.
 */
int not_in_delimiter(char c, char *delim){
    while(*delim != '\0'){
        if(c == *delim) return 0;
        delim++;
    }
    return 1;
}

/*
 * Reentrant tokenizer: returns the next token of a string, where tokens are
 * separated by runs of one or more characters from delimiter.
 *
 * source    - buffer to start tokenizing, or NULL to continue from *last.
 *             The buffer is modified: the first delimiter after each token
 *             is overwritten with '\0'.
 * delimiter - set of delimiter characters.
 * last      - caller-owned cursor. On return it points at the start of the
 *             next token, or is NULL when the returned token was the last.
 *
 * Returns the token, or NULL when source is NULL and *last is NULL.
 * A buffer that starts with a delimiter yields an empty first token;
 * delimiters at the very end of the buffer do not produce an extra token.
 */
char *token_separater(char *source, char *delimiter, char **last){
    char *begin, *next_token;
    char *sbegin;

    /* Get the start of the token. */
    begin = source ? source : *last;
    if(begin == NULL) {
        return NULL;
    }
    sbegin = begin;

    /* Scan until the first delimiter character (or the end of the string). */
    while(*begin != '\0' && not_in_delimiter(*begin, delimiter)){
        begin++;
    }

    /* End of string reached: this is the final token. */
    if(*begin == '\0') {
        *last = NULL;
        return sbegin;
    }

    /* Skip the whole run of delimiters to find where the next token starts. */
    next_token = begin;
    while(*next_token != '\0' && !not_in_delimiter(*next_token, delimiter)) {
        next_token++;
    }

    /* Terminate at the FIRST delimiter so that a run of several delimiters
     * does not leave delimiter characters at the end of the token. */
    *begin = '\0';

    /* Only delimiters were left: there is no further token. */
    *last = (*next_token != '\0') ? next_token : NULL;
    return sbegin;
}
/*
 * Demo: splits "abcb_dccc" on '_' with token_separater() and prints each
 * token on its own line. `last` is the tokenizer's cursor; it becomes NULL
 * after the final token, which ends the loop.
 */
int main(void){
    char string[10] = "abcb_dccc";
    char delim[10] = "_";
    char *last = NULL;
    char *token = token_separater(string, delim, &last);

    while(token != NULL){
        printf("%s\n", token);
        if(last == NULL) {
            break;               /* that was the final token */
        }
        token = token_separater(NULL, delim, &last);
    }
    return 0;
}
You can read detail analysis at blog mentioned in my profile :)

Weird seg faults on consecutive calls to the same array

I tried really hard to search for a solution to this but I can't think of good enough keywords.
Currently I'm having trouble grasping the concept behind makeargv and its usage with triple pointers (I have no idea what ***foo means, it doesn't seem to be as easy of a concept as **foo or *foo). So I made my own:
/*
 * Splits `string` into tokens separated by `delims` (defined outside this
 * snippet), stores the token count in *numargs and returns an array of
 * pointers to the tokens. Works on a heap copy so `string` is not modified.
 */
const char **makeargv(char *string, int *numargs) {
/* Skip leading delimiters. */
string = string + strspn(string, delims);
char *copy = malloc(strlen(string) + 1);
int i;
strcpy(copy, string);
/* First pass: count the tokens.
   NOTE(review): numtokens stays uninitialised when the first strtok call
   returns NULL. */
int numtokens;
if (strtok(copy, delims) != NULL) {
for (numtokens = 1; strtok(NULL, delims) != NULL; numtokens++) {}
}
/* The counting pass overwrote delimiters with '\0'; restore the copy. */
strcpy(copy, string);
/* NOTE(review): `results` is a local (variable-length) array, so the pointer
   returned below refers to storage that ends with this function. */
const char *results[numtokens+1];
/* Second pass: record a pointer to each token inside `copy`. */
results[0] = strtok(copy, delims);
for (i = 1; i < numtokens; i++) {
results[i] = strtok(NULL, delims);
}
/* NOTE(review): the array has numtokens+1 elements, so index numtokens+1 is
   one past its end. */
results[numtokens+1] = NULL;
*numargs = numtokens;
return results;
}
Here's the part at where it breaks:
/*
 * Reads `filename` line by line and prints the first two tokens of every
 * line, using makeargv() to split it. file_open() and file_getline() are
 * helpers that are not shown in this snippet.
 */
void parse_file(char* filename) {
char* line = malloc(160*sizeof(char));
FILE* fp = file_open(filename);
int i = 0;
int numargs = 0;
int *pointer = &numargs;
/* NOTE(review): `line` is overwritten with file_getline's return value, so
   once that is NULL the free(line) below no longer sees the malloc'ed
   buffer - confirm what file_getline returns. */
while((line = file_getline(line, fp)) != NULL) {
/* Skip lines that are one character long (presumably just the newline). */
if (strlen(line) == 1){
continue;
}
const char **args = makeargv(line, pointer);
printf("%s\n", args[0]);
printf("%s\n", args[1]);
/* This prints out args[0], but then args[1] causes a seg fault. Even if I replace
the args[1] with another args[0] it still causes a seg fault */
}
fclose(fp);
free(line);
}
I have a working array of strings. However when I try to print out the strings in the array, I can only print 1 of my choice and then it seg faults for any subsequent calls. lets pretend my array of strings is argv[3] = {"Yes", "no", "maybe"}, if i call argv[0], it will let me call "Yes", but any other calls (even if i call argv[0] again) do not work and cause a segfault. I can call any of the elements in the array, but once i call one the rest cease to work causing segfaults.
Help please? D: This is in C.
const char *results[numtokens+1];
This array "results" is a local variable, it is only available inside of "makeargv".
You'd better use malloc:
results = malloc((numtokens + 1) * sizeof(char *)); /* room for numtokens+1 pointers, not bytes */
And I believe there is memory leak in your code.
You will not be able to free the memory for "char *copy"
char *copy = malloc(strlen(string) + 1);
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Splits string into whitespace-separated tokens (space, tab, newline).
 *
 * string  - text to split; it is not modified, a heap copy is tokenized.
 * numargs - receives the number of tokens found (0 on failure).
 *
 * Returns a malloc'ed, NULL-terminated array of token pointers, or NULL if
 * an allocation fails.
 *
 * Ownership: all tokens live in one heap copy of the input. Because leading
 * delimiters are skipped before copying, the first token starts at the
 * beginning of that copy, so the caller releases everything with
 * free(result[0]); free(result);. When there are no tokens, the copy is
 * freed here and result[0] is NULL.
 */
char **makeargv(char *string, int *numargs) {
    static const char *delims = " \t\n";
    size_t len;
    char *copy, *p;
    char **results;
    int numtokens = 0;
    int i;

    *numargs = 0;

    string = string + strspn(string, delims);   /* skip leading delimiters */
    len = strlen(string);

    copy = malloc(len + 1);
    if (copy == NULL)
        return NULL;
    memcpy(copy, string, len + 1);

    /* First pass: count the tokens (this scribbles '\0' over delimiters). */
    for (p = copy; strtok(p, delims) != NULL; p = NULL)
        ++numtokens;

    results = malloc(sizeof *results * ((size_t)numtokens + 1));
    if (results == NULL) {
        free(copy);
        return NULL;
    }

    /* Restore the copy, then record a pointer to each token. */
    memcpy(copy, string, len + 1);
    for (i = 0, p = copy; i < numtokens; ++i, p = NULL)
        results[i] = strtok(p, delims);
    results[i] = NULL;

    /* With no tokens nothing points into the copy, so the caller could
     * never free it; release it here instead of leaking it. */
    if (numtokens == 0)
        free(copy);

    *numargs = numtokens;
    return results;
}
/*
 * Opens filename for reading and returns the stream.
 * On failure it reports the error with perror("file_open") and terminates
 * the program with exit status 1, so callers never see a NULL stream.
 */
FILE *file_open(char *filename){
    FILE *stream = fopen(filename, "r");

    if (stream != NULL) {
        return stream;
    }

    perror("file_open");
    exit(1);
}
/*
 * Reads filename line by line, splits every non-blank line into tokens with
 * makeargv() and prints one token per line, followed by an empty line after
 * each input line.
 *
 * Lines are read into a fixed 160-byte buffer, so a longer input line is
 * processed in several pieces. The buffer lives on the stack: there is no
 * allocation that could fail or leak.
 */
void parse_file(char* filename) {
    char line[160];
    FILE* fp = file_open(filename);
    int i = 0, numargs = 0;

    while(fgets(line, sizeof line, fp)){
        if (*line == '\n')
            continue;                      /* skip blank lines */

        char **args = makeargv(line, &numargs);
        if (args == NULL)
            continue;                      /* defensive: nothing to print or free */

        for(i = 0;i<numargs;++i)
            printf("%s\n", args[i]);
        printf("\n");

        /* args[0] is the start of the token storage; free(NULL) is a no-op,
         * so no guard is needed when the line held no tokens. */
        free(args[0]);
        free(args);
    }
    fclose(fp);
}
/*
 * Entry point: parses the file named by the first command-line argument.
 * Prints a usage message and returns 1 when no file name is given, instead
 * of passing a NULL argv[1] on to parse_file().
 */
int main(int argc, char *argv[]){
    if (argc < 2) {
        fprintf(stderr, "usage: %s FILE\n", argc > 0 ? argv[0] : "parse_file");
        return 1;
    }
    parse_file(argv[1]);
    return 0;
}

How can i split a string into tokens by '&'

I'm trying to split a string into tokens by ';'. But I have a problem: some tokens are empty/null, for example: 123;123132;;;232;232323;;;;1;
So I can't use strtok because it merges adjacent delimiters. I saw that you posted this solution:
include <string.h>
char *data = "this&&that&other";
char *next;
char *curr = data;
while ((next = strchr(curr, '&')) != NULL) {
/* process curr to next-1 */
curr = next + 1;
}
/* process the remaining string (the last token) */
But I don't understand, because when I do next-1 to get the first value I only get the first word of the value, not the whole value.
Can you help me?, do you have any idea how to split this?
I'm programming in ANSI C. I saw in another post that there is a strsep function that seems to be exactly what I need, but this function is not included in the ANSI C library.
Thanks and sorry for my english :)
I think this is what you want :-
#include <stddef.h>
#include <string.h>
#include <stdio.h>
/*
 * Portable stand-in for strsep(): returns the next token of *input without
 * merging adjacent delimiters, so empty tokens are reported as "".
 *
 * input - address of the scan position; advanced past the delimiter that
 *         ended the token, or set to NULL after the last token.
 * delim - set of delimiter characters.
 *
 * Returns the token (the buffer is cut in place with '\0'), or NULL when
 * *input is already NULL.
 */
char* mystrsep(char** input, const char* delim)
{
    char *token = *input;
    char *cut;

    if (token == NULL) {
        *input = NULL;
        return NULL;
    }

    cut = strpbrk(token, delim);
    if (cut != NULL) {
        *cut = '\0';
        *input = cut + 1;
    } else {
        *input = NULL;       /* no delimiter left: this is the last token */
    }
    return token;
}
/*
 * Demo: splits a ';'-separated record with mystrsep() and prints every
 * field, including the empty ones, on its own line.
 */
int main()
{
    char str[] = "123;123132;;;232;232323;;;;1;";
    const char delimiters[] = ";";
    char* ptr = str;       /* scan position, advanced by mystrsep */
    char* token;

    for (token = mystrsep(&ptr, delimiters);
         token;
         token = mystrsep(&ptr, delimiters))
    {
        printf("%s\n",token);
    }
    return 0;
}
#include <stdio.h>
#include <string.h>
/*
 * strtok_r() variant that does not skip leading delimiters, so adjacent
 * delimiters produce empty tokens.
 *
 * str    - buffer to start on, or NULL to continue from *store.
 * delims - set of delimiter characters.
 * store  - caller-owned scan position; set to NULL after the last token.
 *
 * Returns the next token (cut in place with '\0'), or NULL when the scan
 * position is NULL or already at the end of the string - a delimiter at the
 * very end therefore does not yield a trailing empty token.
 */
char *strtok_r_noskip(char *str, const char *delims, char **store){
    char *token, *cut;

    if (str != NULL) {
        *store = str;
    }

    token = *store;
    if (token == NULL || *token == '\0') {
        return NULL;
    }

    cut = strpbrk(token, delims);
    if (cut == NULL) {
        *store = NULL;        /* remainder is the last token */
    } else {
        *cut = '\0';
        *store = cut + 1;
    }
    return token;
}
/*
 * Demo: tokenizes two sample strings with strtok_r_noskip() and prints each
 * token in double quotes so that empty tokens are visible.
 */
int main(void){
    char *store;
    char *token;

    char data1[] = "this&&that&other";
    token = strtok_r_noskip(data1, "&", &store);
    while (token) {
        printf("\"%s\"\n", token);
        token = strtok_r_noskip(NULL, "&", &store);
    }
    /* output
    "this"
    ""
    "that"
    "other"
    */

    char data2[] = "123;123132;;;232;232323;;;;1;";
    token = strtok_r_noskip(data2, ";", &store);
    while (token) {
        printf("\"%s\"\n", token);
        token = strtok_r_noskip(NULL, ";", &store);
    }
    /* output
    "123"
    "123132"
    ""
    ""
    "232"
    "232323"
    ""
    ""
    ""
    "1"
    */
    return 0;
}

Resources