I can give my program input files such as the following:
1 0 0 0 2 -1
16 70 -169 -580 75
1 0 4 0 -5
0 -9 3 5 -3
5 -4 3 -2 0
1.0 -3.4 5.4531 -4.2077 1.5092 -0.2030
Each of these lines represents a polynomial. For instance, the first line represents x^6 + 2x^2 - 1
I am trying to read in this file but am not sure how to deal with the tabs and new lines. I am also not sure how to handle the fact that the amount of coefficients and polynomials can change.
Right now I have:
/*
 * readPolyFile: skeleton for reading a list of polynomials from polyFile.
 *
 * Allocates the initial array of polynomial pointers and returns it; the
 * loop that actually reads the file has not been written yet.
 * Exits with status -99 if the array cannot be allocated.
 */
polynomial** readPolyFile(FILE *polyFile){
    polynomial *p = NULL;
    int size = 1; /* Size to malloc */
    polynomial **polyList = NULL;

    polyList = malloc(sizeof *polyList * size); /* Initialize */
    if(polyList == NULL){
        /* %d prints the line number; %n would expect an int* to write through. */
        fprintf(stderr, "%s %d: Could not allocate memory\n", __FILE__, __LINE__);
        exit(-99);
    }
    /* Read all of the data from the file */
    /* TODO: read loop still to be written; p and polyFile are unused until then. */
    (void)p;
    (void)polyFile;
    return polyList;
}
My initial thought is to increment the size each time I need to, but I'm not sure if that's the best way to go about this.
My definition of polynomial is as follows:
typedef struct
{
unsigned int nterms; /* number of terms */
double complex *polyCoef; /* coefficients */
} polynomial;
I would like to return a list of polynomial structs. Any suggestions on how to go about this?
Any suggestions on how to go about this?
Read inputs into a linked list and then form the array when done.
Some quick code, lacking needed error checking, to get you started.
Set up an empty linked-list
/* Singly linked list node that holds one parsed polynomial. */
typedef struct LL {
polynomial *poly; /* polynomial stored in this node */
struct LL *next; /* following node, or NULL at the end of the list */
} LL;
/* Dummy first node; the entries that hold data start at head.next. */
LL head = { NULL, NULL };
/* Cursor used while building the list; starts at the dummy node. */
LL *p = &head;
Inside the loop, read a line into a buffer, parse it into a polynomial, and append it to the linked list.
#define N 1000
size_t count = 0;
char buf[N];
/* Read all of the data from the file, one polynomial per line */
while (fgets(buf, sizeof buf, polyFile)) {
    /* Append a fresh node after the current tail (p). */
    LL *node = malloc(sizeof *node);
    if (node == NULL) {
        fprintf(stderr, "%s %d: Could not allocate memory\n", __FILE__, __LINE__);
        exit(-99);
    }
    p->next = node;
    p = node;
    // TBD code for OP.
    // Hint: degree of polynomial < sizeof buf/2
    p->poly = polyList_from_line(buf); /* parse the line that was just read */
    p->next = NULL;
    count++;
}
Allocate for the array
polyList = malloc(sizeof *polyList * count);
p = head.next;
for (i=0; i< count; i++) {
assert(p);
polylist[i] = p->poly;
next = p->next;
free(p);
p = next;
}
return polylist;
Sub problem: More pseudo code to read tab separated data
/*
 * polyList_from_line (pseudo code): split one input line into coefficient
 * tokens and build a polynomial from them.
 * buffer is modified in place by strtok().
 */
polynomial *polyList_from_line(char *buffer) {
    double complex coef[some factor of N];
    size_t count = 0;
    /* split on any whitespace; fgets() leaves the trailing newline in buffer */
    char *token = strtok(buffer, " \t\r\n");
    while (token && count < sizeof coef / sizeof coef[0]) {
        // parse the token (not the whole buffer) for a complex number
        if (sscanf(token, tbd_format, tbd_variables) != expected_result)
            break;
        coef[count++] = ...
        token = strtok(NULL, " \t\r\n"); /* NULL continues with the same line */
    }
    // Malloc polynomial using count
    // Populate polynomial from coef[]
    // return allocation
}
Related
I am trying to add a new node to my linked list, but it gives a memory error.
my struct and global vars:
/* One entry of a singly linked list of words. */
typedef struct word word;
struct word
{
char str[256]; /* the word text, stored inline */
word *next; /* following entry in the list */
};
/* First entry of the list; NULL while the list is empty. */
word *head = NULL;
/* Most recently added entry; NULL while the list is empty. */
word *cur = NULL;
the function :
/*
 * addWord: copy str into a newly allocated entry and append it to the
 * global list (head / cur).
 * Returns 1 on success, 0 if the allocation fails.
 */
int addWord(char * str)
{
/* NOTE(review): sizeof(w) is the size of the pointer w, not of the word
   struct it points to, so the block allocated here is smaller than the
   struct that the strcpy() and ->next writes below assume. */
word *w = calloc(1, sizeof(w));
if(w == NULL)
{
return 0;
}
/* No length check: str is copied as-is into the 256-byte str member. */
strcpy(w->str, str);
if(cur == NULL)
{
/* first entry: it is both the head and the tail */
cur = w;
head = w;
}
else
{
/* the puts() calls are progress markers placed around the two assignments */
puts("4");
cur->next = w;
puts("5");
cur = w;
puts("6");
}
return 1;
}
and the result is :
...
4
5
6
4
==73913== Invalid write of size 8
==73913== at 0x109425: addWord (in /home/mz37/programming/godaphy/bin/godaphy.out)
==73913== by 0x109696: parseLine (in /home/mz37/programming/godaphy/bin/godaphy.out)
==73913== by 0x109351: main (in /home/mz37/programming/godaphy/bin/godaphy.out)
==73913== Address 0x4a6a880 is 96 bytes inside an unallocated block of size 4,188,096 in arena "client"
==73913==
5
6
i am still searching for the error and i haven't found it yet
word *w = calloc(1, sizeof(w));
The w variable is of type word pointer hence is likely to be four or eight bytes at most. It may be larger if we end up with 128-bit machines at some point, but it'll be quite some time before it gets to 2000+ bits :-)
You probably wanted to do:
word *w = calloc(1, sizeof(*w));
// note this ___^
The type of *w is the actual type word, and that will be the correct size for what you're trying to do.
And, as an aside, you may want to think about the wisdom of blindly copying whatever string you're given, into a block of memory that can only hold 256 characters. A safer alternative would be:
strncpy(w->str, str, sizeof(w->str) - 1);
// Would normally also do something like:
// w->str[sizeof(w->str) - 1] = '\0';
// but calloc() makes that superfluous.
The resultant function (including compactifying) would be along the following lines:
/*
 * addWord: append a copy of str to the global word list.
 * Text that does not fit in the entry's str member is truncated.
 * Returns 1 on success, 0 if no memory is available.
 */
int addWord(char *str) {
    /* calloc zero-fills the entry, so str stays terminated and next is NULL. */
    word *entry = calloc(1, sizeof *entry);

    if (entry == NULL)
        return 0;

    /* To reject over-long input instead of truncating it, test
       strlen(str) >= sizeof(entry->str) before allocating and return 0. */
    strncpy(entry->str, str, sizeof(entry->str) - 1);

    /* Link after the current tail, or start the list if there is none. */
    if (cur != NULL)
        cur->next = entry;
    else
        head = entry;
    cur = entry;

    return 1;
}
I'm trying to code a function in C! Not C++. I couldn't figure out how to actually go about it so this is just an outline of what I would like to do. The purpose of this function is to save the top 10 high scores to a txt file. It needs to have the winners name and amount of wins next to their name. Each time a round is finished, the winner will get their name printed into the file with the total amount of times they've won. If a returning user wins, their name should only appear once. Below is an example of how the txt file should look when displayed and what I'm trying to go for code-wise.
EX.
1 Joe 10
2 Jen 8
3 Bob 7
4 Caleb 6
5 Lance 5
6 Siobhan 3
7 Laurel 2
8 Jack 2
9 Gabriel 1
10 Timmy 1
Outline of function
You need a data structure to represent each entry. The simplest that comes to mind is something like this
struct record_entry{
char name[BUFLEN];
int score;
};
Then you need another data structure to store these record_entry objects. Ideally, you would use something like a C++ map or a Java HashMap (i.e. a hash table) to store all the entries, using the name field as the key. This, however, requires you to implement your own hash table; if you do not want to do that, you can use C arrays.
Use the standard library's qsort function to sort the entries in the array. A simple comparison function for struct record_entry could be
int cmp_score(const void* e1, const void* e2){
int s1, s2;
s1 = ((struct record_entry*)e1)->score;
s2 = ((struct record_entry*)e2)->score;
if (s1 < s2)
return -1;
else if (s1 == s2)
return 0;
else
return 1;
}
Then, you need a function to update your data structure (hash table or array) based on the new entry. You should implement something similar to insert_or_assign() method of std::map (C++17) or put() method of Java's HashMap class. Below is a simple function that achieves this
/*
 * update_records: insert ne into the array *ep of n entries, or overwrite
 * the score of the entry that already carries ne.name, then re-sort the
 * array by ascending score.
 *
 * ep  in/out: pointer to the heap array; it may be moved by realloc().
 * n   number of entries currently in *ep.
 * ne  entry to insert or whose score replaces the stored one.
 *
 * Returns the new number of entries: n when an existing entry was updated,
 * n + 1 when ne was appended. If the array cannot be grown, *ep is left
 * unchanged and n is returned.
 */
size_t update_records(struct record_entry **ep, size_t n, struct record_entry ne){
    struct record_entry *recs = *ep;
    size_t count = n;
    size_t i;

    /* check if the entry is already there */
    for (i = 0; i < n; i++){
        if (strcmp(recs[i].name, ne.name) == 0)
            break;
    }

    if (i < n){
        /* update the record */
        recs[i].score = ne.score;
    } else {
        /* add a new entry; keep the old block valid if realloc() fails */
        struct record_entry *grown = realloc(recs, (n + 1) * sizeof *grown);
        if (grown == NULL)
            return n;
        recs = grown;
        strncpy(recs[n].name, ne.name, sizeof recs[n].name - 1);
        recs[n].name[sizeof recs[n].name - 1] = '\0'; /* strncpy may not terminate */
        recs[n].score = ne.score;
        count = n + 1;
    }

    /* sort the array */
    qsort(recs, count, sizeof *recs, cmp_score);
    *ep = recs;
    return count;
}
update_records returns the size of the new array: n+1 if the entry you pass as an argument is not in the array (i.e. it is a new record), or n if it is already there. In the latter case it updates the existing record.
The code snippet below does something similar to what you want to achieve, simply get the data from your text file and add error handling where appropriate. The logic is the same though
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUFLEN 32 /* size of the name buffer, terminator included */
#define N 4       /* number of sample entries created by main() */

/* One high-score record: a player name and that player's score. */
struct record_entry{
    char name[BUFLEN];
    int score;
};

/*
 * cmp_score: qsort() comparator that orders record_entry elements by
 * ascending score. Returns -1, 0 or 1.
 */
int cmp_score(const void* e1, const void* e2){
    const struct record_entry *lhs = e1;
    const struct record_entry *rhs = e2;

    return (lhs->score > rhs->score) - (lhs->score < rhs->score);
}

/*
 * update_records: insert ne into the array *ep of n entries, or overwrite
 * the score of the entry that already carries ne.name, then re-sort the
 * array by ascending score.
 *
 * ep  in/out: pointer to the heap array; it may be moved by realloc().
 * n   number of entries currently in *ep.
 * ne  entry to insert or whose score replaces the stored one.
 *
 * Returns the new number of entries: n when an existing entry was updated,
 * n + 1 when ne was appended. If the array cannot be grown, *ep is left
 * unchanged and n is returned.
 */
size_t update_records(struct record_entry **ep, size_t n, struct record_entry ne){
    struct record_entry *recs = *ep;
    size_t count = n;
    size_t i;

    /* check if the entry is already there */
    for (i = 0; i < n; i++){
        if (strcmp(recs[i].name, ne.name) == 0)
            break;
    }

    if (i < n){
        /* update the record */
        recs[i].score = ne.score;
    } else {
        /* add a new entry; keep the old block valid if realloc() fails */
        struct record_entry *grown = realloc(recs, (n + 1) * sizeof *grown);
        if (grown == NULL)
            return n;
        recs = grown;
        strncpy(recs[n].name, ne.name, sizeof recs[n].name - 1);
        recs[n].name[sizeof recs[n].name - 1] = '\0'; /* strncpy may not terminate */
        recs[n].score = ne.score;
        count = n + 1;
    }

    /* sort the array */
    qsort(recs, count, sizeof *recs, cmp_score);
    *ep = recs;
    return count;
}
/*
 * main: demo driver. Builds N sample records, prints them from highest to
 * lowest score, reads one name/score pair from stdin, applies it with
 * update_records() and prints the table again.
 * Returns 0 on success, 1 on allocation failure or unreadable input.
 */
int main(){
    struct record_entry *entries, new_entry;
    char name_fmt[16];
    size_t ns;
    int i;

    entries = malloc(N * sizeof *entries);
    if (entries == NULL){
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    /* sample data: Test0..Test<N-1>, each scored with its own index */
    for (i = 0; i < N; i++){
        snprintf(entries[i].name, sizeof entries[i].name, "Test%d", i);
        entries[i].score = i;
    }

    /* sort the array */
    qsort(entries, N, sizeof(struct record_entry), cmp_score);

    printf("before\n");
    printf("===============\n");
    for (i=N-1; i>=0; i--){
        printf("%d %s\t%d\n",i ,entries[i].name, entries[i].score);
    }

    /* build "%<BUFLEN-1>s" so scanf cannot write past new_entry.name */
    snprintf(name_fmt, sizeof name_fmt, "%%%ds", BUFLEN - 1);

    printf("\nname : ");
    if (scanf(name_fmt, new_entry.name) != 1){
        fprintf(stderr, "could not read a name\n");
        free(entries);
        return 1;
    }
    printf("score : ");
    if (scanf("%d",&new_entry.score) != 1){
        fprintf(stderr, "could not read a score\n");
        free(entries);
        return 1;
    }
    getc(stdin); /* consume the newline left behind by scanf */

    ns = update_records(&entries, N, new_entry);

    printf("\nafter return\n");
    printf("===============\n");
    for (i=(int)ns-1; i>=0; i--){
        printf("%d %s\t%d\n",i ,entries[i].name, entries[i].score);
    }

    free(entries);
    return 0;
}
When I run this code with a new entry
before
===============
3 Test3 3
2 Test2 2
1 Test1 1
0 Test0 0
name : test
score : 7
after return
===============
4 test 7
3 Test3 3
2 Test2 2
1 Test1 1
0 Test0 0
and when I update an existing entry
before
===============
3 Test3 3
2 Test2 2
1 Test1 1
0 Test0 0
name : Test2
score : 9
after return
===============
3 Test2 9
2 Test3 3
1 Test1 1
0 Test0 0
So I have files formatted as follows:
2
4 8 4 10 6
9 6 74
The first line is actually the number of rows that the file will have after it. I want to read the files line by line (note there are different number of tokens in each line but all have the format: 1 token and then an unspecified number of pairs of tokens) and do two things for each line:
1) Know how many tokens are in this line.
2) Assign each token to a variable. Using structures similar to:
typedef struct {
unsigned start; //start node of a graph
unsigned end; // end node of a graph
double weight; //weight of the edge going from start to end
} edge ;
typedef struct {
unsigned id; // id of the node
unsigned ne; // number of edges adjacent to node
edge *edges; // array of edge to store adjacent edges of this node
} node;
Some code:
FILE *fin;
unsigned nn;
node *nodes;
fin = fopen ("input.txt", "r");
fscanf(fin,"%u\n", &nn);
nodes = malloc(nn*sizeof(node));
for(i=0; i < nn; i++) { //loop through all the rows
/*grab the row and split in parts, let's say they are part[0], part[1]... */
/*and there are N tokens in the row*/
nodes[i].id=part[0];
nodes[i].ne=(N-1)/2; //number of pairs excluding first element
nodes[i].edges=malloc( (N-1)/2)*sizeof(edge) );
for(j=0; j< (N-1)/2; j++){
nodes[i].edges[j].start=part[0];
nodes[i].edges[j].end=part[2*j+1];
nodes[i].edges[j].weight=part[2*j+2];
}
}
I need to figure out how to do the part commented inside the first for loop: getting the number of tokens in the line, and each token as a single value to assign. Any ideas?
EDIT: to make things clear, each line will have first one integer, and then a variable number of pairs. I want to store data as follows:
if the file reads
2
4 8 4 10 6 //(2 pairs)
9 6 74 //(1 pair)
then
nn=2;
node[0].id=4;
node[0].ne=2; //(2 pairs)
node[0].(*edges) //should be a vector of dimension ne=2 containing elements of type edge
node[0].edges[0].start=4; //same as node[0].id
node[0].edges[0].end=8;
node[0].edges[0].weight=4;
node[0].edges[1].start=4; //same as node[0].id
node[0].edges[1].end=10;
node[0].edges[1].weight=6;
node[1].id=9;
node[1].ne=1; //(1 pair)
node[1].(*edges) //should be a vector of dimension ne=1 containing elements of type edge
node[1].edges[0].start=9; //same as node[1].id
node[1].edges[0].end=6;
node[1].edges[0].weight=74;
This code produces the results you described, It initializes your nested struct member edge, and uses strtok. With strtok(), I included the \n as part of the delimiter in addition to a space " \n" to prevent the newline from giving us trouble (see other comments on that below)
Note: you have to free memory where I have indicated, but before you do, preserve the intermediate results (in the structs) or it will be lost.
#include <ansi_c.h>
/* One weighted edge of the graph, stored with the node it starts from. */
typedef struct {
unsigned start; /* id of the node the edge starts at */
unsigned end; /* id of the node the edge leads to */
double weight; /* weight of the edge going from start to end */
} edge ;
/* One graph node together with the edges read from its input line. */
typedef struct {
unsigned id; /* id of the node (first token of the line) */
unsigned ne; /* number of entries in edges */
edge *edges; /* heap array of ne edges */
} node;
int GetNumPairs(char *buf);
/*
 * main: read the node file. The first line holds the number of nodes; each
 * following line holds a node id followed by (end, weight) pairs.
 * Lines that do not have that shape are reported on stderr and leave their
 * node with no edges.
 * Returns 0 on success, 1 if the file cannot be opened, the node count is
 * missing or invalid, or memory runs out.
 */
int main(void)
{
    static const char delims[] = " \t\r\n\v\f"; /* same separators as GetNumPairs */
    FILE *fp;
    char lineBuf[260];
    char countPairsBuf[260];
    node *pN;
    int nn; //number of nodes
    int i, j;
    int status = 0;

    fp = fopen("C:\\dev\\play\\numbers.txt", "r");
    if(fp == NULL)
    {
        perror("C:\\dev\\play\\numbers.txt");
        return 1;
    }

    //get first line of file for nn:
    if(fgets(lineBuf, sizeof(lineBuf), fp) == NULL || (nn = atoi(lineBuf)) <= 0)
    {
        fprintf(stderr, "missing or invalid node count\n");
        fclose(fp);
        return 1;
    }

    //create zero-filled array of node with [nn] elements
    pN = calloc((size_t)nn, sizeof *pN);
    if(pN == NULL)
    {
        fprintf(stderr, "out of memory\n");
        fclose(fp);
        return 1;
    }

    //read rest of lines, (2 through end), never more than nn of them
    i = 0;
    while(i < nn && fgets(lineBuf, sizeof(lineBuf), fp))
    {
        node *cur = &pN[i++];
        char *tok;
        int pairs;

        //get number of pairs in the line; strtok() below destroys lineBuf,
        //so the count is taken from a copy
        strcpy(countPairsBuf, lineBuf);
        pairs = GetNumPairs(countPairsBuf);

        //get first item in new line as node id and "start"
        tok = strtok(lineBuf, delims);
        if(pairs < 0 || tok == NULL)
        {
            //error, file line did not contain odd number of elements
            fprintf(stderr, "line %d: expected an id followed by end/weight pairs\n", i + 1);
            continue;
        }
        cur->id = (unsigned)strtoul(tok, NULL, 10);
        if(pairs == 0)
            continue; //node without edges

        //allocate *edges struct element
        cur->edges = malloc((size_t)pairs * sizeof *cur->edges);
        if(cur->edges == NULL)
        {
            fprintf(stderr, "out of memory\n");
            status = 1;
            break;
        }

        //now get rest of pairs
        for(j = 0; j < pairs; j++)
        {
            char *endTok = strtok(NULL, delims);
            char *weightTok = strtok(NULL, delims);
            if(endTok == NULL || weightTok == NULL)
                break; //fewer tokens than counted; keep what was parsed
            cur->edges[j].start = cur->id;
            cur->edges[j].end = (unsigned)strtoul(endTok, NULL, 10);
            cur->edges[j].weight = strtod(weightTok, NULL); //weight is a double
        }
        cur->ne = (unsigned)j;
    }

    //use pN[0] .. pN[i-1] here, before the memory is released

    for(j = 0; j < i; j++)
        free(pN[j].edges);
    free(pN);
    fclose(fp);
    return status;
}
//GetNumPairs
/*
 * GetNumPairs: count the whitespace-separated tokens in buf and return how
 * many (end, weight) pairs follow the leading id.
 *
 * A valid line has an odd number of tokens: one id plus zero or more pairs.
 * Returns the number of pairs, or -1 if the line is empty or has an even
 * number of tokens. buf is not modified.
 *
 * Tokens are counted, not separators, so repeated spaces, a missing final
 * newline, and '.' or '-' inside a number do not change the result.
 */
int GetNumPairs(char *buf)
{
    const unsigned char *p; /* unsigned char keeps isspace() well defined */
    int numTokens = 0;
    int inToken = 0;

    for(p = (const unsigned char *)buf; *p != '\0'; p++)
    {
        if(isspace(*p))
        {
            inToken = 0;
        }
        else if(!inToken)
        {
            /* first character of a new token */
            inToken = 1;
            numTokens++;
        }
    }
    //if odd number of tokens, return number of pairs, else error
    return (numTokens % 2 == 1) ? ((numTokens - 1) / 2) : (-1);
}
I am trying to write a function to clean up the hash table that is generated by this code
/*
* Markov chain random text generator.
*/
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "eprintf.h"
enum {
NPREF = 2, /* number of prefix words */
NHASH = 4093, /* size of state hash table array */
MAXGEN = 10000 /* maximum words generated */
};
typedef struct State State;
typedef struct Suffix Suffix;
struct State { /* prefix + suffix list */
char* pref[NPREF]; /* prefix words */
Suffix* suf; /* list of suffixes */
State* next; /* next in hash table */
};
struct Suffix { /* list of suffixes */
char* word; /* suffix */
Suffix* next; /* next in list of suffixes */
};
State* lookup(char *prefix[], int create);
void build(char *prefix[], FILE*);
void generate(int nwords);
void add(char *prefix[], char *word);
State* statetab[NHASH]; /* hash table of states */
char NONWORD[] = "\n"; /* cannot appear as real word */
/* markov main: markov-chain random text generation */
/* markov main: markov-chain random text generation */
/* reads words from stdin, builds the prefix table, prints up to MAXGEN words */
int main(void)
{
    char *prefix[NPREF]; /* current input prefix */
    int i;

    setProgName("markov");
    srand((unsigned int) time(NULL)); /* srand takes unsigned; convert explicitly */

    for (i = 0; i < NPREF; i++) /* set up initial prefix */
        prefix[i] = NONWORD;
    build(prefix, stdin);
    add(prefix, NONWORD); /* mark the end of the input */
    generate(MAXGEN);
    return 0;
}
const int MULTIPLIER = 31; /* for hash() */

/* hash: fold every byte of the NPREF prefix strings into one value */
/* and reduce it to a bucket index in the range 0..NHASH-1 */
unsigned int hash(char* s[NPREF])
{
    unsigned int sum = 0;
    int w;

    for (w = 0; w < NPREF; w++) {
        /* bytes are read as unsigned char */
        const unsigned char *ch = (const unsigned char *) s[w];

        while (*ch != '\0') {
            sum = MULTIPLIER * sum + *ch;
            ch++;
        }
    }
    return sum % NHASH;
}
/* lookup: search for prefix; create if requested. */
/* returns pointer if present or created; NULL if not. */
/* creation doesn't strdup so strings mustn't change later. */
/* lookup: find the state for prefix, optionally creating it. */
/* Returns the existing or new state, or NULL when it is absent and */
/* create is 0. A new state stores the prefix pointers themselves, */
/* so those strings must not change afterwards. */
State* lookup(char *prefix[NPREF], int create)
{
    State *sp;
    int i;
    int h = hash(prefix);

    /* walk the bucket's chain looking for a state whose words all match */
    sp = statetab[h];
    while (sp != NULL) {
        i = 0;
        while (i < NPREF && strcmp(prefix[i], sp->pref[i]) == 0)
            i++;
        if (i == NPREF) /* found it */
            return sp;
        sp = sp->next;
    }

    if (!create)
        return NULL;

    /* not present: make a new state and push it on the front of the chain */
    sp = (State *) emalloc(sizeof(State));
    for (i = 0; i < NPREF; i++)
        sp->pref[i] = prefix[i];
    sp->suf = NULL;
    sp->next = statetab[h];
    statetab[h] = sp;
    return sp;
}
/* addsuffix: add to state. suffix must not change later */
/* addsuffix: push suffix on the front of sp's suffix list. */
/* The pointer is stored as given, so the string must not change later. */
void addsuffix(State *sp, char *suffix)
{
    Suffix *node = (Suffix *) emalloc(sizeof(Suffix));

    node->word = suffix;
    node->next = sp->suf; /* old list becomes the tail */
    sp->suf = node;
}
/* add: add word to suffix list, update prefix */
/* add: record suffix as a follower of prefix, then slide the prefix */
/* window forward by one word so that it ends with suffix */
void add(char *prefix[NPREF], char *suffix)
{
    int i;

    addsuffix(lookup(prefix, 1), suffix); /* lookup creates the state if not found */

    /* move the words down the prefix */
    for (i = 0; i < NPREF-1; i++)
        prefix[i] = prefix[i+1];
    prefix[NPREF-1] = suffix;
}
/* build: read input, build prefix table */
/* build: read whitespace-separated words from f and add each one to the */
/* prefix table; every word is copied with estrdup before it is stored */
void build(char *prefix[NPREF], FILE *f)
{
    char buf[100], fmt[10];

    /* create a format string; %s could overflow buf */
    /* %d needs an int argument, so the size_t value is converted first */
    snprintf(fmt, sizeof fmt, "%%%ds", (int) (sizeof(buf) - 1));
    while (fscanf(f, fmt, buf) == 1)
        add(prefix, estrdup(buf));
}
/* generate: produce output, one word per line */
/* generate: print up to nwords words, one per line, starting from the */
/* all-NONWORD prefix and stopping early when NONWORD is chosen */
void generate(int nwords)
{
    char *prefix[NPREF], *w;
    State *sp;
    Suffix *suf;
    int i, k, nmatch;

    for (i = 0; i < NPREF; i++) /* reset initial prefix */
        prefix[i] = NONWORD;

    for (i = 0; i < nwords; i++) {
        sp = lookup(prefix, 0);
        if (sp == NULL)
            eprintf("internal error: lookup failed");

        /* pick one suffix at random: the k-th one seen replaces the */
        /* current choice with probability 1/k */
        nmatch = 0;
        suf = sp->suf;
        while (suf != NULL) {
            nmatch++;
            if (rand() % nmatch == 0)
                w = suf->word;
            suf = suf->next;
        }
        if (nmatch == 0)
            eprintf("internal error: no suffix %d %s", i, prefix[0]);

        if (strcmp(w, NONWORD) == 0)
            break;
        printf("%s\n", w);

        /* slide the prefix window so that it ends with the word just printed */
        for (k = 0; k < NPREF-1; k++)
            prefix[k] = prefix[k+1];
        prefix[NPREF-1] = w;
    }
}
Here is what I have so far for my clean function
/*Clean Function*/
/*
 * clean_up: walk every bucket of statetab and try to free the suffix list
 * of each state. The sp argument is overwritten before it is read.
 */
void clean_up(State *sp)
{
State *temp;
/* temp2 is a pointer; temp3 is a whole Suffix object in automatic storage */
Suffix *temp2, temp3;
for(int h = 0; h < NHASH; h++)
{
for (sp = statetab[h]; sp != NULL; sp = sp->next)
{
while(sp->suf != NULL)
{
temp2= sp->suf;
/* NOTE(review): copies the next node into temp3; this dereferences
   temp2->next even when it is NULL (last node of the list) */
temp3= *temp2->next;
free(temp2);
/* NOTE(review): sp->suf now points at the local temp3, so the next pass
   hands free() an address that did not come from malloc */
sp->suf= &temp3;
}
}
}
}
I think I'm on the right track: I'm going through each index in the hash table, then going from state to state and freeing the suffixes. I'm not sure what to do about the prefixes, because I have to free them before I can free each state. Any help would be greatly appreciated.
In your code, you are copying into a temp3 node, which lives in automatic memory ("on the stack") pointing sp->suf to this memory will (on the next iteration of the loop) cause free to be called with the address of this object (which has not been obtained by malloc, and thus cannot be freed by free() )
/*
 * clean_up: release every State and Suffix node reachable from statetab
 * and leave every bucket empty.
 *
 * sp is not used; the parameter is kept so existing calls still compile.
 * The word strings are not freed: pref[] entries share their strings with
 * suffix words, and NONWORD is a static array rather than heap memory.
 */
void clean_up(State *sp)
{
    (void) sp;

    for (int h = 0; h < NHASH; h++)
    {
        State *state = statetab[h];

        while (state != NULL)
        {
            /* read the links before the node that holds them is freed */
            State *next_state = state->next;
            Suffix *suf = state->suf;

            while (suf != NULL)
            {
                Suffix *next_suf = suf->next;
                free(suf);
                suf = next_suf;
            }
            free(state);
            state = next_state;
        }
        statetab[h] = NULL; /* no dangling chain left in the table */
    }
}
The example code is derived from the Markov program in The Practice of Programming by Kernighan and Pike, a most excellent book.
Given that you are trying to clean up the statetab, the main clean-up function doesn't need any argument. statetab itself is a static array of pointers and must not be freed, but every State reachable from statetab[i] — the first one as well as the auxiliary states chained through its next pointer — was allocated by lookup() and has to be released.
typedef struct State State;
typedef struct Suffix Suffix;
struct State { /* prefix + suffix list */
char* pref[NPREF]; /* prefix words */
Suffix* suf; /* list of suffixes */
State* next; /* next in hash table */
};
struct Suffix { /* list of suffixes */
char* word; /* suffix */
Suffix* next; /* next in list of suffixes */
};
State* statetab[NHASH]; /* hash table of states */
static void free_state(State *state);
static void free_suffix(Suffix *suffix);

/*
 * cleanup: free every State and Suffix node in the hash table and reset
 * each bucket to NULL.
 *
 * Word strings are left alone on purpose: the same string is referenced as
 * a suffix word and as a prefix word of later states, and NONWORD is a
 * static array, so freeing through these pointers would free memory twice
 * or free memory that malloc never returned.
 */
static void cleanup(void)
{
    for (int i = 0; i < NHASH; i++)
    {
        free_state(statetab[i]);
        statetab[i] = NULL;
    }
}

/*
 * free_state: free state and every State chained after it through next,
 * together with their suffix lists. A null pointer is accepted.
 * Iterative, so a long chain does not deepen the call stack.
 */
static void free_state(State *state)
{
    while (state != 0)
    {
        State *next = state->next; /* read before the node is freed */

        free_suffix(state->suf);
        free(state);
        state = next;
    }
}

/*
 * free_suffix: free suffix and every Suffix node after it in the list.
 * A null pointer is accepted. The words are not freed (see cleanup).
 */
static void free_suffix(Suffix *suffix)
{
    while (suffix != 0)
    {
        Suffix *next = suffix->next;

        free(suffix);
        suffix = next;
    }
}
Do you see how I've designed the free_xxxx() code based on the design of the xxxx structure?
Caveat Lector: uncompiled code, much less tested code.
I dug up the code from the TPOP site, and tried to apply it. I made some fixes to the freeing code above (syntax error fixed, the null checks in free_state() and free_suffix()), but the code as a whole was not designed to allow the data to be freed.
There are a couple of problems. First, a few of the prefixes are not allocated (NONWORD). It might be possible to avoid releasing those by testing whether a prefix is NONWORD, but that's nasty. It might be possible to allocate those prefixes too (replace NONWORD by estrdup(NONWORD)). I think there's another place, somewhere, that a non-allocated pointer is being stashed in a prefix in the state table; I'm getting crashes in malloc() complaining of 'freeing non-allocated memory' (which is distinct from 'double freeing allocated memory', I believe), but I've not managed to resolve that.
However, that then changes to another problem: the prefixes are reused. That is, almost every prefix in the system is used as the second word of one prefix, then as the first word of the next prefix. Thus, you can't readily free the prefixes.
If you were to design this so that the memory could be released, then you'd probably design it so that there was a system of 'atoms' (immutable strings) such that each word was allocated once and reused as often as necessary (see C Interfaces and Implementations: Techniques for Creating Reusable Code by D Hanson for the source of the term). The code freeing the state table would then concentrate only on the non-word data. There'd be code to release the complete set of atoms as well.
I ran the Markov program under valgrind without the cleanup; there are no memory access problems and no leaked data; it is all still accessible at program exit. I was using a data file of about 15,000 words (and about 2900 distinct words), and the statistics were:
==9610== HEAP SUMMARY:
==9610== in use at exit: 695,269 bytes in 39,567 blocks
==9610== total heap usage: 39,567 allocs, 0 frees, 695,269 bytes allocated
==9610==
==9610== LEAK SUMMARY:
==9610== definitely lost: 0 bytes in 0 blocks
==9610== indirectly lost: 0 bytes in 0 blocks
==9610== possibly lost: 0 bytes in 0 blocks
==9610== still reachable: 695,269 bytes in 39,567 blocks
So, you set yourself an interesting exercise. However, I think it is not achievable without reworking some of the memory allocation mechanism so that the data can be freed cleanly.
(On BSD, and hence on Mac OS X too, there are a pair of functions in <stdlib.h> called setprogname() and getprogname(). On BSD, setprogname() is called automatically before the main() gets going (with argv[0], I believe). The declaration in eprintf.h conflicts with the declaration in <stdlib.h>, which may be why the code in the question uses setProgName() instead of the original setprogname(). I chose to fix setprogname() in eprintf.h so that it took a const char * argument and therefore matched the declaration in <stdlib.h>.)
TPOP was previously at
http://plan9.bell-labs.com/cm/cs/tpop and
http://cm.bell-labs.com/cm/cs/tpop but both are now (2015-08-10) broken.
See also Wikipedia on TPOP.
I am trying to tokenize a string. I have a table of available tokens ordered in the form of a trie. Each token knows it has children. A simple tokens table will look like,
pattern value has_children
-------- ------ --------
s s-val 1
stack stack-val 0
over over-val 1
overflow overflow-val 0
In this table, stack is a child of s and overflow is a child of over. In practice, this table will have 5000+ records ordered in this way.
Now, given a string stackover, it should output stack-valover-val. Algorithm is greedy and it will try to find the longest match always.
To do this, I will start reading each character from the input, look for match, if a match found and the token has children, look for match again by including next character. Do this until we find the longest match. If no match found, try to match by including the next character until we reach the end of string or a successful match.
If we reached end of the string without a match, output ? symbol and remove the first character from the input. Repeat the whole process with remaining characters.
This algorithm works, but the backtracking and iterating on all possible combinations of the input makes it slow and complex.
I am wondering is there a better way of solving this? Any help would be appreciated.
Instead of backtracking you could keep in memory all possible results, until one result singles out at certain point in input stream. Example
Tokens: S STACK STACKOVERFLOW STAG OVER OVERFLOW
String: SSTACKOVERFUN
1 - Found S on place 0, have tokens that begin with S, try them all, only S is valid, so resolve S
2 - S on 1, have such tokens, try them, possible valid are S and STACK. Don't resolve, just keep them in mind.
3 - T on 2, have no such tokens, so S could be resolved now, but we also have longer token (STACK) so S is no good. Ditch S, and STACK is only left, but it has children. Try string for children. There are no possible children so resolve STACK
4 - O on 6, have such tokens, try them, have only OVER, so resolve OVER
5 - F on 10, no such tokens, and nothing to resolve from before so this is non-tokenizable
6 and 7 - same as step 5
Final result: S STACK OVER fun
Could you use the Aho-Corasick algorithm? It creates an automaton to search a keyword tree (trie).
I'm thinking that you want to take all of your keywords and sort them reverse alphabetically, so your list would become (plus a few extras)
0 stack 1
1 s 0
2 overflow 3
3 over 5
4 ovum 5
5 o 0
6 exchange 7
7 ex 0
The third column of this list are pointers to the parent token which is always lower on the list. Then you can take your target string and binary search where it fits on this list. If it lands above a token which matches then you clip off that portion and repeat the process for the remainder. If it doesn't match you use the parent pointer to find the next longest potential matching token.
If you want to get really fancy you can also chunk up the strings into 64bit words and compare 8 characters at once in the binary search.
I suggest you try Ragel, It can generate efficient scanners that can do longest match/backtracking. See chapter 6.3 in the Ragel user guide for more information.
I've created a tiny test which I think matches your specification, this is only the state machine description, without the code to feed input:
%%{
machine test;
main := |*
's' => { puts("s-val");};
'stack' => { puts("stack-val");};
'over' => { puts("over-val");};
'overflow' => { puts("overflow-val");};
# Anything else matches to any, outputs a '?' and continues
any => {putc('?');};
*|;
}%%
The following token_tree code is based on the prefix_tree class from ZeroMQ
The prefix_tree class only returns "true" when one of the tree's prefixes matches the start of the input text. It will not even tell you which prefix or how long that prefix was.
This token_tree will look for the longest token that matches the start of the input text. The search
function token_tree_longest_token() only needs to return the length of the longest token matched
against the start of the input text.
The basic algorithm is similar to the one described in the question, but its implementation might be faster.
There are also some ways to improve memory usage, which could make it faster.
#include <stdint.h>
#include <stdlib.h>
/* #define TEST_TOKEN_TREE */
/*
* TODO: possible improvements, use multiple types of nodes: string/branch/leaf.
* The string node would replace a chain of normal token_nodes and save memory.
* This would require spliting a node to add branch points.
* Use these structs:
* struct token_node {
* uint32_t ref_count;
* uint8_t node_type; -- node is token_node_str/token_node_branch/token_node_leaf
* };
* struct token_node_str {
* token_node base;
* uint8_t reserved;
* uint16_t len; -- string length
* token_node *child; -- string nodes can only have one child.
* uint8_t str[0]; -- embedded string (not null-terminated)
* };
* struct token_node_branch {
* token_node base;
* uint8_t min; -- smallest char in child list.
* uint16_t count; -- child count.
* token_node *children[0];
* };
* struct token_node_leaf { -- leaf nodes have no children.
* token_node base;
* };
* This will save memory, but will make code much more complex.
*/
typedef struct token_tree token_tree;
typedef struct token_node token_node;

/** Trie of byte-string tokens; answers "longest token at the start of text". */
struct token_tree {
    token_node *root; /**< root node of token tree. */
};

/**
 * One trie node. The child for byte c is children[c - min]; slots inside
 * the range with no child are NULL.
 */
struct token_node {
    uint32_t ref_count; /**< how many token references end at this node. */
    uint8_t min; /**< smallest 'char' in children's list. */
    uint8_t reserved; /**< padding. */
    uint16_t count; /**< number of children. (max count = 256, so count must be 16bits) */
    token_node *children[]; /**< list of children nodes. index by (c - min) */
};

/* bytes needed for a node with room for `count` child slots */
#define NODE_SIZE(count) (sizeof(token_node) + (sizeof(token_node *) * (size_t)(count)))

/** Allocate a zero-filled node with `count` empty child slots; NULL if out of memory. */
static token_node *token_node_new(uint16_t count) {
    token_node *node = calloc(1, NODE_SIZE(count));
    if(node) node->count = count;
    return node;
}

/**
 * Build a single-child chain for token[0..len) and store its first node in
 * *pnode. The last node is marked as the end of a token.
 * If memory runs out the chain stops early: the nodes built so far stay in
 * the tree but no token is registered.
 */
static void token_node_build_chain(token_node **pnode, const uint8_t *token, size_t len) {
    token_node *node;

    for(;;) {
        /* the last node in the chain will have no children. */
        node = token_node_new((len == 0) ? 0 : 1);
        if(node == NULL) return;
        *pnode = node; /* add node to slot in parent's children list. */
        if(len == 0) break;
        /* new node will have one child. */
        node->min = *token;
        /* slot where next node will be saved. */
        pnode = &(node->children[0]);
        /* consume char. */
        token++;
        len--;
    }
    /* mark last node as end of a valid token. */
    node->ref_count++;
}

/** Free node and everything below it. A NULL node is accepted. */
static void token_node_free(token_node *node) {
    uint32_t i;

    if(node == NULL) return;
    /* free children nodes. */
    for(i = 0; i < node->count; i++) {
        token_node_free(node->children[i]);
    }
    free(node);
}

/**
 * Make sure *pnode has a child slot for byte c, enlarging the children list
 * at the front or the back as needed. New slots are set to NULL.
 * The node may move; *pnode is updated.
 * Returns 1 on success, 0 if memory ran out (node left unchanged).
 */
static int token_node_grow(token_node **pnode, uint8_t c) {
    token_node *node = *pnode;
    uint8_t old_min = node->min;
    uint16_t old_count = node->count;
    uint32_t i;
    uint8_t min;
    uint16_t count;

    if(old_count == 0) {
        /* the list was empty, so this is the first char. */
        old_min = c;
        min = c;
        count = 1;
    } else if(c < old_min) {
        /* extend at the front: c becomes the new smallest char. */
        min = c;
        count = (uint16_t)(old_count + (old_min - min));
    } else {
        uint8_t idx = (uint8_t)(c - old_min);
        if(idx < old_count) {
            /* don't need to grow. */
            return 1;
        }
        /* extend at the back. */
        min = old_min;
        count = (uint16_t)(idx + 1);
    }

    /* keep the old block reachable if realloc fails. */
    node = realloc(*pnode, NODE_SIZE(count));
    if(node == NULL) return 0;
    *pnode = node;

    /* if the 'min' value changed, then we need to move all the old slots up. */
    if(old_min != min) {
        uint32_t diff = (uint32_t)(old_min - min);
        for(i = count; i-- > diff; ) {
            node->children[i] = node->children[i - diff];
        }
        /* null new slots at start of children list. */
        for(i = 0; i < diff; i++) {
            node->children[i] = NULL;
        }
    } else {
        /* null new slots at end of children list. */
        for(i = old_count; i < count; i++) {
            node->children[i] = NULL;
        }
    }
    node->min = min;
    node->count = count;
    return 1;
}

/**
 * Follow token through the tree as far as existing slots allow.
 * On return *ptoken / *plen describe the part that was not consumed and the
 * result points at the slot where the walk stopped: either a node that has
 * no slot for the next char, or an empty (NULL) slot.
 */
static token_node **token_node_find_last_node(token_node **pnode, const uint8_t **ptoken, size_t *plen) {
    const uint8_t *token = *ptoken;
    size_t len = *plen;
    uint32_t c;
    token_node *node = *pnode;

    while(node && len) {
        /* next char. */
        c = (*token);
        /* if c < node->min, then it will underflow and be > node->count. */
        c -= node->min;
        /* make sure c is in range. */
        if(c >= node->count) {
            /*
             * NOTE: we don't consume this char and "*pnode" will not be null.
             * When adding tokens, this node will be grown to hold more children.
             */
            break;
        }
        /* consume char. */
        token++;
        len--;
        /* get pointer to next node's slot. */
        pnode = &(node->children[c]);
        node = *pnode;
    }
    *ptoken = token;
    *plen = len;
    /* return pointer to last node's slot. */
    return pnode;
}

/** Insert token[0..len) below *pnode. Nothing is added if memory runs out. */
static void token_node_add(token_node **pnode, const uint8_t *token, size_t len) {
    token_node *node;

    /* find last node in chain for this token. */
    pnode = token_node_find_last_node(pnode, &token, &len);
    node = *pnode;

    /* full token consumed and a node exists there: just count the reference. */
    if(len == 0 && node != NULL) {
        node->ref_count++;
        return;
    }

    /*
     * A non-NULL node here means the walk stopped because the node has no
     * slot for the next char (len > 0). A NULL node means the walk ended on
     * an empty slot, possibly with the whole token consumed; the chain built
     * below then fills that slot.
     */
    if(node != NULL) {
        uint8_t c = *token;
        /* consume char. */
        token++;
        len--;
        /* grow node to make room for new char. */
        if(!token_node_grow(pnode, c)) return;
        node = *pnode; /* token_node_grow() may change the node's pointer. */
        /* get slot for new child. */
        pnode = &(node->children[c - node->min]);
    }

    /* build node chain for un-consumed part of token. */
    token_node_build_chain(pnode, token, len);
}

/**
 * Return the length of the longest token that is a prefix of text[0..len),
 * or 0 if no token matches. A NULL node is accepted.
 */
static size_t token_node_longest_token(token_node *node, const uint8_t *text, size_t len) {
    size_t last_token_len = 0;
    size_t off = 0;
    uint32_t c;

    /* loop until we get a NULL node or run out of text. */
    while(node) {
        if(node->ref_count > 0) {
            /* found a token, keep track of its length. */
            last_token_len = off;
        }
        /* end of input text. */
        if(off >= len) break;
        /* next char. */
        c = text[off];
        /* if c < node->min, then it will underflow and be > node->count. */
        c -= node->min;
        /* make sure c is in range. */
        if(c >= node->count) {
            /* End of search, no more child nodes. */
            break;
        }
        /* consume char. */
        off++;
        /* get pointer to next node's slot. */
        node = node->children[c];
    }
    /* return length of largest token found. */
    return last_token_len;
}

/** Create an empty tree. Returns NULL if memory runs out. Free with token_tree_free(). */
extern token_tree *token_tree_new(void) {
    token_tree *tree = malloc(sizeof *tree);

    if(tree == NULL) return NULL;
    tree->root = token_node_new(0);
    if(tree->root == NULL) {
        free(tree);
        return NULL;
    }
    return tree;
}

/** Free the tree and all of its nodes. A NULL tree is accepted. */
extern void token_tree_free(token_tree *tree) {
    if(tree == NULL) return;
    token_node_free(tree->root);
    free(tree);
}

/** Add the first len bytes of token to the tree. */
extern void token_tree_add(token_tree *tree, const char *token, size_t len) {
    token_node_add(&(tree->root), (const uint8_t *)token, len);
}

/** Length of the longest token matching the start of text[0..len); 0 if none. */
extern size_t token_tree_longest_token(token_tree *tree, const char *text, size_t len) {
    return token_node_longest_token(tree->root, (const uint8_t *)text, len);
}
#ifdef TEST_TOKEN_TREE
#include <stdio.h>
#include <string.h>
static const char *test_tokens[] = {
"s",
"stack",
"stackoverflow",
"over",
"overflow",
NULL,
};
static const char *test_input[] = {
"aastackoverasdfasdf",
"stack7777",
"777stack777",
"overstackflow",
NULL,
};
/* Add every string of the NULL-terminated array tokens to tree. */
static void add_tokens(token_tree *tree, const char **tokens) {
    const char **cur;

    for(cur = tokens; *cur != NULL; cur++) {
        token_tree_add(tree, *cur, strlen(*cur));
    }
}
/*
 * Tokenize text greedily and print the result: each matched token as
 * <token>, and a '?' for every character where no token starts.
 */
static void print_tokens(token_tree *tree, const char *text) {
    size_t remaining = strlen(text);

    printf("input: \"%s\"\n", text);
    printf("tokens: [");
    while(remaining > 0) {
        size_t matched = token_tree_longest_token(tree, text, remaining);

        if(matched == 0) {
            /* no token starts here: report it and skip one character */
            printf("?");
            matched = 1;
        } else {
            printf("<%.*s>", (int)matched, text);
        }
        text += matched;
        remaining -= matched;
    }
    printf("]\n");
}
/* Print the tokenization of every string in the NULL-terminated array texts. */
static void run_test(token_tree *tree, const char **texts) {
    const char **cur = texts;

    while(*cur != NULL) {
        print_tokens(tree, *cur);
        cur++;
    }
}
/*
 * Test driver: load test_tokens into a new tree, then tokenize the sample
 * inputs followed by the token strings themselves.
 */
int main(int argc, char *argv[]) {
    token_tree *tree;

    tree = token_tree_new();
    add_tokens(tree, test_tokens);

    run_test(tree, test_input);
    run_test(tree, test_tokens);

    token_tree_free(tree);
}
#endif