Storing integers in c-implemented hash tables? - c

Currently I have a hash table implementation in C that uses strings as the keys and values. If I wanted to store integers instead of strings as the values, what would be the best way to do this? I'm thinking of storing the integer in a string and converting it to an integer when I need it but it seems inefficient for arithmetic. Something like
insert("money", "13");
int i = atoi(get("key1"));
int sum = i + 10;
insert("money", itoa(sum));
Is there a better way to do this?
EDIT: hash table implementation
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct tableentry /* hashtab entry */
{
struct tableentry *next;
char *key;
char *val;
} tableentry_t;
typedef struct hashtable
{
size_t size;
struct tableentry **tab;
} hashtable_t;
/* creates hashtable */
/* NOTE: dynamically allocated, remember to ht_free() */
hashtable_t *ht_create(size_t size)
{
hashtable_t *ht = NULL;
if ((ht = malloc(sizeof(hashtable_t))) == NULL)
return NULL;
/* allocate ht's table */
if ((ht->tab = malloc(sizeof(tableentry_t) * size)) == NULL)
return NULL;
/* null-initialize table */
int i;
for (i = 0; i < size; i++)
ht->tab[i] = NULL;
ht->size = size;
return ht;
}
/* creates hash for a hashtab */
static unsigned hash(hashtable_t *ht, char *s)
{
unsigned hashval;
for (hashval = 0; *s != '\0'; s++)
hashval = *s + 31 * hashval;
return hashval;
}
/* loops through linked list freeing */
static void te_free(tableentry_t *te)
{
tableentry_t *next;
while (te != NULL)
{
next = te->next;
free(te->key);
free(te->val);
free(te);
te = next;
}
}
/* creates a key-val pair */
static tableentry_t *new(char *k, char *v)
{
tableentry_t *te = NULL;
if ((te = calloc(1, sizeof(*te))) == NULL
|| (te->key = strdup(k)) == NULL
|| (te->val = strdup(v)) == NULL)
{
te_free(te);
return NULL;
}
te->next = NULL;
return te;
}
static tableentry_t *lookup(hashtable_t *ht, char *k)
{
tableentry_t *te;
/* step through linked list */
for (te = ht->tab[hash(ht, k) % ht->size]; te != NULL; te = te->next)
if (strcmp(te->key, k) == 0)
return te; /* found */
return NULL; /* not found */
}
/* inserts the key-val pair */
hashtable_t *ht_insert(hashtable_t *ht, char *k, char *v)
{
tableentry_t *te;
/* unique entry */
if ((te = lookup(ht, k)) == NULL)
{
te = new(k, v);
unsigned hashval = hash(ht, k) % ht->size;
/* insert at beginning of linked list */
te->next = ht->tab[hashval];
ht->tab[hashval] = te;
}
/* replace val of previous entry */
else
{
free(te->val);
if ((te->val = strdup(v)) == NULL)
return NULL;
}
return ht;
}
/* retrieve value from key */
char *ht_get(hashtable_t *ht, char *k)
{
tableentry_t *te;
if ((te = lookup(ht, k)) == NULL)
return NULL;
return te->val;
}
/* frees hashtable created from ht_create() */
void ht_free(hashtable_t *ht)
{
int i;
for (i = 0; i < ht->size; i++)
if (ht->tab[i] != NULL)
te_free(ht->tab[i]);
free(ht);
}
/* resizes hashtable, returns new hashtable and frees old*/
hashtable_t *ht_resize(hashtable_t *oht, size_t size)
{
hashtable_t *nht; /* new hashtable */
nht = ht_create(size);
/* rehash */
int i;
tableentry_t *te;
/* loop through hashtable */
for (i = 0; i < oht->size; i++)
/* loop through linked list */
for (te = oht->tab[i]; te != NULL; te = te->next)
if (ht_insert(nht, te->key, te->val) == NULL)
return NULL;
ht_free(oht);
return nht;
}

The access and manipulation functions associated with your hash table implementation assume that values have the form of null-terminated strings, and that their significance is carried entirely by their contents (not, for example, by the values of the pointers themselves). Among other things, this is evident from the fact that the new() and ht_insert() functions make copies of the provided values via strdup(). Therefore, if you intend to use those functions (not just the underlying data structures) then your only alternative for storing integers is to encode the integers into strings in some way, and store the strings. This is what you already came up with.
Note, by the way, that this presents a bit of an issue if you want to be able to store both strings and integers in the same hash table. The table entries do not provide any way to record data type metadata, so to avoid collisions between string and number representations, you would need to encode data types into the values you store -- not only for the integers, but for the strings, too. For example, you might encode values into strings whose first character communicates the data type. Thus, perhaps "S12345" represents the string "12345", whereas "I12345" represents the integer 12345. But you don't need such tricks if you assume all the values are of uniform type, on a table-by-table basis.
You would have more options if you were open to writing at least a partial set of alternative hash table functions for storing integers in the existing data structures. For example, you might use the fact that pointers and integers can be converted back and forth (with implementation-defined results). But I interpret you to have rejected such approaches, as using alternative functions is effectively the same thing as modifying the implementation.

Related

Hashmap implementation problem in C with void pointer as value

Hi I am attempting to implement a really simple hashmap in regular C with a string as key and a void pointer as value as I wish to use the map for multiple data types.
So far I have this
struct node{
void * value;
char * key;
};
unsigned long strhash(char *string)
{
unsigned long hash = 5381;
int c;
while ((c = *string++))
{
hash = ((hash << 5) + hash) + c;
}
return hash;
}
map_t *map_create(int maxSize){
map_t *map = malloc(sizeof(map_t));
map->curSize = 0;
map->maxSize = maxSize;
map->nodes = calloc(map->maxSize, sizeof(node_t *));
return map;
}
node_t *node_create(char *key, void *value){
node_t *node = malloc(sizeof(node_t));
node->key = key;
node->value = value;
return node;
}
void map_insert(map_t *map, char *key, void *value){
node_t *node = node_create(key, value);
int idx = strhash(key) % map->maxSize;
if(map->nodes[idx] == NULL){
map->nodes[idx] = node;
}else{
while(map->nodes[idx] != NULL){
idx++%map->maxSize;
}
map->nodes[idx] = node;
}
return;
}
void map_print(map_t *map){
for(int i = 0; i < map->maxSize; i++){
if(map->nodes[i] != NULL){
printf("index: %d\t value: %d\n",i, *(int*)map->nodes[i]->value);
}
}
return;
}
void map_destroy(map_t *map){
for(int i = 0; i < map->maxSize; i++){
if(map->nodes[i] != NULL){
free(map->nodes[i]);
}
}
free(map->nodes);
free(map);
return;
}
int main(){
map_t *map = map_create(32);
for(int i = 0; i < 30; i++){
map_insert(map, (char*)&i, &i);
}
map_print(map);
map_destroy(map);
return 0;
}
The problem is the output is not as I'd expect when the map gets printed all that is retrieved is the value "30" on all indexes which is the last number inserted into the map. If I change the value to type int the map works as expected, so is there must be something crucial I am missing in regards to pointers.
I am not the greatest at C so any light which could be shed on this would be most appreciated.
The problem is that you're using the same pointer every time you call map_insert(). It just stores the pointer, it doesn't copy the data. Each time through the loop you change the contents of that memory, so all the hash map elements point to that same value.
There are two ways you can fix it. One way is to always make a dynamically-allocated copy of the data before calling map_insert():
for (int i = 0; i < 30; i++) {
int *i_copy = malloc(sizeof *i_copy);
*i_copy = i;
map_insert(map, (char *)i_copy, (char *)i_copy);
}
The other option is to add the size of the value to the map_insert() and node_create() arguments. Then node_create call malloc() and memcpy() to copy the value to dynamic memory.
BTW, there's another problem. The key is supposed to be a null-terminated string (strhash() depends on this), but you're using &i, which is a pointer to an integer. Casting a pointer to an integer to char* doesn't return a string, it just returns a pointer to the same location with a different data type. I haven't fixed this above.
OP stores a reference to the same value, so of course all lookups yield the same value (which is not even a string, but whatever the storage representation of the value of the variable i happens to be).
I prefer chaining the hash map entries, and keeping a copy of the hash in the entry:
struct entry {
struct entry *next;
size_t hash;
void *data;
size_t data_size;
int data_type;
unsigned char name[];
};
typedef struct {
size_t size;
size_t used; /* Number of entries, total */
struct entry **slot; /* Array of entry pointers */
size_t (*hash)(const unsigned char *, size_t);
} hashmap;
int hashmap_new(hashmap *hmap, const size_t size,
size_t (*hash)(const unsigned char *, size_t))
{
if (!hmap)
return -1; /* No hashmap specified */
hmap->size = 0;
hmap->used = 0;
hmap->slot = NULL;
hmap->hash = NULL;
if (size < 1)
return -1; /* Invalid size */
if (!hash)
return -1; /* No hash function specified. */
hmap->slot = calloc(size, sizeof hmap->slot[0]);
if (!hmap->slot)
return -1; /* Not enough memory */
hmap->size = size;
hmap->hash = hash;
return 0;
}
void hashmap_free(hashmap *hmap)
{
if (hmap) {
size_t i = hmap->size;
while (i-->0) {
struct entry *next = hmap->slot[i];
struct entry *curr;
while (next) {
curr = next;
next = next->next;
free(curr->data);
/* Poison the entry, to help detect use-after-free bugs. */
curr->next = NULL;
curr->data = NULL;
curr->hash = 0;
curr->data_size = 0;
curr->data_type = 0;
curr->name[0] = '\0';
free(curr);
}
}
}
free(hmap->slot);
hmap->size = 0;
hmap->used = 0;
hmap->slot = NULL;
hmap->hash = NULL;
}
To insert a key-value pair, the function either uses the data specified as-is, in which case it's the caller's responsibility to ensure each key has their own unique data not overwritten later; or we copy the user data. In the above hashmap_free() function, you'll see free(curr->data);; it assumes we allocated memory dynamically, and copied the user data there. So:
int hashmap_add(hashmap *hmap, const unsigned char *name,
const void *data, const size_t data_size,
const int data_type)
{
const size_t namelen = (name) ? strlen(name) : 0;
struct entry *curr;
size_t i;
if (!hmap)
return -1; /* No hashmap specified. */
if (name_len < 1)
return -1; /* NULL or empty name. */
/* Allocate memory for the hashmap entry,
including enough room for the name, and end of string '\0'. */
curr = malloc(sizeof (struct entry) + namelen + 1;
if (!curr)
return -1; /* Out of memory. */
/* Copy data, if any. */
if (data_size > 0) {
curr->data = malloc(data_size);
if (!curr->data) {
free(curr);
return -1; /* Out of memory. */
}
memcpy(curr->data, data, data_size);
} else {
curr->data = NULL;
curr->data_size = 0;
}
curr->data_type = data_type;
/* Calculate the hash of the name. */
curr->hash = hmap->hash(name, namelen);
/* Copy name, including the trailing '\0'. */
memcpy(curr->name, name, namelen + 1);
/* Slot to prepend to. */
i = curr->hash % hmap->size;
curr->next = hmap->slot[i];
hmap->slot[i] = curr;
/* An additional node added. */
hmap->used++;
return 0;
}
The meaning of data_type is completely up to the user of the code.
Lookup can be made based on the hash and the data type:
/* Returns 0 if found. */
int hashmap_find(hashmap *hmap, const unsigned char *name,
const int data_type,
void **dataptr_to, size_t *size_to)
{
struct entry *curr;
size_t hash;
if (size_to)
*size_to = 0;
if (dataptr_to)
*dataptr_to = NULL;
if (!hmap)
return -1; /* No hashmap specified. */
if (!name || !*name)
return -1; /* NULL or empty name. */
hash = hmap->hash(name, strlen(name));
curr = hmap->slot[hash % hmap->size];
for (curr = hmap->slot[hash % hmap->size]; curr != NULL; curr = curr->next) {
if (curr->data_type == data_type && curr->hash == hash &&
!strcmp(curr->name, name)) {
/* Data type an name matches. Save size if requested. */
if (size_to)
*size_to = curr->data_size;
if (dataptr_to)
*dataptr_to = curr->data;
return 0; /* Found. */
}
}
return -1; /* Not found. */
}
The above lookup returns 0 if found, and nonzero if error or not found. (This way, even zero-size NULL data can be stored in the hash map.)
If the number of data types supported is small, say 32, then using an unsigned int with each bit (1U<<0 == 1, 1U<<1 == 2, 1U<<2 == 4, and so on) reserved for a specific type, you can do the lookup using a mask, allowing only the specified types. Similarly, the data_type can be a mask, describing which types the value can be interpreted as (almost always will have just one bit set).
This scheme also allows one to dynamically resize the hashmap, by allocating a new slot array of pointers, and moving each old entry to the new one. The keys don't need to be rehashed, because the original hash is stored in each entry. For lookup efficiency, the chains (hanging off each slot) should be as short as possible. A common "rule of thumb" is that hashmap->size should be between hashmap->used and 2 * hashmap->used.
When you call map_insert(map, (char*)&i, &i); the value inserted into hasmap is the pointer to i variable, i.e. its address in memory, and not the value of i.
So when you change i value inside the for loop there is the side-effect to all entries into the hashmap, and at the end of the loop you only see the last value assigned.

Returning an array of structs from a recursive huffman tree C

i have a task in class to the return an array of struck Symbol from huffman tree.
the function getSL get a huffman tree(only) and return struck of Symbol.
each spot in the array contain a char from the "leaf" of the tree and the
length of his code(how many cross section till the leaf).
my main problem was to find how i advance the cnt of the arry that it will not overright the arry.
thank you.
typedef struct HNode {
char chr;
struct HNode *left, *right;
} HNode;
typedef struct {
char chr;
int counter;
}Symbol;
this is what i did till now.
Symbol * getSL(HNode *root) {
if (root->left == NULL && root->right == NULL) {
Symbol* b = (Symbol*)malloc(100);
b->counter=0;
b->chr = root->chr;
return b;
}
Symbol* a = (Symbol*)malloc(100);
if (root->left != NULL) {
a= getSL(root->left);
a->counter++;
}
if (root->right != NULL) {
a= getSL(root->right);
a->counter++;
}
return a;
}
Apart from the malloc problem (see the comments already), you have a fundamental problem: You allocate a new struct, but then replace it with the one returned from the recursive call. So you lose the one created before (actually, memory leaking!).
Easiest variant would now be converting your Symbol to linked list nodes; then you simply could do:
Symbol* lastLeafFound; // probaly a function parameter!
if(!(root->left || root->right))
{
// leaf found:
Symbol* a = (Symbol*)malloc(sizeof(Symbol));
a->chr = root->chr;
a->counter = /* ... */;
a->next = NULL;
lastLeafFound->next = a;
// you might return a now as last leaf found, using it in the next recursive call
}
Sure, above code is incomplete, but should give you the idea...
If you cannot modify your struct, then you need to create an array and pass it on to every new recursive call (prefer not to use global variables instead):
void doGetSL
(
HNode* root,
Symbol** symbols, // your array to be used
unsigned int* count, // number of symbols contained so far
unsigned int* capacity // maximum possible symbols
)
Passing all data as pointers allows the function to modify them as needed and they are still available from outside...
Symbol* getSL(HNode* root)
{
if(!root)
return NULL;
unsigned int count = 0;
unsigned int capacity = 128;
// allocate a whole array:
Symbol* array = malloc(capacity*sizeof(Symbol));
if(array) // malloc could fail...
{
doGetSL(root, &array, &count, &capacity);
// as you cannot return the number of leaves together with
// the array itself, you will need a sentinel:
array[count].chr = 0;
// obvious enough, I'd say, alternatively you could
// set counter to 0 or -1 (or set both chr and counter)
}
return array;
}
doGetSL will now use above set up "infrastructure":
{
if(!(root->left || root->right))
{
if(*count == *capacity)
{
// no memory left -> we need a larger array!
// store in separate variables:
unsigned int c = *capacity * 2;
Symbol* s = realloc(symbols, c * sizeof(Symbol));
// now we can check, if reallocation was successful
// (on failure, s will be NULL!!!):
if(s)
{
// OK, we can use them...
*symbols = s; // <- need a pointer for (pointer to pointer)!
*capacity = c;
}
else
{
// re-allocation failed!
// -> need appropriate error handling!
}
}
(*symbols)[count].chr = root->chr;
(*symbols)[count].counter = /*...*/;
++*count;
}
else
{
if(root->left)
{
doGetSL(root->left, symbols, count, capacity);
}
if(root->right)
{
doGetSL(root->right, symbols, count, capacity);
}
}
}
One thing yet omitted: setting the counter. That would be quite easy: add another parameter to doGetSL indicating the current depth, which you increment right when entering doGetSL, you can then just assign this value when needed.
You can further improve above variant (especially readability), if you introduce a new struct:
struct SLData
{
Symbol* symbols, // your array to be used
unsigned int count, // number of symbols contained so far
unsigned int capacity // maximum possible symbols
};
and pass this one instead of the three pointers:
doGetSL(HNode*, struct SLData*, unsigned int depth);
struct SLData data =
{
.count = 0;
.capacity = 128;
.array = malloc(capacity*sizeof(Symbol));
};
if(data.array)
doGetSL(root, &data, 0); // again passed as pointer!

Insert function of Hashtable in C

So, I have the functions. How can I insert numbers in the Hashtable? A for that goes until the size of the table? I don't know what goes inside the for, if it is exists.
#include <stdio.h>
//Structure
typedef struct Element {
int key;
int value;
} Element;
typedef struct HashTable {
Element *table[11];
} HashTable;
//Create an empty Hash
HashTable* createHashTable() {
HashTable *Raking = malloc(sizeof(HashTable));
int i;
for (i = 0; i < 11; i++) {
Raking->table[i] = NULL;
}
return Raking;
}
//Insert element
void insertElement(HashTable *Raking, int key, int value) {
int h = hashFunction(key);
while(Raking->table[h] != NULL) {
if(Raking->table[h]->key == key) {
Raking->table[h]->value = value;
break;
}
h = (h + 1) % 11;
}
if(Raking->table[h] == NULL) {
Element *newElement = (Element*) malloc(sizeof(Element));
newElement->key = key;
newElement->value = value;
Raking->table[h] = newElement;
}
}
int main() {
HashTable * Ranking = createHashTable();
/** ??? **/
}
Could someone explain to me how to write my main function with these structures? In this case I'm fixing the number of elements in this table, right? (table [11]) What could I do for the user to determine the size of the hash table? is it possible? Or should I set the size?
I've added comments and changes to your code that I feel will be of use to you. I've also adapted it so that size is not hardcoded. Finally I free all the malloc-ed statements.
This compiles without errors and I've tested it for memory leaks and other errors using valgrind and found no complaints.
Let me know if something is not clear and the comments fail to explain it. I've tried to stick to your code as much as possible but I've not had a chance to test the functionality properly.
#include <stdio.h>
#include <stdlib.h>
//Structure
typedef struct Element {
int key;
int value;
} Element; /* you had a syntax error here */
typedef struct HashTable {
int size; /* we will need the size for the traversal */
Element *table; /* leave it as a pointer */
} HashTable; /* a syntax error here too */
HashTable* createHashTable(int size) {
HashTable *Ranking = malloc(sizeof(HashTable));
/* set the pointer to point to a dynamic array of size 'size' */
/* this way you don't have to hardcode the size */
Ranking->table = malloc(sizeof(Element) * size);
Ranking->size = size;
/* initialisation is a bit different because we don't have pointers here */
/* only table is a pointer, not its elements */
int i;
for (i = 0; i < size; i++) {
Ranking->table[i].key = 0;
Ranking->table[i].value = 0;
}
return Ranking;
}
/* I implemented a fake hashFunction just to test the code */
/* all it does is make sure the key does not exceed the size of the table */
int hashFunction(int key, int size)
{
return (key % size);
}
//Insert element
void insertElement(HashTable *Ranking, int key, int value) {
int h = hashFunction(key, Ranking->size);
int i = 0;
/* if hash is full and key doesn't exist your previous loop would have gone on forever, I've added a check */
/* also notice that I check if table[h] has empty key, not if it's null as this is not a pointer */
while(Ranking->table[h].key != 0 && (i < Ranking->size)) {
if(Ranking->table[h].key == key) {
Ranking->table[h].value = value;
return; /* break is intended to quit the loop, but actually we want to exit the function altogether */
}
h = (h + 1) % Ranking->size; /* changed 11 to the size specified */
i++; /* advance the loop index */
}
/* okay found a free slot, store it there */
if(Ranking->table[h].key == 0) {
/* we now do direct assignment, no need for pointers */
Ranking->table[h].key = key;
Ranking->table[h].value = value;
}
}
int main() {
int size = 0;
scanf(" %d", &size);
HashTable *Ranking = createHashTable(size);
insertElement(Ranking, 113, 10); /* this is just a test, 113 will be hashed to be less than size */
/* we free everything we have malloc'ed */
free(Ranking->table);
free(Ranking);
return 0;
}

Markov Clean Function Trouble

I am trying to write a function to clean up the hash table that is generated by this code
/*
* Markov chain random text generator.
*/
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "eprintf.h"
enum {
NPREF = 2, /* number of prefix words */
NHASH = 4093, /* size of state hash table array */
MAXGEN = 10000 /* maximum words generated */
};
typedef struct State State;
typedef struct Suffix Suffix;
struct State { /* prefix + suffix list */
char* pref[NPREF]; /* prefix words */
Suffix* suf; /* list of suffixes */
State* next; /* next in hash table */
};
struct Suffix { /* list of suffixes */
char* word; /* suffix */
Suffix* next; /* next in list of suffixes */
};
State* lookup(char *prefix[], int create);
void build(char *prefix[], FILE*);
void generate(int nwords);
void add(char *prefix[], char *word);
State* statetab[NHASH]; /* hash table of states */
char NONWORD[] = "\n"; /* cannot appear as real word */
/* markov main: markov-chain random text generation */
int main(void)
{
int i, nwords = MAXGEN;
char *prefix[NPREF]; /* current input prefix */
int c;
long seed;
setProgName("markov");
seed = time(NULL);
srand(seed);
for (i = 0; i < NPREF; i++) /* set up initial prefix */
prefix[i] = NONWORD;
build(prefix, stdin);
add(prefix, NONWORD);
generate(nwords);
return 0;
}
const int MULTIPLIER = 31; /* for hash() */
/* hash: compute hash value for array of NPREF strings */
unsigned int hash(char* s[NPREF])
{
unsigned int h;
unsigned char *p;
int i;
h = 0;
for (i = 0; i < NPREF; i++)
for (p = (unsigned char *) s[i]; *p != '\0'; p++)
h = MULTIPLIER * h + *p;
return h % NHASH;
}
/* lookup: search for prefix; create if requested. */
/* returns pointer if present or created; NULL if not. */
/* creation doesn't strdup so strings mustn't change later. */
State* lookup(char *prefix[NPREF], int create)
{
int i, h;
State *sp;
h = hash(prefix);
for (sp = statetab[h]; sp != NULL; sp = sp->next) {
for (i = 0; i < NPREF; i++)
if (strcmp(prefix[i], sp->pref[i]) != 0)
break;
if (i == NPREF) /* found it */
return sp;
}
if (create) {
sp = (State *) emalloc(sizeof(State));
for (i = 0; i < NPREF; i++)
sp->pref[i] = prefix[i];
sp->suf = NULL;
sp->next = statetab[h];
statetab[h] = sp;
}
return sp;
}
/* addsuffix: add to state. suffix must not change later */
void addsuffix(State *sp, char *suffix)
{
Suffix *suf;
suf = (Suffix *) emalloc(sizeof(Suffix));
suf->word = suffix;
suf->next = sp->suf;
sp->suf = suf;
}
/* add: add word to suffix list, update prefix */
void add(char *prefix[NPREF], char *suffix)
{
State *sp;
sp = lookup(prefix, 1); /* create if not found */
addsuffix(sp, suffix);
/* move the words down the prefix */
memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
prefix[NPREF-1] = suffix;
}
/* build: read input, build prefix table */
void build(char *prefix[NPREF], FILE *f)
{
char buf[100], fmt[10];
/* create a format string; %s could overflow buf */
sprintf(fmt, "%%%ds", sizeof(buf)-1);
while (fscanf(f, fmt, buf) != EOF)
add(prefix, estrdup(buf));
}
/* generate: produce output, one word per line */
void generate(int nwords)
{
State *sp;
Suffix *suf;
char *prefix[NPREF], *w;
int i, nmatch;
for (i = 0; i < NPREF; i++) /* reset initial prefix */
prefix[i] = NONWORD;
for (i = 0; i < nwords; i++) {
sp = lookup(prefix, 0);
if (sp == NULL)
eprintf("internal error: lookup failed");
nmatch = 0;
for (suf = sp->suf; suf != NULL; suf = suf->next)
if (rand() % ++nmatch == 0) /* prob = 1/nmatch */
w = suf->word;
if (nmatch == 0)
eprintf("internal error: no suffix %d %s", i, prefix[0]);
if (strcmp(w, NONWORD) == 0)
break;
printf("%s\n", w);
memmove(prefix, prefix+1, (NPREF-1)*sizeof(prefix[0]));
prefix[NPREF-1] = w;
}
}
Here is what I have so far for my clean function
/*Clean Function*/
void clean_up(State *sp)
{
State *temp;
Suffix *temp2, temp3;
for(int h = 0; h < NHASH; h++)
{
for (sp = statetab[h]; sp != NULL; sp = sp->next)
{
while(sp->suf != NULL)
{
temp2= sp->suf;
temp3= *temp2->next;
free(temp2);
sp->suf= &temp3;
}
}
}
}
I think im on the right track, I'm going through each index in the hash table, then going from state to state and freeing the suffixes. I'm not sure what to do about the prefixes, because I have to free them before I can free each state. Any help would be greatly appreciated.
In your code, you are copying into a temp3 node, which lives in automatic memory ("on the stack") pointing sp->suf to this memory will (on the next iteration of the loop) cause free to be called with the address of this object (which has not been obtained by malloc, and thus cannot be freed by free() )
void clean_up(State *sp)
{
State *temp;
Suffix *temp2, **pp;
for(int h = 0; h < NHASH; h++)
{
for (sp = statetab[h]; sp != NULL; sp = sp->next)
{
for (pp = &sp->suf; *pp; *pp = temp2)
{
temp2 = (*pp)->next;
free(*pp);
}
}
}
}
The example code is derived from the Markov program in The Practice of Programming by Kernighan and Pike, a most excellent book.
Given that you are trying to clean up the statetab, the main clean-up function doesn't need any argument. You do have to be careful not to free the states directly in statetab, but you do need to release auxilliary states chained off statetab[i].next.
typedef struct State State;
typedef struct Suffix Suffix;
struct State { /* prefix + suffix list */
char* pref[NPREF]; /* prefix words */
Suffix* suf; /* list of suffixes */
State* next; /* next in hash table */
};
struct Suffix { /* list of suffixes */
char* word; /* suffix */
Suffix* next; /* next in list of suffixes */
};
State* statetab[NHASH]; /* hash table of states */
static void free_state(State *state);
static void free_suffix(Suffix *suffix);
static void cleanup(void)
{
for (int i = 0; i < NHASH; i++)
free_state(statetab[i]);
}
static void free_state(State *state)
{
if (state != 0)
{
for (int i = 0; i < NPREF; i++)
free(state->pref[i]);
free_suffix(state->suf);
if (state->next != 0)
{
free_state(state->next);
free(state->next);
}
}
}
static void free_suffix(Suffix *suffix)
{
if (suffix != 0)
{
free(suffix->word);
free_suffix(suffix->next);
free(suffix);
}
}
Do you see how I've designed the free_xxxx() code based on the design of the xxxx structure?
Caveat Lector: uncompiled code, much less tested code.
I dug up the code from the TPOP site, and tried to apply it. I made some fixes to the freeing code above (syntax error fixed, the null checks in free_state() and free_suffix()), but the code as a whole was not designed to allow the data to be freed.
There are a couple of problems. First, a few of the prefixes are not allocated (NONWORD). It might be possible to avoid releasing those by testing whether a prefix is NONWORD, but that's nasty. It might be possible to allocate those prefixes too (replace NONWORD by estrdup(NONWORD)). I think there's another place, somewhere, that a non-allocated pointer is being stashed in a prefix in the state table; I'm getting crashes in malloc() complaining of 'freeing non-allocated memory' (which is distinct from 'double freeing allocated memory', I believe), but I've not managed to resolve that.
However, that then changes to another problem; the prefixes are reused. That is, almost every prefix in the system is used as the the second word of one prefix, then as the first word of the next prefix. Thus, you can't readily free the prefixes.
If you were to design this so that the memory could be released, then you'd probably design it so that there was a system of 'atoms' (immutable strings) such that each word was allocated once and reused as often as necessary (see C Interfaces and Implementations: Techniques for Creating Reusable Code by D Hanson for the source of the term). The code freeing the state table would then concentrate only on the non-word data. There'd be code to release the complete set of atoms as well.
I ran the Markov program under valgrind without the cleanup; there are no memory access problems and no leaked data; it is all still accessible at program exit. I was using a data file of about 15,000 words (and about 2900 distinct words), and the statistics were:
==9610== HEAP SUMMARY:
==9610== in use at exit: 695,269 bytes in 39,567 blocks
==9610== total heap usage: 39,567 allocs, 0 frees, 695,269 bytes allocated
==9610==
==9610== LEAK SUMMARY:
==9610== definitely lost: 0 bytes in 0 blocks
==9610== indirectly lost: 0 bytes in 0 blocks
==9610== possibly lost: 0 bytes in 0 blocks
==9610== still reachable: 695,269 bytes in 39,567 blocks
So, you set yourself an interesting exercise. However, I think it is not achievable without reworking some of the memory allocation mechanism so that the data can be freed cleanly.
(On BSD, and hence on Mac OS X too, there are a pair of functions in <stdlib.h> called setprogname() and getprogname(). On BSD, setprogname() is called automatically before the main() gets going (with argv[0], I believe). The declaration in eprintf.h conflicts with the declaration in <stdlib.h>, which may be why the code in the question uses setProgName() instead of the original setprogname(). I chose to fix setprogname() in eprintf.h so that it took a const char * argument and therefore matched the declaration in <stdlib.h>.)
TPOP was previously at
http://plan9.bell-labs.com/cm/cs/tpop and
http://cm.bell-labs.com/cm/cs/tpop but both are now (2015-08-10) broken.
See also Wikipedia on TPOP.

Efficient way to tokenize a string - C

I am trying to tokenize a string. I have a table of available tokens ordered in the form of a trie. Each token knows it has children. A simple tokens table will look like,
pattern value has_children
-------- ------ --------
s s-val 1
stack stack-val 0
over over-val 1
overflow overflow-val 0
In this table, stack is a child of s and overflow is a child of over. In practice, this table will have 5000+ records ordered in this way.
Now, given a string stackover, it should output stack-valover-val. Algorithm is greedy and it will try to find the longest match always.
To do this, I will start reading each character from the input, look for match, if a match found and the token has children, look for match again by including next character. Do this until we find the longest match. If no match found, try to match by including the next character until we reach the end of string or a successful match.
If we reached end of the string without a match, output ? symbol and remove the first character from the input. Repeat the whole process with remaining characters.
This algorithm works, but the backtracking and iterating on all possible combinations of the input makes it slow and complex.
I am wondering is there a better way of solving this? Any help would be appreciated.
Instead of backtracking you could keep in memory all possible results, until one result singles out at certain point in input stream. Example
Tokens: S STACK STACKOVERFLOW STAG OVER OVERFLOW
String: SSTACKOVERFUN
1 - Found S on place 0, have tokens that begin with S, try them all, only S is valid, so resolve S
2 - S on 1, have such tokens, try them, possible valid are S and STACK. Don't resolve, just keep them in mind.
3 - T on 2, have no such tokens, so S could be resolved now, but we also have longer token (STACK) so S is no good. Ditch S, and STACK is only left, but it has children. Try string for children. There are no possible children so resolve STACK
4 - O on 6, have such tokens, try them, have only OVER, so resolve OVER
5 - F on 10, no such tokens, and nothing to resolve from before so this is non-tokenizable
6 and 7 - same as step 5
Final result: S STACK OVER fun
Could you use the Aho-Corasick algorithm? It creates an automaton to search a keyword tree (trie).
I'm thinking that you want to take all of your keywords and sort them reverse alphabetically, so your list would become (plus a few extras)
0 stack 1
1 s 0
2 overflow 3
3 over 5
4 ovum 5
5 o 0
6 exchange 7
7 ex 0
The third column of this list are pointers to the parent token which is always lower on the list. Then you can take your target string and binary search where it fits on this list. If it lands above a token which matches then you clip off that portion and repeat the process for the remainder. If it doesn't match you use the parent pointer to find the next longest potential matching token.
If you want to get really fancy you can also chunk up the strings into 64bit words and compare 8 characters at once in the binary search.
I suggest you try Ragel, It can generate efficient scanners that can do longest match/backtracking. See chapter 6.3 in the Ragel user guide for more information.
I've created a tiny test which I think matches your specification, this is only the state machine description, without the code to feed input:
%%{
machine test;
main := |*
's' => { puts("s-val");};
'stack' => { puts("stack-val");};
'over' => { puts("over-val");};
'overflow' => { puts("overflow-val");};
# Anything else matches to any, outputs a '?' and continues
any => {putc('?');};
*|;
}%%
The following token_tree code is based on the prefix_tree class from ZeroMQ
The prefix_tree class only returns "true" when one of the tree's prefixes matches the start of the input text. It will not even tell you which prefix or how long that prefix was.
This token_tree will look for the longest token that matches the start of the input text. The search
function token_tree_longest_token() only needs to return the length of the longest token matched
against the start of the input text.
The basic algorithm is similar to the one described in the question, but it's implmentation might be faster.
Also there are some ways to improve memory usage, which could have it faster.
#include <stdint.h>
#include <stdlib.h>
/* #define TEST_TOKEN_TREE */
/*
* TODO: possible improvements, use multiple types of nodes: string/branch/leaf.
* The string node would replace a chain of normal token_nodes and save memory.
* This would require spliting a node to add branch points.
* Use these structs:
* struct token_node {
* uint32_t ref_count;
* uint8_t node_type; -- node is token_node_str/token_node_branch/token_node_leaf
* };
* struct token_node_str {
* token_node base;
* uint8_t reserved;
* uint16_t len; -- string length
* token_node *child; -- string nodes can only have one child.
* uint8_t str[0]; -- embedded string (not null-terminated)
* };
* struct token_node_branch {
* token_node base;
* uint8_t min; -- smallest char in child list.
* uint16_t count; -- child count.
* token_node *children[0];
* };
* struct token_node_leaf { -- leaf nodes have no children.
* token_node base;
* };
* This will save memory, but will make code much more complex.
*/
typedef struct token_tree token_tree;
typedef struct token_node token_node;
struct token_tree {
token_node *root; /**< root node of token tree. */
};
struct token_node {
uint32_t ref_count; /**< how many token references end at this node. */
uint8_t min; /**< smallest 'char' in children's list. */
uint8_t reserved; /**< padding. */
uint16_t count; /**< number of children. (max count = 256, so count must be 16bits) */
token_node *children[0]; /**< list of children nodes. index by (c - min) */
};
#define NODE_SIZE(count) (sizeof(token_node) + (sizeof(token_node *) * count))
static token_node *token_node_new(uint16_t count) {
token_node *node = calloc(1, NODE_SIZE(count));
node->count = count;
return node;
}
static void token_node_build_chain(token_node **pnode, const uint8_t *token, size_t len) {
token_node *node;
do {
/* the last node in the chain will have no children. */
node = token_node_new((len == 0) ? 0 : 1);
*pnode = node; /* add node to slot in parent's children list. */
if(len == 0) break;
/* new node will have one child. */
node->min = *token;
node->count = 1;
/* slot where next node will be saved. */
pnode = &(node->children[0]);
/* consume char. */
token++;
len--;
} while(1);
/* mark last node as end of a valid token. */
node->ref_count++;
}
static void token_node_free(token_node *node) {
uint32_t i;
uint32_t count = node->count;
/* free children nodes. */
for(i=0; i < count; i++) {
if(node->children[i]) token_node_free(node->children[i]);
}
free(node);
}
static void token_node_grow(token_node **pnode, uint8_t c) {
token_node *node = *pnode;
token_node **children;
uint8_t old_min = node->min;
uint16_t old_count = node->count;
uint32_t i;
uint8_t min;
uint16_t count;
if(c < old_min) {
min = c;
count = old_count + (old_min - min);
} else {
if(old_count == 0) {
/* the list was empty, so this is the first char. */
old_min = c;
}
min = old_min;
c -= old_min;
if(c < old_count) {
/* don't need to grow. */
return;
}
count = c + 1;
}
node = realloc(node, NODE_SIZE(count));
*pnode = node;
children = node->children;
/* if the 'min' value changed, then we need to move all the old slots up. */
if(old_min != min) {
uint32_t diff = old_min - min;
for(i=count-1; i >= diff; i--) {
children[i] = children[i - diff];
}
/* null new slots at start of children list. */
for(i=0; i < diff; i++) {
children[i] = NULL;
}
} else {
/* null new slots at end of children list. */
for(i=old_count; i < count; i++) {
children[i] = NULL;
}
}
node->min = min;
node->count = count;
}
static token_node **token_node_find_last_node(token_node **pnode, const uint8_t **ptoken, size_t *plen) {
const uint8_t *token = *ptoken;
size_t len = *plen;
uint32_t c;
token_node *node = *pnode;
while(node && len) {
/* next char. */
c = (*token);
/* if c < node->min, then it will underflow and be > node->count. */
c -= node->min;
/* make sure c is in range. */
if(c >= node->count) {
/*
* NOTE: we don't consume this char and "*pnode" will not be null.
* When adding tokens, this node will be grown to hold more children.
*/
break;
}
/* consume char. */
token++;
len--;
/* get pointer to next node's slot. */
pnode = &(node->children[c]);
node = *pnode;
}
*ptoken = token;
*plen = len;
/* return pointer to last node's slot. */
return pnode;
}
static void token_node_add(token_node **pnode, const uint8_t *token, size_t len) {
token_node *node;
/* find last node in chain for this token. */
pnode = token_node_find_last_node(pnode, &token, &len);
/* if full token was consumed then we found the last node for this token. */
if(!len) {
node = *pnode;
node->ref_count++;
return;
}
/* check if the children list of the last node needs to be grown. */
node = *pnode;
if(node) {
uint32_t c = *token;
/* consume char. */
token++;
len--;
/* grow node to make room for new char. */
token_node_grow(pnode, c);
node = *pnode; /* token_node_grow() may change the node's pointer. */
/* get slot for new child. */
pnode = &(node->children[c - node->min]);
}
/* build node chain for un-consumed part of token. */
token_node_build_chain(pnode, token, len);
}
static size_t token_node_longest_token(token_node *node, const uint8_t *text, size_t len) {
size_t last_token_len = 0;
size_t off = 0;
uint32_t c;
/* loop until we get a NULL node or run out of text. */
do {
if(node->ref_count > 0) {
/* found a token, keep track of it's length. */
last_token_len = off;
}
/* end of input text. */
if(off >= len) break;
/* next char. */
c = text[off];
/* if c < node->min, then it will underflow and be > node->count. */
c -= node->min;
/* make sure c is in range. */
if(c >= node->count) {
/* End of search, no more child nodes. */
break;
}
/* consume char. */
off++;
/* get pointer to next node's slot. */
node = node->children[c];
} while(node);
/* return length of largest token found. */
return last_token_len;
}
extern token_tree *token_tree_new() {
token_tree *tree = malloc(sizeof(token_tree));
tree->root = token_node_new(0);
return tree;
}
extern void token_tree_free(token_tree *tree) {
token_node_free(tree->root);
free(tree);
}
extern void token_tree_add(token_tree *tree, const char *token, size_t len) {
token_node_add(&(tree->root), token, len);
}
extern size_t token_tree_longest_token(token_tree *tree, const char *text, size_t len) {
return token_node_longest_token(tree->root, text, len);
}
#ifdef TEST_TOKEN_TREE
#include <stdio.h>
#include <string.h>
static const char *test_tokens[] = {
"s",
"stack",
"stackoverflow",
"over",
"overflow",
NULL,
};
static const char *test_input[] = {
"aastackoverasdfasdf",
"stack7777",
"777stack777",
"overstackflow",
NULL,
};
static void add_tokens(token_tree *tree, const char **tokens) {
int i;
for(i = 0; tokens[i] != NULL; i++) {
token_tree_add(tree, tokens[i], strlen(tokens[i]));
}
}
static void print_tokens(token_tree *tree, const char *text) {
size_t len = strlen(text);
size_t token_len;
printf("input: \"%s\"\n", text);
printf("tokens: [");
while(len) {
token_len = token_tree_longest_token(tree, text, len);
if(token_len > 0) {
printf("<%.*s>", (int)token_len, text);
} else {
printf("?");
token_len = 1;
}
text += token_len;
len -= token_len;
}
printf("]\n");
}
static void run_test(token_tree *tree, const char **texts) {
int i;
for(i = 0; texts[i] != NULL; i++) {
print_tokens(tree, texts[i]);
}
}
int main(int argc, char *argv[]) {
token_tree *tree = token_tree_new();
add_tokens(tree, test_tokens);
run_test(tree, test_input);
run_test(tree, test_tokens);
token_tree_free(tree);
}
#endif

Resources