Related
I've just begun learning the C language and I ran into an issue with one of my programs.
I am getting an error: "Illegal instruction 4" when executing: ./dictionary large.txt
Large.txt is a file with 143091 alphabetically sorted words, with each word starting on a new line. I am trying to load all of them into a hash table and return true if all the words are loaded successfully.
This code works for me if the code in bool load() is within int main and load() is non-existent. However, once I place it inside the load() function and call it from main, I get an error.
I would appreciate help on this, as there are not many threads on Illegal instruction.
This is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdbool.h>
// Maximum length for a word
// (e.g., pneumonoultramicroscopicsilicovolcanoconiosis)
#define LENGTH 45
// Number of letters in the english alphabet
#define ALPHABET_LENGTH 26
// Default dictionary
#define DICTIONARY "large.txt"
// Represents a node in a hash table
typedef struct node
{
char word[LENGTH + 1];
struct node *next;
} node;
// Number of buckets in hash table
const unsigned int N = ALPHABET_LENGTH;
// Hash table
node *table[N];
// Load function
bool load(char *dictionary);
// Hash function
int hash(char *word);
int main(int argc, char *argv[])
{
// Check for correct number of args
if (argc != 2 && argc != 3)
{
printf("Usage: ./speller [DICTIONARY] text\n");
exit(1);
}
// Determine which dictionary to use
char *dictionary = (argc == 3) ? argv[1] : DICTIONARY;
bool loaded = load(dictionary);
// TODO: free hashtable from memory
return 0;
}
bool load(char *dictionary)
{
// Open dictionary for reading
FILE *file = fopen(dictionary, "r");
if (file == NULL)
{
printf("Error 2: could not open %s. Please call customer service.\n", dictionary);
exit(2);
}
// Initialize array to NULL
for (int i = 0; i < N; i++)
table[i] = NULL;
// Declare and initialize variables
unsigned int char_count = 0;
unsigned int word_count = 0;
char char_buffer;
char word_buffer[LENGTH + 1];
int hash_code = 0;
int previous_hash_code = 0;
// Declare pointers
struct node *first_item;
struct node *current_item;
struct node *new_item;
// Is true the first time the while loop is ran to be able to distinguish between hash_code and previous_hash_code after one loop
bool first_loop = true;
// Count the number of words in dictionary
while (fread(&char_buffer, sizeof(char), 1, file))
{
// Builds the word_buffer by scanning characters
if (char_buffer != '\n')
{
word_buffer[char_count] = char_buffer;
char_count++;
}
else
{
// Increases word count each time char_buffer == '\n'
word_count += 1;
// Calls the hash function and stores its value in hash_code
hash_code = hash(&word_buffer[0]);
// Creates and initializes first node in a given table index
if (hash_code != previous_hash_code || first_loop == true)
{
first_item = table[hash_code] = (struct node *)malloc(sizeof(node));
if (first_item == NULL)
{
printf("Error 3: memory not allocated. Please call customer service.\n");
return false;
}
current_item = first_item;
strcpy(current_item->word, word_buffer);
current_item->next = NULL;
}
else
{
new_item = current_item->next = (struct node *)malloc(sizeof(node));
if (new_item == NULL)
{
printf("Error 4: memory not allocated. Please call customer service.\n");
return false;
}
current_item = new_item;
strcpy(current_item->word, word_buffer);
current_item->next = NULL;
}
// Fills word buffer elements with '\0'
for (int i = 0; i < char_count; i++)
{
word_buffer[i] = '\0';
}
// Signals the first loop has finished.
first_loop = false;
// Clears character buffer to keep track of next word
char_count = 0;
// Keeps track if a new table index should be initialized
previous_hash_code = hash_code;
}
}
return true;
}
// Hash in order of: 'a' is 0 and 'z' is 25
int hash(char *word_buffer)
{
int hash = word_buffer[0] - 97;
return hash;
}
Thank you in advance!
Chris
You should use node *table[ALPHABET_LENGTH]; for the table declaration instead of node *table[N];
There is a difference between constant macros and const variables, a macro can be used in a constant expression, such as a global array bound as per your use case, whereas a const variable cannot.
As you can see here, the compiler you say you are using, gcc, with no compiler flags, issues an error message:
error: variably modified 'table' at file scope
You can read more about these differences and use cases in "static const" vs "#define" vs "enum" it has more subjects, like static and enum, but is a nice read to grasp the differences between these concepts.
with following code I can store one string only.
Main problem is how to store several. If i want to enter another string after the first one it wont do it.
I didnt write it in code but when I type("KRAJ") it should get out of while loop.
typedef struct{
char Objekat[20+1];
char Mjesto[20+1];
char velicina [20];
int cijena;
char kn[3];
char stanje[20];
}Apartmani;
int main()
{
Apartmani *apartmani=(Apartmani*)malloc(sizeof(Apartmani)*50);
while(scanf("%[^,\n],%[^,],%[^,],%d%[^,],%[^\n]", &apartmani[i].Objekat,&apartmani[i].Mjesto,&apartmani[i].velicina,
&apartmani[i].cijena,&apartmani[i].kn, &apartmani[i].stanje )==6)
{
i++;
}
for(p=0;p<i;p++)
{
printf("%s %s %s %d %s %s",apartmani[p].Objekat,apartmani[p].Mjesto,apartmani[p].velicina,apartmani[p].cijena,
apartmani[p].kn, apartmani[p].stanje);
}
}
For example:
string 1: Apartman, Novalja, 100.00 m2, 750000kn, dobro ocuvano.
string 2: Kuca, Ivanbregovia, 20m2, Imtoski, 21252RH, vrijednost-neprocjenjiva.
You should use fgets() plus sscanf().
You should not cast malloc[Do I cast the result of malloc?][1]. Remember to check the return value of malloc, since it can be failed.
change the line of allocating apartmani to:
Apartmani *apartmani= malloc(sizeof(Apartmani)*50);
if(!apartmani) {return -1;}
Do not use & for the input of string.
Check the value of i because its value is limited to 50.
Your code is missing the declaration of i (should be: int i = 0), and the declaration of p also.
Your while loop can be as below:
int i = 0;
char line[100];
while(i < 50 && fgets(line,sizeof(line),stdin))
{
line[strcspn (line, "\n" )] = '\0'; // trip the enter character at the end of line.
int err = sscanf(line,"%20[^,],%20[^,],%19[^,],%d,%2[^,],%19[^\n]", apartmani[i].Objekat,apartmani[i].Mjesto,apartmani[i].velicina,&apartmani[i].cijena,
apartmani[i].kn, apartmani[i].stanje);
if(err != 6)
break;
i++;
}
If I understand you correctly, you want to store several 'Apartmani' structures.
In this case, you have 2 main possibilites :
Using array of structures (Fastest to write but less efficient)
Use linked-list (More efficient but more complex to use)
Examples
1: Using array of structures
#define MAX_APARTMANI 50
int main(void) {
int i = 0;
/* Create Apartmani array */
Apartmani *apartmani_tab[MAX_APARTMANI];
do {
/* loop by using malloc on a single element */
apartmani_tab[i] = (Apartmani *) malloc(sizeof(Apartmani));
/* While check using scanf */
} while (scanf("%[^,\n],%[^,],%[^,],%d%[^,],%[^\n]", apartmani_tab[i]->Objekat, apartmani_tab[i]->Mjesto, apartmani_tab[i]->velicina,
apartmani_tab[i]->cijena, apartmani_tab[i]->kn, apartmani_tab[i]->stanje) == 6 && ++i < MAX_APARTMANI)
/* good pratice: don't forget to free memory ! */
while (--i > 0) {
free(apartmani_tab[i]);
}
return (0);
}
2: Using linked-list
typedef struct Apartmani {
char Objekat[20+1];
char Mjesto[20+1];
char velicina [20];
int cijena;
char kn[3];
char stanje[20];
struct Apartmani *next;/* add pointer to next item in the list */
} Apartmani_t;
Apartmani_t *new_item(void) {
Apartmani_t *new_element = NULL;
new_element = (Apartmani_t *) malloc(sizeof(Apartmani));
if (!new_element)
return (NULL);
memset(new_element, 0, sizeof(*new_element));
new_element->next = NULL;
return (new_element);
}
int main(void) {
/* Initialize Apartmani list*/
Apartmani *apartmani_list = NULL, *current = NULL;
do {
if (!apartmani_list) { /* if empty list */
apartmani_list = new_item(); /* add first item */
if (!apartmani_list) /* prevent malloc errors */
break;
current = apartmani_list; /* link current pointer to list */
} else {
current->next = new_item();
if (!current->next) /* if malloc fails */
break;
current = current->next; /* update current pointer */
}
} while (scanf("%[^,\n],%[^,],%[^,],%d%[^,],%[^\n]", current->Objekat, current->Mjesto, current->velicina, current->cijena, current->kn, current->stanje) == 6) /* While check using scanf */
/* good pratice: don't forget to free memory ! */
while (apartmani_list) {
current = apartmani_list->next;
free(apartmani_list);
apartmani_list = current;
}
}
NB: I have not tried this code but the final version is probably very close to that.
Since my 8051 compiler doesn't have a feature that detects unused variables, I decide to try to implement my own, but it doesn't work.
When the program runs, it successfully identifies all labels by reading the main file then for each label name, it scans the entire file and for each line scanned, the following function is called:
findlabel(labelname,fileline);
Upon executing the program, it incorrectly identifies the following variables as unused:
PQR, MNO, TUV, and the null
The file that I am having this program continuously scan has the following contents:
ABC equ 1h
GHI equ 2h
JKL equ 3h
TUV equ 6h
MNO equ 4h
PQR equ 5h
cjne A,#ABC,def
mov GHI,#1h
mov JKL,MNO
def:
But MNO is used in the "mov JKL,MNO" line.
I also tried trimming out carriage returns and extra spacing and that was no help.
What am I doing wrong?
Source code follows:
void trim(char* astr){
while (astr[0]==' ' || astr[0]=='\r'|| astr[0]=='\t' || astr[0]=='\n'){
strcpy(astr,astr+1);
}
int sz=strlen(astr)-2;
while(astr[sz]==' ' || astr[sz]=='\r'|| astr[sz]=='\t' || astr[sz]=='\n'){
astr[sz]='\0';sz--;
}
}
int findlabel(char* lbl,char*fline){
int par,isdec=0;
char* semicolon=strcasestr(fline,";");
char* qs=strcasestr(fline,"';'");
if (semicolon && !qs){
//strip everything after semicolon if not quoted
memcpy(fline,fline,semicolon-fline);
fline[semicolon-fline]='\0';
}
trim(fline);
char* spc=strcasestr(fline," "); //Make sure there's a space inbetween text
if (spc){
strcpy(fline,spc+1); // toss out command
trim(fline);
for (par=1;par<=3;par++){
char ilbl[2000];
char* comma=strcasestr(fline,",");
if (comma){
//found comma so strip it and save parameter to ilbl
memcpy(ilbl,fline,comma-fline);
ilbl[comma-fline]='\0';
strcpy(fline,comma+1);
}else{
//no comma so run this loop one more time with last part of file line as parameter
strcpy(ilbl,fline);par=99;
}
trim(ilbl);
if (strcasecmp(ilbl,lbl)==0){isdec=1;par=99;} //first param = #label
if (ilbl[0]=='#'){
strcpy(ilbl,ilbl+1);
if (strcasecmp(ilbl,lbl)==0){isdec=1;par=99;} //first param = label
}
}
}
return isdec;
}
What am I doing wrong?
Approaching the problem with a custom solution without using any known patterns generally used to solve it, as it is a known problem.
This is a parsing problem, so you would need a state machine to lex tokens into a symbol table, and then increase the symbol count whenever they are seen. At the end of the program, check symbols whose count is zero, and they are the unused ones.
struct symbol_s {
char *name;
size_t count;
struct symbol_s *next;
};
This struct is a list of name and count pairs, which is enough to track the number of symbols seen.
One distinction here when parsing is the declaration (or first time use) of a symbol and their subsequent use. In your example, we can define the declaration as any symbol that is followed by an equ string. This can later be extended or modified to include other types, such as labels.
With all that, a rather clumsy implementation can be written as:
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
const char *source = " \
ABC equ 1h \
GHI equ 2h \
JKL equ 3h \
TUV equ 6h \
MNO equ 4h \
PQR equ 5h \
\
cjne A,#ABC,def \
mov GHI,#1h \
mov JKL,MNO \
def: \
";
struct symbol_s {
char *name;
size_t count;
struct symbol_s *next;
};
void symbol_append_child(struct symbol_s *head, struct symbol_s *next)
{
struct symbol_s *it;
assert(head != NULL);
assert(next != NULL);
it = head;
while (it->next != NULL && (it = it->next)) { }
it->next = next;
}
struct symbol_s *symbol_new(char *name)
{
struct symbol_s *symbol;
assert(name != NULL);
symbol = calloc(1, sizeof(*symbol));
symbol->name = strdup(name);
return symbol;
}
void symbol_free(struct symbol_s *symbol)
{
free(symbol->name);
free(symbol);
}
int main()
{
int cursor = 0;
int c;
struct symbol_s head = { 0 };
while ((c = source[cursor++]) != 0) {
if (isalpha(c)) {
int begin_cursor = cursor - 1;
while ((c = source[cursor++]) && isalnum(c)) {}
if (isspace(c) && strncmp("equ", source + cursor, 3) == 0) {
char buffer[512];
size_t len = cursor - begin_cursor - 1;
strncpy(buffer, source + begin_cursor, len);
buffer[len] = 0;
symbol_append_child(&head, symbol_new(buffer));
} else {
char buffer[512];
size_t len = cursor - begin_cursor - 1;
strncpy(buffer, source + begin_cursor, len);
buffer[len] = 0;
struct symbol_s *it;
for (it = &head; it != NULL; it = it->next) {
if (it->name == NULL) {
continue;
}
if (strncmp(it->name, source + begin_cursor, len) == 0) {
it->count += 1;
}
}
continue;
}
}
}
// Print unused symbols
struct symbol_s *it;
for (it = &head; it != NULL; it = it->next) {
if (it->name == NULL) {
continue;
}
if (it->count == 0) {
printf("Unused symbol: %s\n", it->name, it->count);
}
}
// clean up
struct symbol_s *prev;
for (prev = NULL, it = &head; it != NULL; prev = it, it = it->next) {
if (it->name == NULL) {
continue;
}
if (prev != NULL && prev->name) {
symbol_free(prev);
}
}
symbol_free(prev);
return 0;
}
Which prints:
Unused symbol: TUV
Unused symbol: PQR
For running example: https://ideone.com/b3KggJ
Although, as a reminder, you can do this more easily with regexp and a higher level language.
How can I read each individual character from a string that is accessed through an array of pointers? In the below code I currently have generated an array of pointers to strings called, symCodes, in my makeCodes function. I want to read the strings 8 characters at a time, I thought about concatenating each string together, then looping through that char by char but the strings in symCodes could be up to 255 characters each, so I feel like that could possibly be too much all to handle at once. Instead, I thought I could read each character from the strings, character by character.
I've tried scanf or just looping through and always end up with seg faults. At the end of headerEncode(), it's near the bottom. I malloc enough memory for each individual string, I try to loop through the array of pointers and print out each individual character but am ending up with a seg fault.
Any suggestions of a different way to read an array of pointers to strings, character by character, up to n amount of characters is appreciated.
EDIT 1: I've updated the program to no longer output warnings when using the -Wall and -W flags. I'm no longer getting a seg fault(yay!) but I'm still unsure of how to go about my question, how can I read an array of pointers to strings, character by character, up to n amount of characters?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "huffman.h"
#define FAIL 0
#define SUCCESS 1
/* global 1 day arrays that hold chars and their freqs from file */
unsigned long globalFreqs[256] = {0};
unsigned char globalUsedCh[256] = {0};
char globalCodes[256] = {0};
unsigned char globalUniqueSymbols;
unsigned long totalCount = 0;
typedef struct HuffmanTreeNode* HTNode;
struct HuffmanTreeNode* globalSortedLL;
/*
struct has the input letter, the letters frequency, and the left and irght childs
*/
struct HuffmanTreeNode
{
char symbol;
unsigned long freq;
char *code;
struct HuffmanTreeNode *left, *right;
struct HuffmanTreeNode* next;
};
/* does it make sense to have a struct for the entire huffman tree to see its size? */
struct HuffmanTree
{
unsigned size;
};
/*generate new node with given symbol and freq */
struct HuffmanTreeNode* newNode(char symbol, int freq)
{
struct HuffmanTreeNode* newNode = malloc(sizeof(struct HuffmanTreeNode));
newNode->symbol = symbol;
newNode->freq = freq;
newNode->left = newNode->right = NULL;
return newNode;
}
/*current work in progress, i believe this is the way to insert it for a BST
/* will change for HuffmanTreenode once working
/*
*/
struct HuffmanTreeNode* insert(struct HuffmanTreeNode* node, struct HuffmanTreeNode* htnNew)
{
struct HuffmanTreeNode* currentNode = node;
if(currentNode == NULL || compareTwoNodes(htnNew, currentNode))
{
htnNew->next = currentNode;
return htnNew;
}
else
{
while(currentNode->next != NULL && compareTwoNodes(currentNode->next, htnNew))
{
currentNode = currentNode->next;
}
htnNew->next = currentNode->next;
currentNode->next = htnNew;
return node;
}
}
int compareTwoNodes(struct HuffmanTreeNode* a, struct HuffmanTreeNode* b)
{
if(b->freq < a->freq)
{
return 0;
}
if(a->freq == b->freq)
{
if(a->symbol > b->symbol)
return 1;
return 0;
}
if(b->freq > a->freq)
return 1;
}
struct HuffmanTreeNode* popNode(struct HuffmanTreeNode** head)
{
struct HuffmanTreeNode* node = *head;
*head = (*head)->next;
return node;
}
/*convert output to bytes from bits*/
/*use binary fileio to output */
/*put c for individual character byte*/
/*fwrite each individual byte for frequency of symbol(look at fileio slides) */
/*
#function:
#param:
#return:
*/
int listLength(struct HuffmanTreeNode* node)
{
struct HuffmanTreeNode* current = node;
int length = 0;
while(current != NULL)
{
length++;
current = current->next;
}
return length;
}
/*
#function:
#param:
#return:
*/
void printList(struct HuffmanTreeNode* node)
{
struct HuffmanTreeNode* currentNode = node;
while(currentNode != NULL)
{
if(currentNode->symbol <= ' ' || currentNode->symbol > '~')
printf("=%d", currentNode->symbol);
else
printf("%c", currentNode->symbol);
printf("%lu ", currentNode->freq);
currentNode = currentNode->next;
}
printf("\n");
}
/*
#function:
#param:
#return:
*/
void buildSortedList()
{
int i;
for(i = 0; i < 256; i++)
{
if(!globalFreqs[i] == 0)
{
globalSortedLL = insert(globalSortedLL, newNode(i, globalFreqs[i]));
}
}
printf("Sorted freqs: ");
printList(globalSortedLL);
printf("listL: %d\n", listLength(globalSortedLL));
}
/*
#function: isLeaf()
will test to see if the current node is a leaf or not
#param:
#return
*/
int isLeaf(struct HuffmanTreeNode* node)
{
if((node->left == NULL) && (node->right == NULL))
return SUCCESS;
else
return FAIL;
}
/*where I plan to build the actual huffmantree */
/*
#function:
#param:
#return:
*/
struct HuffmanTreeNode* buildHuffmanTree(struct HuffmanTreeNode* node)
{
int top = 0;
struct HuffmanTreeNode *left, *right, *topNode, *huffmanTree;
struct HuffmanTreeNode* head = node;
struct HuffmanTreeNode *newChildNode, *firstNode, *secondNode;
while(head->next != NULL)
{
/*grab first two items from linkedL, and remove two items*/
firstNode = popNode(&head);
secondNode = popNode(&head);
/*combine sums, use higher symbol, create new node*/
newChildNode = newNode(secondNode->symbol, (firstNode->freq + secondNode->freq));
newChildNode->left = firstNode;
newChildNode->right = secondNode;
/*insert new node, decrement total symbols in use */
head = insert(head, newChildNode);
}
return head;
}
void printTable(char *codesArray[])
{
int i;
printf("Symbol\tFreq\tCode\n");
for(i = 0; i < 256; i++)
{
if(globalFreqs[i] != 0)
{
if(i <= ' ' || i > '~')
{
printf("=%d\t%lu\t%s\n", i, globalFreqs[i], codesArray[i]);
}
else
{
printf("%c\t%lu\t%s\n", i, globalFreqs[i], codesArray[i]);
}
}
}
printf("Total chars = %lu\n", totalCount);
}
void makeCodes(
struct HuffmanTreeNode *node, /* Pointer to some tree node */
char *code, /* The *current* code in progress */
char *symCodes[256], /* The array to hold the codes for all the symbols */
int depth) /* How deep in the tree we are (code length) */
{
char *copiedCode;
int i = 0;
if(isLeaf(node))
{
code[depth] = '\0';
symCodes[node->symbol] = code;
return;
}
copiedCode = malloc(255*sizeof(char));
memcpy(copiedCode, code, 255*sizeof(char));
code[depth] = '0';
copiedCode[depth] = '1';
makeCodes(node->left, code, symCodes, depth+1);
makeCodes(node->right, copiedCode, symCodes, depth+1);
}
/*
#function: getFileFreq()
gets the frequencies of each character in the given
file from the command line, this function will also
create two global 1d arrays, one for the currently
used characters in the file, and then one with those
characters frequencies, the two arrays will line up
parallel
#param: FILE* in, FILE* out,
the current file being processed
#return: void
*/
void getFileFreq(FILE* in, FILE* out)
{
unsigned long freqs[256] = {0};
int i, t, fileCh;
while((fileCh = fgetc(in)) != EOF)
{
freqs[fileCh]++;
totalCount++;
}
for(i = 0; i < 256; i++)
{
if(freqs[i] != 0)
{
globalUsedCh[i] = i;
globalFreqs[i] = freqs[i];
if(i <= ' ' || i > '~')
{
globalUniqueSymbols++;
}
else
{
globalUniqueSymbols++;
}
}
}
/* below code until total count is for debugging purposes */
printf("Used Ch: ");
for(t = 0; t < 256; t++)
{
if(globalUsedCh[t] != 0)
{
if(t <= ' ' || t > '~')
{
printf("%d ", globalUsedCh[t]);
}
else
printf("%c ", globalUsedCh[t]);
}
}
printf("\n");
printf("Freq Ch: ");
for(t = 0; t < 256; t++)
{
if(globalFreqs[t] != 0)
{
printf("%lu ", globalFreqs[t]);
}
}
printf("\n");
/* end of code for debugging/vizualazation of arrays*/
printf("Total Count %lu\n", totalCount);
printf("globalArrayLength: %d\n", globalUniqueSymbols);
}
void headerEncode(FILE* in, FILE* out, char *symCodes[256])
{
char c;
int i, ch, t, q, b, z;
char *a;
char *fileIn;
unsigned char *uniqueSymbols;
unsigned char *byteStream;
unsigned char *tooManySym = 0;
unsigned long totalEncodedSym;
*uniqueSymbols = globalUniqueSymbols;
totalEncodedSym = ftell(in);
rewind(in);
fileIn = malloc((totalEncodedSym+1)*sizeof(char));
fread(fileIn, totalEncodedSym, 1, in);
if(globalUniqueSymbols == 256)
{
fwrite(tooManySym, 1, sizeof(char), out);
}
else
{
fwrite(uniqueSymbols, 1, sizeof(uniqueSymbols)-7, out);
}
for(i = 0; i < 256; i++)
{
if(globalFreqs[i] != 0)
{
fwrite(globalUsedCh+i, 1, sizeof(char), out);
fwrite(globalFreqs+i, 8, sizeof(char), out);
}
}
for(t = 0; t < totalEncodedSym; t++)
{
fwrite(symCodes[fileIn[t]], 8, sizeof(char), out);
}
for(q = 0; q < totalEncodedSym; q++)
{
symCodes[q] = malloc(255*sizeof(char));
a = symCodes[q];
while(*a != '\0')
printf("%c\n", *(a++));
}
printf("Total encoded symbols: %lu\n", totalEncodedSym);
printf("%s\n", fileIn);
}
void encodeFile(FILE* in, FILE* out)
{
int top = 0;
int i;
char *code;
char *symCodes[256] = {0};
int depth = 0;
code = malloc(255*sizeof(char));
getFileFreq(in, out);
buildSortedList();
makeCodes(buildHuffmanTree(globalSortedLL), code, symCodes, depth);
printTable(symCodes);
headerEncode(in, out, symCodes);
free(code);
}
/*
void decodeFile(FILE* in, FILE* out)
{
}*/
There are many problems in your code:
[major] function compareTwoNodes does not always return a value. The compiler can detect such problems if instructed to output more warnings.
[major] the member symbol in the HuffmanTreeNode should have type int. Type char is problematic as an index value because it can be signed or unsigned depending on compiler configuration and platform specificities. You assume that char has values from 0 to 255, which is incorrect for most platforms where char actually has a range of -128 .. 127. Use unsigned char or int but cast the char values to unsigned char to ensure proper promotion.
[major] comparison if (globalUniqueSymbols == 256) is always false because globalUniqueSymbols is an unsigned char. The maximum number of possible byte values is indeed 256 for 8-bit bytes, but it does not fit in an unsigned char, make globalUniqueSymbols an int.
[major] *uniqueSymbols = globalUniqueSymbols; in function headerEncode stores globalUniqueSymbols into an uninitialized pointer, definitely undefined behavior, probable segmentation fault.
[major] sizeof(uniqueSymbols) is the size of a pointer, not the size of the array not the size of the type. Instead of hacking it as sizeof(uniqueSymbols)-7, fputc(globalUniqueSymbols, out);
[major] fwrite(tooManySym, 1, sizeof(char), out); is incorrect too, since tooManySym is initialized to 0, ie: it is a NULL pointer. You need a special value to tell that all bytes values are used in the source stream, use 0 for that and write it with fputc(0, out);.
You have nested C style comments before function insert, this is not a bug but error prone and considered bad style.
function newNode should take type unsigned long for freq for consistency.
function buildHuffmanTree has unused local variables: right, top and topNode.
variable i is unused in function makeCodes.
many unused variables in headerEncode: byteStream, c, ch, b...
totalEncodedSym is an unsigned long, use an index of the proper type in the loops where you stop at totalEncodedSym.
unused variables un encodeFile: i, top...
Most of these can be detected by the compiler with the proper warning level: gcc -Wall -W or clang -Weverything...
There are probably also errors in the program logic, but you cannot see these until you fix the major problems above.
I am trying to tokenize a string. I have a table of available tokens ordered in the form of a trie. Each token knows it has children. A simple tokens table will look like,
pattern value has_children
-------- ------ --------
s s-val 1
stack stack-val 0
over over-val 1
overflow overflow-val 0
In this table, stack is a child of s and overflow is a child of over. In practice, this table will have 5000+ records ordered in this way.
Now, given a string stackover, it should output stack-valover-val. Algorithm is greedy and it will try to find the longest match always.
To do this, I will start reading each character from the input, look for match, if a match found and the token has children, look for match again by including next character. Do this until we find the longest match. If no match found, try to match by including the next character until we reach the end of string or a successful match.
If we reached end of the string without a match, output ? symbol and remove the first character from the input. Repeat the whole process with remaining characters.
This algorithm works, but the backtracking and iterating on all possible combinations of the input makes it slow and complex.
I am wondering is there a better way of solving this? Any help would be appreciated.
Instead of backtracking you could keep in memory all possible results, until one result singles out at certain point in input stream. Example
Tokens: S STACK STACKOVERFLOW STAG OVER OVERFLOW
String: SSTACKOVERFUN
1 - Found S on place 0, have tokens that begin with S, try them all, only S is valid, so resolve S
2 - S on 1, have such tokens, try them, possible valid are S and STACK. Don't resolve, just keep them in mind.
3 - T on 2, have no such tokens, so S could be resolved now, but we also have longer token (STACK) so S is no good. Ditch S, and STACK is only left, but it has children. Try string for children. There are no possible children so resolve STACK
4 - O on 6, have such tokens, try them, have only OVER, so resolve OVER
5 - F on 10, no such tokens, and nothing to resolve from before so this is non-tokenizable
6 and 7 - same as step 5
Final result: S STACK OVER fun
Could you use the Aho-Corasick algorithm? It creates an automaton to search a keyword tree (trie).
I'm thinking that you want to take all of your keywords and sort them reverse alphabetically, so your list would become (plus a few extras)
0 stack 1
1 s 0
2 overflow 3
3 over 5
4 ovum 5
5 o 0
6 exchange 7
7 ex 0
The third column of this list are pointers to the parent token which is always lower on the list. Then you can take your target string and binary search where it fits on this list. If it lands above a token which matches then you clip off that portion and repeat the process for the remainder. If it doesn't match you use the parent pointer to find the next longest potential matching token.
If you want to get really fancy you can also chunk up the strings into 64bit words and compare 8 characters at once in the binary search.
I suggest you try Ragel, It can generate efficient scanners that can do longest match/backtracking. See chapter 6.3 in the Ragel user guide for more information.
I've created a tiny test which I think matches your specification, this is only the state machine description, without the code to feed input:
%%{
machine test;
main := |*
's' => { puts("s-val");};
'stack' => { puts("stack-val");};
'over' => { puts("over-val");};
'overflow' => { puts("overflow-val");};
# Anything else matches to any, outputs a '?' and continues
any => {putc('?');};
*|;
}%%
The following token_tree code is based on the prefix_tree class from ZeroMQ
The prefix_tree class only returns "true" when one of the tree's prefixes matches the start of the input text. It will not even tell you which prefix or how long that prefix was.
This token_tree will look for the longest token that matches the start of the input text. The search
function token_tree_longest_token() only needs to return the length of the longest token matched
against the start of the input text.
The basic algorithm is similar to the one described in the question, but it's implmentation might be faster.
Also there are some ways to improve memory usage, which could have it faster.
#include <stdint.h>
#include <stdlib.h>
/* #define TEST_TOKEN_TREE */
/*
* TODO: possible improvements, use multiple types of nodes: string/branch/leaf.
* The string node would replace a chain of normal token_nodes and save memory.
* This would require spliting a node to add branch points.
* Use these structs:
* struct token_node {
* uint32_t ref_count;
* uint8_t node_type; -- node is token_node_str/token_node_branch/token_node_leaf
* };
* struct token_node_str {
* token_node base;
* uint8_t reserved;
* uint16_t len; -- string length
* token_node *child; -- string nodes can only have one child.
* uint8_t str[0]; -- embedded string (not null-terminated)
* };
* struct token_node_branch {
* token_node base;
* uint8_t min; -- smallest char in child list.
* uint16_t count; -- child count.
* token_node *children[0];
* };
* struct token_node_leaf { -- leaf nodes have no children.
* token_node base;
* };
* This will save memory, but will make code much more complex.
*/
typedef struct token_tree token_tree;
typedef struct token_node token_node;
struct token_tree {
token_node *root; /**< root node of token tree. */
};
struct token_node {
uint32_t ref_count; /**< how many token references end at this node. */
uint8_t min; /**< smallest 'char' in children's list. */
uint8_t reserved; /**< padding. */
uint16_t count; /**< number of children. (max count = 256, so count must be 16bits) */
token_node *children[0]; /**< list of children nodes. index by (c - min) */
};
#define NODE_SIZE(count) (sizeof(token_node) + (sizeof(token_node *) * count))
static token_node *token_node_new(uint16_t count) {
token_node *node = calloc(1, NODE_SIZE(count));
node->count = count;
return node;
}
static void token_node_build_chain(token_node **pnode, const uint8_t *token, size_t len) {
token_node *node;
do {
/* the last node in the chain will have no children. */
node = token_node_new((len == 0) ? 0 : 1);
*pnode = node; /* add node to slot in parent's children list. */
if(len == 0) break;
/* new node will have one child. */
node->min = *token;
node->count = 1;
/* slot where next node will be saved. */
pnode = &(node->children[0]);
/* consume char. */
token++;
len--;
} while(1);
/* mark last node as end of a valid token. */
node->ref_count++;
}
static void token_node_free(token_node *node) {
uint32_t i;
uint32_t count = node->count;
/* free children nodes. */
for(i=0; i < count; i++) {
if(node->children[i]) token_node_free(node->children[i]);
}
free(node);
}
static void token_node_grow(token_node **pnode, uint8_t c) {
token_node *node = *pnode;
token_node **children;
uint8_t old_min = node->min;
uint16_t old_count = node->count;
uint32_t i;
uint8_t min;
uint16_t count;
if(c < old_min) {
min = c;
count = old_count + (old_min - min);
} else {
if(old_count == 0) {
/* the list was empty, so this is the first char. */
old_min = c;
}
min = old_min;
c -= old_min;
if(c < old_count) {
/* don't need to grow. */
return;
}
count = c + 1;
}
node = realloc(node, NODE_SIZE(count));
*pnode = node;
children = node->children;
/* if the 'min' value changed, then we need to move all the old slots up. */
if(old_min != min) {
uint32_t diff = old_min - min;
for(i=count-1; i >= diff; i--) {
children[i] = children[i - diff];
}
/* null new slots at start of children list. */
for(i=0; i < diff; i++) {
children[i] = NULL;
}
} else {
/* null new slots at end of children list. */
for(i=old_count; i < count; i++) {
children[i] = NULL;
}
}
node->min = min;
node->count = count;
}
static token_node **token_node_find_last_node(token_node **pnode, const uint8_t **ptoken, size_t *plen) {
const uint8_t *token = *ptoken;
size_t len = *plen;
uint32_t c;
token_node *node = *pnode;
while(node && len) {
/* next char. */
c = (*token);
/* if c < node->min, then it will underflow and be > node->count. */
c -= node->min;
/* make sure c is in range. */
if(c >= node->count) {
/*
* NOTE: we don't consume this char and "*pnode" will not be null.
* When adding tokens, this node will be grown to hold more children.
*/
break;
}
/* consume char. */
token++;
len--;
/* get pointer to next node's slot. */
pnode = &(node->children[c]);
node = *pnode;
}
*ptoken = token;
*plen = len;
/* return pointer to last node's slot. */
return pnode;
}
static void token_node_add(token_node **pnode, const uint8_t *token, size_t len) {
token_node *node;
/* find last node in chain for this token. */
pnode = token_node_find_last_node(pnode, &token, &len);
/* if full token was consumed then we found the last node for this token. */
if(!len) {
node = *pnode;
node->ref_count++;
return;
}
/* check if the children list of the last node needs to be grown. */
node = *pnode;
if(node) {
uint32_t c = *token;
/* consume char. */
token++;
len--;
/* grow node to make room for new char. */
token_node_grow(pnode, c);
node = *pnode; /* token_node_grow() may change the node's pointer. */
/* get slot for new child. */
pnode = &(node->children[c - node->min]);
}
/* build node chain for un-consumed part of token. */
token_node_build_chain(pnode, token, len);
}
static size_t token_node_longest_token(token_node *node, const uint8_t *text, size_t len) {
size_t last_token_len = 0;
size_t off = 0;
uint32_t c;
/* loop until we get a NULL node or run out of text. */
do {
if(node->ref_count > 0) {
/* found a token, keep track of it's length. */
last_token_len = off;
}
/* end of input text. */
if(off >= len) break;
/* next char. */
c = text[off];
/* if c < node->min, then it will underflow and be > node->count. */
c -= node->min;
/* make sure c is in range. */
if(c >= node->count) {
/* End of search, no more child nodes. */
break;
}
/* consume char. */
off++;
/* get pointer to next node's slot. */
node = node->children[c];
} while(node);
/* return length of largest token found. */
return last_token_len;
}
extern token_tree *token_tree_new() {
token_tree *tree = malloc(sizeof(token_tree));
tree->root = token_node_new(0);
return tree;
}
extern void token_tree_free(token_tree *tree) {
token_node_free(tree->root);
free(tree);
}
extern void token_tree_add(token_tree *tree, const char *token, size_t len) {
token_node_add(&(tree->root), token, len);
}
extern size_t token_tree_longest_token(token_tree *tree, const char *text, size_t len) {
return token_node_longest_token(tree->root, text, len);
}
#ifdef TEST_TOKEN_TREE
#include <stdio.h>
#include <string.h>
static const char *test_tokens[] = {
"s",
"stack",
"stackoverflow",
"over",
"overflow",
NULL,
};
static const char *test_input[] = {
"aastackoverasdfasdf",
"stack7777",
"777stack777",
"overstackflow",
NULL,
};
static void add_tokens(token_tree *tree, const char **tokens) {
int i;
for(i = 0; tokens[i] != NULL; i++) {
token_tree_add(tree, tokens[i], strlen(tokens[i]));
}
}
static void print_tokens(token_tree *tree, const char *text) {
size_t len = strlen(text);
size_t token_len;
printf("input: \"%s\"\n", text);
printf("tokens: [");
while(len) {
token_len = token_tree_longest_token(tree, text, len);
if(token_len > 0) {
printf("<%.*s>", (int)token_len, text);
} else {
printf("?");
token_len = 1;
}
text += token_len;
len -= token_len;
}
printf("]\n");
}
static void run_test(token_tree *tree, const char **texts) {
int i;
for(i = 0; texts[i] != NULL; i++) {
print_tokens(tree, texts[i]);
}
}
int main(int argc, char *argv[]) {
token_tree *tree = token_tree_new();
add_tokens(tree, test_tokens);
run_test(tree, test_input);
run_test(tree, test_tokens);
token_tree_free(tree);
}
#endif