Lower bound from the right in a B-tree - c

I have a B-tree and, given an arbitrary parameter key, I'd like to find the greatest data key that is less than or equal to the parameter key. In other words, I want it to look to the left to figure out what key it should use, in O(log n).
I've already modified the implementation of lower_bound in C code.
#define ORDER 3
/** Strict "orders after" predicate on keys: 1 when `a` is greater than `b`,
 otherwise 0. */
static int compare(const int a, const int b) {
	return b < a ? 1 : 0;
}
/* A node holding `size` keys in `key`; the array has room for ORDER - 1. */
struct node { unsigned size; int key[ORDER - 1]; };
/* A node followed by ORDER child links. `base` is the first member; the search
 code converts a `struct node *` to a `struct branch *` to reach `child`. */
struct branch { struct node base, *child[ORDER]; };
/* A position in the tree: key `idx` of `node`, and the `height` of that node
 as counted down during the descent. */
struct ref { struct node *node; unsigned height, idx; };
/* A (sub)tree: root `node` and its `height`. The search stops descending at
 height zero and treats a null `node` as an empty tree. */
struct tree { struct node *node; unsigned height; };
/** Lower bound of `x` in `tree`.
 @return A reference to the least key that is greater than or equal to `x`.
 `node` is null if the tree is idle or every key is less than `x`. */
static struct ref lower(const struct tree tree, const int x) {
	struct ref lo, found;
	found.node = 0, found.height = 0, found.idx = 0;
	if(!tree.node) return found;
	for(lo.node = tree.node, lo.height = tree.height; ;
		lo.node = ((const struct branch *)(const void *)lo.node)->child[lo.idx],
		lo.height--) {
		unsigned hi = lo.node->size;
		lo.idx = 0;
		/* Binary search for the first key not less than `x`. An empty node
		 simply skips the loop; it must still reach the height test below,
		 otherwise an empty leaf would be read as if it had children. */
		while(lo.idx < hi) {
			const unsigned mid = (lo.idx + hi) / 2; /* Will not overflow. */
			if(compare(x, lo.node->key[mid]) > 0) lo.idx = mid + 1;
			else hi = mid;
		}
		if(lo.idx < lo.node->size) { /* Within bounds, record the current. */
			found = lo;
			/* Here key >= x; if also not (key > x) it is an exact match and
			 nothing further down can be better. */
			if(compare(lo.node->key[lo.idx], x) <= 0) break;
		}
		if(!lo.height) break; /* Reached the bottom. */
	}
	return found;
}
static int tree_lower_or(const struct tree tree,
const int key, const int default_value) {
struct ref ref;
return (ref = lower(tree, key)).node
&& ref.idx < ref.node->size ? ref.node->key[ref.idx] : default_value;
}
#include <stdio.h>
/* Builds the two-level example tree {1,2} 4 {5,6} and prints, for each query
 0..7, the value returned by tree_lower_or with a default of 0. */
int main(void) {
	struct node node[] = { { 2, {1,2} }, { 2, {5, 6} } };
	struct branch root = { { 1, {4} }, {node, node+1} };
	const struct tree tree = { &root.base, 1 };
	int query = 0;
	while(query < 8) {
		const char *const sep = query < 7 ? ", " : "\n";
		printf("%d->%d%s", query, tree_lower_or(tree, query, 0), sep);
		query++;
	}
	return 0;
}
This uses the example in std::lower_bound, data = {1, 2, 4, 5, 5, 6}. (Note that my B-tree's keys are strongly increasing, so I can't have two 5s.) It prints out 0->1, 1->1, 2->2, 3->4, 4->4, 5->5, 6->6, 7->0, which is x->next x in set or 0.
This is not quite what I want. The upper_bound is also not quite what I want, but close.
I want a lower bound from the right instead of the left, x->last x in set or 0.
Is there a name for this, and how do I modify the lower above to give this result?

The way I would implement this is:
get the upper_bound
get the previous element (if any)
A) if there is a previous element and the element is == the key you are searching for, return it
B) otherwise, return the upper bound
In general you either care about the element directly before the upper_bound or about the upper_bound.

Following the upper_bound advice, I was able to get the required behaviour without going down twice by keeping a return variable that I updated as appropriate. I found that I was being a little sloppy. The lower_bound just lines up correctly, but I found upper_bound not really obvious.
The first thing I did was work out a better example where it would be really obvious what was in the range and what was in the domain. In this case, I thought of the letter keys as the domain and the node-indices as the range, (as in my question.)
Here, key and x are arbitrary elements of the domain of letters. Applying the upper_bound process for each node gives us hi in the range. If hi.idx is non-zero, then found.idx = hi.idx - 1 is an element of the range and a valid data reference. We go down the tree and allow this to be overwritten if appropriate. Finally, in tree_left_or and tree_right_or, we transform the range element found, (it is just an unstable internal pointer-index), to a meaningful corresponding letter domain key in the set of keys.
/* https://github.com/neil-edelman/orcish needed for Graphviz names. */
/*#include "orcish.h"*/
#include <stdio.h>
#include <assert.h>
#define ORDER 3
/** Strict "orders after" predicate on keys: 1 when `a` is greater than `b`,
 otherwise 0. */
static int compare(const char a, const char b) {
	return b < a ? 1 : 0;
}
/* A node holding `size` keys in `key`; the array has room for ORDER - 1. */
struct node { unsigned size; char key[ORDER - 1]; };
/* A node followed by ORDER child links. `base` is the first member; the search
 code converts a `struct node *` to a `struct branch *` to reach `child`. */
struct branch { struct node base, *child[ORDER]; };
/* A position in the tree: key `idx` of `node`, and the `height` of that node
 as counted down during the descent. */
struct ref { struct node *node; unsigned height, idx; };
/* A (sub)tree: root `node` and its `height`. The searches stop descending at
 height zero and treat a null `node` as an empty tree. */
struct tree { struct node *node; unsigned height; };
/** @return A reference to the least key in `tree` that is greater than or
 equal to `x`. If no such key exists, or the tree is idle, `node` will be
 null. */
static struct ref right(const struct tree tree, const char x) {
	struct ref lo, found;
	found.node = 0, found.height = 0, found.idx = 0;
	if(!tree.node) return found;
	for(lo.node = tree.node, lo.height = tree.height; ;
		lo.node = ((const struct branch *)(const void *)lo.node)->child[lo.idx],
		lo.height--) {
		unsigned hi = lo.node->size;
		lo.idx = 0;
		/* Lower-bound. An empty node skips the loop but must still reach the
		 height test below, otherwise an empty leaf would be read as if it
		 had children. */
		while(lo.idx < hi) {
			const unsigned mid = (lo.idx + hi) / 2; /* Will not overflow. */
			if(compare(x, lo.node->key[mid]) > 0) lo.idx = mid + 1;
			else hi = mid;
		}
		if(lo.idx < lo.node->size) { /* Within bounds, record the current. */
			found = lo;
			/* Here key >= x; if also not (key > x) the keys are equal and
			 nothing further down can be better. */
			if(compare(lo.node->key[lo.idx], x) <= 0) break;
		}
		if(!lo.height) break; /* Reached the bottom. */
	}
	return found;
}
/** #return Minimum element equal to or greater then `key` in `tree`, or, if
the `key` is larger than any in the set, `default_value`. */
static char tree_right_or(const struct tree tree,
const char key, const char default_value) {
struct ref ref;
return (ref = right(tree, key)).node
&& ref.idx < ref.node->size ? ref.node->key[ref.idx] : default_value;
}
/** @return A reference to the greatest key in `tree` that is less than or
 equal to `x`: the predecessor of the upper bound of `x`. If no such key
 exists, or the tree is idle, `node` will be null. */
static struct ref left(const struct tree tree, const char x) {
	struct ref hi, found;
	found.node = 0, found.height = 0, found.idx = 0;
	if(!tree.node) return found;
	for(hi.node = tree.node, hi.height = tree.height; ;
		hi.node = ((const struct branch *)(const void *)hi.node)->child[hi.idx],
		hi.height--) {
		unsigned lo = 0;
		hi.idx = hi.node->size;
		/* Upper-bound: first key greater than `x`. An empty node skips the
		 loop but must still reach the height test below, otherwise an empty
		 leaf would be read as if it had children. */
		while(lo < hi.idx) {
			const unsigned mid = (lo + hi.idx) / 2; /* Will not overflow. */
			if(compare(hi.node->key[mid], x) <= 0) lo = mid + 1;
			else hi.idx = mid;
		}
		if(hi.idx) {
			/* The key just before the upper bound is a candidate. */
			found = hi, found.idx--;
			/* Equal elements: nothing further down can be better. */
			if(compare(x, found.node->key[found.idx]) <= 0) break;
		}
		if(!hi.height) break; /* Reached the bottom. */
	}
	return found;
}
/** #return Maximum element equal to or smaller then `key` in `tree`, or, if
the `key` is smaller than any in the set, `default_value`. */
static char tree_left_or(const struct tree tree,
const char key, const char default_value) {
const struct ref ref = left(tree, key);
return ref.node ? ref.node->key[ref.idx] : default_value;
}
#if 0
/* Writes the Graphviz description of the node at the root of `sub` to `fp`:
 an HTML-like table with one row for the node's name and one row per key.
 When `sub` has non-zero height, it also writes an edge to every child and
 recurses into each child with the height reduced by one.
 Needs `orcify` from the commented-out "orcish.h" for the node names. */
static void subgraph(const struct tree *const sub, FILE *fp) {
const struct branch *branch;
unsigned i;
assert(sub->node && fp);
/* Table header: port 0 carries the node's name. */
fprintf(fp, "\ttrunk%p [label = <\n"
"<table border=\"0\" cellspacing=\"0\">\n"
"\t<tr><td border=\"0\" port=\"0\">"
"<font color=\"Gray75\">%s</font></td></tr>\n",
(const void *)sub->node, orcify(sub->node));
if(sub->node->size) fprintf(fp, "\t<hr/>\n");
/* One row per key on ports 1..size; odd rows get a grey background. */
for(i = 0; i < sub->node->size; i++) {
const char *const bgc = i & 1 ? " bgcolor=\"Gray95\"" : "";
fprintf(fp, "\t<tr><td border=\"0\" align=\"left\""
" port=\"%u\"%s>%c</td></tr>\n", i + 1, bgc, sub->node->key[i]);
}
fprintf(fp, "\t<hr/>\n"
"\t<tr><td></td></tr>\n"
"</table>>];\n");
/* Height zero: nothing below this node to draw. */
if(!sub->height) return;
/* Draw the lines between trees. */
branch = (struct branch *)(void *)sub->node;
for(i = 0; i <= branch->base.size; i++)
fprintf(fp, "\ttrunk%p:%u:se -> trunk%p;\n",
(const void *)sub->node, i, (const void *)branch->child[i]);
/* Recurse. */
for(i = 0; i <= branch->base.size; i++) {
struct tree child;
child.node = branch->child[i], child.height = sub->height - 1;
subgraph(&child, fp);
}
}
/** <https://graphviz.org/> */
/* Writes `tree` as a Graphviz digraph to the file named `fn`, replacing any
 existing contents. On failure to open the file, reports it with perror and
 returns without writing. */
static void graph(const struct tree *const tree,
	const char *const fn) {
	FILE *out;
	assert(tree && fn);
	out = fopen(fn, "w");
	if(!out) { perror(fn); return; }
	fputs("digraph {\n"
		"\tgraph [rankdir=LR, truecolor=true, bgcolor=transparent,"
		" fontname=modern, splines=false];\n"
		"\tnode [shape=none, fontname=modern];\n", out);
	if(tree->node) {
		subgraph(tree, out);
	} else {
		fputs("\tidle [shape=plaintext];\n", out);
	}
	fputs("\tnode [color=\"Red\"];\n"
		"}\n", out);
	fclose(out);
}
#endif
/* Builds the example tree {b,d} f {h,j}, prints tree_right_or and
 tree_left_or for every letter 'a'..'l' with default 'z', then compares the
 left results against the expected table and prints PASSED or failed. */
int main(void) {
	struct node node[] = { { 2, {'b','d'} }, { 2, {'h','j'} } };
	struct branch root = { { 1, {'f'} }, {node, node+1} };
	const struct tree tree = { &root.base, 1 };
	const int expected[] = { 'z', 'b', 'b', 'd', 'd', 'f',
		'f', 'h', 'h', 'j', 'j', 'j' };
	char got[sizeof expected / sizeof *expected];
	char c;
	int passed = 1;
	printf("right or z\n");
	for(c = 'a'; c < 'm'; c++)
		printf("%c\t%c\n", c, tree_right_or(tree, c, 'z'));
	printf("\nleft or z\n");
	for(c = 'a'; c < 'm'; c++) {
		got[c - 'a'] = tree_left_or(tree, c, 'z');
		printf("%c\t%c\n", c, got[c - 'a']);
	}
	printf("\nsupposed to be...\n");
	for(c = 'a'; c < 'm'; c++) {
		printf("%c\t%c\n", c, expected[c - 'a']);
		if(got[c - 'a'] != expected[c - 'a']) passed = 0;
	}
	printf("\n%s.\n", passed ? "PASSED" : "failed");
	return 0;
}

Related

How to unscramble a word and find all its matches in a txt file in C?

So given a string of up to 7 letters, I need to find every permutation of that string (with and without all the letters) and then check if any of those permutations can be found in my dictionary.txt file, and print the ones that match. So basically, if the user inputs "try," the permutations would be try, tr, tyr, ty, t, rty, etc., and then check if any of them match words in the txt file. I tried to do this using strncopy and strcmp, but the program doesn't always correctly deduce that two things are equal, it takes forever to run, and there's a bug where it counts having zero letters as a permutation of the original string.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define SIZE 100 /* number of words in dictionary.txt */
#define MAX 7 /* max number of letters in given string */
/* function to swap values at two pointers */
/* Exchanges the characters that `x` and `y` point to. */
void swap(char *x, char *y){
    const char held = *y;
    *y = *x;
    *x = held;
}
/* function to find permutations of the string */
/* Generates the arrangements of letters[l..r] and looks each one up in
 "dictionary.txt".

 `r` is meant to be strlen(letters), so the terminating '\0' at index r takes
 part in the shuffle. When it lands at index k < r the string seen by strcmp
 is only k letters long; that is how the shorter candidates are produced.
 The same candidate can therefore be generated (and reported) more than once.

 For every candidate that equals a dictionary word the word is printed twice
 (as read, and as generated), followed by "Match". The empty candidate is
 skipped. The dictionary is re-read for every candidate, which is slow;
 loading it once in the caller would need a different interface. */
void permute(char *letters, int l, int r){
    if (l == r){
        char target[64];
        FILE *file;
        /* Zero letters is not a permutation worth looking up. */
        if (letters[0] == '\0')
            return;
        file = fopen("dictionary.txt", "r");
        if (file == NULL){
            perror("dictionary.txt");
            return;
        }
        /* The field width keeps over-long dictionary words from overflowing
           target; such a word is read in pieces that cannot match a
           candidate of at most MAX letters. */
        while (fscanf(file, "%63s", target) == 1){
            if (strcmp(target, letters) == 0){
                printf("%s\n", target);
                printf("%s\n", letters);
                printf("Match\n");
                break;
            }
        }
        fclose(file);
    }
    else{
        for (int i = l; i <= r; i++){
            swap((letters+l), (letters+i));
            permute(letters, l+1, r);
            swap((letters+l), (letters+i));
        }
    }
}
/* Two-step stringification so that MAX expands before it is quoted. */
#define PERMUTE_STR_(x) #x
#define PERMUTE_STR(x) PERMUTE_STR_(x)
/* Reads up to MAX letters from standard input and reports every arrangement
 of them that permute finds in the dictionary. Returns 1 if no input could be
 read. */
int main(){
    /* initializing tile input: MAX letters plus the terminating '\0' */
    char letters[MAX + 1];
    printf("Please enter your letters: ");
    /* The field width stops scanf from writing past the end of letters. */
    if (scanf("%" PERMUTE_STR(MAX) "s", letters) != 1)
        return 1;
    /* finding size of input */
    int size = (int)strlen(letters);
    /* finds all the permutation of the input */
    /* parameters: string; start of the string; index of the terminator */
    permute(letters, 0, size);
    return 0;
}
Any help or suggestions to pinpoint what I'm doing wrong would be greatly appreciated.
As hinted in my comment, you can map all permutations of a string to a single code value, just by using the bits of a big enough unsigned integer as a bit set. Thus, the (same length) permutations of e.g. the word "try" all map to the same value.
As far as I understood your problem, you also want to match words which start with a substring of the wanted word. For that to work, you need to generate N such codes, where N is the number of letters the word contains. I.e., for a three-letter word: the code for the first letter, the code for the first 2 letters, and the code for all 3 letters.
Since reading from a file is probably not the problem, here the code, showcasing the "code based" string matching idea (which should be reasonably fast):
#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
/* Longest word, in letters, that the encoder accepts. */
#define MAX_WORD_LENGTH 7
/* A set of letters stored as bits: word_to_code sets bit (c - 'a') for every
   letter c it sees, so letter order and repetition are not recorded. */
typedef uint32_t WordCode;
/* The codes of a word's prefixes as filled in by word_to_codes: `count`
   entries, the code of the longest prefix (the whole word) first. */
typedef struct WordCodes_tag {
size_t count;
WordCode codes[MAX_WORD_LENGTH];
} WordCodes_t;
/* Encodes the letters word[start..end) as a bit set in *code, bit 0 for 'a'
   up to bit 25 for 'z'.
   Returns false if the span is longer than MAX_WORD_LENGTH or holds a
   character outside 'a'..'z'; in the latter case *code keeps the bits
   gathered before the offending character. */
bool word_to_code(const char* word,
                  size_t start,
                  size_t end,
                  WordCode* code) {
    if ((end - start) > MAX_WORD_LENGTH)
        return false;
    *code = 0;
    for (const char *p = word + start; p < word + end; ++p) {
        if (*p < 'a' || *p > 'z')
            return false;
        *code |= (WordCode)(1 << (*p - 'a'));
    }
    return true;
}
/* Fills `codes` with the code of every non-empty prefix of `word`, longest
   prefix first. Returns false for a null argument, a word longer than
   MAX_WORD_LENGTH, or a word that word_to_code rejects; `codes->count` then
   reflects only the prefixes stored before the failure. */
bool word_to_codes(const char* word, WordCodes_t* codes) {
    if (codes == NULL || word == NULL)
        return false;
    codes->count = 0;
    const size_t nchars = strlen(word);
    if (nchars > MAX_WORD_LENGTH)
        return false;
    size_t remaining = nchars;
    while (remaining > 0) {
        WordCode prefix_code;
        if (!word_to_code(word, 0, remaining, &prefix_code))
            return false;
        codes->codes[codes->count++] = prefix_code;
        remaining--;
    }
    return true;
}
/* Prints the codes as "(c0, c1, ...)" followed by a newline. Does nothing
   when `codes` is null. */
void show_word_codes(const WordCodes_t* codes) {
    if (NULL == codes) return;
    printf("(");
    for (size_t i = 0; i < codes->count; i++) {
        /* WordCode is uint32_t, so it needs PRIu32 rather than %d. */
        printf("%s%" PRIu32, i > 0 ? ", " : "", codes->codes[i]);
    }
    printf(")\n");
}
/* Reports whether two code lists agree at any prefix length they share.
   Codes are stored longest prefix first, so the longer list is read from the
   entry whose prefix length equals the shorter list's longest prefix.
   Returns false when either argument is null or holds no codes. */
bool is_match(const WordCodes_t* a, const WordCodes_t* b) {
    if (a == NULL || b == NULL)
        return false;
    if (a->count == 0 || b->count == 0)
        return false;
    const WordCodes_t *longer = a->count < b->count ? b : a;
    const WordCodes_t *shorter = a->count < b->count ? a : b;
    const size_t skip = longer->count - shorter->count;
    for (size_t k = 0; k < shorter->count; k++) {
        if (longer->codes[skip + k] == shorter->codes[k])
            return true;
    }
    return false;
}
/* Demo: encodes the wanted word "try" and a small built-in dictionary, then
   prints each dictionary word's codes with a match / no match verdict.
   Returns -1 if the wanted word cannot be encoded, 0 otherwise. */
int main(int argc, const char* argv[]) {
    (void)argc; /* The demo takes no command-line input. */
    (void)argv;
    const char* wanted = "try";
    const char* dictionary[] = {
        "house", "mouse", "cat", "tree", "try", "yrt", "t"
    };
    size_t dict_len = sizeof(dictionary) / sizeof(dictionary[0]);
    WordCodes_t wanted_codes;
    if (word_to_codes(wanted, &wanted_codes)) {
        printf("word codes of the wanted word '%s': ", wanted);
        show_word_codes(&wanted_codes);
        for (size_t i = 0; i < dict_len; i++) {
            WordCodes_t found_codes;
            if (word_to_codes(dictionary[i],&found_codes)) {
                printf("word codes of dictionary word '%s' (%s): ",
                       dictionary[i],
                       is_match(&wanted_codes, &found_codes) ?
                       "match" : "no match");
                show_word_codes(&found_codes);
            } else {
                /* End the line so the next word's report starts cleanly. */
                printf("word_to_codes(%s) failed!\n", dictionary[i]);
            }
        }
    } else {
        puts("word_to_codes() failed!");
        return -1;
    }
    return 0;
}
As function is_match() above shows, you need only compare the codes for the respective substring length. Thus, even if you have 2 sets of up to 7 numbers, you need only maximum 7 comparisons.
The output looks like this (which seems to make sense):
./search
word codes of the wanted word 'try': (17432576, 655360, 524288)
word codes of dictionary word 'house' (no match): (1327248, 1327232, 1065088, 16512, 128)
word codes of dictionary word 'mouse' (no match): (1331216, 1331200, 1069056, 20480, 4096)
word codes of dictionary word 'cat' (no match): (524293, 5, 4)
word codes of dictionary word 'tree' (match): (655392, 655376, 655360, 524288)
word codes of dictionary word 'try' (match): (17432576, 655360, 524288)
word codes of dictionary word 'yrt' (match): (17432576, 16908288, 16777216)
word codes of dictionary word 't' (match): (524288)
If you want to match the words in a dictionary against all partial permutations of a search term, you don't have to create all permutations. (The number of permutations n! grows very quickly with the length of the search term, n.)
Instead, it is easier to write a customized search function. You can make use of two strategies here:
A word w is a permutation of the search term s if both words are equal once their letters are sorted. For example, "integral" and "triangle" are anagrams of each other, because both sort to "aegilnrt".
You can skip letters in the search term when searching to account for partial anagrams. Because the search term and the word will be sorted, you know which ones to skip: The ones that are lexically "smaller" than the next letter in the word.
So your matching function should sort the words first and then compare the words character by character in such a way that characters from the search term can be skipped.
Here's code that does that:
/* qsort comparator for single characters: negative, zero or positive as the
   character at `pa` is less than, equal to or greater than the one at `pb`. */
int char_cmp(const void *pa, const void *pb)
{
    const char lhs = *(const char *)pa;
    const char rhs = *(const char *)pb;
    return lhs - rhs;
}
/* Reports whether the letters of `bb` can all be taken from the letters of
   `aa`, each letter of `aa` used at most once (a partial anagram).
   Both strings are copied and sorted, so the arguments are not modified.
   Strings of 64 characters or more do not fit the working buffers and are
   rejected with false instead of overflowing them. */
bool partial_anagram(const char *aa, const char *bb)
{
    char A[64];
    char B[64];
    const size_t na = strlen(aa);
    const size_t nb = strlen(bb);

    if (na >= sizeof A || nb >= sizeof B) return false;

    const char *a = strcpy(A, aa);
    const char *b = strcpy(B, bb);

    qsort(A, na, 1, char_cmp);
    qsort(B, nb, 1, char_cmp);

    while (*b) {
        /* Letters of a that sort before the wanted letter are unused. */
        while (*a && *a < *b) a++;
        if (*a != *b) return false;
        a++;
        b++;
    }
    return true;
}
Things to note:
Sorting is done with the function qsort from <stdlib.h>, for which you need a comparator function, in this case char_cmp.
The sorted strings are copies, so that the original strings are not modified. (The code above is unsafe, because it doesn't enforce that the length of the strings is less than 64 characters. Unfortunately, the function strncpy, which can accept a maximum buffer size, is not safe, either, because it can leave the buffer unterminated. A safe way to copy the strings would be snprintf(A, sizeof(A), "%s", aa), but I've kept the strcpy for, er, "simplicity".)
The function partial_anagram takes unsorted strings and sorts them. That makes for a clean interface, but it is inefficient when you want to test against the same search term repeatedly as in your case. You could change the function, so that it expects already sorted strings. This will reduce the function to just the loop and will place the responsibility of sorting to the caller.
If you really have a lot of searches, there is yet more room for optimization. For example, you could insert the sorted dictionary into a trie. Given that your original code read the whole file for each permutation, I guess you're not worried that much about performance. :)
I've put a working example online. The code above works with pointers. If you are more at ease with indices, you can rewrite the function:
/* Reports whether the letters of `bb` can all be taken from the letters of
   `aa`, each letter of `aa` used at most once (a partial anagram).
   Both strings are copied and sorted, so the arguments are not modified.
   Strings of 64 characters or more do not fit the working buffers and are
   rejected with false instead of overflowing them. */
bool partial_anagram(const char *aa, const char *bb)
{
    char a[64];
    char b[64];
    unsigned i = 0;
    unsigned j = 0;
    const size_t na = strlen(aa);
    const size_t nb = strlen(bb);

    if (na >= sizeof a || nb >= sizeof b) return false;

    strcpy(a, aa);
    strcpy(b, bb);
    qsort(a, na, 1, char_cmp);
    qsort(b, nb, 1, char_cmp);

    while (b[j]) {
        /* Letters of a that sort before the wanted letter are unused. */
        while (a[i] && a[i] < b[j]) i++;
        if (a[i] != b[j]) return false;
        i++;
        j++;
    }
    return true;
}
Problem
One is using an algorithm whose run-time grows exponentially with the problem size. There are probably lots of ways to speed this up, but, as suggested by @SparKot, a trie, or prefix tree, is a particularly good fit. One can build a trie from a dictionary array of size n, assuming the lengths of the strings in the dictionary are bounded, in O(n log n). Looking up anagrams in the worst case, where the letters never run out (ignoring the arbitrary limit of 7), is still O(n).
$ bin/trie AAABBBCCCDDDEEEFFFGGGHHHIIIJJJKKKLLLMMMNNNOOOPPPQQQRRRSSSTTTUUUVVVWWWXXXYYYZZZ < Tutte_le_parole_inglesi.txt
build_index warning: duplicate "OUTSOURCING".
build_index warning: duplicate "OUTSOURCINGS".
Loaded 216553 trie entries.
AA
AAH
AAHED
AAHING
...
ZYTHUMS
ZYZZYVA
ZYZZYVAS
211929 words found.
Proposal
The reason a prefix tree is so effective, is it allows you to query prefixes as (even more) efficiently as lookup. With this, one can do a very effective branch-and-bound-style algorithm. That is, the longer the string, the less words it will be a prefix match to; if the string is not a prefix match for any of the words in the dictionary, one can rule out any longer strings and just not test them.
So my idea is, form a histogram with the Scrabble-string of length k in O(k). Then, recursively, add more and more letters, matching, until no dictionary entries are prefix matches of the string. This will run in (*I think) O(n log n + k), assuming a bound on the number of comparisons needed to distinguish words; ie, one's dictionary is not { a, aa, aaa, aaaa, aaaaa, aaaaaa, ... }.
Implementation
I use a PATRICIA tree. It is especially attractive because a lot of data is implicit; one can use a simple array to represent the leaves of a complete binary tree. Specifically, the n leaves are already just the list of words in lexicographical order; we want to build an index of n - 1 branches. It requires a stop code; the null-termination in C is perfect. I don't have to create copies of everything and manage them. The code below first sets up a dynamic array, which is useful for input, then sets up a trie, then implements the algorithm.
#include <stdlib.h> /* EXIT malloc free qsort */
#include <stdio.h> /* printf */
#include <string.h> /* memmove memcpy */
#include <assert.h> /* assert */
#include <errno.h> /* errno */
#include <limits.h> /* UINT_MAX */
#include <ctype.h> /* isgraph */
/* Dynamic array. `MIN_ARRAY(name, type)` expands to `struct name_array`, which
 holds a `data` pointer, the number of elements in use `size`, and the
 allocated `capacity`, together with these static functions:
 `name_array_reserve(a, min)` makes the capacity at least `min` elements using
 realloc, growing from 7 by a factor of 1.625 per step; returns 1 on success,
 0 with `errno` set on failure.
 `name_array_buffer(a, n)` reserves room for `n` more elements and returns the
 address just past the last used element without changing `size`, or null.
 `name_array_append(a, n)` does the same and then adds `n` to `size`.
 `name_array_new(a)` appends a single element.
 `name_array()` returns an empty array with null `data`.
 `name_array_(a)` frees `data` and resets `*a` to empty; null `a` is ignored. */
#define MIN_ARRAY(name, type) \
struct name##_array { type *data; size_t size, capacity; }; \
static int name##_array_reserve(struct name##_array *const a, \
const size_t min) { \
size_t c0; \
type *data; \
const size_t max_size = (size_t)-1 / sizeof *a->data; \
if(a->data) { \
if(min <= a->capacity) return 1; \
c0 = a->capacity < 7 ? 7 : a->capacity; \
} else { \
if(!min) return 1; \
c0 = 7; \
} \
if(min > max_size) return errno = ERANGE, 0; \
/* `c_n = a1.625^n`, approximation golden ratio `\phi ~ 1.618`. */ \
while(c0 < min) { \
size_t c1 = c0 + (c0 >> 1) + (c0 >> 3); \
if(c0 >= c1) { c0 = max_size; break; } /* Unlikely. */ \
c0 = c1; \
} \
if(!(data = realloc(a->data, sizeof *a->data * c0))) \
{ if(!errno) errno = ERANGE; return 0; } \
a->data = data, a->capacity = c0; \
return 1; \
} \
static type *name##_array_buffer(struct name##_array *const a, \
const size_t n) { \
if(a->size > (size_t)-1 - n) { errno = ERANGE; return 0; } \
return name##_array_reserve(a, a->size + n) \
&& a->data ? a->data + a->size : 0; \
} \
static type *name##_array_append(struct name##_array *const a, \
const size_t n) { \
type *b; \
if(!(b = name##_array_buffer(a, n))) return 0; \
return a->size += n, b; \
} \
static type *name##_array_new(struct name##_array *const a) \
{ return name##_array_append(a, 1); } \
static struct name##_array name##_array(void) \
{ struct name##_array a; a.data = 0, a.capacity = a.size = 0; return a; } \
static void name##_array_(struct name##_array *const a) \
{ if(a) free(a->data), *a = name##_array(); }
MIN_ARRAY(char, char)
/** Append a file, `fp`, to `c`, and add a '\0'. `fp` is closed before
 returning, whether or not the read succeeded.
 @return Success. A partial read is failure. @throws[fopen, fread, malloc]
 @throws[EILSEQ] The text file has embedded nulls.
 @throws[ERANGE] If the standard library does not follow POSIX. */
static int append_file(struct char_array *c, FILE *const fp) {
const size_t granularity = 4096;
size_t nread;
char *cursor;
int success = 0;
assert(c && fp);
/* Read entire file in chunks. */
/* Each pass reserves `granularity` bytes, reads into them, and then commits
 only the `nread` bytes actually read; a short read ends the loop. */
do if(!(cursor = char_array_buffer(c, granularity))
|| (nread = fread(cursor, 1, granularity, fp), ferror(fp))
|| !char_array_append(c, nread)) goto catch;
while(nread == granularity);
/* File to `C` string. */
if(!(cursor = char_array_new(c))) goto catch;
*cursor = '\0';
/* Binary files with embedded '\0' are not allowed. */
/* The first '\0' in the data must be the terminator that was just added. */
if(strchr(c->data, '\0') != cursor) { errno = EILSEQ; goto catch; }
{ success = 1; goto finally; }
catch:
if(!errno) errno = EILSEQ; /* Will never be true on POSIX. */
finally:
if(fp) fclose(fp);
return success;
}
/* Trie is base-2 compact radix tree, described in <Morrison, 1968 PATRICiA>.
 Specifically, this is a full binary tree. */
/* One decision point. `skip` is the number of bits passed over before the bit
 that is tested; `left` is the number of branches in the left subtree, as
 computed in init_branches_r. */
struct branch { unsigned skip, left; };
/* Largest values that fit the two `unsigned` fields above. */
static const size_t skip_max = UINT_MAX, left_max = UINT_MAX;
MIN_ARRAY(branch, struct branch)
MIN_ARRAY(leaf, char *)
/* The index (`branches`) plus the words it indexes (`leaves`); the leaves are
 pointers to strings that the trie does not copy. */
struct trie { struct branch_array branches; struct leaf_array leaves; };
static struct trie trie(void) { struct trie t;
t.branches = branch_array(), t.leaves = leaf_array(); return t; }
/** Releases both arrays of `t` and returns it to the idle state. A null `t`
 is ignored. */
static void trie_(struct trie *const t) {
	if(!t) return;
	branch_array_(&t->branches);
	leaf_array_(&t->leaves);
	*t = trie();
}
/** From string `a`, extract `bit`, either 0 or 1. Bits are numbered from the
 most significant bit of the first byte. */
static int is_bit(const char *const a, const size_t bit) {
	const unsigned shift = 7u - (unsigned)(bit % 8);
	return (int)(((unsigned char)a[bit / 8] >> shift) & 1u);
}
/** @return Whether `a` and `b` are equal up to the minimum of their lengths;
 that is, whether either one is a prefix of the other. */
static int is_prefix(const char *a, const char *b) {
	while(*a != '\0' && *a == *b) a++, b++;
	return *a == '\0' || *b == '\0';
}
/** [low, high). A half-open span of leaf indices; `low == high` is empty. */
struct range { size_t low, high; };
/* Recursively appends to `t->branches` the branches that index the sorted
 leaves in `range`, starting the comparison at `bit`. Each branch is appended
 before the branches of its left and right sub-ranges. Expects the branch
 array to have been reserved already (see the asserts).
 Returns 1 on success, 0 with `errno` set to ERANGE if a skip or left count
 does not fit in `unsigned`. */
static int init_branches_r(struct trie *const t, size_t bit,
const struct range range) {
struct range r;
size_t skip = 0, left;
struct branch *branch;
assert(t && t->leaves.size);
assert(t->branches.capacity >= t->leaves.size - 1);
assert(range.low <= range.high && range.high <= t->leaves.size);
if(range.low + 1 >= range.high) return 1; /* Only one, leaf. */
/* Endpoints of sorted range: skip [_1_111...] or [...000_0_] don't care. */
/* That is, advance past bits where the first leaf has a 1 or the last leaf
 has a 0; presumably every leaf in the sorted range agrees on such a bit. */
while(is_bit(t->leaves.data[range.low], bit)
|| !is_bit(t->leaves.data[range.high - 1], bit)) {
if(skip == skip_max) return errno = ERANGE, 0;
bit++, skip++;
}
/* Binary search for the rightmost 0 (+1). */
r = range;
while(r.low < r.high) {
size_t m = r.low + (r.high - r.low) / 2;
if(is_bit(t->leaves.data[m], bit)) r.high = m; else r.low = m + 1;
}
/* `r.low` is now the first leaf with a 1 at `bit`; the left side holds
 `r.low - range.low` leaves and therefore one fewer branches. */
if((left = r.low - range.low - 1) > left_max) return errno = ERANGE, 0;
/* Should have space for all branches pre-allocated. */
branch = branch_array_new(&t->branches), assert(branch);
branch->left = (unsigned)left;
branch->skip = (unsigned)skip;
bit++;
/* Left sub-range first, then the right one, both from the next bit. */
return (r.low = range.low, r.high = range.low + left + 1,
init_branches_r(t, bit, r)) && (r.low = r.high, r.high = range.high,
init_branches_r(t, bit, r)) /* && (printf("}\n"), 1) */;
}
/** Orders `a` and `b`, each a pointer to a string pointer, by the strings
 they refer to. @implements qsort bsearch */
static int vstrcmp(const void *const a, const void *const b) {
	const char *const lhs = *(const char *const *)a;
	const char *const rhs = *(const char *const *)b;
	return strcmp(lhs, rhs);
}
/** @param[a] A zero-terminated file containing words. Will be parsed and
 modified: every character that is not `isgraph` is overwritten with '\0', so
 each run of graphic characters becomes one string.
 @param[t] An idle tree that is initialized from `a`. Any modification of `a`
 invalidates `t`, because the leaves point into `a`.
 @return Whether the tree initialization was entirely successful. On failure
 `errno` is set; EILSEQ means `a` held no words. */
static int build_trie(struct trie *const t, struct char_array *const a) {
	struct range range;
	size_t i;
	char *cursor, *end, **leaf;
	int is_run = 0;
	/* Strict for processing ease; this could be made more permissive. */
	assert(a && a->size && a->data[a->size - 1] == '\0'
		&& t && !t->branches.size && !t->leaves.size);
	for(cursor = a->data, end = a->data + a->size; cursor < end; cursor++) {
		/* Fixme: 7-bit; mælström would be parsed as "m", "lstr", "m".
		 The cast matters: <ctype.h> functions are undefined for negative
		 arguments other than EOF, which plain `char` can produce. */
		if(!isgraph((unsigned char)*cursor)) {
			*cursor = '\0', is_run = 0;
		} else if(!is_run) {
			/* First character of a new word: record it as a leaf. */
			if(!(leaf = leaf_array_new(&t->leaves))) return 0;
			*leaf = cursor, is_run = 1;
		}
	}
	if(!t->leaves.size) return errno = EILSEQ, 0; /* No parseable info. */
	/* Sort and de-duplicate (inefficiently.) Want to treat it as an index. */
	qsort(t->leaves.data, t->leaves.size, sizeof *t->leaves.data, &vstrcmp);
	for(i = 1; i < t->leaves.size; i++) {
		if(strcmp(t->leaves.data[i - 1], t->leaves.data[i]) < 0) continue;
		fprintf(stderr, "build_index warning: duplicate \"%s\".\n",
			t->leaves.data[i]);
		/* Close the gap, then look at the same index again. */
		memmove(t->leaves.data + i, t->leaves.data + i + 1,
			sizeof *t->leaves.data * (t->leaves.size - i - 1));
		t->leaves.size--, i--;
	}
	/* A full binary tree over n leaves has n - 1 branches. */
	range.low = 0, range.high = t->leaves.size;
	if(!branch_array_reserve(&t->branches, t->leaves.size - 1)
		|| !init_branches_r(t, 0, range)) return 0;
	assert(t->branches.size + 1 == t->leaves.size);
	return 1;
}
/** @return In `t`, which must be non-empty, given a `prefix`, the range of
 leaves that the index selects for it. Only the bits tested by the branches are
 looked at; the skipped ("don't care") bits are ignored, so the caller still
 has to compare a returned leaf against `prefix`.
 @order \O(`prefix.length`) */
static struct range partial_prefix(const struct trie *const t,
const char *const prefix) {
/* [n0, n1) is the span of branches in the current subtree; `i` is the index
 of that subtree's first leaf. */
size_t n0 = 0, n1 = t->branches.size, i = 0, left;
struct branch *branch;
size_t byte, key_byte = 0, bit = 0;
struct range range = { 0, 0 };
assert(t && prefix);
assert(n1 + 1 == t->leaves.size); /* Full binary tree. */
while(n0 < n1) {
branch = t->branches.data + n0;
bit += branch->skip;
/* '\0' is not included for partial match. */
/* Stop as soon as the tested bit lies at or beyond the end of `prefix`;
 everything in the current subtree is then a candidate. */
for(byte = bit >> 3; key_byte <= byte; key_byte++)
if(prefix[key_byte] == '\0') goto finally;
left = branch->left;
/* 0 goes to the left subtree, 1 to the right one. */
if(!is_bit(prefix, bit++)) n1 = ++n0 + left;
else n0 += left + 1, i += left + 1;
}
assert(n0 == n1);
finally:
assert(n0 <= n1 && i - n0 + n1 < t->leaves.size);
/* A subtree with n1 - n0 branches has n1 - n0 + 1 leaves. */
range.low = i, range.high = i - n0 + n1 + 1;
return range;
}
/* #return Given a `prefix`, what is the range of matched strings in `t`. */
static struct range prefix(const struct trie *const t,
const char *const prefix) {
struct range range;
assert(t && prefix);
if(!t->leaves.size) goto catch;
range = partial_prefix(t, prefix);
if(range.low <= range.high)
if(!is_prefix(prefix, t->leaves.data[range.low])) goto catch;
goto finally;
catch:
range.low = range.high = 0;
finally:
return range;
}
/* Debug graph. */
/** Given a branch `b` in `tr` branches, calculate the number of branches in
 its right subtree by walking down from the root branch.
 @order \O(log `size`) */
static unsigned right_count(const struct trie *const tr,
const unsigned b) {
/* `total` is the number of branches in the subtree rooted at `b0`. */
unsigned left, right, total = (unsigned)tr->branches.size, b0 = 0;
assert(tr && b < tr->branches.size);
for( ; ; ) {
right = total - (left = tr->branches.data[b0].left) - 1;
assert(left < total && right < total);
if(b0 >= b) break;
/* The left subtree occupies indices b0 + 1 .. b0 + left. */
if(b <= b0 + left) total = left, b0++;
else total = right, b0 += left + 1;
}
assert(b0 == b);
return right;
}
/** @return Follows the branches to `b` in `tr` and returns the index of the
 first (leftmost) leaf under it. */
static unsigned left_leaf(const struct trie *const tr,
const unsigned b) {
/* `total` is the number of branches in the subtree rooted at `b0`; `i`
 counts the leaves passed over each time the walk turns right. */
unsigned left, right, total = (unsigned)tr->branches.size, i = 0, b0 = 0;
assert(tr && b < tr->branches.size);
for( ; ; ) {
right = total - (left = tr->branches.data[b0].left) - 1;
assert(left < tr->branches.size && right < tr->branches.size);
if(b0 >= b) break;
/* The left subtree occupies indices b0 + 1 .. b0 + left. */
if(b <= b0 + left) total = left, b0++;
else total = right, b0 += left + 1, i += left + 1;
}
assert(b0 == b);
return i;
}
/** Writes `tr` as a Graphviz digraph to the file named `fn`, replacing any
 existing contents: one circle per branch labelled with its skip count, an
 edge to each of its two children, and one node per leaf. If the file cannot
 be opened, reports it with perror and writes nothing. */
static void graph(const struct trie *const tr, const char *const fn) {
	unsigned left, right, b, i;
	FILE *fp = 0;
	assert(tr && fn);
	if(!(fp = fopen(fn, "w"))) { perror(fn); return; }
	fprintf(fp, "digraph {\n"
		"\tgraph [truecolor=true, bgcolor=transparent];\n"
		"\tfontface=modern;\n"
		"\tnode [shape=none];\n"
		"\n");
	/* Idle means no leaves. A one-word trie has a leaf but no branches, so
	 testing the branch count here would wrongly trip the assertion. */
	if(!tr->leaves.size) {
		assert(!tr->branches.size);
		fprintf(fp, "\tidle;\n");
	} else {
		assert(tr->branches.size + 1 == tr->leaves.size);
		fprintf(fp, "\t// branches\n");
		for(b = 0; b < tr->branches.size; b++) { /* Branches. */
			const struct branch *branch = tr->branches.data + b;
			left = branch->left, right = right_count(tr, b);
			fprintf(fp, "\ttree%pbranch%u [label = \"%u\", shape = circle, "
				"style = filled, fillcolor = Grey95];\n"
				"\ttree%pbranch%u -> ", (const void *)tr, b, branch->skip,
				(const void *)tr, b);
			/* A subtree with no branches is a single leaf. */
			if(left) fprintf(fp, "tree%pbranch%u [arrowhead = rnormal];\n",
				(const void *)tr, b + 1);
			else fprintf(fp,
				"tree%pleaf%u [color = Gray85, arrowhead = rnormal];\n",
				(const void *)tr, left_leaf(tr, b));
			fprintf(fp, "\ttree%pbranch%u -> ", (const void *)tr, b);
			if(right) fprintf(fp, "tree%pbranch%u [arrowhead = lnormal];\n",
				(const void *)tr, b + left + 1);
			else fprintf(fp,
				"tree%pleaf%u [color = Gray85, arrowhead = lnormal];\n",
				(const void *)tr, left_leaf(tr, b) + left + 1);
		}
	}
	fprintf(fp, "\t// leaves\n");
	for(i = 0; i < tr->leaves.size; i++) fprintf(fp,
		"\ttree%pleaf%u [label = <%s<FONT COLOR=\"Gray85\">⊔</FONT>>];\n",
		(const void *)tr, i, tr->leaves.data[i]);
	fprintf(fp, "\n"
		"\tnode [color = \"Red\"];\n"
		"}\n");
	fclose(fp);
}
/* Actual program. */
/* The input argument histogram. Used in <fn:find_r>. (Simple, but questionable
design choice.) `hist[c]` is how many copies of character code `c` are still
available to place; only codes below 128 have a slot. */
static unsigned char hist[128];
/* Largest count one slot can hold, and the number of slots. */
static const size_t hist_max = UCHAR_MAX,
hist_size = sizeof hist / sizeof *hist;
/* Running total of words printed by <fn:find_r>. */
static size_t words_found;
/** Branch-and-bound recursive function. */
static void find_r(const struct trie *const tr, char *const word) {
struct range r;
size_t len, i;
assert(word);
r = prefix(tr, word);
if(r.low >= r.high) return; /* Found nothing, we can bound this branch. */
if(!strcmp(word, tr->leaves.data[r.low])) { /* Found a match. */
printf("%s\n", word), words_found++;
if(++r.low == r.high) return;
}
len = strlen(word);
for(i = 0; i < hist_size; i++) {
unsigned char *freq;
if(!*(freq = hist + i)) continue;
(*freq)--;
word[len] = (char)i, word[len + 1] = '\0';
find_r(tr, word);
(*freq)++;
}
}
/** Entry point. Expects exactly one command-line argument (the letters to
 use) and a dictionary on `stdin`. Builds a trie from the dictionary, writes a
 graph of it to "dictionary.gv", histograms the argument into `hist`, and then
 lets <fn:find_r> print every dictionary word that can be spelled from those
 letters. Returns `EXIT_SUCCESS`, or `EXIT_FAILURE` after reporting `errno`
 with `perror`. */
int main(int argc, char *argv[]) {
struct char_array dict = char_array();
struct trie tr = trie();
char *word;
size_t i;
int success = EXIT_FAILURE;
assert(CHAR_BIT == 8); /* The `0x80` test below assumes 8-bit bytes. */
if(argc != 2) { errno = EILSEQ;
fprintf(stderr, "Needs argument and dictionary input.\n"); goto catch; }
word = argv[1];
/* Load the dictionary from stdin and index it into a trie. */
if(!append_file(&dict, stdin) || !build_trie(&tr, &dict)) goto catch;
fprintf(stderr, "Loaded %lu trie entries.\n",(unsigned long)tr.leaves.size);
graph(&tr, "dictionary.gv");
/* Histogram the argument. */
for(i = 0; word[i] != '\0'; i++) {
unsigned char *freq;
/* Bytes with the high bit set are skipped, so `word[i]` below is always a
 valid index in [0, 127]. UTF-8 is not supported. :[ */
if(word[i] & 0x80) continue;
/* A bucket that is already at `hist_max` cannot count another letter. */
if(*(freq = hist + word[i]) == hist_max)
{ errno = ERANGE; goto catch; } /* "aaaaaaaaa..." too many repeats. */
(*freq)++;
}
/* Might as well re-use the word now that we're finished with it; it's the
right length: at most one character per counted letter is ever appended. */
*word = '\0', find_r(&tr, word);
fprintf(stderr, "%lu words found.\n", (unsigned long)words_found);
{ success = EXIT_SUCCESS; goto finally; }
/* Error path: report `errno`, then fall through to the shared clean-up. */
catch:
perror("word");
/* Clean-up runs on both the success and the failure path. */
finally:
trie_(&tr);
char_array_(&dict);
return success;
}

how to check for duplicates in a char array c

I'm pretty new to C. How would I check for duplicates in a 1D char array,
for example
#define MAX_SIZE 60
Char canvas[MAX_SIZE] = {0};
for(int i=0; i<MAX_SIZE;i++){
//How do i check if there is a duplicate in that array?
}
How do I iterate through the array to check for duplicates? Do I have to use two nested for loops, and something like sizeof(canvas)/SOMETHING for the bound?
My solution, using a function:
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
/*
 * Reports whether any value occurs more than once among the first `len`
 * elements of `arr`. Every element is compared with all elements before it,
 * so the running time grows with the square of `len`. `arr` must not be NULL;
 * an empty range never has duplicates.
 */
bool mem_hasduplicates(const char arr[], size_t len)
{
    assert(arr != NULL);
    for (size_t later = 1; later < len; ++later) {
        const char candidate = arr[later];
        size_t earlier = 0;
        while (earlier < later) {
            if (arr[earlier] == candidate)
                return true;
            ++earlier;
        }
    }
    return false;
}
/* Demo: prints 1 for the array that contains a repeated character and 0 for
 * the one that does not. The terminating NUL of each literal is part of the
 * range that is checked. */
int main() {
    const char canvas[] = "zcxabca";
    const size_t canvas_len = sizeof(canvas) / sizeof(canvas[0]);
    printf("%x\n", mem_hasduplicates(canvas, canvas_len));

    const char other_canvas[] = "abcfsd";
    const size_t other_len = sizeof(other_canvas) / sizeof(other_canvas[0]);
    printf("%x\n", mem_hasduplicates(other_canvas, other_len));
}
Live version available at onlinegdb.
Edit: Or we can "just" create a histogram of all the values, as @selbie suggested, although this got complicated fast:
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
/* One histogram bucket: a character value and how many times it was added. */
struct histogram_value_s {
char value;
unsigned int count;
};
/* A histogram: a heap-allocated array `v` of `len` buckets, one per distinct
 * value, in the order the values were first added (until one of the sort
 * functions reorders them). Release with histogram_fini(). */
struct histogram_s {
struct histogram_value_s *v;
size_t len;
};
/* Initializer for an empty histogram (NULL array, zero length). */
#define HISTOGRAM_INIT() {0}
/*
 * Releases the bucket array owned by `t` and returns `t` to the empty state.
 * The pointer is cleared after being freed so that calling this function a
 * second time, or adding to the histogram again, is safe instead of touching
 * freed memory.
 */
void histogram_fini(struct histogram_s *t)
{
    free(t->v);
    t->v = NULL;
    t->len = 0;
}
/* qsort() callback: orders two buckets by ascending character value. */
static int histogram_sort_by_value_qsort_cb(const void *a0, const void *b0)
{
    const struct histogram_value_s *lhs = a0;
    const struct histogram_value_s *rhs = b0;
    assert(lhs != NULL);
    assert(rhs != NULL);
    /* Both operands are promoted to int, so the difference is exact. */
    const int left_value = lhs->value;
    const int right_value = rhs->value;
    return left_value - right_value;
}
/* Sorts the buckets of `t` in place by ascending character value. */
void histogram_sort_by_value(struct histogram_s *t)
{
    qsort(t->v, t->len, sizeof(t->v[0]), histogram_sort_by_value_qsort_cb);
}
/*
 * qsort() callback: orders two buckets by ascending count.
 * The counts are unsigned, so subtracting them and converting the result to
 * int gives the wrong sign once the difference no longer fits in an int;
 * comparing explicitly always yields -1, 0 or 1.
 */
static int histogram_sort_by_count_qsort_cb(const void *a0, const void *b0)
{
    const struct histogram_value_s *a = a0;
    const struct histogram_value_s *b = b0;
    assert(a != NULL);
    assert(b != NULL);
    return (a->count > b->count) - (a->count < b->count);
}
/* Sorts the buckets of `t` in place by ascending count. */
void histogram_sort_by_count(struct histogram_s *t)
{
    qsort(t->v, t->len, sizeof(*t->v), histogram_sort_by_count_qsort_cb);
}
/*
 * Looks up the bucket for `value` with a linear scan.
 * On success returns 0 and, for each non-NULL output pointer, stores the
 * bucket's position in `*idx` and its count in `*ret0`.
 * Returns -1 (outputs untouched) when `value` has no bucket.
 */
int histogram_getValue_2(const struct histogram_s *t, char value, size_t *idx, unsigned int *ret0)
{
    size_t pos = 0;
    while (pos < t->len && t->v[pos].value != value) {
        ++pos;
    }
    if (pos == t->len) {
        return -1;
    }
    if (ret0 != NULL) {
        *ret0 = t->v[pos].count;
    }
    if (idx != NULL) {
        *idx = pos;
    }
    return 0;
}
void histogram_printlns_generic(const struct histogram_s *t, const char fmt[])
{
assert(t != NULL);
for (size_t i = 0; i < t->len; ++i) {
printf(fmt, t->v[i].value, t->v[i].count);
}
}
/*
 * Records one occurrence of `value` in `t`.
 * An existing bucket has its count incremented; otherwise the bucket array is
 * grown by one element and a new bucket with count 1 is appended.
 * Returns 0 on success, -1 if the count would overflow or the array could not
 * be grown (in which case `t` is left unchanged).
 */
int histogram_add(struct histogram_s *t, char value)
{
    size_t found_at;
    if (histogram_getValue_2(t, value, &found_at, NULL) != 0) {
        /* First occurrence: append a fresh bucket. */
        struct histogram_value_s *grown =
            realloc(t->v, (t->len + 1) * sizeof(*t->v));
        if (grown == NULL) {
            return -1;
        }
        grown[t->len].value = value;
        grown[t->len].count = 1;
        t->v = grown;
        t->len += 1;
        return 0;
    }
    if (t->v[found_at].count == UINT_MAX) {
        return -1; /* The counter is saturated. */
    }
    t->v[found_at].count += 1;
    return 0;
}
/*
 * Returns true when at least one bucket of `t` has a count of 2 or more,
 * i.e. when some value was added more than once. (Despite the name, a count
 * of exactly 2 qualifies.)
 */
bool histogram_has_any_count_greater_then_2(const struct histogram_s *t)
{
    assert(t != NULL);
    size_t pos = 0;
    while (pos < t->len) {
        if (t->v[pos].count > 1) {
            return true;
        }
        ++pos;
    }
    return false;
}
/* ----------------------------------------------------------- */
int histogram_create_from_mem(struct histogram_s *ret0, const char arr[], size_t len)
{
assert(ret0 != NULL);
assert(arr != NULL);
struct histogram_s ret = HISTOGRAM_INIT();
for (size_t i = 0; i < len; ++i) {
const char to_add = arr[i];
if (histogram_add(&ret, to_add) < 0) {
goto ERR;
}
}
*ret0 = ret;
return 0;
ERR:
histogram_fini(&ret);
return -1;
}
/* Demo: histograms a fixed string (terminating NUL included) and reports
 * whether any value occurs more than once. */
int main() {
    const char canvas[] = "abc";
    const size_t canvas_len = sizeof(canvas) / sizeof(canvas[0]);
    struct histogram_s h;
    if (histogram_create_from_mem(&h, canvas, canvas_len) != 0) {
        fprintf(stderr, "mem_createhistogram error!\n");
        return -1;
    }
    const char *verdict = "does not have";
    if (histogram_has_any_count_greater_then_2(&h)) {
        verdict = "has";
    }
    printf("'%s' %s duplicates\n", canvas, verdict);
    histogram_fini(&h);
}
Live version here.
#edit Or we can sort the array, and check if any two adjacent bytes are the same!
#include <stdlib.h>
#include <stdbool.h>
/*
 * qsort() callback: orders two `char` elements by ascending value.
 * Returns a negative, zero or positive number. The arguments point to const
 * data, so they are read through `const char *` rather than by casting the
 * qualifier away; both values are promoted to int, so the difference is
 * exact.
 */
int cmp_chars(const void *a, const void *b)
{
    const char *lhs = a;
    const char *rhs = b;
    return *lhs - *rhs;
}
/* Demo: sorts the characters of a fixed string (terminator excluded) and then
 * looks for two equal neighbours, which exist exactly when the string has a
 * repeated character. The string literal must not be empty, because the scan
 * reads the element after the current one. */
int main() {
    char canvas[] = "abca";
    bool duplicate_found = false;
    qsort(canvas, sizeof(canvas) - 1, sizeof(canvas[0]), cmp_chars);
    for (size_t at = 0; canvas[at + 1] != '\0' && !duplicate_found; ++at) {
        duplicate_found = (canvas[at] == canvas[at + 1]);
    }
    const char *verdict = duplicate_found ? "has" : "does not have";
    printf("'%s' %s duplicates\n", canvas, verdict);
}
Live version available at onlinegdb.
If Char is just a typo for char, then this becomes relatively simple - set up a second array, indexed by character code, that keeps track of the number of occurrences of each character:
#include <limits.h>
#include <ctype.h>
...
int charCount[SCHAR_MAX+1] = {0}; // One counter per non-negative character
// code, all starting at zero. Only the codes
// [0..SCHAR_MAX] (standard ASCII, [0..127]) are tracked.
...
/**
* Counts how often each character code occurs in canvas.
*
* This assumes that canvas is *not* a 0-terminated string, and that
* every element of the array is meaningful. If that's not the case,
* then loop on the length of the string instead of MAX_SIZE.
*/
for ( int i = 0; i < MAX_SIZE; i++ )
{
// Codes outside [0..SCHAR_MAX] have no slot in charCount and are ignored.
if ( canvas[i] >= 0 && canvas[i] <= SCHAR_MAX )
{
charCount[canvas[i]]++; // index into charCount by the value of canvas[i]
}
}
Then you can walk through the charCount array and print all the character values that occurred more than once:
// Report every character code that was counted more than once.
for ( int i = 0; i <= SCHAR_MAX; i++ )
{
if ( charCount[i] > 1 )
{
/**
* If the character value is a printable character (punctuation, alpha,
* digit), print the character surrounded by single quotes - otherwise,
* print the character code as a decimal integer. Either way the count
* follows after the colon.
*/
printf( isprint( i ) ? "'%c': %d\n" : "%d: %d\n", i, charCount[i] );
}
}
What's that SCHAR_MAX all about, and why am I yammering about non-negative character codes in the comments?
In C, characters in the basic execution character set (digits, upper and lowercase letters, common punctuation characters) are guaranteed to have non-negative encodings (i.e., the [0..127] range of standard ASCII). Characters outside of that basic execution character set may have positive or negative values, depending on the implementation. Thus, the range of char values may be [-128..127] on some platforms and [0..255] on others.
The limits.h header defines constants for various type ranges - for characters, it defines the following constants:
UCHAR_MAX - maximum unsigned character value (255 on most platforms)
SCHAR_MIN - minimum signed character value (-128 on most platforms)
SCHAR_MAX - maximum signed character value (127 on most platforms)
CHAR_MIN - minimum character value, either 0 or SCHAR_MIN depending on platform
CHAR_MAX - maximum character value, either UCHAR_MAX or SCHAR_MAX depending on platform
To keep this code simple, I'm only worrying about character codes in the range [0..127]; otherwise, I'd have to map negative character codes onto non-negative array indices, and I didn't feel like doing that.
Both this method and the nested loop solution require some tradeoffs. The nested loop solution trades time for space, while this solution trades space for time. In this case, the additional space is fixed regardless of how large canvas becomes. In the nested loop case, time will increase with the square of the length of canvas. For short inputs, there's effectively no difference, but if canvas gets large enough, you will notice a significant decrease in performance with the nested loop solution.

Count occurrences and associate with given array in C

I'm having issues to correct my code so that it works as I want it.
I have three arrays given in this example:
char arr[MAX_ELEMENTS][MAX_LENGTH] = {"ABS","ABS","ABS","ACT","ACT","PPB","PPB","QQQ","QQQ"};
char race[MAX_ELEMENTS][MAX_LENGTH] = {"PARI", "PARI", "LOND", "PARI", "PARI", "CYKA", "LOND", "CYKA", "PARI"};
int freq[MAX_ELEMENTS];
I wish to create a function that can count the amount of occurrences of string elements in arr[] and store them in freq[]. Apart from that I also wish to know in what race[] there have been the most occurrences of given arr[].
To demonstrate this here is an example of what output I wish to receive when the function works:
In Race [PARI] the highest occurence was [ABS] with 3 occurences!
In Race [LOND] the highest occurence was [ACT] with 1 occurences!
.....
Currently, I am able to count the occurrences of arr[] in freq[] but I can't associate them with their respective race[] and give that output..
/* For every element, count how many later elements are equal to it. An equal
   later element gets freq[j] = 0 so that it is not reported a second time. */
/* NOTE(review): freq[] is read below before this loop has written freq[i];
   presumably it is pre-filled with a non-zero marker (e.g. -1) in code not
   shown here -- confirm. */
for(i=0; i<size; i++)
{
count = 1;
for(j=i+1; j<size; j++)
{
/* If duplicate element is found */
if(strcmp(arr[i], arr[j])==0)
{
count++;
/* Make sure not to count frequency of same element again */
freq[j] = 0;
}
}
/* If frequency of current element is not counted */
if(freq[i] != 0)
{
freq[i] = count;
}
}
Giving me currently :
ABS occurs 3 times.
ACT occurs 2 times.
etc. etc...
But I don't know how I can associate them with the race[] and only count them if a given race.
You probably have to use struct here to format your data.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#define true 1
/* Size in bytes of every name buffer, terminator included. */
#define len 100
/* Number of rows in each of the string tables below. */
#define elms 10
struct NODE;
/* Shorthand so that the tree node type can be written as `Node`. */
#define Node struct NODE
/* A binary search tree node keyed by the hash of `name`. */
struct NODE {
unsigned long int val; /* hash of `name`; the tree is ordered by this */
int count; /* number of times this key has been inserted */
char name[len]; /* the string this node stands for */
Node *left; /* subtree of smaller hashes */
Node *right; /* subtree of larger hashes */
};
/*
 * Allocates a leaf node for the string `str` with hash `val` and a count of 1.
 * The name is copied into the node's fixed-size buffer with a bounded copy,
 * so an over-long `str` is truncated instead of overflowing it.
 * Returns NULL when memory cannot be allocated; the caller owns the node.
 */
Node * makeNode(char * str, unsigned long int val){
    Node *node = malloc(sizeof *node);
    if (node == NULL) {
        return NULL;
    }
    snprintf(node->name, sizeof node->name, "%s", str);
    node->val = val;
    node->count = 1;
    node->left = NULL;
    node->right = NULL;
    return node;
}
/*
 * Polynomial hash of the NUL-terminated string `name`:
 *     sum over i of name[i] * 19^i
 * computed entirely in unsigned integer arithmetic, which wraps around on
 * overflow. (Going through floating-point pow() loses precision and, for long
 * strings, converts an out-of-range double to an integer, which is undefined.)
 * Each character is read as an unsigned char so that bytes above 127 cannot
 * make a term negative. The empty string hashes to 0.
 */
unsigned long int getHash(char * name){
    const unsigned long int prime = 19;
    unsigned long int power = 1; /* prime^i for the current position */
    unsigned long int val = 0;
    const char *p;
    for (p = name; *p != '\0'; ++p) {
        val += (unsigned char)*p * power;
        power *= prime;
    }
    return val;
}
void insert(Node * root, char * name){
Node * newnode;
int val = getHash(name);
Node * tmp = root;
while(tmp != NULL) {
if ( tmp->val == val){
tmp->count += 1;
break;
}
if (val > tmp->val){
if( tmp->right != NULL)
tmp = tmp->right;
else{
tmp->right = makeNode(name, val);
break;
}
}else {
if( tmp->left != NULL)
tmp = tmp->left;
else{
tmp -> left = makeNode(name, val);
break;
}
}
}
}
Node * find(Node * root, char * name){
int val = getHash(name);
Node * tmp = root;
while(tmp != NULL){
if(tmp -> val == val){
return tmp;
}else if (val > tmp->val){
tmp = tmp->right;
}else{
tmp = tmp->left;
}
}
return NULL;
}
/* A race and the elements associated with it. The `elements` table is
   scanned by main() until the first empty string. */
struct Race {
char name[len];
char elements[elms][len];
};
/* Sample data: arr[i] took part in race[i]. Nine of the `elms` rows are
   initialized, so the last row is an empty string, which the loops in main()
   use as their stopping point. */
char arr[elms][len] = {"ABS","ABS","ABS","ACT","ACT","PPB","PPB","QQQ","QQQ"};
char race[elms][len] = {"PARI", "PARI", "LOND", "PARI", "PARI", "CYKA", "LOND", "CYKA", "PARI"};
/* Not used by the code shown in this program. */
int freq[elms];
void copyArray(char dest[elms][len], char src[elms][len] ){
int i = 0;
while(strlen(src[i]) > 0){
strcpy(dest[i],src[i]);
++i;
}
}
int main(){
Node * root = makeNode("root", 0);
int i = 0;
while(strlen(arr[i]) > 0){
insert(root,arr[i]);
++i;
}
i = 0;
while(strlen(arr[i]) > 0){
Node * r = find(root,arr[i]);
printf("found %s, count = %ld\n", r->name, r->count);
++i;
}
// make representation of race
struct Race r1, r2;
strcpy(r1.name, "PARI");
{
char tmp[elms][len] = { "ABS", "PPB", "QQQ" };
copyArray(r1.elements, tmp);
}
strcpy(r2.name, "LOND");
{
char tmp[elms][len] = { "ACT" };
copyArray(r2.elements, tmp);
}
struct Race races[2] = {r1, r2};
i = 0;
while(i < 2){
struct Race * current = &races[i];
printf("for %s", current->name);
Node * max = NULL;
int m = -1;
int j = 0;
while(strlen(current->elements[j]) > 0){
Node * tmp = find(root, current->elements[j]);
if( tmp != NULL && tmp->count > m) {
max = tmp;
m = tmp->count;
}
++j;
}
if (max != NULL){
printf(" max is %s : %d\n", max->name, max->count);
}else{
printf(" max is None\n");
}
++i;
}
return 0;
}
Basically, you have to structure your data and specify the links between its parts. Here I used a binary tree and a Rabin–Karp-style hashing technique to store the data efficiently.
A binary tree is a good way to solve a counting problem, since the search operation is fairly cheap, and the Rabin–Karp-style hash avoids doing a string comparison every time.
I also created a struct called Race to store all the elements related to that race. So the algorithm is going to be:
let arr be array of elements
let races be array of races
for each race in races
define related element
#find occurrence now
#Binary tree will increment count if element already exist.
let binary_tree be a Binary Tree
for each element in arr
add element to binary_tree
# now we have all the elements with it's count
# let's iterate through races now
for each race in races
m = null
for element in race.elements
node = find_element_in_binary_tree(element)
if node is not null
m = max(m, node)
if m is not null then
print m
else
print not found
First, initializations, note the []s
// The first dimension is left empty so that the compiler sizes each table
// from its initializer list.
char arr[][MAX_LENGTH] = {"ABS","ABS","ABS","ACT","ACT","PPB","PPB","QQQ","QQQ"};
char race[][MAX_LENGTH] = {"PARI","PARI","LOND","PARI","PARI","CYKA","LOND","CYKA","PARI"};
int freq[MAX_ELEMENTS]; // freq[i] = occurrences of the pair (arr[i], race[i])
int n = sizeof(arr)/sizeof(*arr); // get actual number of used items
int i,j;
int max = 0; // highest frequency seen so far; init max to 0
The main loop goes through arr and race, and whenever a dupe is found at [j] (after [i]), "invalidate" the dupe ("already processed") by setting its first char to 0 (empty string).
Note that j starts from i and not i+1 to ensure freq is at least 1, even for the first non-dupes items.
// Count each distinct (arr, race) pair once, at its first occurrence.
for(i=0 ; i<n ; i++) {
freq[i] = 0; // ensure freq is 0 for any item
if ( ! *arr[i]) continue; // skip already processed items
for(j=i ; j<n ; j++) { // j=i, not i+1, so the item counts itself
if (!strcmp(arr[i],arr[j]) && !strcmp(race[i],race[j])) {
freq[i]++; // one more occurrence of this pair
if (freq[i] > max) max = freq[i]; // update max if necessary
if (j > i) *arr[j] = 0; // invalidate that arr element
}
}
}
Finally display the max appearances, including ties
// Print every (arr, race) pair whose count equals the maximum, ties included.
printf("Items at max=%d:\n", max);
for(i=0 ; i<n ; i++) {
if (freq[i] == max) { // skipped items are never displayed (max cannot be 0)
printf("%s / %s\n", arr[i],race[i]);
}
}
(no need to check for "invalidation" as max will be >0, and all invalidated items have freq[i] == 0)

Sorting 2 arrays - one of chars and one of integers

I have two arrays: one with some number n of words (around 1000-2000), and a second with exactly the same number n of integers. The words are stored as char arrays, and the integers are the numbers of occurrences of those words in my tree. I want to sort them so that the word with the highest number of occurrences comes first, the second highest second, and so on. Could anyone lend me a hand, please? I have not taken a data structures/algorithms class yet, so I am having problems with that.
My code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Number of elements of a true array (not of a pointer). The expansion and
   its argument are parenthesized so that the macro can be used inside a
   larger expression, e.g. as a divisor. */
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#define ALPHABET_SIZE (26)
// Converts an upper-case key character into a child index and back:
// 'A' through 'Z' map onto 0 through ALPHABET_SIZE - 1.
// Arguments are parenthesized so that any expression can be passed.
#define CHAR_TO_INDEX(C) ((int)(C) - (int)'A')
#define INDEX_TO_CHAR(IX) ('A' + (IX))
// Flat results filled by setorder_rec(): words[k] is the k-th word found in
// the trie and counters[k] is its occurrence count.
char words[3000][40]={{0}};
int counters[3000]={0};
// Number of entries of words/counters that are in use.
int wordnr=0;
typedef struct trie_node trie_node_t;
// One trie node. `counter` is the number of inserted keys that end at this
// node (0 when no key ends here); `children` has one slot per letter.
struct trie_node
{
int counter;
trie_node_t *children[ALPHABET_SIZE];
};
typedef struct trie trie_t;
// The trie itself: its root node and the number of insert() calls made on it.
struct trie
{
trie_node_t *root;
int count;
};
// Returns new trie node
trie_node_t *getNode(void)
{
trie_node_t *pNode = NULL;
pNode = (trie_node_t *)malloc(sizeof(trie_node_t));
if( pNode )
{
int i;
pNode->counter = 0;
for(i = 0; i < ALPHABET_SIZE; i++)
{
pNode->children[i] = NULL;
}
}
return pNode;
}
// Puts a trie into its empty state: no insertions recorded yet and a freshly
// allocated root node (NULL if that allocation failed).
void initialize(trie_t *pTrie)
{
    pTrie->count = 0;
    pTrie->root = getNode();
}
// Depth-first walk that lists every key stored below `pCrawl`.
// `str` holds the `n` letters on the path from the root to `pCrawl`. Whenever
// a node ends at least one key, the word is printed and, while the result
// tables have room, recorded in words[]/counters[] with wordnr advanced.
// The walk stops at the depth where a word plus its terminator would no longer
// fit in one row of words[].
// NOTE(review): this assumes `str` is at least as large as a row of words[],
// as the buffer passed by setorder() is -- confirm for other callers.
void setorder_rec(trie_node_t *pCrawl, char *str, int n)
{
    const int max_words = (int)(sizeof words / sizeof words[0]);
    const int row_size = (int)sizeof words[0];
    if (pCrawl == NULL) return;
    if (n < 0 || n >= row_size) return; // no room left for the terminator
    if (pCrawl->counter) {
        str[n]='\0';
        if (wordnr < max_words) {
            strcpy(words[wordnr],str);
            counters[wordnr]=pCrawl->counter;
            wordnr++;
        }
        printf("%.*s: %d\n", n, str, pCrawl->counter);
    }
    for (int i = 0; i < ALPHABET_SIZE; i++) {
        if (pCrawl->children[i] == NULL) continue;
        str[n] = INDEX_TO_CHAR(i);
        setorder_rec(pCrawl->children[i], str, n + 1);
    }
}
// Lists every key of the trie by starting the recursive walk at the root with
// an empty, zero-filled 40-byte path buffer.
void setorder(trie_t *pTrie)
{
    char path[40] = {0};
    trie_node_t *start = pTrie->root;
    setorder_rec(start, path, 0);
}
// Adds one occurrence of `key` to the trie and prints the key's new count.
// Only keys made of the letters 'A'..'Z' can be stored: every character is
// checked first, and a key with any other character is rejected before the
// trie is touched, because its index would fall outside the children array.
// The insertion is also abandoned if the trie has no root or a node cannot be
// allocated. pTrie->count is incremented only for keys that were stored.
void insert(trie_t *pTrie, char key[])
{
    int level;
    int length = strlen(key);
    int index;
    trie_node_t *pCrawl;
    for( level = 0; level < length; level++ )
    {
        index = CHAR_TO_INDEX(key[level]);
        if( index < 0 || index >= ALPHABET_SIZE )
        {
            return;
        }
    }
    pCrawl = pTrie->root;
    if( !pCrawl )
    {
        return;
    }
    for( level = 0; level < length; level++ )
    {
        index = CHAR_TO_INDEX(key[level]);
        if( !pCrawl->children[index] )
        {
            pCrawl->children[index] = getNode();
            if( !pCrawl->children[index] )
            {
                return; // out of memory: leave the key uncounted
            }
        }
        pCrawl = pCrawl->children[index];
    }
    pTrie->count++;
    pCrawl->counter++;
    printf("counter slow 3= %d\n", pCrawl->counter);
}
// Demo driver: inserts a fixed list of upper-case words into a trie, flattens
// the trie into words[]/counters[], and prints the first ten entries of those
// tables.
int main()
{
    char keys[][20] = {"THE", "THE", "BYE", "A", "THERE", "ANSWER", "ANSWER", "BBUWNTSMFK", "THE", "THEIR", "ANSWER", "THE", "LOL", "OMG", "WTF"};
    trie_t trie;
    int k = 0;
    int rank = 0;
    initialize(&trie);
    // Construct trie
    while (k < ARRAY_SIZE(keys))
    {
        insert(&trie, keys[k]);
        k++;
    }
    setorder(&trie);
    while (rank <= 9)
    {
        printf("#%d %s=%d\n", rank, words[rank], counters[rank]);
        rank++;
    }
    return 0;
}
Arrays that I want to sort are "words" and "counters"
Here is a simple bubble sort code you can use:
/* Bubble sort of the parallel tables counters[] and words[] over their first
   n entries, highest count first (the order the question asks for). Rows of
   words[] are char arrays, which cannot be assigned, so they are exchanged
   with strcpy() through a scratch row of the same size; each word moves
   together with its count. c, d and swap are int variables declared by the
   surrounding code. */
for (c = 0 ; c < ( n - 1 ); c++)
{
    for (d = 0 ; d < n - c - 1; d++)
    {
        if (counters[d] < counters[d+1])
        {
            char swap2[sizeof words[0]];
            swap = counters[d];
            counters[d] = counters[d+1];
            counters[d+1] = swap;
            strcpy(swap2, words[d]);
            strcpy(words[d], words[d+1]);
            strcpy(words[d+1], swap2);
        }
    }
}
printf("Sorted words:\n");
for ( c = 0 ; c < n ; c++ )
    printf("%s\n", words[c]); /* words are strings, so %s rather than %d */

Anagram Solver C

I am totally new to C so I am having troubles with hash table and linked list. I am making an anagram solver. There are many examples I found online but each person has done it differently and rather complicated so I'm really confused now.
I'm pretty okay with the most of the implementation of the program. But I'm actually stuck at the very beginning.
So I need to create a hash table where in each entry, the key is an int and the value is a linked list of words.
The way I get the key, or the hash value, is by converting a word to a number. For example, A is 1, B is 2, C is 3, AB is 3, BC is 5, ABC is 6, and so on. I guess the words should be case insensitive to make things easier.
Below is the code I'm working on. I'm pretty sure it is not syntactically correct. Right now I'm just working on the structure of the table.
/* One stored word. The entries that share a key form a singly linked list
   through `next`. Inside the definition the type has to be spelled
   `struct Entry`, because the typedef name `Entry` only exists once the
   declaration is complete. */
typedef struct Entry {
    int key;            /* hash value of the word; 0 marks an unused slot */
    char *word;         /* the word itself (not copied, not owned) */
    struct Entry *next; /* next word with the same key, or NULL */
} Entry;
/* The table: `entry` points to an array of `size` slots. */
typedef struct HashTable {
int size;
Entry *entry;
} HashTable;
// initialize table
// Allocates a table with `size` slots, every slot marked unused (key 0) with
// no word and no chain. Returns NULL when `size` is negative or memory runs
// out; nothing is leaked in that case. The caller owns the returned table and
// its slot array.
HashTable* create(int size) {
    HashTable *table;
    int i;
    if (size < 0) {
        return NULL;
    }
    table = malloc(sizeof *table);
    if (table == NULL) {
        return NULL;
    }
    table->entry = malloc(sizeof *table->entry * (size_t)size);
    if (table->entry == NULL && size > 0) {
        free(table);
        return NULL;
    }
    table->size = size;
    for (i = 0; i < size; i++) {
        table->entry[i].key = 0; // All entries to be 0
        table->entry[i].word = NULL;
        table->entry[i].next = NULL;
    }
    return table;
}
// hash the word
// Returns the sum of the letter values of `word`, where A (or a) counts 1,
// B (or b) counts 2, ... and Z (or z) counts 26, so "AB" is 3 and "ABC" is 6.
// Upper and lower case count the same and any other character counts nothing,
// which means that all anagrams of a word get the same value.
// A NULL or empty word hashes to 0.
int getHash(char *word)
{
    int sum = 0;
    if (!word) {
        return 0;
    }
    for (; *word != '\0'; ++word) {
        const char c = *word;
        if (c >= 'a' && c <= 'z') {
            sum += c - 'a' + 1;
        } else if (c >= 'A' && c <= 'Z') {
            sum += c - 'A' + 1;
        }
    }
    return sum;
}
// Stores `word` under `key`.
// The starting slot is the word's hash reduced modulo the table size, so it is
// always a valid index; from there the slots are probed one by one, wrapping
// around at the end of the array.
//  - A slot that already holds `key` gets the word added to its linked list.
//  - Otherwise the first unused slot (key 0) is claimed for `key`.
// The word pointer is stored as given, not copied. Key 0 is reserved as the
// "unused" marker and is rejected. Nothing is stored when the table is full
// or a list node cannot be allocated.
void insert(HashTable *table, int key, char *word) {
    int slot;
    int probes;
    if (!table || !table->entry || table->size <= 0 || key == 0) {
        return;
    }
    slot = getHash(word) % table->size;
    if (slot < 0) {
        slot += table->size;
    }
    for (probes = 0; probes < table->size; probes++) {
        Entry *head = &table->entry[slot];
        // if key does not exist, find a '0 slot', and store the key and value
        if (head->key == 0) {
            head->key = key;
            head->word = word;
            head->next = NULL;
            return;
        }
        // if key has already existed, find and add to linked list
        if (head->key == key) {
            Entry *node = malloc(sizeof *node);
            if (!node) {
                return;
            }
            node->key = key;
            node->word = word;
            node->next = head->next;
            head->next = node;
            return;
        }
        slot = (slot + 1) % table->size;
    }
}
I would suggest starting with a rather simple way to find the anagrams of a word in a text.
/*
 * Counts the positions in `text` at which a substring of strlen(word)
 * characters is an anagram of `word`.
 *
 * A window of that length slides over `text`. bin[c] is the number of times
 * character c occurs in the window minus the number of times it occurs in
 * `word`, and m is the number of characters whose balance is not zero; the
 * window is an anagram exactly when m is 0.
 *
 * Characters are converted to unsigned char before being used as an index:
 * plain char may be signed, and a byte above 127 would otherwise index in
 * front of the array. The table size assumes 8-bit characters.
 */
int anagrams(char * word, char * text) {
    int bin[256] = { 0 }, m = 0, found = 0, len = 0, i;
    unsigned char c;
    /* Start every character of `word` with a deficit. */
    for (i = 0; word[i]; i++) {
        c = (unsigned char)word[i];
        if (bin[c] == 0) m++;
        bin[c]--;
        len++;
    }
    for (i = 0; text[i]; i++) {
        /* The character entering the window on the right. */
        c = (unsigned char)text[i];
        if (bin[c] == 0) m++;
        if (bin[c] == -1) m--;
        bin[c]++;
        if (i >= len) {
            /* The character leaving the window on the left. */
            c = (unsigned char)text[i - len];
            if (bin[c] == 0) m++;
            if (bin[c] == 1) m--;
            bin[c]--;
        }
        if (m == 0) found++;
    }
    return found;
}

Resources