So given a string of up to 7 letters, I need to find every permutation of that string (with and without all the letters) and then check if any of those permutations can be found in my dictionary.txt file, and print the ones that match. So basically, if the user inputs "try," the permutations would be try, tr, tyr, ty, t, rty, etc., and then check if any of them match words in the txt file. I tried to do this using strncopy and strcmp, but the program doesn't always correctly deduce that two things are equal, it takes forever to run, and there's a bug where it counts having zero letters as a permutation of the original string.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define SIZE 100 /* number of words in dictionary.txt */
#define MAX 7 /* max number of letters in given string */
/* Exchange the two characters that x and y point at. */
void swap(char *x, char *y){
    const char held = *y;
    *y = *x;
    *x = held;
}
/* Look up `word` in dictionary.txt.
 * The file is read once, on the first call; its first SIZE
 * whitespace-separated entries are cached for every later call. Entries
 * longer than 255 characters are split by the bounded read.
 * Returns 1 when `word` equals a cached entry, 0 otherwise (also when the
 * file cannot be opened, which is reported once on stderr). */
static int in_dictionary(const char *word){
    static char words[SIZE][256];
    static int count = -1; /* -1 until the file has been read */
    if (count < 0){
        count = 0;
        FILE *file = fopen("dictionary.txt", "r");
        if (file == NULL){
            perror("dictionary.txt");
            return 0;
        }
        /* the field width must stay one less than the row size above */
        while (count < SIZE && fscanf(file, "%255s", words[count]) == 1){
            count++;
        }
        fclose(file);
    }
    for (int i = 0; i < count; i++){
        if (strcmp(words[i], word) == 0){
            return 1;
        }
    }
    return 0;
}

/* Print `candidate` when it is a dictionary word. */
static void report_if_word(const char *candidate){
    if (in_dictionary(candidate)){
        printf("%s\n", candidate);
    }
}

/* Try every distinct arrangement of letters[l..r] and print the ones found
 * in dictionary.txt, one per line.
 *
 * letters: NUL-terminated, writable string; it is restored before returning.
 * l:       first index that may still be rearranged (callers pass 0).
 * r:       last index taking part. When r is strlen(letters), as main passes
 *          it, the terminating '\0' takes part too: wherever it lands ends
 *          the candidate, which yields the arrangements that use only some
 *          of the letters. When r is strlen(letters) - 1 only full-length
 *          arrangements are produced.
 *
 * The empty arrangement is never reported, and a value that was already
 * tried at position l is skipped so no candidate is checked twice. */
void permute(char *letters, int l, int r){
    if (l > r){
        /* every position up to r is fixed; the string ends at r + 1 */
        if (l > 0){
            report_if_word(letters);
        }
        return;
    }
    for (int i = l; i <= r; i++){
        int tried = 0;
        for (int k = l; k < i; k++){
            if (letters[k] == letters[i]){
                tried = 1;
                break;
            }
        }
        if (tried){
            continue;
        }
        swap(letters + l, letters + i);
        if (letters[l] == '\0'){
            /* the terminator landed here: the candidate is letters[0..l-1],
               and whatever follows it cannot change the result */
            if (l > 0){
                report_if_word(letters);
            }
        }
        else{
            permute(letters, l + 1, r);
        }
        swap(letters + l, letters + i);
    }
}
/* Read up to MAX letters from stdin and print every dictionary word that can
 * be formed from them. Returns EXIT_FAILURE when no input could be read. */
int main(){
    /* room for MAX letters plus the terminating '\0' */
    char letters[MAX + 1];
    /* "%<MAX>s": the width is built from MAX (not from user input) so scanf
       can never write past the buffer */
    char format[16];
    snprintf(format, sizeof format, "%%%ds", MAX);
    printf("Please enter your letters: ");
    if (scanf(format, letters) != 1){
        fprintf(stderr, "No letters were entered.\n");
        return EXIT_FAILURE;
    }
    /* finding size of input */
    int size = (int)strlen(letters);
    /* parameters: string; start of the string; index of the terminator, so
       that shorter arrangements are produced as well */
    permute(letters, 0, size);
    return 0;
}
Any help or suggestions to pinpoint what I'm doing wrong would be greatly appreciated.
As hinted in my comment, you can map all permutations of a string to a single code value, just by using the bits of a big enough unsigned integer as a bit set. Thus, the (same length) permutations of e.g. the word "try" all map to the same value.
As far as I understood your problem, you also want to match words that start with a prefix of the wanted word. For that to work, you need to generate N such codes, where N is the number of letters the word contains. For example, for a three-letter word you need the code for the first letter, the code for the first 2 letters, and the code for all 3 letters.
Since reading from a file is probably not the problem, here the code, showcasing the "code based" string matching idea (which should be reasonably fast):
#include <stdio.h>
#include <inttypes.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#define MAX_WORD_LENGTH 7
typedef uint32_t WordCode;
/* The bit-set codes of every prefix of one word. word_to_codes() stores the
   code of the whole word first and the code of the first letter alone last;
   `count` is the number of entries of `codes` that are in use. */
typedef struct WordCodes_tag {
size_t count;
WordCode codes[MAX_WORD_LENGTH];
} WordCodes_t;
/* Build the letter bit set of word[start..end): bit 0 stands for 'a', bit 25
 * for 'z'. Repeated letters set the same bit only once.
 * Returns false when the span is longer than MAX_WORD_LENGTH (`*code` is left
 * untouched) or when a character outside 'a'..'z' is met (`*code` then holds
 * the bits gathered so far); true otherwise. */
bool word_to_code(const char* word,
                  size_t start,
                  size_t end,
                  WordCode* code) {
    if ((end - start) > MAX_WORD_LENGTH)
        return false;
    *code = 0;
    const char* cursor = word + start;
    for (size_t left = end - start; left > 0; left--, cursor++) {
        if ((*cursor < 'a') || (*cursor > 'z'))
            return false;
        *code |= (WordCode)(1 << (*cursor - 'a'));
    }
    return true;
}
/* Fill `codes` with the bit-set code of every prefix of `word`, longest
 * prefix first.
 * Returns false for a NULL argument, for a word longer than MAX_WORD_LENGTH,
 * or when word_to_code() rejects a prefix; `codes->count` then tells how many
 * entries were stored before the failure. */
bool word_to_codes(const char* word, WordCodes_t* codes) {
    if ((NULL == codes) || (NULL == word))
        return false;
    codes->count = 0;
    size_t prefix_len = strlen(word);
    if (prefix_len > MAX_WORD_LENGTH)
        return false;
    while (prefix_len > 0) {
        WordCode prefix_code;
        if (!word_to_code(word, 0, prefix_len, &prefix_code))
            return false;
        codes->codes[codes->count++] = prefix_code;
        prefix_len--;
    }
    return true;
}
/* Print the codes as "(c0, c1, ...)" followed by a newline; a NULL argument
 * prints nothing. */
void show_word_codes(const WordCodes_t* codes) {
    if (NULL == codes) return;
    printf("(");
    for (size_t i = 0; i < codes->count; i++) {
        /* WordCode is uint32_t, so it needs PRIu32 rather than "%d" */
        printf("%s%" PRIu32, (i > 0) ? ", " : "", codes->codes[i]);
    }
    printf(")\n");
}
/* Report whether two code lists share a code for a prefix of the same length.
 * Both lists run from the longest prefix to the shortest, so the tail of the
 * longer list is lined up with the whole of the shorter one.
 * Returns false for NULL arguments or empty lists. */
bool is_match(const WordCodes_t* a, const WordCodes_t* b) {
    if ((NULL == a) || (NULL == b))
        return false;
    if ((0 == a->count) || (0 == b->count))
        return false;
    const WordCodes_t *longer = (a->count < b->count) ? b : a;
    const WordCodes_t *shorter = (a->count < b->count) ? a : b;
    const size_t skipped = longer->count - shorter->count;
    for (size_t k = 0; k < shorter->count; k++) {
        if (longer->codes[skipped + k] == shorter->codes[k])
            return true;
    }
    return false;
}
/* Demo: compute the codes of the wanted word, then print, for each word of a
 * small built-in dictionary, its codes and whether is_match() accepts it.
 * Returns EXIT_FAILURE when the wanted word cannot be encoded. */
int main(int argc, const char* argv[]) {
    (void)argc; /* the demo takes no command-line input */
    (void)argv;
    const char* wanted = "try";
    const char* dictionary[] = {
        "house", "mouse", "cat", "tree", "try", "yrt", "t"
    };
    const size_t dict_len = sizeof(dictionary) / sizeof(dictionary[0]);
    WordCodes_t wanted_codes;
    if (!word_to_codes(wanted, &wanted_codes)) {
        puts("word_to_codes() failed!");
        return EXIT_FAILURE;
    }
    printf("word codes of the wanted word '%s': ", wanted);
    show_word_codes(&wanted_codes);
    for (size_t i = 0; i < dict_len; i++) {
        WordCodes_t found_codes;
        if (word_to_codes(dictionary[i], &found_codes)) {
            printf("word codes of dictionary word '%s' (%s): ",
                   dictionary[i],
                   is_match(&wanted_codes, &found_codes) ?
                   "match" : "no match");
            show_word_codes(&found_codes);
        } else {
            /* end the line so the next report starts on its own line */
            printf("word_to_codes(%s) failed!\n", dictionary[i]);
        }
    }
    return EXIT_SUCCESS;
}
As function is_match() above shows, you need only compare the codes for the respective substring length. Thus, even if you have 2 sets of up to 7 numbers, you need only maximum 7 comparisons.
The output looks like this (which seems to make sense):
./search
word codes of the wanted word 'try': (17432576, 655360, 524288)
word codes of dictionary word 'house' (no match): (1327248, 1327232, 1065088, 16512, 128)
word codes of dictionary word 'mouse' (no match): (1331216, 1331200, 1069056, 20480, 4096)
word codes of dictionary word 'cat' (no match): (524293, 5, 4)
word codes of dictionary word 'tree' (match): (655392, 655376, 655360, 524288)
word codes of dictionary word 'try' (match): (17432576, 655360, 524288)
word codes of dictionary word 'yrt' (match): (17432576, 16908288, 16777216)
word codes of dictionary word 't' (match): (524288)
If you want to match the words in a dictionary against all partial permutations of a search term, you don't have to create all permutations. (The number of permutations n! grows very quickly with the length of the search term, n.)
Instead, it is easier to write a customized search function. You can make use of two strategies here:
A word w is a permutation of the search term s if the two are equal once their letters are sorted. For example, "integral" and "triangle" are anagrams of each other, because both sort to "aegilnrt".
You can skip letters in the search term when searching to account for partial anagrams. Because the search term and the word will be sorted, you know which ones to skip: The ones that are lexically "smaller" than the next letter in the word.
So your matching function should sort the words first and then compare the words character by character in such a way that characters from the search term can be skipped.
Here's code that does that:
/* qsort comparator for single characters: negative, zero or positive as the
 * first character is smaller than, equal to or greater than the second. */
int char_cmp(const void *pa, const void *pb)
{
    const char lhs = *(const char *)pa;
    const char rhs = *(const char *)pb;
    return lhs - rhs;
}
/* Report whether the letters of `bb` can all be taken from `aa`, each letter
 * of `aa` being used at most once (`bb` is an anagram of part of `aa`).
 * Neither argument is modified: sorted copies are made on the heap, so the
 * inputs may have any length. Returns false when memory cannot be obtained. */
bool partial_anagram(const char *aa, const char *bb)
{
    const size_t a_len = strlen(aa);
    const size_t b_len = strlen(bb);
    bool fits = false;

    /* more letters than the search term offers can never fit */
    if (b_len > a_len) return false;

    char *A = malloc(a_len + 1);
    char *B = malloc(b_len + 1);

    if (A != NULL && B != NULL) {
        const char *a = memcpy(A, aa, a_len + 1);
        const char *b = memcpy(B, bb, b_len + 1);

        qsort(A, a_len, 1, char_cmp);
        qsort(B, b_len, 1, char_cmp);

        fits = true;
        while (*b) {
            /* letters of the search term that sort lower are left unused */
            while (*a && *a < *b) a++;
            if (*a != *b) {
                fits = false;
                break;
            }
            a++;
            b++;
        }
    }

    free(A);
    free(B);
    return fits;
}
Things to note:
Sorting is done with the function qsort from <stdlib.h>, for which you need a comparator function, in this case char_cmp.
The sorted strings are copies, so that the original strings are not modified. (The code above is unsafe, because it doesn't enforce that the length of the strings is less than 64 characters. Unfortunately, the function strncpy, which can accept a maximum buffer size, is not safe, either, because it can leave the buffer unterminated. A safe way to copy the strings would be snprintf(A, sizeof(A), "%s", aa), but I've kept the strcpy for, er, "simplicity".)
The function partial_anagram takes unsorted strings and sorts them. That makes for a clean interface, but it is inefficient when you want to test against the same search term repeatedly as in your case. You could change the function, so that it expects already sorted strings. This will reduce the function to just the loop and will place the responsibility of sorting to the caller.
If you really have a lot of searches, there is yet more room for optimization. For example, you could insert the sorted dictionary into a trie. Given that your original code read the whole file for each permutation, I guess you're not worried that much about performance. :)
I've put a working example online. The code above works with pointers. If you are more at ease with indices, you can rewrite the function:
/* Index-based variant: report whether the letters of `bb` can all be taken
 * from `aa`, each letter of `aa` being used at most once.
 * Neither argument is modified: sorted copies are made on the heap, so the
 * inputs may have any length. Returns false when memory cannot be obtained. */
bool partial_anagram(const char *aa, const char *bb)
{
    const size_t a_len = strlen(aa);
    const size_t b_len = strlen(bb);
    bool fits = false;
    size_t i = 0;
    size_t j = 0;

    /* more letters than the search term offers can never fit */
    if (b_len > a_len) return false;

    char *a = malloc(a_len + 1);
    char *b = malloc(b_len + 1);

    if (a != NULL && b != NULL) {
        memcpy(a, aa, a_len + 1);
        memcpy(b, bb, b_len + 1);

        qsort(a, a_len, 1, char_cmp);
        qsort(b, b_len, 1, char_cmp);

        fits = true;
        while (b[j]) {
            /* letters of the search term that sort lower are left unused */
            while (a[i] && a[i] < b[j]) i++;
            if (a[i] != b[j]) {
                fits = false;
                break;
            }
            i++;
            j++;
        }
    }

    free(a);
    free(b);
    return fits;
}
Problem
One is using an algorithm whose run-time grows exponentially with the problem size. There are probably lots of ways to speed this up, but, as suggested by #SparKot, a trie, or prefix tree, is a particularly good fit. One can build a trie from a dictionary array of size n, assuming the lengths of the strings in your dictionary are bounded, in O(n log n). Looking up anagrams in the worst case, where the letters never run out (ignoring the arbitrary limit of 7), is still O(n).
$ bin/trie AAABBBCCCDDDEEEFFFGGGHHHIIIJJJKKKLLLMMMNNNOOOPPPQQQRRRSSSTTTUUUVVVWWWXXXYYYZZZ < Tutte_le_parole_inglesi.txt
build_index warning: duplicate "OUTSOURCING".
build_index warning: duplicate "OUTSOURCINGS".
Loaded 216553 trie entries.
AA
AAH
AAHED
AAHING
...
ZYTHUMS
ZYZZYVA
ZYZZYVAS
211929 words found.
Proposal
The reason a prefix tree is so effective, is it allows you to query prefixes as (even more) efficiently as lookup. With this, one can do a very effective branch-and-bound-style algorithm. That is, the longer the string, the less words it will be a prefix match to; if the string is not a prefix match for any of the words in the dictionary, one can rule out any longer strings and just not test them.
So my idea is, form a histogram with the Scrabble-string of length k in O(k). Then, recursively, add more and more letters, matching, until no dictionary entries are prefix matches of the string. This will run in (*I think) O(n log n + k), assuming a bound on the number of comparisons needed to distinguish words; ie, one's dictionary is not { a, aa, aaa, aaaa, aaaaa, aaaaaa, ... }.
Implementation
I use a PATRiCA tree. It is especially attractive because a lot of data is implicit; one can use a simple array to represent the leaves on a complete binary tree. Specifically, n leaves are already just the list of words in lexicographical order, we want to build an index of n - 1 branches. It requires a stop code; the null-termination in C is perfect. I don't have to create copies of everything and manage them. The below code first sets up a dynamic array, which is useful for input, then sets up a trie, then implements the algorithm.
#include <stdlib.h> /* EXIT malloc free qsort */
#include <stdio.h> /* printf */
#include <string.h> /* memmove memcpy */
#include <assert.h> /* assert */
#include <errno.h> /* errno */
#include <limits.h> /* UINT_MAX */
#include <ctype.h> /* isgraph */
/* Dynamic array. `MIN_ARRAY(name, type)` defines `struct name_array`, holding
 `data`, `size` (elements in use) and `capacity` (elements allocated), plus
 these static functions:
 `name_array_reserve(a, min)` makes the capacity at least `min`, growing from
 a minimum of 7 by a factor of about 1.625; returns 1 on success and 0 on
 failure, with `errno` set to ERANGE when `min` cannot be represented or when
 `realloc` fails without setting `errno`.
 `name_array_buffer(a, n)` reserves room for `n` elements past `size` and
 returns a pointer to that room without changing `size`, or null.
 `name_array_append(a, n)` does the same and then adds `n` to `size`.
 `name_array_new(a)` appends a single element.
 `name_array()` returns an empty array; `name_array_(a)` frees the data and
 resets `a` to empty, ignoring a null `a`. */
#define MIN_ARRAY(name, type) \
struct name##_array { type *data; size_t size, capacity; }; \
static int name##_array_reserve(struct name##_array *const a, \
const size_t min) { \
size_t c0; \
type *data; \
const size_t max_size = (size_t)-1 / sizeof *a->data; \
if(a->data) { \
if(min <= a->capacity) return 1; \
c0 = a->capacity < 7 ? 7 : a->capacity; \
} else { \
if(!min) return 1; \
c0 = 7; \
} \
if(min > max_size) return errno = ERANGE, 0; \
/* `c_n = a1.625^n`, approximation golden ratio `\phi ~ 1.618`. */ \
while(c0 < min) { \
size_t c1 = c0 + (c0 >> 1) + (c0 >> 3); \
if(c0 >= c1) { c0 = max_size; break; } /* Unlikely. */ \
c0 = c1; \
} \
if(!(data = realloc(a->data, sizeof *a->data * c0))) \
{ if(!errno) errno = ERANGE; return 0; } \
a->data = data, a->capacity = c0; \
return 1; \
} \
static type *name##_array_buffer(struct name##_array *const a, \
const size_t n) { \
if(a->size > (size_t)-1 - n) { errno = ERANGE; return 0; } \
return name##_array_reserve(a, a->size + n) \
&& a->data ? a->data + a->size : 0; \
} \
static type *name##_array_append(struct name##_array *const a, \
const size_t n) { \
type *b; \
if(!(b = name##_array_buffer(a, n))) return 0; \
return a->size += n, b; \
} \
static type *name##_array_new(struct name##_array *const a) \
{ return name##_array_append(a, 1); } \
static struct name##_array name##_array(void) \
{ struct name##_array a; a.data = 0, a.capacity = a.size = 0; return a; } \
static void name##_array_(struct name##_array *const a) \
{ if(a) free(a->data), *a = name##_array(); }
MIN_ARRAY(char, char)
/** Append a file, `fp`, to `c`, and add a '\0'. The stream is read in chunks
 of 4096 bytes until a chunk comes back short, and `fp` is closed before
 returning whether or not the read succeeded.
 #return Success. A partial read is failure. #throws[fopen, fread, malloc]
 #throws[EILSEQ] The text file has embedded nulls; also used when a step fails
 without setting `errno`. */
static int append_file(struct char_array *c, FILE *const fp) {
const size_t granularity = 4096;
size_t nread;
char *cursor;
int success = 0;
assert(c && fp);
/* Read entire file in chunks. */
do if(!(cursor = char_array_buffer(c, granularity))
|| (nread = fread(cursor, 1, granularity, fp), ferror(fp))
|| !char_array_append(c, nread)) goto catch;
while(nread == granularity);
/* File to `C` string. */
if(!(cursor = char_array_new(c))) goto catch;
*cursor = '\0';
/* Binary files with embedded '\0' are not allowed. */
/* The first '\0' in the data must be the terminator just written. */
if(strchr(c->data, '\0') != cursor) { errno = EILSEQ; goto catch; }
{ success = 1; goto finally; }
catch:
if(!errno) errno = EILSEQ; /* Will never be true on POSIX. */
finally:
if(fp) fclose(fp);
return success;
}
/* Trie is base-2 compact radix tree, described in <Morrison, 1968 PATRICiA>.
Specifically, this is a full binary tree. */
struct branch { unsigned skip, left; };
static const size_t skip_max = UINT_MAX, left_max = UINT_MAX;
MIN_ARRAY(branch, struct branch)
MIN_ARRAY(leaf, char *)
struct trie { struct branch_array branches; struct leaf_array leaves; };
static struct trie trie(void) { struct trie t;
t.branches = branch_array(), t.leaves = leaf_array(); return t; }
/** Releases the branch and leaf arrays of `t` and returns it to the idle
 state; a null `t` is ignored. */
static void trie_(struct trie *const t) {
    if(!t) return;
    branch_array_(&t->branches);
    leaf_array_(&t->leaves);
    *t = trie();
}
/** Reads one bit of the string `a`, counting from the most significant bit of
 the first byte. #return The bit at index `bit`, either 0 or 1. */
static int is_bit(const char *const a, const size_t bit) {
    const unsigned char octet = (unsigned char)a[bit / 8];
    const unsigned shift = 7u - (unsigned)(bit % 8);
    return (int)((octet >> shift) & 1u);
}
/** #return Whether `a` and `b` agree on every character up to the end of the
 shorter of the two. */
static int is_prefix(const char *a, const char *b) {
    while(*a != '\0' && *a == *b) a++, b++;
    return *a == '\0' || *b == '\0';
}
/** [low, high). */
struct range { size_t low, high; };
/** Recursively builds the branches for the sorted leaves of `t` in `range`,
 starting the comparison at bit index `bit`. A range of fewer than two leaves
 needs no branch. Otherwise one branch is appended, recording how many bits
 were skipped because the ends of the range agree on them and how many
 branches the left (0-bit) side will hold, and then the left and right halves
 are processed in that order.
 #return 1 on success; 0 with `errno` set to ERANGE when a skip or left count
 exceeds its limit. */
static int init_branches_r(struct trie *const t, size_t bit,
const struct range range) {
struct range r;
size_t skip = 0, left;
struct branch *branch;
assert(t && t->leaves.size);
assert(t->branches.capacity >= t->leaves.size - 1);
assert(range.low <= range.high && range.high <= t->leaves.size);
if(range.low + 1 >= range.high) return 1; /* Only one, leaf. */
/* Endpoints of sorted range: skip [_1_111...] or [...000_0_] don't care. */
while(is_bit(t->leaves.data[range.low], bit)
|| !is_bit(t->leaves.data[range.high - 1], bit)) {
if(skip == skip_max) return errno = ERANGE, 0;
bit++, skip++;
}
/* Binary search for the rightmost 0 (+1). */
r = range;
while(r.low < r.high) {
size_t m = r.low + (r.high - r.low) / 2;
if(is_bit(t->leaves.data[m], bit)) r.high = m; else r.low = m + 1;
}
/* `r.low` is now the first leaf with a 1 at `bit`; the leaves before it form
 the left side, which needs one branch fewer than it has leaves. */
if((left = r.low - range.low - 1) > left_max) return errno = ERANGE, 0;
/* Should have space for all branches pre-allocated. */
branch = branch_array_new(&t->branches), assert(branch);
branch->left = (unsigned)left;
branch->skip = (unsigned)skip;
bit++;
/* Left half first, then right half, both continuing at the next bit. */
return (r.low = range.low, r.high = range.low + left + 1,
init_branches_r(t, bit, r)) && (r.low = r.high, r.high = range.high,
init_branches_r(t, bit, r)) /* && (printf("}\n"), 1) */;
}
/** Compares the strings that `a` and `b` point to; both are pointers to
 string pointers, as passed for an array of strings.
 #implements qsort bsearch */
static int vstrcmp(const void *const a, const void *const b) {
    const char *const lhs = *(const char *const *)a;
    const char *const rhs = *(const char *const *)b;
    return strcmp(lhs, rhs);
}
/** Splits the text in `a` into words, sorts and de-duplicates them, and
 indexes them in `t`.
 #param[a] A zero-terminated file containing words. Will be parsed and
 modified: every byte that is not a graphic character is overwritten by '\0'.
 #param[t] An idle tree that is initialized from `a`. Any modification of `a`
 invalidates `t`, because the leaves point into `a`.
 #return Whether the tree initialization was entirely successful. `errno` is
 EILSEQ when `a` holds no word at all. */
static int build_trie(struct trie *const t, struct char_array *const a) {
    struct range range;
    size_t i;
    char *cursor, *end, **leaf;
    int is_run = 0;
    /* Strict for processing ease; this could be made more permissive. */
    assert(a && a->size && a->data[a->size - 1] == '\0'
        && t && !t->branches.size && !t->leaves.size);
    for(cursor = a->data, end = a->data + a->size; cursor < end; cursor++) {
        /* Fixme: 7-bit; mælström would be parsed as "m", "lstr", "m". The
         cast is required because passing a negative `char` to `isgraph` is
         undefined. */
        if(!isgraph((unsigned char)*cursor)) {
            *cursor = '\0', is_run = 0;
        } else if(!is_run) {
            /* First character of a word: remember where it starts. */
            if(!(leaf = leaf_array_new(&t->leaves))) return 0;
            *leaf = cursor, is_run = 1;
        }
    }
    if(!t->leaves.size) return errno = EILSEQ, 0; /* No parseable info. */
    /* Sort and de-duplicate (inefficiently.) Want to treat it as an index. */
    qsort(t->leaves.data, t->leaves.size, sizeof *t->leaves.data, &vstrcmp);
    for(i = 1; i < t->leaves.size; i++) {
        if(strcmp(t->leaves.data[i - 1], t->leaves.data[i]) < 0) continue;
        fprintf(stderr, "build_index warning: duplicate \"%s\".\n",
            t->leaves.data[i]);
        memmove(t->leaves.data + i, t->leaves.data + i + 1,
            sizeof *t->leaves.data * (t->leaves.size - i - 1));
        /* Look at the same index again; it now holds the next word. */
        t->leaves.size--, i--;
    }
    range.low = 0, range.high = t->leaves.size;
    if(!branch_array_reserve(&t->branches, t->leaves.size - 1)
        || !init_branches_r(t, 0, range)) return 0;
    assert(t->branches.size + 1 == t->leaves.size);
    return 1;
}
/** Walks the branches of `t`, which must be non-empty, following the bits of
 `prefix` until the prefix runs out or a single leaf remains. Only the bits
 the branches test are looked at; skipped ("don't care") bits are not
 compared, so the caller has to verify the result against a leaf.
 #return The half-open range of leaves under the position reached.
 #order \O(`prefix.length`) */
static struct range partial_prefix(const struct trie *const t,
const char *const prefix) {
/* `n0`..`n1` are the branches still under consideration; `i` is the index of
 the first leaf below them. */
size_t n0 = 0, n1 = t->branches.size, i = 0, left;
struct branch *branch;
size_t byte, key_byte = 0, bit = 0;
struct range range = { 0, 0 };
assert(t && prefix);
assert(n1 + 1 == t->leaves.size); /* Full binary tree. */
while(n0 < n1) {
branch = t->branches.data + n0;
bit += branch->skip;
/* '\0' is not included for partial match. */
/* Stop as soon as the bit to test lies at or beyond the end of `prefix`. */
for(byte = bit >> 3; key_byte <= byte; key_byte++)
if(prefix[key_byte] == '\0') goto finally;
left = branch->left;
/* 0 descends left, keeping the first leaf; 1 jumps over the left side. */
if(!is_bit(prefix, bit++)) n1 = ++n0 + left;
else n0 += left + 1, i += left + 1;
}
assert(n0 == n1);
finally:
assert(n0 <= n1 && i - n0 + n1 < t->leaves.size);
/* The remaining `n1 - n0` branches span one more leaf than that. */
range.low = i, range.high = i - n0 + n1 + 1;
return range;
}
/* #return Given a `prefix`, what is the range of matched strings in `t`. */
static struct range prefix(const struct trie *const t,
const char *const prefix) {
struct range range;
assert(t && prefix);
if(!t->leaves.size) goto catch;
range = partial_prefix(t, prefix);
if(range.low <= range.high)
if(!is_prefix(prefix, t->leaves.data[range.low])) goto catch;
goto finally;
catch:
range.low = range.high = 0;
finally:
return range;
}
/* Debug graph. */
/** Given a branch `b` in `tr` branches, calculate the right child branches.
 Descends from the root branch, narrowing `total` (the number of branches in
 the current subtree) until branch `b` is reached.
 #return The number of branches on the right side of `b`.
 #order \O(log `size`) */
static unsigned right_count(const struct trie *const tr,
const unsigned b) {
unsigned left, right, total = (unsigned)tr->branches.size, b0 = 0;
assert(tr && b < tr->branches.size);
for( ; ; ) {
/* A subtree of `total` branches is its root, `left` and `right`. */
right = total - (left = tr->branches.data[b0].left) - 1;
assert(left < total && right < total);
if(b0 >= b) break;
/* Left subtree starts at the next index; right one after the left. */
if(b <= b0 + left) total = left, b0++;
else total = right, b0 += left + 1;
}
assert(b0 == b);
return right;
}
/** Descends from the root branch to branch `b` in `tr`, counting the leaves
 that are passed over whenever the path goes right.
 #return The index of the first leaf below branch `b`. */
static unsigned left_leaf(const struct trie *const tr,
const unsigned b) {
unsigned left, right, total = (unsigned)tr->branches.size, i = 0, b0 = 0;
assert(tr && b < tr->branches.size);
for( ; ; ) {
right = total - (left = tr->branches.data[b0].left) - 1;
assert(left < tr->branches.size && right < tr->branches.size);
if(b0 >= b) break;
if(b <= b0 + left) total = left, b0++;
/* Going right skips the `left + 1` leaves of the left side. */
else total = right, b0 += left + 1, i += left + 1;
}
assert(b0 == b);
return i;
}
/** Debug aid: writes `tr` to the file `fn` as a GraphViz digraph, with one
 node per branch (labelled with its skip count) and one per leaf. A failure to
 open `fn` is reported with `perror` and otherwise ignored. */
static void graph(const struct trie *const tr, const char *const fn) {
    unsigned left, right, b, i;
    FILE *fp = 0;
    assert(tr && fn);
    if(!(fp = fopen(fn, "w"))) { perror(fn); return; }
    fprintf(fp, "digraph {\n"
        "\tgraph [truecolor=true, bgcolor=transparent];\n"
        "\tfontface=modern;\n"
        "\tnode [shape=none];\n"
        "\n");
    if(!tr->branches.size) {
        /* No branches means no leaves or exactly one: a one-word dictionary
         is a valid trie, so it must not trip the assertion. */
        assert(tr->leaves.size <= 1);
        if(!tr->leaves.size) fprintf(fp, "\tidle;\n");
    } else {
        assert(tr->branches.size + 1 == tr->leaves.size);
        fprintf(fp, "\t// branches\n");
        for(b = 0; b < tr->branches.size; b++) { /* Branches. */
            const struct branch *branch = tr->branches.data + b;
            left = branch->left, right = right_count(tr, b);
            fprintf(fp, "\ttree%pbranch%u [label = \"%u\", shape = circle, "
                "style = filled, fillcolor = Grey95];\n"
                "\ttree%pbranch%u -> ", (const void *)tr, b, branch->skip,
                (const void *)tr, b);
            /* A side without branches of its own is a single leaf. */
            if(left) fprintf(fp, "tree%pbranch%u [arrowhead = rnormal];\n",
                (const void *)tr, b + 1);
            else fprintf(fp,
                "tree%pleaf%u [color = Gray85, arrowhead = rnormal];\n",
                (const void *)tr, left_leaf(tr, b));
            fprintf(fp, "\ttree%pbranch%u -> ", (const void *)tr, b);
            if(right) fprintf(fp, "tree%pbranch%u [arrowhead = lnormal];\n",
                (const void *)tr, b + left + 1);
            else fprintf(fp,
                "tree%pleaf%u [color = Gray85, arrowhead = lnormal];\n",
                (const void *)tr, left_leaf(tr, b) + left + 1);
        }
    }
    fprintf(fp, "\t// leaves\n");
    for(i = 0; i < tr->leaves.size; i++) fprintf(fp,
        "\ttree%pleaf%u [label = <%s<FONT COLOR=\"Gray85\">⊔</FONT>>];\n",
        (const void *)tr, i, tr->leaves.data[i]);
    fprintf(fp, "\n"
        "\tnode [color = \"Red\"];\n"
        "}\n");
    fclose(fp);
}
/* Actual program. */
/* The input argument histogram. Used in <fn:find_r>. (Simple, but questionable
design choice.) */
static unsigned char hist[128];
static const size_t hist_max = UCHAR_MAX,
hist_size = sizeof hist / sizeof *hist;
static size_t words_found;
/** Branch-and-bound recursive function. */
static void find_r(const struct trie *const tr, char *const word) {
struct range r;
size_t len, i;
assert(word);
r = prefix(tr, word);
if(r.low >= r.high) return; /* Found nothing, we can bound this branch. */
if(!strcmp(word, tr->leaves.data[r.low])) { /* Found a match. */
printf("%s\n", word), words_found++;
if(++r.low == r.high) return;
}
len = strlen(word);
for(i = 0; i < hist_size; i++) {
unsigned char *freq;
if(!*(freq = hist + i)) continue;
(*freq)--;
word[len] = (char)i, word[len + 1] = '\0';
find_r(tr, word);
(*freq)++;
}
}
/** Reads a dictionary from standard input, indexes it in a trie, writes a
 debug graph to "dictionary.gv", and prints every dictionary entry that can be
 spelled with the characters of the single command-line argument.
 #return EXIT_SUCCESS, or EXIT_FAILURE after reporting `errno` with `perror`. */
int main(int argc, char *argv[]) {
struct char_array dict = char_array();
struct trie tr = trie();
char *word;
size_t i;
int success = EXIT_FAILURE;
assert(CHAR_BIT == 8); /* C89 this value can change, assumes C99 value. */
if(argc != 2) { errno = EILSEQ;
fprintf(stderr, "Needs argument and dictionary input.\n"); goto catch; }
word = argv[1];
/* Load the dictionary from stdin and index it into a trie. */
if(!append_file(&dict, stdin) || !build_trie(&tr, &dict)) goto catch;
fprintf(stderr, "Loaded %lu trie entries.\n",(unsigned long)tr.leaves.size);
graph(&tr, "dictionary.gv");
/* Histogram the argument. */
for(i = 0; word[i] != '\0'; i++) {
unsigned char *freq;
if(word[i] & 0x80) continue; /* UTF-8 is not supported. :[ */
if(*(freq = hist + word[i]) == hist_max)
{ errno = ERANGE; goto catch; } /* "aaaaaaaaa..." x 5M? */
(*freq)++;
}
/* Might as well re-use the word now that we're finished with it; it's the
right length. */
*word = '\0', find_r(&tr, word);
fprintf(stderr, "%lu words found.\n", (unsigned long)words_found);
{ success = EXIT_SUCCESS; goto finally; }
catch:
perror("word");
finally:
/* Both paths release the trie and the dictionary text. */
trie_(&tr);
char_array_(&dict);
return success;
}
I'm pretty new to C and how would I check the duplicates of a 1D char array
for example
#define MAX_SIZE 60
Char canvas[MAX_SIZE] = {0};
for(int i=0; i<MAX_SIZE;i++){
//How do i check if there is a duplicate in that array?
}
How do I iterate through to check for duplicates, like do i have to use double for loops and do sizeOf(canavas)/SOMETHING here?
My solution, using a function:
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
/* Report whether any value occurs more than once among the first `len`
 * elements of `arr`. Every element is compared with all earlier ones, so the
 * cost grows with the square of `len`. `arr` must not be NULL. */
bool mem_hasduplicates(const char arr[], size_t len)
{
    assert(arr != NULL);
    for (size_t later = 1; later < len; ++later) {
        for (size_t earlier = 0; earlier < later; ++earlier) {
            if (arr[earlier] == arr[later]) {
                return true;
            }
        }
    }
    return false;
}
/* Demo: print 1 or 0 (in hexadecimal) for two sample arrays, depending on
 * whether they contain a repeated element. The element counts include each
 * array's terminating '\0'. */
int main() {
    const char canvas[] = "zcxabca";
    const char other_canvas[] = "abcfsd";
    const size_t canvas_len = sizeof(canvas)/sizeof(canvas[0]);
    const size_t other_len = sizeof(other_canvas)/sizeof(other_canvas[0]);

    printf("%x\n", mem_hasduplicates(canvas, canvas_len));
    printf("%x\n", mem_hasduplicates(other_canvas, other_len));
}
Live version available at onlinegdb.
#edit Or we can "just" create a histogram from all the numbers as #selbie suggested, although this got me complicated fast:
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <limits.h>
/* One histogram entry: a character value and how often it was added. */
struct histogram_value_s {
char value;
unsigned int count;
};
/* A histogram: a heap array `v` of `len` entries, one per distinct value, in
   the order in which the values were first added. */
struct histogram_s {
struct histogram_value_s *v;
size_t len;
};
#define HISTOGRAM_INIT() {0}
/* Release the entries of `t` and leave it empty. The pointer is cleared so
 * that using or finalising `t` again cannot touch freed memory. */
void histogram_fini(struct histogram_s *t)
{
    free(t->v);
    t->v = NULL;
    t->len = 0;
}
/* qsort comparator: orders two histogram entries by ascending `value`. */
static int histogram_sort_by_value_qsort_cb(const void *a0, const void *b0)
{
    const struct histogram_value_s *lhs = a0;
    const struct histogram_value_s *rhs = b0;

    assert(lhs != NULL);
    assert(rhs != NULL);

    const char lhs_value = lhs->value;
    const char rhs_value = rhs->value;
    return lhs_value - rhs_value;
}
void histogram_sort_by_value(struct histogram_s *t)
{
qsort(t->v, t->len, sizeof(*t->v), histogram_sort_by_value_qsort_cb);
}
/* qsort comparator: orders two histogram entries by ascending `count`.
 * Returns -1, 0 or 1. The counts are unsigned, so they are compared rather
 * than subtracted: a difference converted to int has the wrong sign once the
 * counts are far enough apart. */
static int histogram_sort_by_count_qsort_cb(const void *a0, const void *b0)
{
    const struct histogram_value_s *a = a0;
    const struct histogram_value_s *b = b0;
    assert(a != NULL);
    assert(b != NULL);
    return (a->count > b->count) - (a->count < b->count);
}
void histogram_sort_by_count(struct histogram_s *t)
{
qsort(t->v, t->len, sizeof(*t->v), histogram_sort_by_count_qsort_cb);
}
/* Find the entry of `t` that holds `value`.
 * On success returns 0 and stores the entry's position in `*idx` and its
 * count in `*ret0`; either pointer may be NULL to skip that output.
 * Returns -1, leaving both outputs untouched, when `value` is absent. */
int histogram_getValue_2(const struct histogram_s *t, char value, size_t *idx, unsigned int *ret0)
{
    size_t pos = 0;

    while (pos < t->len && t->v[pos].value != value) {
        ++pos;
    }
    if (pos == t->len) {
        return -1;
    }
    if (ret0 != NULL) {
        *ret0 = t->v[pos].count;
    }
    if (idx != NULL) {
        *idx = pos;
    }
    return 0;
}
void histogram_printlns_generic(const struct histogram_s *t, const char fmt[])
{
assert(t != NULL);
for (size_t i = 0; i < t->len; ++i) {
printf(fmt, t->v[i].value, t->v[i].count);
}
}
/* Count one more occurrence of `value` in `t`, appending a new entry with
 * count 1 when the value has not been seen before.
 * Returns 0 on success, or -1 when the count would exceed UINT_MAX or the
 * entry array cannot grow; `t` is unchanged in both failure cases. */
int histogram_add(struct histogram_s *t, char value)
{
    size_t found_at;
    struct histogram_value_s *grown;

    if (histogram_getValue_2(t, value, &found_at, NULL) == 0) {
        if (t->v[found_at].count == UINT_MAX) {
            return -1;
        }
        t->v[found_at].count += 1;
        return 0;
    }

    grown = realloc(t->v, (t->len + 1) * sizeof(*t->v));
    if (grown == NULL) {
        return -1;
    }
    grown[t->len].value = value;
    grown[t->len].count = 1;
    t->v = grown;
    t->len += 1;
    return 0;
}
/* Report whether any value was counted at least twice. Despite the name, a
 * count of exactly 2 already qualifies. */
bool histogram_has_any_count_greater_then_2(const struct histogram_s *t)
{
    assert(t != NULL);
    size_t pos = 0;
    while (pos < t->len && t->v[pos].count < 2) {
        ++pos;
    }
    return pos < t->len;
}
/* ----------------------------------------------------------- */
int histogram_create_from_mem(struct histogram_s *ret0, const char arr[], size_t len)
{
assert(ret0 != NULL);
assert(arr != NULL);
struct histogram_s ret = HISTOGRAM_INIT();
for (size_t i = 0; i < len; ++i) {
const char to_add = arr[i];
if (histogram_add(&ret, to_add) < 0) {
goto ERR;
}
}
*ret0 = ret;
return 0;
ERR:
histogram_fini(&ret);
return -1;
}
/* Demo: build a histogram of a sample array (terminating '\0' included) and
 * say whether any value occurs more than once. Returns -1 when the histogram
 * cannot be built. */
int main() {
    const char canvas[] = "abc";
    const size_t canvas_len = sizeof(canvas)/sizeof(canvas[0]);
    struct histogram_s h;

    if (histogram_create_from_mem(&h, canvas, canvas_len) != 0) {
        fprintf(stderr, "mem_createhistogram error!\n");
        return -1;
    }

    const char *verdict = histogram_has_any_count_greater_then_2(&h)
        ? "has"
        : "does not have";
    printf("'%s' %s duplicates\n", canvas, verdict);

    histogram_fini(&h);
}
Live version here.
#edit Or we can sort the array, and check if any two adjacent bytes are the same!
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
/* qsort comparator for single characters: the difference between the first
 * and the second character. */
int cmp_chars(const void *a, const void *b)
{
    const char *lhs = a;
    const char *rhs = b;
    return *lhs - *rhs;
}
/* Demo: sort a sample string in place, then look for two equal neighbours,
 * which after sorting is the only way a duplicate can appear. The string is
 * printed in its sorted form. */
int main() {
    char canvas[] = "abca";
    bool duplicate_found = false;

    qsort(canvas, sizeof(canvas) - 1, sizeof(canvas[0]), cmp_chars);

    for (size_t next = 1; canvas[next] != '\0' && !duplicate_found; ++next) {
        duplicate_found = (canvas[next - 1] == canvas[next]);
    }

    printf("'%s' %s duplicates\n",
           canvas,
           duplicate_found ? "has" : "does not have");
}
Live version available at onlinegdb.
If Char is just a typo for char, then this becomes relatively simple - set up a second array, indexed by character code, that keeps track of the number of occurrences of each character:
#include <limits.h>
#include <ctype.h>
...
int charCount[SCHAR_MAX+1] = {0}; // We're only going to worry about non-negative
// character codes (i.e., standard ASCII)
// [0..127]
...
/**
* This assumes that canvas is *not* a 0-terminated string, and that
* every element of the array is meaningful. If that's not the case,
* then loop on the length of the string instead of MAX_SIZE.
*/
for ( int i = 0; i < MAX_SIZE; i++ )
{
if ( canvas[i] >= 0 && canvas[i] <= SCHAR_MAX )
{
charCount[canvas[i]]++; // index into charCount by the value of canvas[i]
}
}
Then you can walk through the charCount array and print all the character values that occurred more than once:
for ( int i = 0; i <= SCHAR_MAX; i++ )
{
if ( charCount[i] > 1 )
{
/**
* If the character value is a printable character (punctuation, alpha,
* digit), print the character surrounded by single quotes - otherwise,
* print the character code as a decimal integer.
*/
printf( isprint( i ) ? "'%c': %d\n" : "%d: %d\n", i, charCount[i] );
}
}
What's that SCHAR_MAX all about, and why am I yammering about non-negative character codes in the comments?
In C, characters in the basic execution character set (digits, upper and lowercase letters, common punctuation characters) are guaranteed to have non-negative encodings (i.e., the [0..127] range of standard ASCII). Characters outside of that basic execution character set may have positive or negative values, depending on the implementation. Thus, the range of char values may be [-128..127] on some platforms and [0..255] on others.
The limits.h header defines constants for various type ranges - for characters, it defines the following constants:
UCHAR_MAX - maximum unsigned character value (255 on most platforms)
SCHAR_MIN - minimum signed character value (-128 on most platforms)
SCHAR_MAX - maximum signed character value (127 on most platforms)
CHAR_MIN - minimum character value, either 0 or SCHAR_MIN depending on platform
CHAR_MAX - maximum character value, either UCHAR_MAX or SCHAR_MAX depending on platform
To keep this code simple, I'm only worrying about character codes in the range [0..127]; otherwise, I'd have to map negative character codes onto non-negative array indices, and I didn't feel like doing that.
Both this method and the nested loop solution require some tradeoffs. The nested loop solution trades time for space, while this solution trades space for time. In this case, the additional space is fixed regardless of how large canvas becomes. In the nested loop case, time will increase with the square of the length of canvas. For short inputs, there's effectively no difference, but if canvas gets large enough, you will notice a significant decrease in performance with the nested loop solution.
I'm having issues to correct my code so that it works as I want it.
I have three arrays given in this example:
/* Item names; presumably arr[i] pairs with race[i] at the same index - confirm. */
char arr[MAX_ELEMENTS][MAX_LENGTH] = {"ABS","ABS","ABS","ACT","ACT","PPB","PPB","QQQ","QQQ"};
/* Race names, one per entry of arr. */
char race[MAX_ELEMENTS][MAX_LENGTH] = {"PARI", "PARI", "LOND", "PARI", "PARI", "CYKA", "LOND", "CYKA", "PARI"};
/* Per-index occurrence counts; no initializer is given here. */
int freq[MAX_ELEMENTS];
I wish to create a function that can count the amount of occurrences of string elements in arr[] and store them in freq[]. Apart from that I also wish to know in what race[] there have been the most occurrences of given arr[].
To demonstrate this here is an example of what output I wish to receive when the function works:
In Race [PARI] the highest occurence was [ABS] with 3 occurences!
In Race [LOND] the highest occurence was [ACT] with 1 occurences!
.....
Currently, I am able to count the occurrences of arr[] in freq[] but I can't associate them with their respective race[] and give that output..
/*
 * Counts how often each string of arr[] occurs among the first `size` entries.
 * The total for a string is stored in freq[] at the index of its first
 * occurrence; every later duplicate gets freq[j] = 0 so it is not counted again.
 * NOTE(review): the `freq[i] != 0` test below reads freq[i] for entries this
 * loop has not written yet - presumably freq[] is pre-filled with a non-zero
 * marker before this loop runs; confirm.
 */
for(i=0; i<size; i++)
{
count = 1; /* arr[i] counts itself */
for(j=i+1; j<size; j++)
{
/* If duplicate element is found */
if(strcmp(arr[i], arr[j])==0)
{
count++;
/* Make sure not to count frequency of same element again */
freq[j] = 0;
}
}
/* If frequency of current element is not counted */
if(freq[i] != 0)
{
freq[i] = count;
}
}
Giving me currently :
ABS occurs 3 times.
ACT occurs 2 times.
etc. etc...
But I don't know how to associate them with race[] and count them only within a given race.
You will probably have to use a struct here to structure your data.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#define true 1
#define len 100
#define elms 10
/* Forward declaration of the tree node type. */
struct NODE;
/* Node is a macro alias (textual substitution, not a typedef) for `struct NODE`. */
#define Node struct NODE
/* One node of the binary search tree used to count names; the tree is ordered by val. */
struct NODE {
unsigned long int val; /* ordering key; insert() and find() pass the hash of name */
int count; /* number of times this key has been inserted (starts at 1 in makeNode) */
char name[len]; /* copy of the string this node was created for */
Node *left; /* subtree for keys that are not greater than val, or NULL */
Node *right; /* subtree for keys greater than val, or NULL */
};
/*
 * Allocates and initialises a leaf node.
 *
 * str: NUL-terminated name to store; it is copied into the node and truncated
 *      if it does not fit in the node's name buffer.
 * val: ordering key to store in the node.
 *
 * Returns the new node (count == 1, no children), or NULL if the allocation
 * fails. The caller owns the returned node.
 */
Node * makeNode(char * str, unsigned long int val){
    Node * tmp = malloc(sizeof *tmp);
    if (tmp == NULL){
        return NULL;
    }
    /* snprintf always NUL-terminates and never writes past the name buffer. */
    snprintf(tmp->name, sizeof tmp->name, "%s", str);
    tmp->val = val;
    tmp->left = NULL;
    tmp->right = NULL;
    tmp->count = 1;
    return tmp;
}
/*
 * Computes a polynomial hash of a NUL-terminated string:
 *   sum over i of (unsigned char)name[i] * 19^i,
 * with all arithmetic done in unsigned long (so it wraps on overflow).
 *
 * name: string to hash; must not be NULL.
 * Returns the hash; the empty string hashes to 0.
 *
 * Integer arithmetic replaces pow(): the floating-point version lost precision
 * for longer strings, and converting an out-of-range or negative double back to
 * unsigned long is undefined behaviour. Characters are read as unsigned char so
 * that bytes above 127 contribute a non-negative value on every platform.
 */
unsigned long int getHash(char * name){
    const unsigned long int prime = 19;
    unsigned long int power = 1; /* prime^i for the current position */
    unsigned long int val = 0;
    for (size_t i = 0; name[i] != '\0'; ++i){
        val += (unsigned char)name[i] * power;
        power *= prime;
    }
    return val;
}
void insert(Node * root, char * name){
Node * newnode;
int val = getHash(name);
Node * tmp = root;
while(tmp != NULL) {
if ( tmp->val == val){
tmp->count += 1;
break;
}
if (val > tmp->val){
if( tmp->right != NULL)
tmp = tmp->right;
else{
tmp->right = makeNode(name, val);
break;
}
}else {
if( tmp->left != NULL)
tmp = tmp->left;
else{
tmp -> left = makeNode(name, val);
break;
}
}
}
}
Node * find(Node * root, char * name){
int val = getHash(name);
Node * tmp = root;
while(tmp != NULL){
if(tmp -> val == val){
return tmp;
}else if (val > tmp->val){
tmp = tmp->right;
}else{
tmp = tmp->left;
}
}
return NULL;
}
/* A race together with the list of element names associated with it. */
struct Race {
char name[len]; /* name of the race */
char elements[elms][len]; /* element names; main() stops reading at the first empty string */
};
/*
 * Sample data. Each array has elms (10) rows but only 9 initializers, so the
 * last row is an empty string; main() relies on that empty row to stop its
 * scan of arr.
 */
char arr[elms][len] = {"ABS","ABS","ABS","ACT","ACT","PPB","PPB","QQQ","QQQ"};
char race[elms][len] = {"PARI", "PARI", "LOND", "PARI", "PARI", "CYKA", "LOND", "CYKA", "PARI"};
/* Not referenced by the code visible in this section. */
int freq[elms];
void copyArray(char dest[elms][len], char src[elms][len] ){
int i = 0;
while(strlen(src[i]) > 0){
strcpy(dest[i],src[i]);
++i;
}
}
int main(){
Node * root = makeNode("root", 0);
int i = 0;
while(strlen(arr[i]) > 0){
insert(root,arr[i]);
++i;
}
i = 0;
while(strlen(arr[i]) > 0){
Node * r = find(root,arr[i]);
printf("found %s, count = %ld\n", r->name, r->count);
++i;
}
// make representation of race
struct Race r1, r2;
strcpy(r1.name, "PARI");
{
char tmp[elms][len] = { "ABS", "PPB", "QQQ" };
copyArray(r1.elements, tmp);
}
strcpy(r2.name, "LOND");
{
char tmp[elms][len] = { "ACT" };
copyArray(r2.elements, tmp);
}
struct Race races[2] = {r1, r2};
i = 0;
while(i < 2){
struct Race * current = &races[i];
printf("for %s", current->name);
Node * max = NULL;
int m = -1;
int j = 0;
while(strlen(current->elements[j]) > 0){
Node * tmp = find(root, current->elements[j]);
if( tmp != NULL && tmp->count > m) {
max = tmp;
m = tmp->count;
}
++j;
}
if (max != NULL){
printf(" max is %s : %d\n", max->name, max->count);
}else{
printf(" max is None\n");
}
++i;
}
return 0;
}
Basically, you have to structure your data and specify the links between its parts. Here I used a binary tree and a Rabin-Karp-style hashing technique to store the data efficiently.
A binary tree is a good way to solve the counting problem, since the search operation is fairly cheap, and the Rabin-Karp-style hash lets the tree compare integer hashes instead of comparing strings at every node.
I also created a struct called Race to store all the elements related to that race. So the algorithm is going to be:
let arr be array of elements
let races be array of races
for each race in races
define related element
#find occurrence now
#Binary tree will increment count if element already exist.
let binary_tree be a Binary Tree
for each element in arr
add element to binary_tree
# now we have all the elements with it's count
# let's iterate through races now
for each race in races
m = null
for element in race.elements
node = find_element_in_binary_tree(element)
if node is not null
m = max(m, node)
if m is not null then
print m
else
print not found
First, initializations, note the []s
// The first dimension is left empty so the compiler sizes each array from its initializer list.
char arr[][MAX_LENGTH] = {"ABS","ABS","ABS","ACT","ACT","PPB","PPB","QQQ","QQQ"};
char race[][MAX_LENGTH] = {"PARI","PARI","LOND","PARI","PARI","CYKA","LOND","CYKA","PARI"};
int freq[MAX_ELEMENTS]; // indexed 0..n-1 below; assumes MAX_ELEMENTS >= n - TODO confirm
int n = sizeof(arr)/sizeof(*arr); // get actual number of used items (rows in arr)
int i,j;
int max = 0; // init max to 0; holds the highest freq[] value seen
The main loop goes through arr and race, and whenever a dupe is found at [j] (after [i]), "invalidate" the dupe ("already processed") by setting its first char to 0 (empty string).
Note that j starts from i and not i+1, so that each item counts itself and freq is at least 1 even for items that have no duplicates.
// For every item i that has not been invalidated, count the entries j >= i
// that match it in both arr and race, and track the largest count in max.
for(i=0 ; i<n ; i++) {
freq[i] = 0; // ensure freq is 0 for any item
if ( ! *arr[i]) continue; // skip already processed items (their first char was set to 0 below)
for(j=i ; j<n ; j++) { // j=i, not i+1!
if (!strcmp(arr[i],arr[j]) && !strcmp(race[i],race[j])) { // same item in the same race
freq[i]++;
if (freq[i] > max) max = freq[i]; // update max if necessary
if (j > i) *arr[j] = 0; // invalidate that arr element (modifies arr in place)
}
}
}
Finally display the max appearances, including ties
// Print every item/race pair whose count equals the maximum, so ties are all listed.
printf("Items at max=%d:\n", max);
for(i=0 ; i<n ; i++) {
if (freq[i] == max) { // skipped items are never displayed (max cannot be 0)
printf("%s / %s\n", arr[i],race[i]);
}
}
(no need to check for "invalidation" as max will be >0, and all invalidated items have freq[i] == 0)