searching an lzw encoded file - c

I am looking to alter the LZW compressor to enable it to search for a word in an LZW encoded file and finds the number of matches for that search term.
For example if my file is used as
Prompt:>lzw "searchterm" encoded_file.lzw
32
Any suggestions on how to achive this?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BITS 12 /* Setting the number of bits to 12, 13*/
#define HASHING_SHIFT (BITS-8) /* or 14 affects several constants. */
#define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to */
#define MAX_CODE MAX_VALUE - 1 /* compile their code in large model if*/
/* 14 bits are selected. */
#if BITS == 16
#define TABLE_SIZE 99991
#endif
#if BITS == 14
#define TABLE_SIZE 18041 /* The string table size needs to be a */
#endif /* prime number that is somewhat larger*/
#if BITS == 13 /* than 2**BITS. */
#define TABLE_SIZE 9029
#endif
#if BITS <= 12
#define TABLE_SIZE 5021
#endif
void *malloc();
int *code_value; /* This is the code value array */
unsigned int *prefix_code; /* This array holds the prefix codes */
unsigned char *append_character; /* This array holds the appended chars */
unsigned char decode_stack[4000]; /* This array holds the decoded string */
/*
* Forward declarations
*/
void compress(FILE *input,FILE *output);
void expand(FILE *input,FILE *output);
int find_match(int hash_prefix,unsigned int hash_character);
void output_code(FILE *output,unsigned int code);
unsigned int input_code(FILE *input);
unsigned char *decode_string(unsigned char *buffer,unsigned int code);
/********************************************************************
**
** This program gets a file name from the command line. It compresses the
** file, placing its output in a file named test.lzw. It then expands
** test.lzw into test.out. Test.out should then be an exact duplicate of
** the input file.
**
*************************************************************************/
main(int argc, char *argv[])
{
FILE *input_file;
FILE *output_file;
FILE *lzw_file;
char input_file_name[81];
char command;
command=(argv==3);
/*
** The three buffers are needed for the compression phase.
*/
code_value=(int*)malloc(TABLE_SIZE*sizeof(int));
prefix_code=(unsigned int *)malloc(TABLE_SIZE*sizeof(unsigned int));
append_character=(unsigned char *)malloc(TABLE_SIZE*sizeof(unsigned char));
if (code_value==NULL || prefix_code==NULL || append_character==NULL)
{
printf("Fatal error allocating table space!\n");
exit(-1);
}
/*
** Get the file name, open it up, and open up the lzw output file.
*/
if (argc>1)
strcpy(input_file_name,argv[1]);
else
{
printf("Input file name? ");
scanf("%s",input_file_name);
}
input_file=fopen(input_file_name,"rb");
lzw_file=fopen("test.lzw","wb");
if (input_file==NULL || lzw_file==NULL)
{
printf("Fatal error opening files.\n");
exit(-1);
};
/*
** Compress the file.
*/
if(command=='r')
{
compress(input_file,lzw_file);
}
fclose(input_file);
fclose(lzw_file);
free(c-ode_value);
/*
** Now open the files for the expansion.
*/
lzw_file=fopen("test.lzw","rb");
output_file=fopen("test.out","wb");
if (lzw_file==NULL || output_file==NULL)
{
printf("Fatal error opening files.\n");
exit(-2);
};
/*
** Expand the file.
*/
expand(lzw_file,output_file);
fclose(lzw_file);
fclose(output_file);
free(prefix_code);
free(append_character);
}
/*
** This is the compression routine. The code should be a fairly close
** match to the algorithm accompanying the article.
**
*/
void compress(FILE *input,FILE *output)
{
unsigned int next_code;
unsigned int character;
unsigned int string_code;
unsigned int index;
int i;
next_code=256; /* Next code is the next available string code*/
for (i=0;i<TABLE_SIZE;i++) /* Clear out the string table before starting */
code_value[i]=-1;
i=0;
printf("Compressing...\n");
string_code=getc(input); /* Get the first code */
/*
** This is the main loop where it all happens. This loop runs util all of
** the input has been exhausted. Note that it stops adding codes to the
** table after all of the possible codes have been defined.
*/
while ((character=getc(input)) != (unsigned)EOF)
{
if (++i==1000) /* Print a * every 1000 */
{ /* input characters. This */
i=0; /* is just a pacifier. */
printf("*");
}
index=find_match(string_code,character);/* See if the string is in */
if (code_value[index] != -1) /* the table. If it is, */
string_code=code_value[index]; /* get the code value. If */
else /* the string is not in the*/
{ /* table, try to add it. */
if (next_code <= MAX_CODE)
{
code_value[index]=next_code++;
prefix_code[index]=string_code;
append_character[index]=character;
}
output_code(output,string_code); /* When a string is found */
string_code=character; /* that is not in the table*/
} /* I output the last string*/
} /* after adding the new one*/
/*
** End of the main loop.
*/
output_code(output,string_code); /* Output the last code */
output_code(output,MAX_VALUE); /* Output the end of buffer code */
output_code(output,0); /* This code flushes the output buffer*/
printf("\n");
}
/*
** This is the hashing routine. It tries to find a match for the prefix+char
** string in the string table. If it finds it, the index is returned. If
** the string is not found, the first available index in the string table is
** returned instead.
*/
int find_match(int hash_prefix,unsigned int hash_character)
{
int index;
int offset;
index = (hash_character << HASHING_SHIFT) ^ hash_prefix;
if (index == 0)
offset = 1;
else
offset = TABLE_SIZE - index;
while (1)
{
if (code_value[index] == -1)
return(index);
if (prefix_code[index] == hash_prefix &&
append_character[index] == hash_character)
return(index);
index -= offset;
if (index < 0)
index += TABLE_SIZE;
}
}
/*
** This is the expansion routine. It takes an LZW format file, and expands
** it to an output file. The code here should be a fairly close match to
** the algorithm in the accompanying article.
*/
void expand(FILE *input,FILE *output)
{
unsigned int next_code;
unsigned int new_code;
unsigned int old_code;
int character;
int counter;
unsigned char *string;
next_code=256; /* This is the next available code to define */
counter=0; /* Counter is used as a pacifier. */
printf("Expanding...\n");
old_code=input_code(input); /* Read in the first code, initialize the */
character=old_code; /* character variable, and send the first */
putc(old_code,output); /* code to the output file */
/*
** This is the main expansion loop. It reads in characters from the LZW file
** until it sees the special code used to inidicate the end of the data.
*/
while ((new_code=input_code(input)) != (MAX_VALUE))
{
if (++counter==1000) /* This section of code prints out */
{ /* an asterisk every 1000 characters */
counter=0; /* It is just a pacifier. */
printf("*");
}
/*
** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING
** case which generates an undefined code. It handles it by decoding
** the last code, and adding a single character to the end of the decode string.
*/
if (new_code>=next_code)
{
*decode_stack=character;
string=decode_string(decode_stack+1,old_code);
}
/*
** Otherwise we do a straight decode of the new code.
*/
else
string=decode_string(decode_stack,new_code);
/*
** Now we output the decoded string in reverse order.
*/
character=*string;
while (string >= decode_stack)
putc(*string--,output);
/*
** Finally, if possible, add a new code to the string table.
*/
if (next_code <= MAX_CODE)
{
prefix_code[next_code]=old_code;
append_character[next_code]=character;
next_code++;
}
old_code=new_code;
}
printf("\n");
}
/*
** This routine simply decodes a string from the string table, storing
** it in a buffer. The buffer can then be output in reverse order by
** the expansion program.
*/
unsigned char *decode_string(unsigned char *buffer,unsigned int code)
{
int i;
i=0;
while (code > 255)
{
*buffer++ = append_character[code];
code=prefix_code[code];
if (i++>=MAX_CODE)
{
printf("Fatal error during code expansion.\n");
exit(-3);
}
}
*buffer=code;
return(buffer);
}
/*
** The following two routines are used to output variable length
** codes. They are written strictly for clarity, and are not
** particularyl efficient.
*/
unsigned int input_code(FILE *input)
{
unsigned int return_value;
static int input_bit_count=0;
static unsigned long input_bit_buffer=0L;
while (input_bit_count <= 24)
{
input_bit_buffer |=
(unsigned long) getc(input) << (24-input_bit_count);
input_bit_count += 8;
}
return_value=input_bit_buffer >> (32-BITS);
input_bit_buffer <<= BITS;
input_bit_count -= BITS;
return(return_value);
}
void output_code(FILE *output,unsigned int code)
{
static int output_bit_count=0;
static unsigned long output_bit_buffer=0L;
output_bit_buffer |= (unsigned long) code << (32-BITS-output_bit_count);
output_bit_count += BITS;
while (output_bit_count >= 8)
{
putc(output_bit_buffer >> 24,output);
output_bit_buffer <<= 8;
output_bit_count -= 8;
}
}

Here's a document on an algorithm to do regex searching directly in LZW compressed bytes :
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.1434&rep=rep1&type=pdf
It contains references to efficient algorithms to search for exact strings as well.

Related

What is the safest way to hash a list of non-repeating integers without taking order into account?

I am looking for a hash function, that can hash a list of non-repeating integers while ignoring the order of them.
Example
I want the two lists
l1 = [0, 1, 3, 7]
l2 = [7, 3, 1, 0]
to have the same hash.
Background
I have an algorithm that finds a list of vertices on a graph. In an undirected graph, the algorithm will find certain lists multiple times in different orders. With my current understanding of the algorithm, it is easier to filter out the duplicates rather than re-inventing the algorithm. For performance reasons, I understand it to be easier to hash the found lists of vertices rather than comparing the whole lists.
Possible answers
Now, I see that
an XOR or a simple sum might be an answer.
Unfortunately, both offer too much potential for hash collisions, as I see it.
The not-very-efficient working method is to sort a list, and then use this sorted list to compare the new list (also sorted) against.
Other Thoughts
Given that
The lists contain only integers.
The integers will be the vertex indices, and the graph can have billions of vertices.
The integers in a list are non-repeating, and their order doesn't matter.
The lists can and will consist of between 2 and 100 (and in some cases > 1000) entries.
No need for cryptographically-secure randomness.
I have this feeling that there should be a relatively easy and straight-forward answer, and I just have not found it.
Use a combination of the product, sum and ^. All are communitive (order independent) with unsigned math.
unsigned long long product = 1;
unsigned sum = 0; // Maybe unsigned long long
unsigned x = 0;
for (i=0; i < array_element_count; i++) {
product *= l[i];
sum += l[i];
x ^= l[i];
}
unsigned long long pre_hash = product + sum + ((unsigned long long) x << 32));
unsigned hash = pre_hash % hash_table_size;
Tip: hash_table_size should be a prime to effectively use all pre_hash bits.
If array_element_count was high, I would consider p *= shift_right_until_odd(l[i]), else p will too often become 0.
If l[i] == 0 p *= l[i] deserves something different. A simple mitigation is p *= l[i] | 1, but that is something pulled out of the air.
Hashing takes time for good design and the above are candidate building blocks for OP.
Any CRC will do the job. Just XOR (I have used 64bit numbers, but 32bits crc, but it should work also with full 64 xor/crc or 32bit xor/crc) the elements together (to eliminate any order between them, as the XOR operation is conmutative, you eliminate the dependency on the order) mod 2&31, then take a CRC32 of the result (that will spread the set of values uniformly, as it warrants ---or tries to--- that a change in one bit will affect half of the bits in the result) See here for sample code and several crc tables. The repository is BSD license, so you can use it as desired.
Below is a sample implementation that generates random lists, and reorders them, comparing their hashes:
crc32ieee8023.h
#ifndef CRC32IEEE8023_H
#define CRC32IEEE8023_H
#include "crc.h"
extern CRC_STATE crc32ieee8023[];
#endif /* CRC32IEEE8023_H */
crc.h
#ifndef CRC_H
#define CRC_H
#include <stdlib.h>
#include <stdint.h>
#define CRC_TABLE_SIZE 256
#define CRC_BYTE_SIZE 8
#define CRC_BYTE_MASK 0xff
typedef uint8_t CRC_BYTE;
typedef uint64_t CRC_STATE;
CRC_STATE do_crc(
CRC_STATE state,
CRC_BYTE *buff,
size_t nbytes,
CRC_STATE *table);
#endif /* CRC_H */
test_xor_crc_hash.c
(This is the important file, where all the stuff is included.)
/* test_crc_table -- program to test a crc hash algorithm that
* checks a list of numbers and generates the same crc in a form
* that is independent on the list order presented.
* Program generates a list of random numbers (32bit) then it
* generates a random permutation of the list and a sorted list,
* calculates the hash over the three lists, and compares them.
*/
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "crc.h"
#include "crc32ieee8023.h"
#define DFLT_N 10
#define RANDOM_DEV "/dev/urandom"
int long_compare(
const void *_a,
const void *_b);
void print(
const char *name,
const uint64_t *v,
int vsz,
CRC_STATE crc,
uint64_t xor);
int
main(int argc, char **argv)
{
int opt;
int n = DFLT_N,
res;
/* process options */
while ((opt = getopt(argc, argv, "n:")) != EOF) {
switch (opt) {
case 'n': res = sscanf(optarg, "%u", &n);
if (res != 1) {
fprintf(stderr,
"%s: invalid format (-n)\n",
optarg);
}
break;
} /* switch */
} /* while */
/* initialization of random number generator */
unsigned short random_state[3];
int fd = open(RANDOM_DEV, O_RDONLY);
if (fd < 0) {
fprintf(stderr,
"open: %s: %s\n",
RANDOM_DEV, strerror(errno));
exit(EXIT_FAILURE);
}
res = read(fd, random_state, sizeof random_state);
if (res < 0) { /* error */
fprintf(stderr,
"read: %s: %s\n",
RANDOM_DEV, strerror(errno));
exit(EXIT_FAILURE);
}
if (res < sizeof random_state) {
fprintf(stderr,
"read: %s: incomplete read (%d/%zd)\n",
RANDOM_DEV, res, sizeof random_state);
exit(EXIT_FAILURE);
}
seed48(random_state);
close(fd);
/* generate a list of random numbers and make two copies */
uint64_t *original = calloc(n, sizeof *original),
*copy_sorted = calloc(n, sizeof *copy_sorted),
*random_sort = calloc(n, sizeof *random_sort);
/* make two copies */
for (int i = 0; i < n; i++) {
original[i] = copy_sorted[i]
= random_sort[i]
= (long)lrand48() | ((long)lrand48() << 32);
}
/* sort the numbers */
qsort(copy_sorted, n, sizeof *copy_sorted, long_compare);
/* and random permutation */
for (int i = 0; i < n-1; i++) {
int j = lrand48() % (n - i);
if (i != j) {
uint64_t temp = random_sort[i];
random_sort[i] = random_sort[j];
random_sort[j] = temp;
}
}
/* calculate the sorts */
uint64_t xor_original = 0, xor_sorted = 0, xor_random = 0;
for (int i = 0; i < n; i++) {
xor_original ^= original[i];
xor_sorted ^= copy_sorted[i];
xor_random ^= random_sort[i];
}
/* now, calculate the crc's (a crc64 would be better for long) */
CRC_STATE
crc_original = do_crc(0xffffffff, (unsigned char *)&xor_original,
sizeof xor_original, crc32ieee8023),
crc_sorted = do_crc(0xffffffff, (unsigned char *)&xor_sorted,
sizeof xor_sorted, crc32ieee8023),
crc_random = do_crc(0xffffffff, (unsigned char *)&xor_random,
sizeof xor_random, crc32ieee8023);
print("original", original, n, crc_original, xor_original);
print(" sorted", copy_sorted, n, crc_sorted, xor_sorted);
print(" random", random_sort, n, crc_random, xor_random);
if (crc_original != crc_sorted || crc_sorted != crc_random) {
fprintf(stderr, "crc's don't match (crc_original == 0x%08lx, "
"crc_sorted == 0x%08lx, crc_random == 0x%08lx)\n",
crc_original, crc_sorted, crc_random);
}
/* change only one bit in one element to see how it changes the hash */
int bit_to_change = lrand48() % (n * 64),
elem_to_change = bit_to_change % n;
bit_to_change %= 64;
original[elem_to_change] ^= (1UL << bit_to_change); /* change the bit */
/* we should do the calculation over all elements, but just
* changing a bit in one element will change just the same bit in the
* xor_original accumulation variable */
uint64_t xor_original_new = xor_original;
xor_original_new ^= (1UL << bit_to_change);
printf("element=%d, bit=%d\n", elem_to_change, bit_to_change);
uint64_t crc_original_new = do_crc(0xffffffff, (unsigned char *)&xor_original_new, sizeof xor_original_new, crc32ieee8023);
print(" chg1bit", original, n, crc_original_new, xor_original_new);
}
int long_compare(const void *_a, const void *_b)
{
const uint64_t *a = _a, *b = _b;
return *a == *b
? 0
: *a > *b
? +1
: -1;
}
void print(const char *name, const uint64_t *v, int vsz, CRC_STATE crc, uint64_t xor)
{
printf("%s: { ", name);
char *sep = "";
for (int i = 0; i < vsz; i++) {
printf("%s0x%016lx", sep, v[i]);
sep = ", ";
}
printf(" }\n"
" xor = 0x%016lx, crc = 0x%08lx\n",
xor, crc);
}
crc.c
#include <sys/types.h>
#include "crc.h"
/* table based CRC calculation */
CRC_STATE do_crc(
CRC_STATE state,
CRC_BYTE *buff,
size_t nbytes,
CRC_STATE *table)
{
CRC_STATE index;
while (nbytes--) {
state ^= *buff++;
index = state & CRC_BYTE_MASK;
state >>= CRC_BYTE_SIZE;
state ^= table[index];
} /* while */
return state;
} /* do_crc */
crc32ieee8023.c
#include "crc.h"
/* variables */
CRC_STATE crc32ieee8023[] = {
/* Comando usado: mkcrc -gpedb88320 */
/* Polinomio: x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1 */
/* 0 */ 0x0, 0x77073096, 0xee0e612c, 0x990951ba,
/* 4 */ 0x76dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
/* 8 */ 0xedb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
/* 12 */ 0x9b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
/* 16 */ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
/* 20 */ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
/* 24 */ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
/* 28 */ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
/* 32 */ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
/* 36 */ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
/* 40 */ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
/* 44 */ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
/* 48 */ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
/* 52 */ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
/* 56 */ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
/* 60 */ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
/* 64 */ 0x76dc4190, 0x1db7106, 0x98d220bc, 0xefd5102a,
/* 68 */ 0x71b18589, 0x6b6b51f, 0x9fbfe4a5, 0xe8b8d433,
/* 72 */ 0x7807c9a2, 0xf00f934, 0x9609a88e, 0xe10e9818,
/* 76 */ 0x7f6a0dbb, 0x86d3d2d, 0x91646c97, 0xe6635c01,
/* 80 */ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
/* 84 */ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
/* 88 */ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
/* 92 */ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
/* 96 */ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
/* 100 */ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
/* 104 */ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
/* 108 */ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
/* 112 */ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
/* 116 */ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
/* 120 */ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
/* 124 */ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
/* 128 */ 0xedb88320, 0x9abfb3b6, 0x3b6e20c, 0x74b1d29a,
/* 132 */ 0xead54739, 0x9dd277af, 0x4db2615, 0x73dc1683,
/* 136 */ 0xe3630b12, 0x94643b84, 0xd6d6a3e, 0x7a6a5aa8,
/* 140 */ 0xe40ecf0b, 0x9309ff9d, 0xa00ae27, 0x7d079eb1,
/* 144 */ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
/* 148 */ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
/* 152 */ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
/* 156 */ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
/* 160 */ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
/* 164 */ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
/* 168 */ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
/* 172 */ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
/* 176 */ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
/* 180 */ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
/* 184 */ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
/* 188 */ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
/* 192 */ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x26d930a,
/* 196 */ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x5005713,
/* 200 */ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0xcb61b38,
/* 204 */ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0xbdbdf21,
/* 208 */ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
/* 212 */ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
/* 216 */ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
/* 220 */ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
/* 224 */ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
/* 228 */ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
/* 232 */ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
/* 236 */ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
/* 240 */ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
/* 244 */ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
/* 248 */ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
/* 252 */ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
}; /* crc32ieee8023 */
Makefile
targets = test_xch
toclean = $(targets)
test_xch_deps =
test_xch_objs = crc32ieee8023.o crc.o test_xor_crc_hash.o
test_xch_libs =
test_xch_ldfl =
toclean += $(test_xch_objs)
all: $(targets)
clean:
$(RM) $(toclean)
test_xch: $(test_xch_deps) $(test_xch_objs)
$(CC) $(LDFLAGS) $($#_ldfl) -o $# $($#_objs) $($#_libs) $(LIBS)
To make the program, just run:
$ make
and to run it, you can use option -n that allows you to specify the number of random elements to generate.
I think you will have to invent one to avoid the slow sorting option. In addition to XOR and arithmetic addition, there are bit rotations, and bit masks you could use. If you need high collision resistance, you could just combine more than one of the hash functions. e.g. Assuming the d_i and arithmetic are modular like with uint32_t for example,
H_1 = sum_{i = 1 to n} d_i
H_2 = xor_{i = 1 to n} d_i
H_3 = xor_{i = 1 to n} (rotl(d_i, d_i & 0x1f) + c)
Then take H1H2H3 as a 12 byte hash.

How can I resolve the collision in the hashing in this code I did? Currently cannot search for NG CHEA YEAT's ID only

I have the following text file
1171203258:HOSSAIN, MARUF
1181202660:KUHAN RAJ A/L TAMIL CHEL WAM
1181203465:PONG KAI SUN
1191102443:FAIZA OSAMA ABDALLA HASHIM
1201302289:LEE JIA WEI
1201302368:SHEIKH, AHNAF AZMAIN
1201100584:HI CHIA LING
1201101509:NG CHEA YEAT
1191103201:PHUAH CHEE HAOU
1201100879:MOSTAFA ARABY MADBOULY AHMED
1191103215:TONG JUN YANG
1191103119:ANG QIZHENG
1171302286:DARWIN KUMAR A/L MUNIAN
1181101192:HAIZUN NAJWA BINTI MOHD RIFIN
1201100926:NG XUE NIE
1191302417:ALMARHOON, ALI HUSSAIN A
1201100225:HEMAN RAO A/L SUBRAMANIAM
1181100823:LIM ZHEN BANG
1161202587:SOHEIL PRAKASAN SUPPAN
1201100603:AVINASH MURALI
1181101858:CHEAH KOK YEW
1191103071:GAN WEI TONG
1201100301:KEVIN THAM ZHENG YIT
1201100648:LIM CHER AIK
1201302222:SHIVAA RUTRAN A/L NAGATHEESAN
1201100779:TAN WEI XIANG
1191100919:WONG HONG WEI
The code I have for now, work well but have collision in the hashing I think
Here is what I have so far:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define MDIR 27 //size of list
#define MBUFF 256
#define MHASH 109 //hash function is %109
#define MNAME 40
struct List{
char name[40];
int studID;
};
//function prototype
int comparator(const void* p, const void* q){
return strcmp(((struct List*)p)->name,((struct List*)q)->name);
}
int readData(struct List dir[]);
int hashfunc(char *name);
void hash(struct List dir[], int ndir,
int hashtable[]);
int search(char *key,
struct List s[], int hashtable[]);
//main function
int main(){
int ndir, result, hashtable[MHASH];
int count;
int i;
int j;
struct List s[27];
char temp[27];
char query[40];
FILE *fptr;
fptr = fopen("rec.txt", "r+");
if (fptr != NULL) {
printf("File created successfully!\n");
}
else {
printf("Failed to create the file.\n");
// exit status for OS that an error occurred
return -1;
}
for(count = 0; count < 27; count++){
fscanf(fptr,"%d", &s[count].studID);
fgets(s[count].name,40,fptr);
}
qsort
qsort(s,27,sizeof(struct List),comparator);
printing the sorted name then continue the hashing of searching
//printing sorted name
printf("Sorted Names\n");
for(i=0;i<27;i++){
printf("%d%s\n", i+1, s[i].name);
}
fclose(fptr);
hashing of searching part
ndir=readData(s);
hash(s,ndir,hashtable);
puts("\nName to search>>");
fgets(query,MNAME-1,stdin);
query[strlen(query)-1]='\0';
result=search(query,s,hashtable);
if(result==-1)
printf("Not Found");
else
printf("%s's ID is %d\n",
s[result].name, s[result].studID);
return 0;
}
read function
int readData(struct List dir[]){
FILE *fdir=fopen("rec.txt","r");
char buff[MBUFF];
int i=0;
while(i<MDIR && fgets(buff,MBUFF-1,fdir)){
dir[i].studID=atol(strtok(buff,":"));
strcpy(dir[i].name,strtok(NULL, "\n"));
i++;
}
return(i);
}
hash function
int hashfunc(char *name){
long sum=0;
int k=0;
while(name[k]){
sum+=name[k];
k++;
}
return( (int) (sum % MHASH) );
}
hash function
void hash(struct List dir[], int ndir,
int hashtable[]){
int k;
int index;
for(k=0;k<ndir;k++){
index = hashfunc(dir[k].name);
hashtable[index]=k;
}
}
search function
int search(char *key, struct List dir[],
int hashtable[]){
int index=hashfunc(key);
int k=hashtable[index];
if(strcmp(key,dir[k].name)==0)
return(k);
else
return(-1);
}
I am not sure for the hashing of searching part
Whenever faced with a need to separate fields in a line of data, the normal approach is to read an entire line of data as a string into a buffer (character array). Then you separate what you need from the buffer using whatever method fits the data the best. Either using a pair of pointers to bracket the text you need and then copying the characters between the pointers. You can automate the process using string functions like strchr() to locate the ':' in the buffer. You can also use string functions like strtok() to split the buffer into tokens on any given set of delimiters.
However here there is an even simpler method. Since you have a fixed format for the studID and name in the line, you can simply use sscanf(), e.g.
#include <stdio.h>
#include <stdlib.h>
#define MXSTUD 30 /* if you need a constant, #define one (or more) */
#define MXNAME 40
typedef struct list { /* adding typedef for convenience */
char name[MXNAME];
unsigned studID;
} list;
...
int main (int argc, char **argv) {
int count = 0; /* count of students */
char buf[MXNAME * 2]; /* temprorary storage for line */
list s[MXSTUD] = {{ .name = "" }}; /* list array initialized all 0 */
/* open filename given as 1st argument or "rec.text" if none given */
FILE *fptr = fopen (argc > 1 ? argv[1] : "rec.text", "r");
if (!fptr) { /* validate file open for reading */
fputs ("error: file open failed\n", stderr);
return 1;
}
while (fgets (buf, sizeof buf, fptr)) { /* read each line into buf */
/* separate studID and name using sscanf() */
if (sscanf (buf, "%u:%39[^\n]", &s[count].studID, s[count].name) == 2) {
count += 1; /* increment count on success */
}
}
...
That's all that is needed to read each line of data and separate the line into studID and name storing each in an element of the list array of struct.
Use qsort() For Sorting
Regardless of whether you have an array or allocated block of memory containing objects, qsort() provides a simple and efficient way to sort it. All you need to do is write a compare() function telling qsort() how to compare the elements. The declaration for the qsort() compare function is:
int compare (const void *a, const void *b);`
Where a and b are simple pointers-to elements of your array to be compared. So when writing the function, all you need to do is cast a and b to the proper type and write the logic to compare whatever you like in the two elements. A negative return means a sorts before b and a positive return means b sorts before a. A zero return means the elements are equal.
Casting the a and b to type const list * (you include const since the data isn't modified which allows the compiler freedom to optimize more fully), you simply loop over each name comparing characters and returning when two characters differ or the end of file is reached. Here, to sort your s[] array by name you can do:
/* qsort compare function lexagraphically sorts words */
int compare (const void *a, const void *b)
{
/* a & b are pointers to adjacent list elements, (pointers to list) */
const list *sa = (const list *)a,
*sb = (const list *)b;
const char *na = sa->name, /* pointers to name in each element */
*nb = sb->name;
/* loop advancing a character in each word per-iteration */
for (;; na++, nb++) {
/* if characters differ or at end of either */
if (*na != *nb || !*na)
break;
}
return (*na > *nb) - (*na < *nb); /* return sort order */
}
Then to sort your array of list (your s[] array) with qsort(), all that is needed is:
qsort (s, count, sizeof *s, compare); /* sort array by name */
Putting it all together in a short program that reads from the filename given as the first argument to the program (or from "rec.text" by default if no argument is given), you can do:
#include <stdio.h>
#include <stdlib.h>
#define MXSTUD 30 /* if you need a constant, #define one (or more) */
#define MXNAME 40
typedef struct list { /* adding typedef for convenience */
char name[MXNAME];
unsigned studID;
} list;
/* qsort compare function lexagraphically sorts words */
int compare (const void *a, const void *b)
{
/* a & b are pointers to adjacent list elements, (pointers to list) */
const list *sa = (const list *)a,
*sb = (const list *)b;
const char *na = sa->name, /* pointers to name in each element */
*nb = sb->name;
/* loop advancing a character in each word per-iteration */
for (;; na++, nb++) {
/* if characters differ or at end of either */
if (*na != *nb || !*na)
break;
}
return (*na > *nb) - (*na < *nb); /* return sort order */
}
int main (int argc, char **argv) {
int count = 0; /* count of students */
char buf[MXNAME * 2]; /* temprorary storage for line */
list s[MXSTUD] = {{ .name = "" }}; /* list array initialized all 0 */
/* open filename given as 1st argument or "rec.text" if none given */
FILE *fptr = fopen (argc > 1 ? argv[1] : "rec.text", "r");
if (!fptr) { /* validate file open for reading */
fputs ("error: file open failed\n", stderr);
return 1;
}
while (fgets (buf, sizeof buf, fptr)) { /* read each line into buf */
/* separate studID and name using sscanf() */
if (sscanf (buf, "%u:%39[^\n]", &s[count].studID, s[count].name) == 2) {
count += 1; /* increment count on success */
}
}
qsort (s, count, sizeof *s, compare); /* sort array by name */
for (int i = 0; i < count; i++) { /* output results */
printf ("%2d %10u %s\n", i + 1, s[i].studID, s[i].name);
}
}
(note: you simply need to open the file in read mode "r")
Example Use/Output
With your data in a file named dat/studIDlist.txt, for the 27 students in your data you would get:
$ ./bin/studIDlist dat/studIDlist.txt
1 1191302417 ALMARHOON, ALI HUSSAIN A
2 1191103119 ANG QIZHENG
3 1201100603 AVINASH MURALI
4 1181101858 CHEAH KOK YEW
5 1171302286 DARWIN KUMAR A/L MUNIAN
6 1191102443 FAIZA OSAMA ABDALLA HASHIM
7 1191103071 GAN WEI TONG
8 1181101192 HAIZUN NAJWA BINTI MOHD RIFIN
9 1201100225 HEMAN RAO A/L SUBRAMANIAM
10 1201100584 HI CHIA LING
11 1171203258 HOSSAIN, MARUF
12 1201100301 KEVIN THAM ZHENG YIT
13 1181202660 KUHAN RAJ A/L TAMIL CHEL WAM
14 1201302289 LEE JIA WEI
15 1201100648 LIM CHER AIK
16 1181100823 LIM ZHEN BANG
17 1201100879 MOSTAFA ARABY MADBOULY AHMED
18 1201101509 NG CHEA YEAT
19 1201100926 NG XUE NIE
20 1191103201 PHUAH CHEE HAOU
21 1181203465 PONG KAI SUN
22 1201302368 SHEIKH, AHNAF AZMAIN
23 1201302222 SHIVAA RUTRAN A/L NAGATHEESAN
24 1161202587 SOHEIL PRAKASAN SUPPAN
25 1201100779 TAN WEI XIANG
26 1191103215 TONG JUN YANG
27 1191100919 WONG HONG WEI
You will have to get line by line your file and store it in an array.
FILE *fp = fopen("lorem.txt", "r");
if(fp == NULL) {
perror("Unable to open file!");
exit(1);
}
char chunk[128];
while(fgets(chunk, sizeof(chunk), fp) != NULL) {
fputs(chunk, stdout);
fputs("|*\n", stdout); // marker string used to show where the content the chunk array has ended
}
fclose(fp);
To split each line use strtok() function:
char *token = strtok(line, ":"); // To separate the first block from the second like seen on your image.
char *token[1] = strtok(token, ","); // To separate the other part

How to send and receive data through UART between two Arduino Unos

I have been working through this problem for quite some time now and have succeeded in getting a partial mark. I would like to know what is wrong with the code that I have, that is preventing me from succeeding under certain conditions
I need one arduino to communicate with another one by sending a string of characters. So far I have succeeded in sending and receiving some data but think that I may be having an issue with the buffer I have set up in my uart_receive_string() function. I will provide all of the necessary information and code needed in order to test this, just let me know if any more info is required and Ill be happy to provide.
Here is a link to the tinkercad driver: https://www.tinkercad.com/things/eUZqkaIHp6J
Just click "Copy and Tinker" and hit the code button up top in order to paste the below code into it. You will need to paste the code into both ardunios by selecting them via the drop down box.
This is the criteria for the question I am working on:
This is the output I should receive in the test driver provided:
Here is the current code that I have implemented:
It is what needs to be copied into tinkercad for both arduino's
#include <stdint.h>
#include <stdio.h>
#include <avr/io.h>
#include <avr/interrupt.h>
#include <util/delay.h>
void uart_putbyte(unsigned char data);
int uart_getbyte(unsigned char *buffer);
/*
** Define a function named uart_send_string which transmits the contents of
** a standard C string (i.e. a null-terminated char array) over UART. The
** function should iterate over the characters in the array, using a cast to
** convert each to an unsigned char, and transmitting the resulting byte via
** uart_putbyte. The end of the string should be signalled by sending a single
** null byte. That is, the number 0, not the character '0'.
**
** Param: str - string to be transmitted.
**
** Returns: Nothing.
*/
// vvvvvvv I need help with this vvvvvvv
void uart_send_string(char str[])
{
int i = 0;
char ch;
do{
ch = str[i];
uart_putbyte(ch);
i++;
}while(ch != '\0');
}
/*
** Define a function named uart_receive_string which uses uart_getbyte to fetch
** the contents of a standard C string (i.e. a null-terminated char array)
** from UART. The function should wait for characters, and must not return
** until a complete string has been retrieved.
**
** Note that uart_getbyte will return 1 if a byte is available, and zero
** otherwise. Therefore, to fetch a byte and store it in a variable named x,
** you will need to use a construct of the form:
** unsigned char x;
** while (! uart_getbyte(&x)) {
** // Do nothing.
** }
**
** Param: buffer - a char array which has capacity to store a string
** containing at most (buff_len-1) characters. If more than (buff_len-1)
** characters are received, the first (buff_len-1) of them should be
** stored consecutively in the buffer, and any others discarded. The
** string must be terminated correctly with a null terminator in all
** circumstances.
**
** Param: buff_len - an int which specifies the capacity of the buffer.
**
** Returns: Nothing. However, up to buff_len elements of buffer may have been
** overwritten by incoming data.
*/
//vvvvvvv I need help with this vvvvvvv
void uart_receive_string(char buffer[], int buff_len)
{
int i = 0;
unsigned char ch;
while(!uart_getbyte(&ch))
{
if(ch == 0)
{
break;
}
if(i < buff_len-1)
{
ch = buffer[i];
uart_putbyte(ch);
i++;
}
}
buffer[i]=0;
}
/*
***************************************************************************
** Initialise UART.
***************************************************************************
*/
void uart_init(void) {
UBRR0 = F_CPU / 16 / 9600 - 1;
UCSR0A = 0;
UCSR0B = (1 << RXEN0) | (1 << TXEN0);
UCSR0C = (3 << UCSZ00);
}
/*
**************************************************************************
** Send one byte, protecting against overrun in the transmit buffer.
**
** Param: data - a byte to be transmitted.
**
** Returns: Nothing.
***************************************************************************
*/
#ifndef __AMS__
void uart_putbyte(unsigned char data) {
// Wait for empty transmit buffer
while (!(UCSR0A & (1 << UDRE0)));
// Send data by assigning into UDR0
UDR0 = data;
}
#endif
/*
***************************************************************************
** Attempt to receive one byte, returning immediately to sender.
**
** Param: buffer - the address of a byte in which a result may be stored.
**
** Returns: If a byte is available returns 1 and stores the incoming byte in
** location referenced by buffer. Otherwise returns 0 and makes no other
** change to the state.
***************************************************************************
*/
#ifndef __AMS__
int uart_getbyte(unsigned char *buffer) {
// If receive buffer contains data...
if (UCSR0A & (1 << RXC0)) {
// Copy received byte from UDR0 into memory location (*buffer)
*buffer = UDR0;
//
return 1;
}
else {
return 0;
}
}
#endif
/*
***************************************************************************
** Implement main event loop.
***************************************************************************
*/
void process() {
// Use two devices, as indicated in the supplied TinkerCad model. One
// device acts as the sender (is_sender = 1), the other as receiver
// (is_sender = 0). Change this to set the role accordingly.
const int is_sender = 1;
if (is_sender) {
static char * messages_to_send[] = {
"", // Empty string
"A", // String with one symbol.
"Hello from CAB202!", // Multiple symbols
"1234567890abcdefghijklmnopqrstuvwxyz", // Longer than buffer size.
NULL, // End of list
};
static int next_message = 0;
uart_send_string(messages_to_send[next_message]);
next_message ++;
if (messages_to_send[next_message] == NULL) next_message = 0;
_delay_ms(300);
}
else {
#define BUFF_SIZE 20
char buffer[BUFF_SIZE];
uart_receive_string(buffer, BUFF_SIZE);
uart_send_string(buffer);
uart_putbyte('\r');
uart_putbyte('\n');
}
}
int main(void) {
uart_init();
while (1) {
process();
}
return 0;
}
The areas of this code that I am required to work on are these:
This is needed to send the data:
void uart_send_string(char str[])
{
int i = 0;
char ch;
do{
ch = str[i];
uart_putbyte(ch);
i++;
}while(ch != '\0');
}
This is needed to receive the data:
void uart_receive_string(char buffer[], int buff_len)
{
int i = 0;
unsigned char ch;
while(!uart_getbyte(&ch))
{
if(ch == 0)
{
break;
}
if(i < buff_len-1)
{
ch = buffer[i];
uart_putbyte(ch);
i++;
}
}
buffer[i]=0;
}
I am really sorry if this is hard to understand. Ill do my best to clarify any additional information that is needed. I just need to figure out what I am doing incorrectly.

Error trying to compile program from instructor (symbols expected in define line)

I was given a template from my instructor and then modified the code to frequency histogram of a given text. However i'm getting errors trying to compile the code. I believe the errors indicated by the compiler is at the beginning of the code. The errors have been taken a screenshot of and attached below. Thanks in advance
errors:
test.c:6:25: error: expected declaration specifiers or '...' before '\x20'
#define FIRST_PRINTABLE ' ' // Space character code 32, see Etter 2016 text, pp. 418-420
^
test.c:8:30: note: in expansion of macro 'FIRST_PRINTABLE'
#define NUM_PRINTABLE (int) (FIRST_PRINTABLE-LAST_PRINTABLE+1)
^~~~~~~~~~~~~~~
test.c:11:38: note: in expansion of macro 'NUM_PRINTABLE'
void init_array(int histogram[], int NUM_PRINTABLE);
^~~~~~~~~~~~~
code:
/*
* Comp120 - Lab 7: Starter project -- Complete this code
* Character Frequency analysis -- read a text file and display frequency
* analysis
* for all printable characters.
*
* Author: J. Fall
* Date: Feb. 2017
*/
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
// Definition of printable character set
#define FIRST_PRINTABLE ' ' // Space character code 32, see Etter 2016 text, pp. 418-420
#define LAST_PRINTABLE '~'
#define NUM_PRINTABLE (int) (FIRST_PRINTABLE-LAST_PRINTABLE+1)
// Function prototypes:
void init_array(int histogram[], int NUM_PRINTABLE);
int sum_array(const int histogram[], int NUM_PRINTABLE);
bool isPrintable(char c);
void compute_frequency(char* filename, int histogram[], int NUM_PRINTABLE);
void write_histogram(int histogram[], int NUM_PRINTABLE);
FILE* openFileRead(char* filename);
int main( int argc, char* argv[] )
{
if (argc < 2) {
printf("Usage: freq inputFile \n");
exit(-1);
}
int histogram[NUM_PRINTABLE]; // Array of counters -- one for each printible character
compute_frequency(argv[1], histogram, NUM_PRINTABLE);
write_histogram(histogram, NUM_PRINTABLE);
printf( "Program complete. \n" );
return 0 ;
}
/*
* Initialize the array of integers of given length to all zeros
*/
void init_array(int histogram[], int NUM_PRINTABLE)
{
int i = 0;
for(i=0;i<NUM_PRINTABLE;i++){
histogram[i]=0;
} // TODO: write function to assign 0 too every array element.
}
/*
* Return the sum of all items in the given array
*/
int sum_array(const int array[], int NUM_PRINTABLE)
{
int i = 0;
int sum = 0;
for(i=0;i<NUM_PRINTABLE;i++){
sum = sum + histogram[i];
}
return sum; // TODO: write function to add up every element in the given array.
}
/*
* Return true iff the character is PRINTABLE
*/
bool isPrintable(char c)
{
if (c >= FIRST_PRINTABLE && <= LAST_PRINTABLE){
return true;
}
else
return false; // TODO: write function to return true iff c is a printable character
}
/*
* Compute the frequency histogram for all PRINTABLE characters in the given file
*/
void compute_frequency(char* filename, int histogram[], int NUM_PRINTABLE)
{
FILE* inputFile = openFileRead(filename);
init_array(histogram, NUM_PRINTABLE);
char c = getc(inputFile); // priming read -- read first character from file
while (c != EOF) {
if(isPrintable(c)){
int bin = c - FIRST_PRINTABLE;// TODO: write algorithm to count the number of times character c occurs.
histogram[bin]++;
}
// HINT: since array indexes start at zero, map the ASCII code for each
// printable charcter onto an index by subtracting FIRST_PRINTABLE
// After processing previous character, read next character from file to re-prime the loop
c = getc(inputFile);
}
}
/*
* Write the frequency histogram out to the given file
*/
void write_histogram(int histogram[], int NUM_PRINTABLE)
{
FILE* outputFile = stdout; // Simplifictaion: output is written to the console instead of to an output file.
int total_count = sum_array(histogram, NUM_PRINTABLE);
fprintf(outputFile, "Frequency Analysis Results. Input contained %d printable characters. \n", total_count);
fprintf(outputFile, "Char | Frequency \n");
fprintf(outputFile, "____ | _________ \n");
int i;
for (i=0; i<NUM_PRINTABLE; i++) {
char ch = (char) (i + FIRST_PRINTABLE);
double freq;
if (histogram[i] > 0) {
double freq = histogram[i]/(double)total_count * 100;
fprintf(outputFile, "%3c | %9.3f%% \n", ch, freq);
}
else
fprintf(outputFile, "%3c | %9d%% \n", ch, 0);
}
}
/*
* Attempt to open the file for read access.
* Peforms error check and exits if file is not accessible
*/
FILE* openFileRead(char* filename)
{
FILE* inFile = fopen(filename, "r" );
if( inFile == NULL) {
printf( "Error opening input file %s, program terminating!\n", filename);
exit(-1);
}
return inFile;
}
#define NUM_PRINTABLE (int) (FIRST_PRINTABLE-LAST_PRINTABLE+1)
// Function prototypes:
void init_array(int histogram[], int NUM_PRINTABLE);
when those both lines are expanded by preprocessor it translates as (you can use gcc -E on the source to see it in action):
void init_array(int histogram[], int (int) (' '-'~'+1));
which is obviously a syntax error. Just use NUM_PRINTABLE as a constant in your functions, not as a parameter.
Aside, the macro is functionnaly wrong, it should be
#define NUM_PRINTABLE (LAST_PRINTABLE-FIRST_PRINTABLE+1)
or the value would be negative. (and you don't need to cast to int since character literals are already int)

`nmcnt` and `type` of R's `sxpinfo` header

§1.1.2 of R's internals manual says that
The `sxpinfo' header is defined as the 64-bit C structure below:
struct sxpinfo_struct {
/* Type and namedcnt in first byte */
unsigned int nmcnt : 3; /* count of "names" referring to object */
unsigned int type : 5; /* discussed above */
/* Garbage collector stuff - keep in one byte to maybe speed up access */
unsigned int gccls : 3; /* node class for garbage collector */
unsigned int gcgen : 1; /* old generation number - may be best first */
unsigned int mark : 1; /* marks object as in use in garbage collector */
/* Object flag */
unsigned int obj : 1; /* set if this is an S3 or S4 object */
/* Flags to synchronize with helper threads */
unsigned int in_use: 1; /* whether contents may be in use by a helper */
unsigned int being_computed : 1; /* whether helper may be computing this */
/* "general purpose" field, used for miscellaneous purposes */
unsigned int gp : 16; /* The "general purpose" field */
union {
struct { /* field below is for vectors only */
R_len_t truelength; /* for old stuff - may someday be defunct... */
} vec;
struct { /* fields below are for non-vectors only */
/* Debugging */
unsigned int debug : 1; /* function/environment is being debugged */
unsigned int rstep : 1; /* function is to be debugged, but only once */
unsigned int trace : 1; /* function is being traced */
/* Symbol binding */
unsigned int basec : 1; /* sym has base binding in global cache */
unsigned int spec_sym : 1; /* this is a "special" symbol */
unsigned int no_spec_sym : 1; /* environment has no special symbols */
unsigned int unused : 2; /* not yet used */
/* Primitive operations */
unsigned char var1, var2;/* variants for evals of fast primitive args */
unsigned char pending_ok;/* whether args can have computation pending */
} nonvec; /* - only one bit, but maybe faster as byte */
} u;
};
which leaves me with two questions, just about the nmcnt and type fields (first byte):
why are 25−2 SEXPTYPEs represented with five bits when 2⁵=64&gg;25 ?
is there a limit of 2³=8 names that can refer to a given (eg S4) object?
Thanks in advance.

Resources