I am looking for a hash function, that can hash a list of non-repeating integers while ignoring the order of them.
Example
I want the two lists
l1 = [0, 1, 3, 7]
l2 = [7, 3, 1, 0]
to have the same hash.
Background
I have an algorithm that finds a list of vertices on a graph. In an undirected graph, the algorithm will find certain lists multiple times in different orders. With my current understanding of the algorithm, it is easier to filter out the duplicates rather than re-inventing the algorithm. For performance reasons, I understand it to be easier to hash the found lists of vertices rather than comparing the whole lists.
Possible answers
Now, I see that
an XOR or a simple sum might be an answer.
Unfortunately, both offer too much potential for hash collisions, as I see it.
The not-very-efficient working method is to sort a list, and then use this sorted list to compare the new list (also sorted) against.
Other Thoughts
Given that
The lists contain only integers.
The integers will be the vertex indices, and the graph can have billions of vertices.
The integers in a list are non-repeating, and their order doesn't matter.
The lists can and will consist of between 2 and 100 (and in some cases > 1000) entries.
No need for cryptographically-secure randomness.
I have this feeling that there should be a relatively easy and straight-forward answer, and I just have not found it.
Use a combination of the product, sum and XOR (^). All are commutative (order-independent) with unsigned math.
/* Order-independent hash: product, sum and XOR are each commutative under
 * unsigned (wrapping) arithmetic, so any permutation of l[] produces the
 * same three accumulators. */
unsigned long long product = 1;
unsigned sum = 0; // Maybe unsigned long long
unsigned x = 0;
for (i = 0; i < array_element_count; i++) {
    product *= l[i];
    sum += l[i];
    x ^= l[i];
}
/* Combine the accumulators; the XOR is shifted left by 32 bits before it
 * is added so that it does not simply overlap the sum. */
unsigned long long pre_hash = product + sum + ((unsigned long long) x << 32);
unsigned hash = pre_hash % hash_table_size;
Tip: hash_table_size should be a prime to effectively use all pre_hash bits.
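If array_element_count was high, I would consider product *= shift_right_until_odd(l[i]), else product will too often become 0 (every even factor contributes at least one trailing zero bit, so after enough of them the 64-bit product wraps to 0).
If l[i] == 0, product *= l[i] deserves something different. A simple mitigation is product *= l[i] | 1, but that is something pulled out of the air.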
Hashing takes time for good design and the above are candidate building blocks for OP.
Any CRC will do the job. Just XOR the elements together (the XOR operation is commutative, so this eliminates any dependency on the order of the elements) mod 2^31, then take a CRC32 of the result (that will spread the set of values uniformly, as a CRC guarantees, or at least tries to guarantee, that a change in one bit will affect about half of the bits in the result). I have used 64-bit numbers with a 32-bit CRC, but it should also work with a full 64-bit XOR/CRC or a 32-bit XOR/CRC. See here for sample code and several CRC tables. The repository is BSD licensed, so you can use it as desired.
Below is a sample implementation that generates random lists, and reorders them, comparing their hashes:
crc32ieee8023.h
/* crc32ieee8023.h -- declares the lookup table defined in crc32ieee8023.c */
#ifndef CRC32IEEE8023_H
#define CRC32IEEE8023_H
#include "crc.h"
/* Lookup table of CRC_STATE entries; it is passed as the `table`
 * argument of do_crc(). */
extern CRC_STATE crc32ieee8023[];
#endif /* CRC32IEEE8023_H */
crc.h
/* crc.h -- types and entry point of the table-driven CRC routine */
#ifndef CRC_H
#define CRC_H
#include <stdlib.h>
#include <stdint.h>
/* Number of entries in a lookup table (one per possible byte value). */
#define CRC_TABLE_SIZE 256
/* Number of bits the state is shifted right for each input byte. */
#define CRC_BYTE_SIZE 8
/* Mask that selects the low byte of the state, used as the table index. */
#define CRC_BYTE_MASK 0xff
/* One unit of input data. */
typedef uint8_t CRC_BYTE;
/* Running CRC value; also the element type of the lookup tables. */
typedef uint64_t CRC_STATE;
/*
 * Feed nbytes bytes starting at buff into the CRC whose current value is
 * state, using the given lookup table, and return the updated value.
 * The caller chooses the initial state; the returned value is the raw
 * state (do_crc() applies no final XOR).
 */
CRC_STATE do_crc(
CRC_STATE state,
CRC_BYTE *buff,
size_t nbytes,
CRC_STATE *table);
#endif /* CRC_H */
test_xor_crc_hash.c
(This is the important file, where all the stuff is included.)
/* test_crc_table -- program to test a crc hash algorithm that
* checks a list of numbers and generates the same crc in a form
* that is independent on the list order presented.
* Program generates a list of random numbers (32bit) then it
* generates a random permutation of the list and a sorted list,
* calculates the hash over the three lists, and compares them.
*/
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "crc.h"
#include "crc32ieee8023.h"
/* Number of random elements generated when option -n is not given. */
#define DFLT_N 10
/* Device read to obtain the seed passed to seed48(). */
#define RANDOM_DEV "/dev/urandom"
/* qsort() comparison callback: orders two uint64_t values ascending,
 * returning -1, 0 or +1. */
int long_compare(
const void *_a,
const void *_b);
/* Print the vsz elements of v under the label name, followed by the
 * XOR of the list and its CRC. */
void print(
const char *name,
const uint64_t *v,
int vsz,
CRC_STATE crc,
uint64_t xor);
/*
 * Generate n random 64-bit values, build a sorted copy and a shuffled copy,
 * XOR-fold each of the three lists, take the CRC of each fold, print the
 * results and complain if the three CRCs differ.  Finally flip one random
 * bit of one element to show how the hash reacts to a single-bit change.
 *
 * Options:
 *   -n count   number of random elements (default DFLT_N, must be >= 1)
 *
 * Returns 0 on completion; exits with EXIT_FAILURE on an invalid -n value,
 * on failure to read RANDOM_DEV, or when memory cannot be allocated.
 */
int
main(int argc, char **argv)
{
    int opt;
    int n = DFLT_N,
        res;

    /* process options */
    while ((opt = getopt(argc, argv, "n:")) != EOF) {
        switch (opt) {
        case 'n':
            /* n is an int, so it is scanned with %d; a count below 1
             * would later be used as a modulus and as an array size. */
            res = sscanf(optarg, "%d", &n);
            if (res != 1 || n < 1) {
                fprintf(stderr,
                        "%s: invalid format (-n)\n",
                        optarg);
                exit(EXIT_FAILURE);
            }
            break;
        default:
            /* unknown options are ignored, as before */
            break;
        } /* switch */
    } /* while */

    /* initialization of random number generator */
    unsigned short random_state[3];
    int fd = open(RANDOM_DEV, O_RDONLY);
    if (fd < 0) {
        fprintf(stderr,
                "open: %s: %s\n",
                RANDOM_DEV, strerror(errno));
        exit(EXIT_FAILURE);
    }
    res = read(fd, random_state, sizeof random_state);
    if (res < 0) { /* error */
        fprintf(stderr,
                "read: %s: %s\n",
                RANDOM_DEV, strerror(errno));
        exit(EXIT_FAILURE);
    }
    if ((size_t)res < sizeof random_state) {
        fprintf(stderr,
                "read: %s: incomplete read (%d/%zu)\n",
                RANDOM_DEV, res, sizeof random_state);
        exit(EXIT_FAILURE);
    }
    seed48(random_state);
    close(fd);

    /* generate a list of random numbers and make two copies */
    uint64_t *original    = calloc(n, sizeof *original),
             *copy_sorted = calloc(n, sizeof *copy_sorted),
             *random_sort = calloc(n, sizeof *random_sort);
    if (!original || !copy_sorted || !random_sort) {
        fprintf(stderr, "calloc: %s\n", strerror(errno));
        exit(EXIT_FAILURE);
    }
    for (int i = 0; i < n; i++) {
        /* widen to 64 bits BEFORE shifting: shifting a 32-bit long by
         * 32 positions would be undefined behaviour */
        uint64_t lo = (uint64_t)lrand48();
        uint64_t hi = (uint64_t)lrand48();
        original[i] = copy_sorted[i]
                    = random_sort[i]
                    = lo | (hi << 32);
    }

    /* sort the numbers */
    qsort(copy_sorted, n, sizeof *copy_sorted, long_compare);

    /* and random permutation (Fisher-Yates: the partner of position i is
     * drawn from the not-yet-fixed tail i..n-1) */
    for (int i = 0; i < n - 1; i++) {
        int j = i + (int)(lrand48() % (n - i));
        if (i != j) {
            uint64_t temp = random_sort[i];
            random_sort[i] = random_sort[j];
            random_sort[j] = temp;
        }
    }

    /* XOR-fold each list; XOR is commutative, so order cannot matter */
    uint64_t xor_original = 0, xor_sorted = 0, xor_random = 0;
    for (int i = 0; i < n; i++) {
        xor_original ^= original[i];
        xor_sorted   ^= copy_sorted[i];
        xor_random   ^= random_sort[i];
    }

    /* now, calculate the crc's (a crc64 would be better for long) */
    CRC_STATE
        crc_original = do_crc(0xffffffff, (unsigned char *)&xor_original,
                              sizeof xor_original, crc32ieee8023),
        crc_sorted   = do_crc(0xffffffff, (unsigned char *)&xor_sorted,
                              sizeof xor_sorted, crc32ieee8023),
        crc_random   = do_crc(0xffffffff, (unsigned char *)&xor_random,
                              sizeof xor_random, crc32ieee8023);

    print("original", original, n, crc_original, xor_original);
    print(" sorted", copy_sorted, n, crc_sorted, xor_sorted);
    print(" random", random_sort, n, crc_random, xor_random);

    if (crc_original != crc_sorted || crc_sorted != crc_random) {
        /* cast so the conversion matches whatever type uint64_t is */
        fprintf(stderr, "crc's don't match (crc_original == 0x%08llx, "
                "crc_sorted == 0x%08llx, crc_random == 0x%08llx)\n",
                (unsigned long long)crc_original,
                (unsigned long long)crc_sorted,
                (unsigned long long)crc_random);
    }

    /* change only one bit in one element to see how it changes the hash;
     * element and bit are drawn independently */
    int elem_to_change = (int)(lrand48() % n),
        bit_to_change  = (int)(lrand48() % 64);
    original[elem_to_change] ^= (UINT64_C(1) << bit_to_change); /* change the bit */

    /* we should do the calculation over all elements, but just
     * changing a bit in one element will change just the same bit in the
     * xor_original accumulation variable */
    uint64_t xor_original_new = xor_original;
    xor_original_new ^= (UINT64_C(1) << bit_to_change);
    printf("element=%d, bit=%d\n", elem_to_change, bit_to_change);

    CRC_STATE crc_original_new =
        do_crc(0xffffffff, (unsigned char *)&xor_original_new,
               sizeof xor_original_new, crc32ieee8023);
    print(" chg1bit", original, n, crc_original_new, xor_original_new);

    free(original);
    free(copy_sorted);
    free(random_sort);
    return 0;
}
/*
 * qsort() comparison callback for arrays of uint64_t.
 * Returns +1 when the first value is larger, -1 when it is smaller and
 * 0 when both are equal, which sorts in ascending order.
 */
int long_compare(const void *_a, const void *_b)
{
    const uint64_t lhs = *(const uint64_t *)_a;
    const uint64_t rhs = *(const uint64_t *)_b;

    if (lhs > rhs)
        return +1;
    if (lhs < rhs)
        return -1;
    return 0;
}
/*
 * Print one labelled list on standard output.
 *
 * name  label printed before the list
 * v     the elements to print
 * vsz   number of elements in v
 * crc   CRC value to report for the list
 * xor   XOR of all elements, reported next to the CRC
 *
 * Every 64-bit value is cast to unsigned long long and printed with the
 * matching %llx conversion, so the output does not depend on which
 * integer type uint64_t happens to be on the platform.
 */
void print(const char *name, const uint64_t *v, int vsz, CRC_STATE crc, uint64_t xor)
{
    printf("%s: { ", name);
    const char *sep = "";
    for (int i = 0; i < vsz; i++) {
        printf("%s0x%016llx", sep, (unsigned long long)v[i]);
        sep = ", ";
    }
    printf(" }\n"
           " xor = 0x%016llx, crc = 0x%08llx\n",
           (unsigned long long)xor, (unsigned long long)crc);
}
crc.c
#include <sys/types.h>
#include "crc.h"
/* table based CRC calculation */
/*
 * Table-driven CRC update.
 *
 * state   CRC value accumulated so far (the caller picks the start value)
 * buff    input bytes
 * nbytes  number of bytes to consume from buff
 * table   lookup table indexed by the low byte of the running state
 *
 * Each input byte is XORed into the state; the low byte of the result
 * selects a table entry, which is XORed into the state after the state
 * has been shifted right by one byte.  Returns the updated state.
 */
CRC_STATE do_crc(
        CRC_STATE state,
        CRC_BYTE *buff,
        size_t nbytes,
        CRC_STATE *table)
{
    for (size_t pos = 0; pos < nbytes; pos++) {
        const CRC_STATE mixed = state ^ buff[pos];

        state = (mixed >> CRC_BYTE_SIZE) ^ table[mixed & CRC_BYTE_MASK];
    }
    return state;
} /* do_crc */
crc32ieee8023.c
#include "crc.h"
/* variables */
/* Lookup table used with do_crc(); 256 entries, one per byte value. */
CRC_STATE crc32ieee8023[] = {
/* Command used: mkcrc -gpedb88320 */
/* Polynomial: x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1 */
/* 0 */ 0x0, 0x77073096, 0xee0e612c, 0x990951ba,
/* 4 */ 0x76dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
/* 8 */ 0xedb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
/* 12 */ 0x9b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
/* 16 */ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
/* 20 */ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
/* 24 */ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
/* 28 */ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
/* 32 */ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
/* 36 */ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
/* 40 */ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
/* 44 */ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
/* 48 */ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
/* 52 */ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
/* 56 */ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
/* 60 */ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
/* 64 */ 0x76dc4190, 0x1db7106, 0x98d220bc, 0xefd5102a,
/* 68 */ 0x71b18589, 0x6b6b51f, 0x9fbfe4a5, 0xe8b8d433,
/* 72 */ 0x7807c9a2, 0xf00f934, 0x9609a88e, 0xe10e9818,
/* 76 */ 0x7f6a0dbb, 0x86d3d2d, 0x91646c97, 0xe6635c01,
/* 80 */ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
/* 84 */ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
/* 88 */ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
/* 92 */ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
/* 96 */ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
/* 100 */ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
/* 104 */ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
/* 108 */ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
/* 112 */ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
/* 116 */ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
/* 120 */ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
/* 124 */ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
/* 128 */ 0xedb88320, 0x9abfb3b6, 0x3b6e20c, 0x74b1d29a,
/* 132 */ 0xead54739, 0x9dd277af, 0x4db2615, 0x73dc1683,
/* 136 */ 0xe3630b12, 0x94643b84, 0xd6d6a3e, 0x7a6a5aa8,
/* 140 */ 0xe40ecf0b, 0x9309ff9d, 0xa00ae27, 0x7d079eb1,
/* 144 */ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
/* 148 */ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
/* 152 */ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
/* 156 */ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
/* 160 */ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
/* 164 */ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
/* 168 */ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
/* 172 */ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
/* 176 */ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
/* 180 */ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
/* 184 */ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
/* 188 */ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
/* 192 */ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x26d930a,
/* 196 */ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x5005713,
/* 200 */ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0xcb61b38,
/* 204 */ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0xbdbdf21,
/* 208 */ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
/* 212 */ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
/* 216 */ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
/* 220 */ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
/* 224 */ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
/* 228 */ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
/* 232 */ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
/* 236 */ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
/* 240 */ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
/* 244 */ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
/* 248 */ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
/* 252 */ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
}; /* crc32ieee8023 */
Makefile
# Build rules for the test_xch demo program.
# Per-target variables are named <target>_deps/_objs/_libs/_ldfl.
targets = test_xch
toclean = $(targets)
test_xch_deps =
test_xch_objs = crc32ieee8023.o crc.o test_xor_crc_hash.o
test_xch_libs =
test_xch_ldfl =
toclean += $(test_xch_objs)
.PHONY: all clean
all: $(targets)
clean:
	$(RM) $(toclean)
# $@ is the name of the target being built, so $($@_objs) expands to
# $(test_xch_objs) and likewise for _ldfl and _libs.
test_xch: $(test_xch_deps) $(test_xch_objs)
	$(CC) $(LDFLAGS) $($@_ldfl) -o $@ $($@_objs) $($@_libs) $(LIBS)
To make the program, just run:
$ make
and to run it, you can use option -n that allows you to specify the number of random elements to generate.
I think you will have to invent one to avoid the slow sorting option. In addition to XOR and arithmetic addition, there are bit rotations, and bit masks you could use. If you need high collision resistance, you could just combine more than one of the hash functions. e.g. Assuming the d_i and arithmetic are modular like with uint32_t for example,
H_1 = sum_{i = 1 to n} d_i
H_2 = xor_{i = 1 to n} d_i
H_3 = xor_{i = 1 to n} (rotl(d_i, d_i & 0x1f) + c)
Then take the concatenation H_1 ‖ H_2 ‖ H_3 as a 12-byte hash.
Documentation for pb_ostream_from_buffer says
After writing, you can check stream.bytes_written to find out how much
valid data there is in the buffer. This should be passed as the
message length on decoding side.
So ideally, when I send the serialized data I need to also send the bytes_written as a parameter separate from the buffer.
The problem is that my interface only allows me to send one variable: the buffer.
QUESTION
How do I specify that the struct should always be serialized with no optimizations, so that bufsize in
pb_istream_from_buffer(const pb_byte_t *buf, size_t bufsize)
can be a constant (i.e. the macro that specifies the maximum size) instead of needing to pass stream.bytes_written?
According to the Protocol Buffers encoding specification there are variable-size types (like int32, int64, string, etc.) and fixed-size types (like fixed32, fixed64, double, etc.). This variable-size encoding is more than just an optimization; it is part of the design and the specification. So disabling this "optimization" by means of Protocol Buffers is only possible if your data consists exclusively of fixed-length types and has no repeated fields, unless the number of repetitions is fixed. I presume that this is not the case, since you're asking this question. So the short answer is no, it's not possible by means of the library, because it would violate the encoding specification.
But in my opinion the desired effect could be easily achieved by encoding the size into the buffer with little CPU and RAM overhead. I presume you know the maximum size of the message generated by nanopb, we denote it by MAX_MSG_SIZE. We call this message the payload message. Suppose that this MAX_MSG_SIZE can be represented by some integer type, which we denote by wrapped_size_t (e.g. uint16_t).
The idea is simple:
allocate the buffer slightly larger than MAX_MSG_SIZE;
write the payload message generated by nanopb at some offset into the allocated buffer;
use this offset to encode the size of the payload message at the beginning of the buffer;
transmit the whole buffer having the fixed size equal to MAX_MSG_SIZE + sizeof(wrapped_size_t) to the receiver;
upon reception decode the size of the payload message and pass both the decoded size and the payload message to pb_istream_from_buffer.
I attach the code to illustrate the idea. I used an example from nanopb repository:
#include <stdio.h>
#include <inttypes.h>
#include <string.h>
#include <pb_encode.h>
#include <pb_decode.h>
#include "simple.pb.h"
/* Define COMMON_ENDIANNES when sender and receiver share the same byte
 * order; the size field is then stored in host byte order.  Otherwise
 * the size field is always stored little-endian. */
//#define COMMON_ENDIANNES
#ifdef COMMON_ENDIANNES
#define encode_size encode_size_ce
#define decode_size decode_size_ce
#else
#define encode_size encode_size_le
#define decode_size decode_size_le
#endif
/* Integer type of the size field stored in front of the payload. */
typedef uint16_t wrapped_size_t;
/* These are enumeration constants rather than `const size_t` objects: in C
 * a const object is not a constant expression, so it can neither
 * initialise another file-scope object nor size a non-VLA array. */
enum {
    /* Maximum size of the message returned by bytes_written */
    MAX_MSG_SIZE = 11,
    /* Size of the field storing the actual size of the message
     * (as returned by bytes_written) */
    SIZE_FIELD = sizeof(wrapped_size_t),
    /* Fixed wrapped message size */
    FIXED_MSG_SIZE = MAX_MSG_SIZE + SIZE_FIELD
};
void print_usage(char *prog);
/* Get the address of the payload buffer from the transmitted buffer */
uint8_t* payload_buffer(uint8_t *buffer);
/* Encode the payload size into the transmitted buffer (host byte order) */
void encode_size_ce(uint8_t *buffer, size_t size);
/* Decode the payload size from the transmitted buffer (host byte order) */
wrapped_size_t decode_size_ce(uint8_t *buffer);
/* Encode the payload size into the transmitted buffer (little endian) */
void encode_size_le(uint8_t *buffer, size_t size);
/* Decode the payload size from the transmitted buffer (little endian) */
size_t decode_size_le(uint8_t *buffer);
/*
 * Demo: encode a SimpleMessage into a fixed-size buffer whose first
 * SIZE_FIELD bytes carry the real payload length, then decode it again.
 *
 * The lucky number is taken from argv[1] or, when absent, read from
 * standard input.  Returns 0 on success and 1 on a usage error, invalid
 * input, or an encoding/decoding failure.
 */
int main(int argc, char* argv[])
{
    /* This is the buffer where we will store our message. */
    uint8_t buffer[MAX_MSG_SIZE + sizeof(wrapped_size_t)];
    bool status;

    if(argc > 2 || (argc == 2 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))))
    {
        print_usage(argv[0]);
        return 1;
    }

    /* The whole buffer is transmitted regardless of the payload length,
     * so clear it first: the bytes after the payload must not carry
     * whatever happened to be on the stack. */
    memset(buffer, 0, sizeof(buffer));

    /* Encode our message */
    {
        /* Allocate space on the stack to store the message data.
         *
         * Nanopb generates simple struct definitions for all the messages.
         * - check out the contents of simple.pb.h!
         * It is a good idea to always initialize your structures
         * so that you do not have garbage data from RAM in there.
         */
        SimpleMessage message = SimpleMessage_init_zero;
        /* Create a stream that will write to our buffer. */
        pb_ostream_t stream = pb_ostream_from_buffer(payload_buffer(buffer),
                                                     MAX_MSG_SIZE);
        int parsed;

        if(argc > 1)
            parsed = sscanf(argv[1], "%" SCNd32, &message.lucky_number);
        else
        {
            printf("Input lucky number: ");
            parsed = scanf("%" SCNd32, &message.lucky_number);
        }
        /* Reject input that is not a number instead of silently
         * encoding the zero-initialised default. */
        if(parsed != 1)
        {
            fprintf(stderr, "Invalid lucky number\n");
            print_usage(argv[0]);
            return 1;
        }

        /* Encode the payload message */
        status = pb_encode(&stream, SimpleMessage_fields, &message);
        /* Check for errors before trusting stream.bytes_written */
        if (!status)
        {
            printf("Encoding failed: %s\n", PB_GET_ERROR(&stream));
            return 1;
        }
        /* Wrap the payload, i.e. add the size to the buffer */
        encode_size(buffer, stream.bytes_written);
    }

    /* Now we could transmit the message over network, store it in a file, etc.
     * Note, the transmitted message has a fixed length equal to FIXED_MSG_SIZE
     * and is stored in buffer
     */

    /* But for the sake of simplicity we will just decode it immediately. */
    {
        /* Allocate space for the decoded message. */
        SimpleMessage message = SimpleMessage_init_zero;
        /* Create a stream that reads from the buffer. */
        pb_istream_t stream = pb_istream_from_buffer(payload_buffer(buffer),
                                                     decode_size(buffer));

        /* Now we are ready to decode the message. */
        status = pb_decode(&stream, SimpleMessage_fields, &message);
        /* Check for errors... */
        if (!status)
        {
            printf("Decoding failed: %s\n", PB_GET_ERROR(&stream));
            return 1;
        }
        /* Print the data contained in the message. */
        printf("Your lucky number was %d; payload length was %d.\n",
               (int)message.lucky_number, (int)decode_size(buffer));
    }
    return 0;
}
/* Write the one-line usage summary for program name prog to standard
 * output. */
void print_usage(char *prog)
{
    fprintf(stdout, "usage: %s [<lucky_number>]\n", prog);
}
/* Return the address of the payload area, i.e. the first byte after the
 * SIZE_FIELD-byte size field at the start of the transmitted buffer. */
uint8_t* payload_buffer(uint8_t *buffer)
{
    uint8_t *payload = &buffer[SIZE_FIELD];

    return payload;
}
void encode_size_ce(uint8_t *buffer, size_t size)
{
*(wrapped_size_t*)buffer = size;
}
/*
 * Read the wrapped_size_t stored in host byte order at the start of
 * buffer and return it.
 *
 * memcpy() is used instead of dereferencing a cast pointer so that the
 * read is valid whatever the alignment of buffer is.
 */
wrapped_size_t decode_size_ce(uint8_t *buffer)
{
    wrapped_size_t wrapped;

    memcpy(&wrapped, buffer, sizeof wrapped);
    return wrapped;
}
/* Write the low sizeof(wrapped_size_t) bytes of size to the start of
 * buffer, least significant byte first. */
void encode_size_le(uint8_t *buffer, size_t size)
{
    size_t remaining = size;

    for (size_t pos = 0; pos < sizeof(wrapped_size_t); pos++)
    {
        buffer[pos] = (uint8_t)(remaining & 0xff);
        remaining >>= 8;
    }
}
/* Rebuild the size from the first sizeof(wrapped_size_t) bytes of buffer,
 * which hold it least significant byte first. */
size_t decode_size_le(uint8_t *buffer)
{
    size_t value = 0;
    size_t pos = sizeof(wrapped_size_t);

    /* walk from the most significant byte down to byte 0 */
    while (pos > 0)
    {
        pos--;
        value = (value << 8) + buffer[pos];
    }
    return value;
}
UPD Ok, if, for some reason, you still wish to stick to the original GPB encoding there's another option available: fill the unused part of the buffer (i.e. the part after the last byte written by nanopb) with some valid data which will be ignored. For instance, you can reserve a field number which doesn't mark any field in your *.proto file but is used to mark the data which will be discarded by the GPB decoder. Let's denote this reserved field number as RESERVED_FIELD_NUMBER. This is used for backward compatibility but you can use it for your purpose as well. Let's call this filling-in the buffer with the dummy data sealing (perhaps there's a better term). This method also requires that you have at least 2 free bytes available to you after pb_encode.
So the idea of sealing is even simpler:
calculate how many buffer bytes are left unfilled after pb_encode;
mark the rest of the buffer as array of bytes with RESERVED_FIELD_NUMBER.
I attach the updated code, the main function is bool seal_buffer(uint8_t *buffer, size_t size), call it after pb_encode to seal the buffer and you're done. Currently, it has a limitation of sealing no more than 2 ** 28 + 4 bytes, but it could be easily updated to overcome this limitation.
#include <stdio.h>
#include <assert.h>
#include <inttypes.h>
#include <pb_encode.h>
#include <pb_decode.h>
#include "simple.pb.h"
/* Reserved field_number shouldn't be used for field numbering. We use it
 * to mark the data which will be ignored upon reception by GPB parser.
 * This number should be 1 to 15 to fit into a single byte. */
const uint8_t RESERVED_FIELD_NUMBER = 15;
/* Enumeration constants rather than `const size_t` objects: in C a const
 * object is not a constant expression, so it can neither initialise
 * another file-scope object nor size a non-VLA array. */
enum {
    /* Maximum size of the message returned by bytes_written (payload size) */
    MAX_MSG_SIZE = 200,
    /* Size of the transmitted message (reserve 2 bytes for minimal sealing) */
    FIXED_MSG_SIZE = MAX_MSG_SIZE + 2
};
void print_usage(char *prog);
/* Sealing the buffer means filling it in with data which is valid
 * in the sense that a GPB parser accepts it as valid but ignores it */
bool seal_buffer(uint8_t *buffer, size_t size);
int main(int argc, char* argv[])
{
/* This is the buffer where we will store our message. */
uint8_t buffer[FIXED_MSG_SIZE];
bool status;
if(argc > 2 || (argc == 2 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))))
{
print_usage(argv[0]);
return 1;
}
/* Encode our message */
{
/* Allocate space on the stack to store the message data.
*
* Nanopb generates simple struct definitions for all the messages.
* - check out the contents of simple.pb.h!
* It is a good idea to always initialize your structures
* so that you do not have garbage data from RAM in there.
*/
SimpleMessage message = SimpleMessage_init_zero;
/* Create a stream that will write to our buffer. */
pb_ostream_t stream = pb_ostream_from_buffer(buffer, sizeof(buffer));
if(argc > 1)
sscanf(argv[1], "%" SCNd32, &message.lucky_number);
else
{
printf("Input lucky number: ");
scanf("%" SCNd32, &message.lucky_number);
}
/* Now we are ready to encode the message! */
status = pb_encode(&stream, SimpleMessage_fields, &message);
/* Then just check for any errors.. */
if (!status)
{
fprintf(stderr, "Encoding failed: %s\n", PB_GET_ERROR(&stream));
return 1;
}
/* Now the main part - making the buffer fixed-size */
assert(stream.bytes_written + 2 <= FIXED_MSG_SIZE);
if(!seal_buffer(buffer + stream.bytes_written,
FIXED_MSG_SIZE - stream.bytes_written))
{
fprintf(stderr, "Failed sealing the buffer "
"(filling in with valid but ignored data)\n");
return 1;
}
}
/* Now we could transmit the message over network, store it in a file or
* wrap it to a pigeon's leg.
*/
/* But because we are lazy, we will just decode it immediately. */
{
/* Allocate space for the decoded message. */
SimpleMessage message = SimpleMessage_init_zero;
/* Create a stream that reads from the buffer. */
pb_istream_t stream = pb_istream_from_buffer(buffer, FIXED_MSG_SIZE);
/* Now we are ready to decode the message. */
status = pb_decode(&stream, SimpleMessage_fields, &message);
/* Check for errors... */
if (!status)
{
fprintf(stderr, "Decoding failed: %s\n", PB_GET_ERROR(&stream));
return 1;
}
/* Print the data contained in the message. */
printf("Your lucky number was %d.\n", (int)message.lucky_number);
}
return 0;
}
/* Write the one-line usage summary for program name prog to standard
 * output. */
void print_usage(char *prog)
{
    fprintf(stdout, "usage: %s [<lucky_number>]\n", prog);
}
/*
 * Fill the size bytes at buffer with one length-delimited field that a
 * protobuf decoder skips, so that a fixed-size buffer can be decoded as
 * a whole.
 *
 * Layout written:
 *   byte 0      key: field number 15 (the value of RESERVED_FIELD_NUMBER),
 *               wire type 2 (length-delimited)
 *   next bytes  length as a varint: 1 byte when the filler is shorter
 *               than 128 bytes, otherwise a fixed 4-byte varint
 *   the rest    filler, set to zero so that no uninitialised memory is
 *               transmitted
 *
 * Returns true on success.  Returns false (after printing a message on
 * stderr) when size is below 2, because key plus length need at least
 * 2 bytes, or when size exceeds 2 ** 28 + 4, the largest value the
 * 4-byte varint can describe.
 */
bool seal_buffer(uint8_t *buffer, size_t size)
{
    size_t i;
    size_t header; /* number of bytes used by key + length */

    if(size < 2)
    {
        fprintf( stderr, "Cannot seal the buffer, at least 2 bytes are needed\n");
        return false;
    }

    if(size - 2 < (1UL << 7))
    {
        /* The filler length fits into 7 bits: 1 key byte + 1 length byte. */
        buffer[1] = (uint8_t)(size - 2);
        header = 2;
    }
    else
    {
        /* The length is too large to fit into 7 bits (1 byte).
         * For simplicity we represent the remaining size by 4 bytes (28 bits).
         * Note that 1 byte is used for encoding field_number and wire_type,
         * plus 4 bytes for the size encoding, therefore the "remaining size"
         * is equal to (size - 5).  size is at least 130 on this path, so the
         * subtraction cannot wrap around.
         */
        size_t remaining = size - 5;

        if(remaining >= (1UL << 28))
        {
            fprintf( stderr, "Representing the size exceeding 2 ** 28 + 4, "
                             "although it's not difficult, is not yet implemented\n");
            return false;
        }
        for(i = 0; i < 4; ++i)
        {
            /* the first three bytes carry the continuation bit */
            buffer[i + 1] = i < 3? (remaining & 0x7f) | 0x80: remaining & 0x7f;
            remaining >>= 7;
        }
        header = 5;
    }

    buffer[0] = (15 << 3) + 2;

    /* Give the ignored filler defined contents. */
    for(i = header; i < size; ++i)
        buffer[i] = 0;

    return true;
}
I must further develop a simulator using C, capable of simulating different cache types (direct, n-way associative, fully associative). Right now my code works in the sense that it can simulate a direct-mapped cache, however it cannot simulate any other type.
My Code
My C file:
/*
* CS3375 Computer Architecture
* Course Project
* Cache Simulator Design and Development
* FALL 2017
* By Yong Chen
*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#include "cachesim.h"
/*
 * Cache simulator entry point.
 *
 * Usage: <prog> direct <trace file name>
 *
 * Reads one memory address per line from the trace file, replays the
 * accesses on a direct-mapped cache and prints hit/miss statistics.
 * Returns 0 on success, 1 on a usage error, an unsupported cache type or
 * a trace file that cannot be opened.
 */
int main(int argc, char *argv[])
{
    if (argc != 3) {
        printf("Usage: %s <direct> <trace file name>\n", argv[0]);
        return 1;
    }
#ifdef DBG
    printf("BLOCK SIZE = %d Bytes\n", BLOCK_SIZE);
    printf("%d-WAY\n", WAY_SIZE);
    printf("CACHE SIZE = %d Bytes\n", CACHE_SIZE);
    printf("NUMBER OF BLOCKS = %d\n", NUM_BLOCKS);
    printf("NUMBER OF SETS = %d\n", NUM_SETS);
    printf("\n");
#endif

    /* Only the direct-mapped simulation exists; say so instead of
     * silently printing nothing for any other type. */
    if (strncmp(argv[1], "direct", 6) != 0) {
        fprintf(stderr, "Unsupported cache type: %s\n", argv[1]);
        return 1;
    }

    struct direct_mapped_cache d_cache;
    char* trace_file_name = argv[2];
    char mem_request[20];
    uint64_t address;

    /* Initialization */
    for (int i=0; i<NUM_BLOCKS; i++) {
        d_cache.valid_field[i] = 0;
        d_cache.dirty_field[i] = 0;
        d_cache.tag_field[i] = 0;
    }
    d_cache.hits = 0;
    d_cache.misses = 0;

    /* Opening the memory trace file */
    FILE *fp = fopen(trace_file_name, "r");
    if (fp == NULL) {
        perror(trace_file_name);
        return 1;
    }

    /* Read the memory request address and access the cache */
    while (fgets(mem_request, 20, fp)!= NULL) {
        address = convert_address(mem_request);
        direct_mapped_cache_access(&d_cache, address);
    }
    fclose(fp);

    /*Calculate Hit and Miss Rate (both 0 when the trace was empty)*/
    int total = d_cache.hits + d_cache.misses;
    double hit_rate = total ? (1.0 * d_cache.hits) / total : 0.0;
    double miss_rate = total ? (1.0 * d_cache.misses) / total : 0.0;

    /*Print out the results*/
    printf("\n==================================\n");
    printf("Cache type: Direct-Mapped Cache\n");
    printf("==================================\n");
    printf("Cache Hits: %d\n", d_cache.hits);
    printf("Cache Misses: %d\n", d_cache.misses);
    printf("Cache Hit Rate: %f\n", hit_rate);
    printf("Cache Miss Rate: %f\n", miss_rate);
    printf("\n");
    return 0;
}
/*
 * Convert the hexadecimal address text of one trace line to a number.
 *
 * memory_addr  NUL-terminated text, optionally ending in '\n'
 *
 * Scanning stops at the first '\n' or at the terminating NUL, whichever
 * comes first, so a last line without a newline is handled too.
 * Characters that are not hexadecimal digits (for instance the 'x' of a
 * "0x" prefix) are skipped.  Returns the accumulated value; an empty
 * string yields 0.
 */
uint64_t convert_address(char memory_addr[])
{
    uint64_t binary = 0;

    for (int i = 0; memory_addr[i] != '\n' && memory_addr[i] != '\0'; i++) {
        const char c = memory_addr[i];
        unsigned digit;

        if (c >= '0' && c <= '9') {
            digit = (unsigned)(c - '0');
        } else if (c >= 'a' && c <= 'f') {
            digit = (unsigned)(c - 'a') + 10;
        } else if (c >= 'A' && c <= 'F') {
            digit = (unsigned)(c - 'A') + 10;
        } else {
            continue; /* not a hex digit: ignore it */
        }
        binary = (binary * 16) + digit;
    }
#ifdef DBG
    printf("%s converted to %llu\n", memory_addr, (unsigned long long)binary);
#endif
    return binary;
}
/*
 * Simulate one access to a direct-mapped cache.
 *
 * cache    cache state; its hit or miss counter is incremented and, on a
 *          miss, the addressed line is replaced
 * address  byte address being accessed
 *
 * The address is split with integer division and modulo: dropping the
 * offset inside a block gives the block address, its remainder modulo
 * NUM_BLOCKS is the line index and its quotient is the tag.  This needs
 * no floating-point log2() and is consistent with the modulo that selects
 * the index.
 */
void direct_mapped_cache_access(struct direct_mapped_cache *cache, uint64_t address)
{
    uint64_t block_addr = address / BLOCK_SIZE;
    uint64_t index = block_addr % NUM_BLOCKS;
    uint64_t tag = block_addr / NUM_BLOCKS;
#ifdef DBG
    printf("Memory address: %llu, Block address: %llu, Index: %llu, Tag: %llu ",
           (unsigned long long)address, (unsigned long long)block_addr,
           (unsigned long long)index, (unsigned long long)tag);
#endif
    if (cache->valid_field[index] && cache->tag_field[index] == tag) { /* Cache hit */
        cache->hits += 1;
#ifdef DBG
        printf("Hit!\n");
#endif
    } else {
        /* Cache miss */
        cache->misses += 1;
#ifdef DBG
        printf("Miss!\n");
#endif
        if (cache->valid_field[index] && cache->dirty_field[index]) {
            /* Write the cache block back to memory */
        }
        /* Install the new line in place of the old one. */
        cache->tag_field[index] = tag;
        cache->valid_field[index] = 1;
        cache->dirty_field[index] = 0;
    }
}
My .h file:
/*
* CS3375 Computer Architecture
* Course Project
* Cache Simulator Design and Development
* FALL 2017
* By Yong Chen
*/
#include <stdint.h>
#include <stdio.h>
/* Cache block size (or cache line size) in bytes*/
#define BLOCK_SIZE 64 /*(must be power of 2). 4 Bytes = 1 Word NOTE: MUST CHANGE DEPENDING ON TYPE*/
#define WAY_SIZE 1 /* Associativity; 1-way = direct-mapped MUST CHANGE DEPENDING ON TYPE*/
#define CACHE_SIZE 32768 /* Cache capacity in bytes (must be power of 2) THIS WILL STAY FIXED*/
#define NUM_BLOCKS (CACHE_SIZE / BLOCK_SIZE)
/* A set holds WAY_SIZE blocks, so the number of sets is the number of
 * blocks divided by the associativity (not the block size in bytes). */
#define NUM_SETS (NUM_BLOCKS / WAY_SIZE)
/*For direct-mapped (WAY_SIZE 1), num sets is equal to num blocks. For fully
 * associative, way size is equal to num blocks, so there is exactly one set. */
/*MAY TRY LEAVING THESE VARIABLES UNDEFINED, AND THEY WILL BE SET DEPENDING ON USER INPUT.*/
#define DBG /*Prints debugging information*/
/* State of a direct-mapped cache: per-block bookkeeping arrays with
 * NUM_BLOCKS entries each, plus the hit and miss counters. */
struct direct_mapped_cache {
unsigned valid_field[NUM_BLOCKS]; /* Valid field: non-zero once the block holds a line */
unsigned dirty_field[NUM_BLOCKS]; /* Dirty field; since we don't distinguish writes and
reads in this project yet, this field doesn't really matter */
uint64_t tag_field[NUM_BLOCKS]; /* Tag field */
char data_field[NUM_BLOCKS][BLOCK_SIZE]; /* Data field; since we don't really fetch data,
this field doesn't really matter */
int hits; /* Hit count */
int misses; /* Miss count */
};
/* Convert the hexadecimal address text of one trace line to a number. */
uint64_t convert_address(char memory[]);
/* Simulate one access to the direct-mapped cache: update the hit or miss
 * counter and, on a miss, replace the addressed block. */
void direct_mapped_cache_access(struct direct_mapped_cache *cache, uint64_t address);
What I've Tried
Admittedly, I am a beginner when it comes to the C language, so my solution may be simpler than I think, but I've been unable to find any answers thus far. I've considered changing where the cache variables were defined using "#define" depending on the argument, but I have learned that "#define" is run by pre-processing, so this won't work.
I've also tried creating multiple struct classes for each type of cache that needs to be simulated, but since struct variables in C cannot be initialized within the class, I can't get this to work either.
To my understanding, structs in C cannot have constructors either, as I've looked into this as well.
Any help or step in the right direction will be greatly appreciated.
First, you are mistaken about structs: they are not objects but a data layout, which means there are no constructors or destructors (if you need those features, you will need dedicated functions).
Next, your actual question: "Can I have a variable amount of memory in my struct?"
Answer:
Yes, but not directly: you will need to use functions like malloc and free to dynamically allocate and deallocate memory. I won't give a full explanation of them here; you will find out how to use them very easily online.
You will need to do something like :
#include<stdlib.h>
/* A block of `size` unsigned values whose length is chosen at run time. */
struct mystruct {
    unsigned size;      /* number of elements in memory */
    unsigned * memory;  /* heap array owned by this struct */
};
typedef struct mystruct mystruct;

/*
 * Allocate a mystruct with room for s elements.
 *
 * Returns the new object, to be released with mystruct_destructor(), or
 * NULL when memory cannot be obtained.  The elements are NOT initialised.
 */
mystruct * mystruct_constructor(unsigned s) {
    //allocate the space of struct itself
    mystruct * ms = malloc(sizeof *ms);
    if (ms == NULL)
        return NULL;
    ms->size = s;
    //refuse element counts whose byte size would overflow size_t
    if (s > (size_t)-1 / sizeof *ms->memory) {
        free(ms);
        return NULL;
    }
    //allocate the variable size array
    ms->memory = malloc(sizeof *ms->memory * s);
    //malloc(0) may legitimately return NULL, so only s > 0 is a failure
    if (ms->memory == NULL && s != 0) {
        free(ms);
        return NULL;
    }
    return ms;
}

/* Release an object created by mystruct_constructor(); passing NULL is
 * allowed and does nothing. */
void mystruct_destructor(mystruct * ms) {
    if (ms == NULL)
        return;
    free(ms->memory);
    free(ms);
}
Just know that malloc doesn't initialize the space, so the data within it is unknown (and likely not zero).
As for why you can't define a variable-size struct: arrays declared inside a struct (like unsigned var[X]) need a size that is known at compile time, because when you declare such an array the struct actually contains X elements of the array type.
which mean:
#define N whatever_value_you_want
struct {
unsigned item[N + 1]; // valid because it can be known at compile-time
};
has the same (under normal condition) layout as:
struct {
unsigned item_0;
unsigned item_1;
unsigned item_2;
...
unsigned item_N;
}
I have IP addresses coming into a box in the form of, for example, 21211328.
I want to take this integer and convert it to its binary form.
The problem I'm getting right now is that my function spits out, based off of the example, -62674752.
This is obviously wrong, as it can't be negative and this isn't in binary.
The function I am using looks like this:
/*
 * Return the binary digits of decimalNo written out as a decimal number
 * (e.g. 5 -> 101). Negative inputs give the negated digit string.
 * The result must fit in an int, so with a 32-bit int only inputs of up to
 * 10 binary digits are representable; larger inputs overflow.
 */
int toBinary(int decimalNo){
    int low_bit;
    int upper_digits;

    /* base cases: 0 and 1 are their own binary representation */
    if (decimalNo == 0 || decimalNo == 1) {
        return decimalNo;
    }

    low_bit = decimalNo % 2;
    upper_digits = toBinary(decimalNo / 2);

    /* shift the higher digits one decimal place left and append the low bit */
    return low_bit + 10 * upper_digits;
}
I am using it as follows:
int num_converted = toBinary(iph->saddr); // convert it to binary
printk(KERN_INFO "BSADDR: %d", num_converted); // print the binary conversion to kernel for debugging
if ((num_converted & mask_array) == masked_sub){
return NF_DROP;
}
And this is returning the incorrect output in my kernel logs as seen above.
iph->saddr returns the 21211328, and int num_converted = toBinary(iph->saddr); returns -62674752.
First, ip addresses are normally represented (for human consumption) as quartets of individual bytes (four numbers separated by dots in network byte order) and not as binary sequences.
The address you posted (21211328) represents the address 1.67.168.192 (which I guess is a wrong representation of the address 192.168.67.1 --- you need to consider that all IP addresses are in network byte order, most significant byte first).
To successfully decode an IP address, you first have to consider whether your machine will treat it in the correct endianness or whether you will have to swap bytes to be able to interpret it as a number. The correct integer (in decimal) for the address 192.168.67.1 would be 3,232,252,673. This number cannot be represented as a signed 32-bit int, because it has the most significant bit set, so it would be represented as a negative number.
To decode an IP address, you first have to convert the number you get from the socket interface to host byte order (endianness) by use of the ntohl(3) function. Once you have the address in host byte order, then you have to convert the number to base 256. This is quite easy, and you can do it with the following snippet of code:
/* Print the address as four decimal bytes separated by dots, most significant byte first. */
unsigned long ip_address = ntohl(server_address.sin_addr.s_addr);
int i;
for (i = 24; i >= 0; i -= 8) {
if (i < 24) printf("."); /* print the dot between the numbers */
/* the masked value has type unsigned long, so it must be printed with %lu, not %d */
printf("%lu",
(ip_address >> i) & 0xff);
}
which I'll explain below:
The expression (ip_address >> i) & 0xff shifts the value right by i places (24, 16, 8 and 0 places in turn). So first we move the 8 most significant bits into positions 0 to 7 (24..31 => 0..7), next we do the same for the bits next to them (16..23 => 0..7), and so on down to the least significant bits, which are not shifted at all (0..7 => 0..7). Once the bits we are interested in are in positions 0..7, we mask them with 0xff (a value with 1s in bit positions 0..7 and 0s elsewhere), so the bitwise AND operator & keeps only the bits we have moved into positions 0..7 and masks out all the others. This finally gives us a number between 0 and 255, which is what we print.
In the case you indeed want to represent them in binary form, you can do it by slightly modifying the above code, just considering that instead of having eight bit numbers as digits you have one bit numbers as digits:
/* Print the address as 32 binary digits, most significant bit first. */
unsigned long ip_address = ntohl(server_address.sin_addr.s_addr);
for (i = 31; i >= 0; i--) {
/* In this case, we don't print the dot between the numbers */
/* the masked value has type unsigned long, so it must be printed with %lu, not %d */
printf("%lu",
(ip_address >> i) & 0x1);
}
The mask this time is 0x1, which keeps only the least significant bit of the number.
Finally, I wrote a complete program to illustrate both ways of decoding, showing in addition the use of the ntohl(3) function:
ipaddr.c
/* 00001 */ #include <arpa/inet.h>
/* 00002 */ #include <stdio.h>
/* 00003 */ #ifndef DOTTED_DECIMAL
/* 00004 */ #define DOTTED_DECIMAL 1
/* 00005 */ #endif
/* 00006 */ #if DOTTED_DECIMAL
/* 00007 */ # define NBITS 8
/* 00008 */ # define SEP "."
/* 00009 */ #else /* BINARY */
/* 00010 */ # define NBITS 1
/* 00011 */ # define SEP ""
/* 00012 */ #endif
/* 00013 */ #define MASK ((1 << NBITS) - 1) /* 100..00 - 1 = 011..11, with NBITS `1` bits */
/* 00014 */ char *ip_formatted(long ip, char *sep, char *buff, size_t buffsz);
/* 00015 */ int main()
/* 00016 */ {
/* 00017 */     char text[1024]; /* scratch buffer reused by both ip_formatted() calls */
/* 00018 */     unsigned long net_order = 21211328; /* the address from the question, taken as network byte order */
/* 00019 */     unsigned long host_order = ntohl(net_order); /* the same address converted to host byte order */
/* 00020 */     printf("%lu => [%s]\n", net_order, ip_formatted(net_order, SEP, text, sizeof(text)));
/* 00021 */     printf("%lu => [%s]\n", host_order, ip_formatted(host_order, SEP, text, sizeof(text)));
/* 00022 */ }
/* 00023 */ /* Format the low 32 bits of ip into buff as groups of NBITS bits, most significant group first, */
/* 00024 */ /* with sep written between groups. At most buffsz bytes (terminator included) are written;      */
/* 00025 */ /* output that does not fit is truncated. Returns buff.                                          */
/* 00026 */ char *ip_formatted(long ip, char *sep, char *buff, size_t buffsz)
/* 00027 */ {
/* 00028 */     char *s = buff;
/* 00029 */     int i;
/* 00030 */     if (buff == NULL || buffsz == 0) return buff; /* no room even for the terminator */
/* 00031 */     for (i = 32 - NBITS; i >= 0; i -= NBITS) {
/* 00032 */         /* shift as unsigned: right-shifting a negative long is implementation-defined */
/* 00033 */         int digit = (int)(((unsigned long)ip >> i) & MASK);
/* 00034 */         int n = snprintf(s, buffsz,
/* 00035 */                 "%s%d",
/* 00036 */                 i == 32 - NBITS ? "" : sep,
/* 00037 */                 digit);
/* 00038 */         if (n < 0) { *s = '\0'; break; } /* output error: drop this group and stop */
/* 00039 */         if ((size_t)n >= buffsz) break; /* truncated: snprintf has already terminated the string */
/* 00040 */         s += n; buffsz -= (size_t)n;
/* 00041 */     }
/* 00042 */     return buff;
/* 00043 */ }
compile this code with the following commands:
$ cc -o ipaddr -DDOTTED_DECIMAL=1 ipaddr.c
to see output in dotted decimal, and
$ cc -o ipaddr -DDOTTED_DECIMAL=0 ipaddr.c
to see output in binary digits.
(I included line numbers, so references to the code can be used, and commented, so you can directly compile the code by cut and paste it)
Try this:
#include <stdio.h>
/*
 * Render the low 32 bits of val as a string of '0'/'1' characters,
 * most significant bit first.
 * Returns a pointer to a static buffer that is overwritten by the next call.
 */
char *getBin( unsigned long val) {
    static char buf[33];
    char *p = buf + 32;

    *p = '\0';
    /* fill from the last character backwards, consuming one bit per step */
    while (p != buf) {
        --p;
        *p = (val % 2 != 0) ? '1' : '0';
        val >>= 1;
    }
    return buf;
}
/* Demo driver: print one sample address in hexadecimal and in binary. */
int main()
{
    const unsigned long ip = 0x80e2d301;
    char *bits = getBin(ip);

    printf( " ip:%08lx, bin:%sB\n", ip, bits );
    return 0;
}