I'm trying to compare a byte array with a hex number, having a surprisingly hard time.
#include <stdio.h>
int main()
{
int bytes[4] = { 0x7c, 0x71, 0xde, 0xbb };
int number = 0x7c71debb;
printf("%u\n", number);
printf("%u\n", (int)*bytes);
return 0;
}
I'm getting:
2087837371
124
I did some reading and I tried using memcpy as suggested in various places:
#include <stdio.h>
#include <string.h>
int main()
{
int bytes[4] = { 0x7c, 0x71, 0xde, 0xbb };
int number = 0x7c71debb;
int frombytes;
memcpy(&frombytes, bytes, 4);
printf("%u\n", number);
printf("%u\n", frombytes);
return 0;
}
Still the same result:
2087837371
124
I mean, it's been like an hour if I got to be honest frustration is starting to get a hold of me.
It all started from me trying to do this:
if ((unsigned int)bytes == 0x7c71debb)
EDIT:
After switching bytes' type to char or uint8_t, here's what I'm getting:
#include <stdio.h>
#include <string.h>
#include <stdint.h>
int main()
{
uint8_t bytes[4] = { 0x7c, 0x71, 0xde, 0xbb };
int number = 0x7c71debb;
int frombytes;
memcpy(&frombytes, bytes, 4);
printf("%u\n", number);
printf("%u\n", (int)*bytes);
printf("%u\n", frombytes);
return 0;
}
Results:
2087837371
124
3151917436
You are making two assumptions:
You're assuming int is exactly 32 bits in size.
This might be correct, but it could be smaller or larger. You should use int32_t or uint32_t instead.
You're assuming a big-endian machine.
If you are using an x86 or x86_64, this is incorrect. These are little-endian architectures. The bytes are ordered from least-significant to most-significant.
The following code avoids those assumptions:
int32_t frombytes =
(uint32_t)bytes[0] << ( 8 * 3 ) |
(uint32_t)bytes[1] << ( 8 * 2 ) |
(uint32_t)bytes[2] << ( 8 * 1 ) |
(uint32_t)bytes[3] << ( 8 * 0 );
(It looks a bit expensive, but your compiler should optimize this.)
You can build the int using bitwise operators on the array values.
The below code, which assumes big-endian architecture and ints that are at least 4 bytes long, outputs:
2087837371
2087837371
#include <stdio.h>
int main()
{
int bytes[4] = { 0x7c, 0x71, 0xde, 0xbb };
int number = 0x7c71debb;
int number2 = bytes[3] | bytes[2] << 8 | bytes[1] << 16 | bytes[0] << 24;
printf("%u\n", number);
printf("%u\n", number2);
return 0;
}
I'm reading the values from a SD card in an ARM micro:
Res = f_read(&fil, (void*)buf, 6, &NumBytesRead);
where fil is a pointer, buf is a buffer where the data is stored.
And that's the problem: it's an array but I'd like to have the contents of that array in a single variable.
To give an actual example: the 6 bytes read from the file are:
buf[0] = 0x1B
buf[1] = 0x26
buf[2] = 0xB3
buf[3] = 0x54
buf[4] = 0xA1
buf[5] = 0xCF
And I'd like to have: uint64_t data be equal to 0x1B26B354A1CF. That is, all the elements of the array "concatenated" in one single 64 bit integer.
Without type punning you can do as below.
uint64_t data = 0;
for (int i=0; i<6; i++)
{
data <<= 8;
data |= (uint64_t) buf[i];
}
Use union but remember about the endianes.
union
{
uint8_t u8[8];
uint64_t u64;
}u64;
typedef union
{
uint8_t u8[8];
uint64_t u64;
}u64;
typedef enum
{
LITTLE_E,
BIG_E,
}ENDIANESS;
ENDIANESS checkEndianess(void)
{
ENDIANESS result = BIG_E;
u64 d64 = {.u64 = 0xff};
if(d64.u8[0]) result = LITTLE_E;
return result;
}
uint64_t arrayToU64(uint8_t *array, ENDIANESS e) // for the array BE
{
u64 d64;
if(e == LITTLE_E)
{
memmove(&d64, array, sizeof(d64.u64));
}
else
{
for(int index = sizeof(d64.u64) - 1; index >= 0; index--)
{
d64.u8[sizeof(d64.u64) - index - 1] = array[index];
}
}
return d64.u64;
}
int main()
{
uint8_t BIG_E_Array[] = {0x10,0x20,0x30,0x40,0x50,0x60,0x70,0x80};
ENDIANESS e;
printf("This system endianess: %s\n", (e = checkEndianess()) == BIG_E ? "BIG":"LITTLE");
printf("Punned uint64_t for our system 0x%lx\n", arrayToU64(BIG_E_Array, e));
printf("Punned uint64_t for the opposite endianess system 0x%lx\n", arrayToU64(BIG_E_Array, e == BIG_E ? LITTLE_E : BIG_E));
return 0;
}
To things to take care of here:
have the bytes be ordered correctly
read the six bytes into one 64bit integer
Issue 1 can be taken care of by storing the byte coming in in network byte order (Big Endian) into the 64 bit integer in host byte order by for example using the two marcos below:
/* below defines of htonll() and ntohll() are taken from this answer:
https://stackoverflow.com/a/28592202/694576
*/
#if __BIG_ENDIAN__
# define htonll(x) (x)
# define ntohll(x) (x)
#else
# define htonll(x) ((uint64_t)htonl((x) & 0xFFFFFFFF) << 32) | htonl((x) >> 32))
# define ntohll(x) ((uint64_t)ntohl((x) & 0xFFFFFFFF) << 32) | ntohl((x) >> 32))
#endif
Issue 2 can be solved in multiple ways:
Extending your approach
#define BUFFER_SIZE (6)
...
assert(BUFFER_SIZE <= sizeof (uint64_t));
uint8_t buffer[BUFFER_SIZE];
FILE * pf = ...; /* open file here */
/* test if file has been opened successfully here */
... result = f_read(pf, buffer, BUFFER_SIZE, ...);
/* test result for success */
uint64_t number = 0;
memset(&number, buffer, BUFFER_SIZE)
number = ntohll(number);
Use "Type Punning" by using a union
union buffer_wrapper
{
uint8_t u8[sizeof (uint64_t)];
uint64_t u64;
}
Instead of
uint8_t buffer[BUFFER_SIZE];
use
union buffer_wrapper buffer;
and instead of
memcpy(&number, buffer, BUFFER_SIZE)
number = ntohll(number)
use
number = ntohll(buffer.u64)
I consider how to make efficient XORing of 2 bytes arrays.
I have this bytes arrays defined as unsigned char *
I think that XORing them as uint64_t will be much faster. Is it true?
How efficiently convert unsigned char * to this uint64_t * preferably inside the XORing loop? How to make padding of last bytes if length of the bytes array % 8 isn't 0?
Here is my current code that XORs bytes array, but each byte (unsigned char) separately:
unsigned char *bitwise_xor(const unsigned char *A_Bytes_Array, const unsigned char *B_Bytes_Array, const size_t length) {
unsigned char *XOR_Bytes_Array;
// allocate XORed bytes array
XOR_Bytes_Array = malloc(sizeof(unsigned char) * length);
// perform bitwise XOR operation on bytes arrays A and B
for(int i=0; i < length; i++)
XOR_Bytes_Array[i] = (unsigned char)(A_Bytes_Array[i] ^ B_Bytes_Array[i]);
return XOR_Bytes_Array;
}
Ok, in the meantime I have tried to do it this way. My bytes_array are rather large (rgba bitmaps 4*1440*900?).
static uint64_t next64bitsFromBytesArray(const unsigned char *bytesArray, const int i) {
uint64_t next64bits = (uint64_t) bytesArray[i+7] | ((uint64_t) bytesArray[i+6] << 8) | ((uint64_t) bytesArray[i+5] << 16) | ((uint64_t) bytesArray[i+4] << 24) | ((uint64_t) bytesArray[i+3] << 32) | ((uint64_t) bytesArray[i+2] << 40) | ((uint64_t) bytesArray[i+1] << 48) | ((uint64_t)bytesArray[i] << 56);
return next64bits;
}
unsigned char *bitwise_xor64(const unsigned char *A_Bytes_Array, const unsigned char *B_Bytes_Array, const size_t length) {
unsigned char *XOR_Bytes_Array;
// allocate XORed bytes array
XOR_Bytes_Array = malloc(sizeof(unsigned char) * length);
// perform bitwise XOR operation on bytes arrays A and B using uint64_t
for(int i=0; i<length; i+=8) {
uint64_t A_Bytes = next64bitsFromBytesArray(A_Bytes_Array, i);
uint64_t B_Bytes = next64bitsFromBytesArray(B_Bytes_Array, i);
uint64_t XOR_Bytes = A_Bytes ^ B_Bytes;
memcpy(XOR_Bytes_Array + i, &XOR_Bytes, 8);
}
return XOR_Bytes_Array;
}
UPDATE: (2nd approach to this problem)
unsigned char *bitwise_xor64(const unsigned char *A_Bytes_Array, const unsigned char *B_Bytes_Array, const size_t length) {
const uint64_t *aBytes = (const uint64_t *) A_Bytes_Array;
const uint64_t *bBytes = (const uint64_t *) B_Bytes_Array;
unsigned char *xorBytes = malloc(sizeof(unsigned char)*length);
for(int i = 0, j=0; i < length; i +=8) {
uint64_t aXORbBytes = aBytes[j] ^ bBytes[j];
//printf("a XOR b = 0x%" PRIx64 "\n", aXORbBytes);
memcpy(xorBytes + i, &aXORbBytes, 8);
j++;
}
return xorBytes;
}
So I did an experiment:
#include <stdlib.h>
#include <stdint.h>
#ifndef TYPE
#define TYPE uint64_t
#endif
TYPE *
xor(const void *va, const void *vb, size_t l)
{
const TYPE *a = va;
const TYPE *b = vb;
TYPE *r = malloc(l);
size_t i;
for (i = 0; i < l / sizeof(TYPE); i++) {
*r++ = *a++ ^ *b++;
}
return r;
}
Compiled both for uint64_t and uint8_t with clang with basic optimizations. In both cases the compiler vectorized the hell out of this. The difference was that the uint8_t version had code to handle when l wasn't a multiple of 8. So if we add code to handle the size not being a multiple of 8, you'll probably end up with equivalent generated code. Also, the 64 bit version unrolled the loop a few times and had code to handle that, so for big enough arrays you might gain a few percent here. On the other hand, on big enough arrays you'll be memory-bound and the xor operation won't matter a bit.
Are you sure your compiler won't deal with this? This is a kind of micro-optimization that makes sense only when you're measuring things and then you wouldn't need to ask which one is faster, you'd know.
Now can I copy 4 bytes (known start position, NOT 0) in an array (char*data) to a DWORD?
There are two parts to this: first, getting the 4 bytes from the array at the specified position (4 elements, each element is a char which is 1 byte in C), and then moving them to a DWORD.
I'm not sure how to do either.
Thanks!
You can use memcpy (it requires string.h header):
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
int main(void)
{
uint8_t array[] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09
}
uint32_t dword;
size_t offset = 2;
memcpy(&dword, array + offset, sizeof(dword));
printf("Dword value = 0x%08x\n", dword);
return 0;
}
I think below code shoudl work
memcpy(destination, input+offset, howMany);
What you look for is deserialization. In this case, somewhere must be specified if the data in your char * are big endian (start with the highest byte) or little endian (start with the lowest byte). For portable applications, BE might be the favourite.
After you have done so, your function might look like
uintmax_t readarrayvalue_be(const char * data, uint8_t bytes)
{
uintmax_t value = 0;
while (bytes) {
value <<= 8;
value |= *(data++) & 0xFF;
bytes--;
}
return value;
}
while in the concrete case it might be restructured as
uint32_t readarrayvalue_u32_be(const char * data)
{
uint32_t value = ((data[0] & 0xFF) << 24) | ((data[1] & 0xFF) << 16) | ((data[2] & 0xFF) << 8) | (data[3] & 0xFF);
return value;
}
I have a couple uint8_t arrays in my c code, and I'd like to compare an arbitrary sequence bits from one with another. So for example, I have bitarray_1 and bitarray_2, and I'd like to compare bits 13 - 47 from bitarray_1 with bits 5-39 of bitarray_2. What is the most efficient way to do this?
Currently it's a huge bottleneck in my program, since I just have a naive implementation that copies the bits into the beginning of a new temporary array, and then uses memcmp on them.
three words: shift, mask and xor.
shift to get the same memory alignment for both bitarray. If not you will have to shift one of the arrays before comparing them. Your exemple is probably misleading because bits 13-47 and 5-39 have the same memory alignment on 8 bits addresses. This wouldn't be true if you were comparing say bits 14-48 with bits 5-39.
Once everything is aligned and exceeding bits cleared for table boundaries a xor is enough to perform the comparison of all the bits at once. Basically you can manage to do it with just one memory read for each array, which should be pretty efficient.
If memory alignment is the same for both arrays as in your example memcmp and special case for upper and lower bound is probably yet faster.
Also accessing array by uint32_t (or uint64_t on 64 bits architectures) should also be more efficient than accessing by uint8_t.
The principle is simple but as Andrejs said the implementation is not painless...
Here is how it goes (similarities with #caf proposal is no coincidence):
/* compare_bit_sequence() */
int compare_bit_sequence(uint8_t s1[], unsigned s1_off, uint8_t s2[], unsigned s2_off,
unsigned length)
{
const uint8_t mask_lo_bits[] =
{ 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t clear_lo_bits[] =
{ 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
uint8_t v1;
uint8_t * max_s1;
unsigned end;
uint8_t lsl;
uint8_t v1_mask;
int delta;
/* Makes sure the offsets are less than 8 bits */
s1 += s1_off >> 3;
s1_off &= 7;
s2 += s2_off >> 3;
s2_off &= 7;
/* Make sure s2 is the sequence with the shorter offset */
if (s2_off > s1_off){
uint8_t * tmp_s;
unsigned tmp_off;
tmp_s = s2; s2 = s1; s1 = tmp_s;
tmp_off = s2_off; s2_off = s1_off; s1_off = tmp_off;
}
delta = s1_off;
/* handle the beginning, s2 incomplete */
if (s2_off > 0){
delta = s1_off - s2_off;
v1 = delta
? (s1[0] >> delta | s1[1] << (8 - delta)) & clear_lo_bits[delta]
: s1[0];
if (length <= 8 - s2_off){
if ((v1 ^ *s2)
& clear_lo_bits[s2_off]
& mask_lo_bits[s2_off + length]){
return NOT_EQUAL;
}
else {
return EQUAL;
}
}
else{
if ((v1 ^ *s2) & clear_lo_bits[s2_off]){
return NOT_EQUAL;
}
length -= 8 - s2_off;
}
s1++;
s2++;
}
/* main loop, we test one group of 8 bits of v2 at each loop */
max_s1 = s1 + (length >> 3);
lsl = 8 - delta;
v1_mask = clear_lo_bits[delta];
while (s1 < max_s1)
{
if ((*s1 >> delta | (*++s1 << lsl & v1_mask)) ^ *s2++)
{
return NOT_EQUAL;
}
}
/* last group of bits v2 incomplete */
end = length & 7;
if (end && ((*s2 ^ *s1 >> delta) & mask_lo_bits[end]))
{
return NOT_EQUAL;
}
return EQUAL;
}
All possible optimisations are not yet used. One promising one would be to use larger chunks of data (64 bits or 32 bits at once instead of 8), you could also detect cases where offset are synchronised for both arrays and in such cases use a memcmp instead of the main loop, replace modulos % 8 by logical operators & 7, replace '/ 8' by '>> 3', etc., have to branches of code instead of swapping s1 and s2, etc, but the main purpose is achieved : only one memory read and not memory write for each array item hence most of the work can take place inside processor registers.
bits 13 - 47 of bitarray_1 are the same as bits 5 - 39 of bitarray_1 + 1.
Compare the first 3 bits (5 - 7) with a mask and the other bits (8 - 39) with memcmp().
Rather than shift and copy the bits, maybe representing them differently is faster. You have to measure.
/* code skeleton */
static char bitarray_1_bis[BIT_ARRAY_SIZE*8+1];
static char bitarray_2_bis[BIT_ARRAY_SIZE*8+1];
static const char *lookup_table[] = {
"00000000", "00000001", "00000010" /* ... */
/* 256 strings */
/* ... */ "11111111"
};
/* copy every bit of bitarray_1 to an element of bitarray_1_bis */
for (k = 0; k < BIT_ARRAY_SIZE; k++) {
strcpy(bitarray_1_bis + 8*k, lookup_table[bitarray_1[k]]);
strcpy(bitarray_2_bis + 8*k, lookup_table[bitarray_2[k]]);
}
memcmp(bitarray_1_bis + 13, bitarray_2_bis + 5, 47 - 13 + 1);
You can (and should) limit the copy to the minimum possible.
I have no idea if it's faster, but it wouldn't surprise me if it was. Again, you have to measure.
The easiest way to do this is to convert the more complex case into a simpler case, then solve the simpler case.
In the following code, do_compare() solves the simpler case (where the sequences are never offset by more than 7 bits, s1 is always offset as much or more than s2, and the length of the sequence is non-zero). The compare_bit_sequence() function then takes care of converting the harder case to the easier case, and calls do_compare() to do the work.
This just does a single-pass through the bit sequences, so hopefully that's an improvement on your copy-and-memcmp implementation.
#define NOT_EQUAL 0
#define EQUAL 1
/* do_compare()
*
* Does the actual comparison, but has some preconditions on parameters to
* simplify things:
*
* length > 0
* 8 > s1_off >= s2_off
*/
int do_compare(const uint8_t s1[], const unsigned s1_off, const uint8_t s2[],
const unsigned s2_off, const unsigned length)
{
const uint8_t mask_lo_bits[] =
{ 0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t mask_hi_bits[] =
{ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
const unsigned msb = (length + s1_off - 1) / 8;
const unsigned s2_shl = s1_off - s2_off;
const unsigned s2_shr = 8 - s2_shl;
unsigned n;
uint8_t s1s2_diff, lo_bits = 0;
for (n = 0; n <= msb; n++)
{
/* Shift s2 so it is aligned with s1, pulling in low bits from
* the high bits of the previous byte, and store in s1s2_diff */
s1s2_diff = lo_bits | (s2[n] << s2_shl);
/* Save the bits needed to fill in the low-order bits of the next
* byte. HERE BE DRAGONS - since s2_shr can be 8, this below line
* only works because uint8_t is promoted to int, and we know that
* the width of int is guaranteed to be >= 16. If you change this
* routine to work with a wider type than uint8_t, you will need
* to special-case this line so that if s2_shr is the width of the
* type, you get lo_bits = 0. Don't say you weren't warned. */
lo_bits = s2[n] >> s2_shr;
/* XOR with s1[n] to determine bits that differ between s1 and s2 */
s1s2_diff ^= s1[n];
/* Look only at differences in the high bits in the first byte */
if (n == 0)
s1s2_diff &= mask_hi_bits[8 - s1_off];
/* Look only at differences in the low bits of the last byte */
if (n == msb)
s1s2_diff &= mask_lo_bits[(length + s1_off) % 8];
if (s1s2_diff)
return NOT_EQUAL;
}
return EQUAL;
}
/* compare_bit_sequence()
*
* Adjusts the parameters to match the preconditions for do_compare(), then
* calls it to do the work.
*/
int compare_bit_sequence(const uint8_t s1[], unsigned s1_off,
const uint8_t s2[], unsigned s2_off, unsigned length)
{
/* Handle length zero */
if (length == 0)
return EQUAL;
/* Makes sure the offsets are less than 8 bits */
s1 += s1_off / 8;
s1_off %= 8;
s2 += s2_off / 8;
s2_off %= 8;
/* Make sure s2 is the sequence with the shorter offset */
if (s1_off >= s2_off)
return do_compare(s1, s1_off, s2, s2_off, length);
else
return do_compare(s2, s2_off, s1, s1_off, length);
}
To do the comparison in your example, you'd call:
compare_bit_sequence(bitarray_1, 13, bitarray_2, 5, 35)
(Note that I am numbering the bits from zero, and assuming that the bitarrays are laid out little-endian, so this will start the comparison from the sixth-least-significant bit in bitarray2[0], and the sixth-least-signifcant bit in bitarray1[1]).
What about writing the function that will calculate the offsets from both arrays, apply the mask, shift the bits and store the result to the int so you may compare them. If the bits count (34 in your example) exceeds the length of the int - recurse or loop.
Sorry, the example will be pain in the ass.
Here is my unoptimized bit sequence comparison function:
#include <stdio.h>
#include <stdint.h>
// 01234567 01234567
uint8_t bitsA[] = { 0b01000000, 0b00010000 };
uint8_t bitsB[] = { 0b10000000, 0b00100000 };
int bit( uint8_t *bits, size_t bitpoz, size_t len ){
return (bitpoz<len)? !!(bits[bitpoz/8]&(1<<(7-bitpoz%8))): 0;
}
int bitcmp( uint8_t *bitsA, size_t firstA, size_t lenA,
uint8_t *bitsB, size_t firstB, size_t lenB ){
int cmp;
for( size_t i=0; i<lenA || i<lenB; i++ ){
if( (cmp = bit(bitsA,firstA+i,firstA+lenA) -
bit(bitsB,firstB+i,firstB+lenB)) ) return cmp;
}
return 0;
}
int main(){
printf( "cmp: %i\n", bitcmp( bitsA,1,11, bitsB,0,11 ) );
}
EDIT: Here is my (untested) bitstring equality test function:
#include <stdlib.h>
#include <stdint.h>
#define load_64bit(bits,first) (*(uint64_t*)bits<<first | *(bits+8)>>(8-first))
#define load_32bit(bits,first) (*(uint32_t*)bits<<first | *(bits+4)>>(8-first))
#define load_16bit(bits,first) (*(uint16_t*)bits<<first | *(bits+2)>>(8-first))
#define load_8bit( bits,first) ( *bits<<first | *(bits+1)>>(8-first))
static inline uint8_t last_bits( uint8_t *bits, size_t first, size_t size ){
return (first+size>8?load_8bit(bits,first):*bits<<first)>>(8-size);
}
int biteq( uint8_t *bitsA, size_t firstA,
uint8_t *bitsB, size_t firstB, size_t size ){
if( !size ) return 1;
bitsA+=firstA/8; firstA%=8;
bitsB+=firstB/8; firstB%=8;
for(; size>64;size-=64,bitsA+=8,bitsB+=8)
if(load_64bit(bitsA,firstA)!=load_64bit(bitsB,firstB)) return 0;
for(; size>32;size-=32,bitsA+=4,bitsB+=4)
if(load_32bit(bitsA,firstA)!=load_32bit(bitsB,firstB)) return 0;
for(; size>16;size-=16,bitsA+=2,bitsB+=2)
if(load_16bit(bitsA,firstA)!=load_16bit(bitsB,firstB)) return 0;
for(; size> 8;size-= 8,bitsA++, bitsB++ )
if(load_8bit( bitsA,firstA)!=load_8bit( bitsB,firstB)) return 0;
return !size ||
last_bits(bitsA,firstA,size)==last_bits(bitsB,firstB,size);
}
I made a simple measurement tool to see how fast is it:
#include <unistd.h>
#include <stdio.h>
#include <signal.h>
#define SIZE 1000000
uint8_t bitsC[SIZE];
volatile int end_loop;
void sigalrm_hnd( int sig ){ (void)sig; end_loop=1; }
int main(){
uint64_t loop_count; int cmp;
signal(SIGALRM,sigalrm_hnd);
loop_count=0; end_loop=0; alarm(10);
while( !end_loop ){
for( int i=1; i<7; i++ ){
loop_count++;
cmp = biteq( bitsC,i, bitsC,7-i,(SIZE-1)*8 );
if( !cmp ){ printf( "cmp: %i (==0)\n", cmp ); return -1; }
}
}
printf( "biteq: %.2f round/sec\n", loop_count/10.0 );
}
Result:
bitcmp: 8.40 round/sec
biteq: 363.60 round/sec
EDIT2: last_bits() changed.