Slice up an uint8_t array

Slice up an uint8_t array - c

Let's say that I have an array of 16 uint8_t as follows:
uint8_t array[] = {0x13, 0x01, 0x4E, 0x52, 0x31, 0x4A, 0x35, 0x36, 0x4C, 0x11, 0x21, 0xC6, 0x3C, 0x73, 0xC2, 0x41};
This array stores the data contained in a 128 bits register of an external peripheral. Some of the information it represents are stored on 2, 3, 8, 12 bits ... and so on.
What is the best and elegant way to slice it up and bit mask the information I need? (The problem is that some things that I need overlaps the length of one cell of the array)
If that can help, this snippet I wrote converts the whole array into a char* string. But casting this into an int is not option because.. well 16 bytes.
int i;
char str[33];
for(i = 0; i < sizeof(array) / sizeof(*array) ; i++) {
sprintf(str+2*i,"%02hX",array[i]);
}
puts(str);
13014E52314A35364C1121C63C73C241

Actually such problem also occures when trying to parse all kind of bitstreams, like video or image files or compressed data by algorithms like LZ*. So the approach used there is to implement a bitstream reader.
But in your case the bit sequence is fixed length and quite short, so one way is to manually check the field values using bitwise operations.
Or you can use this function that I just wrote, which can extract arbitrary number of bits from a uint8 array, starting from desired bit position:
uint32_t extract_bits(uint8_t *arr, unsigned int bit_index, unsigned int bit_count)
{
/* Assert that we are not requested to extract more than 32 bits */
uint32_t result = 0;
assert(bit_count <= sizeof(result)*8 && arr != NULL);
/* You can additionally check if you are trying to extract bits exceeding the 16 byte range */
assert(bit_index + bit_count <= 16 * 8);
unsigned int arr_id = bit_index / 8;
unsigned int bit_offset = bit_index % 8;
if (bit_offset > 0) {
/* Extract first 'unaligned_bit_count' bits, which happen to be non-byte-aligned.
* When we do extract those bits, the remaining will be byte-aligned so
* we will thread them in different manner.
*/
unsigned int unaligned_bit_count = 8 - bit_offset;
/* Check if we need less than the remaining unaligned bits */
if (bit_count < unaligned_bit_count) {
result = (arr[arr_id] >> bit_offset) & ((1 << bit_count) - 1);
return result;
}
/* We need them all */
result = arr[arr_id] >> bit_offset;
bit_count -= unaligned_bit_count;
/* Move to next byte element */
arr_id++;
}
while (bit_count > 0) {
/* Try to extract up to 8 bits per iteration */
int bits_to_extract = bit_count > 8 ? 8 : bit_count;
if (bits_to_extract < 8) {
result = (result << bits_to_extract) | (arr[arr_id] & ((1 << bits_to_extract)-1));
}else {
result = (result << bits_to_extract) | arr[arr_id];
}
bit_count -= bits_to_extract;
arr_id++;
}
return result;
}
Here is example of how it is used.
uint32_t r;
/* Extracts bits [7..8] and places them as most significant bits of 'r' */
r = extract_bits(arr, 7, 2)
/* Extracts bits [4..35] and places them as most significant bits of 'r' */
r = extract_bits(arr, 4, 32);
/* Visualize */
printf("slice=%x\n", r);
And then the visualisation of r is up to you. They can either be represented as hex dwords, characters, or however you decide.

Related

How to concatenate the hexadecimal data in an array in C

I have my data field as follows DATA = 0x02 0x01 0x02 0x03 0x04 0x05 0x06 0x07 Now I want to concatenate this data as follows DATA = 0x01020304050607. How can I do it using C program. I found a program in C for concatenation of data in an array and the program is as follows:
#include<stdio.h>
int main(void)
{
int num[3]={1, 2, 3}, n1, n2, new_num;
n1 = num[0] * 100;
n2 = num[1] * 10;
new_num = n1 + n2 + num[2];
printf("%d \n", new_num);
return 0;
}
For the hexadecimal data in the array how can I manipulate the above program to concatenate the hexadecimal data?

You need a 64 bit variable num as result, instead of 10 as factor you need 16, and instead of 100 as factor, you need 256.
But if your data is provided as an array of bytes, then you can simply insert complete bytes, i.e. repeatedly shifting by 8 bits (meaning a factor of 256):
int main(void)
{
uint8_t data[8] = { 0x02, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
unsigned long long num = 0;
for (int i=0; i<8; i++) {
num <<=8; // shift by a complete byte, equal to num *= 256
num |= data[i]; // write the respective byte
}
printf("num is %016llx\n",num);
return 0;
}
Output:
num is 0201020304050607

Lest say you have input like
int DATA[8] = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07};
If you want output like 0x0001020304050607, to store this resultant output you need one variable of unsigned long long type. For e.g
int main(void) {
int DATA[8] = {0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07};
int ele = sizeof(DATA)/sizeof(DATA[0]);
unsigned long long mask = 0x00;
for(int row = 0; row < ele; row++) {
mask = mask << 8;/* every time left shifted by 8(0x01-> 0000 0001) times */
mask = DATA[row] | mask; /* put at correct location */
}
printf("%016llx\n",mask);
return 0;
}

Here's some kind of hack that writes your data directly into an integer, without any bitwise operators:
#include <stdio.h>
#include <stdint.h>
#include <string.h>
uint64_t numberize(const uint8_t from[8]) {
uint64_t r = 0;
uint8_t *p = &r;
#if '01' == 0x4849 // big endian
memcpy(p, from, 8);
#else // little endian
for (int i=7; i >= 0; --i)
*p++ = from[i];
#endif
return r;
}
int main() {
const uint8_t data[8] = { 0x02, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
printf("result is %016llx\n", numberize(data));
return 0;
}
This does work and outputs this independently of the endianness of your machine:
result is 0201020304050607
The compile-time endianness test was taken from this SO answer.

CRC32 calculation with CRC hash at the beginning of the message in C

I need to calculate CRC of the message and put it at the beginning of this message, so that the final CRC of the message with 'prepended' patch bytes equals 0. I was able to do this very easily with the help of few articles, but not for my specific parameters. The thing is that I have to use a given CRC32 algorithm which calculates the CRC of the memory block, but I don't have that 'reverse' algorithm that calculates those 4 patch bytes/'kind of CRC'. Parameters of the given CRC32 algorithm are:
Polynomial: 0x04C11DB7
Endianess: big-endian
Initial value: 0xFFFFFFFF
Reflected: false
XOR out with: 0L
Test stream: 0x0123, 0x4567, 0x89AB, 0xCDEF results in CRC = 0x612793C3
The code to calculate the CRC (half-byte, table-driven, I hope data type definitions are self-explanatory):
uint32 crc32tab(uint16* data, uint32 len, uint32 crc)
{
uint8 nibble;
int i;
while(len--)
{
for(i = 3; i >= 0; i--)
{
nibble = (*data >> i*4) & 0x0F;
crc = ((crc << 4) | nibble) ^ tab[crc >> 28];
}
data++;
}
return crc;
}
The table needed is (I thougth the short [16] table should contain every 16th element from the large [256] table, but this table contains actually first 16 elements, but that's how it was provided to me):
static const uint32 tab[16]=
{
0x00000000, 0x04C11DB7, 0x09823B6E, 0x0D4326D9,
0x130476DC, 0x17C56B6B, 0x1A864DB2, 0x1E475005,
0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, 0x2B4BCB61,
0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD
};
I modified the code so it's not so long, but the functionality stays the same. The problem is that this forward CRC calculation looks more like backward/reverse CRC calc.
I've spent almost a week trying to find out the correct polynomial/algorithm/table combination, but with no luck. If it helps, I came up with bit-wise algorithm that corresponds to table-driven code above, although that was not so hard after all:
uint32 crc32(uint16* data, uint32 len, uint32 crc)
{
uint32 i;
while(len--)
{
for(i = 0; i < 16; i++)
{
// #define POLY 0x04C11DB7
crc = (crc << 1) ^ (((crc ^ *data) & 0x80000000) ? POLY : 0);
}
crc ^= *data++;
}
return crc;
}
Here are expected results - first 2 16-bit words make the needed unknown CRC and the rest is the known data itself (by feeding these examples to provided algorithm, the result is 0).
{0x3288, 0xD244, 0xCDEF, 0x89AB, 0x4567, 0x0123}
{0xC704, 0xDD7B, 0x0000} - append as many zeros as you like, the result is the same
{0xCEBD, 0x1ADD, 0xFFFF}
{0x81AB, 0xB932, 0xFFFF, 0xFFFF}
{0x0857, 0x0465, 0x0000, 0x0123}
{0x1583, 0xD959, 0x0123}
^ ^
| |
unknown bytes that I need to calculate
I think testing this on 0xFFFF or 0x0000 words is convenient because the direction of calculation and endianess is not important (I hope :D). So be careful to use other test bytes, because the direction of calculation is quite devious :D. Also you can see that by feeding only zeros to the algorithm (both forward and backward), the result is so-called residue (0xC704DD7B), that may be helpful.
So...I wrote at least 10 different functions (bite-wise, tables, combination of polynomials etc.) trying to solve this, but with no luck. I give you here the function in which I put my hopes into. It's 'reversed' algorithm of the table-driven one above, with different table of course. The problem is that the only correct CRC I get from that is with all 0s message and that's not so unexpected. Also I have written the reversed implementation of the bit-wise algorithm (reversed shifts, etc.), but that one returns only the first byte correctly.
Here is the table-driven one, pointer to data should point to the last element of the message and crc input should be the requested crc (0s for the whole message or you can maybe take another approach - that the last 4 bytes of message are the CRC you are looking for: Calculating CRC initial value instead of appending the CRC to payload) :
uint32 crc32tabrev(uint16* data, uint32 len, uint32 crc)
{
uint8 nibble;
int i;
while(len--)
{
for(i = 0; i < 4; i++)
{
nibble = (*data >> i*4) & 0x0F;
crc = (crc >> 4) ^ revtab[((crc ^ nibble) & 0x0F)];
}
data--;
}
return reverse(crc); //reverse() flips all bits around center (MSB <-> LSB ...)
}
The table, which I hope is 'the chosen one':
static const uint32 revtab[16]=
{
0x00000000, 0x1DB71064, 0x3B6E20C8, 0x26D930AC,
0x76DC4190, 0x6B6B51F4, 0x4DB26158, 0x5005713C,
0xEDB88320, 0xF00F9344, 0xD6D6A3E8, 0xCB61B38C,
0x9B64C2B0, 0x86D3D2D4, 0xA00AE278, 0xBDBDF21C
};
As you can see, this algorithm has some perks which make me run in circles and I think I'm maybe on the right track, but I'm missing something. I hope an extra pair of eyes will see what I can not. I'm sorry for the long post (no potato :D), but I think all of that explanation was neccessary. Thank you in advance for insight or advice.

I will answer for your CRC specification, that of a CRC-32/MPEG-2. I will have to ignore your attempts at calculating that CRC, since they are incorrect.
Anyway, to answer your question, I happen to have written a program that solves this problem. It is called spoof.c. It very rapidly computes what bits to change in a message to get a desired CRC. It does this in order log(n) time, where n is the length of the message. Here is an example:
Let's take the nine-byte message 123456789 (those digits represented in ASCII). We will prepend it with four zero bytes, which we will change to get the desired CRC at the end. The message in hex is then: 00 00 00 00 31 32 33 34 35 36 37 38 39. Now we compute the CRC-32/MPEG-2 for that message. We get 373c5870.
Now we run spoof with this input, which is the CRC length in bits, the fact that it is not reflected, the polynomial, the CRC we just computed, the length of the message in bytes, and all 32 bit locations in the first four bytes (which is what we are allowing spoof to change):
32 0 04C11DB7
373c5870 13
0 0 1 2 3 4 5 6 7
1 0 1 2 3 4 5 6 7
2 0 1 2 3 4 5 6 7
3 0 1 2 3 4 5 6 7
It gives this output with what bits in those first four bytes to set:
invert these bits in the sequence:
offset bit
0 1
0 2
0 4
0 5
0 6
1 0
1 2
1 5
1 7
2 0
2 2
2 5
2 6
2 7
3 0
3 1
3 2
3 4
3 5
3 7
We then set the first four bytes to: 76 a5 e5 b7. We then test by computing the CRC-32/MPEG-2 of the message 76 a5 e5 b7 31 32 33 34 35 36 37 38 39 and we get 00000000, the desired result.
You can adapt spoof.c to your application.
Here is an example that correctly computes the CRC-32/MPEG-2 on a stream of bytes using a bit-wise algorithm:
uint32_t crc32m(uint32_t crc, const unsigned char *buf, size_t len)
{
int k;
while (len--) {
crc ^= (uint32_t)(*buf++) << 24;
for (k = 0; k < 8; k++)
crc = crc & 0x80000000 ? (crc << 1) ^ 0x04c11db7 : crc << 1;
}
return crc;
}
and with a nybble-wise algorithm using the table in the question (which is correct):
uint32_t crc_table[] = {
0x00000000, 0x04C11DB7, 0x09823B6E, 0x0D4326D9,
0x130476DC, 0x17C56B6B, 0x1A864DB2, 0x1E475005,
0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, 0x2B4BCB61,
0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD
};
uint32_t crc32m_nyb(uint32_t crc, const unsigned char *buf, size_t len)
{
while (len--) {
crc ^= (uint32_t)(*buf++) << 24;
crc = (crc << 4) ^ crc_table[crc >> 28];
crc = (crc << 4) ^ crc_table[crc >> 28];
}
return crc;
}
In both cases, the initial CRC must be 0xffffffff.

Alternate approach. Assumes xorout = 0, if not, then after calculating the normal crc, then crc ^= xorout to remove it. The method here multiplies the normal crc by (1/2)%(crc polynomial) raised to (message size in bits) power % (crc polynomial) equivalent to cycling it backwards. If the message size is fixed, then the mapping is fixed and time complexity is O(1). Otherwise, it's O(log(n)).
This example code uses Visual Studio and an intrinsic for carryless multiply (PCLMULQDQ), which uses XMM (128 bit) registers. Visual Studio uses __m128i type to represent integer XMM values.
#include <stdio.h>
#include <stdlib.h>
#include <intrin.h>
typedef unsigned char uint8_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
#define POLY (0x104c11db7ull)
#define POLYM ( 0x04c11db7u)
static uint32_t crctbl[256];
static __m128i poly; /* poly */
static __m128i invpoly; /* 2^64 / POLY */
void GenMPoly(void) /* generate __m128i poly info */
{
uint64_t N = 0x100000000ull;
uint64_t Q = 0;
for(size_t i = 0; i < 33; i++){
Q <<= 1;
if(N&0x100000000ull){
Q |= 1;
N ^= POLY;
}
N <<= 1;
}
poly.m128i_u64[0] = POLY;
invpoly.m128i_u64[0] = Q;
}
void GenTbl(void) /* generate crc table */
{
uint32_t crc;
uint32_t c;
uint32_t i;
for(c = 0; c < 0x100; c++){
crc = c<<24;
for(i = 0; i < 8; i++)
/* assumes twos complement */
crc = (crc<<1)^((0-(crc>>31))&POLYM);
crctbl[c] = crc;
}
}
uint32_t GenCrc(uint8_t * bfr, size_t size) /* generate crc */
{
uint32_t crc = 0xffffffffu;
while(size--)
crc = (crc<<8)^crctbl[(crc>>24)^*bfr++];
return(crc);
}
/* carryless multiply modulo poly */
uint32_t MpyModPoly(uint32_t a, uint32_t b) /* (a*b)%poly */
{
__m128i ma, mb, mp, mt;
ma.m128i_u64[0] = a;
mb.m128i_u64[0] = b;
mp = _mm_clmulepi64_si128(ma, mb, 0x00); /* p[0] = a*b */
mt = _mm_clmulepi64_si128(mp, invpoly, 0x00); /* t[1] = (p[0]*((2^64)/POLY))>>64 */
mt = _mm_clmulepi64_si128(mt, poly, 0x01); /* t[0] = t[1]*POLY */
return mp.m128i_u32[0] ^ mt.m128i_u32[0]; /* ret = p[0] ^ t[0] */
}
/* exponentiate by repeated squaring modulo poly */
uint32_t PowModPoly(uint32_t a, uint32_t b) /* pow(a,b)%poly */
{
uint32_t prd = 0x1u; /* current product */
uint32_t sqr = a; /* current square */
while(b){
if(b&1)
prd = MpyModPoly(prd, sqr);
sqr = MpyModPoly(sqr, sqr);
b >>= 1;
}
return prd;
}
int main()
{
uint32_t inv; /* 1/2 % poly, constant */
uint32_t fix; /* fix value, constant if msg size fixed */
uint32_t crc; /* crc at end of msg */
uint32_t pre; /* prefix for msg */
uint8_t msg[13] = {0x00,0x00,0x00,0x00,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39};
GenMPoly(); /* generate __m128i polys */
GenTbl(); /* generate crc table */
inv = PowModPoly(2, 0xfffffffeu); /* inv = 2^(2^32-2) % Poly = 1/2 % poly */
fix = PowModPoly(inv, 8*sizeof(msg)); /* fix value */
crc = GenCrc(msg, sizeof(msg)); /* calculate normal crc */
pre = MpyModPoly(fix, crc); /* convert to prefix */
printf("crc = %08x pre = %08x ", crc, pre);
msg[0] = (uint8_t)(pre>>24); /* store prefix in msg */
msg[1] = (uint8_t)(pre>>16);
msg[2] = (uint8_t)(pre>> 8);
msg[3] = (uint8_t)(pre>> 0);
crc = GenCrc(msg, sizeof(msg)); /* check result */
if(crc == 0)
printf("passed\n");
else
printf("failed\n");
return 0;
}

Well, few hours after my question, someone whose name I don't remember posted an answer to my question which turned out to be correct. Somehow this answer got completely deleted, I don't know why or who did it, but I'd like to thank to this person and in the case you will see this, please post your answer again and I'll delete this one. But for other users, here's his answer that worked for me, thank you again, mysterious one (unfortunately, I can't replicate his notes and suggestions well enough, just the code itself):
Edit: The original answer came from user samgak, so this stays here until he'll post his answer.
The reverse CRC algorithm:
uint32 revcrc32(uint16* data, uint32 len, uint32 crc)
{
uint32 i;
data += len - 1;
while(len--)
{
crc ^= *data--;
for(i = 0; i < 16; i++)
{
uint32 crc1 = ((crc ^ POLY) >> 1) | 0x80000000;
uint32 crc2 = crc >> 1;
if(((crc1 << 1) ^ (((crc1 ^ *data) & 0x80000000) ? POLY : 0)) == crc)
crc = crc1;
else if(((crc2 << 1) ^ (((crc2 ^ *data) & 0x80000000) ? POLY : 0)) == crc)
crc = crc2;
}
}
return crc;
}
Find patch bytes:
#define CRC_OF_ZERO 0xb7647d
void bruteforcecrc32(uint32 targetcrc)
{
// compute prefixes:
uint16 j;
for(j = 0; j <= 0xffff; j++)
{
uint32 crc = revcrc32(&j, 1, targetcrc);
if((crc >> 16) == (CRC_OF_ZERO >> 16))
{
printf("prefixes: %04lX %04lX\n", (crc ^ CRC_OF_ZERO) & 0xffff, (uint32)j);
return;
}
}
}
Usage:
uint16 test[] = {0x0123, 0x4567, 0x89AB, 0xCDEF}; // prefix should be 0x0CD8236A
bruteforcecrc32(revcrc32(test, 4, 0L));

Efficient algorithm for finding a byte in a bit array

Given a bytearray uint8_t data[N] what is an efficient method to find a byte uint8_t search within it even if search is not octet aligned? i.e. the first three bits of search could be in data[i] and the next 5 bits in data[i+1].
My current method involves creating a bool get_bit(const uint8_t* src, struct internal_state* state) function (struct internal_state contains a mask that is bitshifted right, &ed with src and returned, maintaining size_t src_index < size_t src_len) , leftshifting the returned bits into a uint8_t my_register and comparing it with search every time, and using state->src_index and state->src_mask to get the position of the matched byte.
Is there a better method for this?

If you're searching an eight bit pattern within a large array you can implement a sliding window over 16 bit values to check if the searched pattern is part of the two bytes forming that 16 bit value.
To be portable you have to take care of endianness issues which is done by my implementation by building the 16 bit value to search for the pattern manually. The high byte is always the currently iterated byte and the low byte is the following byte. If you do a simple conversion like value = *((unsigned short *)pData) you will run into trouble on x86 processors...
Once value, cmp and mask are setup cmp and mask are shifted. If the pattern was not found within hi high byte the loop continues by checking the next byte as start byte.
Here is my implementation including some debug printouts (the function returns the bit position or -1 if pattern was not found):
int findPattern(unsigned char *data, int size, unsigned char pattern)
{
int result = -1;
unsigned char *pData;
unsigned char *pEnd;
unsigned short value;
unsigned short mask;
unsigned short cmp;
int tmpResult;
if ((data != NULL) && (size > 0))
{
pData = data;
pEnd = data + size;
while ((pData < pEnd) && (result == -1))
{
printf("\n\npData = {%02x, %02x, ...};\n", pData[0], pData[1]);
if ((pData + 1) < pEnd) /* still at least two bytes to check? */
{
tmpResult = (int)(pData - data) * 8; /* calculate bit offset according to current byte */
/* avoid endianness troubles by "manually" building value! */
value = *pData << 8;
pData++;
value += *pData;
/* create a sliding window to check if search patter is within value */
cmp = pattern << 8;
mask = 0xFF00;
while (mask > 0x00FF) /* the low byte is checked within next iteration! */
{
printf("cmp = %04x, mask = %04x, tmpResult = %d\n", cmp, mask, tmpResult);
if ((value & mask) == cmp)
{
result = tmpResult;
break;
}
tmpResult++; /* count bits! */
mask >>= 1;
cmp >>= 1;
}
}
else
{
/* only one chance left if there is only one byte left to check! */
if (*pData == pattern)
{
result = (int)(pData - data) * 8;
}
pData++;
}
}
}
return (result);
}

I don't think you can do much better than this in C:
/*
* Searches for the 8-bit pattern represented by 'needle' in the bit array
* represented by 'haystack'.
*
* Returns the index *in bits* of the first appearance of 'needle', or
* -1 if 'needle' is not found.
*/
int search(uint8_t needle, int num_bytes, uint8_t haystack[num_bytes]) {
if (num_bytes > 0) {
uint16_t window = haystack[0];
if (window == needle) return 0;
for (int i = 1; i < num_bytes; i += 1) {
window = window << 8 + haystack[i];
/* Candidate for unrolling: */
for (int j = 7; j >= 0; j -= 1) {
if ((window >> j) & 0xff == needle) {
return 8 * i - j;
}
}
}
}
return -1;
}
The main idea is to handle the 87.5% of cases that cross the boundary between consecutive bytes by pairing bytes in a wider data type (uint16_t in this case). You could adjust it to use an even wider data type, but I'm not sure that would gain anything.
What you cannot safely or easily do is anything involving casting part or all of your array to a wider integer type via a pointer (i.e. (uint16_t *)&haystack[i]). You cannot be ensured of proper alignment for such a cast, nor of the byte order with which the result might be interpreted.

I don't know if it would be better, but i would use sliding window.
uint counter = 0, feeder = 8;
uint window = data[0];
while (search ^ (window & 0xff)){
window >>= 1;
feeder--;
if (feeder < 8){
counter++;
if (counter >= data.length) {
feeder = 0;
break;
}
window |= data[counter] << feeder;
feeder += 8;
}
}
//Returns index of first bit of first sequence occurrence or -1 if sequence is not found
return (feeder > 0) ? (counter+1)*8-feeder : -1;
Also with some alterations you can use this method to search for arbitrary length (1 to 64-array_element_size_in_bits) bits sequence.

If AVX2 is acceptable (with earlier versions it didn't work out so well, but you can still do something there), you can search in a lot of places at the same time. I couldn't test this on my machine (only compile) so the following is more to give to you an idea of how it could be approached than copy&paste code, so I'll try to explain it rather than just code-dump.
The main idea is to read an uint64_t, shift it right by all values that make sense (0 through 7), then for each of those 8 new uint64_t's, test whether the byte is in there. Small complication: for the uint64_t's shifted by more than 0, the highest position should not be counted since it has zeroes shifted into it that might not be in the actual data. Once this is done, the next uint64_t should be read at an offset of 7 from the current one, otherwise there is a boundary that is not checked across. That's fine though, unaligned loads aren't so bad anymore, especially if they're not wide.
So now for some (untested, and incomplete, see below) code,
__m256i needle = _mm256_set1_epi8(find);
size_t i;
for (i = 0; i < n - 6; i += 7) {
// unaligned load here, but that's OK
uint64_t d = *(uint64_t*)(data + i);
__m256i x = _mm256_set1_epi64x(d);
__m256i low = _mm256_srlv_epi64(x, _mm256_set_epi64x(3, 2, 1, 0));
__m256i high = _mm256_srlv_epi64(x, _mm256_set_epi64x(7, 6, 5, 4));
low = _mm256_cmpeq_epi8(low, needle);
high = _mm256_cmpeq_epi8(high, needle);
// in the qword right-shifted by 0, all positions are valid
// otherwise, the top position corresponds to an incomplete byte
uint32_t lowmask = 0x7f7f7fffu & _mm256_movemask_epi8(low);
uint32_t highmask = 0x7f7f7f7fu & _mm256_movemask_epi8(high);
uint64_t mask = lowmask | ((uint64_t)highmask << 32);
if (mask) {
int bitindex = __builtin_ffsl(mask);
// the bit-index and byte-index are swapped
return 8 * (i + (bitindex & 7)) + (bitindex >> 3);
}
}
The funny "bit-index and byte-index are swapped" thing is because searching within a qword is done byte by byte and the results of those comparisons end up in 8 adjacent bits, while the search for "shifted by 1" ends up in the next 8 bits and so on. So in the resulting masks, the index of the byte that contains the 1 is a bit-offset, but the bit-index within that byte is actually the byte-offset, for example 0x8000 would correspond to finding the byte at the 7th byte of the qword that was right-shifted by 1, so the actual index is 8*7+1.
There is also the issue of the "tail", the part of the data left over when all blocks of 7 bytes have been processed. It can be done much the same way, but now more positions contain bogus bytes. Now n - i bytes are left over, so the mask has to have n - i bits set in the lowest byte, and one fewer for all other bytes (for the same reason as earlier, the other positions have zeroes shifted in). Also, if there is exactly 1 byte "left", it isn't really left because it would have been tested already, but that doesn't really matter. I'll assume the data is sufficiently padded that accessing out of bounds doesn't matter. Here it is, untested:
if (i < n - 1) {
// make n-i-1 bits, then copy them to every byte
uint32_t validh = ((1u << (n - i - 1)) - 1) * 0x01010101;
// the lowest position has an extra valid bit, set lowest zero
uint32_t validl = (validh + 1) | validh;
uint64_t d = *(uint64_t*)(data + i);
__m256i x = _mm256_set1_epi64x(d);
__m256i low = _mm256_srlv_epi64(x, _mm256_set_epi64x(3, 2, 1, 0));
__m256i high = _mm256_srlv_epi64(x, _mm256_set_epi64x(7, 6, 5, 4));
low = _mm256_cmpeq_epi8(low, needle);
high = _mm256_cmpeq_epi8(high, needle);
uint32_t lowmask = validl & _mm256_movemask_epi8(low);
uint32_t highmask = validh & _mm256_movemask_epi8(high);
uint64_t mask = lowmask | ((uint64_t)highmask << 32);
if (mask) {
int bitindex = __builtin_ffsl(mask);
return 8 * (i + (bitindex & 7)) + (bitindex >> 3);
}
}

If you are searching a large amount of memory and can afford an expensive setup, another approach is to use a 64K lookup table. For each possible 16-bit value, the table stores a byte containing the bit shift offset at which the matching octet occurs (+1, so 0 can indicate no match). You can initialize it like this:
uint8_t* g_pLookupTable = malloc(65536);
void initLUT(uint8_t octet)
{
memset(g_pLookupTable, 0, 65536); // zero out
for(int i = 0; i < 65536; i++)
{
for(int j = 7; j >= 0; j--)
{
if(((i >> j) & 255) == octet)
{
g_pLookupTable[i] = j + 1;
break;
}
}
}
}
Note that the case where the value is shifted 8 bits is not included (the reason will be obvious in a minute).
Then you can scan through your array of bytes like this:
int findByteMatch(uint8_t* pArray, uint8_t octet, int length)
{
if(length >= 0)
{
uint16_t index = (uint16_t)pArray[0];
if(index == octet)
return 0;
for(int bit, i = 1; i < length; i++)
{
index = (index << 8) | pArray[i];
if(bit = g_pLookupTable[index])
return (i * 8) - (bit - 1);
}
}
return -1;
}
Further optimization:
Read 32 or however many bits at a time from pArray into a uint32_t and then shift and AND each to get byte one at a time, OR with index and test, before reading another 4.
Pack the LUT into 32K by storing a nybble for each index. This might help it squeeze into the cache on some systems.
It will depend on your memory architecture whether this is faster than an unrolled loop that doesn't use a lookup table.

byte swapping/reversing a character array

I am looking for a method to reverse the bytes in a character array.I also need to reverse the individual bits of the bytes being swapped before positioning them in the right place.
for example say I have a char arr[1000] whose arr[0] = 00100011 and arr[999] = 11010110, I want to swap arr[0] and arr[999] and in addition reverse the bits in each of them. so the output would be arr[999]= 11000100 ( reversed bits of arr[0]) and arr[0] = 01101011 (reversed bits of arr[999]).
I have some code to do the bit reversal inside a byte :
static char reverseByte(char val)
{
char result = 0;
int counter = 8;
while (counter-- < 0)
{
result <<= 1;
result |= (char)(val & 1);
val = (char)(val >> 1);
}
return result;
}
But this would mean running an outer loop to do the byte swap and then running the above small loop for each byte inside i.e 1000 in the above case. Is this the right approach ? Is there a better way to achieve this ?
Any help would be greatly appreciated.

How about this?:
#include <stdio.h>
#include <limits.h>
#if CHAR_BIT != 8
#error char is expected to be 8 bits
#endif
unsigned char RevByte(unsigned char b)
{
static const unsigned char t[16] =
{
0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF
};
return t[b >> 4] | (t[b & 0xF] << 4);
}
void RevBytes(unsigned char* b, size_t c)
{
size_t i;
for (i = 0; i < c / 2; i++)
{
unsigned char t = b[i];
b[i] = RevByte(b[c - 1 - i]);
b[c - 1 - i] = RevByte(t);
}
if (c & 1)
b[c / 2] = RevByte(b[c / 2]);
}
int main(void)
{
int i;
unsigned char buf[16] =
{
0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF
};
RevBytes(buf, 16);
for (i = 0; i < 16; i++)
printf("0x%02X ", buf[i]);
puts("");
return 0;
}
Output (ideone):
0xF0 0xE0 0xD0 0xC0 0xB0 0xA0 0x90 0x80 0x70 0x60 0x50 0x40 0x30 0x20 0x10 0x00

Reversing the elements of a collection is an old trick - you swap first and last, then first+1 and last-1, then... until first+i is equal to or further along the collection than last-j.
All you have to add onto that is
-flipping the bits of the two entries before swapping them
-if you have one element left in the middle, flip its bits on the spot

To do this - assuming your bit reversal method is correct - if you can spare the extra char in storage, you can simply traverse the bytes. Consider the following method:
static void swapReverseBytes(char* arr, size_t len)
{
int i = 0;
char tmp = 0;
if(arr == NULL || len < 1)
return;
if(len == 1) {
arr[0] = reverseByte(arr[0]);
return;
}
for(i = 0 ; i < (len / 2) ; ++i) {
tmp = arr[len - i - 1];
arr[len - i - 1] = reverseByte(arr[i]);
arr[i] = reverseByte(tmp);
}
}
This is a rough sketch (should compile, I think) and, assuming I have no off-by-one errors, this should work. As mentioned earlier, it is probably fastest to reverse the bits using a LUT, but byte swapping will be relatively similar since you need to actually move each byte unless you are keeping some state about the traversal order of the array. If that is the case, it is quite possible to simply use some state (i.e. a flag) to determine whether the array should be traversed normally (1...n) or in reverse order (n...1). In any event, the swapping is "free" in the Big-O, but in practice may (not necessarily) show a performance impact (since there will be no "actual" swapping of the byte order). Note, this would not work if the user expects an array which is sorted - this trick only works in the event that this is internal (or you have some sort of encapsulation, say, as in a C++ class).

To do this - assuming your bit reversal method is correct - if you can spare the extra char in storage, you can simply traverse the bytes. Consider the following method:
static void swapReverseBytes(char* arr, size_t len)
{
int i = 0;
char tmp = 0;
if(arr == NULL || len < 1)
return;
if(len == 1) {
arr[0] = reverseByte(arr[0]);
return;
}
for(i = 0 ; i < (len / 2) ; ++i) {
tmp = arr[len - i - 1];
arr[len - i - 1] = reverseByte(arr[i]);
arr[i] = reverseByte(tmp);
}
}
This is a rough sketch (should compile, I think) and, assuming I have no off-by-one errors, this should work. As mentioned earlier, it is probably fastest to reverse the bits using a LUT, but the byte swapping method will be relatively similar since you need to actually move each byte unless you are keeping some state about the traversal order of the array. If that is the case, it is quite possible to simply use some state (i.e. a flag) to determine whether the array should be traversed normally (1...n) or in reverse order (n...1). In any event, the swapping is "free" in the Big-O, but in practice may (not necessarily) show a performance impact. So if your optimization is really on speed and an extra int in space isn't too much, this state may be worthwhile for you. Note that this trick will only work if this is internal. If this method is public to the user and he or she does not know about the flag, then this will not actually flip anything. Another option to use this is if you can use C++, you can create a class which encapsulates this functionality for the user.

Comparing arbitrary bit sequences in a byte array in c

I have a couple uint8_t arrays in my c code, and I'd like to compare an arbitrary sequence bits from one with another. So for example, I have bitarray_1 and bitarray_2, and I'd like to compare bits 13 - 47 from bitarray_1 with bits 5-39 of bitarray_2. What is the most efficient way to do this?
Currently it's a huge bottleneck in my program, since I just have a naive implementation that copies the bits into the beginning of a new temporary array, and then uses memcmp on them.

three words: shift, mask and xor.
shift to get the same memory alignment for both bitarray. If not you will have to shift one of the arrays before comparing them. Your exemple is probably misleading because bits 13-47 and 5-39 have the same memory alignment on 8 bits addresses. This wouldn't be true if you were comparing say bits 14-48 with bits 5-39.
Once everything is aligned and exceeding bits cleared for table boundaries a xor is enough to perform the comparison of all the bits at once. Basically you can manage to do it with just one memory read for each array, which should be pretty efficient.
If memory alignment is the same for both arrays as in your example memcmp and special case for upper and lower bound is probably yet faster.
Also accessing array by uint32_t (or uint64_t on 64 bits architectures) should also be more efficient than accessing by uint8_t.
The principle is simple but as Andrejs said the implementation is not painless...
Here is how it goes (similarities with #caf proposal is no coincidence):
/* compare_bit_sequence() */
int compare_bit_sequence(uint8_t s1[], unsigned s1_off, uint8_t s2[], unsigned s2_off,
unsigned length)
{
const uint8_t mask_lo_bits[] =
{ 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t clear_lo_bits[] =
{ 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
uint8_t v1;
uint8_t * max_s1;
unsigned end;
uint8_t lsl;
uint8_t v1_mask;
int delta;
/* Makes sure the offsets are less than 8 bits */
s1 += s1_off >> 3;
s1_off &= 7;
s2 += s2_off >> 3;
s2_off &= 7;
/* Make sure s2 is the sequence with the shorter offset */
if (s2_off > s1_off){
uint8_t * tmp_s;
unsigned tmp_off;
tmp_s = s2; s2 = s1; s1 = tmp_s;
tmp_off = s2_off; s2_off = s1_off; s1_off = tmp_off;
}
delta = s1_off;
/* handle the beginning, s2 incomplete */
if (s2_off > 0){
delta = s1_off - s2_off;
v1 = delta
? (s1[0] >> delta | s1[1] << (8 - delta)) & clear_lo_bits[delta]
: s1[0];
if (length <= 8 - s2_off){
if ((v1 ^ *s2)
& clear_lo_bits[s2_off]
& mask_lo_bits[s2_off + length]){
return NOT_EQUAL;
}
else {
return EQUAL;
}
}
else{
if ((v1 ^ *s2) & clear_lo_bits[s2_off]){
return NOT_EQUAL;
}
length -= 8 - s2_off;
}
s1++;
s2++;
}
/* main loop, we test one group of 8 bits of v2 at each loop */
max_s1 = s1 + (length >> 3);
lsl = 8 - delta;
v1_mask = clear_lo_bits[delta];
while (s1 < max_s1)
{
if ((*s1 >> delta | (*++s1 << lsl & v1_mask)) ^ *s2++)
{
return NOT_EQUAL;
}
}
/* last group of bits v2 incomplete */
end = length & 7;
if (end && ((*s2 ^ *s1 >> delta) & mask_lo_bits[end]))
{
return NOT_EQUAL;
}
return EQUAL;
}
All possible optimisations are not yet used. One promising one would be to use larger chunks of data (64 bits or 32 bits at once instead of 8), you could also detect cases where offset are synchronised for both arrays and in such cases use a memcmp instead of the main loop, replace modulos % 8 by logical operators & 7, replace '/ 8' by '>> 3', etc., have to branches of code instead of swapping s1 and s2, etc, but the main purpose is achieved : only one memory read and not memory write for each array item hence most of the work can take place inside processor registers.

bits 13 - 47 of bitarray_1 are the same as bits 5 - 39 of bitarray_1 + 1.
Compare the first 3 bits (5 - 7) with a mask and the other bits (8 - 39) with memcmp().
Rather than shift and copy the bits, maybe representing them differently is faster. You have to measure.
/* code skeleton */
static char bitarray_1_bis[BIT_ARRAY_SIZE*8+1];
static char bitarray_2_bis[BIT_ARRAY_SIZE*8+1];
static const char *lookup_table[] = {
"00000000", "00000001", "00000010" /* ... */
/* 256 strings */
/* ... */ "11111111"
};
/* copy every bit of bitarray_1 to an element of bitarray_1_bis */
for (k = 0; k < BIT_ARRAY_SIZE; k++) {
strcpy(bitarray_1_bis + 8*k, lookup_table[bitarray_1[k]]);
strcpy(bitarray_2_bis + 8*k, lookup_table[bitarray_2[k]]);
}
memcmp(bitarray_1_bis + 13, bitarray_2_bis + 5, 47 - 13 + 1);
You can (and should) limit the copy to the minimum possible.
I have no idea if it's faster, but it wouldn't surprise me if it was. Again, you have to measure.

The easiest way to do this is to convert the more complex case into a simpler case, then solve the simpler case.
In the following code, do_compare() solves the simpler case (where the sequences are never offset by more than 7 bits, s1 is always offset as much or more than s2, and the length of the sequence is non-zero). The compare_bit_sequence() function then takes care of converting the harder case to the easier case, and calls do_compare() to do the work.
This just does a single-pass through the bit sequences, so hopefully that's an improvement on your copy-and-memcmp implementation.
#define NOT_EQUAL 0
#define EQUAL 1
/* do_compare()
*
* Does the actual comparison, but has some preconditions on parameters to
* simplify things:
*
* length > 0
* 8 > s1_off >= s2_off
*/
int do_compare(const uint8_t s1[], const unsigned s1_off, const uint8_t s2[],
const unsigned s2_off, const unsigned length)
{
const uint8_t mask_lo_bits[] =
{ 0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
const uint8_t mask_hi_bits[] =
{ 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
const unsigned msb = (length + s1_off - 1) / 8;
const unsigned s2_shl = s1_off - s2_off;
const unsigned s2_shr = 8 - s2_shl;
unsigned n;
uint8_t s1s2_diff, lo_bits = 0;
for (n = 0; n <= msb; n++)
{
/* Shift s2 so it is aligned with s1, pulling in low bits from
* the high bits of the previous byte, and store in s1s2_diff */
s1s2_diff = lo_bits | (s2[n] << s2_shl);
/* Save the bits needed to fill in the low-order bits of the next
* byte. HERE BE DRAGONS - since s2_shr can be 8, this below line
* only works because uint8_t is promoted to int, and we know that
* the width of int is guaranteed to be >= 16. If you change this
* routine to work with a wider type than uint8_t, you will need
* to special-case this line so that if s2_shr is the width of the
* type, you get lo_bits = 0. Don't say you weren't warned. */
lo_bits = s2[n] >> s2_shr;
/* XOR with s1[n] to determine bits that differ between s1 and s2 */
s1s2_diff ^= s1[n];
/* Look only at differences in the high bits in the first byte */
if (n == 0)
s1s2_diff &= mask_hi_bits[8 - s1_off];
/* Look only at differences in the low bits of the last byte */
if (n == msb)
s1s2_diff &= mask_lo_bits[(length + s1_off) % 8];
if (s1s2_diff)
return NOT_EQUAL;
}
return EQUAL;
}
/* compare_bit_sequence()
*
* Adjusts the parameters to match the preconditions for do_compare(), then
* calls it to do the work.
*/
int compare_bit_sequence(const uint8_t s1[], unsigned s1_off,
const uint8_t s2[], unsigned s2_off, unsigned length)
{
/* Handle length zero */
if (length == 0)
return EQUAL;
/* Makes sure the offsets are less than 8 bits */
s1 += s1_off / 8;
s1_off %= 8;
s2 += s2_off / 8;
s2_off %= 8;
/* Make sure s2 is the sequence with the shorter offset */
if (s1_off >= s2_off)
return do_compare(s1, s1_off, s2, s2_off, length);
else
return do_compare(s2, s2_off, s1, s1_off, length);
}
To do the comparison in your example, you'd call:
compare_bit_sequence(bitarray_1, 13, bitarray_2, 5, 35)
(Note that I am numbering the bits from zero, and assuming that the bitarrays are laid out little-endian, so this will start the comparison from the sixth-least-significant bit in bitarray2[0], and the sixth-least-signifcant bit in bitarray1[1]).

What about writing the function that will calculate the offsets from both arrays, apply the mask, shift the bits and store the result to the int so you may compare them. If the bits count (34 in your example) exceeds the length of the int - recurse or loop.
Sorry, the example will be pain in the ass.

Here is my unoptimized bit sequence comparison function:
#include <stdio.h>
#include <stdint.h>
// 01234567 01234567
uint8_t bitsA[] = { 0b01000000, 0b00010000 };
uint8_t bitsB[] = { 0b10000000, 0b00100000 };
int bit( uint8_t *bits, size_t bitpoz, size_t len ){
return (bitpoz<len)? !!(bits[bitpoz/8]&(1<<(7-bitpoz%8))): 0;
}
int bitcmp( uint8_t *bitsA, size_t firstA, size_t lenA,
uint8_t *bitsB, size_t firstB, size_t lenB ){
int cmp;
for( size_t i=0; i<lenA || i<lenB; i++ ){
if( (cmp = bit(bitsA,firstA+i,firstA+lenA) -
bit(bitsB,firstB+i,firstB+lenB)) ) return cmp;
}
return 0;
}
int main(){
printf( "cmp: %i\n", bitcmp( bitsA,1,11, bitsB,0,11 ) );
}
EDIT: Here is my (untested) bitstring equality test function:
#include <stdlib.h>
#include <stdint.h>
#define load_64bit(bits,first) (*(uint64_t*)bits<<first | *(bits+8)>>(8-first))
#define load_32bit(bits,first) (*(uint32_t*)bits<<first | *(bits+4)>>(8-first))
#define load_16bit(bits,first) (*(uint16_t*)bits<<first | *(bits+2)>>(8-first))
#define load_8bit( bits,first) ( *bits<<first | *(bits+1)>>(8-first))
static inline uint8_t last_bits( uint8_t *bits, size_t first, size_t size ){
return (first+size>8?load_8bit(bits,first):*bits<<first)>>(8-size);
}
int biteq( uint8_t *bitsA, size_t firstA,
uint8_t *bitsB, size_t firstB, size_t size ){
if( !size ) return 1;
bitsA+=firstA/8; firstA%=8;
bitsB+=firstB/8; firstB%=8;
for(; size>64;size-=64,bitsA+=8,bitsB+=8)
if(load_64bit(bitsA,firstA)!=load_64bit(bitsB,firstB)) return 0;
for(; size>32;size-=32,bitsA+=4,bitsB+=4)
if(load_32bit(bitsA,firstA)!=load_32bit(bitsB,firstB)) return 0;
for(; size>16;size-=16,bitsA+=2,bitsB+=2)
if(load_16bit(bitsA,firstA)!=load_16bit(bitsB,firstB)) return 0;
for(; size> 8;size-= 8,bitsA++, bitsB++ )
if(load_8bit( bitsA,firstA)!=load_8bit( bitsB,firstB)) return 0;
return !size ||
last_bits(bitsA,firstA,size)==last_bits(bitsB,firstB,size);
}
I made a simple measurement tool to see how fast is it:
#include <unistd.h>
#include <stdio.h>
#include <signal.h>
#define SIZE 1000000
uint8_t bitsC[SIZE];
volatile int end_loop;
void sigalrm_hnd( int sig ){ (void)sig; end_loop=1; }
int main(){
uint64_t loop_count; int cmp;
signal(SIGALRM,sigalrm_hnd);
loop_count=0; end_loop=0; alarm(10);
while( !end_loop ){
for( int i=1; i<7; i++ ){
loop_count++;
cmp = biteq( bitsC,i, bitsC,7-i,(SIZE-1)*8 );
if( !cmp ){ printf( "cmp: %i (==0)\n", cmp ); return -1; }
}
}
printf( "biteq: %.2f round/sec\n", loop_count/10.0 );
}
Result:
bitcmp: 8.40 round/sec
biteq: 363.60 round/sec
EDIT2: last_bits() changed.