I've had to do this many times in the past, and I've never been satisfied with the results.
Can anyone suggest a fast way of copying a contiguous bit array from source to destination where both the source and destination may be unaligned (right shifted) relative to convenient processor boundaries?
If both the source and the destination are unaligned, the problem can quickly be reduced to one where only one of them is unaligned (after the first copy, say).
As a starting point, my code inevitably ends up looking something like the following (untested; ignore the side effects, this is just an off-the-cuff example):
const char mask[8] = { 1, 3, 7, 15, 31, 63, 127, 255 };
/* Assume:
* - destination is already zeroed,
* - offsets are right shifts
* - bits to copy is big (> 32 say)
*/
int bitarray_copy(char * src, int src_bit_offset, int src_bit_len,
                  char * dst, int dst_bit_offset) {
    if (src_bit_offset == dst_bit_offset) { /* Not very interesting */
    } else {
        int bit_diff_offset = src_bit_offset - dst_bit_offset; /* assume positive */
        int loop_count;
        char c;
        char mask_val = mask[bit_diff_offset];

        /* Get started, line up the destination. */
        c = (*src++ << bit_diff_offset) | ((*src >> (8 - bit_diff_offset)) & mask_val);
        c &= mask[8 - dst_bit_offset];
        *dst++ |= c;

        src_bit_len -= 8 - dst_bit_offset;
        loop_count = src_bit_len >> 3;

        while (--loop_count >= 0)
            *dst++ = (*src++ << bit_diff_offset) | ((*src >> (8 - bit_diff_offset)) & mask_val);

        /* Trailing tail copy etc ... */
        if (src_bit_len % 8) /* ... */
    }
}
(actually this is better than I've done before. It doesn't look too bad)
This is what I ended up doing. (EDIT Changed on 8/21/2014 for a single bit copy bug.)
#include <limits.h>
#include <string.h>
#include <stddef.h>
#define PREPARE_FIRST_COPY()                                    \
    do {                                                        \
    if (src_len >= (CHAR_BIT - dst_offset_modulo)) {            \
        *dst     &= reverse_mask[dst_offset_modulo];            \
        src_len -= CHAR_BIT - dst_offset_modulo;                \
    } else {                                                    \
        *dst     &= reverse_mask[dst_offset_modulo]             \
              | reverse_mask_xor[dst_offset_modulo + src_len];  \
         c       &= reverse_mask[dst_offset_modulo + src_len];  \
        src_len = 0;                                            \
    } } while (0)
static void
bitarray_copy(const unsigned char *src_org, int src_offset, int src_len,
              unsigned char *dst_org, int dst_offset)
{
    static const unsigned char mask[] =
        { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
    static const unsigned char reverse_mask[] =
        { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
    static const unsigned char reverse_mask_xor[] =
        { 0xff, 0x7f, 0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00 };

    if (src_len) {
        const unsigned char *src;
        unsigned char *dst;
        int src_offset_modulo,
            dst_offset_modulo;

        src = src_org + (src_offset / CHAR_BIT);
        dst = dst_org + (dst_offset / CHAR_BIT);

        src_offset_modulo = src_offset % CHAR_BIT;
        dst_offset_modulo = dst_offset % CHAR_BIT;

        if (src_offset_modulo == dst_offset_modulo) {
            int byte_len;
            int src_len_modulo;

            if (src_offset_modulo) {
                unsigned char c;

                c = reverse_mask_xor[dst_offset_modulo] & *src++;

                PREPARE_FIRST_COPY();
                *dst++ |= c;
            }

            byte_len = src_len / CHAR_BIT;
            src_len_modulo = src_len % CHAR_BIT;

            if (byte_len) {
                memcpy(dst, src, byte_len);
                src += byte_len;
                dst += byte_len;
            }
            if (src_len_modulo) {
                *dst &= reverse_mask_xor[src_len_modulo];
                *dst |= reverse_mask[src_len_modulo] & *src;
            }
        } else {
            int bit_diff_ls,
                bit_diff_rs;
            int byte_len;
            int src_len_modulo;
            unsigned char c;

            /*
             * Begin: Line things up on the destination.
             */
            if (src_offset_modulo > dst_offset_modulo) {
                bit_diff_ls = src_offset_modulo - dst_offset_modulo;
                bit_diff_rs = CHAR_BIT - bit_diff_ls;

                c = *src++ << bit_diff_ls;
                c |= *src >> bit_diff_rs;
                c &= reverse_mask_xor[dst_offset_modulo];
            } else {
                bit_diff_rs = dst_offset_modulo - src_offset_modulo;
                bit_diff_ls = CHAR_BIT - bit_diff_rs;

                c = *src >> bit_diff_rs &
                    reverse_mask_xor[dst_offset_modulo];
            }
            PREPARE_FIRST_COPY();
            *dst++ |= c;

            /*
             * Middle: copy with only shifting the source.
             */
            byte_len = src_len / CHAR_BIT;

            while (--byte_len >= 0) {
                c = *src++ << bit_diff_ls;
                c |= *src >> bit_diff_rs;
                *dst++ = c;
            }

            /*
             * End: copy the remaining bits.
             */
            src_len_modulo = src_len % CHAR_BIT;
            if (src_len_modulo) {
                c = *src++ << bit_diff_ls;
                c |= *src >> bit_diff_rs;

                c &= reverse_mask[src_len_modulo];

                *dst &= reverse_mask_xor[src_len_modulo];
                *dst |= c;
            }
        }
    }
}
Your inner loop takes pieces of two bytes and moves them to a destination byte. That's almost optimal. Here are a few more hints in no particular order:
There's no need to limit yourself to a byte at a time. Use the largest integer size your platform will let you get away with. This of course will complicate your starting and trailing logic.
If you use unsigned chars or integers, you may not need to mask the second piece of the source after it's shifted right. This will depend on your compiler.
If you do need the mask, make sure your compiler is moving the table lookup outside of the loop. If it isn't, copy it to a temporary variable and use that.
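For instance, the middle loop widened from bytes to 32-bit words might look something like this. This is only a sketch of the first hint (the names are mine, start/tail handling is omitted as discussed above, and it assumes a nonzero shift and a big-endian word layout, or byte-swapped loads, so that bit order is preserved across word boundaries):
#include <stdint.h>

/* Middle loop only: emit word_count aligned 32-bit words, shifting the
 * source stream left by 'shift' (1..31) bits. Unsigned types make the
 * right shift well defined, so no mask table is needed. */
static void middle_loop32(const uint32_t *src, uint32_t *dst,
                          int word_count, unsigned shift)
{
    uint32_t prev = *src++;
    while (--word_count >= 0) {
        uint32_t next = *src++;
        *dst++ = (prev << shift) | (next >> (32 - shift));
        prev = next;
    }
}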
What is optimal will depend upon the target platform. On some platforms without barrel shifters, shifting the whole vector left or right one bit, n times (for n < 3), will be the fastest approach; on the PIC18 platform, an 8x-unrolled byte loop shifting left one bit costs 11 instruction cycles per eight bytes. Otherwise, I like the following pattern (note that src2 will have to be initialized depending upon what you want done with the end of your buffer):
src1 = *src++;
src2 = (src1 << shiftamount1) | (src2 >> shiftamount2);
*dest++ = src2;
src2 = *src++;
src1 = (src2 << shiftamount1) | (src1 >> shiftamount2);
*dest++ = src1;
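Wrapped in a loop, the pattern might look like the following C sketch. The glue is mine, not the answerer's: it assumes 32-bit words, shiftamount1 + shiftamount2 == 32 (both nonzero), an even word count, and that 'carry' holds the word preceding src[0].
#include <stddef.h>
#include <stdint.h>

/* Ping-pong between src1 and src2 so each loaded word is reused as the
 * "previous" word of the next step without an extra register move. */
static void shift_words(const uint32_t *src, uint32_t *dest, size_t count,
                        unsigned shiftamount1, unsigned shiftamount2,
                        uint32_t carry)
{
    uint32_t src1, src2 = carry;
    while (count >= 2) {
        src1 = *src++;
        src2 = (src1 << shiftamount1) | (src2 >> shiftamount2);
        *dest++ = src2;
        src2 = *src++;
        src1 = (src2 << shiftamount1) | (src1 >> shiftamount2);
        *dest++ = src1;
        count -= 2;
    }
}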
That should lend itself to a very efficient implementation on ARM (eight instructions every two words, if registers are available for src, dest, src1, src2, shiftamount1, and shiftamount2). Using more registers would allow faster operation via multi-word load/store instructions. Handling four words would be something like the following (one machine instruction per line, except that the first four lines would together be one instruction, as would the last four):
src0 = *src++;
src1 = *src++;
src2 = *src++;
src3 = *src++;
tmp  = src0;
src0 = src0 >> shiftamount1;
src0 = src0 | (src1 << shiftamount2);
src1 = src1 >> shiftamount1;
src1 = src1 | (src2 << shiftamount2);
src2 = src2 >> shiftamount1;
src2 = src2 | (src3 << shiftamount2);
src3 = src3 >> shiftamount1;
src3 = src3 | (tmp << shiftamount2);
*dest++ = src0;
*dest++ = src1;
*dest++ = src2;
*dest++ = src3;
Eleven instructions per 16 bytes rotated.
Your solution looks similar to most I've seen: basically do some unaligned work at the start and end, with the main loop in the middle using aligned accesses. If you really need efficiency and do this on very long bitstreams, I would suggest using something architecture-specific like SSE2 in the main loop.
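For example, the core shift of the main loop could handle 16 bytes per iteration. Here is a sketch of the lane-crossing shift with SSE2 (my code, not from the answer; it treats the 128-bit lane as a little-endian integer and assumes 1 <= n <= 63, so mapping your bit-array order onto it is still up to you):
#include <emmintrin.h> /* SSE2 */

/* logical right shift of a whole 128-bit lane by n bits, 1 <= n <= 63 */
static inline __m128i srli128(__m128i v, int n)
{
    __m128i lo    = _mm_srli_epi64(v, n);       /* both qwords >> n     */
    __m128i hi    = _mm_srli_si128(v, 8);       /* high qword moved low */
    __m128i carry = _mm_slli_epi64(hi, 64 - n); /* bits crossing the    */
    return _mm_or_si128(lo, carry);             /* qword boundary       */
}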
What would be an efficient way to optimize the following code with SSE?
uint16_t change1= ... ;
uint8_t* pSrc = ... ;
uint8_t* pDest = ... ;
if(change1 & 0x0001) *pDest++ = pSrc[0];
if(change1 & 0x0002) *pDest++ = pSrc[1];
if(change1 & 0x0004) *pDest++ = pSrc[2];
if(change1 & 0x0008) *pDest++ = pSrc[3];
if(change1 & 0x0010) *pDest++ = pSrc[4];
if(change1 & 0x0020) *pDest++ = pSrc[5];
if(change1 & 0x0040) *pDest++ = pSrc[6];
if(change1 & 0x0080) *pDest++ = pSrc[7];
if(change1 & 0x0100) *pDest++ = pSrc[8];
if(change1 & 0x0200) *pDest++ = pSrc[9];
if(change1 & 0x0400) *pDest++ = pSrc[10];
if(change1 & 0x0800) *pDest++ = pSrc[11];
if(change1 & 0x1000) *pDest++ = pSrc[12];
if(change1 & 0x2000) *pDest++ = pSrc[13];
if(change1 & 0x4000) *pDest++ = pSrc[14];
if(change1 & 0x8000) *pDest++ = pSrc[15];
So far I am using a quite big lookup table for it, but I really want to get rid of it:
SSE3Shuffle::Entry& e0 = SSE3Shuffle::g_Shuffle.m_Entries[change1];
_mm_storeu_si128((__m128i*)pDest, _mm_shuffle_epi8(*(__m128i*)pSrc, e0.mask));
pDest += e0.offset;
Assuming:
change1 = _mm_movemask_epi8(bytemask);
offset = popcnt(change1);
On large buffers, using two shuffles and a 1 KiB table is only ~10% slower than using 1 shuffle and a 1 MiB table. My attempts at generating the shuffle mask via prefix sums and bit twiddling are about half the speed of the table-based methods (solutions using pext/pdep were not explored).
Reducing table size: Use two lookups into a 2 KiB table instead of 1 lookup into a 1 MiB table. Always keep the top-most byte - if that byte is to be discarded then it doesn't matter what byte is at that position (down to 7-bit indices, or 1 KiB table). Further reduce possible combinations by manually packing the two bytes in each 16-bit lane (down to a 216 byte table).
The following example strips whitespace from text using SSE4.1. If only SSSE3 is available then blendv can be emulated. The 64-bit halves are re-combined by overlapping writes to memory, but they could be re-combined in the xmm register (as seen in the AVX2 example).
#include <stdint.h>
#include <smmintrin.h> // SSE4.1

size_t despacer (void* dst_void, void* src_void, size_t length)
{
    uint8_t* src = (uint8_t*)src_void;
    uint8_t* dst = (uint8_t*)dst_void;

    if (length >= 16) {
        // table of control characters (space, tab, newline, carriage return)
        const __m128i lut_cntrl = _mm_setr_epi8(' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0);

        // bits[4:0] = index -> ((trit_d * 0) + (trit_c * 9) + (trit_b * 3) + (trit_a * 1))
        // bits[15:7] = popcnt
        const __m128i sadmask = _mm_set1_epi64x(0x8080898983838181);

        // adding 8 to each shuffle index is cheaper than extracting the high qword
        const __m128i offset = _mm_cvtsi64_si128(0x0808080808080808);

        // shuffle control indices
        static const uint64_t table[27] = {
            0x0000000000000706, 0x0000000000070600, 0x0000000007060100, 0x0000000000070602,
            0x0000000007060200, 0x0000000706020100, 0x0000000007060302, 0x0000000706030200,
            0x0000070603020100, 0x0000000000070604, 0x0000000007060400, 0x0000000706040100,
            0x0000000007060402, 0x0000000706040200, 0x0000070604020100, 0x0000000706040302,
            0x0000070604030200, 0x0007060403020100, 0x0000000007060504, 0x0000000706050400,
            0x0000070605040100, 0x0000000706050402, 0x0000070605040200, 0x0007060504020100,
            0x0000070605040302, 0x0007060504030200, 0x0706050403020100
        };

        const uint8_t* end = &src[length & ~15];
        do {
            __m128i v = _mm_loadu_si128((__m128i*)src);
            src += 16;

            // detect spaces
            __m128i mask = _mm_cmpeq_epi8(_mm_shuffle_epi8(lut_cntrl, v), v);

            // shift w/blend: each word now only has 3 states instead of 4
            // which reduces the possibilities per qword from 128 to 27
            v = _mm_blendv_epi8(v, _mm_srli_epi16(v, 8), mask);

            // extract bitfields describing each qword: index, popcnt
            __m128i desc = _mm_sad_epu8(_mm_and_si128(mask, sadmask), sadmask);
            size_t lo_desc = (size_t)_mm_cvtsi128_si32(desc);
            size_t hi_desc = (size_t)_mm_extract_epi16(desc, 4);

            // load shuffle control indices from pre-computed table
            __m128i lo_shuf = _mm_loadl_epi64((__m128i*)&table[lo_desc & 0x1F]);
            __m128i hi_shuf = _mm_or_si128(_mm_loadl_epi64((__m128i*)&table[hi_desc & 0x1F]), offset);

            // store an entire qword then advance the pointer by however
            // many of those bytes are actually wanted. Any trailing
            // garbage will be overwritten by the next store.
            // note: little endian byte memory order
            _mm_storel_epi64((__m128i*)dst, _mm_shuffle_epi8(v, lo_shuf));
            dst += (lo_desc >> 7);
            _mm_storel_epi64((__m128i*)dst, _mm_shuffle_epi8(v, hi_shuf));
            dst += (hi_desc >> 7);
        } while (src != end);
    }

    // tail loop
    length &= 15;
    if (length != 0) {
        const uint64_t bitmap = 0xFFFFFFFEFFFFC1FF;
        do {
            uint64_t c = *src++;
            *dst = (uint8_t)c;
            dst += ((bitmap >> c) & 1) | ((c + 0xC0) >> 8);
        } while (--length);
    }

    // return pointer to the location after the last element in dst
    return (size_t)(dst - ((uint8_t*)dst_void));
}
Whether the tail loop should be vectorized or use cmov is left as an exercise for the reader. Writing each byte unconditionally/branchlessly is fast when the input is unpredictable: every byte is stored, but dst only advances past bytes that are kept. The bitmap constant has zero bits at the whitespace code points below 64, and the (c + 0xC0) >> 8 term evaluates to 1 for any byte of 0x40 or above, so those bytes are always kept.
Using AVX2 to generate the shuffle control mask using an in-register table is only slightly slower than using large precomputed tables.
#include <stdint.h>
#include <immintrin.h>

// probably needs improvement...
size_t despace_avx2_vpermd(const char* src_void, char* dst_void, size_t length)
{
    uint8_t* src = (uint8_t*)src_void;
    uint8_t* dst = (uint8_t*)dst_void;

    const __m256i lut_cntrl2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0));
    const __m256i permutation_mask = _mm256_set1_epi64x( 0x0020100884828180 );
    const __m256i invert_mask = _mm256_set1_epi64x( 0x0020100880808080 );
    const __m256i zero = _mm256_setzero_si256();
    const __m256i fixup = _mm256_set_epi32(
        0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707,
        0x08080808, 0x0F0F0F0F, 0x00000000, 0x07070707
    );
    const __m256i lut = _mm256_set_epi32(
        0x04050607, // 0x03020100', 0x000000'07
        0x04050704, // 0x030200'00, 0x0000'0704
        0x04060705, // 0x030100'00, 0x0000'0705
        0x04070504, // 0x0300'0000, 0x00'070504
        0x05060706, // 0x020100'00, 0x0000'0706
        0x05070604, // 0x0200'0000, 0x00'070604
        0x06070605, // 0x0100'0000, 0x00'070605
        0x07060504  // 0x00'000000, 0x'07060504
    );

    // hi bits are ignored by pshufb, used to reject movement of low qword bytes
    const __m256i shuffle_a = _mm256_set_epi8(
        0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70,
        0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78, 0x07, 0x16, 0x25, 0x34, 0x43, 0x52, 0x61, 0x70
    );

    // broadcast 0x08 then blendd...
    const __m256i shuffle_b = _mm256_set_epi32(
        0x08080808, 0x08080808, 0x00000000, 0x00000000,
        0x08080808, 0x08080808, 0x00000000, 0x00000000
    );

    for( uint8_t* end = &src[(length & ~31)]; src != end; src += 32){
        __m256i r0, r1, r2, r3, r4;
        unsigned int s0, s1;

        r0 = _mm256_loadu_si256((__m256i *)src); // asrc

        // detect spaces
        r1 = _mm256_cmpeq_epi8(_mm256_shuffle_epi8(lut_cntrl2, r0), r0);

        r2 = _mm256_sad_epu8(zero, r1);
        s0 = (unsigned)_mm256_movemask_epi8(r1);
        r1 = _mm256_andnot_si256(r1, permutation_mask);
        r1 = _mm256_sad_epu8(r1, invert_mask); // index_bitmap[0:5], low32_spaces_count[7:15]

        r2 = _mm256_shuffle_epi8(r2, zero);
        r2 = _mm256_sub_epi8(shuffle_a, r2); // add space cnt of low qword
        s0 = ~s0;

        r3 = _mm256_slli_epi64(r1, 29); // move top part of index_bitmap to high dword
        r4 = _mm256_srli_epi64(r1, 7);  // number of spaces in low dword

        r4 = _mm256_shuffle_epi8(r4, shuffle_b);
        r1 = _mm256_or_si256(r1, r3);

        r1 = _mm256_permutevar8x32_epi32(lut, r1);
        s1 = _mm_popcnt_u32(s0);
        r4 = _mm256_add_epi8(r4, shuffle_a);
        s0 = s0 & 0xFFFF; // isolate low oword

        r2 = _mm256_shuffle_epi8(r4, r2);
        s0 = _mm_popcnt_u32(s0);

        r2 = _mm256_max_epu8(r2, r4); // pin low qword bytes

        r1 = _mm256_xor_si256(r1, fixup);
        r1 = _mm256_shuffle_epi8(r1, r2); // complete shuffle mask

        r0 = _mm256_shuffle_epi8(r0, r1); // despace!

        _mm_storeu_si128((__m128i*)dst, _mm256_castsi256_si128(r0));
        _mm_storeu_si128((__m128i*)&dst[s0], _mm256_extracti128_si256(r0, 1));
        dst += s1;
    }

    // tail loop
    length &= 31;
    if (length != 0) {
        const uint64_t bitmap = 0xFFFFFFFEFFFFC1FF;
        do {
            uint64_t c = *src++;
            *dst = (uint8_t)c;
            dst += ((bitmap >> c) & 1) | ((c + 0xC0) >> 8);
        } while (--length);
    }
    return (size_t)(dst - ((uint8_t*)dst_void));
}
For posterity, the 1 KiB version (generating the table is left as an exercise for the reader).
static const uint64_t table[128] __attribute__((aligned(64))) = {
0x0706050403020100, 0x0007060504030201, ..., 0x0605040302010700, 0x0605040302010007
};
const __m128i mask_01 = _mm_set1_epi8( 0x01 );
__m128i vector0 = _mm_loadu_si128((__m128i*)src);
__m128i vector1 = _mm_shuffle_epi32( vector0, 0x0E );
__m128i bytemask0 = _mm_cmpeq_epi8( ???, vector0); // detect bytes to omit
uint32_t bitmask0 = _mm_movemask_epi8(bytemask0) & 0x7F7F;
__m128i hsum = _mm_sad_epu8(_mm_add_epi8(bytemask0, mask_01), _mm_setzero_si128());
vector0 = _mm_shuffle_epi8(vector0, _mm_loadl_epi64((__m128i*) &table[(uint8_t)bitmask0]));
_mm_storel_epi64((__m128i*)dst, vector0);
dst += (uint32_t)_mm_cvtsi128_si32(hsum);
vector1 = _mm_shuffle_epi8(vector1, _mm_loadl_epi64((__m128i*) &table[bitmask0 >> 8]));
_mm_storel_epi64((__m128i*)dst, vector1);
dst += (uint32_t)_mm_cvtsi128_si32(_mm_unpackhi_epi64(hsum, hsum));
https://github.com/InstLatx64/AVX512_VPCOMPRESSB_Emu has some benchmarks.
If one is willing to use BMI2, available on Haswell and later, one can use pext to compress the unwanted nibbles out of a uint64_t, and then use pdep to scatter the result into a shuffle mask.
#include <immintrin.h> /* BMI2: _pdep_u64 / _pext_u64 */

// Step 1 -- replicate the mask to nibbles
uint64_t change4 = _pdep_u64(change1, 0x1111111111111111ULL) * 0x0F;
// Step 2 -- extract the indices of the wanted bytes from an array of nibbles
uint64_t indices = _pext_u64(0xFEDCBA9876543210ULL, change4);
// Step 3 -- interleave nibbles to octets
uint64_t high = _pdep_u64(indices >> 32, 0x0F0F0F0F0F0F0F0FULL);
uint64_t low  = _pdep_u64(indices, 0x0F0F0F0F0F0F0F0FULL);
// Step 4 -- use these two masks to compress pSrc
__m128i compressed = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)pSrc),
                                      _mm_set_epi64x(high, low));
// Step 5 -- store 16 bytes unaligned
_mm_storeu_si128((__m128i*)pDst, compressed);
// Step 6 -- increment the target pointer
pDst += _mm_popcnt_u32(change1);
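One caveat if you take this route: pdep and pext are fast single-µop instructions on Intel from Haswell onward, but on AMD processors before Zen 3 they are microcoded and take dozens of cycles, so this path may be worth gating behind a CPU check.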
Other variants (based on a cumulative sum, or on sorting the 'X's (or zero bits) out of XX23456789abXXef) would first require some technique to spread the bits of the uint16_t evenly across an __m128i (i.e. the reverse of movemask_epi8).
The 64k-entry LUT can however be split into top and bottom parts:
int c = change1 & 0xff;
int p = _mm_popcnt_u32(c);
uint64_t a = LUT256[c];              // low part of the index
uint64_t b = LUT256[change1 >> 8];   // top part of the index
b += addlut9[p];                     // 0x0101010101010101 * p
// Then concatenate b|a at the pth position of 'a'
if (p < 8)
{
    a |= b << (8*(8-p));
    b >>= 8*p;
}
__m128i d = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i*)pSrc),
                             _mm_set_epi64x(b, a));
// and continue with steps 5 and 6 as before
Let's say that I have an array of 16 uint8_t as follows:
uint8_t array[] = {0x13, 0x01, 0x4E, 0x52, 0x31, 0x4A, 0x35, 0x36, 0x4C, 0x11, 0x21, 0xC6, 0x3C, 0x73, 0xC2, 0x41};
This array stores the data held in a 128-bit register of an external peripheral. Some of the pieces of information it represents are stored on 2, 3, 8, 12 bits... and so on.
What is the best and most elegant way to slice it up and bit-mask out the information I need? (The problem is that some of the things I need straddle the boundary between two cells of the array.)
If it helps, this snippet I wrote converts the whole array into a char* string. But casting the whole thing into an int is not an option because... well, 16 bytes.
int i;
char str[33];
for(i = 0; i < sizeof(array) / sizeof(*array) ; i++) {
sprintf(str+2*i,"%02hX",array[i]);
}
puts(str);
13014E52314A35364C1121C63C73C241
Actually this kind of problem also occurs when parsing all kinds of bitstreams, like video or image files, or data compressed by algorithms of the LZ* family. So the approach used there is to implement a bitstream reader.
But in your case the bit sequence is fixed-length and quite short, so one way is to manually check the field values using bitwise operations.
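For a field that spans a byte boundary, that just means masking and shifting two adjacent cells. For example, a 12-bit value sitting in bits 4..15 (counting bit 0 as the most significant bit of array[0]; flip the masks if your peripheral numbers bits the other way) could be pulled out like this:
/* low nibble of array[0] supplies the top 4 bits, array[1] the low 8 */
uint16_t field = (uint16_t)((array[0] & 0x0Fu) << 8) | array[1];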
Or you can use this function that I just wrote, which can extract an arbitrary number of bits from a uint8_t array, starting from a desired bit position:
#include <assert.h>
#include <stdint.h>

uint32_t extract_bits(uint8_t *arr, unsigned int bit_index, unsigned int bit_count)
{
    /* Assert that we are not requested to extract more than 32 bits */
    uint32_t result = 0;
    assert(bit_count <= sizeof(result)*8 && arr != NULL);

    /* You can additionally check if you are trying to extract bits exceeding the 16 byte range */
    assert(bit_index + bit_count <= 16 * 8);

    unsigned int arr_id = bit_index / 8;
    unsigned int bit_offset = bit_index % 8;

    if (bit_offset > 0) {
        /* Extract the first 'unaligned_bit_count' bits, which happen to be non-byte-aligned.
         * Once we have extracted those bits, the remainder will be byte-aligned, so
         * we will treat it in a different manner.
         */
        unsigned int unaligned_bit_count = 8 - bit_offset;

        /* Check if we need less than the remaining unaligned bits */
        if (bit_count < unaligned_bit_count) {
            result = (arr[arr_id] >> bit_offset) & ((1 << bit_count) - 1);
            return result;
        }

        /* We need them all */
        result = arr[arr_id] >> bit_offset;
        bit_count -= unaligned_bit_count;

        /* Move to the next byte element */
        arr_id++;
    }

    while (bit_count > 0) {
        /* Try to extract up to 8 bits per iteration */
        int bits_to_extract = bit_count > 8 ? 8 : bit_count;

        if (bits_to_extract < 8) {
            result = (result << bits_to_extract) | (arr[arr_id] & ((1 << bits_to_extract)-1));
        } else {
            result = (result << bits_to_extract) | arr[arr_id];
        }

        bit_count -= bits_to_extract;
        arr_id++;
    }
    return result;
}
Here is an example of how it is used.

uint32_t r;

/* Extracts bits [7..8] and places them in the least significant bits of 'r' */
r = extract_bits(arr, 7, 2);

/* Extracts bits [4..35] and places them in the least significant bits of 'r' */
r = extract_bits(arr, 4, 32);

/* Visualize */
printf("slice=%x\n", r);
How you then visualise r is up to you; the bits can be represented as hex dwords, characters, or however you decide.
I'm trying to output the right characters in UTF-8 given the octal sequences \303\255 and \346\234\254, but I don't get the correct output.
#include <stdio.h>
#include <stdlib.h>

int encode(char *buf, unsigned char ch){
    if(ch < 0x80) {
        *buf++ = (char)ch;
        return 1;
    }
    if(ch < 0x800) {
        *buf++ = (ch >> 6) | 0xC0;
        *buf++ = (ch & 0x3F) | 0x80;
        return 2;
    }
    if(ch < 0x10000) {
        *buf++ = (ch >> 12) | 0xE0;
        *buf++ = ((ch >> 6) & 0x3F) | 0x80;
        *buf++ = (ch & 0x3F) | 0x80;
        return 3;
    }
    if(ch < 0x110000) {
        *buf++ = (ch >> 18) | 0xF0;
        *buf++ = ((ch >> 12) & 0x3F) | 0x80;
        *buf++ = ((ch >> 6) & 0x3F) | 0x80;
        *buf++ = (ch & 0x3F) | 0x80;
        return 4;
    }
    return 0;
}

void output (char *str) {
    char *buffer = calloc(8, sizeof(char));
    int n = 0;
    while(*str) {
        n = encode(buffer + n, *str++);
    }
    printf("%s\n", buffer);
    free (buffer);
}

int main() {
    char *str1 = "\303\255";
    char *str2 = "\346\234\254";
    output(str1);
    output(str2);
    return 0;
}
Outputs: à & æ¬ instead of í & 本
The problem is that the byte sequences you use are already UTF-8:

/* Both of these are already UTF-8 chars. */
char *str1 = "\303\255";
char *str2 = "\346\234\254";

So your encode function is trying to encode data that is already UTF-8-encoded, which cannot work.
When I print these sequences in my UTF-8 enabled terminal I see what you are expecting to see:
$ printf "%s\n" $'\303\255'
í
$ printf "%s\n" $'\346\234\254'
本
So maybe you need to rethink what you are trying to accomplish and post a new question if you have new problems there.
It's a pity, but you cannot compare a char value (be it signed or unsigned) with values of 0x100 and above. You are missing something if you try to convert one-byte (ISO-8859-1) values to UTF-8. The ISO-8859-1 characters have the same code values as their UTF counterparts, so the conversion is fairly straightforward, as will be shown below.
First of all, all the ISO-8859-1 characters are the same as their UTF counterparts, so the first transformation is the identity: we convert each value in ISO-8859-1 to the same value in UTF. (Note that when I say UTF I mean the UTF code for that character, without any particular encoding, as opposed to UTF-8, which is an encoding of UTF in eight-bit bytes.)
UTF values in the range 0x80...0xff must be encoded with two bytes: the first byte uses bits 7 and 6 with the pattern 110000xx, where xx are the two most significant bits of the input code, followed by a second byte 10xxxxxx, where xxxxxx are the six least significant bits (bits 5 to 0) of the input code. UTF values in the range 0x00...0x7f are encoded as the same single byte as the UTF code.
The following function does precisely this:
size_t iso2utf( unsigned char *buf, unsigned char iso )
{
    size_t res = 0;
    if ( iso & 0x80 ) {
        *buf++ = 0xc0 | (iso >> 6);   /* the 110000xx part */
        *buf++ = 0x80 | (iso & 0x3f); /* ... and the 10xxxxxx part. */
        res += 2;
    } else {
        *buf++ = iso; /* a 0xxxxxxx character, untouched. */
        res++;
    }
    *buf = '\0';
    return res;
} /* iso2utf */
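A quick usage sketch (mine, for illustration): since each input byte expands to at most two output bytes plus the terminator, the destination must hold at least 2 * strlen(input) + 1 bytes.
const unsigned char iso[] = { 'a', 0xF1, 'o', 0 };  /* "año" in ISO-8859-1 */
unsigned char utf8[8], *p = utf8;
const unsigned char *q = iso;
while (*q)
    p += iso2utf(p, *q++);
/* utf8 now holds "a\xC3\xB1o" - valid UTF-8 */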
If you want a complete UTF to UTF-8 encoder, you can try this (I used a different approach, as there can be as many as seven bytes per UTF char; actually not that many, as currently only 24- or 25-bit codes are used):
#include <string.h>
#include <stdlib.h>

typedef unsigned int UTF;   /* you can use wchar_t if you prefer */
typedef unsigned char BYTE;

/* I will assume that the UTF string is also zero terminated */
size_t utf_utf8 (BYTE *out, UTF *in)
{
    size_t res = 0;
    for (; *in; in++) {
        UTF c = *in; /* copy the UTF value */
        /* we construct the string backwards, so finally
         * we have it properly ordered. */
        size_t n = 0;               /* number of characters for this one */
        BYTE aux[7],                /* buffer to construct the string */
             *p = aux + sizeof aux; /* point one cell past the end */
        static UTF limits[] = { 0x80, 0x20, 0x10, 0x08, 0x4, 0x2, 0x01};
        static UTF masks[]  = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe};
        for (; c >= limits[n]; c >>= 6) {
            *--p = 0x80 | (c & 0x3f); n++;
        } /* for */
        *--p = masks[n] | c; n++;
        memcpy(out, p, n); out += n; res += n;
    } /* for */
    *out = '\0'; /* terminate string */
    return res;
} /* utf_utf8 */
See that the seven bytes per UTF code is hardwired, as is the assumption that UTF codes are 32-bit integers. I don't expect UTF codes to go past the 32-bit limit, but in that case the UTF typedef and the sizes and contents of the aux, limits and masks tables might have to change accordingly. There's also a maximum of 7 or 8 bytes for the UTF-8 encoding of a character, and the standard doesn't specify in any form how to proceed if the UTF code space should ever run out of codes, so better not to mess with this too much.
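A usage sketch (again mine): encoding one ASCII, one two-byte, and one three-byte character.
UTF in[] = { 0x48, 0xE9, 0x672C, 0 };  /* 'H', 'é', '本' */
BYTE out[16];
size_t n = utf_utf8(out, in);          /* out = "H\xC3\xA9\xE6\x9C\xAC", n == 6 */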
Useless function parameter: unsigned char ch
/// In the following bad code, `if(ch < 0x10000)` is never true
int encode(char *buf, unsigned char ch){
    if(ch < 0x80) {
        ...
        return 1;
    }
    if(ch < 0x800) {
        ...
        return 2;
    }
    if(ch < 0x10000) {
Sorry, GTG.
Note: Code incorrectly does not detect high and low surrogates.
I have an array of four unsigned chars. I want to treat it like a 32-bit number (assume the upper bits of each char are don't-care; I only care about the lower 8 bits). Then I want to circularly shift it by an arbitrary number of places. I've got a few different shift sizes, all determined at compile time.
E.g.
unsigned char a[4] = {0x81, 0x1, 0x1, 0x2};
circular_left_shift(a, 1);
/* a is now { 0x2, 0x2, 0x2, 0x5 } */
Edit: To everyone wondering why I didn't mention CHAR_BIT != 8, because this is standard C. I didn't specify a platform, so why are you assuming one?
static void rotate_left(uint8_t *d, uint8_t *s, uint8_t bits)
{
    const uint8_t octetshifts = bits / 8;
    const uint8_t bitshift = bits % 8;
    const uint8_t bitsleft = (8 - bitshift);
    const uint8_t lm = (1 << bitshift) - 1;
    const uint8_t um = ~lm;
    int i;

    for (i = 0; i < 4; i++)
    {
        d[(i + 4 - octetshifts) % 4] =
            ((s[i] << bitshift) & um) |
            ((s[(i + 1) % 4] >> bitsleft) & lm);
    }
}
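Checking it against the question's example (assuming CHAR_BIT == 8):
uint8_t a[4] = { 0x81, 0x01, 0x01, 0x02 };
uint8_t r[4];
rotate_left(r, a, 1);   /* r is now { 0x02, 0x02, 0x02, 0x05 } */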
Keeping to plain C, the best way is:

inline void circular_left_shift(char *chars, short shift) {
    __int32 *dword = (__int32 *)chars;
    *dword = (*dword << shift) | (*dword >> (32 - shift));
}

Uhmm, char being 16 bits long was not clear to me. I presume int is still 32 bits, so:
inline void circular_left_shift(char *chars, short shift) {
    int i, part;
    part = chars[0] >> (16 - shift);
    for (i = 0; i < 3; ++i)
        chars[i] = (chars[i] << shift) | (chars[i + 1] >> (16 - shift));
    chars[3] = (chars[3] << shift) | part;
}
Or you could just unroll this loop.
You could dig further into the asm instruction ror; on x86 it's capable of performing such a shift by up to 31 bits. Something like:
MOV CL, 31
ROR EAX, CL
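That said, you rarely need assembly for this: compilers recognize the portable rotate idiom and emit a single rol/ror on x86. A sketch (assuming a 32-bit value already assembled from the bytes):
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    n &= 31;                                  /* keep the shift count defined */
    return (x << n) | (x >> ((32 - n) & 31));
}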
Use a union:

typedef union {
    unsigned int i;
    unsigned char c[4];
} chr_int;
It's safer (because of pointer aliasing) and easier to manipulate.
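For instance, something like this sketch (mine; it assumes 8-bit char and 32-bit unsigned int, and note that on a little-endian machine the bytes land in c[] in reverse significance, so the big-endian interpretation in the question's example would need a byte swap first):
chr_int v;
memcpy(v.c, a, 4);               /* bytes in native order; needs <string.h> */
v.i = (v.i << 1) | (v.i >> 31);  /* rotate left by 1 bit */
memcpy(a, v.c, 4);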
EDIT: you should have mentioned earlier that your char isn't 8 bits. However, this should do the trick:

#define ORIG_MASK 0x81010102
#define LS_CNT 1

unsigned char a[4] = {
    ((ORIG_MASK <<  LS_CNT      ) | (ORIG_MASK >> (32 - LS_CNT))) & 0xff,
    ((ORIG_MASK << (LS_CNT +  8)) | (ORIG_MASK >> (24 - LS_CNT))) & 0xff,
    ((ORIG_MASK << (LS_CNT + 16)) | (ORIG_MASK >> (16 - LS_CNT))) & 0xff,
    ((ORIG_MASK << (LS_CNT + 24)) | (ORIG_MASK >> ( 8 - LS_CNT))) & 0xff
};
I have a couple of uint8_t arrays in my C code, and I'd like to compare an arbitrary sequence of bits from one with another. So for example, I have bitarray_1 and bitarray_2, and I'd like to compare bits 13 - 47 from bitarray_1 with bits 5 - 39 of bitarray_2. What is the most efficient way to do this?
Currently it's a huge bottleneck in my program, since I just have a naive implementation that copies the bits into the beginning of a new temporary array, and then uses memcmp on them.
Three words: shift, mask and XOR.
Shift to get the same memory alignment for both bit arrays; if they don't match, you will have to shift one of the arrays before comparing them. Your example is probably misleading, because bits 13 - 47 and bits 5 - 39 have the same alignment within their bytes. This wouldn't be true if you were comparing, say, bits 14 - 48 with bits 5 - 39.
Once everything is aligned and the excess bits at the array boundaries are cleared, a XOR is enough to compare all the bits at once. Basically you can manage with just one memory read per array, which should be pretty efficient.
If the memory alignment is the same for both arrays, as in your example, memcmp with special cases for the upper and lower bounds is probably faster yet.
Also, accessing the arrays by uint32_t (or uint64_t on 64-bit architectures) should be more efficient than accessing them by uint8_t.
The principle is simple but, as Andrejs said, the implementation is not painless...
Here is how it goes (the similarity with @caf's proposal is no coincidence):
#include <stdint.h>

#define EQUAL     1
#define NOT_EQUAL 0

/* compare_bit_sequence() */
int compare_bit_sequence(uint8_t s1[], unsigned s1_off, uint8_t s2[], unsigned s2_off,
                         unsigned length)
{
    const uint8_t mask_lo_bits[] =
        { 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
    const uint8_t clear_lo_bits[] =
        { 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
    uint8_t v1;
    uint8_t *max_s1;
    unsigned end;
    uint8_t lsl;
    uint8_t v1_mask;
    int delta;

    /* Make sure the offsets are less than 8 bits */
    s1 += s1_off >> 3;
    s1_off &= 7;
    s2 += s2_off >> 3;
    s2_off &= 7;

    /* Make sure s2 is the sequence with the shorter offset */
    if (s2_off > s1_off) {
        uint8_t *tmp_s;
        unsigned tmp_off;
        tmp_s = s2; s2 = s1; s1 = tmp_s;
        tmp_off = s2_off; s2_off = s1_off; s1_off = tmp_off;
    }
    delta = s1_off;

    /* Handle the beginning, s2 incomplete */
    if (s2_off > 0) {
        delta = s1_off - s2_off;
        v1 = delta
            ? (s1[0] >> delta | s1[1] << (8 - delta)) & clear_lo_bits[delta]
            : s1[0];
        if (length <= 8 - s2_off) {
            if ((v1 ^ *s2)
                & clear_lo_bits[s2_off]
                & mask_lo_bits[s2_off + length]) {
                return NOT_EQUAL;
            } else {
                return EQUAL;
            }
        } else {
            if ((v1 ^ *s2) & clear_lo_bits[s2_off]) {
                return NOT_EQUAL;
            }
            length -= 8 - s2_off;
        }
        s1++;
        s2++;
    }

    /* Main loop: we test one group of 8 bits of s2 at each iteration */
    max_s1 = s1 + (length >> 3);
    lsl = 8 - delta;
    v1_mask = clear_lo_bits[delta];
    while (s1 < max_s1) {
        v1 = *s1 >> delta;             /* low part from the current byte */
        s1++;
        v1 |= (*s1 << lsl) & v1_mask;  /* high part from the next byte */
        if (v1 ^ *s2++) {
            return NOT_EQUAL;
        }
    }

    /* Last group of bits, s2 incomplete */
    end = length & 7;
    if (end && ((*s2 ^ *s1 >> delta) & mask_lo_bits[end])) {
        return NOT_EQUAL;
    }
    return EQUAL;
}
Not all possible optimisations are used yet. One promising one would be to use larger chunks of data (64 or 32 bits at a time instead of 8). You could also detect cases where the offsets are in sync for both arrays and use memcmp instead of the main loop, replace modulo % 8 with the logical operator & 7 and '/ 8' with '>> 3', keep two branches of code instead of swapping s1 and s2, and so on. But the main purpose is achieved: only one memory read and no memory write per array item, hence most of the work can take place inside processor registers.
bits 13 - 47 of bitarray_1 are the same as bits 5 - 39 of bitarray_1 + 1.
Compare the first 3 bits (5 - 7) with a mask and the other bits (8 - 39) with memcmp().
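In code, that might look like the following sketch (assuming bit 0 is the most significant bit of byte 0, the convention the other answers use):
/* bits 13..47 of bitarray_1 == bits 5..39 of bitarray_2 ? */
const uint8_t *p1 = bitarray_1 + 1;  /* bits 13..47 are bits 5..39 here */
const uint8_t *p2 = bitarray_2;
int equal = ((p1[0] ^ p2[0]) & 0x07) == 0    /* bits 5..7 = low 3 bits */
         && memcmp(p1 + 1, p2 + 1, 4) == 0;  /* bits 8..39 = bytes 1..4 */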
Rather than shift and copy the bits, maybe representing them differently is faster. You have to measure.
/* code skeleton */
static char bitarray_1_bis[BIT_ARRAY_SIZE*8+1];
static char bitarray_2_bis[BIT_ARRAY_SIZE*8+1];

static const char *lookup_table[] = {
    "00000000", "00000001", "00000010" /* ... */
    /* 256 strings */
    /* ... */ "11111111"
};

/* copy every bit of bitarray_1 to an element of bitarray_1_bis */
for (k = 0; k < BIT_ARRAY_SIZE; k++) {
    strcpy(bitarray_1_bis + 8*k, lookup_table[bitarray_1[k]]);
    strcpy(bitarray_2_bis + 8*k, lookup_table[bitarray_2[k]]);
}

memcmp(bitarray_1_bis + 13, bitarray_2_bis + 5, 47 - 13 + 1);
You can (and should) limit the copy to the minimum possible.
I have no idea if it's faster, but it wouldn't surprise me if it was. Again, you have to measure.
The easiest way to do this is to convert the more complex case into a simpler case, then solve the simpler case.
In the following code, do_compare() solves the simpler case (where the sequences are never offset by more than 7 bits, s1 is always offset as much or more than s2, and the length of the sequence is non-zero). The compare_bit_sequence() function then takes care of converting the harder case to the easier case, and calls do_compare() to do the work.
This just does a single-pass through the bit sequences, so hopefully that's an improvement on your copy-and-memcmp implementation.
#define NOT_EQUAL 0
#define EQUAL 1

/* do_compare()
 *
 * Does the actual comparison, but has some preconditions on parameters to
 * simplify things:
 *
 *     length > 0
 *     8 > s1_off >= s2_off
 */
int do_compare(const uint8_t s1[], const unsigned s1_off, const uint8_t s2[],
               const unsigned s2_off, const unsigned length)
{
    const uint8_t mask_lo_bits[] =
        { 0xff, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff };
    const uint8_t mask_hi_bits[] =
        { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
    const unsigned msb = (length + s1_off - 1) / 8;
    const unsigned s2_shl = s1_off - s2_off;
    const unsigned s2_shr = 8 - s2_shl;
    unsigned n;
    uint8_t s1s2_diff, lo_bits = 0;

    for (n = 0; n <= msb; n++)
    {
        /* Shift s2 so it is aligned with s1, pulling in low bits from
         * the high bits of the previous byte, and store in s1s2_diff */
        s1s2_diff = lo_bits | (s2[n] << s2_shl);

        /* Save the bits needed to fill in the low-order bits of the next
         * byte. HERE BE DRAGONS - since s2_shr can be 8, the line below
         * only works because uint8_t is promoted to int, and we know that
         * the width of int is guaranteed to be >= 16. If you change this
         * routine to work with a wider type than uint8_t, you will need
         * to special-case this line so that if s2_shr is the width of the
         * type, you get lo_bits = 0. Don't say you weren't warned. */
        lo_bits = s2[n] >> s2_shr;

        /* XOR with s1[n] to determine bits that differ between s1 and s2 */
        s1s2_diff ^= s1[n];

        /* Look only at differences in the high bits in the first byte */
        if (n == 0)
            s1s2_diff &= mask_hi_bits[8 - s1_off];

        /* Look only at differences in the low bits of the last byte */
        if (n == msb)
            s1s2_diff &= mask_lo_bits[(length + s1_off) % 8];

        if (s1s2_diff)
            return NOT_EQUAL;
    }
    return EQUAL;
}

/* compare_bit_sequence()
 *
 * Adjusts the parameters to match the preconditions for do_compare(), then
 * calls it to do the work.
 */
int compare_bit_sequence(const uint8_t s1[], unsigned s1_off,
                         const uint8_t s2[], unsigned s2_off, unsigned length)
{
    /* Handle length zero */
    if (length == 0)
        return EQUAL;

    /* Make sure the offsets are less than 8 bits */
    s1 += s1_off / 8;
    s1_off %= 8;
    s2 += s2_off / 8;
    s2_off %= 8;

    /* Make sure s2 is the sequence with the shorter offset */
    if (s1_off >= s2_off)
        return do_compare(s1, s1_off, s2, s2_off, length);
    else
        return do_compare(s2, s2_off, s1, s1_off, length);
}
To do the comparison in your example, you'd call:
compare_bit_sequence(bitarray_1, 13, bitarray_2, 5, 35)
(Note that I am numbering the bits from zero, and assuming that the bit arrays are laid out little-endian, so this will start the comparison from the sixth-least-significant bit of bitarray_2[0] and the sixth-least-significant bit of bitarray_1[1].)
What about writing a function that calculates the offsets into both arrays, applies the mask, shifts the bits and stores the result in an int, so you can compare them? If the bit count (34 in your example) exceeds the length of an int, recurse or loop.
Sorry, the example would be a pain in the ass.
Here is my unoptimized bit sequence comparison function:
#include <stdio.h>
#include <stdint.h>
// 01234567 01234567
uint8_t bitsA[] = { 0b01000000, 0b00010000 };
uint8_t bitsB[] = { 0b10000000, 0b00100000 };
int bit( uint8_t *bits, size_t bitpoz, size_t len ){
return (bitpoz<len)? !!(bits[bitpoz/8]&(1<<(7-bitpoz%8))): 0;
}
int bitcmp( uint8_t *bitsA, size_t firstA, size_t lenA,
uint8_t *bitsB, size_t firstB, size_t lenB ){
int cmp;
for( size_t i=0; i<lenA || i<lenB; i++ ){
if( (cmp = bit(bitsA,firstA+i,firstA+lenA) -
bit(bitsB,firstB+i,firstB+lenB)) ) return cmp;
}
return 0;
}
int main(){
printf( "cmp: %i\n", bitcmp( bitsA,1,11, bitsB,0,11 ) );
}
EDIT: Here is my (untested) bitstring equality test function:
#include <stdlib.h>
#include <stdint.h>
#define load_64bit(bits,first) (*(uint64_t*)bits<<first | *(bits+8)>>(8-first))
#define load_32bit(bits,first) (*(uint32_t*)bits<<first | *(bits+4)>>(8-first))
#define load_16bit(bits,first) (*(uint16_t*)bits<<first | *(bits+2)>>(8-first))
#define load_8bit( bits,first) ( *bits<<first | *(bits+1)>>(8-first))
static inline uint8_t last_bits( uint8_t *bits, size_t first, size_t size ){
return (first+size>8?load_8bit(bits,first):*bits<<first)>>(8-size);
}
int biteq( uint8_t *bitsA, size_t firstA,
uint8_t *bitsB, size_t firstB, size_t size ){
if( !size ) return 1;
bitsA+=firstA/8; firstA%=8;
bitsB+=firstB/8; firstB%=8;
for(; size>64;size-=64,bitsA+=8,bitsB+=8)
if(load_64bit(bitsA,firstA)!=load_64bit(bitsB,firstB)) return 0;
for(; size>32;size-=32,bitsA+=4,bitsB+=4)
if(load_32bit(bitsA,firstA)!=load_32bit(bitsB,firstB)) return 0;
for(; size>16;size-=16,bitsA+=2,bitsB+=2)
if(load_16bit(bitsA,firstA)!=load_16bit(bitsB,firstB)) return 0;
for(; size> 8;size-= 8,bitsA++, bitsB++ )
if(load_8bit( bitsA,firstA)!=load_8bit( bitsB,firstB)) return 0;
return !size ||
last_bits(bitsA,firstA,size)==last_bits(bitsB,firstB,size);
}
I made a simple measurement tool to see how fast it is:
#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <signal.h>
#define SIZE 1000000
uint8_t bitsC[SIZE];
volatile int end_loop;
void sigalrm_hnd( int sig ){ (void)sig; end_loop=1; }
int main(){
uint64_t loop_count; int cmp;
signal(SIGALRM,sigalrm_hnd);
loop_count=0; end_loop=0; alarm(10);
while( !end_loop ){
for( int i=1; i<7; i++ ){
loop_count++;
cmp = biteq( bitsC,i, bitsC,7-i,(SIZE-1)*8 );
if( !cmp ){ printf( "cmp: %i (==0)\n", cmp ); return -1; }
}
}
printf( "biteq: %.2f round/sec\n", loop_count/10.0 );
}
Result:
bitcmp: 8.40 round/sec
biteq: 363.60 round/sec
EDIT2: last_bits() changed.