Optimize unaligned SSE2/AVX2 XOR in C

In my code I have to handle "unmasking" of websocket packets, which essentially means XOR'ing unaligned data of arbitrary length. Thanks to SO (Websocket data unmasking / multi byte xor) I have already found out how to (hopefully) speed this up using SSE2/AVX2 extensions, but looking at it now, my handling of unaligned data seems totally sub-optimal. Is there any way to optimize my code, or at least make it simpler with the same performance, or is my code already the best performing?
Here's the important part of the code (for the question I'm assuming that there will always be enough data to run the AVX2 loop at least once, though at the same time it will mostly run only a few times at most):
// circular shift left for uint32
uint32_t cshiftl_u32(uint32_t num, uint8_t shift) {
    return (num << shift) | (num >> (32 - shift));
}

// circular shift right for uint32
uint32_t cshiftr_u32(uint32_t num, uint8_t shift) {
    return (num >> shift) | (num << (32 - shift));
}

void optimized_xor_32( uint32_t mask, uint8_t *ds, uint8_t *de ) {
    if (ds == de) return; // zero data len -> nothing to do

    uint8_t maskOffset = 0;

    // process single bytes till 4 byte alignment ( <= 3 )
    for (; ds < de && ( (uint64_t)ds & (uint64_t)3 ); ds++) {
        *ds ^= *((uint8_t *)(&mask) + maskOffset);
        maskOffset = (maskOffset + 1) & (uint8_t)3;
    }

    if (ds == de) return; // done, return

    if (maskOffset != 0) { // rotate mask around so it works for the wider stores
        // rotate right by whole bytes: on little-endian this moves byte
        // maskOffset of the mask into the low byte
        mask = cshiftr_u32(mask, maskOffset * 8);
        maskOffset = 0;
    }

    // process 4 byte block till 8 byte alignment ( <= 1 )
    uint8_t *de32 = (uint8_t *)((uint64_t)de & ~((uint64_t)31));

    if ( ds < de32 && ( (uint64_t)ds & (uint64_t)7 ) ) {
        *(uint32_t *)ds ^= mask; // mask is uint32_t
        if ((ds += 4) == de) return; // done, return
    }

    // process 8 byte block till 16 byte alignment ( <= 1 )
    uint64_t mask64 = ((uint64_t)mask << 32) | mask;
    uint8_t *de64 = (uint8_t *)((uint64_t)de & ~((uint64_t)63));

    if ( ds < de64 && ( (uint64_t)ds & (uint64_t)15 ) ) {
        *(uint64_t *)ds ^= mask64;
        if ((ds += 8) == de) return; // done, return
    }

    // process 16 byte block till 32 byte alignment ( <= 1 ) (if supported)
#ifdef CPU_SSE2
    __m128i v128, v128_mask;
    v128_mask = _mm_set1_epi32(mask);

    uint8_t *de128 = (uint8_t *)((uint64_t)de & ~((uint64_t)127));

    if ( ds < de128 && ( (uint64_t)ds & (uint64_t)31 ) ) {
        v128 = _mm_load_si128((__m128i *)ds);
        v128 = _mm_xor_si128(v128, v128_mask);
        _mm_store_si128((__m128i *)ds, v128);
        if ((ds += 16) == de) return; // done, return
    }
#endif

#ifdef CPU_AVX2 // process 32 byte blocks (if supported -> Haswell upwards)
    __m256i v256, v256_mask;
    v256_mask = _mm256_set1_epi32(mask);

    uint8_t *de256 = (uint8_t *)((uint64_t)de & ~((uint64_t)255));

    for (; ds < de256; ds += 32) {
        v256 = _mm256_load_si256((__m256i *)ds);
        v256 = _mm256_xor_si256(v256, v256_mask);
        _mm256_store_si256((__m256i *)ds, v256);
    }

    if (ds == de) return; // done, return
#endif

#ifdef CPU_SSE2 // process remaining 16 byte blocks (if supported)
    for (; ds < de128; ds += 16) {
        v128 = _mm_load_si128((__m128i *)ds);
        v128 = _mm_xor_si128(v128, v128_mask);
        _mm_store_si128((__m128i *)ds, v128);
    }

    if (ds == de) return; // done, return
#endif

    // process remaining 8 byte blocks
    // this should always be supported, so remaining can be assumed to be executed <= 1 times
    for (; ds < de64; ds += 8) {
        *(uint64_t *)ds ^= mask64;
    }

    if (ds == de) return; // done, return

    // process remaining 4 byte blocks ( <= 1 )
    if (ds < de32) {
        *(uint32_t *)ds ^= mask;
        if ((ds += 4) == de) return; // done, return
    }

    // process remaining bytes ( <= 3 )
    for (; ds < de; ds++) {
        *ds ^= *((uint8_t *)(&mask) + maskOffset);
        maskOffset = (maskOffset + 1) & (uint8_t)3;
    }
}
P.S.: Please ignore the use of #ifdef instead of cpuid or the like for cpu flag detection.

Unlike what it says in the manual, most Intel processors are actually quite good at handling unaligned data. Since you are using Intel's compiler builtins for vector handling, I assume you have access to a reasonably recent version of icc.
If you can not naturally align your data then I am afraid that what you are doing is as close as you can get to maximum performance. In terms of making the code more readable and deployable on Xeon Phi (64-byte vector registers) and future longer-vector processors, I would suggest you start using Intel Cilk Plus.
Example:
void intel_cilk_xor(uint32_t mask, uint8_t *d, size_t length) {
    while (length & 0x3) {
        *(d++) ^= mask;
        asm ("rorl $8, %0" : "+g" (mask) :: "cc"); // rotate dword so the next mask byte is in the low byte
        length--;
    }

    // switch to 4 bytes per block
    uint32_t *_d = (uint32_t *)d;
    length >>= 2;

    // Intel Cilk Plus Array Notation
    // Should expand automatically to the best possible SIMD instructions
    // you are compiling for
    _d[0:length] ^= mask;
}
Please note that I did not test this code as I do not have access to an Intel compiler right now. If you encounter problems, I can go over it when I am back in my office next week.
If you prefer intrinsics instead, proper use of preprocessor macros can significantly ease your life:
#include <stdint.h>
#include <immintrin.h>

#if defined(__MIC__)
// Intel Xeon Phi
#define VECTOR_BLOCKSIZE 64
// I do not remember the correct types/instructions right now
#error "TODO: MIC handling"
#elif defined(CPU_AVX2)
#define VECTOR_BLOCKSIZE 32
typedef __m256i my_vector_t;
#define VECTOR_LOAD_MASK _mm256_set1_epi32
#define VECTOR_XOR(d, mask) _mm256_store_si256(d, _mm256_xor_si256(_mm256_load_si256(d), mask))
#elif defined(CPU_SSE2)
#define VECTOR_BLOCKSIZE 16
typedef __m128i my_vector_t;
#define VECTOR_LOAD_MASK _mm_set1_epi32
#define VECTOR_XOR(d, mask) _mm_store_si128(d, _mm_xor_si128(_mm_load_si128(d), mask))
#else
#define VECTOR_BLOCKSIZE 8
typedef uint64_t my_vector_t;
#define VECTOR_LOAD_MASK(mask) (((uint64_t)(mask) << 32) | (mask))
#define VECTOR_XOR(d, mask) (*(d) ^= (mask))
#endif
void optimized_xor_32( uint32_t mask, uint8_t *d, size_t length ) {
    size_t i;

    // there really is no point in having extra
    // branches for different vector lengths if they are
    // executed at most once
    // branch prediction is your friend here
    // so we do one byte at a time until the block size
    // is reached
    while (length && ((uintptr_t)d & (VECTOR_BLOCKSIZE - 1))) {
        *(d++) ^= mask;
        asm ("rorl $8, %0" : "+g" (mask) :: "cc"); // rotate dword so the next mask byte is in the low byte
        length--;
    }

    my_vector_t *d_vector = (my_vector_t *)d;
    my_vector_t vector_mask = VECTOR_LOAD_MASK(mask);
    size_t vector_length = length / VECTOR_BLOCKSIZE; // compiler will optimise this to a bitshift
    length &= VECTOR_BLOCKSIZE - 1; // remaining length

    for (i = 0; i < vector_length; i++) {
        VECTOR_XOR(d_vector + i, vector_mask);
    }

    // process the tail
    d = (uint8_t *)(d_vector + i);
    for (i = 0; i < length; i++) {
        d[i] ^= mask;
        asm ("rorl $8, %0" : "+g" (mask) :: "cc");
    }
}
On another note: You may want to use the x86 rotate instruction instead of bit shifts to rotate mask:
#define asm_rol(var, bits) asm ("rol %1, %0" : "+r" (var) : "c" ((uint8_t)bits) : "cc")
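For example, a quick sketch of the macro at a call site (values purely illustrative):
uint32_t mask = 0xDEADBEEF;
asm_rol(mask, 8);  // mask is now 0xADBEEFDE: rotated left by one byte
asm_rol(mask, 24); // three more byte rotations bring it back to 0xDEADBEEF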

Related

C Zephyr SDK CRC16 Implementation

I was looking around in the Zephyr sources and found this method for computing a CRC16 checksum:
u16_t crc16(const u8_t *src, size_t len, u16_t polynomial,
            u16_t initial_value, bool pad)
{
    u16_t crc = initial_value;
    size_t padding = pad ? sizeof(crc) : 0;
    size_t i, b;

    /* src length + padding (if required) */
    for (i = 0; i < len + padding; i++) {
        for (b = 0; b < 8; b++) {
            u16_t divide = crc & 0x8000UL;

            crc = (crc << 1U);

            /* choose input bytes or implicit trailing zeros */
            if (i < len) {
                crc |= !!(src[i] & (0x80U >> b));
            }

            if (divide != 0U) {
                crc = crc ^ polynomial;
            }
        }
    }
    return crc;
}
And I tripped over this line here:
crc |= !!(src[i] & (0x80U >> b));
I do not understand why they are using a boolean operator (!!) in this line. From my understanding this is what it does:
It basically does an implicit "cast" where it treats its right-hand operand as a boolean and negates it twice, which does nothing besides making the output a 0 or a 1, depending on whether the expression (src[i] & (0x80U >> b)) was nonzero to start with.
Is this correct? Why are they using the operator in this way?
It is inserting bit 7-b from src[i] into the low bit of crc. If that bit is a 1, which will be somewhere in the result of the &, the !! turns it into a 1 in the low bit, which is then or'ed into crc.
This is truly painful to look at. A better and cleaner way to do it is crc |= (src[i] >> b) & 1;, where b counts down instead of up. E.g. int b = 8; do { b--; ... } while (b);. Better still would be to just exclusive-or the byte after the loop, which does the same thing:
/* src length + padding (if required) */
for (i = 0; i < len + padding; i++) {
    for (b = 0; b < 8; b++)
        crc = crc & 0x8000 ? (crc << 1) ^ polynomial : crc << 1;
    if (i < len)
        crc ^= src[i];
}
An optimizing compiler will unroll the b loop.
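Putting that together, here is a minimal sketch of the cleaned-up routine; it is my adaptation of the loop above, using <stdint.h> types in place of Zephyr's u16_t/u8_t:
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

uint16_t crc16(const uint8_t *src, size_t len, uint16_t polynomial,
               uint16_t initial_value, bool pad)
{
    uint16_t crc = initial_value;
    size_t padding = pad ? sizeof(crc) : 0;

    /* src length + padding (if required) */
    for (size_t i = 0; i < len + padding; i++) {
        for (int b = 0; b < 8; b++)
            crc = crc & 0x8000 ? (crc << 1) ^ polynomial : crc << 1;
        /* feed the whole input byte at once; pad bytes are implicit zeros */
        if (i < len)
            crc ^= src[i];
    }
    return crc;
}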

How to set the values of an array to a single variable

I'm reading the values from an SD card in an ARM micro:
Res = f_read(&fil, (void*)buf, 6, &NumBytesRead);
where fil is the file object and buf is the buffer where the data is stored.
And that's the problem: it's an array but I'd like to have the contents of that array in a single variable.
To give an actual example: the 6 bytes read from the file are:
buf[0] = 0x1B
buf[1] = 0x26
buf[2] = 0xB3
buf[3] = 0x54
buf[4] = 0xA1
buf[5] = 0xCF
And I'd like to have uint64_t data be equal to 0x1B26B354A1CF; that is, all the elements of the array "concatenated" into one single 64-bit integer.
Without type punning you can do as below.
uint64_t data = 0;
for (int i = 0; i < 6; i++)
{
    data <<= 8;
    data |= (uint64_t) buf[i];
}
Use a union, but remember about the endianness.
#include <stdio.h>
#include <string.h>
#include <stdint.h>

typedef union
{
    uint8_t u8[8];
    uint64_t u64;
} u64;

typedef enum
{
    LITTLE_E,
    BIG_E,
} ENDIANESS;

ENDIANESS checkEndianess(void)
{
    ENDIANESS result = BIG_E;
    u64 d64 = { .u64 = 0xff };

    if (d64.u8[0]) result = LITTLE_E;

    return result;
}
uint64_t arrayToU64(uint8_t *array, ENDIANESS e) // for the array BE
{
    u64 d64;

    if (e == LITTLE_E)
    {
        memmove(&d64, array, sizeof(d64.u64));
    }
    else
    {
        for (int index = sizeof(d64.u64) - 1; index >= 0; index--)
        {
            d64.u8[sizeof(d64.u64) - index - 1] = array[index];
        }
    }
    return d64.u64;
}
int main()
{
    uint8_t BIG_E_Array[] = {0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80};
    ENDIANESS e;

    printf("This system endianess: %s\n", (e = checkEndianess()) == BIG_E ? "BIG" : "LITTLE");
    printf("Punned uint64_t for our system 0x%lx\n", arrayToU64(BIG_E_Array, e));
    printf("Punned uint64_t for the opposite endianess system 0x%lx\n", arrayToU64(BIG_E_Array, e == BIG_E ? LITTLE_E : BIG_E));

    return 0;
}
Two things to take care of here:
have the bytes ordered correctly
read the six bytes into one 64-bit integer
Issue 1 can be taken care of by converting the bytes, which arrive in network byte order (big endian), into host byte order, for example by using the two macros below:
/* below defines of htonll() and ntohll() are taken from this answer:
   https://stackoverflow.com/a/28592202/694576 */
#if __BIG_ENDIAN__
# define htonll(x) (x)
# define ntohll(x) (x)
#else
# define htonll(x) (((uint64_t)htonl((x) & 0xFFFFFFFF) << 32) | htonl((x) >> 32))
# define ntohll(x) (((uint64_t)ntohl((x) & 0xFFFFFFFF) << 32) | ntohl((x) >> 32))
#endif
Issue 2 can be solved in multiple ways:
Extending your approach
#define BUFFER_SIZE (6)
...

assert(BUFFER_SIZE <= sizeof (uint64_t));

uint8_t buffer[BUFFER_SIZE];

FILE * pf = ...; /* open file here */
/* test if file has been opened successfully here */

... result = f_read(pf, buffer, BUFFER_SIZE, ...);
/* test result for success */

uint64_t number = 0;
memcpy(&number, buffer, BUFFER_SIZE);
number = ntohll(number);
Use "Type Punning" by using a union
union buffer_wrapper
{
    uint8_t u8[sizeof (uint64_t)];
    uint64_t u64;
};
Instead of
uint8_t buffer[BUFFER_SIZE];
use
union buffer_wrapper buffer;
and instead of
memcpy(&number, buffer, BUFFER_SIZE)
number = ntohll(number)
use
number = ntohll(buffer.u64)
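Combining the two pieces for the concrete 6-byte case from the question, a small sketch (my own glue code: it assumes the ntohll macro above plus <arpa/inet.h> for ntohl; the final >> 16 drops the two bytes of the uint64_t that carry no data):
#include <stdint.h>
#include <string.h>

uint64_t six_bytes_to_u64(const uint8_t buf[6])
{
    uint64_t number = 0;
    memcpy(&number, buf, 6);     /* fills the first six bytes, the rest stay zero */
    return ntohll(number) >> 16; /* 0x1B 0x26 0xB3 0x54 0xA1 0xCF -> 0x1B26B354A1CF */
}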

How to implement CRC32 taking advantage of Intel specific instructions? [duplicate]

So I have a design which incorporates CRC32C checksums to ensure data hasn't been damaged. I decided to use CRC32C because I can have both a software version and a hardware-accelerated version if the computer the software runs on supports SSE 4.2
I'm going by Intel's developer manual (vol 2A), which seems to provide the algorithm behind the crc32 instruction. However, I'm having little luck. Intel's developer guide says the following:
BIT_REFLECT32: DEST[31-0] = SRC[0-31]
MOD2: Remainder from Polynomial division modulus 2
TEMP1[31-0] <- BIT_REFLECT(SRC[31-0])
TEMP2[31-0] <- BIT_REFLECT(DEST[31-0])
TEMP3[63-0] <- TEMP1[31-0] << 32
TEMP4[63-0] <- TEMP2[31-0] << 32
TEMP5[63-0] <- TEMP3[63-0] XOR TEMP4[63-0]
TEMP6[31-0] <- TEMP5[63-0] MOD2 0x11EDC6F41
DEST[31-0] <- BIT_REFLECT(TEMP6[31-0])
Now, as far as I can tell, I've done everything up to the line starting TEMP6 correctly, but I think I may be either misunderstanding the polynomial division, or implementing it incorrectly. If my understanding is correct, 1 / 1 mod 2 = 1, 0 / 1 mod 2 = 0, and both divides-by-zero are undefined.
What I don't understand is how binary division with 64-bit and 33-bit operands will work. If SRC is 0x00000000, and DEST is 0xFFFFFFFF, TEMP5[63-32] will be all set bits, while TEMP5[31-0] will be all unset bits.
If I was to use the bits from TEMP5 as the numerator, there would be 30 divisions by zero as the polynomial 11EDC6F41 is only 33 bits long (and so converting it to a 64-bit unsigned integer leaves the top 30 bits unset), and so the denominator is unset for 30 bits.
However, if I was to use the polynomial as the numerator, the bottom 32 bits of TEMP5 are unset, resulting in divides by zero there, and the top 30 bits of the result would be zero, as the top 30 bits of the numerator would be zero, as 0 / 1 mod 2 = 0.
Am I misunderstanding how this works? Just plain missing something? Or has Intel left out some crucial step in their documentation?
The reason I went to Intel's developer guide for what appeared to be the algorithm they used is because they used a 33-bit polynomial, and I wanted to make the outputs identical, which didn't happen when I used the 32-bit polynomial 1EDC6F41 (shown below).
uint32_t poly = 0x1EDC6F41, sres, crcTable[256], data = 0x00000000;
int n, k;

for (n = 0; n < 256; n++) {
    sres = n;
    for (k = 0; k < 8; k++)
        sres = (sres & 1) == 1 ? poly ^ (sres >> 1) : (sres >> 1);
    crcTable[n] = sres;
}

sres = 0xFFFFFFFF;
for (n = 0; n < 4; n++) {
    sres = crcTable[(sres ^ data) & 0xFF] ^ (sres >> 8);
}
The above code produces 4138093821 as an output, and the crc32 opcode produces 2346497208 using the input 0x00000000.
Sorry if this is badly written or incomprehensible in places, it is rather late for me.
Here are both software and hardware versions of CRC-32C. The software version is optimized to process eight bytes at a time. The hardware version is optimized to run three crc32q instructions effectively in parallel on a single core, since the throughput of that instruction is one cycle, but the latency is three cycles.
crc32c.c:
/* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
* Copyright (C) 2013, 2021 Mark Adler
* Version 1.2 5 Jun 2021 Mark Adler
*/
/*
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
Mark Adler
madler@alumni.caltech.edu
*/
/* Version History:
1.0 10 Feb 2013 First version
1.1 31 May 2021 Correct register constraints on assembly instructions
Include pre-computed tables to avoid use of pthreads
Return zero for the CRC when buf is NULL, as initial value
1.2 5 Jun 2021 Make tables constant
*/
// Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
// CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
// version is provided as a fall-back, as well as for speed comparisons.
#include <stddef.h>
#include <stdint.h>
// Tables for CRC word-wise calculation, definitions of LONG and SHORT, and CRC
// shifts by LONG and SHORT bytes.
#include "crc32c.h"
// Table-driven software version as a fall-back. This is about 15 times slower
// than using the hardware instructions. This assumes little-endian integers,
// as is the case on Intel processors that the assembler code here is for.
static uint32_t crc32c_sw(uint32_t crc, void const *buf, size_t len) {
if (buf == NULL)
return 0;
unsigned char const *data = buf;
while (len && ((uintptr_t)data & 7) != 0) {
crc = (crc >> 8) ^ crc32c_table[0][(crc ^ *data++) & 0xff];
len--;
}
size_t n = len >> 3;
for (size_t i = 0; i < n; i++) {
uint64_t word = crc ^ ((uint64_t const *)data)[i];
crc = crc32c_table[7][word & 0xff] ^
crc32c_table[6][(word >> 8) & 0xff] ^
crc32c_table[5][(word >> 16) & 0xff] ^
crc32c_table[4][(word >> 24) & 0xff] ^
crc32c_table[3][(word >> 32) & 0xff] ^
crc32c_table[2][(word >> 40) & 0xff] ^
crc32c_table[1][(word >> 48) & 0xff] ^
crc32c_table[0][word >> 56];
}
data += n << 3;
len &= 7;
while (len) {
len--;
crc = (crc >> 8) ^ crc32c_table[0][(crc ^ *data++) & 0xff];
}
return crc;
}
// Apply the zeros operator table to crc.
static uint32_t crc32c_shift(uint32_t const zeros[][256], uint32_t crc) {
return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
}
// Compute CRC-32C using the Intel hardware instruction. Three crc32q
// instructions are run in parallel on a single core. This gives a
// factor-of-three speedup over a single crc32q instruction, since the
// throughput of that instruction is one cycle, but the latency is three
// cycles.
static uint32_t crc32c_hw(uint32_t crc, void const *buf, size_t len) {
if (buf == NULL)
return 0;
// Pre-process the crc.
uint64_t crc0 = crc ^ 0xffffffff;
// Compute the crc for up to seven leading bytes, bringing the data pointer
// to an eight-byte boundary.
unsigned char const *next = buf;
while (len && ((uintptr_t)next & 7) != 0) {
__asm__("crc32b\t" "(%1), %0"
: "+r"(crc0)
: "r"(next), "m"(*next));
next++;
len--;
}
// Compute the crc on sets of LONG*3 bytes, making use of three ALUs in
// parallel on a single core.
while (len >= LONG*3) {
uint64_t crc1 = 0;
uint64_t crc2 = 0;
unsigned char const *end = next + LONG;
do {
__asm__("crc32q\t" "(%3), %0\n\t"
"crc32q\t" LONGx1 "(%3), %1\n\t"
"crc32q\t" LONGx2 "(%3), %2"
: "+r"(crc0), "+r"(crc1), "+r"(crc2)
: "r"(next), "m"(*next));
next += 8;
} while (next < end);
crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
next += LONG*2;
len -= LONG*3;
}
// Do the same thing, but now on SHORT*3 blocks for the remaining data less
// than a LONG*3 block.
while (len >= SHORT*3) {
uint64_t crc1 = 0;
uint64_t crc2 = 0;
unsigned char const *end = next + SHORT;
do {
__asm__("crc32q\t" "(%3), %0\n\t"
"crc32q\t" SHORTx1 "(%3), %1\n\t"
"crc32q\t" SHORTx2 "(%3), %2"
: "+r"(crc0), "+r"(crc1), "+r"(crc2)
: "r"(next), "m"(*next));
next += 8;
} while (next < end);
crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
next += SHORT*2;
len -= SHORT*3;
}
// Compute the crc on the remaining eight-byte units less than a SHORT*3
// block.
unsigned char const *end = next + (len - (len & 7));
while (next < end) {
__asm__("crc32q\t" "(%1), %0"
: "+r"(crc0)
: "r"(next), "m"(*next));
next += 8;
}
len &= 7;
// Compute the crc for up to seven trailing bytes.
while (len) {
__asm__("crc32b\t" "(%1), %0"
: "+r"(crc0)
: "r"(next), "m"(*next));
next++;
len--;
}
// Return the crc, post-processed.
return ~(uint32_t)crc0;
}
// Check for SSE 4.2. SSE 4.2 was first supported in Nehalem processors
// introduced in November, 2008. This does not check for the existence of the
// cpuid instruction itself, which was introduced on the 486SL in 1992, so this
// will fail on earlier x86 processors. cpuid works on all Pentium and later
// processors.
#define SSE42(have) \
do { \
uint32_t eax, ecx; \
eax = 1; \
__asm__("cpuid" \
: "=c"(ecx) \
: "a"(eax) \
: "%ebx", "%edx"); \
(have) = (ecx >> 20) & 1; \
} while (0)
// Compute a CRC-32C. If the crc32 instruction is available, use the hardware
// version. Otherwise, use the software version.
uint32_t crc32c(uint32_t crc, void const *buf, size_t len) {
int sse42;
SSE42(sse42);
return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
}
Code to generate crc32c.h (stackoverflow won't let me post the tables themselves, due to a 30,000 character limit in an answer):
// Generate crc32c.h for crc32c.c.
#include <stdio.h>
#include <stdint.h>
#define LONG 8192
#define SHORT 256
// Print a 2-D table of four-byte constants in hex.
static void print_table(uint32_t *tab, size_t rows, size_t cols, char *name) {
printf("static uint32_t const %s[][%zu] = {\n", name, cols);
size_t end = rows * cols;
size_t k = 0;
for (;;) {
fputs(" {", stdout);
size_t n = 0, j = 0;
for (;;) {
printf("0x%08x", tab[k + n]);
if (++n == cols)
break;
putchar(',');
if (++j == 6) {
fputs("\n ", stdout);
j = 0;
}
putchar(' ');
}
k += cols;
if (k == end)
break;
puts("},");
}
puts("}\n};");
}
/* CRC-32C (iSCSI) polynomial in reversed bit order. */
#define POLY 0x82f63b78
static void crc32c_word_table(void) {
uint32_t table[8][256];
// Generate byte-wise table.
for (unsigned n = 0; n < 256; n++) {
uint32_t crc = ~n;
for (unsigned k = 0; k < 8; k++)
crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
table[0][n] = ~crc;
}
// Use byte-wise table to generate word-wise table.
for (unsigned n = 0; n < 256; n++) {
uint32_t crc = ~table[0][n];
for (unsigned k = 1; k < 8; k++) {
crc = table[0][crc & 0xff] ^ (crc >> 8);
table[k][n] = ~crc;
}
}
// Print table.
print_table(table[0], 8, 256, "crc32c_table");
}
// Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC
// polynomial. For speed, this requires that a not be zero.
static uint32_t multmodp(uint32_t a, uint32_t b) {
uint32_t prod = 0;
for (;;) {
if (a & 0x80000000) {
prod ^= b;
if ((a & 0x7fffffff) == 0)
break;
}
a <<= 1;
b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
}
return prod;
}
/* Take a length and build four lookup tables for applying the zeros operator
for that length, byte-by-byte, on the operand. */
static void crc32c_zero_table(size_t len, char *name) {
// Generate operator for len zeros.
uint32_t op = 0x80000000; // 1 (x^0)
uint32_t sq = op >> 4; // x^4
while (len) {
sq = multmodp(sq, sq); // x^2^(k+3), k == len bit position
if (len & 1)
op = multmodp(sq, op);
len >>= 1;
}
// Generate table to update each byte of a CRC using op.
uint32_t table[4][256];
for (unsigned n = 0; n < 256; n++) {
table[0][n] = multmodp(op, n);
table[1][n] = multmodp(op, n << 8);
table[2][n] = multmodp(op, n << 16);
table[3][n] = multmodp(op, n << 24);
}
// Print the table to stdout.
print_table(table[0], 4, 256, name);
}
int main(void) {
puts(
"// crc32c.h\n"
"// Tables and constants for crc32c.c software and hardware calculations.\n"
"\n"
"// Table for a 64-bits-at-a-time software CRC-32C calculation. This table\n"
"// has built into it the pre and post bit inversion of the CRC."
);
crc32c_word_table();
puts(
"\n// Block sizes for three-way parallel crc computation. LONG and SHORT\n"
"// must both be powers of two. The associated string constants must be set\n"
"// accordingly, for use in constructing the assembler instructions."
);
printf("#define LONG %d\n", LONG);
printf("#define LONGx1 \"%d\"\n", LONG);
printf("#define LONGx2 \"%d\"\n", 2 * LONG);
printf("#define SHORT %d\n", SHORT);
printf("#define SHORTx1 \"%d\"\n", SHORT);
printf("#define SHORTx2 \"%d\"\n", 2 * SHORT);
puts(
"\n// Table to shift a CRC-32C by LONG bytes."
);
crc32c_zero_table(8192, "crc32c_long");
puts(
"\n// Table to shift a CRC-32C by SHORT bytes."
);
crc32c_zero_table(256, "crc32c_short");
return 0;
}
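For reference, a tiny test driver (my addition, not part of Mark Adler's code); 0xe3069283 is the well-known CRC-32C check value for the string "123456789":
#include <stdio.h>
#include <stdint.h>
#include <string.h>

uint32_t crc32c(uint32_t crc, void const *buf, size_t len);

int main(void) {
    const char *check = "123456789";
    printf("%08x\n", crc32c(0, check, strlen(check))); // expect e3069283
    return 0;
}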
Mark Adler's answer is correct and complete, but those seeking a quick and easy way to integrate CRC-32C in their application might find it a little difficult to adapt the code, especially if they are using Windows and .NET.
I've created a library that implements CRC-32C using either the hardware or the software method depending on available hardware. It's available as a NuGet package for C++ and .NET. It's open source, of course.
Besides packaging Mark Adler's code above, I've found a simple way to improve the throughput of the software fallback by 50%. On my computer, the library now achieves 2 GB/s in software and over 20 GB/s in hardware. For those curious, here's the optimized software implementation (buffer here is the library's own byte-pointer typedef):
static uint32_t append_table(uint32_t crci, buffer input, size_t length)
{
buffer next = input;
#ifdef _M_X64
uint64_t crc;
#else
uint32_t crc;
#endif
crc = crci ^ 0xffffffff;
#ifdef _M_X64
while (length && ((uintptr_t)next & 7) != 0)
{
crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
--length;
}
while (length >= 16)
{
crc ^= *(uint64_t *)next;
uint64_t high = *(uint64_t *)(next + 8);
crc = table[15][crc & 0xff]
^ table[14][(crc >> 8) & 0xff]
^ table[13][(crc >> 16) & 0xff]
^ table[12][(crc >> 24) & 0xff]
^ table[11][(crc >> 32) & 0xff]
^ table[10][(crc >> 40) & 0xff]
^ table[9][(crc >> 48) & 0xff]
^ table[8][crc >> 56]
^ table[7][high & 0xff]
^ table[6][(high >> 8) & 0xff]
^ table[5][(high >> 16) & 0xff]
^ table[4][(high >> 24) & 0xff]
^ table[3][(high >> 32) & 0xff]
^ table[2][(high >> 40) & 0xff]
^ table[1][(high >> 48) & 0xff]
^ table[0][high >> 56];
next += 16;
length -= 16;
}
#else
while (length && ((uintptr_t)next & 3) != 0)
{
crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
--length;
}
while (length >= 12)
{
crc ^= *(uint32_t *)next;
uint32_t high = *(uint32_t *)(next + 4);
uint32_t high2 = *(uint32_t *)(next + 8);
crc = table[11][crc & 0xff]
^ table[10][(crc >> 8) & 0xff]
^ table[9][(crc >> 16) & 0xff]
^ table[8][crc >> 24]
^ table[7][high & 0xff]
^ table[6][(high >> 8) & 0xff]
^ table[5][(high >> 16) & 0xff]
^ table[4][high >> 24]
^ table[3][high2 & 0xff]
^ table[2][(high2 >> 8) & 0xff]
^ table[1][(high2 >> 16) & 0xff]
^ table[0][high2 >> 24];
next += 12;
length -= 12;
}
#endif
while (length)
{
crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
--length;
}
return (uint32_t)crc ^ 0xffffffff;
}
As you can see, it merely crunches a larger block at a time. It needs a larger lookup table, but it's still cache-friendly. The table is generated the same way, only with more rows.
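The extra rows follow the same recurrence as Mark Adler's word-wise table: row k holds the CRC effect of a byte followed by k zero bytes. A sketch of how such a 16-row table could be built (my reconstruction, without inversion folded into the table, matching the explicit ^ 0xffffffff pre/post in append_table above; the actual library may differ):
#include <stdint.h>

#define POLY 0x82f63b78 /* CRC-32C polynomial, reversed */

static uint32_t table[16][256];

static void build_table(void)
{
    /* row 0: plain byte-wise CRC table */
    for (unsigned n = 0; n < 256; n++) {
        uint32_t crc = n;
        for (unsigned k = 0; k < 8; k++)
            crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        table[0][n] = crc;
    }
    /* row k: the same byte followed by k zero bytes */
    for (unsigned n = 0; n < 256; n++)
        for (unsigned k = 1; k < 16; k++)
            table[k][n] = table[0][table[k - 1][n] & 0xff] ^ (table[k - 1][n] >> 8);
}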
One extra thing I explored is the use of the PCLMULQDQ instruction to get hardware acceleration on AMD processors. I've managed to port Intel's CRC patch for zlib (also available on GitHub) to the CRC-32C polynomial, except for the magic constant 0x9db42487. If anyone is able to decipher that one, please let me know. Update: after supersaw7's excellent explanation on reddit, I have also ported the elusive 0x9db42487 constant and I just need to find some time to polish and test it.
First of all, Intel's CRC32 instruction serves to calculate CRC-32C (that is, it uses a different polynomial than regular CRC32; look at the Wikipedia CRC32 entry).
To use Intel's hardware acceleration for CRC32C using gcc you can:
Inline assembly language in C code via the asm statement
Use the intrinsics _mm_crc32_u8, _mm_crc32_u16, _mm_crc32_u32 or _mm_crc32_u64. See the Intel Intrinsics Guide for a description of those; they are documented for Intel's compiler icc, but gcc implements them as well.
This is how you would do it with _mm_crc32_u8, which takes one byte at a time; using _mm_crc32_u64 would give a further performance improvement since it takes 8 bytes at a time.
#include <nmmintrin.h> /* SSE 4.2 intrinsics */
#include <stdint.h>
#include <stddef.h>

uint32_t sse42_crc32(const uint8_t *bytes, size_t len)
{
    uint32_t hash = 0;
    size_t i = 0;
    for (i = 0; i < len; i++) {
        hash = _mm_crc32_u8(hash, bytes[i]);
    }
    return hash;
}
To compile this you need to pass -msse4.2 in CFLAGS, like gcc -g -msse4.2 test.c, otherwise it will complain about an undefined reference to _mm_crc32_u8.
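For completeness, here is a sketch of the 8-bytes-at-a-time variant mentioned above (my own code, not from the original answer; the memcpy avoids unaligned dereferences and compiles to a plain load):
#include <nmmintrin.h> /* SSE 4.2 intrinsics */
#include <stdint.h>
#include <string.h>

uint32_t sse42_crc32_64(const uint8_t *bytes, size_t len)
{
    uint64_t hash = 0;
    size_t i = 0;
    for (; i + 8 <= len; i += 8) { /* 8 bytes per crc32q */
        uint64_t chunk;
        memcpy(&chunk, bytes + i, sizeof chunk);
        hash = _mm_crc32_u64(hash, chunk);
    }
    for (; i < len; i++) /* up to 7 tail bytes */
        hash = _mm_crc32_u8((uint32_t)hash, bytes[i]);
    return (uint32_t)hash;
}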
If you want to revert to a plain C implementation if the instruction is not available in the platform where the executable is running you can use GCC's ifunc attribute. Like
uint32_t sse42_crc32(const uint8_t *bytes, size_t len)
{
    /* use _mm_crc32_u* here */
}

uint32_t default_crc32(const uint8_t *bytes, size_t len)
{
    /* pure C implementation */
}

/* this will be called at load time to decide which function really to use */
/* sse42_crc32 if SSE 4.2 is supported */
/* default_crc32 if not */
static void *resolve_crc32(void) {
    __builtin_cpu_init();
    if (__builtin_cpu_supports("sse4.2")) return sse42_crc32;
    return default_crc32;
}

/* crc32() implementation will be resolved at load time to either */
/* sse42_crc32() or default_crc32() */
uint32_t crc32(const uint8_t *bytes, size_t len) __attribute__ ((ifunc ("resolve_crc32")));
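Note that the file containing sse42_crc32() still has to be compiled with -msse4.2. If you would rather keep the rest of the translation unit generic, recent GCC versions also let you enable the ISA for just that one function via the target attribute; a sketch (verify against your GCC version):
#include <nmmintrin.h>
#include <stdint.h>
#include <stddef.h>

__attribute__((target("sse4.2")))
uint32_t sse42_crc32(const uint8_t *bytes, size_t len)
{
    uint32_t hash = 0;
    for (size_t i = 0; i < len; i++)
        hash = _mm_crc32_u8(hash, bytes[i]);
    return hash;
}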
I compare various algorithms here: https://github.com/htot/crc32c
The fastest algorithm has been taken from Intel's crc_iscsi_v_pcl.asm assembly code (which is available in a modified form in the Linux kernel) and, using a C wrapper (crcintelasm.cc), included into this project.
To be able to run this code on 32-bit platforms it has first been ported to C (crc32intelc) where possible; a small amount of inline assembly is required. Certain parts of the code depend on the bitness: crc32q is not available on 32 bits and neither is movq, so these are put in macros (crc32intel.h) with alternative code for 32-bit platforms.

How can I create a 48-bit uint for bit mask

I am trying to create a 48-bit integer value. I understand it may be possible to use a char array or struct, but I want to be able to do bit masking/manipulation and I'm not sure how that can be done.
Currently the program uses a 16-bit uint and I need to change it to 48. It is a bytecode interpreter and I want to expand the memory addressing to 4GB. I could just use 64-bit, but that would waste a lot of space.
Here is a sample of the code:
unsigned int program[] = { 0x1064, 0x11C8, 0x2201, 0x0000 };
void decode( )
{
instrNum = (program[i] & 0xF000) >> 12; //the instruction
reg1 = (program[i] & 0xF00 ) >> 8; //registers
reg2 = (program[i] & 0xF0 ) >> 4;
reg3 = (program[i] & 0xF );
imm = (program[i] & 0xFF ); //pointer to data
}
full program: http://en.wikibooks.org/wiki/Creating_a_Virtual_Machine/Register_VM_in_C
You can use bit-fields, which are often used to represent integral types of known, fixed bit-width. A well-known usage of bit-fields is to represent a set of bits, and/or a series of bits, known as flags. You can apply bit operations on them.
#include <stdio.h>
#include <stdint.h>
struct uint48 {
    uint64_t x : 48;
} __attribute__((packed));
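Continuing from the includes and definition above, a quick sketch of how the bit-field behaves (GCC-specific because of __attribute__((packed)); results are truncated to 48 bits on every assignment):
int main(void)
{
    struct uint48 addr = { .x = 0xFFFFFFFFFFFF }; /* 2^48 - 1 */
    addr.x += 1;                   /* wraps to 0 at 48 bits */
    addr.x = (addr.x | 0xA5) << 8; /* bit masking and shifting work as usual */
    printf("size = %zu, value = 0x%llx\n", sizeof addr, (unsigned long long)addr.x);
    return 0; /* prints: size = 6, value = 0xa500 */
}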
Use a structure or a uint16_t array, with special access functions, for an array of uint48.
For individual instances, use uint64_t or unsigned long long. uint64_t will work fine for an individual uint48, but you may want to mask off the results of operations like * or << to keep the upper bits cleared. Space-saving routines are only needed for arrays.
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t uint48;
const uint48 uint48mask = 0xFFFFFFFFFFFFull; // low 48 bits set

uint48 uint48_get(const uint48 *a48, size_t index) {
    const uint16_t *a16 = (const uint16_t *) a48;
    index *= 3;
    return a16[index] | (uint32_t) a16[index + 1] << 16
        | (uint64_t) a16[index + 2] << 32;
}

void uint48_set(uint48 *a48, size_t index, uint48 value) {
    uint16_t *a16 = (uint16_t *) a48;
    index *= 3;
    a16[index] = (uint16_t) value;
    a16[++index] = (uint16_t) (value >> 16);
    a16[++index] = (uint16_t) (value >> 32);
}

uint48 *uint48_new(size_t n) {
    size_t size = n * 3 * sizeof(uint16_t);
    // Ensure size allocated is a multiple of `sizeof(uint64_t)`
    // Not fully certain this is needed - but doesn't hurt.
    if (size % sizeof(uint64_t)) {
        size += sizeof(uint64_t) - size % sizeof(uint64_t);
    }
    return malloc(size);
}
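Example usage of those helpers (a hypothetical driver; uint48_new() returns memory from malloc, so the caller frees it):
#include <stdio.h>

int main(void)
{
    uint48 *mem = uint48_new(1000); /* about 6000 bytes instead of 8000 */
    if (mem == NULL) return 1;
    uint48_set(mem, 42, 0x1B26B354A1CF & uint48mask);
    printf("0x%llx\n", (unsigned long long)uint48_get(mem, 42)); /* 0x1b26b354a1cf */
    free(mem);
    return 0;
}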

How to code a modular operation in C that takes a 128-bit binary value mod a 4-bit polynomial?

Let's say I have: (data) mod (polynomial)
1110 0101 mod 1001
I understand that I will need to shift the polynomial to the leftmost set bit of the data and execute an XOR operation.
1110 0101
1001
and I will get a result of
0111 0101
Then I will need to move the polynomial along to the next '1' in the result, perform the next XOR operation, and repeat these steps until I get the remainder.
So, I understand that I will need to copy my data into an array, and using the array I can do shifting and use an AND operator to compare a bit of the data with the leading bit of the polynomial; if I get a result of '1', I will then know that I can shift the polynomial to that position.
Here's a snippet of my code:
uint8_t polyarray[4];
uint32_t dataarray[32];
uint64_t mod(int data, int poly, int i) {
memcpy(polyarray, (int[]) {1}, sizeof polyarray);
memcpy(dataarray, (int[]) {1,2,3,4}, sizeof dataarray);
for (i=127; i>=0; i--){
poly << i;
dataarray[4]>>31;
polyarray[1]>>3;
if(dataarray[4] & polyarray[1]=1){
data = data ^ poly;
}
}
I am quite certain that my code is incomplete, but I am not sure where; can anyone help me?
I redid my code; will this be better?
void mod(uint8_t i, uint64_t *pPoly, uint64_t *pData)
{
uint64_t Data[128];
uint64_t Poly[4];
for(i=127; i>=0; i--)
{
Poly << i;
pData = &Data[i];
pPoly = &Poly[3];
if (pData = 1)
{
Data = Data^Poly;
}
else
{
Poly>>1;
i--;
}
}
}
You said 128 bits, but you have 128*64 bits.
Poly << i; shifts poly without effect; you have to store the result.
Data = Data^Poly; XORs two array pointers instead of bits in the bit array.
pData = &Data[i]; and
pPoly = &Poly[3]; are unused.
if (pData = 1) is missing an '='.
Here is my code
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main()
{
    uint64_t hi = 0;
    uint64_t lo = 0b11100101;
    uint64_t div = 0b1001;
    uint64_t mask = 1ULL << 63;
    uint8_t offset = 0;

    /* search the MSB */
    for (int i = 0; i < 64; i++)
    {
        if ( (mask >> i) & div )
        {
            offset = i;
            break;
        }
        /*
        before:
        div = 00001001
        mask= 10000000
        after:
              |---|
        mask= 00001000
        off = 4
        */
    }

    /* 00001001 */
    div <<= offset;
    /* 10010000 */

    for (int i = 0; i < 64; i++)
    {
        /* XOR if MSB of HI is 1 */
        if ( hi & (mask >> i) )
        {
            hi ^= (div >> i);
            /* if we have to xor in HI and LO together, caused by overlap */
            /* HI           LO           */
            /* 000000000000 000000001001 */
            /* DIV     000100 00000      */
            if ( i > 0 )
            {
                lo ^= (div << (64 - i));
            }
        }
    }

    for (int i = 0; i <= offset; i++)
    {
        if ( lo & (mask >> i) )
        {
            lo ^= (div >> i);
        }
    }

    printf("mod = %" PRIX64 "%" PRIX64 "\n", hi, lo);
}
The point is that you have to properly move the mask and divisor and XOR them with the dividend. The remainder ends up in the low bits of lo.
There probably exists some 128-bit instruction which can do all the work for you in a single step.
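As a side note, on GCC or Clang targeting 64-bit (an assumption on my part) unsigned __int128 can do the bookkeeping for you, which shrinks the same shift-and-XOR reduction considerably:
#include <stdio.h>
#include <stdint.h>

/* reduce a 128-bit value modulo a polynomial of up to 64 bits */
static uint64_t polymod128(unsigned __int128 data, uint64_t poly)
{
    int plen = 63 - __builtin_clzll(poly); /* bit index of poly's MSB */
    for (int i = 127; i >= plen; i--)
        if ((data >> i) & 1) /* align the divisor under each set bit */
            data ^= (unsigned __int128)poly << (i - plen);
    return (uint64_t)data; /* what is left is the remainder */
}

int main(void)
{
    /* the worked example from the question: 1110 0101 mod 1001 = 10 */
    printf("mod = %llx\n", (unsigned long long)polymod128(0b11100101, 0b1001));
    return 0;
}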
