Is there a simple CRC algorithm based on a lookup table, but with words entering the algorithm instead of bytes.
For example, this algorithm works with bytes:
#include <stdint.h>
const uint16_t wTable_CRC16_Modbus[256] = {
0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
};
uint16_t Modbus_Calculate_CRC16(uint8_t *frame, uint8_t size) {
// Initialize CRC16 word
uint16_t crc16 = 0xFFFF;
// Calculate CRC16 word
uint16_t ind;
while(size--) {
ind = ( crc16 ^ *frame++ ) & 0x00FF;
crc16 >>= 8;
crc16 ^= wTable_CRC16_Modbus[ind];
}
// Swap low and high bytes
crc16 = (crc16<<8) | (crc16>>8);
// Return the CRC16 word with
// swapped low and high bytes
return crc16;
}
Since I'm always going to send words, using the above algorithm I would need to break each WORD in LSB and MSB and repeat the code in the loop body twice, first for LSB and then for MSB. Instead of doing that, I would like to update CRC in one step, with WORD as the algorithm (loop body) input.
If and only if you read words from memory in little-endian order (least significant byte first), then you can read a 16-bit word at a time. You would still need to use the table twice to process the word. The loop would be:
size >>= 1; // better have been even!
uint16_t const *words = frame; // better be little-endian!
while (size--) {
crc16 ^= *words++;
crc16 = (crc16 >> 8) ^ wTable_CRC16_Modbus[crc16 & 0xff];
crc16 = (crc16 >> 8) ^ wTable_CRC16_Modbus[crc16 & 0xff];
}
Depending on your processor, you may also need to make sure that frame points to an even address. Note that if size is odd, this won't process the last byte. If size can be and is odd, you can just add the processing of the last byte after the loop.
If you would like a single table lookup per word, then you'll need a much bigger table. The existing table is simply the CRC of the single-byte values 0..255 (where the initial CRC is zero, not 0xffff). The bigger table would be the same thing, but for the two-byte values 0..65536, with the least significant byte processed first. You can make that table using the existing table:
void modbus_bigtable(uint16_t *table) {
for (uint16_t lo = 0; lo < 256; lo++) {
uint16_t crclo = wTable_CRC16_Modbus[lo];
uint16_t crchi = crclo >> 8;
crclo &= 0xff;
uint16_t *nxt = table;
for (uint16_t hi = 0; hi < 256; hi++) {
*nxt = crchi ^ wTable_CRC16_Modbus[hi ^ crclo];
nxt += 256;
}
table++;
}
}
Then the loop becomes:
size >>= 1; // better have been even!
uint16_t const *words = frame; // better be little-endian!
while (size--)
crc16 = table[crc16 ^ *words++];
I am trying to select bits [0:2] and bits [6:8] of the bit-string 1010000000001. Bits [0:2] are 001 and bits [6:8] are 000. I tried to select these bits with:
int instr = 0x1401;
int src2 = (instr & 0x0006); //get bits [2:0]
int src1 = (instr & 0x01C0) >> 6; //get bits [6:8]
printf("%04x, %04x",src2, src1);
However I am getting that src1 and src2 are both 0000. Can someone please help me understand what I am doing incorrectly so I can select bits [0:2] and [6:8]?
Look at this code:
#include <stdio.h>
int main (void) {
unsigned instr = 0x1401;
unsigned src2 = instr & 0x0007; // 7 in hex == 0000 0000 0111 in binary
unsigned src1 = (instr & 0x01C) >> 6; // 1C in hex == 0001 1100 0000 in binary
printf("%04x, %04x", src2, src1);
}
It masks out the desired bits in instr and shifts them by the correct offset. Also, when doing bit manipulation, unsigned types are preferred.
It's easier to just write a function to calculate any arbitrary bit slice (here using 1 rather than 0 as the least significant bit):
#include <stdio.h>
#include <assert.h>
int bit_select(int num, size_t start, size_t end)
{
assert(end >= start);
const int mask = (1 << (end-start+1)) - 1;
const int shift = start - 1;
return (num & (mask << shift)) >> shift;
}
int main(void)
{
printf("Bits 1...3 of 01100101: %d\n", bit_select(0x65, 1, 3));
printf("Bits 3...3 of 01100101: %d\n", bit_select(0x65, 3, 3));
printf("Bits 4...4 of 01100101: %d\n", bit_select(0x65, 4, 4));
printf("Bits 3...7 of 01100101: %d\n", bit_select(0x65, 3, 7));
return 0;
}
with output:
paul#horus:~/src/sandbox$ ./bitselect
Bits 1...3 of 01100101: 5
Bits 3...3 of 01100101: 1
Bits 4...4 of 01100101: 0
Bits 3...7 of 01100101: 25
paul#horus:~/src/sandbox$
From what I can see if you get the result from 0x1401 & 0x0006 you get 0 and you get the same from 0x1401 & 0x01c0. The bit shift you do on src1 is just 0 shift right 6 bits which is still 0.
Because you provided a wrong mask.
To make life easier if you are using gcc, just provide binary literal rather than hex version so that you can see what you are masking off without pain:
unsigned src2 = instr & 0b111;
int instr = 0x1401;
//results in instr containing (ignoring endian)
//0x00001401
//or in binary
//0b0000 0000 0000 0000 0001 0100 0000 0001
//extracting bits 2:0 is normally done by:
int src2 = instr & 0x00000007;
//extracting bits 8:6 is normally done by:
int src1 = (instr & 0x000001C0) >> 6;
//note that if bit 31 is to be extracted,
//the bit shifting will not work
//due to sign propagation of a negative number
I've been struggling with the intrinsics. In particular I don't get the same results using the standard CRC calculation and the supposedly equivalent intel intrinsics. I'd like to move to using _mm_crc32_u16, and _mm_crc32_u32 but if I can't get the 8 bit operation to work there's no point.
static UINT32 g_ui32CRC32Table[256] =
{
0x00000000L, 0x77073096L, 0xEE0E612CL, 0x990951BAL,
0x076DC419L, 0x706AF48FL, 0xE963A535L, 0x9E6495A3L,
0x0EDB8832L, 0x79DCB8A4L, 0xE0D5E91EL, 0x97D2D988L,
....
// Your basic 32-bit CRC calculator
// NOTE: this code cannot be changed
UINT32 CalcCRC32(unsigned char *pucBuff, int iLen)
{
UINT32 crc = 0xFFFFFFFF;
for (int x = 0; x < iLen; x++)
{
crc = g_ui32CRC32Table[(crc ^ *pucBuff++) & 0xFFL] ^ (crc >> 8);
}
return crc ^ 0xFFFFFFFF;
}
UINT32 CalcCRC32_Intrinsic(unsigned char *pucBuff, int iLen)
{
UINT32 crc = 0xFFFFFFFF;
for (int x = 0; x < iLen; x++)
{
crc = _mm_crc32_u8(crc, *pucBuff++);
}
return crc ^ 0xFFFFFFFF;
}
That table is for a different CRC polynomial than the one used by the Intel instruction. The table is for the Ethernet/ZIP/etc. CRC, often referred to as CRC-32. The Intel instruction uses the iSCSI (Castagnoli) polynomial, for the CRC often referred to as CRC-32C.
This short example code can calculate either, by uncommenting the desired polynomial:
#include <stddef.h>
#include <stdint.h>
/* CRC-32 (Ethernet, ZIP, etc.) polynomial in reversed bit order. */
#define POLY 0xedb88320
/* CRC-32C (iSCSI) polynomial in reversed bit order. */
/* #define POLY 0x82f63b78 */
/* Compute CRC of buf[0..len-1] with initial CRC crc. This permits the
computation of a CRC by feeding this routine a chunk of the input data at a
time. The value of crc for the first chunk should be zero. */
uint32_t crc32c(uint32_t crc, const unsigned char *buf, size_t len)
{
int k;
crc = ~crc;
while (len--) {
crc ^= *buf++;
for (k = 0; k < 8; k++)
crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
}
return ~crc;
}
You can use this code to generate a replacement table for your code by simply computing the CRC-32C of each of the one-byte messages 0, 1, 2, ..., 255.
FWIW, I've obtained SW code that demonstrably matches the Intel crc32c instruction, but it uses a different polynomial: 0x82f63b78 The function definitely doesn't match any of the iSCSI test examples here: https://www.rfc-editor.org/rfc/rfc3720#appendix-B.4
What's frustrating in all this is every implementation I've tried for CRC-32C comes out with different hashes from all the others. Is there a true piece of reference code out there?
I made a function to set or clear a specific number of bits in a DWORD. My function works. I don't need help making it work. However, I am wondering if the method I've chosen to do it is the fastest possible way.
It's rather hard for me to explain how this works. There are two arrays containing DWORDs that are filled with bits on the left and right side of the DWORD (with all binary 1's). It makes a mask with all the bits filled except for the ones I want to set or clear, and then sets them with bitwise operators based on that mask. It seems rather complicated for such a simple task, but it seems like the fastest way I could come up with. It's much faster than setting them bit by bit.
static DWORD __dwFilledBitsRight[] = {
0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, 0x3FFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF
};
static DWORD __dwFilledBitsLeft[] = {
0x0, 0x80000000, 0xC0000000, 0xE0000000, 0xF0000000, 0xF8000000, 0xFC000000, 0xFE000000, 0xFF000000, 0xFF800000, 0xFFC00000, 0xFFE00000, 0xFFF00000, 0xFFF80000, 0xFFFC0000, 0xFFFE0000, 0xFFFF0000, 0xFFFF8000, 0xFFFFC000, 0xFFFFE000, 0xFFFFF000, 0xFFFFF800, 0xFFFFFC00, 0xFFFFFE00, 0xFFFFFF00, 0xFFFFFF80, 0xFFFFFFC0, 0xFFFFFFE0,
0xFFFFFFF0, 0xFFFFFFF8, 0xFFFFFFFC, 0xFFFFFFFE, 0xFFFFFFFF
};
// nStartBitFromLeft must be between 1 and 32...
// 1 is the bit farthest to the left (actual bit 31)
// 32 is the bit farthest to the right (actual bit 0)
inline void __FillDWORDBits(DWORD *p, int nStartBitFromLeft, int nBits, BOOL bSet)
{
DWORD dwLeftMask = __dwFilledBitsLeft[nStartBitFromLeft - 1]; // Mask for data on the left of the bits we want
DWORD dwRightMask = __dwFilledBitsRight[33 - (nStartBitFromLeft + nBits)]; // Mask for data on the right of the bits we want
DWORD dwBitMask = ~(dwLeftMask | dwRightMask); // Mask for the bits we want
DWORD dwOriginal = *p;
if(bSet) *p = (dwOriginal & dwLeftMask) | (dwOriginal & dwRightMask) | (0xFFFFFFFF & dwBitMask);
else *p = (dwOriginal & dwLeftMask) | (dwOriginal & dwRightMask) | 0;
}
How about:
// Create mask of correct length, and shift to the correct position
DWORD mask = ((1ULL << nBits) - 1) << pos;
// Apply mask (or its inverse)
if (bSet)
{
*p |= mask;
}
else
{
*p &= ~mask;
}
It's pretty likely that simple bitwise operations will be faster than table lookup on any modern processor.
Note: Depending on the relationship between DWORD and long long on this platform, you may need special handling for the case where nBits == sizeof(DWORD)*8. Or if nBits==0 is not a possibility, you could just do DWORD mask = ((2ULL << (nBits - 1)) - 1) << pos;.
Update: It's been mentioned that the if could potentially be slow, which is true. Here's a replacement for it, but you'd need to measure to see if it's actually any faster in practice.
// A bit hacky, but the aim is to get 0x00000000 or 0xFFFFFFFF
// (relies on two's-complement representation)
DWORD blanket = bSet - 1;
// Use the blanket to override one or other masking operation
*p |= (blanket | mask);
*p &= ~(blanket & mask);
This is the way I'd do it. I'd break it into two functions, setbits() and clearbits(). Steps broken out for clarity, and I'm sure it can be far more optimized.
This version is dependent on 32-bit code as it stands. Also, in my world, bit 0 is the rightmost bit. Your mileage may vary.
setbits( DWORD *p , int offset , int len )
{
// offset must be 0-31, len must be 0-31, len+offset must be 0-32
int right_shift = ( !len ? 0 : 32 - (len+offset) ) ;
int left_shift = offset ;
DWORD right_mask = 0xFFFFFFFF >> right_shift ;
DWORD left_mask = 0xFFFFFFFF << left_shift ;
DWORD mask = left_mask & right_mask ;
*p |= mask ;
return ;
}
clearbits( DWORD *p , int offset , int len )
{
// offset must be 0-31, len must be 0-31, len+offset must be 0-32
int right_shift = ( !len ? 0 : 32 - (len+offset) ) ;
int left_shift = offset ;
DWORD right_mask = 0xFFFFFFFF >> right_shift ;
DWORD left_mask = 0xFFFFFFFF << left_shift ;
DWORD mask = ~( left_mask & right_mask ) ;
*p &= mask ;
return ;
}
I stumbled across this improved version whilst looking for something else today. Courtesy of Sean Anderson's Bit Twiddling Hacks at Stanford University:
// uncomment #define to get the super scalar CPU version.
// #define SUPER_SCALAR_CPU
void setbits( unsigned int *p , int offset , int len , int flag )
{
unsigned int mask = ( ( 1 << len ) - 1 ) << offset ;
#if !defined( SUPER_SCALAR_CPU )
*p ^= ( - flag ^ *p ) & mask ;
#else
// supposed to be some 16% faster on a Intel Core 2 Duo than the non-super-scalar version above
*p = (*p & ~ mask ) | ( - flag & mask ) ;
#endif
return ;
}
Much depends on your compiler, though.