C Tokenizer - How does it work? - c

How does this work?
I know to use it you pass in:
start: string (e.g. "Item 1, Item 2, Item 3")
delim: delimiter string (e.g. ",")
tok: reference to a string which will hold the token
nextpos (optional): reference to a the position in the original string where the next token starts
sdelim (optional): pointer to a character which will hold the starting delimeter of the token
edelim (optional): pointer to a character which will hold the ending delimeter of the token
Code:
#include <stdlib.h>
#include <string.h>
int token(char* start, char* delim, char** tok, char** nextpos, char* sdelim, char* edelim) {
// Find beginning:
int len = 0;
char *scanner;
int dictionary[8];
int ptr;
for(ptr = 0; ptr < 8; ptr++) {
dictionary[ptr] = 0;
}
for(; *delim; delim++) {
dictionary[*delim / 32] |= 1 << *delim % 32;
}
if(sdelim) {
*sdelim = 0;
}
for(; *start; start++) {
if(!(dictionary[*start / 32] & 1 << *start % 32)) {
break;
}
if(sdelim) {
*sdelim = *start;
}
}
if(*start == 0) {
if(nextpos != NULL) {
*nextpos = start;
}
*tok = NULL;
return 0;
}
for(scanner = start; *scanner; scanner++) {
if(dictionary[*scanner / 32] & 1 << *scanner % 32) {
break;
}
len++;
}
if(edelim) {
*edelim = *scanner;
}
if(nextpos != NULL) {
*nextpos = scanner;
}
*tok = (char*)malloc(sizeof(char) * (len + 1));
if(*tok == NULL) {
return 0;
}
memcpy(*tok, start, len);
*(*tok + len) = 0;
return len + 1;
}
I get most of it except for:
dictionary[*delim / 32] |= 1 << *delim % 32;
and
dictionary[*start / 32] & 1 << *start % 32
Is it magic?

Since each character of the delimiter is 8 bits (sizeof(char) == 1 byte), it is limited to 256 possible values.
The dictionary is broken into 8 pieces (int dictionary[8]), 32 possibilities per piece (sizeof(int) is >= 4 bytes) and 32 * 8 = 256.
This forms a 256 bit matrix of values. It then turns on the flag for each character in the delimiter (dictionary[*delim / 32] |= 1 << *delim % 32;). The index of the array is *delim / 32, or the ASCII value of the character divided by 32. Since the ASCII value ranges from 0 to 255, this divide yields a value of 0 to 7 with a remainder. The remainder is which bit to turn on, decided by the modulus operation.
All this does is flag certain bits of the 256 bit matrix as true, if the corresponding ASCII character exists in the delimiter.
Then determining if a character is in the delimiter is simply a lookup in the 256 bit matrix (dictionary[*start / 32] & 1 << *start % 32)

They store which characters have occurred by making an 8 x 32 = 256 table of bits stored in dictionary.
dictionary[*delim / 32] |= 1 << *delim % 32;
sets the bit corresponding to *delim
dictionary[*start / 32] & 1 << *start % 32
checks the bit

OK, so if we send in the string "," for the delimiter then dictionary[*delim / 32] |= 1 << *delim % 32 will be dictionary[1] = 4096. The expression dictionary[*start / 32] & 1 << *start % 32 simply checks for a matching character.
What puzzles me is why they are not using direct char comparison.

Related

Word every 2 bits to symbol

I have a function that read a word, bit by bit and change to symbol:
I need help to change it to read every 2 bits and change to symbol.
I don't have an idea for it and I need your help guys
void PrintWeirdBits(word w , char* buf){
word mask = 1<<(BITS_IN_WORD-1);
int i;
for(i=0;i<BITS_IN_WORD;i++){
if(mask & w)
buf[i]='/';
else
buf[i]='.';
mask>>=1;
}
buf[i] = '\0';
}
Needed symbols:
00 - *
01 - #
10 - %
11 - !
Here is my proposal for your issue.
Using a lookup table for the symbol decoding will eliminate the need in if statements.
(I assumed word is an unsigned 16 bits data type)
#define BITS_PER_SIGN 2
#define BITS_PER_SIGN_MSK 3 // decimal 3 is 0b11 in binary --> two bits set
// General define could be:
// ((1u << BITS_PER_SIGN) - 1)
#define INIT_MASK (BITS_PER_SIGN_MSK << (BITS_IN_WORD - BITS_PER_SIGN))
void PrintWeirdBits(word w , char* buf)
{
static const char signs[] = {'*', '#', '%', '!'};
unsigned mask = INIT_MASK;
int i;
int sign_idx;
for(i=0; i < BITS_IN_WORD / BITS_PER_SIGN; i++)
{
// the bits of the sign represent the index in the signs array
// just need to align these bits to start from bit 0
sign_idx = (w & mask) >> (BITS_IN_WORD - (i + 1)*BITS_PER_SIGN);
// store the decoded sign in the buffer
buf[i] = signs[sign_idx];
// update the mask for the next symbol
mask >>= BITS_PER_SIGN;
}
buf[i] = '\0';
}
Here it seems to be working.
With small effort it can be updated to a generic code for any bit width of the symbol as long as it is power of two (1, 2, 4, 8) and smaller that BITS_IN_WORD.
Assuming word is unsigned int or an unsigned integer type.
void PrintWeirdBits(word w , char* buf){
word mask = 3 << (BITS_IN_WORD -2);
int i;
word cmp;
for(i=0;i<BITS_IN_WORD/2;i++){
cmp = (mask & w) >> (BITS_IN_WORD -2 -2i);
if(cmp == 0x00)
{
buf[i]='*';
}
else if (cmp == 0x01)
{
buf[i]='#';
}
else if (cmp == 0x02)
{
buf[i]='%';
}
else
{
buf[i]='!';
}
mask>>=2;
}
buf[i] = '\0';
}
The important part is
cmp = (mask & w) >> (BITS_IN_WORD -2 -2i);
Here mask and the input w is bitwise ANDed and the result is right shifted to get the value in the first two bits. These bits are compared to get the result.

How to write only 12 bits to a char array in C?

I'm trying to implement a FAT12 file system in which there's a FAT table data structure which is an unsigned char array. I need to write a function which given an array index would write a value to the next 12 bits (because it's FAT12) which is quite tricky because part of the value needs to go to one byte and the other part needs to go the half of the second byte.
This is the get value function I came up with:
//FAT is the unsigned char array
int GetFatEntry(int FATindex, unsigned char * FAT) {
unsigned int FATEntryCode; // The return value
// Calculate the offset of the WORD to get
int FatOffset = ((FATindex * 3) / 2);
if (FATindex % 2 == 1){ // If the index is odd
FATEntryCode = ((unsigned char)(&FAT[FatOffset])[0] + (((unsigned char)(&FAT[FatOffset])[1]) << 8));
FATEntryCode >>= 4; // Extract the high-order 12 bits
}
else{ // If the index is even
FATEntryCode = ((unsigned char)(&FAT[FatOffset])[0] + (((unsigned char)(&FAT[FatOffset])[1]) << 8));
FATEntryCode &= 0x0fff; // Extract the low-order 12 bits
}
return FATEntryCode;
}
I'm struggling to come up with the function which would set a value given a FATindex. I would appreciate any suggestions.
This seems to work. The data that should be written should be in the first 12 bits of data
void WriteFatEntry(int FATindex, unsigned char * FAT, unsigned char data[2]) {
// Calculate the offset of the WORD to get
int FatOffset = ((FATindex * 3) / 2);
unsigned char d;
if (FATindex % 2 != 0){ // If the index is odd
// Copy from data to d and e, and shift everything so that second half of
// e contains first half of data[1], and first half of e contains second
// half of data[0], while second half of d contains first half of data[0].
// First half of d contains a copy of first four bits in FAT[FatOffset]
// so that nothing changes when it gets written
unsigned char e=data[1];
e>>=4;
d=data[0];
e|=(d<<4) & 0b11110000;
d>>=4;
d |= FAT[FatOffset] & 0b11110000;
FAT[FatOffset]=d;
FAT[FatOffset+1] = e;
}
else{ // If the index is even
d = data[1] & 0b11110000;
d |= FAT[FatOffset+1] & 0b00001111;
FAT[FatOffset] = data[0];
FAT[FatOffset+1] = d;
}
}
#include <stdio.h>
#if 1 /* assuming MSB first */
#define MSB (idx)
#define LSB (idx+1)
#else /* assuming LSB first */
#define MSB (idx+1)
#define LSB (idx)
#endif
unsigned fat_getval(unsigned char * tab, unsigned num)
{
unsigned idx;
unsigned val;
idx = num + num/2;
val = (tab[MSB] <<8 ) + (tab[idx+1] ) ;
if (num %2 ==0) val >>= 4;
return val & 0xfff;
}
void fat_putval(unsigned char * tab, unsigned slot, unsigned val)
{
unsigned idx;
idx = slot + slot/2;
if (slot %2 ==0) { /* xyz_ */
val <<= 4;
val |= tab[LSB] & 0xf;
}
else { /* _xyz */
val |= (tab[MSB] & 0xf0) << 8;
}
tab[MSB] = val >>8;
tab[LSB] = val &0xff;
}
#undef MSB
#undef LSB
unsigned char fattable[] = "\x01\x23\x45\x67\x89\xab"; // 12 nibbles
int main(void)
{
unsigned idx, ret;
for (idx = 0; idx < 6; idx++) { // 6 bytes -> 12 nibbles */
printf(" %02x", fattable[idx] );
}
printf("\n");
printf("Put(0,0xabc):\n");
fat_putval(fattable, 0, 0xabc);
for (idx = 0; idx < 6; idx++) {
printf(" %02x", fattable[idx] );
}
printf("\n");
printf("Put(3,0xdef):\n");
fat_putval(fattable, 3, 0xdef);
for (idx = 0; idx < 6; idx++) {
printf(" %02x", fattable[idx] );
}
printf("\n");
printf("Get(0 to 4):\n");
for (idx = 0; idx < 4; idx++) { // 12 / 3 ~> 4 * 12bit entries
ret = fat_getval( fattable, idx);
printf("%u := %x\n", idx, ret );
}
printf("\n");
return 0;
}

Confused on example code from my c programming book

im confused about the purpose of adding 48 to the output of ((num >> 15 - i) & 0x0001)
/* Convert fixed 16-bit integer to binary digit string.
Pre num contains integral value to be converted
bitStr is a pointer to variable for bit string
Post bit string stored in str
*/
void bin16 (uint16_t num, char* bitStr) {
for (int i = 0; i < 16; i++)
bitStr[i] = (char) ((num >> 15 - i) & 0x0001) + 48;
return;
}// end of bin16
The 48 is the ASCII value for the character '0'. In this situation it is being used to convert between the numbers 0 & 1 and the characters '0' & '1'
bin16() could have also been written:
void bin16 (uint16_t num, char *bitStr) {
for (uint16_t i = 0; i < 16; i++)
bitStr[i] = (char) ((num >> 15 - i) & 0x0001) + '0';
}
bitStr is a string. You need to convert integer numbers to char to put correct value in the string. That's why 48 is added. It's equivalent to adding '0'. 48 is the ASCII value of '0'.
It's similar as writing this -
void bin16 (uint16_t num, char* bitStr)
{
for (int i = 0; i < 16; i++)
bitStr[i] = (char) ((num >> 15 - i) & 0x0001) + '0';
return;
}

Hamming code check parity

I am not sure if I am calculating the parity bit correctly for the the check Parity bit function I wrote. The codeWord is 11 chars long with 4 parity bits and 7 data bits. Does the implementation look good?
void parityCheck(char* codeWord) {
int parity[4] = {0}, i = 0, diffParity[4] = {0}, twoPower = 0, bitSum = 0;
// Stores # of 1's for each parity bit in array.
parity[0] = (codeWord[2] - 48) + (codeWord[4] - 48) + (codeWord[6] - 48) + (codeWord[8] - 48) + (codeWord[10] - 48);
parity[1] = (codeWord[2] - 48) + (codeWord[5] - 48) + (codeWord[6] - 48) + (codeWord[9] - 48) + (codeWord[10] - 48);
parity[2] = (codeWord[4] - 48) + (codeWord[5] - 48) + (codeWord[6] - 48);
parity[3] = (codeWord[8] - 48) + (codeWord[9] - 48) + (codeWord[10] - 48);
// Determines if sum of bits is even or odd, then tests for difference from actual parity bit.
for (i = 0; i < 4; i++) {
twoPower = (int)pow((double)2, i);
if (parity[i] % 2 == 0)
parity[i] = 0;
else
parity[i] = 1;
if ((codeWord[twoPower-1] - 48) != parity[i])
diffParity[i] = 1;
}
// Calculates the location of the error bit.
for (i = 0; i < 4; i++) {
twoPower = (int)pow((double)2, i);
bitSum += diffParity[i]*twoPower;
}
// Inverts bit at location of error.
if (bitSum <= 11 && bitSum > 0) {
if ((codeWord[bitSum-1] - 48))
codeWord[bitSum-1] = '0';
else
codeWord[bitSum-1] = '1';
}
Does the implementation look good?
This very much depends on your measure for “good”. I can confirm that it does get the job done, so at least it is correct. Your code is very verbose, and thus hard to check for correctness. I'd do the following:
int parity_check(int codeWord) {
int parity = 0, codeWordBit, bitPos;
for (bitPos = 1; bitPos <= 11; ++bitPos) {
codeWordBit = ((codeWord >> (bitPos - 1)) & 1);
parity ^= bitPos*codeWordBit;
}
if (parity != 0) {
if (parity > 11)
return -1; // multi-bit error!
codeWord ^= 1 << (parity - 1);
}
return codeWord;
}
Instead of a sequence of digit characters, I treat your whole code word as a single integer, which is a lot more efficient.
Looking at the table at Wikipedia, I see that the columns of that table form binary representations of the sequence 1 … 11. Each code word bit affects exactly those parity bits mentioned in that column, so I take the code word bit (which is zero or one), multiply it by the bit pattern of that column to obtain either that pattern or zero, then XOR this with the current parity bit pattern. The effect of this is that a zero code word bit won't change anything, whereas a non-zero code word bit flips all associated parity bits.
Some care has to be taken because the bit pattern is one-based, whereas the bit position using the right shift trick is zero-based. So I have to subtract one, then shift right by that amount, and then extract the least significant digit in order to obtain the codeWordBit.
Using my implementation for reference, I was able to verify (by complete enumeration) that your code works the same.
Your code works fine AFAIK as it passed test cases I conjured up. Some simplifications were employed, but the OP functionality not changed. Some classic simplifications were made for easier viewing.
void parityCheck(char* cW) {
int parity[4] = { 0 }, i = 0, diffParity[4] = { 0 }, twoPower = 0, bitSum = 0;
// Stores # of 1's for each parity bit in array.
parity[0] = (cW[2] - '0') + (cW[4] - '0') + (cW[6] - '0') + (cW[8] - '0') + (cW[10] - '0');
parity[1] = (cW[2] - '0') + (cW[5] - '0') + (cW[6] - '0') + (cW[9] - '0') + (cW[10] - '0');
parity[2] = (cW[4] - '0') + (cW[5] - '0') + (cW[6] - '0');
parity[3] = (cW[8] - '0') + (cW[9] - '0') + (cW[10] - '0');
// Determines if sum of bits is even or odd, then tests for difference from actual parity bit.
for (i = 0; i < 4; i++) {
//twoPower = (int) pow((double) 2, i);
twoPower = 1 << i;
//if (parity[i] % 2 == 0) parity[i] = 0; else parity[i] = 1;
parity[i] &= 1; // Make 0 even, 1 odd.
if ((cW[twoPower - 1]-'0') != parity[i])
diffParity[i] = 1;
}
// Calculates the location of the error bit.
for (i = 0; i < 4; i++) {
// twoPower = (int) pow((double) 2, i);
twoPower = 1 << i;
bitSum += diffParity[i] * twoPower;
}
// Inverts bit at location of error.
if (bitSum <= 11 && bitSum > 0) {
if ((cW[bitSum - 1]-'0'))
cW[bitSum - 1] = '0';
else
cW[bitSum - 1] = '1';
}
}
void TestP(const char * Test) {
char buf[100];
strcpy(buf, Test);
parityCheck(buf);
printf("'%s' '%s'\n", Test, buf);
}
int main(void) {
TestP("00000000000");
TestP("10011100101");
TestP("10100111001");
}
It would have been useful had the OP posted test patterns.
Here's my implementation. It works. The public is free to use it at no charge.
I used the acronym "secded" as in, "single-error-correcting, double-error-detecting." You can re-wire this as a "triple error detector" if you want that instead. Really, some small part of this is secded and the rest is Hamming 7,4 -- but I named these methods what I did, when I did.
The "strings" here are not NUL-terminated, but counted. This code is excerpted from a Python module written in C. That is the provenance of the string type you see.
A key point here was realizing that there are only 16 Hamming 7,4 codes. I calculated secded_of_nibble() with some Python code, which unfortunately I no longer have.
static const unsigned char secded_of_nibble[] =
{ 0x0, 0xd2, 0x55, 0x87, 0x99, 0x4b, 0xcc, 0x1e, 0xe1, 0x33, 0xb4, 0x66, 0x78, 0
xaa, 0x2d, 0xff };
int fec_secded_encode_cch_bits(const char * strIn, const int cchIn, char * strOu
t, const int cchOut)
{
assert( cchIn * 2 == cchOut);
if( cchIn * 2 != cchOut)
return 0;
if (!strIn || !strOut)
return 0;
int i;
for (i = 0; i < cchIn; i ++)
{
char in_byte = strIn[i];
char hi_byte = secded_of_nibble[(in_byte >> 4) & 0xf];
char lo_byte = secded_of_nibble[in_byte & 0xf];
strOut[i * 2] = hi_byte;
strOut[i * 2 + 1] = lo_byte;
}
return 1;
}
char bv_H[] = {0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x8};
char val_nibble(char ch)
{
return ((ch & 0x20) >> 2) | ((ch & 0xE) >> 1);
}
char correct_nibble(char ch)
{
char nibble = 0;
int i = 0;
for (i = 0; i < 8; i++)
if (ch & (1 << (7-i)))
nibble ^= bv_H[i];
return nibble;
}
void apply_correct(char nib_correct, char * pbyte, int * pcSec, int *pcDed)
{
if (0 == nib_correct)
return;
if (nib_correct & 0x8)
{
(*pcSec) ++;
int bit = (8 - (nib_correct & 0x7)) & 0x7;
/* fprintf(stderr, "bit %d, %02X\n", bit, 1 << bit);*/
(*pbyte) ^= (1 << bit);
}
else
{
(*pcDed) ++;
}
}
int fec_secded_decode_cch_bits
(
const char * strIn,
const int cchIn,
char * strOut,
const int cchOut,
int * pcSec,
int * pcDed
)
{
assert( cchIn == cchOut *2);
if( cchIn != cchOut * 2)
return 0;
if (!strIn || !strOut)
return 0;
int i;
for (i = 0; i < cchOut; i ++)
{
char hi_byte = strIn[i * 2];
char lo_byte = strIn[i * 2 + 1];
char hi_correct = correct_nibble(hi_byte);
char lo_correct = correct_nibble(lo_byte);
if (hi_correct || lo_correct)
{
apply_correct(hi_correct, &hi_byte, pcSec, pcDed);
apply_correct(lo_correct, &lo_byte, pcSec, pcDed);
/* fprintf(stderr, "Corrections %x %x.\n", hi_correct, lo_correct);*/
}
char hi_nibble = val_nibble(hi_byte);
char lo_nibble = val_nibble(lo_byte);
strOut[i] = (hi_nibble << 4) | lo_nibble;
}
return 1;
}

What is a better method for packing 4 bytes into 3 than this?

I have an array of values all well within the range 0 - 63, and decided I could pack every 4 bytes into 3 because the values only require 6 bits and I could use the extra 2bits to store the first 2 bits of the next value and so on.
Having never done this before I used the switch statement and a nextbit variable (a state machine like device) to do the packing and keep track of the starting bit. I'm convinced however, there must be a better way.
Suggestions/clues please, but don't ruin my fun ;-)
Any portability problems regarding big/little endian?
btw: I have verified this code is working, by unpacking it again and comparing with the input. And no it ain't homework, just an exercise I've set myself.
/* build with gcc -std=c99 -Wconversion */
#define ASZ 400
typedef unsigned char uc_;
uc_ data[ASZ];
int i;
for (i = 0; i < ASZ; ++i) {
data[i] = (uc_)(i % 0x40);
}
size_t dl = sizeof(data);
printf("sizeof(data):%z\n",dl);
float fpl = ((float)dl / 4.0f) * 3.0f;
size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
printf("length of packed data:%z\n",pl);
for (i = 0; i < dl; ++i)
printf("%02d ", data[i]);
printf("\n");
uc_ * packeddata = calloc(pl, sizeof(uc_));
uc_ * byte = packeddata;
uc_ nextbit = 1;
for (int i = 0; i < dl; ++i) {
uc_ m = (uc_)(data[i] & 0x3f);
switch(nextbit) {
case 1:
/* all 6 bits of m into first 6 bits of byte: */
*byte = m;
nextbit = 7;
break;
case 3:
/* all 6 bits of m into last 6 bits of byte: */
*byte++ = (uc_)(*byte | (m << 2));
nextbit = 1;
break;
case 5:
/* 1st 4 bits of m into last 4 bits of byte: */
*byte++ = (uc_)(*byte | ((m & 0x0f) << 4));
/* 5th and 6th bits of m into 1st and 2nd bits of byte: */
*byte = (uc_)(*byte | ((m & 0x30) >> 4));
nextbit = 3;
break;
case 7:
/* 1st 2 bits of m into last 2 bits of byte: */
*byte++ = (uc_)(*byte | ((m & 0x03) << 6));
/* next (last) 4 bits of m into 1st 4 bits of byte: */
*byte = (uc_)((m & 0x3c) >> 2);
nextbit = 5;
break;
}
}
So, this is kinda like code-golf, right?
#include <stdlib.h>
#include <string.h>
static void pack2(unsigned char *r, unsigned char *n) {
unsigned v = n[0] + (n[1] << 6) + (n[2] << 12) + (n[3] << 18);
*r++ = v;
*r++ = v >> 8;
*r++ = v >> 16;
}
unsigned char *apack(const unsigned char *s, int len) {
unsigned char *s_end = s + len,
*r, *result = malloc(len/4*3+3),
lastones[4] = { 0 };
if (result == NULL)
return NULL;
for(r = result; s + 4 <= s_end; s += 4, r += 3)
pack2(r, s);
memcpy(lastones, s, s_end - s);
pack2(r, lastones);
return result;
}
Check out the IETF RFC 4648 for 'The Base16, Base32 and Base64 Data Encodings'.
Partial code critique:
size_t dl = sizeof(data);
printf("sizeof(data):%d\n",dl);
float fpl = ((float)dl / 4.0f) * 3.0f;
size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
printf("length of packed data:%d\n",pl);
Don't use the floating point stuff - just use integers. And use '%z' to print 'size_t' values - assuming you've got a C99 library.
size_t pl = ((dl + 3) / 4) * 3;
I think your loop could be simplified by dealing with 3-byte input units until you've got a partial unit left over, and then dealing with a remainder of 1 or 2 bytes as special cases. I note that the standard referenced says that you use one or two '=' signs to pad at the end.
I have a Base64 encoder and decode which does some of that. You are describing the 'decode' part of Base64 -- where the Base64 code has 4 bytes of data that should be stored in just 3 - as your packing code. The Base64 encoder corresponds to the unpacker you will need.
Base-64 Decoder
Note: base_64_inv is an array of 256 values, one for each possible input byte value; it defines the correct decoded value for each encoded byte. In the Base64 encoding, this is a sparse array - 3/4 zeroes. Similarly, base_64_map is the mapping between a value 0..63 and the corresponding storage value.
enum { DC_PAD = -1, DC_ERR = -2 };
static int decode_b64(int c)
{
int b64 = base_64_inv[c];
if (c == base64_pad)
b64 = DC_PAD;
else if (b64 == 0 && c != base_64_map[0])
b64 = DC_ERR;
return(b64);
}
/* Decode 4 bytes into 3 */
static int decode_quad(const char *b64_data, char *bin_data)
{
int b0 = decode_b64(b64_data[0]);
int b1 = decode_b64(b64_data[1]);
int b2 = decode_b64(b64_data[2]);
int b3 = decode_b64(b64_data[3]);
int bytes;
if (b0 < 0 || b1 < 0 || b2 == DC_ERR || b3 == DC_ERR || (b2 == DC_PAD && b3 != DC_PAD))
return(B64_ERR_INVALID_ENCODED_DATA);
if (b2 == DC_PAD && (b1 & 0x0F) != 0)
/* 3rd byte is '='; 2nd byte must end with 4 zero bits */
return(B64_ERR_INVALID_TRAILING_BYTE);
if (b2 >= 0 && b3 == DC_PAD && (b2 & 0x03) != 0)
/* 4th byte is '='; 3rd byte is not '=' and must end with 2 zero bits */
return(B64_ERR_INVALID_TRAILING_BYTE);
bin_data[0] = (b0 << 2) | (b1 >> 4);
bytes = 1;
if (b2 >= 0)
{
bin_data[1] = ((b1 & 0x0F) << 4) | (b2 >> 2);
bytes = 2;
}
if (b3 >= 0)
{
bin_data[2] = ((b2 & 0x03) << 6) | (b3);
bytes = 3;
}
return(bytes);
}
/* Decode input Base-64 string to original data. Output length returned, or negative error */
int base64_decode(const char *data, size_t datalen, char *buffer, size_t buflen)
{
size_t outlen = 0;
if (datalen % 4 != 0)
return(B64_ERR_INVALID_ENCODED_LENGTH);
if (BASE64_DECLENGTH(datalen) > buflen)
return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL);
while (datalen >= 4)
{
int nbytes = decode_quad(data, buffer + outlen);
if (nbytes < 0)
return(nbytes);
outlen += nbytes;
data += 4;
datalen -= 4;
}
assert(datalen == 0); /* By virtue of the %4 check earlier */
return(outlen);
}
Base-64 Encoder
/* Encode 3 bytes of data into 4 */
static void encode_triplet(const char *triplet, char *quad)
{
quad[0] = base_64_map[(triplet[0] >> 2) & 0x3F];
quad[1] = base_64_map[((triplet[0] & 0x03) << 4) | ((triplet[1] >> 4) & 0x0F)];
quad[2] = base_64_map[((triplet[1] & 0x0F) << 2) | ((triplet[2] >> 6) & 0x03)];
quad[3] = base_64_map[triplet[2] & 0x3F];
}
/* Encode 2 bytes of data into 4 */
static void encode_doublet(const char *doublet, char *quad, char pad)
{
quad[0] = base_64_map[(doublet[0] >> 2) & 0x3F];
quad[1] = base_64_map[((doublet[0] & 0x03) << 4) | ((doublet[1] >> 4) & 0x0F)];
quad[2] = base_64_map[((doublet[1] & 0x0F) << 2)];
quad[3] = pad;
}
/* Encode 1 byte of data into 4 */
static void encode_singlet(const char *singlet, char *quad, char pad)
{
quad[0] = base_64_map[(singlet[0] >> 2) & 0x3F];
quad[1] = base_64_map[((singlet[0] & 0x03) << 4)];
quad[2] = pad;
quad[3] = pad;
}
/* Encode input data as Base-64 string. Output length returned, or negative error */
static int base64_encode_internal(const char *data, size_t datalen, char *buffer, size_t buflen, char pad)
{
size_t outlen = BASE64_ENCLENGTH(datalen);
const char *bin_data = (const void *)data;
char *b64_data = (void *)buffer;
if (outlen > buflen)
return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL);
while (datalen >= 3)
{
encode_triplet(bin_data, b64_data);
bin_data += 3;
b64_data += 4;
datalen -= 3;
}
b64_data[0] = '\0';
if (datalen == 2)
encode_doublet(bin_data, b64_data, pad);
else if (datalen == 1)
encode_singlet(bin_data, b64_data, pad);
b64_data[4] = '\0';
return((b64_data - buffer) + strlen(b64_data));
}
I complicate life by having to deal with a product that uses a variant alphabet for the Base64 encoding, and also manages not to pad data - hence the 'pad' argument (which can be zero for 'null padding' or '=' for standard padding. The 'base_64_map' array contains the alphabet to use for 6-bit values in the range 0..63.
Another simpler way to do it would be to use bit fields. One of the lesser known corners of C struct syntax is the big field. Let's say you have the following structure:
struct packed_bytes {
byte chunk1 : 6;
byte chunk2 : 6;
byte chunk3 : 6;
byte chunk4 : 6;
};
This declares chunk1, chunk2, chunk3, and chunk4 to have the type byte but to only take up 6 bits in the structure. The result is that sizeof(struct packed_bytes) == 3. Now all you need is a little function to take your array and dump it into the structure like so:
void
dump_to_struct(byte *in, struct packed_bytes *out, int count)
{
int i, j;
for (i = 0; i < (count / 4); ++i) {
out[i].chunk1 = in[i * 4];
out[i].chunk2 = in[i * 4 + 1];
out[i].chunk3 = in[i * 4 + 2];
out[i].chunk4 = in[i * 4 + 3];
}
// Finish up
switch(struct % 4) {
case 3:
out[count / 4].chunk3 = in[(count / 4) * 4 + 2];
case 2:
out[count / 4].chunk2 = in[(count / 4) * 4 + 1];
case 1:
out[count / 4].chunk1 = in[(count / 4) * 4];
}
}
There you go, you now have an array of struct packed_bytes that you can easily read by using the above struct.
Instead of using a statemachine you can simply use a counter for how many bits are already used in the current byte, from which you can directly derive the shift-offsets and whether or not you overflow into the next byte.
Regarding the endianess: As long as you use only a single datatype (that is you don't reinterpret pointer to types of different size (e.g. int* a =...;short* b=(short*) a;) you shouldn't get problems with endianess in most cases
Taking elements of DigitalRoss's compact code, Grizzly's suggestion, and my own code, I have written my own answer at last. Although DigitalRoss provides a usable working answer, my usage of it without understanding, would not have provided the same satisfaction as to learning something. For this reason I have chosen to base my answer on my original code.
I have also chosen to ignore the advice Jonathon Leffler gives to avoid using floating point arithmetic for the calculation of the packed data length. Both the recommended method given - the same DigitalRoss also uses, increases the length of the packed data by as much as three bytes. Granted this is not much, but is also avoidable by the use of floating point math.
Here is the code, criticisms welcome:
/* built with gcc -std=c99 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned char *
pack(const unsigned char * data, size_t len, size_t * packedlen)
{
float fpl = ((float)len / 4.0f) * 3.0f;
*packedlen = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
unsigned char * packed = malloc(*packedlen);
if (!packed)
return 0;
const unsigned char * in = data;
const unsigned char * in_end = in + len;
unsigned char * out;
for (out = packed; in + 4 <= in_end; in += 4) {
*out++ = in[0] | ((in[1] & 0x03) << 6);
*out++ = ((in[1] & 0x3c) >> 2) | ((in[2] & 0x0f) << 4);
*out++ = ((in[2] & 0x30) >> 4) | (in[3] << 2);
}
size_t lastlen = in_end - in;
if (lastlen > 0) {
*out = in[0];
if (lastlen > 1) {
*out++ |= ((in[1] & 0x03) << 6);
*out = ((in[1] & 0x3c) >> 2);
if (lastlen > 2) {
*out++ |= ((in[2] & 0x0f) << 4);
*out = ((in[2] & 0x30) >> 4);
if (lastlen > 3)
*out |= (in[3] << 2);
}
}
}
return packed;
}
int main()
{
size_t i;
unsigned char data[] = {
12, 15, 40, 18,
26, 32, 50, 3,
7, 19, 46, 10,
25, 37, 2, 39,
60, 59, 0, 17,
9, 29, 13, 54,
5, 6, 47, 32
};
size_t datalen = sizeof(data);
printf("unpacked datalen: %td\nunpacked data\n", datalen);
for (i = 0; i < datalen; ++i)
printf("%02d ", data[i]);
printf("\n");
size_t packedlen;
unsigned char * packed = pack(data, sizeof(data), &packedlen);
if (!packed) {
fprintf(stderr, "Packing failed!\n");
return EXIT_FAILURE;
}
printf("packedlen: %td\npacked data\n", packedlen);
for (i = 0; i < packedlen; ++i)
printf("0x%02x ", packed[i]);
printf("\n");
free(packed);
return EXIT_SUCCESS;
}

Resources