I am not sure whether I am calculating the parity bits correctly in the parity-check function I wrote. The codeWord is 11 chars long, with 4 parity bits and 7 data bits. Does the implementation look good?
#include <math.h> /* for pow() */

void parityCheck(char* codeWord) {
    int parity[4] = {0}, i = 0, diffParity[4] = {0}, twoPower = 0, bitSum = 0;
    // Stores # of 1's for each parity bit in array.
    parity[0] = (codeWord[2] - 48) + (codeWord[4] - 48) + (codeWord[6] - 48) + (codeWord[8] - 48) + (codeWord[10] - 48);
    parity[1] = (codeWord[2] - 48) + (codeWord[5] - 48) + (codeWord[6] - 48) + (codeWord[9] - 48) + (codeWord[10] - 48);
    parity[2] = (codeWord[4] - 48) + (codeWord[5] - 48) + (codeWord[6] - 48);
    parity[3] = (codeWord[8] - 48) + (codeWord[9] - 48) + (codeWord[10] - 48);
    // Determines if sum of bits is even or odd, then tests for difference from actual parity bit.
    for (i = 0; i < 4; i++) {
        twoPower = (int)pow((double)2, i);
        if (parity[i] % 2 == 0)
            parity[i] = 0;
        else
            parity[i] = 1;
        if ((codeWord[twoPower-1] - 48) != parity[i])
            diffParity[i] = 1;
    }
    // Calculates the location of the error bit.
    for (i = 0; i < 4; i++) {
        twoPower = (int)pow((double)2, i);
        bitSum += diffParity[i]*twoPower;
    }
    // Inverts bit at location of error.
    if (bitSum <= 11 && bitSum > 0) {
        if ((codeWord[bitSum-1] - 48))
            codeWord[bitSum-1] = '0';
        else
            codeWord[bitSum-1] = '1';
    }
}
This very much depends on your measure for “good”. I can confirm that it does get the job done, so at least it is correct. Your code is very verbose, and thus hard to check for correctness. I'd do the following:
int parity_check(int codeWord) {
    int parity = 0, codeWordBit, bitPos;
    for (bitPos = 1; bitPos <= 11; ++bitPos) {
        codeWordBit = ((codeWord >> (bitPos - 1)) & 1);
        parity ^= bitPos*codeWordBit;
    }
    if (parity != 0) {
        if (parity > 11)
            return -1; // multi-bit error!
        codeWord ^= 1 << (parity - 1);
    }
    return codeWord;
}
Instead of a sequence of digit characters, I treat your whole code word as a single integer, which is a lot more efficient.
Looking at the table at Wikipedia, I see that the columns of that table form binary representations of the sequence 1 … 11. Each code word bit affects exactly those parity bits mentioned in that column, so I take the code word bit (which is zero or one), multiply it by the bit pattern of that column to obtain either that pattern or zero, then XOR this with the current parity bit pattern. The effect of this is that a zero code word bit won't change anything, whereas a non-zero code word bit flips all associated parity bits.
Some care has to be taken because the bit pattern is one-based, whereas the bit position using the right shift trick is zero-based. So I have to subtract one, then shift right by that amount, and then extract the least significant digit in order to obtain the codeWordBit.
Using my implementation for reference, I was able to verify (by complete enumeration) that your code works the same.
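In case anyone wants to reproduce that cross-check, here is a minimal sketch of such an enumeration harness. It assumes both functions above are in scope, and that bit 1 of the code word lives in buf[0] (which matches the OP's indexing):

#include <stdio.h>

int main(void) {
    char buf[12];
    for (int w = 0; w < (1 << 11); ++w) {
        for (int b = 0; b < 11; ++b)        /* bit 1 -> buf[0], bit 2 -> buf[1], ... */
            buf[b] = (char)(((w >> b) & 1) + '0');
        buf[11] = '\0';
        parityCheck(buf);                   /* OP's version corrects the string in place */
        int fixed = parity_check(w);        /* integer version corrects by value */
        if (fixed >= 0)                     /* skip multi-bit errors (both leave those alone) */
            for (int b = 0; b < 11; ++b)
                if (((fixed >> b) & 1) != buf[b] - '0')
                    printf("mismatch for word %d\n", w);
    }
    return 0;
}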
Your code works fine AFAIK; it passed the test cases I conjured up. I made some classic simplifications for easier viewing, but the OP's functionality is not changed.
#include <stdio.h>
#include <string.h>

void parityCheck(char* cW) {
    int parity[4] = { 0 }, i = 0, diffParity[4] = { 0 }, twoPower = 0, bitSum = 0;
    // Stores # of 1's for each parity bit in array.
    parity[0] = (cW[2] - '0') + (cW[4] - '0') + (cW[6] - '0') + (cW[8] - '0') + (cW[10] - '0');
    parity[1] = (cW[2] - '0') + (cW[5] - '0') + (cW[6] - '0') + (cW[9] - '0') + (cW[10] - '0');
    parity[2] = (cW[4] - '0') + (cW[5] - '0') + (cW[6] - '0');
    parity[3] = (cW[8] - '0') + (cW[9] - '0') + (cW[10] - '0');
    // Determines if sum of bits is even or odd, then tests for difference from actual parity bit.
    for (i = 0; i < 4; i++) {
        //twoPower = (int) pow((double) 2, i);
        twoPower = 1 << i;
        //if (parity[i] % 2 == 0) parity[i] = 0; else parity[i] = 1;
        parity[i] &= 1; // Even sums become 0, odd sums 1.
        if ((cW[twoPower - 1] - '0') != parity[i])
            diffParity[i] = 1;
    }
    // Calculates the location of the error bit.
    for (i = 0; i < 4; i++) {
        // twoPower = (int) pow((double) 2, i);
        twoPower = 1 << i;
        bitSum += diffParity[i] * twoPower;
    }
    // Inverts bit at location of error.
    if (bitSum <= 11 && bitSum > 0) {
        if ((cW[bitSum - 1] - '0'))
            cW[bitSum - 1] = '0';
        else
            cW[bitSum - 1] = '1';
    }
}

void TestP(const char * Test) {
    char buf[100];
    strcpy(buf, Test);
    parityCheck(buf);
    printf("'%s' '%s'\n", Test, buf);
}

int main(void) {
    TestP("00000000000");
    TestP("10011100101");
    TestP("10100111001");
}
It would have been useful had the OP posted test patterns.
Here's my implementation. It works. The public is free to use it at no charge.
I used the acronym "secded" as in, "single-error-correcting, double-error-detecting." You can re-wire this as a "triple error detector" if you want that instead. Really, some small part of this is secded and the rest is Hamming 7,4 -- but I named these methods what I did, when I did.
The "strings" here are not NUL-terminated, but counted. This code is excerpted from a Python module written in C. That is the provenance of the string type you see.
A key point here was realizing that there are only 16 Hamming 7,4 codes. I calculated secded_of_nibble() with some Python code, which unfortunately I no longer have.
#include <assert.h>

static const unsigned char secded_of_nibble[] =
    { 0x0, 0xd2, 0x55, 0x87, 0x99, 0x4b, 0xcc, 0x1e,
      0xe1, 0x33, 0xb4, 0x66, 0x78, 0xaa, 0x2d, 0xff };

int fec_secded_encode_cch_bits(const char * strIn, const int cchIn, char * strOut, const int cchOut)
{
    assert( cchIn * 2 == cchOut);
    if( cchIn * 2 != cchOut)
        return 0;
    if (!strIn || !strOut)
        return 0;

    int i;
    for (i = 0; i < cchIn; i ++)
    {
        char in_byte = strIn[i];
        char hi_byte = secded_of_nibble[(in_byte >> 4) & 0xf];
        char lo_byte = secded_of_nibble[in_byte & 0xf];
        strOut[i * 2] = hi_byte;
        strOut[i * 2 + 1] = lo_byte;
    }
    return 1;
}
/* Parity-check column for each bit of the 8-bit codeword (MSB first). */
char bv_H[] = {0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x8};

/* Extract the 4 data bits from a codeword byte. */
char val_nibble(char ch)
{
    return ((ch & 0x20) >> 2) | ((ch & 0xE) >> 1);
}

/* Compute the 4-bit syndrome of a codeword byte; 0 means no error detected. */
char correct_nibble(char ch)
{
    char nibble = 0;
    int i = 0;
    for (i = 0; i < 8; i++)
        if (ch & (1 << (7-i)))
            nibble ^= bv_H[i];
    return nibble;
}
void apply_correct(char nib_correct, char * pbyte, int * pcSec, int *pcDed)
{
    if (0 == nib_correct)
        return;
    if (nib_correct & 0x8)
    {
        (*pcSec) ++;
        int bit = (8 - (nib_correct & 0x7)) & 0x7;
        /* fprintf(stderr, "bit %d, %02X\n", bit, 1 << bit);*/
        (*pbyte) ^= (1 << bit);
    }
    else
    {
        (*pcDed) ++;
    }
}
int fec_secded_decode_cch_bits
(
    const char * strIn,
    const int cchIn,
    char * strOut,
    const int cchOut,
    int * pcSec,
    int * pcDed
)
{
    assert( cchIn == cchOut *2);
    if( cchIn != cchOut * 2)
        return 0;
    if (!strIn || !strOut)
        return 0;

    int i;
    for (i = 0; i < cchOut; i ++)
    {
        char hi_byte = strIn[i * 2];
        char lo_byte = strIn[i * 2 + 1];
        char hi_correct = correct_nibble(hi_byte);
        char lo_correct = correct_nibble(lo_byte);
        if (hi_correct || lo_correct)
        {
            apply_correct(hi_correct, &hi_byte, pcSec, pcDed);
            apply_correct(lo_correct, &lo_byte, pcSec, pcDed);
            /* fprintf(stderr, "Corrections %x %x.\n", hi_correct, lo_correct);*/
        }
        char hi_nibble = val_nibble(hi_byte);
        char lo_nibble = val_nibble(lo_byte);
        strOut[i] = (hi_nibble << 4) | lo_nibble;
    }
    return 1;
}
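Since the Python that generated secded_of_nibble[] is gone, here is a hedged sketch of how such a table could be regenerated by brute force from the val_nibble() and correct_nibble() helpers above: every byte with a zero syndrome is a valid codeword, and there are exactly 16 of them, one per data nibble.

#include <stdio.h>

int main(void) {
    unsigned char table[16];
    int v, i;
    for (v = 0; v < 256; v++)
        if (correct_nibble((char)v) == 0)          /* zero syndrome: valid codeword */
            table[val_nibble((char)v) & 0xF] = (unsigned char)v;
    for (i = 0; i < 16; i++)
        printf("0x%x%s", table[i], i < 15 ? ", " : "\n");
    return 0;
}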
I need help converting a number string to a SQL_NUMERIC_STRUCT value in order to use the decimal and numeric database data types. The SQL_NUMERIC_STRUCT value holds a 16-byte unsigned integer. For example, I have the string "12312312899012312890522341231231232198", which contains 38 digits (the maximum for SQL Server decimal or numeric data types). Other languages such as C# have a built-in conversion function, but Visual Studio 2019 does not let me use 128-bit integers directly in C++. The Microsoft help page unfortunately only offers an example with a small, 2-byte integer.
I have found a solution.
#include <string.h>
#include <stdbool.h>
#include <sqltypes.h> /* SQL_NUMERIC_STRUCT */

bool ConvertToNumericStruct(char* s, SQL_NUMERIC_STRUCT* v) {
    int sc = (int)strlen(s), scale = 0, i, y, z;
    char c, p = 0, d;
    bool minus = false;
    int _tmp, x, carryover;
    memset(v->val, 0, 16);
    for (i = 0; i < sc; i++) {
        c = s[i];
        if (i == 0 && c == '-') minus = true;
        else if (c == '.') { if (scale == 0) scale = sc - i - 1; else return false; }
        else if (c < '0' || c > '9') return false;
        else {
            if (p >= 38) return false; /* more than 38 digits */
            d = c - '0';
            /* Multiply the 128-bit value by 10 and add the digit, one nibble at a time. */
            _tmp = 0;
            carryover = d;
            y = 0; z = 0;
            for (x = sc - 1; x > -1; x--) {
                if (y % 2 == 1) {
                    _tmp = (v->val[z] >> 4) * 10 + carryover;
                    v->val[z] &= 0x0F;
                    v->val[z] |= ((_tmp % 16) << 4 & 0xF0);
                    z++;
                    if (z > 15) break;
                }
                else {
                    _tmp = (v->val[z] & 0x0F) * 10 + carryover;
                    v->val[z] &= 0xF0;
                    v->val[z] |= ((_tmp % 16) & 0x0F);
                }
                y++;
                carryover = _tmp / 16;
            }
            p++;
        }
    }
    v->precision = p;
    v->scale = scale;
    if (minus) v->sign = 0; else v->sign = 1;
    return true;
}
If you want to insert decimal or numeric data into a database such as MySQL via unixODBC with the function SQLBindParameter, you can just use SQL_C_CHAR for fCType and SQL_CHAR for fSqlType with a char-string buffer. No need to convert; that is done implicitly.
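For illustration, a hedged sketch of that binding (the parameter number and the use of strlen for the column size are made-up choices for this example, not requirements):

#include <sql.h>
#include <sqlext.h>
#include <string.h>

/* Bind a decimal value held in a NUL-terminated char buffer; the driver
 * converts the string on its own. */
SQLRETURN bind_decimal_string(SQLHSTMT hstmt, char *numstr)
{
    return SQLBindParameter(hstmt,
                            1,                /* parameter number (illustrative) */
                            SQL_PARAM_INPUT,
                            SQL_C_CHAR,       /* fCType */
                            SQL_CHAR,         /* fSqlType */
                            strlen(numstr),   /* column size */
                            0,                /* decimal digits */
                            numstr,
                            0,
                            NULL);            /* NULL: value is NUL-terminated */
}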
I am currently working on a program that has to output the binary representation of 32-bit and 64-bit numbers in IEEE 754 format, in C. I already have the double- and single-precision floating point approximations, but I'm having trouble finding out how to output their binary in IEEE 754 notation, and color-code the fields as well. Any thoughts/solutions on how to do this would be much appreciated.
This does not guarantee the correct answer if the underlying machine is something esoteric, however:
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void) {
    float f = 3.14f;
    uint32_t u;
    memcpy(&u, &f, sizeof u); /* type-pun via memcpy: well-defined, unlike a pointer cast */
    for (int i = 31; i >= 0; i--)
        putchar('0' + ((u >> i) & 1));
    putchar('\n');
    return 0;
}
I decided to take the opportunity to refresh my memory of the IEEE 754 floating-point standard. Below is a mashup I made for displaying a string as its single-precision floating point representation, though it is easily modified for the double-precision format.
The code won't work with +Inf, -Inf, NaN, trailing-zero, fractionless, or leftout-zero numbers (.fraction instead of 0.fraction, or integer. instead of integer.0); it's just supposed to give the general idea of how to do what you want to do in a portable and well-defined (and highly entertaining) way.
#include <stdio.h>
#include <string.h>

/* The original excerpt assumes a BOOL type; minimal definitions: */
typedef int BOOL;
#define TRUE 1
#define FALSE 0

#define EXPLEN 8       /* Exponent length for single-precision */
#define SIGNIFLEN 23   /* Significand length for single-precision */
#define EXPBIAS 0x7F   /* Exponent bias for single-precision */
#define BITLEN (1 + EXPLEN + SIGNIFLEN)

BOOL strToFloat(char *floatStr, char *outBits, size_t outBitsLen){
    unsigned long int floatStrLength = strlen(floatStr), intPart, fracPart, intPartHighestBit = 1,
                      fracPartLength, fracPartPowTen = 1, temp;
    char roundBit, stickyBit, expPart = 0;
    int i;

    /* Get sign */
    if (floatStr[0] == '-'){
        floatStr++;
        outBits[0] = '1';
    } else {
        if (floatStr[0] == '+')
            floatStr++;
        outBits[0] = '0';
    }
    if (sscanf(floatStr, "%lu.%lu", &intPart, &fracPart) == EOF ||
            outBitsLen < BITLEN + 1)
        return FALSE; /* Failure */

    /* Get integer part */
    temp = intPart;
    while (temp >>= 1)
        intPartHighestBit <<= 1;
    for (i = EXPLEN + 1; i < BITLEN && (intPartHighestBit >>= 1); i++, expPart++)
        outBits[i] = !!(intPart & intPartHighestBit) + '0';

    /* Get fraction part */
    fracPartLength = strlen(strchr(floatStr, '.'));
    while (--fracPartLength)
        fracPartPowTen *= 10;
    if (!intPart && i == EXPLEN + 1)
        if (fracPart > 0){
            i--;
            expPart--;
        } else
            expPart = -EXPBIAS;
    for (; i < BITLEN; fracPart = (fracPart << 1) % fracPartPowTen){
        outBits[i] = !!((fracPart << 1) - (fracPart << 1) % fracPartPowTen) + '0';
        if (outBits[i] == '0' && i == EXPLEN) /* Start writing only after first set bit is reached if number < 1 */
            expPart--;
        else
            i++;
    }

    /* Get exponent part */
    for (i = EXPLEN, expPart += EXPBIAS; i > 0; i--, expPart >>= 1)
        outBits[i] = (unsigned char)expPart % 2 + '0';

    /* Round fraction part (to-nearest mode) */
    if ((fracPart << 1) - (fracPart << 1) % fracPartPowTen){ /* Guard bit set, rounding needed */
        fracPart = (fracPart << 1) % fracPartPowTen;
        roundBit = !!((fracPart << 1) - (fracPart << 1) % fracPartPowTen);
        fracPart = (fracPart << 1) % fracPartPowTen;
        stickyBit = !!((fracPart << 1) - (fracPart << 1) % fracPartPowTen);
        if (roundBit || stickyBit || outBits[BITLEN - 1] == '0'){
            /* Round up: add 1 to the mantissa (and to the exponent if the mantissa overflows) */
            for (i = BITLEN - 1; outBits[i] == '1' && i > 0; i--)
                outBits[i] = '0';
            outBits[i] = '1';
        }
    }
    outBits[BITLEN] = '\0';
    return TRUE; /* Success */
}
Example usage:
char *str = "-3.14",
*outFloat = malloc(BITLEN + 1);
if (outFloat && strToFloat(str, outFloat, BITLEN + 1))
printf("%s", outFloat);
outputs
11000000010010001111010111000011
UPDATE: did my best to
remove magic numbers so it's easier to change this to use the double-precision format;
fix (I think) the rounding overflows;
fix zeroes issues;
refactor the code for setting the sign bit; and I also fiddled with some types, both per @Segmented's request in the comments.
Well, that was lots of fun! If you see any errors or space for improvement in this (rather hasty) code, please post it!
So I want to toggle the most significant bit of my number. Here is an example:
x = 100101 then answer should be 00101
I have a 64-bit machine, and hence I am not expecting the answer to be 100000..<51 0's>..100101
One way I thought of was to count the number of bits in my number and then toggle the MSB, but not sure on how to count.
The cheat is to pawn it off to the compiler: There are instructions in most CPUs for doing work like this.
The following should do what you want.
i ^ (1 << (sizeof i * CHAR_BIT - clz(i) - 1))
On CPUs that have one (e.g. ARM), this translates into a CLZ instruction, which counts the leading zeros.
For GCC, see: http://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Other-Builtins.html
One thing to be careful of is that this results in undefined behavior if i == 0.
You should replace clz() with the correct intrinsic for your compiler. In GCC this is __builtin_clz; in Visual C++ it is _BitScanReverse (note that _BitScanForward finds the lowest set bit, not the highest).
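A hedged sketch of such a shim, assuming only GCC/Clang and MSVC matter:

#include <limits.h>

#if defined(_MSC_VER)
#include <intrin.h>
static int clz(unsigned int x)
{
    unsigned long idx;
    _BitScanReverse(&idx, x);                    /* index of the highest set bit */
    return (int)(sizeof x * CHAR_BIT - 1 - idx); /* convert to a leading-zero count */
}
#else
#define clz(x) __builtin_clz(x)                  /* GCC/Clang builtin */
#endif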
@jleahy has already posted a good option for GCC; I will just leave here a generic implementation of clz which does not use any compiler intrinsics. However, it is not the optimal choice for CPUs which already have native bit-counting instructions (such as x86).
#define __bit_msb_mask(n) (~(~0x0ul >> (n))) /* n leftmost bits. */

/* Count leading zeroes. */
int clz(unsigned long x) {
    int nr = 0;
    int sh;

    assert(x);

    /* Hope that the compiler optimizes out the sizeof check. */
    if (sizeof(x) == 8) {
        /* Suppress "shift count >= width of type" error in case
         * when sizeof(x) is NOT 8, i.e. when it is dead code anyway. */
        sh = !(x & __bit_msb_mask(sizeof(x)*8/2)) << 5;
        nr += sh; x <<= sh;
    }

    sh = !(x & __bit_msb_mask(1 << 4)) << 4;  nr += sh;  x <<= sh;
    sh = !(x & __bit_msb_mask(1 << 3)) << 3;  nr += sh;  x <<= sh;
    sh = !(x & __bit_msb_mask(1 << 2)) << 2;  nr += sh;  x <<= sh;
    sh = !(x & __bit_msb_mask(1 << 1)) << 1;  nr += sh;  x <<= sh;
    sh = !(x & __bit_msb_mask(1 << 0)) << 0;  nr += sh;

    return nr;
}
Using this function one can toggle the most significant set bit (assuming there is one) as follows:
x ^= 1ul << (sizeof(x)*8 - clz(x) - 1);
Here's an approach using a lookup table, assuming CHAR_BIT == 8:

#include <stdint.h>

uint32_t toggle_msb(uint32_t n)
{
    static unsigned char const lookup[] =
        { 1, 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };

    for (unsigned int i = 0; i != sizeof n; ++i)
    {
        // on big-endian machines, use (unsigned char *)&n + i instead:
        unsigned char * p = (unsigned char *)&n + sizeof n - i - 1;

        if (*p / 16 != 0) { *p = *p % 16 + (lookup[*p / 16] * 16); return n; }
        if (*p % 16 != 0) { *p = 16 * (*p / 16) + lookup[*p % 16]; return n; }
    }
    return 1;
}
And to just put it all together in some sample code for GCC:

#include <stdio.h>

#define clz(x) __builtin_clz(x)

int main()
{
    int i = 411; /* 110011011 */

    if( i != 0 )
        i ^= (1 << (sizeof(i)*8 - clz(i) - 1));
    /* i is now 10011011 */

    printf("i = %d\n", i);
    return(0);
}
I have a program that outputs a textual table using UTF-8 strings, and I need to measure the number of monospaced character cells used by a string so I can align it properly. If possible, I'd like to do this with standard functions.
From UTF-8 and Unicode FAQ for Unix/Linux:
The number of characters can be counted in C in a portable way using mbstowcs(NULL,s,0). This works for UTF-8 like for any other supported encoding, as long as the appropriate locale has been selected. A hard-wired technique to count the number of characters in a UTF-8 string is to count all bytes except those in the range 0x80 – 0xBF, because these are just continuation bytes and not characters of their own. However, the need to count characters arises surprisingly rarely in applications.
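A minimal sketch of that mbstowcs(NULL, s, 0) counting technique (the sample string is arbitrary; note this counts characters, not display columns):

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    setlocale(LC_ALL, "");           /* pick up the (hopefully UTF-8) environment locale */
    const char *s = "h\xC3\xA9llo";  /* "héllo" in UTF-8 */
    size_t n = mbstowcs(NULL, s, 0); /* counts characters without converting anything */
    printf("%zu characters\n", n);   /* 5 in a UTF-8 locale */
    return 0;
}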
You may or may not have a UTF-8 compatible strlen(3) function available. However, there are some simple C functions readily available that do the job quickly.
The efficient C solutions examine the start of the character to skip continuation bytes. The simple code (referenced from the link above) is
int my_strlen_utf8_c(char *s) {
    int i = 0, j = 0;
    while (s[i]) {
        if ((s[i] & 0xc0) != 0x80) j++;
        i++;
    }
    return j;
}
The faster version uses the same technique, but prefetches data and does multi-byte compares, resulting in a substantial speedup. The code is longer and more complex, however.
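As a rough illustration of the idea (not the code behind the link), a word-at-a-time variant might look like this:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Count non-continuation bytes four at a time; a byte is a continuation
 * byte iff its top two bits are 10. */
size_t utf8_count_fast(const char *s, size_t nbytes) {
    size_t count = 0, i = 0;
    for (; i + 4 <= nbytes; i += 4) {
        uint32_t w;
        memcpy(&w, s + i, 4);                      /* safe unaligned load */
        uint32_t cont = ((w >> 7) & ~(w >> 6)) & 0x01010101u;
        count += 4 - ((cont * 0x01010101u) >> 24); /* horizontal byte sum */
    }
    for (; i < nbytes; i++)                        /* leftover tail bytes */
        if ((s[i] & 0xC0) != 0x80)
            count++;
    return count;
}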
I'm shocked that no one mentioned this, so here it goes for the record:
If you want to align text in a terminal, you need to use the POSIX functions wcwidth and wcswidth. Here's a correct program to find the on-screen length of a string.
#define _XOPEN_SOURCE
#include <wchar.h>
#include <stdio.h>
#include <locale.h>
#include <stdlib.h>
int measure(char *string) {
    // allocate enough memory to hold the wide string
    size_t needed = mbstowcs(NULL, string, 0) + 1;
    wchar_t *wcstring = malloc(needed * sizeof *wcstring);
    if (!wcstring) return -1;
    // change encodings
    if (mbstowcs(wcstring, string, needed) == (size_t)-1) return -2;
    // measure width
    int width = wcswidth(wcstring, needed);
    free(wcstring);
    return width;
}

int main(int argc, char **argv) {
    setlocale(LC_ALL, "");
    for (int i = 1; i < argc; i++) {
        printf("%s: %d\n", argv[i], measure(argv[i]));
    }
}
Here's an example of it running:
$ ./measure hello 莊子 cAb
hello: 5
莊子: 4
cAb: 4
Note how the two characters "莊子" and the three characters "cAb" (note the double-width A) are both 4 columns wide.
As utf8everywhere.org puts it, "The size of the string as it appears on the screen is unrelated to the number of code points in the string. One has to communicate with the rendering engine for this. Code points do not occupy one column even in monospace fonts and terminals. POSIX takes this into account."
Windows does not have any built-in wcwidth function for console output; if you want to support multi-column characters in the Windows console you need to find a portable implementation of wcwidth (or rather, give up, because the Windows console doesn't support Unicode without crazy hacks).
If you are able to use 3rd party libraries, have a look at the ICU library from IBM:
http://site.icu-project.org/
The following code takes ill-formed byte sequences into consideration. The example string data comes from "Table 3-8. Use of U+FFFD in UTF-8 Conversion" in the Unicode Standard 6.3.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#define is_trail(c) (c > 0x7F && c < 0xC0)
#define SUCCESS 1
#define FAILURE -1
int utf8_get_next_char(const unsigned char*, size_t, size_t*, int*, unsigned int*);
int utf8_length(unsigned char*, size_t);
void utf8_print_each_char(unsigned char*, size_t);
int main(void)
{
    unsigned char *str;
    str = (unsigned char *) "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64";
    size_t str_size = strlen((const char*) str);

    puts(10 == utf8_length(str, str_size) ? "true" : "false");
    utf8_print_each_char(str, str_size);

    return EXIT_SUCCESS;
}
int utf8_length(unsigned char *str, size_t str_size)
{
    int length = 0;
    size_t next_pos = 0;
    int is_valid = 0;
    unsigned int code_point = 0;

    while (utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS) {
        ++length;
    }
    return length;
}
void utf8_print_each_char(unsigned char *str, size_t str_size)
{
    size_t pos = 0;
    size_t next_pos = 0;
    int is_valid = 0;
    unsigned int code_point = 0;

    while (utf8_get_next_char(str, str_size, &next_pos, &is_valid, &code_point) == SUCCESS) {
        if (is_valid == true) {
            printf("%.*s\n", (int) next_pos - (int) pos, str + pos);
        } else {
            puts("\xEF\xBF\xBD"); /* U+FFFD replacement character */
        }
        pos = next_pos;
    }
}
int utf8_get_next_char(const unsigned char *str, size_t str_size, size_t *cursor, int *is_valid, unsigned int *code_point)
{
    size_t pos = *cursor;
    size_t rest_size = str_size - pos;
    unsigned char c;
    unsigned char min;
    unsigned char max;

    *code_point = 0;
    *is_valid = true;

    if (*cursor >= str_size) {
        return FAILURE;
    }

    c = str[pos];

    if (rest_size < 1) {
        *is_valid = false;
        pos += 1;
    } else if (c < 0x80) {
        *code_point = str[pos];
        *is_valid = true;
        pos += 1;
    } else if (c < 0xC2) {
        *is_valid = false;
        pos += 1;
    } else if (c < 0xE0) {
        if (rest_size < 2 || !is_trail(str[pos + 1])) {
            *is_valid = false;
            pos += 1;
        } else {
            *code_point = ((str[pos] & 0x1F) << 6) | (str[pos + 1] & 0x3F);
            *is_valid = true;
            pos += 2;
        }
    } else if (c < 0xF0) {
        min = (c == 0xE0) ? 0xA0 : 0x80;
        max = (c == 0xED) ? 0x9F : 0xBF;
        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            *is_valid = false;
            pos += 1;
        } else if (rest_size < 3 || !is_trail(str[pos + 2])) {
            *is_valid = false;
            pos += 2;
        } else {
            *code_point = ((str[pos] & 0x0F) << 12)
                        | ((str[pos + 1] & 0x3F) << 6)
                        | (str[pos + 2] & 0x3F);
            *is_valid = true;
            pos += 3;
        }
    } else if (c < 0xF5) {
        min = (c == 0xF0) ? 0x90 : 0x80;
        max = (c == 0xF4) ? 0x8F : 0xBF;
        if (rest_size < 2 || str[pos + 1] < min || max < str[pos + 1]) {
            *is_valid = false;
            pos += 1;
        } else if (rest_size < 3 || !is_trail(str[pos + 2])) {
            *is_valid = false;
            pos += 2;
        } else if (rest_size < 4 || !is_trail(str[pos + 3])) {
            *is_valid = false;
            pos += 3;
        } else {
            *code_point = ((str[pos] & 0x7) << 18)
                        | ((str[pos + 1] & 0x3F) << 12)
                        | ((str[pos + 2] & 0x3F) << 6)
                        | (str[pos + 3] & 0x3F);
            *is_valid = true;
            pos += 4;
        }
    } else {
        *is_valid = false;
        pos += 1;
    }

    *cursor = pos;
    return SUCCESS;
}
When I write code for UTF-8, I refer to "Table 3-7. Well-Formed UTF-8 Byte Sequences" in the Unicode Standard 6.3:
Code Points          First Byte  Second Byte  Third Byte  Fourth Byte
U+0000  - U+007F     00 - 7F
U+0080  - U+07FF     C2 - DF     80 - BF
U+0800  - U+0FFF     E0          A0 - BF      80 - BF
U+1000  - U+CFFF     E1 - EC     80 - BF      80 - BF
U+D000  - U+D7FF     ED          80 - 9F      80 - BF
U+E000  - U+FFFF     EE - EF     80 - BF      80 - BF
U+10000  - U+3FFFF   F0          90 - BF      80 - BF     80 - BF
U+40000  - U+FFFFF   F1 - F3     80 - BF      80 - BF     80 - BF
U+100000 - U+10FFFF  F4          80 - 8F      80 - BF     80 - BF
You can also use glib, which makes your life much easier when dealing with UTF-8; see the glib reference docs.
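For instance, a minimal sketch using g_utf8_strlen() (it counts code points, not display columns; compile with pkg-config --cflags --libs glib-2.0):

#include <glib.h>
#include <stdio.h>

int main(void) {
    const char *s = "h\xC3\xA9llo";                    /* "héllo" in UTF-8 */
    printf("%ld code points\n", g_utf8_strlen(s, -1)); /* -1: string is NUL-terminated */
    return 0;
}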
I have an array of values all well within the range 0 - 63, and decided I could pack every 4 bytes into 3, because the values only require 6 bits and I could use the extra 2 bits to store the first 2 bits of the next value, and so on.
Having never done this before, I used a switch statement and a nextbit variable (a state-machine-like device) to do the packing and keep track of the starting bit. I'm convinced, however, that there must be a better way.
Suggestions/clues please, but don't ruin my fun ;-)
Any portability problems regarding big/little endian?
btw: I have verified this code is working, by unpacking it again and comparing with the input. And no it ain't homework, just an exercise I've set myself.
/* build with gcc -std=c99 -Wconversion */
#include <stdio.h>
#include <stdlib.h>

#define ASZ 400

typedef unsigned char uc_;

int main(void) {
    uc_ data[ASZ];
    int i;
    for (i = 0; i < ASZ; ++i) {
        data[i] = (uc_)(i % 0x40);
    }
    size_t dl = sizeof(data);
    printf("sizeof(data):%zu\n", dl);
    float fpl = ((float)dl / 4.0f) * 3.0f;
    size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
    printf("length of packed data:%zu\n", pl);
    for (i = 0; i < dl; ++i)
        printf("%02d ", data[i]);
    printf("\n");

    uc_ * packeddata = calloc(pl, sizeof(uc_));
    uc_ * byte = packeddata;
    uc_ nextbit = 1;
    for (int i = 0; i < dl; ++i) {
        uc_ m = (uc_)(data[i] & 0x3f);
        switch(nextbit) {
        case 1:
            /* all 6 bits of m into first 6 bits of byte: */
            *byte = m;
            nextbit = 7;
            break;
        case 3:
            /* all 6 bits of m into last 6 bits of byte: */
            *byte = (uc_)(*byte | (m << 2));
            ++byte;
            nextbit = 1;
            break;
        case 5:
            /* 1st 4 bits of m into last 4 bits of byte: */
            *byte = (uc_)(*byte | ((m & 0x0f) << 4));
            ++byte;
            /* 5th and 6th bits of m into 1st and 2nd bits of byte: */
            *byte = (uc_)(*byte | ((m & 0x30) >> 4));
            nextbit = 3;
            break;
        case 7:
            /* 1st 2 bits of m into last 2 bits of byte: */
            *byte = (uc_)(*byte | ((m & 0x03) << 6));
            ++byte;
            /* next (last) 4 bits of m into 1st 4 bits of byte: */
            *byte = (uc_)((m & 0x3c) >> 2);
            nextbit = 5;
            break;
        }
    }
    return 0;
}
So, this is kinda like code-golf, right?
#include <stdlib.h>
#include <string.h>

static void pack2(unsigned char *r, const unsigned char *n) {
    unsigned v = n[0] + (n[1] << 6) + (n[2] << 12) + (n[3] << 18);
    *r++ = v;
    *r++ = v >> 8;
    *r++ = v >> 16;
}

unsigned char *apack(const unsigned char *s, int len) {
    const unsigned char *s_end = s + len;
    unsigned char *r, *result = malloc(len/4*3+3), lastones[4] = { 0 };

    if (result == NULL)
        return NULL;
    for (r = result; s + 4 <= s_end; s += 4, r += 3)
        pack2(r, s);
    memcpy(lastones, s, s_end - s);
    pack2(r, lastones);
    return result;
}
Check out the IETF RFC 4648 for 'The Base16, Base32 and Base64 Data Encodings'.
Partial code critique:
size_t dl = sizeof(data);
printf("sizeof(data):%d\n",dl);
float fpl = ((float)dl / 4.0f) * 3.0f;
size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
printf("length of packed data:%d\n",pl);
Don't use the floating point stuff - just use integers. And use '%zu' to print 'size_t' values - assuming you've got a C99 library.
size_t pl = ((dl + 3) / 4) * 3;
I think your loop could be simplified by dealing with 3-byte input units until you've got a partial unit left over, and then dealing with a remainder of 1 or 2 bytes as special cases. I note that the standard referenced says that you use one or two '=' signs to pad at the end.
I have a Base64 encoder and decoder which do some of that. Your packing code corresponds to the 'decode' part of Base64, where 4 bytes of Base64 data are stored in just 3; the Base64 encoder corresponds to the unpacker you will need.
Base-64 Decoder
Note: base_64_inv is an array of 256 values, one for each possible input byte value; it defines the correct decoded value for each encoded byte. In the Base64 encoding, this is a sparse array - 3/4 zeroes. Similarly, base_64_map is the mapping between a value 0..63 and the corresponding storage value.
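The arrays themselves are not shown in the excerpt; hypothetical definitions consistent with that description (standard RFC 4648 alphabet assumed) might be:

#include <string.h>

static const char base_64_map[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char base64_pad = '=';
static int base_64_inv[256];   /* sparse: zero for bytes outside the alphabet */

static void init_base_64_inv(void)
{
    int i;
    memset(base_64_inv, 0, sizeof(base_64_inv));
    for (i = 0; i < 64; i++)
        base_64_inv[(unsigned char)base_64_map[i]] = i;
}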
enum { DC_PAD = -1, DC_ERR = -2 };

static int decode_b64(int c)
{
    int b64 = base_64_inv[c];
    if (c == base64_pad)
        b64 = DC_PAD;
    else if (b64 == 0 && c != base_64_map[0])
        b64 = DC_ERR;
    return(b64);
}

/* Decode 4 bytes into 3 */
static int decode_quad(const char *b64_data, char *bin_data)
{
    int b0 = decode_b64(b64_data[0]);
    int b1 = decode_b64(b64_data[1]);
    int b2 = decode_b64(b64_data[2]);
    int b3 = decode_b64(b64_data[3]);
    int bytes;

    if (b0 < 0 || b1 < 0 || b2 == DC_ERR || b3 == DC_ERR || (b2 == DC_PAD && b3 != DC_PAD))
        return(B64_ERR_INVALID_ENCODED_DATA);
    if (b2 == DC_PAD && (b1 & 0x0F) != 0)
        /* 3rd byte is '='; 2nd byte must end with 4 zero bits */
        return(B64_ERR_INVALID_TRAILING_BYTE);
    if (b2 >= 0 && b3 == DC_PAD && (b2 & 0x03) != 0)
        /* 4th byte is '='; 3rd byte is not '=' and must end with 2 zero bits */
        return(B64_ERR_INVALID_TRAILING_BYTE);
    bin_data[0] = (b0 << 2) | (b1 >> 4);
    bytes = 1;
    if (b2 >= 0)
    {
        bin_data[1] = ((b1 & 0x0F) << 4) | (b2 >> 2);
        bytes = 2;
    }
    if (b3 >= 0)
    {
        bin_data[2] = ((b2 & 0x03) << 6) | (b3);
        bytes = 3;
    }
    return(bytes);
}

/* Decode input Base-64 string to original data. Output length returned, or negative error */
int base64_decode(const char *data, size_t datalen, char *buffer, size_t buflen)
{
    size_t outlen = 0;

    if (datalen % 4 != 0)
        return(B64_ERR_INVALID_ENCODED_LENGTH);
    if (BASE64_DECLENGTH(datalen) > buflen)
        return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL);
    while (datalen >= 4)
    {
        int nbytes = decode_quad(data, buffer + outlen);
        if (nbytes < 0)
            return(nbytes);
        outlen += nbytes;
        data += 4;
        datalen -= 4;
    }
    assert(datalen == 0); /* By virtue of the %4 check earlier */
    return(outlen);
}
Base-64 Encoder
/* Encode 3 bytes of data into 4 */
static void encode_triplet(const char *triplet, char *quad)
{
    quad[0] = base_64_map[(triplet[0] >> 2) & 0x3F];
    quad[1] = base_64_map[((triplet[0] & 0x03) << 4) | ((triplet[1] >> 4) & 0x0F)];
    quad[2] = base_64_map[((triplet[1] & 0x0F) << 2) | ((triplet[2] >> 6) & 0x03)];
    quad[3] = base_64_map[triplet[2] & 0x3F];
}

/* Encode 2 bytes of data into 4 */
static void encode_doublet(const char *doublet, char *quad, char pad)
{
    quad[0] = base_64_map[(doublet[0] >> 2) & 0x3F];
    quad[1] = base_64_map[((doublet[0] & 0x03) << 4) | ((doublet[1] >> 4) & 0x0F)];
    quad[2] = base_64_map[((doublet[1] & 0x0F) << 2)];
    quad[3] = pad;
}

/* Encode 1 byte of data into 4 */
static void encode_singlet(const char *singlet, char *quad, char pad)
{
    quad[0] = base_64_map[(singlet[0] >> 2) & 0x3F];
    quad[1] = base_64_map[((singlet[0] & 0x03) << 4)];
    quad[2] = pad;
    quad[3] = pad;
}

/* Encode input data as Base-64 string. Output length returned, or negative error */
static int base64_encode_internal(const char *data, size_t datalen, char *buffer, size_t buflen, char pad)
{
    size_t outlen = BASE64_ENCLENGTH(datalen);
    const char *bin_data = (const void *)data;
    char *b64_data = (void *)buffer;

    if (outlen > buflen)
        return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL);
    while (datalen >= 3)
    {
        encode_triplet(bin_data, b64_data);
        bin_data += 3;
        b64_data += 4;
        datalen -= 3;
    }
    b64_data[0] = '\0';
    if (datalen == 2)
        encode_doublet(bin_data, b64_data, pad);
    else if (datalen == 1)
        encode_singlet(bin_data, b64_data, pad);
    b64_data[4] = '\0';
    return((b64_data - buffer) + strlen(b64_data));
}
I complicate life by having to deal with a product that uses a variant alphabet for the Base64 encoding, and also manages not to pad data - hence the 'pad' argument (which can be zero for 'null padding' or '=' for standard padding). The 'base_64_map' array contains the alphabet to use for 6-bit values in the range 0..63.
Another, simpler way to do it would be to use bit fields. One of the lesser-known corners of C struct syntax is the bit field. Let's say you have the following structure:
typedef unsigned char byte; /* assumed; the compiler must allow non-int bit-field types */

struct packed_bytes {
    byte chunk1 : 6;
    byte chunk2 : 6;
    byte chunk3 : 6;
    byte chunk4 : 6;
};
This declares chunk1, chunk2, chunk3, and chunk4 to have the type byte but to take up only 6 bits each in the structure. With a compiler that packs bit fields tightly, the result is that sizeof(struct packed_bytes) == 3 (note that bit-field layout is implementation-defined, so this is not guaranteed everywhere). Now all you need is a little function to take your array and dump it into the structure like so:
void
dump_to_struct(byte *in, struct packed_bytes *out, int count)
{
    int i;

    for (i = 0; i < (count / 4); ++i) {
        out[i].chunk1 = in[i * 4];
        out[i].chunk2 = in[i * 4 + 1];
        out[i].chunk3 = in[i * 4 + 2];
        out[i].chunk4 = in[i * 4 + 3];
    }
    // Finish up the leftovers (fall-through is deliberate)
    switch (count % 4) {
    case 3:
        out[count / 4].chunk3 = in[(count / 4) * 4 + 2];
    case 2:
        out[count / 4].chunk2 = in[(count / 4) * 4 + 1];
    case 1:
        out[count / 4].chunk1 = in[(count / 4) * 4];
    }
}
There you go, you now have an array of struct packed_bytes that you can easily read by using the above struct.
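Illustrative usage (hypothetical values), reading the chunks straight back out of the struct:

byte vals[6] = { 12, 63, 7, 40, 2, 5 };
struct packed_bytes out[2];
dump_to_struct(vals, out, 6);
printf("%d %d\n", out[0].chunk2, out[1].chunk2); /* prints 63 5 */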
Instead of using a state machine you can simply use a counter for how many bits are already used in the current byte, from which you can directly derive the shift offsets and whether or not you overflow into the next byte. A sketch of this idea follows below.
Regarding endianness: as long as you use only a single data type (that is, you don't reinterpret pointers to types of a different size, e.g. int* a = ...; short* b = (short*) a;), you shouldn't get problems with endianness in most cases.
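A hedged sketch of that counter idea (it matches the spirit, not necessarily the exact bit order, of the OP's code; out[] is assumed zero-initialized and large enough):

#include <stddef.h>

void pack6(const unsigned char *in, size_t n, unsigned char *out) {
    size_t used = 0;                 /* bits already occupied in *out */
    for (size_t i = 0; i < n; i++) {
        unsigned v = in[i] & 0x3F;
        *out |= (unsigned char)(v << used);
        if (used > 2) {              /* 6 bits don't fit: spill into the next byte */
            out++;
            *out = (unsigned char)(v >> (8 - used));
            used -= 2;               /* (used + 6) - 8 */
        } else {
            used += 6;
            if (used == 8) { out++; used = 0; }
        }
    }
}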
Taking elements of DigitalRoss's compact code, Grizzly's suggestion, and my own code, I have written my own answer at last. Although DigitalRoss provides a usable working answer, using it without understanding would not have given me the same satisfaction as learning something. For this reason I have chosen to base my answer on my original code.
I have also chosen to ignore Jonathan Leffler's advice to avoid floating point arithmetic for the calculation of the packed data length. Both the recommended method given and the one DigitalRoss uses increase the length of the packed data by as much as three bytes. Granted this is not much, but it is also avoidable by the use of floating point math.
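(For the record, an exact packed length can also be computed in pure integer arithmetic; the hedged one-liner below computes ceil(len * 6 / 8) with no float and no overshoot.)

/* Exact packed length in bytes for len six-bit values: ceil(len * 6 / 8). */
size_t packed_length(size_t len) {
    return (len * 3 + 3) / 4;
}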
Here is the code, criticisms welcome:
/* built with gcc -std=c99 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned char *
pack(const unsigned char * data, size_t len, size_t * packedlen)
{
    float fpl = ((float)len / 4.0f) * 3.0f;
    *packedlen = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
    unsigned char * packed = malloc(*packedlen);
    if (!packed)
        return 0;

    const unsigned char * in = data;
    const unsigned char * in_end = in + len;
    unsigned char * out;
    for (out = packed; in + 4 <= in_end; in += 4) {
        *out++ = in[0] | ((in[1] & 0x03) << 6);
        *out++ = ((in[1] & 0x3c) >> 2) | ((in[2] & 0x0f) << 4);
        *out++ = ((in[2] & 0x30) >> 4) | (in[3] << 2);
    }

    size_t lastlen = in_end - in;
    if (lastlen > 0) {
        *out = in[0];
        if (lastlen > 1) {
            *out++ |= ((in[1] & 0x03) << 6);
            *out = ((in[1] & 0x3c) >> 2);
            if (lastlen > 2) {
                *out++ |= ((in[2] & 0x0f) << 4);
                *out = ((in[2] & 0x30) >> 4);
                if (lastlen > 3)
                    *out |= (in[3] << 2);
            }
        }
    }
    return packed;
}
int main()
{
    size_t i;
    unsigned char data[] = {
        12, 15, 40, 18,
        26, 32, 50,  3,
         7, 19, 46, 10,
        25, 37,  2, 39,
        60, 59,  0, 17,
         9, 29, 13, 54,
         5,  6, 47, 32
    };
    size_t datalen = sizeof(data);

    printf("unpacked datalen: %zu\nunpacked data\n", datalen);
    for (i = 0; i < datalen; ++i)
        printf("%02d ", data[i]);
    printf("\n");

    size_t packedlen;
    unsigned char * packed = pack(data, sizeof(data), &packedlen);
    if (!packed) {
        fprintf(stderr, "Packing failed!\n");
        return EXIT_FAILURE;
    }

    printf("packedlen: %zu\npacked data\n", packedlen);
    for (i = 0; i < packedlen; ++i)
        printf("0x%02x ", packed[i]);
    printf("\n");

    free(packed);
    return EXIT_SUCCESS;
}