CUDA doesn't work as expected? - c

I have programmed CUDA code.
unsigned long mask_buffer;
int s;
off_t p,
for(p=0;p!=5000;p++)
{
for(s=start;s!=end;s++)
{
ref_off = *(((unsigned int*)(idx_base)) + p);
if((int)(first_indexes[s-start_sequence] % 8 - ref_off % 8) < 0)
{
int shamt2 = (ref_off % 8 - first_indexes[s-start_sequence] % 8);
mask_buffer = *((unsigned long *)(msk_base + (ref_off - first_indexes[s-start_sequence])/8)) >> shamt2;
if( ( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence))) ^ mask_buffer ) << shamt2)
continue;
}
else if((int)(first_indexes[s-start_sequence] % 8 - ref_off % 8) == 0)
{
mask_buffer = *((unsigned long *)(msk_base + (ref_off)/8));
if( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence)) ^ mask_buffer))
continue;
}
else
{
int shamt2 = 8 - (first_indexes[s-start_sequence] % 8 - ref_off % 8);
mask_buffer = *((unsigned long *)(msk_base + (ref_off/8- first_indexes[s-start_sequence]/8) - 1)) >> shamt2;
if( ( (*(unsigned long *)(seqmaskc + 16 * (s-start_sequence))) ^ mask_buffer ) << shamt2)
continue;
}
int shamt = (ref_off % 4 - first_indexes[s-start_sequence] % 4) * 2;
memcpy(reference_blk, ref_base + ref_off / 4 - first_indexes[s-start_sequence] / 4, sequence_bytes);
for (rp = last_rp ; rp != (unsigned long *) reference_blk ; rp--)
{
unsigned long tmp = ((*rp) & ((1 << shamt) - 1)) << (8 * sizeof(unsigned long) - shamt);
*rp = (*rp >> shamt) | shifted_in;
shifted_in = tmp;
}
*rp = (*rp >> shamt) | shifted_in;
if (sequence_length & 0x3)
reference_blk[sequence_length >> 2] &= (1 << ((sequence_length & 0x3) << 1)) - 1;
for ( i = sequence_length >> 2 ; i & (SEQUENCE_ALIGN - 1) ; i++ )
reference_blk[i] = 0;
//-- instead of memcmp --//
int v = 0;
char *p1 = (char *)sequence;
char *p2 = (char *)reference_blk;
int tmp_asd = sequence_bytes;
while(tmp_asd!=0)
{
v = *(p1++) - *(p2++);
if(v!=0)
break;
tmp_asd--;
}
if(v == 0)
{
mat_count[s - (int)start_sequence]++; /* Maintain count */
mat_position[s - (int)start_sequence] = ref_off-first_indexes[s-start_sequence]; /* Record latest position */
}
}
}
This for loop is main function of my code.
But the problem is that variable "p" is never increased over 5 or 6.
I have GT530 in my computer and my CUDA Driver Version and Runtime Version are also 4.0.
What is problem in this code???

Assuming this is indeed a kernel, which your original post before your new code edit claimed it was, you can't call standard C-library functions like memcpy inside of a CUDA kernel (you do that about half-way through the code). You can only call __device__ functions in a kernel. So unless that is a re-implementation of memcpy in CUDA, calling that function inside your kernel is not going to work ...
Also, if you are going to attempt to write CUDA kernels that need to check the results of what other threads in the group have written, such as you're doing in the memory comparison section of your kernel, you will need to place some synchronization points in your code in order to ensure that all the threads have reached a certain point before you start checking the results from those threads. If you are attempting to check what another block may have written to main memory, there are also synchronization primitives for that as well that ensure that the results of any previous thread block have been written out to main memory.

Related

Round Constants in Keccak

Recently, just for the heck of it, I've been playing around with an attempt at implementing Keccak, the cryptographic primitive behind SHA-3. I've run into some issues however, specifically with calculating the round constants used in the "Iota" step of the permutation.
Just to get it out of the way: Yes. I know they are round constants. I know I could hard code them as constants. But where's the fun in that?
I've specifically been referencing the FIPS 202 specification document on SHA-3 as well as the Keccak team's own Keccak reference. However, despite my efforts, I can't seem to end up with the correct constants. I've never dealt with bit manipulation before, so if I'm doing something the complete wrong way, feel free to let me know.
rc is a function defined in the FIPS 202 standard of Keccak that is a linear feedback shift register with a feedback polynomial of x^8 + x^6 + x^5 + x^4 + 1.
The values of t (specific to SHA-3) are defined as the set of integers that includes j + 7 * i_r, where i_r = {0, 1, ..., 22, 23} and j = {0, 1, ..., 4, 5}.
The expected outputs (the round constants) are defined as follows: 0x0000000000000001, 0x0000000000008082, 0x800000000000808a,
0x8000000080008000, 0x000000000000808b, 0x0000000080000001,
0x8000000080008081, 0x8000000000008009, 0x000000000000008a,
0x0000000000000088, 0x0000000080008009, 0x000000008000000a,
0x000000008000808b, 0x800000000000008b, 0x8000000000008089,
0x8000000000008003, 0x8000000000008002, 0x8000000000000080,
0x000000000000800a, 0x800000008000000a, 0x8000000080008081,
0x8000000000008080, 0x0000000080000001, and 0x8000000080008008.
rc Function Implementation
uint64_t rc(int t)
{
if(t % 255 == 0)
{
return 0x1;
}
uint64_t R = 0x1;
for(int i = 1; i <= t % 255; i++)
{
R = R << 0x1;
R |= (((R >> 0x0) & 0x1) ^ ((R >> 0x8) & 0x1)) << 0x0;
R |= (((R >> 0x4) & 0x1) ^ ((R >> 0x8) & 0x1)) << 0x4;
R |= (((R >> 0x5) & 0x1) ^ ((R >> 0x8) & 0x1)) << 0x5;
R |= (((R >> 0x6) & 0x1) ^ ((R >> 0x8) & 0x1)) << 0x6;
R &= 0xFF;
}
return R & 0x1;
}
rc Function Call
for(int i_r = 0; i_r < 24; i_r++)
{
uint64_t RC = 0x0;
// TODO: Fix so the limit is not constant
for(int j = 0; j < 6; j++)
{
RC ^= (rc(j + 7 * i_r) << ((int) pow(2, j) - 1));
}
printf("%llu\n", RC);
}
Any help on this matter is much appreciated.
I made some random changes to the code and now it works. Here are the highlights:
The j loop needs to count from 0 to 6. That's because 2^6-1 = 63. So if j is never 6, then the output can never have the MSB set, i.e. an output of 0x8... is not possible.
Using the pow function is generally a bad idea for this type of application. double values have a nasty habit of being slightly lower than desired, e.g. 4 is actually 3.99999999999, which gets truncated to 3 when you convert it to an int. Doubtful that was happening in this case, but why risk it, since it's easy to just multiply variable shift by 2 on each pass through the loop.
The maximum value for t is 7*23+6 = 167, so the % 255 does nothing (at least with the value of i and t in this code). Also, there's no need to treat t == 0 as a special case. The loop won't run when t is 0, so the result is 0x1 by default.
Implementing a linear feedback shift register is quite simple in C. Each term in the polynomial corresponds to a single bit. x^8 is just 2^8 which is 0x100 and x^6 + x^5 + x^4 + 1 is 0x71. So whenever bit 0x100 is set, you XOR the result by 0x71.
Here's the updated code:
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
uint64_t rc(int t)
{
uint64_t result = 0x1;
for (int i = 1; i <= t; i++)
{
result <<= 1;
if (result & 0x100)
result ^= 0x71;
}
return result & 0x1;
}
int main(void)
{
for (int i = 0; i < 24; i++)
{
uint64_t result = 0x0;
uint64_t shift = 1;
for (int j = 0; j < 7; j++)
{
uint64_t value = rc(7*i + j);
result |= value << (shift - 1);
shift *= 2;
}
printf("0x%016" PRIx64 "\n", result);
}
}

Extract 14-bit values from an array of bytes in C

In an arbitrary-sized array of bytes in C, I want to store 14-bit numbers (0-16,383) tightly packed. In other words, in the sequence:
0000000000000100000000000001
there are two numbers that I wish to be able to arbitrarily store and retrieve into a 16-bit integer. (in this case, both of them are 1, but could be anything in the given range) If I were to have the functions uint16_t 14bitarr_get(unsigned char* arr, unsigned int index) and void 14bitarr_set(unsigned char* arr, unsigned int index, uint16_t value), how would I implement those functions?
This is not for a homework project, merely my own curiosity. I have a specific project that this would be used for, and it is the key/center of the entire project.
I do not want an array of structs that have 14-bit values in them, as that generates waste bits for every struct that is stored. I want to be able to tightly pack as many 14-bit values as I possibly can into an array of bytes. (e.g.: in a comment I made, putting as many 14-bit values into a chunk of 64 bytes is desirable, with no waste bits. the way those 64 bytes work is completely tightly packed for a specific use case, such that even a single bit of waste would take away the ability to store another 14 bit value)
Well, this is bit fiddling at its best. Doing it with an array of bytes makes it more complicated than it would be with larger elements because a single 14 bit quantity can span 3 bytes, where uint16_t or anything bigger would require no more than two. But I'll take you at your word that this is what you want (no pun intended). This code will actually work with the constant set to anything 8 or larger (but not over the size of an int; for that, additional type casts are needed). Of course the value type must be adjusted if larger than 16.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#define W 14
uint16_t arr_get(unsigned char* arr, size_t index) {
size_t bit_index = W * index;
size_t byte_index = bit_index / 8;
unsigned bit_in_byte_index = bit_index % 8;
uint16_t result = arr[byte_index] >> bit_in_byte_index;
for (unsigned n_bits = 8 - bit_in_byte_index; n_bits < W; n_bits += 8)
result |= arr[++byte_index] << n_bits;
return result & ~(~0u << W);
}
void arr_set(unsigned char* arr, size_t index, uint16_t value) {
size_t bit_index = W * index;
size_t byte_index = bit_index / 8;
unsigned bit_in_byte_index = bit_index % 8;
arr[byte_index] &= ~(0xff << bit_in_byte_index);
arr[byte_index++] |= value << bit_in_byte_index;
unsigned n_bits = 8 - bit_in_byte_index;
value >>= n_bits;
while (n_bits < W - 8) {
arr[byte_index++] = value;
value >>= 8;
n_bits += 8;
}
arr[byte_index] &= 0xff << (W - n_bits);
arr[byte_index] |= value;
}
int main(void) {
int mod = 1 << W;
int n = 50000;
unsigned x[n];
unsigned char b[2 * n];
for (int tries = 0; tries < 10000; tries++) {
for (int i = 0; i < n; i++) {
x[i] = rand() % mod;
arr_set(b, i, x[i]);
}
for (int i = 0; i < n; i++)
if (arr_get(b, i) != x[i])
printf("Err #%d: %d should be %d\n", i, arr_get(b, i), x[i]);
}
return 0;
}
Faster versions Since you said in comments that performance is an issue: open coding the loops gives a roughly 10% speed improvement on my machine on the little test driver included in the original. This includes random number generation and testing, so perhaps the primitives are 20% faster. I'm confident that 16- or 32-bit array elements would give further improvements because byte access is expensive:
uint16_t arr_get(unsigned char* a, size_t i) {
size_t ib = 14 * i;
size_t iy = ib / 8;
switch (ib % 8) {
case 0:
return (a[iy] | (a[iy+1] << 8)) & 0x3fff;
case 2:
return ((a[iy] >> 2) | (a[iy+1] << 6)) & 0x3fff;
case 4:
return ((a[iy] >> 4) | (a[iy+1] << 4) | (a[iy+2] << 12)) & 0x3fff;
}
return ((a[iy] >> 6) | (a[iy+1] << 2) | (a[iy+2] << 10)) & 0x3fff;
}
#define M(IB) (~0u << (IB))
#define SETLO(IY, IB, V) a[IY] = (a[IY] & M(IB)) | ((V) >> (14 - (IB)))
#define SETHI(IY, IB, V) a[IY] = (a[IY] & ~M(IB)) | ((V) << (IB))
void arr_set(unsigned char* a, size_t i, uint16_t val) {
size_t ib = 14 * i;
size_t iy = ib / 8;
switch (ib % 8) {
case 0:
a[iy] = val;
SETLO(iy+1, 6, val);
return;
case 2:
SETHI(iy, 2, val);
a[iy+1] = val >> 6;
return;
case 4:
SETHI(iy, 4, val);
a[iy+1] = val >> 4;
SETLO(iy+2, 2, val);
return;
}
SETHI(iy, 6, val);
a[iy+1] = val >> 2;
SETLO(iy+2, 4, val);
}
Another variation
This is quite a bit faster yet on my machine, about 20% better than above:
uint16_t arr_get2(unsigned char* a, size_t i) {
size_t ib = i * 14;
size_t iy = ib / 8;
unsigned buf = a[iy] | (a[iy+1] << 8) | (a[iy+2] << 16);
return (buf >> (ib % 8)) & 0x3fff;
}
void arr_set2(unsigned char* a, size_t i, unsigned val) {
size_t ib = i * 14;
size_t iy = ib / 8;
unsigned buf = a[iy] | (a[iy+1] << 8) | (a[iy+2] << 16);
unsigned io = ib % 8;
buf = (buf & ~(0x3fff << io)) | (val << io);
a[iy] = buf;
a[iy+1] = buf >> 8;
a[iy+2] = buf >> 16;
}
Note that for this code to be safe you should allocate one extra byte at the end of the packed array. It always reads and writes 3 bytes even when the desired 14 bits are in the first 2.
One more variation Finally, this runs just a bit slower than the one above (again on my machine; YMMV), but you don't need the extra byte. It uses one comparison per operation:
uint16_t arr_get2(unsigned char* a, size_t i) {
size_t ib = i * 14;
size_t iy = ib / 8;
unsigned io = ib % 8;
unsigned buf = ib % 8 <= 2
? a[iy] | (a[iy+1] << 8)
: a[iy] | (a[iy+1] << 8) | (a[iy+2] << 16);
return (buf >> io) & 0x3fff;
}
void arr_set2(unsigned char* a, size_t i, unsigned val) {
size_t ib = i * 14;
size_t iy = ib / 8;
unsigned io = ib % 8;
if (io <= 2) {
unsigned buf = a[iy] | (a[iy+1] << 8);
buf = (buf & ~(0x3fff << io)) | (val << io);
a[iy] = buf;
a[iy+1] = buf >> 8;
} else {
unsigned buf = a[iy] | (a[iy+1] << 8) | (a[iy+2] << 16);
buf = (buf & ~(0x3fff << io)) | (val << io);
a[iy] = buf;
a[iy+1] = buf >> 8;
a[iy+2] = buf >> 16;
}
}
The easiest solution is to use a struct of eight bitfields:
typedef struct __attribute__((__packed__)) EightValues {
uint16_t v0 : 14,
v1 : 14,
v2 : 14,
v3 : 14,
v4 : 14,
v5 : 14,
v6 : 14,
v7 : 14;
} EightValues;
This struct has a size of 14*8 = 112 bits, which is 14 bytes (seven uint16_t). Now, all you need is to use the last three bits of the array index to select the right bitfield:
uint16_t 14bitarr_get(unsigned char* arr, unsigned int index) {
EightValues* accessPointer = (EightValues*)arr;
accessPointer += index >> 3; //select the right structure in the array
switch(index & 7) { //use the last three bits of the index to access the right bitfield
case 0: return accessPointer->v0;
case 1: return accessPointer->v1;
case 2: return accessPointer->v2;
case 3: return accessPointer->v3;
case 4: return accessPointer->v4;
case 5: return accessPointer->v5;
case 6: return accessPointer->v6;
case 7: return accessPointer->v7;
}
}
Your compiler will do the bit-fiddling for you.
The Basis for Storage Issue
The biggest issue you are facing is the fundamental question of "What is my basis for storage going to be?" You know the basics, what you have available to you is char, short, int, etc... The smallest being 8-bits. No matter how you slice your storage scheme, it will ultimately have to rest in memory in a unit of memory based on this 8 bit per byte layout.
The only optimal, no bits wasted, memory allocation would be to declare an array of char in the least common multiple of 14-bits. It is the full 112-bits in this case (7-shorts or 14-chars). This may be the best option. Here, declaring an array of 7-shorts or 14-chars, would allow the exact storage of 8 14-bit values. Of course if you have no need for 8 of them, then it wouldn't be of much use anyway as it would waste more than the 4-bits lost on a single unsigned value.
Let me know if this is something you would like to further explore. If it is, I'm happy to help with the implementation.
Bitfield Struct
The comments regarding bitfield packing or bit packing are exactly what you need to do. This can involve a structure alone or in combination with a union, or by manually right/left shifting values directly as needed.
A short example applicable to your situation (if I understood correctly you want 2 14-bit areas in memory) would be:
#include <stdio.h>
typedef struct bitarr14 {
unsigned n1 : 14,
n2 : 14;
} bitarr14;
char *binstr (unsigned long n, size_t sz);
int main (void) {
bitarr14 mybitfield;
mybitfield.n1 = 1;
mybitfield.n2 = 1;
printf ("\n mybitfield in memory : %s\n\n",
binstr (*(unsigned *)&mybitfield, 28));
return 0;
}
char *binstr (unsigned long n, size_t sz)
{
static char s[64 + 1] = {0};
char *p = s + 64;
register size_t i = 0;
for (i = 0; i < sz; i++) {
p--;
*p = (n >> i & 1) ? '1' : '0';
}
return p;
}
Output
$ ./bin/bitfield14
mybitfield in memory : 0000000000000100000000000001
Note: the dereference of mybitfield for purposes of printing the value in memory breaks strict aliasing and it is intentional just for the purpose of the output example.
The beauty, and purpose for using a struct in the manner provided is it will allow direct access to each 14-bit part of the struct directly, without having to manually shift, etc.
Update - assuming you want big endian bit packing. This is code meant for a fixed size code word. It's based on code I've used for data compression algorithms. The switch case and fixed logic helps with performance.
typedef unsigned short uint16_t;
void bit14arr_set(unsigned char* arr, unsigned int index, uint16_t value)
{
unsigned int bitofs = (index*14)%8;
arr += (index*14)/8;
switch(bitofs){
case 0: /* bit offset == 0 */
*arr++ = (unsigned char)(value >> 6);
*arr &= 0x03;
*arr |= (unsigned char)(value << 2);
break;
case 2: /* bit offset == 2 */
*arr &= 0xc0;
*arr++ |= (unsigned char)(value >> 8);
*arr = (unsigned char)(value << 0);
break;
case 4: /* bit offset == 4 */
*arr &= 0xf0;
*arr++ |= (unsigned char)(value >> 10);
*arr++ = (unsigned char)(value >> 2);
*arr &= 0x3f;
*arr |= (unsigned char)(value << 6);
break;
case 6: /* bit offset == 6 */
*arr &= 0xfc;
*arr++ |= (unsigned char)(value >> 12);
*arr++ = (unsigned char)(value >> 4);
*arr &= 0x0f;
*arr |= (unsigned char)(value << 4);
break;
}
}
uint16_t bit14arr_get(unsigned char* arr, unsigned int index)
{
unsigned int bitofs = (index*14)%8;
unsigned short value;
arr += (index*14)/8;
switch(bitofs){
case 0: /* bit offset == 0 */
value = ((unsigned int)(*arr++) ) << 6;
value |= ((unsigned int)(*arr ) ) >> 2;
break;
case 2: /* bit offset == 2 */
value = ((unsigned int)(*arr++)&0x3f) << 8;
value |= ((unsigned int)(*arr ) ) >> 0;
break;
case 4: /* bit offset == 4 */
value = ((unsigned int)(*arr++)&0x0f) << 10;
value |= ((unsigned int)(*arr++) ) << 2;
value |= ((unsigned int)(*arr ) ) >> 6;
break;
case 6: /* bit offset == 6 */
value = ((unsigned int)(*arr++)&0x03) << 12;
value |= ((unsigned int)(*arr++) ) << 4;
value |= ((unsigned int)(*arr ) ) >> 4;
break;
}
return value;
}
Here's my version (updated to fix bugs):
#define PACKWID 14 // number of bits in packed number
#define PACKMSK ((1 << PACKWID) - 1)
#ifndef ARCHBYTEALIGN
#define ARCHBYTEALIGN 1 // align to 1=bytes, 2=words
#endif
#define ARCHBITALIGN (ARCHBYTEALIGN * 8)
typedef unsigned char byte;
typedef unsigned short u16;
typedef unsigned int u32;
typedef long long s64;
typedef u16 pcknum_t; // container for packed number
typedef u32 acc_t; // working accumulator
#ifndef ARYOFF
#define ARYOFF long
#endif
#define PRT(_val) ((unsigned long) _val)
typedef unsigned ARYOFF aryoff_t; // bit offset
// packary -- access array of packed numbers
// RETURNS: old value
extern inline pcknum_t
packary(byte *ary,aryoff_t idx,int setflg,pcknum_t newval)
// ary -- byte array pointer
// idx -- index into array (packed number relative)
// setflg -- 1=set new value, 0=just get old value
// newval -- new value to set (if setflg set)
{
aryoff_t absbitoff;
aryoff_t bytoff;
aryoff_t absbitlhs;
acc_t acc;
acc_t nval;
int shf;
acc_t curmsk;
pcknum_t oldval;
// get the absolute bit number for the given array index
absbitoff = idx * PACKWID;
// get the byte offset of the lowest byte containing the number
bytoff = absbitoff / ARCHBITALIGN;
// get absolute bit offset of first containing byte
absbitlhs = bytoff * ARCHBITALIGN;
// get amount we need to shift things by:
// (1) our accumulator
// (2) values to set/get
shf = absbitoff - absbitlhs;
#ifdef MODSHOW
do {
static int modshow;
if (modshow > 50)
break;
++modshow;
printf("packary: MODSHOW idx=%ld shf=%d bytoff=%ld absbitlhs=%ld absbitoff=%ld\n",
PRT(idx),shf,PRT(bytoff),PRT(absbitlhs),PRT(absbitoff));
} while (0);
#endif
// adjust array pointer to the portion we want (guaranteed to span)
ary += bytoff * ARCHBYTEALIGN;
// fetch the number + some other bits
acc = *(acc_t *) ary;
// get the old value
oldval = (acc >> shf) & PACKMSK;
// set the new value
if (setflg) {
// get shifted mask for packed number
curmsk = PACKMSK << shf;
// remove the old value
acc &= ~curmsk;
// ensure caller doesn't pass us a bad value
nval = newval;
#if 0
nval &= PACKMSK;
#endif
nval <<= shf;
// add in the value
acc |= nval;
*(acc_t *) ary = acc;
}
return oldval;
}
pcknum_t
int_get(byte *ary,aryoff_t idx)
{
return packary(ary,idx,0,0);
}
void
int_set(byte *ary,aryoff_t idx,pcknum_t newval)
{
packary(ary,idx,1,newval);
}
Here are benchmarks:
set: 354740751 7.095 -- gene
set: 203407176 4.068 -- rcgldr
set: 298946533 5.979 -- craig
get: 268574627 5.371 -- gene
get: 166839767 3.337 -- rcgldr
get: 207764612 4.155 -- craig

What's the best way to toggle the MSB?

So I want to toggle the most significant bit of my number. Here is an example:
x = 100101 then answer should be 00101
I have a 64 bit machine and hence I am not expecting the answer to be 100000..<51 0's>..100101
One way I thought of was to count the number of bits in my number and then toggle the MSB, but not sure on how to count.
The cheat is to pawn it off to the compiler: There are instructions in most CPUs for doing work like this.
The following should do what you want.
i ^ (1 << (sizeof i * CHAR_BIT - clz(i) - 1))
This will translate into the CLZ instruction, which counts the leading zeros.
For GCC, see: http://gcc.gnu.org/onlinedocs/gcc-4.1.2/gcc/Other-Builtins.html
One thing to be careful of is that this results in undefined behavior if i == 0.
You should replace clz() with the correct intrinsic for your compiler, In GCC this is __builtin_clz; in Visual Studio C++ this is _BitScanForward.
#jleahy has already posted a good option in case of using GCC, I would only leave here a generic implementation of clz which does not use any compiler intrinsics. However, it is not the optimal choice for CPUs which already have native instructions for counting bits (such as x86).
#define __bit_msb_mask(n) (~(~0x0ul >> (n))) /* n leftmost bits. */
/* Count leading zeroes. */
int clz(unsigned long x) {
int nr = 0;
int sh;
assert(x);
/* Hope that compiler optimizes out the sizeof check. */
if (sizeof(x) == 8) {
/* Suppress "shift count >= width of type" error in case
* when sizeof(x) is NOT 8, i.e. when it is a dead code anyway. */
sh = !(x & __bit_msb_mask(sizeof(x)*8/2)) << 5;
nr += sh; x <<= sh;
}
sh = !(x & __bit_msb_mask(1 << 4)) << 4; nr += sh; x <<= sh;
sh = !(x & __bit_msb_mask(1 << 3)) << 3; nr += sh; x <<= sh;
sh = !(x & __bit_msb_mask(1 << 2)) << 2; nr += sh; x <<= sh;
sh = !(x & __bit_msb_mask(1 << 1)) << 1; nr += sh; x <<= sh;
sh = !(x & __bit_msb_mask(1 << 0)) << 0; nr += sh;
return nr;
}
Using this function one can toggle the most significant set bit (assuming there is such one) as follows:
x ^= 1ul << (sizeof(x)*8 - clz(x))
Here's an approach using a lookup table, assuming CHAR_BIT == 8:
uint32_t toggle_msb(uint32_t n)
{
static unsigned char const lookup[] =
{ 1, 0, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 };
for (unsigned int i = 0; i != sizeof n; ++i)
{
// omit the last bit for big-endian machines: ---VVVVVVVVVVVVVVVVVV
unsigned char * p
= reinterpret_cast<unsigned char *>(&n) + sizeof n - i - 1;
if (*p / 16 != 0) { *p = *p % 16 + (lookup[*p / 16] * 16); return n; }
if (*p % 16 != 0) { *p = 16 * (*p / 16) + lookup[*p % 16]; return n; }
}
return 1;
}
And to just put it all together in some sample code for GCC:
#include <stdio.h>
#define clz(x) __builtin_clz(x)
int main()
{
int i = 411; /* 110011011 */
if( i != 0 )
i ^= (1 << (sizeof(i)*8 - clz(i)-1));
/* i is now 10011011 */
printf("i = %d\n", i);
return(0);
}

how to make a bit-set/byte-array conversion in c

Given an array,
unsigned char q[32]="1100111...",
how can I generate a 4-bytes bit-set, unsigned char p[4], such that, the bit of this bit-set, equals to value inside the array, e.g., the first byte p[0]= "q[0] ... q[7]"; 2nd byte p[1]="q[8] ... q[15]", etc.
and also how to do it in opposite, i.e., given bit-set, generate the array?
my own trial out for the first part.
unsigned char p[4]={0};
for (int j=0; j<N; j++)
{
if (q[j] == '1')
{
p [j / 8] |= 1 << (7-(j % 8));
}
}
Is the above right? any conditions to check? Is there any better way?
EDIT - 1
I wonder if above is efficient way? As the array size could be upto 4096 or even more.
First, Use strtoul to get a 32-bit value. Then convert the byte order to big-endian with htonl. Finally, store the result in your array:
#include <arpa/inet.h>
#include <stdlib.h>
/* ... */
unsigned char q[32] = "1100111...";
unsigned char result[4] = {0};
*(unsigned long*)result = htonl(strtoul(q, NULL, 2));
There are other ways as well.
But I lack <arpa/inet.h>!
Then you need to know what byte order your platform is. If it's big endian, then htonl does nothing and can be omitted. If it's little-endian, then htonl is just:
unsigned long htonl(unsigned long x)
{
x = (x & 0xFF00FF00) >> 8) | (x & 0x00FF00FF) << 8);
x = (x & 0xFFFF0000) >> 16) | (x & 0x0000FFFF) << 16);
return x;
}
If you're lucky, your optimizer might see what you're doing and make it into efficient code. If not, well, at least it's all implementable in registers and O(log N).
If you don't know what byte order your platform is, then you need to detect it:
typedef union {
char c[sizeof(int) / sizeof(char)];
int i;
} OrderTest;
unsigned long htonl(unsigned long x)
{
OrderTest test;
test.i = 1;
if(!test.c[0])
return x;
x = (x & 0xFF00FF00) >> 8) | (x & 0x00FF00FF) << 8);
x = (x & 0xFFFF0000) >> 16) | (x & 0x0000FFFF) << 16);
return x;
}
Maybe long is 8 bytes!
Well, the OP implied 4-byte inputs with their array size, but 8-byte long is doable:
#define kCharsPerLong (sizeof(long) / sizeof(char))
unsigned char q[8 * kCharsPerLong] = "1100111...";
unsigned char result[kCharsPerLong] = {0};
*(unsigned long*)result = htonl(strtoul(q, NULL, 2));
unsigned long htonl(unsigned long x)
{
#if kCharsPerLong == 4
x = (x & 0xFF00FF00UL) >> 8) | (x & 0x00FF00FFUL) << 8);
x = (x & 0xFFFF0000UL) >> 16) | (x & 0x0000FFFFUL) << 16);
#elif kCharsPerLong == 8
x = (x & 0xFF00FF00FF00FF00UL) >> 8) | (x & 0x00FF00FF00FF00FFUL) << 8);
x = (x & 0xFFFF0000FFFF0000UL) >> 16) | (x & 0x0000FFFF0000FFFFUL) << 16);
x = (x & 0xFFFFFFFF00000000UL) >> 32) | (x & 0x00000000FFFFFFFFUL) << 32);
#else
#error Unsupported word size.
#endif
return x;
}
For char that isn't 8 bits (DSPs like to do this), you're on your own. (This is why it was a Big Deal when the SHARC series of DSPs had 8-bit bytes; it made it a LOT easier to port existing code because, face it, C does a horrible job of portability support.)
What about arbitrary length buffers? No funny pointer typecasts, please.
The main thing that can be improved with the OP's version is to rethink the loop's internals. Instead of thinking of the output bytes as a fixed data register, think of it as a shift register, where each successive bit is shifted into the right (LSB) end. This will save you from all those divisions and mods (which, hopefully, are optimized away to bit shifts).
For sanity, I'm ditching unsigned char for uint8_t.
#include <stdint.h>
unsigned StringToBits(const char* inChars, uint8_t* outBytes, size_t numBytes,
size_t* bytesRead)
/* Converts the string of '1' and '0' characters in `inChars` to a buffer of
* bytes in `outBytes`. `numBytes` is the number of available bytes in the
* `outBytes` buffer. On exit, if `bytesRead` is not NULL, the value it points
* to is set to the number of bytes read (rounding up to the nearest full
* byte). If a multiple of 8 bits is not read, the last byte written will be
* padded with 0 bits to reach a multiple of 8 bits. This function returns the
* number of padding bits that were added. For example, an input of 11 bits
* will result `bytesRead` being set to 2 and the function will return 5. This
* means that if a nonzero value is returned, then a partial byte was read,
* which may be an error.
*/
{ size_t bytes = 0;
unsigned bits = 0;
uint8_t x = 0;
while(bytes < numBytes)
{ /* Parse a character. */
switch(*inChars++)
{ '0': x <<= 1; ++bits; break;
'1': x = (x << 1) | 1; ++bits; break;
default: numBytes = 0;
}
/* See if we filled a byte. */
if(bits == 8)
{ outBytes[bytes++] = x;
x = 0;
bits = 0;
}
}
/* Padding, if needed. */
if(bits)
{ bits = 8 - bits;
outBytes[bytes++] = x << bits;
}
/* Finish up. */
if(bytesRead)
*bytesRead = bytes;
return bits;
}
It's your responsibility to make sure inChars is null-terminated. The function will return on the first non-'0' or '1' character it sees or if it runs out of output buffer. Some example usage:
unsigned char q[32] = "1100111...";
uint8_t buf[4];
size_t bytesRead = 5;
if(StringToBits(q, buf, 4, &bytesRead) || bytesRead != 4)
{
/* Partial read; handle error here. */
}
This just reads 4 bytes, and traps the error if it can't.
unsigned char q[4096] = "1100111...";
uint8_t buf[512];
StringToBits(q, buf, 512, NULL);
This just converts what it can and sets the rest to 0 bits.
This function could be done better if C had the ability to break out of more than one level of loop or switch; as it stands, I'd have to add a flag value to get the same effect, which is clutter, or I'd have to add a goto, which I simply refuse.
I don't think that will quite work. You are comparing each "bit" to 1 when it should really be '1'. You can also make it a bit more efficient by getting rid of the if:
unsigned char p[4]={0};
for (int j=0; j<32; j++)
{
p [j / 8] |= (q[j] == `1`) << (7-(j % 8));
}
Going in reverse is pretty simple too. Just mask for each "bit" that you set earlier.
unsigned char q[32]={0};
for (int j=0; j<32; j++) {
q[j] = p[j / 8] & ( 1 << (7-(j % 8)) ) + '0';
}
You'll notice the creative use of (boolean) + '0' to convert between 1/0 and '1'/'0'.
According to your example it does not look like you are going for readability, and after a (late) refresh my solution looks very similar to Chriszuma except for the lack of parenthesis due to order of operations and the addition of the !! to enforce a 0 or 1.
const size_t N = 32; //N must be a multiple of 8
unsigned char q[N+1] = "11011101001001101001111110000111";
unsigned char p[N/8] = {0};
unsigned char r[N+1] = {0}; //reversed
for(size_t i = 0; i < N; ++i)
p[i / 8] |= (q[i] == '1') << 7 - i % 8;
for(size_t i = 0; i < N; ++i)
r[i] = '0' + !!(p[i / 8] & 1 << 7 - i % 8);
printf("%x %x %x %x\n", p[0], p[1], p[2], p[3]);
printf("%s\n%s\n", q,r);
If you are looking for extreme efficiency, try to use the following techniques:
Replace if by subtraction of '0' (seems like you can assume your input symbols can be only 0 or 1).
Also process the input from lower indices to higher ones.
for (int c = 0; c < N; c += 8)
{
int y = 0;
for (int b = 0; b < 8; ++b)
y = y * 2 + q[c + b] - '0';
p[c / 8] = y;
}
Replace array indices by auto-incrementing pointers:
const char* qptr = q;
unsigned char* pptr = p;
for (int c = 0; c < N; c += 8)
{
int y = 0;
for (int b = 0; b < 8; ++b)
y = y * 2 + *qptr++ - '0';
*pptr++ = y;
}
Unroll the inner loop:
const char* qptr = q;
unsigned char* pptr = p;
for (int c = 0; c < N; c += 8)
{
*pptr++ =
qptr[0] - '0' << 7 |
qptr[1] - '0' << 6 |
qptr[2] - '0' << 5 |
qptr[3] - '0' << 4 |
qptr[4] - '0' << 3 |
qptr[5] - '0' << 2 |
qptr[6] - '0' << 1 |
qptr[7] - '0' << 0;
qptr += 8;
}
Process several input characters simultaneously (using bit twiddling hacks or MMX instructions) - this has great speedup potential!

What is a better method for packing 4 bytes into 3 than this?

I have an array of values all well within the range 0 - 63, and decided I could pack every 4 bytes into 3 because the values only require 6 bits and I could use the extra 2bits to store the first 2 bits of the next value and so on.
Having never done this before I used the switch statement and a nextbit variable (a state machine like device) to do the packing and keep track of the starting bit. I'm convinced however, there must be a better way.
Suggestions/clues please, but don't ruin my fun ;-)
Any portability problems regarding big/little endian?
btw: I have verified this code is working, by unpacking it again and comparing with the input. And no it ain't homework, just an exercise I've set myself.
/* build with gcc -std=c99 -Wconversion */
#define ASZ 400
typedef unsigned char uc_;
uc_ data[ASZ];
int i;
for (i = 0; i < ASZ; ++i) {
data[i] = (uc_)(i % 0x40);
}
size_t dl = sizeof(data);
printf("sizeof(data):%z\n",dl);
float fpl = ((float)dl / 4.0f) * 3.0f;
size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
printf("length of packed data:%z\n",pl);
for (i = 0; i < dl; ++i)
printf("%02d ", data[i]);
printf("\n");
uc_ * packeddata = calloc(pl, sizeof(uc_));
uc_ * byte = packeddata;
uc_ nextbit = 1;
for (int i = 0; i < dl; ++i) {
uc_ m = (uc_)(data[i] & 0x3f);
switch(nextbit) {
case 1:
/* all 6 bits of m into first 6 bits of byte: */
*byte = m;
nextbit = 7;
break;
case 3:
/* all 6 bits of m into last 6 bits of byte: */
*byte++ = (uc_)(*byte | (m << 2));
nextbit = 1;
break;
case 5:
/* 1st 4 bits of m into last 4 bits of byte: */
*byte++ = (uc_)(*byte | ((m & 0x0f) << 4));
/* 5th and 6th bits of m into 1st and 2nd bits of byte: */
*byte = (uc_)(*byte | ((m & 0x30) >> 4));
nextbit = 3;
break;
case 7:
/* 1st 2 bits of m into last 2 bits of byte: */
*byte++ = (uc_)(*byte | ((m & 0x03) << 6));
/* next (last) 4 bits of m into 1st 4 bits of byte: */
*byte = (uc_)((m & 0x3c) >> 2);
nextbit = 5;
break;
}
}
So, this is kinda like code-golf, right?
#include <stdlib.h>
#include <string.h>
static void pack2(unsigned char *r, unsigned char *n) {
unsigned v = n[0] + (n[1] << 6) + (n[2] << 12) + (n[3] << 18);
*r++ = v;
*r++ = v >> 8;
*r++ = v >> 16;
}
unsigned char *apack(const unsigned char *s, int len) {
unsigned char *s_end = s + len,
*r, *result = malloc(len/4*3+3),
lastones[4] = { 0 };
if (result == NULL)
return NULL;
for(r = result; s + 4 <= s_end; s += 4, r += 3)
pack2(r, s);
memcpy(lastones, s, s_end - s);
pack2(r, lastones);
return result;
}
Check out the IETF RFC 4648 for 'The Base16, Base32 and Base64 Data Encodings'.
Partial code critique:
size_t dl = sizeof(data);
printf("sizeof(data):%d\n",dl);
float fpl = ((float)dl / 4.0f) * 3.0f;
size_t pl = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
printf("length of packed data:%d\n",pl);
Don't use the floating point stuff - just use integers. And use '%z' to print 'size_t' values - assuming you've got a C99 library.
size_t pl = ((dl + 3) / 4) * 3;
I think your loop could be simplified by dealing with 3-byte input units until you've got a partial unit left over, and then dealing with a remainder of 1 or 2 bytes as special cases. I note that the standard referenced says that you use one or two '=' signs to pad at the end.
I have a Base64 encoder and decode which does some of that. You are describing the 'decode' part of Base64 -- where the Base64 code has 4 bytes of data that should be stored in just 3 - as your packing code. The Base64 encoder corresponds to the unpacker you will need.
Base-64 Decoder
Note: base_64_inv is an array of 256 values, one for each possible input byte value; it defines the correct decoded value for each encoded byte. In the Base64 encoding, this is a sparse array - 3/4 zeroes. Similarly, base_64_map is the mapping between a value 0..63 and the corresponding storage value.
enum { DC_PAD = -1, DC_ERR = -2 };
static int decode_b64(int c)
{
int b64 = base_64_inv[c];
if (c == base64_pad)
b64 = DC_PAD;
else if (b64 == 0 && c != base_64_map[0])
b64 = DC_ERR;
return(b64);
}
/* Decode 4 bytes into 3 */
static int decode_quad(const char *b64_data, char *bin_data)
{
int b0 = decode_b64(b64_data[0]);
int b1 = decode_b64(b64_data[1]);
int b2 = decode_b64(b64_data[2]);
int b3 = decode_b64(b64_data[3]);
int bytes;
if (b0 < 0 || b1 < 0 || b2 == DC_ERR || b3 == DC_ERR || (b2 == DC_PAD && b3 != DC_PAD))
return(B64_ERR_INVALID_ENCODED_DATA);
if (b2 == DC_PAD && (b1 & 0x0F) != 0)
/* 3rd byte is '='; 2nd byte must end with 4 zero bits */
return(B64_ERR_INVALID_TRAILING_BYTE);
if (b2 >= 0 && b3 == DC_PAD && (b2 & 0x03) != 0)
/* 4th byte is '='; 3rd byte is not '=' and must end with 2 zero bits */
return(B64_ERR_INVALID_TRAILING_BYTE);
bin_data[0] = (b0 << 2) | (b1 >> 4);
bytes = 1;
if (b2 >= 0)
{
bin_data[1] = ((b1 & 0x0F) << 4) | (b2 >> 2);
bytes = 2;
}
if (b3 >= 0)
{
bin_data[2] = ((b2 & 0x03) << 6) | (b3);
bytes = 3;
}
return(bytes);
}
/* Decode input Base-64 string to original data. Output length returned, or negative error */
int base64_decode(const char *data, size_t datalen, char *buffer, size_t buflen)
{
size_t outlen = 0;
if (datalen % 4 != 0)
return(B64_ERR_INVALID_ENCODED_LENGTH);
if (BASE64_DECLENGTH(datalen) > buflen)
return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL);
while (datalen >= 4)
{
int nbytes = decode_quad(data, buffer + outlen);
if (nbytes < 0)
return(nbytes);
outlen += nbytes;
data += 4;
datalen -= 4;
}
assert(datalen == 0); /* By virtue of the %4 check earlier */
return(outlen);
}
Base-64 Encoder
/* Encode 3 bytes of data into 4 */
static void encode_triplet(const char *triplet, char *quad)
{
quad[0] = base_64_map[(triplet[0] >> 2) & 0x3F];
quad[1] = base_64_map[((triplet[0] & 0x03) << 4) | ((triplet[1] >> 4) & 0x0F)];
quad[2] = base_64_map[((triplet[1] & 0x0F) << 2) | ((triplet[2] >> 6) & 0x03)];
quad[3] = base_64_map[triplet[2] & 0x3F];
}
/* Encode 2 bytes of data into 4 */
static void encode_doublet(const char *doublet, char *quad, char pad)
{
quad[0] = base_64_map[(doublet[0] >> 2) & 0x3F];
quad[1] = base_64_map[((doublet[0] & 0x03) << 4) | ((doublet[1] >> 4) & 0x0F)];
quad[2] = base_64_map[((doublet[1] & 0x0F) << 2)];
quad[3] = pad;
}
/* Encode 1 byte of data into 4 */
static void encode_singlet(const char *singlet, char *quad, char pad)
{
quad[0] = base_64_map[(singlet[0] >> 2) & 0x3F];
quad[1] = base_64_map[((singlet[0] & 0x03) << 4)];
quad[2] = pad;
quad[3] = pad;
}
/* Encode input data as Base-64 string. Output length returned, or negative error */
static int base64_encode_internal(const char *data, size_t datalen, char *buffer, size_t buflen, char pad)
{
size_t outlen = BASE64_ENCLENGTH(datalen);
const char *bin_data = (const void *)data;
char *b64_data = (void *)buffer;
if (outlen > buflen)
return(B64_ERR_OUTPUT_BUFFER_TOO_SMALL);
while (datalen >= 3)
{
encode_triplet(bin_data, b64_data);
bin_data += 3;
b64_data += 4;
datalen -= 3;
}
b64_data[0] = '\0';
if (datalen == 2)
encode_doublet(bin_data, b64_data, pad);
else if (datalen == 1)
encode_singlet(bin_data, b64_data, pad);
b64_data[4] = '\0';
return((b64_data - buffer) + strlen(b64_data));
}
I complicate life by having to deal with a product that uses a variant alphabet for the Base64 encoding, and also manages not to pad data - hence the 'pad' argument (which can be zero for 'null padding' or '=' for standard padding. The 'base_64_map' array contains the alphabet to use for 6-bit values in the range 0..63.
Another simpler way to do it would be to use bit fields. One of the lesser known corners of C struct syntax is the big field. Let's say you have the following structure:
struct packed_bytes {
byte chunk1 : 6;
byte chunk2 : 6;
byte chunk3 : 6;
byte chunk4 : 6;
};
This declares chunk1, chunk2, chunk3, and chunk4 to have the type byte but to only take up 6 bits in the structure. The result is that sizeof(struct packed_bytes) == 3. Now all you need is a little function to take your array and dump it into the structure like so:
void
dump_to_struct(byte *in, struct packed_bytes *out, int count)
{
int i, j;
for (i = 0; i < (count / 4); ++i) {
out[i].chunk1 = in[i * 4];
out[i].chunk2 = in[i * 4 + 1];
out[i].chunk3 = in[i * 4 + 2];
out[i].chunk4 = in[i * 4 + 3];
}
// Finish up
switch(struct % 4) {
case 3:
out[count / 4].chunk3 = in[(count / 4) * 4 + 2];
case 2:
out[count / 4].chunk2 = in[(count / 4) * 4 + 1];
case 1:
out[count / 4].chunk1 = in[(count / 4) * 4];
}
}
There you go, you now have an array of struct packed_bytes that you can easily read by using the above struct.
Instead of using a statemachine you can simply use a counter for how many bits are already used in the current byte, from which you can directly derive the shift-offsets and whether or not you overflow into the next byte.
Regarding the endianess: As long as you use only a single datatype (that is you don't reinterpret pointer to types of different size (e.g. int* a =...;short* b=(short*) a;) you shouldn't get problems with endianess in most cases
Taking elements of DigitalRoss's compact code, Grizzly's suggestion, and my own code, I have written my own answer at last. Although DigitalRoss provides a usable working answer, my usage of it without understanding, would not have provided the same satisfaction as to learning something. For this reason I have chosen to base my answer on my original code.
I have also chosen to ignore the advice Jonathon Leffler gives to avoid using floating point arithmetic for the calculation of the packed data length. Both the recommended method given - the same DigitalRoss also uses, increases the length of the packed data by as much as three bytes. Granted this is not much, but is also avoidable by the use of floating point math.
Here is the code, criticisms welcome:
/* built with gcc -std=c99 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
unsigned char *
pack(const unsigned char * data, size_t len, size_t * packedlen)
{
float fpl = ((float)len / 4.0f) * 3.0f;
*packedlen = (size_t)(fpl > (float)((int)fpl) ? fpl + 1 : fpl);
unsigned char * packed = malloc(*packedlen);
if (!packed)
return 0;
const unsigned char * in = data;
const unsigned char * in_end = in + len;
unsigned char * out;
for (out = packed; in + 4 <= in_end; in += 4) {
*out++ = in[0] | ((in[1] & 0x03) << 6);
*out++ = ((in[1] & 0x3c) >> 2) | ((in[2] & 0x0f) << 4);
*out++ = ((in[2] & 0x30) >> 4) | (in[3] << 2);
}
size_t lastlen = in_end - in;
if (lastlen > 0) {
*out = in[0];
if (lastlen > 1) {
*out++ |= ((in[1] & 0x03) << 6);
*out = ((in[1] & 0x3c) >> 2);
if (lastlen > 2) {
*out++ |= ((in[2] & 0x0f) << 4);
*out = ((in[2] & 0x30) >> 4);
if (lastlen > 3)
*out |= (in[3] << 2);
}
}
}
return packed;
}
int main()
{
size_t i;
unsigned char data[] = {
12, 15, 40, 18,
26, 32, 50, 3,
7, 19, 46, 10,
25, 37, 2, 39,
60, 59, 0, 17,
9, 29, 13, 54,
5, 6, 47, 32
};
size_t datalen = sizeof(data);
printf("unpacked datalen: %td\nunpacked data\n", datalen);
for (i = 0; i < datalen; ++i)
printf("%02d ", data[i]);
printf("\n");
size_t packedlen;
unsigned char * packed = pack(data, sizeof(data), &packedlen);
if (!packed) {
fprintf(stderr, "Packing failed!\n");
return EXIT_FAILURE;
}
printf("packedlen: %td\npacked data\n", packedlen);
for (i = 0; i < packedlen; ++i)
printf("0x%02x ", packed[i]);
printf("\n");
free(packed);
return EXIT_SUCCESS;
}

Resources