Related
I'm reading the values from a SD card in an ARM micro:
Res = f_read(&fil, (void*)buf, 6, &NumBytesRead);
where fil is a pointer, buf is a buffer where the data is stored.
And that's the problem: it's an array but I'd like to have the contents of that array in a single variable.
To give an actual example: the 6 bytes read from the file are:
buf[0] = 0x1B
buf[1] = 0x26
buf[2] = 0xB3
buf[3] = 0x54
buf[4] = 0xA1
buf[5] = 0xCF
And I'd like to have: uint64_t data be equal to 0x1B26B354A1CF. That is, all the elements of the array "concatenated" in one single 64 bit integer.
Without type punning you can do as below.
uint64_t data = 0;
for (int i=0; i<6; i++)
{
data <<= 8;
data |= (uint64_t) buf[i];
}
Use union but remember about the endianes.
union
{
uint8_t u8[8];
uint64_t u64;
}u64;
typedef union
{
uint8_t u8[8];
uint64_t u64;
}u64;
typedef enum
{
LITTLE_E,
BIG_E,
}ENDIANESS;
ENDIANESS checkEndianess(void)
{
ENDIANESS result = BIG_E;
u64 d64 = {.u64 = 0xff};
if(d64.u8[0]) result = LITTLE_E;
return result;
}
uint64_t arrayToU64(uint8_t *array, ENDIANESS e) // for the array BE
{
u64 d64;
if(e == LITTLE_E)
{
memmove(&d64, array, sizeof(d64.u64));
}
else
{
for(int index = sizeof(d64.u64) - 1; index >= 0; index--)
{
d64.u8[sizeof(d64.u64) - index - 1] = array[index];
}
}
return d64.u64;
}
int main()
{
uint8_t BIG_E_Array[] = {0x10,0x20,0x30,0x40,0x50,0x60,0x70,0x80};
ENDIANESS e;
printf("This system endianess: %s\n", (e = checkEndianess()) == BIG_E ? "BIG":"LITTLE");
printf("Punned uint64_t for our system 0x%lx\n", arrayToU64(BIG_E_Array, e));
printf("Punned uint64_t for the opposite endianess system 0x%lx\n", arrayToU64(BIG_E_Array, e == BIG_E ? LITTLE_E : BIG_E));
return 0;
}
To things to take care of here:
have the bytes be ordered correctly
read the six bytes into one 64bit integer
Issue 1 can be taken care of by storing the byte coming in in network byte order (Big Endian) into the 64 bit integer in host byte order by for example using the two marcos below:
/* below defines of htonll() and ntohll() are taken from this answer:
https://stackoverflow.com/a/28592202/694576
*/
#if __BIG_ENDIAN__
# define htonll(x) (x)
# define ntohll(x) (x)
#else
# define htonll(x) ((uint64_t)htonl((x) & 0xFFFFFFFF) << 32) | htonl((x) >> 32))
# define ntohll(x) ((uint64_t)ntohl((x) & 0xFFFFFFFF) << 32) | ntohl((x) >> 32))
#endif
Issue 2 can be solved in multiple ways:
Extending your approach
#define BUFFER_SIZE (6)
...
assert(BUFFER_SIZE <= sizeof (uint64_t));
uint8_t buffer[BUFFER_SIZE];
FILE * pf = ...; /* open file here */
/* test if file has been opened successfully here */
... result = f_read(pf, buffer, BUFFER_SIZE, ...);
/* test result for success */
uint64_t number = 0;
memset(&number, buffer, BUFFER_SIZE)
number = ntohll(number);
Use "Type Punning" by using a union
union buffer_wrapper
{
uint8_t u8[sizeof (uint64_t)];
uint64_t u64;
}
Instead of
uint8_t buffer[BUFFER_SIZE];
use
union buffer_wrapper buffer;
and instead of
memcpy(&number, buffer, BUFFER_SIZE)
number = ntohll(number)
use
number = ntohll(buffer.u64)
I consider how to make efficient XORing of 2 bytes arrays.
I have this bytes arrays defined as unsigned char *
I think that XORing them as uint64_t will be much faster. Is it true?
How efficiently convert unsigned char * to this uint64_t * preferably inside the XORing loop? How to make padding of last bytes if length of the bytes array % 8 isn't 0?
Here is my current code that XORs bytes array, but each byte (unsigned char) separately:
unsigned char *bitwise_xor(const unsigned char *A_Bytes_Array, const unsigned char *B_Bytes_Array, const size_t length) {
unsigned char *XOR_Bytes_Array;
// allocate XORed bytes array
XOR_Bytes_Array = malloc(sizeof(unsigned char) * length);
// perform bitwise XOR operation on bytes arrays A and B
for(int i=0; i < length; i++)
XOR_Bytes_Array[i] = (unsigned char)(A_Bytes_Array[i] ^ B_Bytes_Array[i]);
return XOR_Bytes_Array;
}
Ok, in the meantime I have tried to do it this way. My bytes_array are rather large (rgba bitmaps 4*1440*900?).
static uint64_t next64bitsFromBytesArray(const unsigned char *bytesArray, const int i) {
uint64_t next64bits = (uint64_t) bytesArray[i+7] | ((uint64_t) bytesArray[i+6] << 8) | ((uint64_t) bytesArray[i+5] << 16) | ((uint64_t) bytesArray[i+4] << 24) | ((uint64_t) bytesArray[i+3] << 32) | ((uint64_t) bytesArray[i+2] << 40) | ((uint64_t) bytesArray[i+1] << 48) | ((uint64_t)bytesArray[i] << 56);
return next64bits;
}
unsigned char *bitwise_xor64(const unsigned char *A_Bytes_Array, const unsigned char *B_Bytes_Array, const size_t length) {
unsigned char *XOR_Bytes_Array;
// allocate XORed bytes array
XOR_Bytes_Array = malloc(sizeof(unsigned char) * length);
// perform bitwise XOR operation on bytes arrays A and B using uint64_t
for(int i=0; i<length; i+=8) {
uint64_t A_Bytes = next64bitsFromBytesArray(A_Bytes_Array, i);
uint64_t B_Bytes = next64bitsFromBytesArray(B_Bytes_Array, i);
uint64_t XOR_Bytes = A_Bytes ^ B_Bytes;
memcpy(XOR_Bytes_Array + i, &XOR_Bytes, 8);
}
return XOR_Bytes_Array;
}
UPDATE: (2nd approach to this problem)
unsigned char *bitwise_xor64(const unsigned char *A_Bytes_Array, const unsigned char *B_Bytes_Array, const size_t length) {
const uint64_t *aBytes = (const uint64_t *) A_Bytes_Array;
const uint64_t *bBytes = (const uint64_t *) B_Bytes_Array;
unsigned char *xorBytes = malloc(sizeof(unsigned char)*length);
for(int i = 0, j=0; i < length; i +=8) {
uint64_t aXORbBytes = aBytes[j] ^ bBytes[j];
//printf("a XOR b = 0x%" PRIx64 "\n", aXORbBytes);
memcpy(xorBytes + i, &aXORbBytes, 8);
j++;
}
return xorBytes;
}
So I did an experiment:
#include <stdlib.h>
#include <stdint.h>
#ifndef TYPE
#define TYPE uint64_t
#endif
TYPE *
xor(const void *va, const void *vb, size_t l)
{
const TYPE *a = va;
const TYPE *b = vb;
TYPE *r = malloc(l);
size_t i;
for (i = 0; i < l / sizeof(TYPE); i++) {
*r++ = *a++ ^ *b++;
}
return r;
}
Compiled both for uint64_t and uint8_t with clang with basic optimizations. In both cases the compiler vectorized the hell out of this. The difference was that the uint8_t version had code to handle when l wasn't a multiple of 8. So if we add code to handle the size not being a multiple of 8, you'll probably end up with equivalent generated code. Also, the 64 bit version unrolled the loop a few times and had code to handle that, so for big enough arrays you might gain a few percent here. On the other hand, on big enough arrays you'll be memory-bound and the xor operation won't matter a bit.
Are you sure your compiler won't deal with this? This is a kind of micro-optimization that makes sense only when you're measuring things and then you wouldn't need to ask which one is faster, you'd know.
I've been developing a cryptographic algorithm on the GPU and currently stuck with an algorithm to perform large integer addition. Large integers are represented in a usual way as a bunch of 32-bit words.
For example, we can use one thread to add two 32-bit words. For simplicity, let assume
that the numbers to be added are of the same length and number of threads per block == number of words. Then:
__global__ void add_kernel(int *C, const int *A, const int *B) {
int x = A[threadIdx.x];
int y = B[threadIdx.x];
int z = x + y;
int carry = (z < x);
/** do carry propagation in parallel somehow ? */
............
z = z + newcarry; // update the resulting words after carry propagation
C[threadIdx.x] = z;
}
I am pretty sure that there is a way to do carry propagation via some tricky reduction procedure but could not figure it out..
I had a look at CUDA thrust extensions but big integer package seems not to be implemented yet.
Perhaps someone can give me a hint how to do that on CUDA ?
You are right, carry propagation can be done via prefix sum computation but it's a bit tricky to define the binary function for this operation and prove that it is associative (needed for parallel prefix sum). As a matter of fact, this algorithm is used (theoretically) in Carry-lookahead adder.
Suppose we have two large integers a[0..n-1] and b[0..n-1].
Then we compute (i = 0..n-1):
s[i] = a[i] + b[i]l;
carryin[i] = (s[i] < a[i]);
We define two functions:
generate[i] = carryin[i];
propagate[i] = (s[i] == 0xffffffff);
with quite intuitive meaning: generate[i] == 1 means that the carry is generated at
position i while propagate[i] == 1 means that the carry will be propagated from position
(i - 1) to (i + 1). Our goal is to compute the function carryout[0..n-1] used to update the resulting sum s[0..n-1]. carryout can be computed recursively as follows:
carryout[i] = generate[i] OR (propagate[i] AND carryout[i-1])
carryout[0] = 0
Here carryout[i] == 1 if carry is generated at position i OR it is generated sometimes earlier AND propagated to position i. Finally, we update the resulting sum:
s[i] = s[i] + carryout[i-1]; for i = 1..n-1
carry = carryout[n-1];
Now it is quite straightforward to prove that carryout function is indeed binary associative and hence parallel prefix sum computation applies. To implement this on CUDA, we can merge both flags 'generate' and 'propagate' in a single variable since they are mutually exclusive, i.e.:
cy[i] = (s[i] == -1u ? -1u : 0) | carryin[i];
In other words,
cy[i] = 0xffffffff if propagate[i]
cy[i] = 1 if generate[i]
cy[u] = 0 otherwise
Then, one can verify that the following formula computes prefix sum for carryout function:
cy[i] = max((int)cy[i], (int)cy[k]) & cy[i];
for all k < i. The example code below shows large addition for 2048-word integers. Here I used CUDA blocks with 512 threads:
// add & output carry flag
#define UADDO(c, a, b) \
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// add with carry & output carry flag
#define UADDC(c, a, b) \
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define WS 32
__global__ void bignum_add(unsigned *g_R, const unsigned *g_A,const unsigned *g_B) {
extern __shared__ unsigned shared[];
unsigned *r = shared;
const unsigned N_THIDS = 512;
unsigned thid = threadIdx.x, thid_in_warp = thid & WS-1;
unsigned ofs, cf;
uint4 a = ((const uint4 *)g_A)[thid],
b = ((const uint4 *)g_B)[thid];
UADDO(a.x, a.x, b.x) // adding 128-bit chunks with carry flag
UADDC(a.y, a.y, b.y)
UADDC(a.z, a.z, b.z)
UADDC(a.w, a.w, b.w)
UADDC(cf, 0, 0) // save carry-out
// memory consumption: 49 * N_THIDS / 64
// use "alternating" data layout for each pair of warps
volatile short *scan = (volatile short *)(r + 16 + thid_in_warp +
49 * (thid / 64)) + ((thid / 32) & 1);
scan[-32] = -1; // put identity element
if(a.x == -1u && a.x == a.y && a.x == a.z && a.x == a.w)
// this indicates that carry will propagate through the number
cf = -1u;
// "Hillis-and-Steele-style" reduction
scan[0] = cf;
cf = max((int)cf, (int)scan[-2]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-4]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-8]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-16]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-32]) & cf;
scan[0] = cf;
int *postscan = (int *)r + 16 + 49 * (N_THIDS / 64);
if(thid_in_warp == WS - 1) // scan leading carry-outs once again
postscan[thid >> 5] = cf;
__syncthreads();
if(thid < N_THIDS / 32) {
volatile int *t = (volatile int *)postscan + thid;
t[-8] = -1; // load identity symbol
cf = t[0];
cf = max((int)cf, (int)t[-1]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-2]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-4]) & cf;
t[0] = cf;
}
__syncthreads();
cf = scan[0];
int ps = postscan[(int)((thid >> 5) - 1)]; // postscan[-1] equals to -1
scan[0] = max((int)cf, ps) & cf; // update carry flags within warps
cf = scan[-2];
if(thid_in_warp == 0)
cf = ps;
if((int)cf < 0)
cf = 0;
UADDO(a.x, a.x, cf) // propagate carry flag if needed
UADDC(a.y, a.y, 0)
UADDC(a.z, a.z, 0)
UADDC(a.w, a.w, 0)
((uint4 *)g_R)[thid] = a;
}
Note that macros UADDO / UADDC might not be necessary anymore since CUDA 4.0 has corresponding intrinsics (however I am not entirely sure).
Also remark that, though parallel reduction is quite fast, if you need to add several large integers in a row, it might be better to use some redundant representation (which was suggested in comments above), i.e., first accumulate the results of additions in 64-bit words, and then perform one carry propagation at the very end in "one sweep".
I thought I would post my answer also, in addition to #asm, so this SO question can be a sort of repository of ideas. Similar to #asm, I detect and store the carry condition as well as the "carry-through" condition, ie. when the intermediate word result is all 1's (0xF...FFF) so that if a carry were to propagate into this word, it would "carry-through" to the next word.
I didn't use any PTX or asm in my code, so I chose to use 64-bit unsigned ints instead of 32-bit, to achieve the 2048x32bit capability, using 1024 threads.
A larger difference from #asm's code is in my parallel carry propagation scheme. I construct a bit-packed array ("carry") where each bit represents the carry condition generated from the independent intermediate 64-bit adds from each of the 1024 threads. I also construct a bit-packed array ("carry_through") where each bit represents the carry_through condition of the individual 64-bit intermediate results. For 1024 threads, this amounts to 1024/64 = 16x64 bit words of shared memory for each bit-packed array, so total shared mem usage is 64+3 32bit quantites. With these bit packed arrays, I perform the following to generate a combined propagated carry indicator:
carry = carry | (carry_through ^ ((carry & carry_through) + carry_through);
(note that carry is shifted left by one: carry[i] indicates that the result of a[i-1] + b[i-1] generated a carry)
The explanation is as follows:
the bitwise and of carry and carry_through generates the candidates where a carry will
interact with a sequence of one or more carry though conditions
adding the result of step one to carry_through generates a result which
has changed bits which represent all words that will be affected by
the propagation of the carry into the carry_through sequence
taking the exclusive-or of carry_through plus the result from step 2
shows the affected results indicated with a 1 bit
taking the bitwise or of the result from step 3 and the ordinary
carry indicators gives a combined carry condition, which is then
used to update all the intermediate results.
Note that the addition in step 2 requires another multi-word add (for big ints composed of more than 64 words). I believe this algorithm works, and it has passed the test cases I have thrown at it.
Here is my example code which implements this:
// parallel add of large integers
// requires CC 2.0 or higher
// compile with:
// nvcc -O3 -arch=sm_20 -o paradd2 paradd2.cu
#include <stdio.h>
#include <stdlib.h>
#define MAXSIZE 1024 // the number of 64 bit quantities that can be added
#define LLBITS 64 // the number of bits in a long long
#define BSIZE ((MAXSIZE + LLBITS -1)/LLBITS) // MAXSIZE when packed into bits
#define nTPB MAXSIZE
// define either GPU or GPUCOPY, not both -- for timing
#define GPU
//#define GPUCOPY
#define LOOPCNT 1000
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// perform c = a + b, for unsigned integers of psize*64 bits.
// all work done in a single threadblock.
// multiple threadblocks are handling multiple separate addition problems
// least significant word is at a[0], etc.
__global__ void paradd(const unsigned size, const unsigned psize, unsigned long long *c, const unsigned long long *a, const unsigned long long *b){
__shared__ unsigned long long carry_through[BSIZE];
__shared__ unsigned long long carry[BSIZE+1];
__shared__ volatile unsigned mcarry;
__shared__ volatile unsigned mcarry_through;
unsigned idx = threadIdx.x + (psize * blockIdx.x);
if ((threadIdx.x < psize) && (idx < size)){
// handle 64 bit unsigned add first
unsigned long long cr1 = a[idx];
unsigned long long lc = cr1 + b[idx];
// handle carry
if (threadIdx.x < BSIZE){
carry[threadIdx.x] = 0;
carry_through[threadIdx.x] = 0;
}
if (threadIdx.x == 0){
mcarry = 0;
mcarry_through = 0;
}
__syncthreads();
if (lc < cr1){
if ((threadIdx.x%LLBITS) != (LLBITS-1))
atomicAdd(&(carry[threadIdx.x/LLBITS]), (2ull<<(threadIdx.x%LLBITS)));
else atomicAdd(&(carry[(threadIdx.x/LLBITS)+1]), 1);
}
// handle carry-through
if (lc == 0xFFFFFFFFFFFFFFFFull)
atomicAdd(&(carry_through[threadIdx.x/LLBITS]), (1ull<<(threadIdx.x%LLBITS)));
__syncthreads();
if (threadIdx.x < ((psize + LLBITS-1)/LLBITS)){
// only 1 warp executing within this if statement
unsigned long long cr3 = carry_through[threadIdx.x];
cr1 = carry[threadIdx.x] & cr3;
// start of sub-add
unsigned long long cr2 = cr3 + cr1;
if (cr2 < cr1) atomicAdd((unsigned *)&mcarry, (2u<<(threadIdx.x)));
if (cr2 == 0xFFFFFFFFFFFFFFFFull) atomicAdd((unsigned *)&mcarry_through, (1u<<threadIdx.x));
if (threadIdx.x == 0) {
unsigned cr4 = mcarry & mcarry_through;
cr4 += mcarry_through;
mcarry |= (mcarry_through ^ cr4);
}
if (mcarry & (1u<<threadIdx.x)) cr2++;
// end of sub-add
carry[threadIdx.x] |= (cr2 ^ cr3);
}
__syncthreads();
if (carry[threadIdx.x/LLBITS] & (1ull<<(threadIdx.x%LLBITS))) lc++;
c[idx] = lc;
}
}
int main() {
unsigned long long *h_a, *h_b, *h_c, *d_a, *d_b, *d_c, *c;
unsigned at_once = 256; // valid range = 1 .. 65535
unsigned prob_size = MAXSIZE ; // valid range = 1 .. MAXSIZE
unsigned dsize = at_once * prob_size;
cudaEvent_t t_start_gpu, t_start_cpu, t_end_gpu, t_end_cpu;
float et_gpu, et_cpu, tot_gpu, tot_cpu;
tot_gpu = 0;
tot_cpu = 0;
if (sizeof(unsigned long long) != (LLBITS/8)) {printf("Word Size Error\n"); return 1;}
if ((c = (unsigned long long *)malloc(dsize * sizeof(unsigned long long))) == 0) {printf("Malloc Fail\n"); return 1;}
cudaHostAlloc((void **)&h_a, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc1 fail");
cudaHostAlloc((void **)&h_b, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc2 fail");
cudaHostAlloc((void **)&h_c, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc3 fail");
cudaMalloc((void **)&d_a, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc1 fail");
cudaMalloc((void **)&d_b, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc2 fail");
cudaMalloc((void **)&d_c, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc3 fail");
cudaMemset(d_c, 0, dsize*sizeof(unsigned long long));
cudaEventCreate(&t_start_gpu);
cudaEventCreate(&t_end_gpu);
cudaEventCreate(&t_start_cpu);
cudaEventCreate(&t_end_cpu);
for (unsigned loops = 0; loops <LOOPCNT; loops++){
//create some test cases
if (loops == 0){
for (int j=0; j<at_once; j++)
for (int k=0; k<prob_size; k++){
int i= (j*prob_size) + k;
h_a[i] = 0xFFFFFFFFFFFFFFFFull;
h_b[i] = 0;
}
h_a[prob_size-1] = 0;
h_b[prob_size-1] = 1;
h_b[0] = 1;
}
else if (loops == 1){
for (int i=0; i<dsize; i++){
h_a[i] = 0xFFFFFFFFFFFFFFFFull;
h_b[i] = 0;
}
h_b[0] = 1;
}
else if (loops == 2){
for (int i=0; i<dsize; i++){
h_a[i] = 0xFFFFFFFFFFFFFFFEull;
h_b[i] = 2;
}
h_b[0] = 1;
}
else {
for (int i = 0; i<dsize; i++){
h_a[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
h_b[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
}
}
#ifdef GPUCOPY
cudaEventRecord(t_start_gpu, 0);
#endif
cudaMemcpy(d_a, h_a, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
cudaMemcpy(d_b, h_b, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy2 fail");
#ifdef GPU
cudaEventRecord(t_start_gpu, 0);
#endif
paradd<<<at_once, nTPB>>>(dsize, prob_size, d_c, d_a, d_b);
cudaCheckErrors("Kernel Fail");
#ifdef GPU
cudaEventRecord(t_end_gpu, 0);
#endif
cudaMemcpy(h_c, d_c, dsize*sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy3 fail");
#ifdef GPUCOPY
cudaEventRecord(t_end_gpu, 0);
#endif
cudaEventSynchronize(t_end_gpu);
cudaEventElapsedTime(&et_gpu, t_start_gpu, t_end_gpu);
tot_gpu += et_gpu;
cudaEventRecord(t_start_cpu, 0);
//also compute result on CPU for comparison
for (int j=0; j<at_once; j++) {
unsigned rc=0;
for (int n=0; n<prob_size; n++){
unsigned i = (j*prob_size) + n;
c[i] = h_a[i] + h_b[i];
if (c[i] < h_a[i]) {
c[i] += rc;
rc=1;}
else {
if ((c[i] += rc) != 0) rc=0;
}
if (c[i] != h_c[i]) {printf("Results mismatch at offset %d, GPU = 0x%lX, CPU = 0x%lX\n", i, h_c[i], c[i]); return 1;}
}
}
cudaEventRecord(t_end_cpu, 0);
cudaEventSynchronize(t_end_cpu);
cudaEventElapsedTime(&et_cpu, t_start_cpu, t_end_cpu);
tot_cpu += et_cpu;
if ((loops%(LOOPCNT/10)) == 0) printf("*\n");
}
printf("\nResults Match!\n");
printf("Average GPU time = %fms\n", (tot_gpu/LOOPCNT));
printf("Average CPU time = %fms\n", (tot_cpu/LOOPCNT));
return 0;
}
Say I have a two dimensional array where each entry contains a length and a value:
int array[4][2] = { /* {length, value}, */
{5, 3},
{6, 7},
{1, 0},
{8, 15},
};
I want to store them sequentially into memory with leading zeros to make each field the appropriate length. The example above would be:
00011 000111 0 00001111
The first block is five bits long and stores decimal 3. The second block is six bits long and stores decimal seven. The third block is one bit long and stores decimal 0, and the last block is eight bits long and stores decimal 15.
I can do it with some bitwise manipulation but I thought I would ask to see if there is an easier way.
I am coding in C for a Tensilica 32-bit RISC processor.
The purpose is to write a sequence of Exponential-Golomb codes.
EDIT: SOLUTION:
int main(int argc, char *argv[])
{
unsigned int i = 0, j = 0;
unsigned char bit = 0;
unsigned int bit_num = 0;
unsigned int field_length_bits = 0;
unsigned int field_length_bytes = 0;
unsigned int field_array_length = 0;
unsigned int field_list[NUM_FIELDS][2] = {
/*{Length, Value},*/
{4, 3},
{5, 5},
{6, 9},
{7, 11},
{8, 13},
{9, 15},
{10, 17},
};
unsigned char *seq_array;
// Find total length of field list in bits
for (i = 0; i < NUM_FIELDS; i++)
field_length_bits += field_list[i][LENGTH];
// Number of bytes needed to store FIELD parameters
for (i = 0; i < (field_length_bits + i) % 8 != 0; i++) ;
field_length_bytes = (field_length_bits + i) / 8;
// Size of array we need to allocate (multiple of 4 bytes)
for (i = 0; (field_length_bytes + i) % 4 != 0; i++) ;
field_array_length = (field_length_bytes + i);
// Allocate memory
seq_array = (unsigned char *) calloc(field_array_length, sizeof(unsigned char));
// Traverse source and set destination
for(i = 0; i < NUM_FIELDS; i++)
{
for(j = 0; j < field_list[i][LENGTH]; j++)
{
bit = 0x01 & (field_list[i][VALUE] >> (field_list[i][LENGTH] - j - 1));
if (bit)
setBit(seq_array, field_array_length, bit_num, 1);
else
setBit(seq_array, field_array_length, bit_num, 0);
bit_num++;
}
}
return 0;
}
void setBit(unsigned char *array, unsigned int array_len, unsigned int bit_num, unsigned int bit_value)
{
unsigned int byte_location = 0;
unsigned int bit_location = 0;
byte_location = bit_num / 8;
if(byte_location > array_len - 1)
{
printf("setBit(): Unauthorized memory access");
return;
}
bit_location = bit_num % 8;
if(bit_value)
array[byte_location] |= (1 << (7-bit_location));
else
array[byte_location] &= ~(1 << (7-bit_location));
return;
}
You can use a bitstream library:
Highly recommended bitstream library:
http://cpansearch.perl.org/src/KURIHARA/Imager-QRCode-0.033/src/bitstream.c
http://cpansearch.perl.org/src/KURIHARA/Imager-QRCode-0.033/src/bitstream.h
Because this bitstream library seems to be very self-contained, and doesn't seem to require external includes.
http://www.codeproject.com/Articles/32783/CBitStream-A-simple-C-class-for-reading-and-writin - C library, but using windows WORD, DWORD types (you can still typedef to use this library)
http://code.google.com/p/youtube-mobile-ffmpeg/source/browse/trunk/libavcodec/bitstream.c?r=8 - includes quite a few other include files to use the bitstream library
If you just want exponential golomb codes, there are open-source C implementations:
http://www.koders.com/c/fid8A317DF502A7D61CC96EC4DA07021850B6AD97ED.aspx?s=gcd
Or you can use bit manipulation techniques.
For example:
unsigned int array[4][2] = ???
unsigned int mem[100] = {};
int index=0,bit=0;
for (int i=0;i<4;i++) {
int shift = (32 - array[i][0] - bit);
if (shift>0) mem[index] &= array[i][1] << shift;
else {
mem[index] &= array[i][1] >> -shift;
mem[index+1] &= array[i][1] << (32+shift);
}
bit += array[i][1];
if (bit>=32) {
bit-=32;
index++;
}
}
Disclaimer:
The code only works if your computer byte-order is little endian, and the result will actually be little-endian within each 4-byte boundary, and big-endian across 4-byte boundaries. If you convert mem from int type to char, and replace the constants 32 to 8, you will get a big-endian representation of your bit-array.
It also assumes that the length is less than 32. Obviously, the code you actually want will depend on the bounds of valid input, and what you want in terms of byte-ordering.
Do you mean something like a bit field?
struct myBF
{
unsigned int v1 : 5;
unsigned int v2 : 5;
unsigned int v3 : 1;
unsigned int v4 : 8;
};
struct myBF b = { 3, 7, 0, 15 };
I may be misunderstanding your requirements entirely. Please comment if that's the case.
Update: Suppose you want to do this dynamically. Let's make a function that accepts an array of pairs, like in your example, and an output buffer:
/* Fill dst with bits.
* Returns one plus the number of bytes used or 0 on error.
*/
size_t bitstream(int (*arr)[2], size_t arrlen,
unsigned char * dst, size_t dstlen)
{
size_t total_bits = 0, bits_so_far = 0;
/* Check if there's enough space */
for (size_t i = 0; i != arrlen; ++i) { total_bits += arr[i][0]; }
if (dst == NULL || total_bits > CHAR_BIT * dstlen) { return 0; }
/* Set the output range to all zero */
memset(dst, 0, dstlen);
/* Populate the output range */
for (size_t i = 0; i != arrlen; ++i)
{
for (size_t bits_to_spend = arr[i][0], value = arr[i][1];
bits_to_spend != 0; /* no increment */ )
{
size_t const bit_offset = bits_so_far % CHAR_BIT;
size_t const byte_index = bits_so_far / CHAR_BIT;
size_t const cur_byte_capacity = CHAR_BIT - bit_offset;
/* Debug: Watch it work! */
printf("Need to store %zu, %zu bits to spend, capacity %zu.\n",
value, bits_to_spend, cur_byte_capacity);
dst[byte_index] |= (value << bit_offset);
if (cur_byte_capacity < bits_to_spend)
{
value >>= cur_byte_capacity;
bits_so_far += cur_byte_capacity;
bits_to_spend -= cur_byte_capacity;
}
else
{
bits_so_far += bits_to_spend;
bits_to_spend = 0;
}
}
}
return (bits_so_far + CHAR_BIT - 1) / CHAR_BIT;
}
Notes:
If the number arr[i][1] does not fit into arr[i][0] bits, only the residue modulo 2arr[i][0] is stored.
To be perfectly correct, the array type should be unsigned as well, otherwise the initialization size_t value = arr[i][1] may be undefined behaviour.
You can modify the error handling behaviour. For example, you could forgo transactionality and move the length check into the main loop. Also, instead of returning 0, you could return the number of required bytes, so that the user can figure out how big the destination array needs to be (like snptrintf does).
Usage:
unsigned char dst[N];
size_t n = bitstream(array, sizeof array / sizeof *array, dst, sizeof dst);
for (size_t i = 0; i != n; ++i) { printf("0x%02X ", dst[n - i - 1]); }
For your example, this will produce 0x00 0xF0 0xE3, which is:
0x00 0xF0 0xE3
00000000 11110000 11100011
0000 00001111 0 000111 00011
padd 15 0 7 3
In standard C there's no way to access anything smaller than a char by any way other than the 'bitwise manipulation` you mention. I'm afraid you're out of luck, unless you come across a library somewhere out there that can help you.
I can print with printf as a hex or octal number. Is there a format tag to print as binary, or arbitrary base?
I am running gcc.
printf("%d %x %o\n", 10, 10, 10); //prints "10 A 12\n"
printf("%b\n", 10); // prints "%b\n"
Hacky but works for me:
#define BYTE_TO_BINARY_PATTERN "%c%c%c%c%c%c%c%c"
#define BYTE_TO_BINARY(byte) \
(byte & 0x80 ? '1' : '0'), \
(byte & 0x40 ? '1' : '0'), \
(byte & 0x20 ? '1' : '0'), \
(byte & 0x10 ? '1' : '0'), \
(byte & 0x08 ? '1' : '0'), \
(byte & 0x04 ? '1' : '0'), \
(byte & 0x02 ? '1' : '0'), \
(byte & 0x01 ? '1' : '0')
printf("Leading text "BYTE_TO_BINARY_PATTERN, BYTE_TO_BINARY(byte));
For multi-byte types
printf("m: "BYTE_TO_BINARY_PATTERN" "BYTE_TO_BINARY_PATTERN"\n",
BYTE_TO_BINARY(m>>8), BYTE_TO_BINARY(m));
You need all the extra quotes unfortunately. This approach has the efficiency risks of macros (don't pass a function as the argument to BYTE_TO_BINARY) but avoids the memory issues and multiple invocations of strcat in some of the other proposals here.
Print Binary for Any Datatype
// Assumes little endian
void printBits(size_t const size, void const * const ptr)
{
unsigned char *b = (unsigned char*) ptr;
unsigned char byte;
int i, j;
for (i = size-1; i >= 0; i--) {
for (j = 7; j >= 0; j--) {
byte = (b[i] >> j) & 1;
printf("%u", byte);
}
}
puts("");
}
Test:
int main(int argv, char* argc[])
{
int i = 23;
uint ui = UINT_MAX;
float f = 23.45f;
printBits(sizeof(i), &i);
printBits(sizeof(ui), &ui);
printBits(sizeof(f), &f);
return 0;
}
Here is a quick hack to demonstrate techniques to do what you want.
#include <stdio.h> /* printf */
#include <string.h> /* strcat */
#include <stdlib.h> /* strtol */
const char *byte_to_binary
(
int x
)
{
static char b[9];
b[0] = '\0';
int z;
for (z = 128; z > 0; z >>= 1)
{
strcat(b, ((x & z) == z) ? "1" : "0");
}
return b;
}
int main
(
void
)
{
{
/* binary string to int */
char *tmp;
char *b = "0101";
printf("%d\n", strtol(b, &tmp, 2));
}
{
/* byte to binary string */
printf("%s\n", byte_to_binary(5));
}
return 0;
}
There isn't a binary conversion specifier in glibc normally.
It is possible to add custom conversion types to the printf() family of functions in glibc. See register_printf_function for details. You could add a custom %b conversion for your own use, if it simplifies the application code to have it available.
Here is an example of how to implement a custom printf formats in glibc.
You could use a small table to improve speed1. Similar techniques are useful in the embedded world, for example, to invert a byte:
const char *bit_rep[16] = {
[ 0] = "0000", [ 1] = "0001", [ 2] = "0010", [ 3] = "0011",
[ 4] = "0100", [ 5] = "0101", [ 6] = "0110", [ 7] = "0111",
[ 8] = "1000", [ 9] = "1001", [10] = "1010", [11] = "1011",
[12] = "1100", [13] = "1101", [14] = "1110", [15] = "1111",
};
void print_byte(uint8_t byte)
{
printf("%s%s", bit_rep[byte >> 4], bit_rep[byte & 0x0F]);
}
1 I'm mostly referring to embedded applications where optimizers are not so aggressive and the speed difference is visible.
Print the least significant bit and shift it out on the right. Doing this until the integer becomes zero prints the binary representation without leading zeros but in reversed order. Using recursion, the order can be corrected quite easily.
#include <stdio.h>
void print_binary(unsigned int number)
{
if (number >> 1) {
print_binary(number >> 1);
}
putc((number & 1) ? '1' : '0', stdout);
}
To me, this is one of the cleanest solutions to the problem. If you like 0b prefix and a trailing new line character, I suggest wrapping the function.
Online demo
Based on #William Whyte's answer, this is a macro that provides int8,16,32 & 64 versions, reusing the INT8 macro to avoid repetition.
/* --- PRINTF_BYTE_TO_BINARY macro's --- */
#define PRINTF_BINARY_PATTERN_INT8 "%c%c%c%c%c%c%c%c"
#define PRINTF_BYTE_TO_BINARY_INT8(i) \
(((i) & 0x80ll) ? '1' : '0'), \
(((i) & 0x40ll) ? '1' : '0'), \
(((i) & 0x20ll) ? '1' : '0'), \
(((i) & 0x10ll) ? '1' : '0'), \
(((i) & 0x08ll) ? '1' : '0'), \
(((i) & 0x04ll) ? '1' : '0'), \
(((i) & 0x02ll) ? '1' : '0'), \
(((i) & 0x01ll) ? '1' : '0')
#define PRINTF_BINARY_PATTERN_INT16 \
PRINTF_BINARY_PATTERN_INT8 PRINTF_BINARY_PATTERN_INT8
#define PRINTF_BYTE_TO_BINARY_INT16(i) \
PRINTF_BYTE_TO_BINARY_INT8((i) >> 8), PRINTF_BYTE_TO_BINARY_INT8(i)
#define PRINTF_BINARY_PATTERN_INT32 \
PRINTF_BINARY_PATTERN_INT16 PRINTF_BINARY_PATTERN_INT16
#define PRINTF_BYTE_TO_BINARY_INT32(i) \
PRINTF_BYTE_TO_BINARY_INT16((i) >> 16), PRINTF_BYTE_TO_BINARY_INT16(i)
#define PRINTF_BINARY_PATTERN_INT64 \
PRINTF_BINARY_PATTERN_INT32 PRINTF_BINARY_PATTERN_INT32
#define PRINTF_BYTE_TO_BINARY_INT64(i) \
PRINTF_BYTE_TO_BINARY_INT32((i) >> 32), PRINTF_BYTE_TO_BINARY_INT32(i)
/* --- end macros --- */
#include <stdio.h>
int main() {
long long int flag = 1648646756487983144ll;
printf("My Flag "
PRINTF_BINARY_PATTERN_INT64 "\n",
PRINTF_BYTE_TO_BINARY_INT64(flag));
return 0;
}
This outputs:
My Flag 0001011011100001001010110111110101111000100100001111000000101000
For readability you may want to add a separator for eg:
My Flag 00010110,11100001,00101011,01111101,01111000,10010000,11110000,00101000
As of February 3rd, 2022, the GNU C Library been updated to version 2.35. As a result, %b is now supported to output in binary format.
printf-family functions now support the %b format for output of
integers in binary, as specified in draft ISO C2X, and the %B variant
of that format recommended by draft ISO C2X.
Here's a version of the function that does not suffer from reentrancy issues or limits on the size/type of the argument:
#define FMT_BUF_SIZE (CHAR_BIT*sizeof(uintmax_t)+1)
char *binary_fmt(uintmax_t x, char buf[static FMT_BUF_SIZE])
{
char *s = buf + FMT_BUF_SIZE;
*--s = 0;
if (!x) *--s = '0';
for (; x; x /= 2) *--s = '0' + x%2;
return s;
}
Note that this code would work just as well for any base between 2 and 10 if you just replace the 2's by the desired base. Usage is:
char tmp[FMT_BUF_SIZE];
printf("%s\n", binary_fmt(x, tmp));
Where x is any integral expression.
Quick and easy solution:
void printbits(my_integer_type x)
{
for(int i=sizeof(x)<<3; i; i--)
putchar('0'+((x>>(i-1))&1));
}
Works for any size type and for signed and unsigned ints. The '&1' is needed to handle signed ints as the shift may do sign extension.
There are so many ways of doing this. Here's a super simple one for printing 32 bits or n bits from a signed or unsigned 32 bit type (not putting a negative if signed, just printing the actual bits) and no carriage return. Note that i is decremented before the bit shift:
#define printbits_n(x,n) for (int i=n;i;i--,putchar('0'|(x>>i)&1))
#define printbits_32(x) printbits_n(x,32)
What about returning a string with the bits to store or print later? You either can allocate the memory and return it and the user has to free it, or else you return a static string but it will get clobbered if it's called again, or by another thread. Both methods shown:
char *int_to_bitstring_alloc(int x, int count)
{
count = count<1 ? sizeof(x)*8 : count;
char *pstr = malloc(count+1);
for(int i = 0; i<count; i++)
pstr[i] = '0' | ((x>>(count-1-i))&1);
pstr[count]=0;
return pstr;
}
#define BITSIZEOF(x) (sizeof(x)*8)
char *int_to_bitstring_static(int x, int count)
{
static char bitbuf[BITSIZEOF(x)+1];
count = (count<1 || count>BITSIZEOF(x)) ? BITSIZEOF(x) : count;
for(int i = 0; i<count; i++)
bitbuf[i] = '0' | ((x>>(count-1-i))&1);
bitbuf[count]=0;
return bitbuf;
}
Call with:
// memory allocated string returned which needs to be freed
char *pstr = int_to_bitstring_alloc(0x97e50ae6, 17);
printf("bits = 0b%s\n", pstr);
free(pstr);
// no free needed but you need to copy the string to save it somewhere else
char *pstr2 = int_to_bitstring_static(0x97e50ae6, 17);
printf("bits = 0b%s\n", pstr2);
Is there a printf converter to print in binary format?
The printf() family is only able to print integers in base 8, 10, and 16 using the standard specifiers directly. I suggest creating a function that converts the number to a string per code's particular needs.
[Edit 2022] This is expected to change with the next version of C which implements "%b".
Binary constants such as 0b10101010, and %b conversion specifier for printf() function family C2x
To print in any base [2-36]
All other answers so far have at least one of these limitations.
Use static memory for the return buffer. This limits the number of times the function may be used as an argument to printf().
Allocate memory requiring the calling code to free pointers.
Require the calling code to explicitly provide a suitable buffer.
Call printf() directly. This obliges a new function for to fprintf(), sprintf(), vsprintf(), etc.
Use a reduced integer range.
The following has none of the above limitation. It does require C99 or later and use of "%s". It uses a compound literal to provide the buffer space. It has no trouble with multiple calls in a printf().
#include <assert.h>
#include <limits.h>
#define TO_BASE_N (sizeof(unsigned)*CHAR_BIT + 1)
// v--compound literal--v
#define TO_BASE(x, b) my_to_base((char [TO_BASE_N]){""}, (x), (b))
// Tailor the details of the conversion function as needed
// This one does not display unneeded leading zeros
// Use return value, not `buf`
char *my_to_base(char buf[TO_BASE_N], unsigned i, int base) {
assert(base >= 2 && base <= 36);
char *s = &buf[TO_BASE_N - 1];
*s = '\0';
do {
s--;
*s = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % base];
i /= base;
} while (i);
// Could employ memmove here to move the used buffer to the beginning
// size_t len = &buf[TO_BASE_N] - s;
// memmove(buf, s, len);
return s;
}
#include <stdio.h>
int main(void) {
int ip1 = 0x01020304;
int ip2 = 0x05060708;
printf("%s %s\n", TO_BASE(ip1, 16), TO_BASE(ip2, 16));
printf("%s %s\n", TO_BASE(ip1, 2), TO_BASE(ip2, 2));
puts(TO_BASE(ip1, 8));
puts(TO_BASE(ip1, 36));
return 0;
}
Output
1020304 5060708
1000000100000001100000100 101000001100000011100001000
100401404
A2F44
const char* byte_to_binary(int x)
{
static char b[sizeof(int)*8+1] = {0};
int y;
long long z;
for (z = 1LL<<sizeof(int)*8-1, y = 0; z > 0; z >>= 1, y++) {
b[y] = (((x & z) == z) ? '1' : '0');
}
b[y] = 0;
return b;
}
None of the previously posted answers are exactly what I was looking for, so I wrote one. It is super simple to use %B with the printf!
/*
* File: main.c
* Author: Techplex.Engineer
*
* Created on February 14, 2012, 9:16 PM
*/
#include <stdio.h>
#include <stdlib.h>
#include <printf.h>
#include <math.h>
#include <string.h>
static int printf_arginfo_M(const struct printf_info *info, size_t n, int *argtypes)
{
/* "%M" always takes one argument, a pointer to uint8_t[6]. */
if (n > 0) {
argtypes[0] = PA_POINTER;
}
return 1;
}
static int printf_output_M(FILE *stream, const struct printf_info *info, const void *const *args)
{
int value = 0;
int len;
value = *(int **) (args[0]);
// Beginning of my code ------------------------------------------------------------
char buffer [50] = ""; // Is this bad?
char buffer2 [50] = ""; // Is this bad?
int bits = info->width;
if (bits <= 0)
bits = 8; // Default to 8 bits
int mask = pow(2, bits - 1);
while (mask > 0) {
sprintf(buffer, "%s", ((value & mask) > 0 ? "1" : "0"));
strcat(buffer2, buffer);
mask >>= 1;
}
strcat(buffer2, "\n");
// End of my code --------------------------------------------------------------
len = fprintf(stream, "%s", buffer2);
return len;
}
int main(int argc, char** argv)
{
register_printf_specifier('B', printf_output_M, printf_arginfo_M);
printf("%4B\n", 65);
return EXIT_SUCCESS;
}
This code should handle your needs up to 64 bits.
I created two functions: pBin and pBinFill. Both do the same thing, but pBinFill fills in the leading spaces with the fill character provided by its last argument.
The test function generates some test data, then prints it out using the pBinFill function.
#define kDisplayWidth 64
char* pBin(long int x,char *so)
{
char s[kDisplayWidth+1];
int i = kDisplayWidth;
s[i--] = 0x00; // terminate string
do { // fill in array from right to left
s[i--] = (x & 1) ? '1' : '0'; // determine bit
x >>= 1; // shift right 1 bit
} while (x > 0);
i++; // point to last valid character
sprintf(so, "%s", s+i); // stick it in the temp string string
return so;
}
char* pBinFill(long int x, char *so, char fillChar)
{
// fill in array from right to left
char s[kDisplayWidth+1];
int i = kDisplayWidth;
s[i--] = 0x00; // terminate string
do { // fill in array from right to left
s[i--] = (x & 1) ? '1' : '0';
x >>= 1; // shift right 1 bit
} while (x > 0);
while (i >= 0) s[i--] = fillChar; // fill with fillChar
sprintf(so, "%s", s);
return so;
}
void test()
{
char so[kDisplayWidth+1]; // working buffer for pBin
long int val = 1;
do {
printf("%ld =\t\t%#lx =\t\t0b%s\n", val, val, pBinFill(val, so, '0'));
val *= 11; // generate test data
} while (val < 100000000);
}
Output:
00000001 = 0x000001 = 0b00000000000000000000000000000001
00000011 = 0x00000b = 0b00000000000000000000000000001011
00000121 = 0x000079 = 0b00000000000000000000000001111001
00001331 = 0x000533 = 0b00000000000000000000010100110011
00014641 = 0x003931 = 0b00000000000000000011100100110001
00161051 = 0x02751b = 0b00000000000000100111010100011011
01771561 = 0x1b0829 = 0b00000000000110110000100000101001
19487171 = 0x12959c3 = 0b00000001001010010101100111000011
Some runtimes support "%b" although that is not a standard.
Also see here for an interesting discussion:
http://bytes.com/forum/thread591027.html
HTH
Maybe a bit OT, but if you need this only for debuging to understand or retrace some binary operations you are doing, you might take a look on wcalc (a simple console calculator). With the -b options you get binary output.
e.g.
$ wcalc -b "(256 | 3) & 0xff"
= 0b11
There is no formatting function in the C standard library to output binary like that. All the format operations the printf family supports are towards human readable text.
The following recursive function might be useful:
void bin(int n)
{
/* Step 1 */
if (n > 1)
bin(n/2);
/* Step 2 */
printf("%d", n % 2);
}
I optimized the top solution for size and C++-ness, and got to this solution:
inline std::string format_binary(unsigned int x)
{
static char b[33];
b[32] = '\0';
for (int z = 0; z < 32; z++) {
b[31-z] = ((x>>z) & 0x1) ? '1' : '0';
}
return b;
}
Use:
char buffer [33];
itoa(value, buffer, 2);
printf("\nbinary: %s\n", buffer);
For more ref., see How to print binary number via printf.
void
print_binary(unsigned int n)
{
unsigned int mask = 0;
/* this grotesque hack creates a bit pattern 1000... */
/* regardless of the size of an unsigned int */
mask = ~mask ^ (~mask >> 1);
for(; mask != 0; mask >>= 1) {
putchar((n & mask) ? '1' : '0');
}
}
Print bits from any type using less code and resources
This approach has as attributes:
Works with variables and literals.
Doesn't iterate all bits when not necessary.
Call printf only when complete a byte (not unnecessarily for all bits).
Works for any type.
Works with little and big endianness (uses GCC #defines for checking).
May work with hardware that char isn't a byte (eight bits). (Tks #supercat)
Uses typeof() that isn't C standard but is largely defined.
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define for_endian(size) for (int i = 0; i < size; ++i)
#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define for_endian(size) for (int i = size - 1; i >= 0; --i)
#else
#error "Endianness not detected"
#endif
#define printb(value) \
({ \
typeof(value) _v = value; \
__printb((typeof(_v) *) &_v, sizeof(_v)); \
})
#define MSB_MASK 1 << (CHAR_BIT - 1)
void __printb(void *value, size_t size)
{
unsigned char uc;
unsigned char bits[CHAR_BIT + 1];
bits[CHAR_BIT] = '\0';
for_endian(size) {
uc = ((unsigned char *) value)[i];
memset(bits, '0', CHAR_BIT);
for (int j = 0; uc && j < CHAR_BIT; ++j) {
if (uc & MSB_MASK)
bits[j] = '1';
uc <<= 1;
}
printf("%s ", bits);
}
printf("\n");
}
int main(void)
{
uint8_t c1 = 0xff, c2 = 0x44;
uint8_t c3 = c1 + c2;
printb(c1);
printb((char) 0xff);
printb((short) 0xff);
printb(0xff);
printb(c2);
printb(0x44);
printb(0x4411ff01);
printb((uint16_t) c3);
printb('A');
printf("\n");
return 0;
}
Output
$ ./printb
11111111
11111111
00000000 11111111
00000000 00000000 00000000 11111111
01000100
00000000 00000000 00000000 01000100
01000100 00010001 11111111 00000001
00000000 01000011
00000000 00000000 00000000 01000001
I have used another approach (bitprint.h) to fill a table with all bytes (as bit strings) and print them based on the input/index byte. It's worth taking a look.
Maybe someone will find this solution useful:
void print_binary(int number, int num_digits) {
int digit;
for(digit = num_digits - 1; digit >= 0; digit--) {
printf("%c", number & (1 << digit) ? '1' : '0');
}
}
void print_ulong_bin(const unsigned long * const var, int bits) {
int i;
#if defined(__LP64__) || defined(_LP64)
if( (bits > 64) || (bits <= 0) )
#else
if( (bits > 32) || (bits <= 0) )
#endif
return;
for(i = 0; i < bits; i++) {
printf("%lu", (*var >> (bits - 1 - i)) & 0x01);
}
}
should work - untested.
I liked the code by paniq, the static buffer is a good idea. However it fails if you want multiple binary formats in a single printf() because it always returns the same pointer and overwrites the array.
Here's a C style drop-in that rotates pointer on a split buffer.
char *
format_binary(unsigned int x)
{
#define MAXLEN 8 // width of output format
#define MAXCNT 4 // count per printf statement
static char fmtbuf[(MAXLEN+1)*MAXCNT];
static int count = 0;
char *b;
count = count % MAXCNT + 1;
b = &fmtbuf[(MAXLEN+1)*count];
b[MAXLEN] = '\0';
for (int z = 0; z < MAXLEN; z++) { b[MAXLEN-1-z] = ((x>>z) & 0x1) ? '1' : '0'; }
return b;
}
Here is a small variation of paniq's solution that uses templates to allow printing of 32 and 64 bit integers:
template<class T>
inline std::string format_binary(T x)
{
char b[sizeof(T)*8+1] = {0};
for (size_t z = 0; z < sizeof(T)*8; z++)
b[sizeof(T)*8-1-z] = ((x>>z) & 0x1) ? '1' : '0';
return std::string(b);
}
And can be used like:
unsigned int value32 = 0x1e127ad;
printf( " 0x%x: %s\n", value32, format_binary(value32).c_str() );
unsigned long long value64 = 0x2e0b04ce0;
printf( "0x%llx: %s\n", value64, format_binary(value64).c_str() );
Here is the result:
0x1e127ad: 00000001111000010010011110101101
0x2e0b04ce0: 0000000000000000000000000000001011100000101100000100110011100000
No standard and portable way.
Some implementations provide itoa(), but it's not going to be in most, and it has a somewhat crummy interface. But the code is behind the link and should let you implement your own formatter pretty easily.
I just want to post my solution. It's used to get zeroes and ones of one byte, but calling this function few times can be used for larger data blocks. I use it for 128 bit or larger structs. You can also modify it to use size_t as input parameter and pointer to data you want to print, so it can be size independent. But it works for me quit well as it is.
void print_binary(unsigned char c)
{
unsigned char i1 = (1 << (sizeof(c)*8-1));
for(; i1; i1 >>= 1)
printf("%d",(c&i1)!=0);
}
void get_binary(unsigned char c, unsigned char bin[])
{
unsigned char i1 = (1 << (sizeof(c)*8-1)), i2=0;
for(; i1; i1>>=1, i2++)
bin[i2] = ((c&i1)!=0);
}
Here's how I did it for an unsigned int
void printb(unsigned int v) {
unsigned int i, s = 1<<((sizeof(v)<<3)-1); // s = only most significant bit at 1
for (i = s; i; i>>=1) printf("%d", v & i || 0 );
}
One statement generic conversion of any integral type into the binary string representation using standard library:
#include <bitset>
MyIntegralType num = 10;
print("%s\n",
std::bitset<sizeof(num) * 8>(num).to_string().insert(0, "0b").c_str()
); // prints "0b1010\n"
Or just: std::cout << std::bitset<sizeof(num) * 8>(num);