Compress string using bits Operation in C - c

I want to reduce memory storage of chars using Bitwise Operations,
for Example
input: {"ACGT"}
output: {"0xE4"}
where we represtnt the number in binary then into Hexadecimal
if A=00, C=01, G=10, T=11
so ACGT = 0xE4 = 11100100b
I cant Figure the whole way so here is what I did So Far
enum NucleicAc {
A = 0,
C = 1,
G = 2,
T = 3,
} ;
struct _DNA {
char * seq ;
};
DNAString DSCreate(char * mseq) {
DNAString dna = malloc(sizeof(struct _DNA));
if (!dna){
exit(-1);
}
const int length = strlen(mseq);
// left shift will create the base , in the case of ACGT --> 0000,0000
int base = 0 << length * 2;
//loop and set bits
int i = 0 ;
int k = 0 ; // the counter where i want to modify the current bit
//simple looping gor the string
for ( ; i < length ; i++ ) {
switch (*(mseq+i)) {
case 'A': // 00
k++;
break;
case 'C': // 0 1
modifyBit(&base, k, 1);
k++;
break;
case 'G': //10
k++;
modifyBit(&base,k , 1);
break;
case 'T': // 11
modifyBit(&base, k, 1);
k++;
modifyBit(&base, k,1);
break;
default:
break;
} //end of switch
k++;
}//end of for
char * generatedSeq ;
//convert the base to hex ??
return dna;
}
void bin(unsigned n){
unsigned i;
for (i = 1 << 7; i > 0; i = i / 2){
(n & i) ? printf("1") : printf("0");
}
}
and if we print the base, the value is 11100100b as expected,
how to store the hexadecimal representation as String to the char *mseq in the struct ?
any direction or exact solutions or is there any better approach?
and also later i want to get the letter using only the index for Example
DSGet(dsStruct , '0')--> will return 'A' hence dsStruct contains the "ACGT" before Encode?

There are several ways you can approach encoding the sequence. Your enum is fine, but for your encoded sequence, a struct that captures the bytes as unsigned char, the original sequence length and the encoded size in bytes will allow an easy decoding. You will get 4-to-1 compression (plus 1 byte if you sequence isn't evenly divisible by 4). The enum and struct could be:
enum { A, C, G, T };
typedef struct {
unsigned char *seq;
size_t len, size;
} encoded;
To map characters in your string to the encoded values a simple function that returns the enum value matching the character is all you need (don't forget to handle any errors)
/* convert character to encoded value */
unsigned char getencval (const char c)
{
if (c == 'A')
return A;
else if (c == 'C')
return C;
else if (c == 'G')
return G;
else if (c == 'T')
return T;
/* exit on anything other than A, C, G, T */
fprintf (stderr, "error: invalid sequence character '%c'\n", c);
exit (EXIT_FAILURE);
}
To encode the original sequence, you will populate the encoded struct with the original length (len) and number of bytes (size) needed to hold the encoded string. Don't forget that any 1-character of the next 4-characters will require another byte of storage. You can use a simple add-and-divide to account for any partial 4-character ending portion of the sequence, e.g.
/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
* returns encoded struct with allocated .seq member, on failure the .seq
* member is NULL. User is resposible for freeing .seq member when done.
*/
encoded encode_seq (const char *seq)
{
size_t len = strlen(seq),
size = (len + 3) / 4; /* integer division intentional */
encoded enc = { .seq = calloc (1, size), /* encoded sequence struct */
.len = len,
.size = size };
if (!enc.seq) { /* validate allication */
perror ("calloc-enc.seq");
return enc;
}
/* loop over each char (i) with byte index (ndx)
* shifting each 2-bit pair by (shift * 2) amount.
*/
for (int i = 0, ndx = 0, shift = 4; seq[i] && seq[i] != '\n'; i++) {
if (!shift--) /* decrement shift, reset if 0 */
shift = 3;
if (i && i % 4 == 0) /* after each 4th char, increment ndx */
ndx += 1;
/* shift each encoded value (multiply by 2 for shift of 6, 4, 2, 0) */
enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
}
return enc; /* return encoded struct with allocated .seq member */
}
To get the original sequence back from your encoded struct, the use of a lookup table (shown below with the full code) makes it a breeze. You simply loop over all stored byte values appending the corresponding strings from the lookup table until the final byte. For the final byte, you need to determine if it is a partial string and, if so, how many characters remain to copy. (that's why you store the original sequence length in your struct). Then simply use strncat to append that many characters from the final byte, e.g.
/* decodes encoded sequence. Allocates storage for decoded sequence
* and loops over each encoded byte using lookup-table to obtain
* original 4-character string from byte value. User is responsible
* for freeing returned string when done. Returns NULL on allocation
* failure.
*/
char *decode_seq (encoded *eseq)
{
char *seq = malloc (eseq->len + 1); /* allocate storage for sequence */
size_t i = 0, offset = 0, remain;
if (!seq) { /* validate allocation */
perror ("malloc-seq");
return NULL;
}
/* loop appending strings from lookup table for all but last byte */
for (; i < eseq->size - 1; i++) {
memcpy (seq + offset, lookup[eseq->seq[i]], 4);
offset += 4; /* increment offset by 4 */
}
/* determine the number of characters in last byte */
remain = eseq->len - (eseq->size - 1) * 4;
memcpy (seq + offset, lookup[eseq->seq[i]], remain);
seq[offset + remain] = 0; /* nul-terminate seq */
return seq; /* return allocated sequence */
}
Adding the lookup table and putting all the pieces together, one way to approach this problem is:
(edit: lookup table reordered to match your byte-value encoding, optimized decode_seq() to not scan for end-of-string on copy)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
enum { A, C, G, T };
typedef struct {
unsigned char *seq;
size_t len, size;
} encoded;
const char lookup[][5] = {
"AAAA","CAAA","GAAA","TAAA","ACAA","CCAA","GCAA","TCAA",
"AGAA","CGAA","GGAA","TGAA","ATAA","CTAA","GTAA","TTAA",
"AACA","CACA","GACA","TACA","ACCA","CCCA","GCCA","TCCA",
"AGCA","CGCA","GGCA","TGCA","ATCA","CTCA","GTCA","TTCA",
"AAGA","CAGA","GAGA","TAGA","ACGA","CCGA","GCGA","TCGA",
"AGGA","CGGA","GGGA","TGGA","ATGA","CTGA","GTGA","TTGA",
"AATA","CATA","GATA","TATA","ACTA","CCTA","GCTA","TCTA",
"AGTA","CGTA","GGTA","TGTA","ATTA","CTTA","GTTA","TTTA",
"AAAC","CAAC","GAAC","TAAC","ACAC","CCAC","GCAC","TCAC",
"AGAC","CGAC","GGAC","TGAC","ATAC","CTAC","GTAC","TTAC",
"AACC","CACC","GACC","TACC","ACCC","CCCC","GCCC","TCCC",
"AGCC","CGCC","GGCC","TGCC","ATCC","CTCC","GTCC","TTCC",
"AAGC","CAGC","GAGC","TAGC","ACGC","CCGC","GCGC","TCGC",
"AGGC","CGGC","GGGC","TGGC","ATGC","CTGC","GTGC","TTGC",
"AATC","CATC","GATC","TATC","ACTC","CCTC","GCTC","TCTC",
"AGTC","CGTC","GGTC","TGTC","ATTC","CTTC","GTTC","TTTC",
"AAAG","CAAG","GAAG","TAAG","ACAG","CCAG","GCAG","TCAG",
"AGAG","CGAG","GGAG","TGAG","ATAG","CTAG","GTAG","TTAG",
"AACG","CACG","GACG","TACG","ACCG","CCCG","GCCG","TCCG",
"AGCG","CGCG","GGCG","TGCG","ATCG","CTCG","GTCG","TTCG",
"AAGG","CAGG","GAGG","TAGG","ACGG","CCGG","GCGG","TCGG",
"AGGG","CGGG","GGGG","TGGG","ATGG","CTGG","GTGG","TTGG",
"AATG","CATG","GATG","TATG","ACTG","CCTG","GCTG","TCTG",
"AGTG","CGTG","GGTG","TGTG","ATTG","CTTG","GTTG","TTTG",
"AAAT","CAAT","GAAT","TAAT","ACAT","CCAT","GCAT","TCAT",
"AGAT","CGAT","GGAT","TGAT","ATAT","CTAT","GTAT","TTAT",
"AACT","CACT","GACT","TACT","ACCT","CCCT","GCCT","TCCT",
"AGCT","CGCT","GGCT","TGCT","ATCT","CTCT","GTCT","TTCT",
"AAGT","CAGT","GAGT","TAGT","ACGT","CCGT","GCGT","TCGT",
"AGGT","CGGT","GGGT","TGGT","ATGT","CTGT","GTGT","TTGT",
"AATT","CATT","GATT","TATT","ACTT","CCTT","GCTT","TCTT",
"AGTT","CGTT","GGTT","TGTT","ATTT","CTTT","GTTT","TTTT"};
/* convert character to encoded value */
unsigned char getencval (const char c)
{
if (c == 'A')
return A;
else if (c == 'C')
return C;
else if (c == 'G')
return G;
else if (c == 'T')
return T;
/* exit on anything other than A, C, G, T */
fprintf (stderr, "error: invalid sequence character '%c'\n", c);
exit (EXIT_FAILURE);
}
/* encode sequence of characters as 2-bit pairs (4-characters per-byte)
* returns encoded struct with allocated .seq member, on failure the .seq
* member is NULL. User is resposible for freeing .seq member when done.
*/
encoded encode_seq (const char *seq)
{
size_t len = strlen(seq),
size = (len + 3) / 4; /* integer division intentional */
encoded enc = { .seq = calloc (1, size), /* encoded sequence struct */
.len = len,
.size = size };
if (!enc.seq) { /* validate allication */
perror ("calloc-enc.seq");
return enc;
}
/* loop over each char (i) with byte index (ndx)
* shifting each 2-bit pair by (shift * 2) amount.
*/
for (int i = 0, ndx = 0, shift = 0; seq[i] && seq[i] != '\n'; i++, shift++) {
if (shift == 4) /* reset to 0 */
shift = 0;
if (i && i % 4 == 0) /* after each 4th char, increment ndx */
ndx += 1;
/* shift each encoded value (multiply by 2 for shift of 0, 2, 4, 6) */
enc.seq[ndx] |= getencval (seq[i]) << shift * 2;
}
return enc; /* return encoded struct with allocated .seq member */
}
/* decodes encoded sequence. Allocates storage for decoded sequence
* and loops over each encoded byte using lookup-table to obtain
* original 4-character string from byte value. User is responsible
* for freeing returned string when done. Returns NULL on allocation
* failure.
*/
char *decode_seq (encoded *eseq)
{
char *seq = malloc (eseq->len + 1); /* allocate storage for sequence */
size_t i = 0, offset = 0, remain;
if (!seq) { /* validate allocation */
perror ("malloc-seq");
return NULL;
}
/* loop appending strings from lookup table for all but last byte */
for (; i < eseq->size - 1; i++) {
memcpy (seq + offset, lookup[eseq->seq[i]], 4);
offset += 4; /* increment offset by 4 */
}
/* determine the number of characters in last byte */
remain = eseq->len - (eseq->size - 1) * 4;
memcpy (seq + offset, lookup[eseq->seq[i]], remain);
seq[offset + remain] = 0; /* nul-terminate seq */
return seq; /* return allocated sequence */
}
/* short example program that takes string to encode as 1st argument
* using "ACGT" if no argument is provided by default
*/
int main (int argc, char **argv) {
char *seq = NULL;
encoded enc = encode_seq(argc > 1 ? argv[1] : "ACGT");
if (!enc.seq) /* validate encoded allocation */
return 1;
/* output original string, length and encoded size */
printf ("encoded str : %s\nencoded len : %zu\nencoded size : %zu\n",
argc > 1 ? argv[1] : "ACGT", enc.len, enc.size);
/* loop outputting byte-values of encoded string */
fputs ("encoded seq :", stdout);
for (size_t i = 0; i < enc.size; i++)
printf (" 0x%02x", enc.seq[i]);
putchar ('\n');
seq = decode_seq (&enc); /* decode seq from byte values */
printf ("decoded seq : %s\n", seq); /* output decoded string */
free (seq); /* don't forget to free what you allocated */
free (enc.seq);
}
In most cases a lookup-table provides a great deal of efficiency advantage compared to computing and building each 4-character string during decoding. This is enhanced by the lookup table staying resident in cache for most cases.
The length of the DNA sequence you can encode and decode is limited only by the amount of virtual memory you have available.
Example Use/Output
The program takes the sequence to encode and decode as the first argument (default "ACGT"). So the default output is:
$ ./bin/dnaencodedecode
encoded str : ACGT
encoded len : 4
encoded size : 1
encoded seq : 0xe4
decoded seq : ACGT
4-byte encoded in 1-byte. Note the byte value of 0x1b and not 0xe4 due to the table ordering.
A longer example:
./bin/dnaencodedecode ACGTGGGTCAGACTTA
encoded str : ACGTGGGTCAGACTTA
encoded len : 16
encoded size : 4
encoded seq : 0xe4 0xea 0x21 0x3d
decoded seq : ACGTGGGTCAGACTTA
16-character encoded in 4-bytes.
Finally, what of a sequence that isn't divisible by 4 so you have a partial number of characters in the last encoded byte? That is handled as well, e.g.
$ ./bin/dnaencodedecode ACGTGGGTCAGACTTAG
encoded str : ACGTGGGTCAGACTTAG
encoded len : 17
encoded size : 5
encoded seq : 0xe4 0xea 0x21 0x3d 0x02
decoded seq : ACGTGGGTCAGACTTAG
17 characters encoded in 5-bytes. (not the pure 4-to-1 compression, but as the sequence size increases, the significance of any partial group of characters in the last byte becomes negligible)
As far a perfomance, for a sequence of 100,000 characters and output of the the byte values and strings replaced with a simple loop that compares the decoded seq to the original argv[1] it only takes a few thousandth of a second (on an old i7 Gen2 laptop with SSD) to encode and decode and validate, e.g.
$
time ./bin/dnaencodedecodet2big $(< dat/dnaseq100k.txt)
encoded len : 100000
encoded size : 25000
all tests passed
real 0m0.014s
user 0m0.012s
sys 0m0.003s
There are a lot of ways to do this, but given your description, this was what came to my mind that you were trying to accomplish. There is a lot here, so take your time going through it.
Look things over (the code is commented), and let me know if you have further questions. Just drop a comment below.

This should do what you want. I thought the compression was a pretty cool idea so I wrote this real quick. As mentioned by #kaylum, hex encoding is just a way to read the underlying data in memory, which is always just bits. So, you only need to worry about that on print statements.
Let me know if this works or you have any questions about what I did.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
typedef struct {
unsigned char *bits;
unsigned long length; // use this to store the number of letters encoded, for safety
} DNA;
typedef enum {
A = 0,
C = 1,
G = 2,
T = 3
} NucleicAc;
This returns the base at a given index with some bounds checking
char base_at_index(DNA *dna, unsigned long index) {
if (index >= dna->length) {
fputs("error: base_at_index: index out of range", stderr);
exit(EXIT_FAILURE);
}
// offset is index / 4, this gives us the correct byte
// shift amount is index % 4 to give us the correct 2 bits within the byte.
// This must then be multiplied by 2 because
// each base takes 2 bits to encode
// then we have to bitwise-and this value with
// 3 (0000 0011 in binary) to retrieve the bits we want.
// so, the formula we need is
// (dna->bits[index / 4] >> (2 * (index % 4))) & 3
switch((dna->bits[index / 4] >> 2 * (index % 4)) & 3) {
case A: return 'A';
case C: return 'C';
case G: return 'G';
case T: return 'T';
default:
fputs("error: base_at_index: invalid encoding", stderr);
exit(EXIT_FAILURE);
}
}
This encodes a string of bases to bytes
/* you can fit four 2-bit DNA codes in each byte (unsigned char).
len is the maximum number of characters to read. result must be at least len bytes long
*/
void encode_dna(unsigned char *result, char *sequence, unsigned long len) {
// keep track of what byte we are on in the result
unsigned result_index = 0;
// our shift for writing to the correct position in the byte
unsigned shift = 0;
// first clear result or else bitwise operations will produce errors
// this could be removed if you were certain result parameter was zero-filled
memset(result, 0, len);
// iterate through characters of the sequence
while(*sequence) {
switch (*sequence) {
// do nothing for 'A' since it is just zero
case 'A': break;
case 'C':
// we are doing a bitwise or with the current byte
// and C (1) shifted to the appropriate position within
// the byte, and then assigning the byte with the result
result[result_index] |= C << shift;
break;
case 'G':
result[result_index] |= G << shift;
break;
case 'T':
result[result_index] |= T << shift;
break;
default:
fputs("error: encode_dna: invalid base pair", stderr);
exit(EXIT_FAILURE);
}
// increase shift amount by 2 to the next 2-bit slot in the byte
shift += 2;
// on every 4th iteration, reset our shift to zero since the byte is now full
// and move to the next byte in our result buffer
if (shift == 8) {
shift = 0;
result_index++;
}
// advance sequence to next nucleotide character
sequence++;
}
}
And here's a test
int main(int argc, char **argv) {
// allocate some storage for encoded DNA
unsigned char encoded_dna[32];
const unsigned long sample_length = 15;
// encode the given sample sequence
encode_dna(encoded_dna, "ACGTAGTCGTCATAG", sample_length);
// hh here means half of half word, which is a byte
// capital X for capitalized hex output
// here we print some bytes
printf("0x%hhX\n", encoded_dna[0]); // should output 0xE4
printf("0x%hhX\n", encoded_dna[1]); // should be 0x78
printf("0x%hhX\n", encoded_dna[2]); // should be 0x1E
printf("0x%hhX\n", encoded_dna[3]); // should be 0x23
DNA test_dna; // allocate a sample DNA structure
test_dna.bits = encoded_dna;
test_dna.length = sample_length; // length of the sample sequence above
// test some indices and see if the results are correct
printf("test_dna index 4: %c\n", base_at_index(&test_dna, 4));
printf("test_dna index 7: %c\n", base_at_index(&test_dna, 7));
printf("test_dna index 12: %c\n", base_at_index(&test_dna, 12));
return 0;
}
Output:
0xE4
0x78
0x1E
0x23
test_dna index 4: A
test_dna index 7: C
test_dna index 12: T

Assuming you really do want to encode your dna string into a hex string and that you want to read the input string from left to right, but output hex chars right to left, here's a simple, but slightly slow implementation.
First, your DNAString needs to keep track whether there are really an even or odd number of acid sequences in the list. This will make additional appendages easier.
struct DNAString
{
char* seq;
bool odd; // if odd bit is set, then the front char is already allocated and holds one acid
};
And now let's introduce a little helper function to convert ACGT into 0,1,2,3.
char acid_to_value(char c)
{
switch (c)
{
case 'A': return 0;
case 'C': return 1;
case 'G': return 2;
case 'T': return 3;
}
// ASSERT(false)
return 0;
}
Then the core implementation is to keep "prepending" new hexchars onto the string your are building. If the string is already of an odd length, the code will just "fixup" the front character by converting it from hex to integer, then shifting the new acid value into it, then converting it back to a hex char
extern char fixup(char previous, char acid);
{
char hexchars[16] = { '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' };
char tmp[2] = { previous, '\0' };
unsigned long asnumber = strtol(tmp, nullptr, 16);
asnumber = asnumber & 0x3; // last two bits
asnumber = asnumber | (acid_to_value(acid) << 2);
return hexchars[asnumber];
}
void prepend_nucleic_acid_to_hexstring(struct DNAString* dnastring, char acid)
{
if (dnastring->odd)
{
// find the first char in the string and fix it up hexwise
dnastring->seq[0] = fixup(dnastring->seq[0], acid);
dnastring->odd = false;
}
else
{
size_t currentlength = dnastring->seq ? strlen(dnastring->seq) : 0;
const char* currentstring = dnastring->seq ? dnastring->seq : "";
char* newseq = (char*)calloc(currentlength + 2, sizeof(char)); // +1 for new char and +1 for null char
newseq[0] = acid_to_value(acid) + '0'; // prepend the next hex char
strcpy(newseq + 1, currentstring); // copy the old string into the new string space
free(dnastring->seq);
dnastring->seq = newseq;
dnastring->odd = true;
}
}
Then your DNACreate function is real simple:
struct DNAString DSCreate(const char* mseq)
{
DNAString dnastring = { 0 };
while (*mseq)
{
prepend_nucleic_acid_to_hexstring(&dnastring, *mseq);
mseq++;
}
return dnastring;
}
I don't claim this approach to be efficient since he literally keeps reallocating memory for each char. But it does enable you to have flexability to invoke the prepend function later for additional sequencing.
And then to test it:
int main()
{
struct DNAString dnastring = DSCreate("ACGT");
printf("0x%s\n", dnastring.seq);
return 0;
}

Related

Converting a Numerical String to an Array of Bytes Represented in Binary

I’d like to convert a string containing only integers to an array of bytes, but to be stored efficiently (so no “digits[digitIndex] = string[digitIndex - ‘0’;”). I would like them to be stored like any type is stored: having 256 different possibilities per byte, not only 10 as in the previous, faulty example. It also needs to hold a lot of digits (I’m using an 8-bit parameter as the size, so at least 100 digits I believe). Edit: I also do not want to use any libraries whatsoever for personal reasons.
Here’s an example of what it would look like in a function:
int8_t *stringToBigInt(char *input) {
uint8_t digitsBase10 = strlen(input);
uint8_t bytes = ???; //However many bytes to store the result (max 255 bytes in this case)
int8_t *result = malloc(sizeof(void *) + bytes);
... //Code for setting result to input
return result;
}
And here’s an example of a possible input and output:
Edit: This is a short example that fits into 32-bits only for simplicity; an input could be much more than a 32-bit (and possibly 64-bit) integer
Input: “1234567890”
Output: {01001001, 10010110, 00000010, 11010010}
This is a base conversion from base-10 to base-256, so that’s what you should look for as far as algorithms go. For a simplistic implementation, first implement long division by powers of 2 working on strings. Then convert each of the remainders to a byte: these bytes form your output. You’ll want to repeatedly divide the input, and each string of 8 remainder bit remainders forms the base-256 bytes, starting at the least significant digit (one byte is one base-256 digit). Repeated division means that you feed the quotient of the preceding division to the succeeding one, as the dividend.
There are some cool algorithms that can divide base-10 numbers by powers of two, that operate much faster and are simpler than generalized long division. As a hint, let’s take an example: 510. We divide each digit by two, and feed the remainder*5 to the next digit. Let’s drop the fractional part smaller than 0.5: 510 becomes 2*100 + 5*10 + 5. Then 1*100 + 2*10 + 2 dot 5. Then 6*10 + 1. Then 3*10 dot 5, 2*10 + 5, then 1*10 + 2 dot 5, then 6, then 3, then 2 dot 5, then 1, then 0 dot 5.
For 255 we’d get 127.5, 63.5, 15.5, 7.5, 3.5, 1.5, 0.5.
Division by higher factors of two is possible, but requires repeated long additions. E.g. 33 div 4 = 0*10 + 7rem1 + 0 rem 0.75 (ha!). Divisions by two work better since we use the fact that 10=2*5, and base-n notation can be divided by factors of the base easily, without performing long additions: all operations are limited to two adjacent digits, so it’s a linear time process with cost N in number of digits. But for base conversion to base-256 you do repeated division, so the cost is ~0.5N^2. Easy to implement but costly in computations.
There are better algorithms than that, of course. But the above can be implemented concisely - even in the form of reasonably good quality library functions:
First, let's define an array-of-bytes type, and a way to dump it to human-readable hexadecimal output. For convenience, the object is referred to via the pointer to its data, and the implementation detail doesn't figure anywhere in the interface at all. The constructor new_Bytes zero-initializes the array. There is also a method that treats the array as if it was an array of bits, ordered lest-endian (LSB first), and sets (turns on) a given bit.
// https://github.com/KubaO/stackoverflown/tree/master/questions/decimal-to-binary-54422895
#include <assert.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Bytes Class Interface
typedef uint8_t *Bytes;
typedef const uint8_t *cBytes;
Bytes new_Bytes(size_t size);
size_t Bytes_size(cBytes bytes);
void Bytes_truncate(Bytes bytes, size_t new_size);
void free_Bytes(cBytes bytes);
char *Bytes_to_hex(cBytes bytes);
static inline void Bytes_set_bit(Bytes const bytes, size_t const bit_num) {
bytes[bit_num / 8] |= 1 << (bit_num % 8);
}
Then, the division-by-2 is performed in-place, and the flags provide additional information needed for base conversion - especially the remainder. The conversion from base 10 to base 256 uses the division and returns a new Bytes array.
// Division and Base Conversion Interface
typedef enum {
REMAINDER = 1, /* there is a non-zero remainder */
ZERO = 2, /* the quotient is zero or null */
NULL_DECIMAL = 4, /* the dividend is null or empty */
NON_DECIMALS = 8, /* division was terminated on non-decimal characters */
LEADING_ZERO_COUNT = 16, /* count of leading zeroes in the quotient */
LEADING_ZERO_COUNT_MASK = ~(LEADING_ZERO_COUNT - 1),
CLR_CARRY_MASK = ~REMAINDER,
CLR_ZERO_MASK = ~ZERO,
} DivFlags;
DivFlags divide_by_2(char *decimal);
Bytes base_10_to_256(const char *decimal);
The division operates on the decimal representation, in order from most-significant to least-significant digit. Each digit is merged with the remainder from the prior digit's division, and then is divided by 2. The remainder is carried between digit divisions. After division of the least significant digit, the remainder is output in the flags.
The flags are mostly self-explanatory, but LEADING_ZERO_COUNT isn't quite - and thus the access to it is implemented via accessor functions. LEADING_ZERO_COUNT is the unit of the count of leading zeroes. As the division steps though the decimal representation, it will count the leading zeroes, multiply them by this unit, and merge it with the flags. To extract the count, the flags are divided by the unit.
// Division and Base Conversion Implementation
static inline int leading_zero_count(DivFlags const flags) {
return (flags & LEADING_ZERO_COUNT_MASK) / LEADING_ZERO_COUNT;
}
static inline void saturated_inc_leading_zero_count(DivFlags *flags) {
if ((*flags & LEADING_ZERO_COUNT_MASK) != LEADING_ZERO_COUNT_MASK)
*flags += LEADING_ZERO_COUNT;
}
DivFlags divide_by_2(char *decimal) {
DivFlags flags = ZERO;
if (!decimal) return flags | NULL_DECIMAL;
char c;
while ((c = *decimal)) {
if (c < '0' || c > '9') return flags | NON_DECIMALS;
c = c - '0' + ((flags & REMAINDER) ? 10 : 0);
if (c & 1)
flags |= REMAINDER;
else
flags &= CLR_CARRY_MASK;
c >>= 1;
assert(c >= 0 && c <= 9);
if (c)
flags &= CLR_ZERO_MASK;
else if (flags & ZERO)
saturated_inc_leading_zero_count(&flags);
*decimal++ = c + '0';
}
return flags;
}
Then, the base conversion performs repeated division by 2, and shifts the remainder bits into the byte array, as follows:
First, the base conversion takes a copy of the decimal representation, and allocates the output byte array of the appropriate size.
static void base_10_to_256_impl(Bytes const bytes, char *decimal);
Bytes base_10_to_256(const char *const decimal) {
assert(decimal);
size_t const dec_len = strlen(decimal);
char *const dec_buf = malloc(dec_len + 1);
if (!dec_buf) return NULL;
memcpy(dec_buf, decimal, dec_len + 1);
size_t const BASE_RATIO_NUM = 416, /* ceil(log(10)/log(256)*1000) */
BASE_RATIO_DENOM = 1000;
assert(dec_len <= (SIZE_MAX / BASE_RATIO_NUM));
size_t const len = (size_t)(dec_len * BASE_RATIO_NUM / BASE_RATIO_DENOM) + 1;
Bytes const bytes = new_Bytes(len); // little-endian
if (bytes) base_10_to_256_impl(bytes, dec_buf);
free(dec_buf);
return bytes;
}
Then, in the "meat" of the implementation, the function iterates the output bits, repeatedly dividing the decimal representation by 2, and sets each bit with the value of the remainder bit.
static void base_10_to_256_impl(Bytes const bytes, char *decimal) {
size_t const len = Bytes_size(bytes);
for (size_t bit_num = 0;; bit_num++) {
DivFlags const flags = divide_by_2(decimal);
assert(!(flags & NULL_DECIMAL));
decimal += leading_zero_count(flags);
if (flags & ZERO && !(flags & REMAINDER)) {
size_t const new_len = ((bit_num + 7) / 8);
Bytes_truncate(bytes, new_len);
break;
}
// here, there are still non-zero bits - in the dec[imal] and/or in the carry
assert((bit_num / 8) < len);
if (flags & REMAINDER) Bytes_set_bit(bytes, bit_num);
}
}
We can now add some tests:
// Tests
void check_bytes(const char *const decimal, const char *const bytes_expected,
size_t const bytes_len, const char *const hex_expected) {
cBytes const bytes = base_10_to_256(decimal);
assert(bytes && Bytes_size(bytes) == bytes_len);
assert(memcmp(bytes, bytes_expected, bytes_len) == 0);
char *const hex = Bytes_to_hex(bytes);
assert(hex && strcmp(hex, hex_expected) == 0);
printf("%s\n", hex);
free(hex);
free_Bytes(bytes);
}
int main() {
check_bytes("4294967297" /* 2^32+1 */, "\1\0\0\0\1", 5, "01 00000001");
check_bytes("4294967296" /* 2^32 */, "\0\0\0\0\1", 5, "01 00000000");
check_bytes("4294967295" /* 2^32-1 */, "\xFF\xFF\xFF\xFF", 4, "FFFFFFFF");
check_bytes("16777217" /* 2^24+1 */, "\1\0\0\1", 4, "01000001");
check_bytes("16777216" /* 2^24 */, "\0\0\0\1", 4, "01000000");
check_bytes("16777215" /* 2^24-1 */, "\xFF\xFF\xFF", 3, "FFFFFF");
check_bytes("256", "\0\1", 2, "0100");
check_bytes("255", "\xFF", 1, "FF");
check_bytes("254", "\xFE", 1, "FE");
check_bytes("253", "\xFD", 1, "FD");
check_bytes("3", "\3", 1, "03");
check_bytes("2", "\2", 1, "02");
check_bytes("1", "\1", 1, "01");
check_bytes("0", "\0", 1, "00");
}
The implementation of the Bytes class concludes the example:
// Bytes Implementation
struct BytesImpl {
size_t size;
uint8_t data[1];
};
static const size_t Bytes_header_size = offsetof(struct BytesImpl, data);
_Static_assert(offsetof(struct BytesImpl, data) == sizeof(size_t),
"unexpected layout of struct BytesImpl");
Bytes new_Bytes(size_t size) {
assert(size <= SIZE_MAX - Bytes_header_size);
if (!size) size++;
struct BytesImpl *const impl = calloc(Bytes_header_size + size, 1);
if (!impl) return NULL;
impl->size = size;
return &impl->data[0];
}
static const struct BytesImpl *Bytes_get_const_impl_(cBytes const bytes) {
return (const struct BytesImpl *)(const void *)((const char *)bytes -
Bytes_header_size);
}
static struct BytesImpl *Bytes_get_impl_(Bytes const bytes) {
return (struct BytesImpl *)(void *)((char *)bytes - Bytes_header_size);
}
size_t Bytes_size(cBytes const bytes) { return Bytes_get_const_impl_(bytes)->size; }
void Bytes_truncate(Bytes const bytes, size_t new_size) {
size_t *const size = &Bytes_get_impl_(bytes)->size;
if (!new_size) {
new_size++; // we always leave one byte in the array
bytes[0] = 0;
}
assert(*size);
if (*size <= new_size) return;
*size = new_size;
}
void free_Bytes(cBytes const bytes) {
if (bytes) free((void *)(intptr_t)(const void *)Bytes_get_const_impl_(bytes));
}
char *Bytes_to_hex(cBytes const bytes) {
size_t n = Bytes_size(bytes);
size_t spaces = (n - 1) / 4;
char *const out = malloc(n * 2 + spaces + 1);
if (out)
for (char *o = out; n;) {
uint8_t const c = bytes[n - 1];
snprintf(o, 3, "%02" PRIX8, c);
o += 2;
n--;
if (n && n % 4 == 0) {
assert(spaces);
*o++ = ' ';
spaces--;
}
}
return out;
}

How to do binary addition of real numbers?

Short story. I made a program that does addition for binary integers. I need to make it work for binary real numbers (e.g. 1010.1010(binary)=10.625(decimal)
The input is given as a binary string.
I made a lot of attempts and I couldn't find a simple way to do it. Please help create such a program.
Example: {input: 1010.1010(10.625 in decimal) 0.1(0.5 in decimal)
output: 1011.001 (11.125 in decimal)}
Code:
#include <stdio.h>
#include <string.h>
void bin_add(int c[400], int d[400])
{
int car[400]; //carry
int i = 199;
car[i] = 0;
while (i >= 0)
{
//find carry and shift it left
//find the sum
car[i - 1] = (c[i] & d[i]) | (c[i] & car[i]) | (d[i] & car[i]);
c[i] = (c[i] ^ d[i]) ^ car[i];
printf("car[i-1]=%d c[i]=%d\n", car[i - 1], c[i]);
i--;
}
// printf("\n");
}
int main()
{
int l, l1, i;//l and l1 are lengths
char a[200], b[200]; //a and b are the inputs
int c[200], d[200]; //c and d are used for processing
for (i = 0; i < 200; i++)
{
c[i] = 0;
d[i] = 0;
}
gets(a);
gets(b);
l = strlen(a);
l1 = strlen(b);
for (int i = 0; i < l; i++)
{
c[200 - l + i] = a[i] - 48;
}
////////////////////////////////////////////
for (int i = 0; i < l1; i++)
{
d[200 - l1 + i] = b[i] - 48;
}
////////////////////////////////
bin_add(c, d);
for (i = 0; i < 200; i++)
printf("%d", c[i]);
return 0;
}
What you really want to do, is handle each digit in order of increasing importance. To make that easier, you should implement the following functions:
/* Return the number of fractional bits in bs */
int bs_fractbits(const char *bs);
/* Return the number of integer bits in bs */
int bs_intbits(const char *bs);
/* Return the bit in bs corresponding to value 2**i,
0 if outside the bit string */
int bs_bit(const char *bs, int i);
/* Return -1 if bs is negative,
0 if bs is zero or NULL,
+1 if bs is positive */
int bs_sign(const char *bs);
/* Return -1 if bs1 < bs2,
0 if bs1 == bs2,
+1 if bs1 > bs2. */
int bs_cmp(const char *bs1, const char *bs2);
To support negative values, you'll need to implement both addition and subtraction (of "unsigned" bit strings):
Addition: The result has as many fractional bits as the term that has most fractional bits, and possibly one more integer bit than the term that has most integer bits. Start at the least significant bit in either term, and work your way up to the most significant bit in either term, summing each bit, and keeping the "carry bit" along, just like you'd do by hand. If the carry is nonzero at end, you'll get that one additional bit.
Subtraction: Always subtract smaller from larger. If that changes the order of the terms, negate the result. The result has at most as many fractional bits as the term that has most fractional bits, and at most as many integer bits as the term that has most integer bits. This is just like addition, except you subtract the bits, and instead of "carry bit", you use a "borrow bit". Because you subtract smaller unsigned value from larger unsigned value, the "borrow bit" will be zero at end.
Multiplication: The integer part has the number of integer bits, and the number of fractional bits, as the terms have in total (summed). You can implement the operation as if multiplying two unsigned integer values, and just insert the bit at end. (So that the result has as many fractional bits as the input terms have in total.) This usually involves a double loop, just like in long multiplication by hand.
Note that the same logic also works if you use larger radix instead of 2. The "carry"/"borrow" is a digit, between zero and one less than the radix.
Personally, I'd be very tempted to use a structure to describe each digit string:
typedef struct {
int idigits; /* Number of integral digits before point */
int fdigits; /* Number of fractional digits after point */
int size; /* Number of chars dynamically allocated at data */
char *point; /* Location of decimal point */
char *data; /* Dynamically allocated buffer */
} digitstring;
#define DIGITSTRING_INIT { 0, 0, 0, NULL, NULL }
with an additional flag if negative digit strings are to be supported.
Digit D with numerical value D×Bi, where B is the radix (number of unique digits used) and i being the position of said digit, is located at point[-i] if i < 0 (and -i <= fdigits), or at point[-i-1] if i >= 0 (and i < idigits). point[0] itself is where the decimal point is, if there is one.
For example, if we have string 0100.00, then idigits = 4, fdigits = 2, and the only nonzero digit is at position 2. (Position 0 is on the left side of the decimal point, and -1 on the right side.)
size and data fields allow reuse of the dynamically allocated buffer. Each declaration of a digitstring must be initialized, digitstring value = DIGITSTRING_INIT;, because there is no initialization function; this way you are less likely to leak memory (unless you forget to free a digitstring when no longer needed):
/* Free the specified digit string. */
static inline void digitstring_free(digitstring *ds)
{
if (ds) {
if (ds->data)
free(ds->data);
ds->idigits = 0;
ds->fdigits = 0;
ds->size = 0;
ds->point = NULL;
ds->data = NULL;
}
}
To use the digit string as a C string, you use a helper function to obtain the pointer to the most significant digit in the digit string:
/* Return a pointer to a printable version of the digit string. */
static const char *digitstring_str(const digitstring *ds, const char *none)
{
if (ds && ds->point)
return (const char *)(ds->point - ds->idigits);
else
return none;
}
I've found that rather than crash, it is often useful to pass an extra parameter that is only used for the return value when the return value is otherwise undefined. For example, if you have an initialized digit string foo without any contents, then digitstring_str(&foo, "0") returns the string literal "0".
The main point of the digit string structure is to have accessor functions that get and set each individual digit:
/* Get the value of a specific digit. */
static inline unsigned int digitstring_get(const digitstring *ds, const int position)
{
if (ds) {
if (position < 0) {
if (-position <= ds->fdigits)
return digit_to_value(ds->point[-position]);
else
return 0;
} else {
if (position < ds->idigits)
return digit_to_value(ds->point[-position-1]);
else
return 0;
}
} else
return 0;
}
/* Set the value of a specific digit. */
static inline void digitstring_set(digitstring *ds, const int position, const unsigned int value)
{
if (!ds) {
fprintf(stderr, "digitstring_set(): NULL digitstring specified.\n");
exit(EXIT_FAILURE);
} else
if (position < 0) {
if (-position > ds->fdigits) {
fprintf(stderr, "digitstring_set(): Digit position underflow (in fractional part).\n");
exit(EXIT_FAILURE);
}
ds->point[-position] = value_to_digit(value);
} else {
if (position >= ds->idigits) {
fprintf(stderr, "digitstring_set(): Digit position overflow (in integer part).\n");
exit(EXIT_FAILURE);
}
ds->point[-position-1] = value_to_digit(value);
}
}
Above, value_to_digit() is a helper function that converts a numerical value to the corresponding character, and digit_to_value() converts a character to the corresponding numerical value.
All operations (from parsing to arithmetic operators) really need a "constructor", that creates a new digit string with sufficient number of digits. (The number of digits is known beforehand for each operation, and depends only on the number of significant digits in the terms.) For this, I created a function that constructs a zero of desired size:
/* Clear the specified digit string to zero. */
static inline void digitstring_zero(digitstring *ds, int idigits, int fdigits)
{
int size;
char *data;
if (!ds) {
fprintf(stderr, "digitstring_zero(): No digitstring specified.\n");
exit(EXIT_FAILURE);
}
/* Require at least one integral digit. */
if (idigits < 1)
idigits = 1;
if (fdigits < 0)
fdigits = 0;
/* Total number of chars needed, including decimal point
and string-terminating nul char. */
size = idigits + 1 + fdigits + 1;
/* Check if dynamically allocated buffer needs resizing. */
if (ds->size < size) {
if (ds->data)
data = realloc(ds->data, size);
else
data = malloc(size);
if (!data) {
fprintf(stderr, "digitstring_zero(): Out of memory.\n");
exit(EXIT_FAILURE);
}
ds->data = data;
ds->size = size;
} else {
data = ds->data;
size = ds->size;
}
/* Fill it with zeroes. */
memset(data, value_to_digit(0), idigits + 1 + fdigits);
/* Pad the unused space with nul chars, terminating the string. */
memset(data + idigits + 1 + fdigits, '\0', size - idigits - 1 - fdigits);
/* Assign the decimal point. */
ds->point = data + idigits;
/* If there are no decimals, no need for a decimal point either. */
if (fdigits > 0)
ds->point[0] = decimal_point;
else
ds->point[0] = '\0';
/* After setting the desired digits, use digitstring_trim(). */
ds->idigits = idigits;
ds->fdigits = fdigits;
}
It will ensure the digit string has enough room for the specified number of digits, reallocating its dynamically allocated buffer if necessary, reusing it if already large enough.
The idea is that to implement an operation, you first find out the maximum number of integral and fractional digits the result can have. You use the above to create the result digit string, then digitstring_set() to set each digit to their respective values. You will typically operate in increasing digit significance, which means increasing digit "positions".
If we have additional helper functions int digits(const char *src), which returns the number of consecutive valid digit characters starting at src, and int decimal_points(const char *src), which returns 1 if src points to a decimal point, and 0 otherwise, we can parse input strings into digit strings using
/* Parse a string into a digit string, returning the pointer
to the first unparsed character, or NULL if an error occurs. */
static const char *digitstring_parse(digitstring *ds, const char *src)
{
const int zero = value_to_digit(0);
const char *idigit, *fdigit;
int idigits, fdigits, fextra, n;
/* Fail if nothing to parse. */
if (!src)
return NULL;
/* Skip leading whitespace. */
while (isspace((unsigned char)(*src)))
src++;
/* Fail if nothing to parse. */
if (*src == '\0')
return NULL;
/* Scan integer digits. */
idigit = src;
src += digits(src);
idigits = (int)(src - idigit);
/* Decimal point? */
fextra = 0;
n = decimal_points(src);
if (n > 0) {
src += n;
/* Scan fractional digits. */
fdigit = src;
src += digits(src);
fdigits = (int)(src - fdigit);
if (fdigits < 1)
fextra = 1;
} else {
fdigit = src;
fdigits = 0;
}
/* No digits? */
if (idigit == 0 && fdigit == 0)
return NULL;
/* Trim leading zeroes. */
while (idigits > 1 && *idigit == zero) {
idigits--;
idigit++;
}
/* Trim trailing zeroes. */
while (fdigits > 1 && fdigit[fdigits - 1] == zero)
fdigits--;
/* Create the necessary digit string, */
digitstring_zero(ds, idigits, fdigits + fextra);
/* copy the integer digits, if any, */
if (idigits > 0)
memcpy(ds->point - idigits, idigit, idigits);
/* and the fractional digits, if any. */
if (fdigits > 0)
memcpy(ds->point + 1, fdigit, fdigits);
/* Return a pointer to the first unparsed character. */
return src;
}
After updating its digits, one can call a helper function to remove any extra leading zeroes:
static inline void digitstring_ltrim(digitstring *ds)
{
if (ds && ds->point) {
const int zero = value_to_digit(0);
while (ds->idigits > 1 && ds->point[-ds->idigits] == zero)
ds->idigits--;
}
}
Adding two (unsigned) digit strings, possibly reusing one of the terms, is now quite simple to implement:
static void digitstring_add(digitstring *to, const digitstring *src1, const digitstring *src2)
{
digitstring result = DIGITSTRING_INIT;
unsigned int carry = 0;
int i, idigits, fdigits;
if (!to || !src1 || !src2) {
fprintf(stderr, "digitstring_add(): NULL digitstring specified.\n");
exit(EXIT_FAILURE);
}
/* For addition, the result has as many digits
as the longer source term. */
idigits = (src1->idigits >= src2->idigits) ? src1->idigits : src2->idigits;
fdigits = (src1->fdigits >= src2->fdigits) ? src1->fdigits : src2->fdigits;
/* Result needs possibly one more integer digit,
in case carry overflows. */
digitstring_zero(&result, idigits + 1, fdigits);
/* Addition loop, in order of increasing digit significance. */
for (i = -fdigits; i < idigits; i++) {
const unsigned int sum = digitstring_get(src1, i)
+ digitstring_get(src2, i)
+ carry;
digitstring_set(&result, i, sum % RADIX);
carry = sum / RADIX;
}
digitstring_set(&result, idigits, carry);
/* Trim leading zeroes. */
digitstring_ltrim(&result);
/* At this point, we can discard the target, even if it is actually
one of the sources, and copy the result to it. */
digitstring_free(to);
*to = result;
}
where RADIX is the radix used (the number of unique digits, 2 for binary). Pay extra attention to the digit loop. -fdigits is the least significant position in the result, and idigits-1 the most significant position. We need the accessor functions, because the source terms might not contain those digits at all (they are logically zero then).
These functions have been tested to work on both binary and octal number strings. I like this implementation, because it omits the decimal point if all terms are integers (so you get 12 + 33 = 45), but (due to fextra in digitstring_parse()) if any of the terms have a decimal point, then the result will have at least one fractional digit (so 12. + 33 = 45.0).
After all the beautiful ideas in Animals' answer I felt the strange urge, to present my own solution:
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#define MAX(x, y) ((x) > (y) ? (x) : (y))
size_t gpp(char const *s)
{
char *n = strchr(s, '.');
return n ? n - s + 1 : 0;
}
char* bin_add(char const *a, char const *b)
{
char const *inp[] = { a, b };
size_t ll[] = { strlen(a), strlen(b) };
size_t pp[] = { gpp(a), gpp(b) };
size_t OO[2], off[2];
for (size_t i = 0; i < 2; ++i) {
OO[i] = pp[i] ? pp[i] - 1 : ll[i];
pp[i] = pp[i] ? ll[i] - pp[i] : 0;}
for (size_t i = 0; i < 2; ++i)
off[i] = OO[i] < OO[!i] ? OO[!i] - OO[i] : 0;
size_t ML = MAX(OO[0], OO[1]) + MAX(pp[0], pp[1]) + (!!pp[0] || !!pp[1]);
char *Ol = calloc(ML + 2, 1);
if(!Ol) return Ol;
char ops[2];
int xc = 0;
size_t lO = ML;
unsigned cc[2] = { 0 };
for (size_t i = ML; i; --i) {
bool pt = false;
for (size_t l = 0; l < 2; ++l) {
ops[l] = i <= ll[l] + off[l] && i - off[l] - 1
< ll[l] ? inp[l][i - off[l] - 1] : '0';
if (ops[l] == '.') {
if (cc[l]) {
free(Ol);
return NULL;
}
pt = true;
++cc[l];
}
ops[l] -= '0';
}
if (pt) {
Ol[i] = '.';
continue;
}
if ((Ol[i] = ops[0] + ops[1] + xc) > 1) {
Ol[i] = 0;
xc = 1;
}
else xc = 0;
lO = (Ol[i] += '0') == '1' ? i : lO;
}
if((Ol[0] = '0' + xc) == '1') return Ol;
for (size_t i = 0; i <= ML - lO + 1; ++i)
Ol[i] = Ol[lO + i];
return Ol;
}
int main(void)
{
char a[81], b[81];
while (scanf(" %80[0.1] %80[0.1]", a, b) & 1 << 1) {
char *c = bin_add(a, b);
if (!c && errno == ENOMEM) {
fputs("Not enough memory :(\n\n", stderr);
return EXIT_FAILURE;
}
else if (!c) {
fputs("Input error :(\n\n", stderr);
goto clear;
}
char* O[] = { a, b, c };
size_t lO[3], Ol = 0;
for (size_t i = 0; i < 3; ++i) {
lO[i] = gpp(O[i]);
lO[i] = lO[i] ? lO[i] : strlen(i[O]) + 1;
Ol = lO[i] > Ol ? lO[i] : Ol;
}
putchar('\n');
for (size_t i = 0; i < 3; ++i) {
for (size_t l = 0; l < Ol - lO[i]; ++l, putchar(' '));
puts(O[i]);
}
putchar('\n');
free(c);
clear :{ int c; while ((c = getchar()) != '\n' && c != EOF); }
}
}
Sample Output:
11001001 .11001001
11001001
.11001001
11001001.11001001
11001001 1010
11001001
1010
11010011
111111 1
111111
1
1000000
1010101 010111001.0101110101010
1010101
010111001.0101110101010
100001110.0101110101010
1001001.010111010101 10100101100.10010111101
1001001.010111010101
10100101100.10010111101
10101110101.111000001111
. .
.
.
0
.. .
Input error :(
A
Press any key to continue . . .
I contemplated wheter I should ask for a review at codereview. But I think I schould rather not.
There are two answers, depending upon whether you desire fixed- or floating- point arithmetic.
The first issue is reading the number. strtol() is your friend here:
char input[BUFFER_SIZE];
char * tmp;
long integral, fractional;
fgets(input, BUFFER_SIZE-1, stdin);
integral = strtol(input, &tmp, 2); /* Read BINARY integral part */
tmp++; /* Pass over the binary point. You may want to check that it is actually a dot */
fractional = strtol(tmp, null, 2); /* Read BINARY fractional part */
The next issue is figuring out how you will do the arithmetic. fractional must be bit-shifted an amount depending on how many digits past the point were provided and your desired precision. Fixed point arithmetic is simple: fractional <<= FRAC_BITS - strlen(tmp) then add the fractional parts together. Mask by ((1<<FRAC_BITS)-1) for the fractional part of the sum, shift the remaining bits and add them to the integral parts for the integral part of the sum. Floating-point is a little more finicky, but not too much harder.
For real numbers, convert non-fraction and fraction part to decimal, do the addition and print it as binary. This will require function to convert a number to binary string. Just a note that real numbers are float numbers in C and they are represented in binary with mantessa form like 2e^3 which is 2 multiplied by exponent to the power of 3.

String to space separated integer

I have a string with integers in it (in descending order) then output should be space separated integers. Integers are not negative.
INPUT: 9876543 OUTPUT: 9 8 7 6 5 4 3
INPUT: 109876543 OUTPUT: 10 9 8 7 6 5 4 3
INPUT: 400399398397 OUTPUT: 400 399 398 397
So I tried using sscanf() but was not able to get the desired result, this is the code I tried:
fgets(s1,100,stdin); // Get string
while(sscanf(data1,"%d%n",&m1,&len)==1){
b[i] = m1; // Store the integers in the array
data1 += len;
i += 1;
}
How can I achieve the desired result?
Continuing from the comments and the additional answer, to parse and separate the string into a space separated series of integers decreasing by one, there are probably a number of differing approaches you can take. The biggest design question is whether you start with the length of the input string, cut it in half and then work backwards decreasing the number of digits you check for adjacent values by one -- or whether you start at the beginning and work toward the end incrementing the number of digits being considered along the way.
Regardless of the direction you choose, the twist is handling/checking adjacent values with a different number of digits. Your second example, 109876543, hits at the heart of this twist, where you must code a way to check the 2-digit value 10 against the next single-digit value in the series 9. There is just no pretty way to do this. One reasonable way is to simply compute the smallest number that can be represented by n-digits (e.g. 10, 100, 1000, ...). Essentially 10^(n-1) (where we let int expn = n - 1;). If your first value v1 is equal to 10^(n-1), then reduce the number of characters you consider for the next smallest values. Something like the following:
while (expn--) /* loop to build 10 ^ (n-1) */
x10 *= 10; /* compute 10 ^ (n-1), 10, 100 */
if (v1 == x10) /* compare against v1 */
n--; /* reduce t2 by 1-char/digit */
The remainder of the task is just basically a brute force check with a minimum number of validations necessary to protect array bounds, while handling adding values to your integer array (or however you want to store values until you validate or invalidate the remaining characters in the string) while you work your way through the remaining characters.
Putting all the pieces together, and noting there are many, many ways to code this, this example being only one, you could do something similar to the following. Note, the code simply handles the conversion from ASCII to int in the single-digit series case by subtracting '0' from the character value, for multi-digit conversions, strtol is used with a validation check of errno. The code works from beginning to end of the string incrementing the number of digits checked until the end of the string is reached. If a solution is found, a space-separated list of integers is output, otherwise, "no solution found." is output. The code is commented to help you work though it.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#define MAXI 256
int main (int argc, char **argv) {
int a[MAXI] = {0}, i = 1, idx = 0, n = 1, len;
char *p = argc > 1 ? argv[1] : "9876543";
printf ("string : %s\n", p); /* original input string */
len = (int)strlen (p); /* get length */
while (i + n <= len && n < len) { /* loop until conditions met */
if (n >= MAXI) { /* protect int array bounds */
fprintf (stderr, "warning: array full, %d elements filled.\n", n);
break;
}
if (n == 1) { /* handle single digits series */
if (p[i - 1] == p[i] + 1) { /* previous equal current + 1? */
if (!idx) /* if array index == 0 */
a[idx++] = p[i - 1] - '0'; /* store first integer */
a[idx++] = p[i] - '0'; /* store current integer */
i++; /* increment string index */
}
else
n++, i = n, idx = 0; /* increment n-digits to check */
} /* set i = n, zero array index */
else { /* handle multi-digit values */
char t1[MAXI] = "", t2[MAXI] = ""; /* tmp strings for values */
int v1 = 0, v2 = 0, /* tmp for coverted values */
expn = n - 1, x10 = 1, /* 10 ^ expn for n-- test */
norig = n; /* n to restore on no match */
strncpy (t1, p + i - n, n); /* copy n-digits for 1st value */
errno = 0;
v1 = (int) strtol (t1, NULL, 10); /* convert to int/validate */
if (errno) {
fprintf (stderr, "error: failed conversion, i: %d, n: %d\n",
i, n);
return 1;
}
while (expn--) /* loop to build 10 ^ (n-1) */
x10 *= 10; /* compute 10 ^ (n-1), 10, 100 */
if (v1 == x10) /* compare against v1 */
n--; /* reduce t2 by 1-char/digit */
strncpy (t2, p + i, n); /* copy n-digits for 2nd value */
errno = 0;
v2 = (int) strtol (t2, NULL, 10); /* convert to int/validate */
if (errno) {
fprintf (stderr, "error: failed conversion, i: %d, n: %d\n",
i, n);
return 1;
}
if (v1 == v2 + 1) { /* check decreasing values */
if (!idx) /* if array index == 0 */
a[idx++] = v1; /* store first integer */
a[idx++] = v2; /* store current integer */
i += n; /* increment string index */
}
else {
n += n < norig ? 2 : 1; /* reset n if no match */
i = n; /* set string index to n */
idx = 0; /* reset array index to 0 */
}
}
}
if (idx && n < len) { /* if array has values, output */
printf ("integers :");
for (int j = 0; j < idx; j++)
printf (" %*d", n, a[j]);
putchar ('\n');
}
else
printf ("no solution found.\n");
return 0;
}
note: not all corner-cases have been evaluated and the input is presumed to contain only digits. (you are free to add the check for isdigit if you expect otherwise), further testing on your part should be done to satisfy yourself any odd-ball cases are sufficiently covered.
Example Use/Output
$ ./bin/intsepdecr
string : 9876543
integers : 9 8 7 6 5 4 3
$ ./bin/intsepdecr 109876543
string : 109876543
integers : 10 9 8 7 6 5 4 3
$ ./bin/intsepdecr 400399398397
string : 400399398397
integers : 400 399 398 397
$ ./bin/intsepdecr 400399398396
string : 400399398396
no solution found.
$ ./bin/intsepdecr 101176543
string : 101176543
no solution found.
Look things over and let me know if you have any further questions.
I can give a basic algorithm of how to go about this problem.
Convert the first x digits of the string into an integer (Initially x=1). You can even use a simple function like strtoi for this. (Let us say this number is N)
Find no of digits in N-1. Convert that many digits next into an integer.
Is the Converted value equal to N-1. If So, Continue and convert the rest of the string by repeating steps 2 & 3. (Need to Set N = N-1)
If not equal to N-1, Repeat from step 1, Only this time increment the number of digits converted.
Exit the program and declare a malformed string if you are not able to convert the entire string and x is greater than half the length of string.
Here is something i just whipped up i tested it against your cases and it works
Mind it is in c++ but it would be simple to convert to c.
string spaceNum(string in)
{
string numConvBuff;
size_t matchSize = 0;
if (in.size() == 1)
{
return to_string(in[0] - '0');
}
for (int i = 0; i < in.size() / 2; i++)
{
numConvBuff = in.substr(0, i + 1);
unsigned int numRes = stoul(numConvBuff) - 1;
string numResStr = to_string(numRes);
string n = in.substr(i + 1, numResStr.length());
if(numRes == stoul(n))
matchSize = i+1;
}
if (matchSize)
{
string out = in.substr(0, matchSize);
unsigned int numRes = stoul(out);
for (size_t i = matchSize; i < in.length();)
{
numRes--;
string numResStr = to_string(numRes);
string n = in.substr(i, numResStr.length());
out += " " + n;
i += numResStr.length();
}
return out;
}
return "";
}

Converting ascii hex string to byte array

I have a char array say char value []={'0','2','0','c','0','3'};
I want to convert this into a byte array like unsigned char val[]={'02','0c','03'}
This is in an embedded application so i can't use string.h functions. How can i do this?
Sicne you talk about an embedded application I assume that you want to save the numbers as values and not as strings/characters. So if you just want to store your character data as numbers (for example in an integer), you can use sscanf.
This means you could do something like this:
char source_val[] = {'0','A','0','3','B','7'} // Represents the numbers 0x0A, 0x03 and 0xB7
uint8 dest_val[3]; // We want to save 3 numbers
for(int i = 0; i<3; i++)
{
sscanf(&source_val[i*2],"%x%x",&dest_val[i]); // Everytime we read two chars --> %x%x
}
// Now dest_val contains 0x0A, 0x03 and 0xB7
However if you want to store it as a string (like in your example), you can't use unsigned char
since this type is also just 8-Bit long, which means it can only store one character. Displaying 'B3' in a single (unsigned) char does not work.
edit: Ok according to comments, the goal is to save the passed data as a numerical value. Unfortunately the compiler from the opener does not support sscanf which would be the easiest way to do so. Anyhow, since this is (in my opinion) the simplest approach, I will leave this part of the answer at it is and try to add a more custom approach in this edit.
Regarding the data type, it actually doesn't matter if you have uint8. Even though I would advise to use some kind of integer data type, you can also store your data into an unsigned char. The problem here is, that the data you get passed, is a character/letter, that you want to interpret as a numerical value. However, the internal storage of your character differs. You can check the ASCII Table, where you can check the internal values for every character.
For example:
char letter = 'A'; // Internally 0x41
char number = 0x61; // Internally 0x64 - represents the letter 'a'
As you can see there is also a differnce between upper an lower case.
If you do something like this:
int myVal = letter; //
myVal won't represent the value 0xA (decimal 10), it will have the value 0x41.
The fact you can't use sscanf means you need a custom function. So first of all we need a way to conver one letter into an integer:
int charToInt(char letter)
{
int myNumerical;
// First we want to check if its 0-9, A-F, or a-f) --> See ASCII Table
if(letter > 47 && letter < 58)
{
// 0-9
myNumerical = letter-48;
// The Letter "0" is in the ASCII table at position 48 -> meaning if we subtract 48 we get 0 and so on...
}
else if(letter > 64 && letter < 71)
{
// A-F
myNumerical = letter-55
// The Letter "A" (dec 10) is at Pos 65 --> 65-55 = 10 and so on..
}
else if(letter > 96 && letter < 103)
{
// a-f
myNumerical = letter-87
// The Letter "a" (dec 10) is at Pos 97--> 97-87 = 10 and so on...
}
else
{
// Not supported letter...
myNumerical = -1;
}
return myNumerical;
}
Now we have a way to convert every single character into a number. The other problem, is to always append two characters together, but this is rather easy:
int appendNumbers(int higherNibble, int lowerNibble)
{
int myNumber = higherNibble << 4;
myNumber |= lowerNibbler;
return myNumber;
// Example: higherNibble = 0x0A, lowerNibble = 0x03; -> myNumber 0 0xA3
// Of course you have to ensure that the parameters are not bigger than 0x0F
}
Now everything together would be something like this:
char source_val[] = {'0','A','0','3','B','7'} // Represents the numbers 0x0A, 0x03 and 0xB7
int dest_val[3]; // We want to save 3 numbers
int temp_low, temp_high;
for(int i = 0; i<3; i++)
{
temp_high = charToInt(source_val[i*2]);
temp_low = charToInt(source_val[i*2+1]);
dest_val[i] = appendNumbers(temp_high , temp_low);
}
I hope that I understood your problem right, and this helps..
If you have a "proper" array, like value as declared in the question, then you loop over the size of it to get each character. If you're on a system which uses the ASCII alphabet (which is most likely) then you can convert a hexadecimal digit in character form to a decimal value by subtracting '0' for digits (see the linked ASCII table to understand why), and subtracting 'A' or 'a' for letters (make sure no letters are higher than 'F' of course) and add ten.
When you have the value from the first hexadeximal digit, then convert the second hexadecimal digit the same way. Multiply the first value by 16 and add the second value. You now have single byte value corresponding to two hexadecimal digits in character form.
Time for some code examples:
/* Function which converts a hexadecimal digit character to its integer value */
int hex_to_val(const char ch)
{
if (ch >= '0' && ch <= '9')
return ch - '0'; /* Simple ASCII arithmetic */
else if (ch >= 'a' && ch <= 'f')
return 10 + ch - 'a'; /* Because hex-digit a is ten */
else if (ch >= 'A' && ch <= 'F')
return 10 + ch - 'A'; /* Because hex-digit A is ten */
else
return -1; /* Not a valid hexadecimal digit */
}
...
/* Source character array */
char value []={'0','2','0','c','0','3'};
/* Destination "byte" array */
char val[3];
/* `i < sizeof(value)` works because `sizeof(char)` is always 1 */
/* `i += 2` because there is two digits per value */
/* NOTE: This loop can only handle an array of even number of entries */
for (size_t i = 0, j = 0; i < sizeof(value); i += 2, ++j)
{
int digit1 = hex_to_val(value[i]); /* Get value of first digit */
int digit2 = hex_to_val(value[i + 1]); /* Get value of second digit */
if (digit1 == -1 || digit2 == -1)
continue; /* Not a valid hexadecimal digit */
/* The first digit is multiplied with the base */
/* Cast to the destination type */
val[j] = (char) (digit1 * 16 + digit2);
}
for (size_t i = 0; i < 3; ++i)
printf("Hex value %lu = %02x\n", i + 1, val[i]);
The output from the code above is
Hex value 1 = 02
Hex value 2 = 0c
Hex value 3 = 03
A note about the ASCII arithmetic: The ASCII value for the character '0' is 48, and the ASCII value for the character '1' is 49. Therefore '1' - '0' will result in 1.
It's easy with strtol():
#include <stdlib.h>
#include <assert.h>
void parse_bytes(unsigned char *dest, const char *src, size_t n)
{
/** size 3 is important to make sure tmp is \0-terminated and
the initialization guarantees that the array is filled with zeros */
char tmp[3] = "";
while (n--) {
tmp[0] = *src++;
tmp[1] = *src++;
*dest++ = strtol(tmp, NULL, 16);
}
}
int main(void)
{
unsigned char d[3];
parse_bytes(d, "0a1bca", 3);
assert(d[0] == 0x0a);
assert(d[1] == 0x1b);
assert(d[2] == 0xca);
return EXIT_SUCCESS;
}
If that is not available (even though it is NOT from string.h), you could do something like:
int ctohex(char c)
{
if (c >= '0' && c <= '9') {
return c - '0';
}
switch (c) {
case 'a':
case 'A':
return 0xa;
case 'b':
case 'B':
return 0xb;
/**
* and so on
*/
}
return -1;
}
void parse_bytes(unsigned char *dest, const char *src, size_t n)
{
while (n--) {
*dest = ctohex(*src++) * 16;
*dest++ += ctohex(*src++);
}
}
Assuming 8-bit bytes (not actually guaranteed by the C standard, but ubiquitous), the range of `unsigned char` is 0..255, and the range of `signed char` is -128..127. ASCII was developed as a 7-bit code using values in the range 0-127, so the same value can be represented by both `char` types.
For the now discovered task of converting a counted hex-string from ascii to unsigned bytes, here's my take:
unsigned int atob(char a){
register int b;
b = a - '0'; // subtract '0' so '0' goes to 0 .. '9' goes to 9
if (b > 9) b = b - ('A' - '0') + 10; // too high! try 'A'..'F'
if (b > 15) b = b - ('a' - 'A); // too high! try 'a'..'f'
return b;
}
void myfunc(const char *in, int n){
int i;
unsigned char *ba;
ba=malloc(n/2);
for (i=0; i < n; i+=2){
ba[i/2] = (atob(in[i]) << 4) | atob(in[i+1]);
}
// ... do something with ba
}

Display the binary representation of a number in C? [duplicate]

This question already has answers here:
Closed 11 years ago.
Possible Duplicate:
Is there a printf converter to print in binary format?
Still learning C and I was wondering:
Given a number, is it possible to do something like the following?
char a = 5;
printf("binary representation of a = %b",a);
> 101
Or would i have to write my own method to do the transformation to binary?
There is no direct way (i.e. using printf or another standard library function) to print it. You will have to write your own function.
/* This code has an obvious bug and another non-obvious one :) */
void printbits(unsigned char v) {
for (; v; v >>= 1) putchar('0' + (v & 1));
}
If you're using terminal, you can use control codes to print out bytes in natural order:
void printbits(unsigned char v) {
printf("%*s", (int)ceil(log2(v)) + 1, "");
for (; v; v >>= 1) printf("\x1b[2D%c",'0' + (v & 1));
}
Based on dirkgently's answer, but fixing his two bugs, and always printing a fixed number of digits:
void printbits(unsigned char v) {
int i; // for C89 compatability
for(i = 7; i >= 0; i--) putchar('0' + ((v >> i) & 1));
}
Yes (write your own), something like the following complete function.
#include <stdio.h> /* only needed for the printf() in main(). */
#include <string.h>
/* Create a string of binary digits based on the input value.
Input:
val: value to convert.
buff: buffer to write to must be >= sz+1 chars.
sz: size of buffer.
Returns address of string or NULL if not enough space provided.
*/
static char *binrep (unsigned int val, char *buff, int sz) {
char *pbuff = buff;
/* Must be able to store one character at least. */
if (sz < 1) return NULL;
/* Special case for zero to ensure some output. */
if (val == 0) {
*pbuff++ = '0';
*pbuff = '\0';
return buff;
}
/* Work from the end of the buffer back. */
pbuff += sz;
*pbuff-- = '\0';
/* For each bit (going backwards) store character. */
while (val != 0) {
if (sz-- == 0) return NULL;
*pbuff-- = ((val & 1) == 1) ? '1' : '0';
/* Get next bit. */
val >>= 1;
}
return pbuff+1;
}
Add this main to the end of it to see it in operation:
#define SZ 32
int main(int argc, char *argv[]) {
int i;
int n;
char buff[SZ+1];
/* Process all arguments, outputting their binary. */
for (i = 1; i < argc; i++) {
n = atoi (argv[i]);
printf("[%3d] %9d -> %s (from '%s')\n", i, n,
binrep(n,buff,SZ), argv[i]);
}
return 0;
}
Run it with "progname 0 7 12 52 123" to get:
[ 1] 0 -> 0 (from '0')
[ 2] 7 -> 111 (from '7')
[ 3] 12 -> 1100 (from '12')
[ 4] 52 -> 110100 (from '52')
[ 5] 123 -> 1111011 (from '123')
#include<iostream>
#include<conio.h>
#include<stdlib.h>
using namespace std;
void displayBinary(int n)
{
char bistr[1000];
itoa(n,bistr,2); //2 means binary u can convert n upto base 36
printf("%s",bistr);
}
int main()
{
int n;
cin>>n;
displayBinary(n);
getch();
return 0;
}
Use a lookup table, like:
char *table[16] = {"0000", "0001", .... "1111"};
then print each nibble like this
printf("%s%s", table[a / 0x10], table[a % 0x10]);
Surely you can use just one table, but it will be marginally faster and too big.
There is no direct format specifier for this in the C language. Although I wrote this quick python snippet to help you understand the process step by step to roll your own.
#!/usr/bin/python
dec = input("Enter a decimal number to convert: ")
base = 2
solution = ""
while dec >= base:
solution = str(dec%base) + solution
dec = dec/base
if dec > 0:
solution = str(dec) + solution
print solution
Explained:
dec = input("Enter a decimal number to convert: ") - prompt the user for numerical input (there are multiple ways to do this in C via scanf for example)
base = 2 - specify our base is 2 (binary)
solution = "" - create an empty string in which we will concatenate our solution
while dec >= base: - while our number is bigger than the base entered
solution = str(dec%base) + solution - get the modulus of the number to the base, and add it to the beginning of our string (we must add numbers right to left using division and remainder method). the str() function converts the result of the operation to a string. You cannot concatenate integers with strings in python without a type conversion.
dec = dec/base - divide the decimal number by the base in preperation to take the next modulo
if dec > 0:
solution = str(dec) + solution - if anything is left over, add it to the beginning (this will be 1, if anything)
print solution - print the final number
This code should handle your needs up to 64 bits.
char* pBinFill(long int x,char *so, char fillChar); // version with fill
char* pBin(long int x, char *so); // version without fill
#define width 64
char* pBin(long int x,char *so)
{
char s[width+1];
int i=width;
s[i--]=0x00; // terminate string
do
{ // fill in array from right to left
s[i--]=(x & 1) ? '1':'0'; // determine bit
x>>=1; // shift right 1 bit
} while( x &gt 0);
i++; // point to last valid character
sprintf(so,"%s",s+i); // stick it in the temp string string
return so;
}
char* pBinFill(long int x,char *so, char fillChar)
{ // fill in array from right to left
char s[width+1];
int i=width;
s[i--]=0x00; // terminate string
do
{
s[i--]=(x & 1) ? '1':'0';
x>>=1; // shift right 1 bit
} while( x > 0);
while(i>=0) s[i--]=fillChar; // fill with fillChar
sprintf(so,"%s",s);
return so;
}
void test()
{
char so[width+1]; // working buffer for pBin
long int val=1;
do
{
printf("%ld =\t\t%#lx =\t\t0b%s\n",val,val,pBinFill(val,so,0));
val*=11; // generate test data
} while (val < 100000000);
}
Output:
00000001 = 0x000001 = 0b00000000000000000000000000000001
00000011 = 0x00000b = 0b00000000000000000000000000001011
00000121 = 0x000079 = 0b00000000000000000000000001111001
00001331 = 0x000533 = 0b00000000000000000000010100110011
00014641 = 0x003931 = 0b00000000000000000011100100110001
00161051 = 0x02751b = 0b00000000000000100111010100011011
01771561 = 0x1b0829 = 0b00000000000110110000100000101001
19487171 = 0x12959c3 = 0b00000001001010010101100111000011
You have to write your own transformation. Only decimal, hex and octal numbers are supported with format specifiers.

Resources