I am given the mantissa, exponent, and sign, and I have to convert them into the corresponding float. I am using 22 bits for the mantissa, 9 bits for the exponent, and 1 bit for the sign.
I conceptually know how to convert them into a float, first adjusting the exponent back to its place, then converting the resulting number back into a float, but I'm having trouble implementing this in C. I saw this thread, but I couldn't understand the code, and I'm not sure the answer is even right. Can anyone point me in the right direction? I need to code it in C.
Edit: I've made some progress by first converting the mantissa into binary, then adjusting the radix point in the binary, then converting the binary fraction back into the actual float. I based my conversion functions off these two GeeksforGeeks pages (one, two). But it seems like doing all these binary conversions is the long and hard way. The link above apparently does it in very few steps by using the >> operator, but I don't understand exactly how that results in a float.
Here is a program with comments explaining the decoding:
#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Define constants describing the floating-point encoding.
enum
{
    SignificandBits = 22,  // Number of bits in significand field.
    ExponentBits    =  9,  // Number of bits in exponent field.
    ExponentMaximum = (1 << ExponentBits) - 1,
    ExponentBias    = (1 << (ExponentBits - 1)) - 1,
};
/* Given the contents of the sign, exponent, and significand fields that
   encode a floating-point number following IEEE-754 patterns for binary
   floating-point, return the encoded number.

   "double" is used for the return type as not all values represented by the
   sample format (9 exponent bits, 22 significand bits) will fit in a "float"
   when it is the commonly used IEEE-754 binary32 format.
*/
double DecodeCustomFloat(
    unsigned SignField, uint32_t ExponentField, uint32_t SignificandField)
{
    /* We are given a significand field as an integer, but it is used as the
       value of a binary numeral consisting of “.” followed by the
       significand bits. That value equals the integer divided by 2 to the
       power of the number of significand bits. Define a constant with that
       value to be used for converting the significand field to the
       represented value.
    */
    static const double SignificandRatio = (uint32_t) 1 << SignificandBits;

    /* Decode the sign field:

       If the sign bit is 0, the sign is +, for which we use +1.
       If the sign bit is 1, the sign is -, for which we use -1.
    */
    double Sign = SignField ? -1. : +1.;

    // Dispatch to handle the different categories of exponent field.
    switch (ExponentField)
    {
        /* When the exponent field is all ones, the value represented is a
           NaN or infinity:

           If the significand field is zero, it is an infinity.
           Otherwise, it is a NaN. In either case, the sign should be
           preserved.

           Note this is a simple demonstration implementation that does not
           preserve the bits in the significand field of a NaN -- we just
           return the generic NAN without attempting to set its significand
           bits.
        */
        case ExponentMaximum:
        {
            return Sign * (SignificandField ? NAN : INFINITY);
        }

        /* When the exponent field is not all zeros or all ones, the value
           represented is a normal number:

           The exponent represented is ExponentField - ExponentBias, and
           the significand represented is the value given by the binary
           numeral “1.” followed by the significand bits.
        */
        default:
        {
            int Exponent = ExponentField - ExponentBias;
            double Significand = 1 + SignificandField / SignificandRatio;
            return Sign * ldexp(Significand, Exponent);
        }

        /* When the exponent field is zero, the value represented is
           subnormal:

           The exponent represented is 1 - ExponentBias, and the
           significand represented is the value given by the binary
           numeral “0.” followed by the significand bits.
        */
        case 0:
        {
            int Exponent = 1 - ExponentBias;
            double Significand = 0 + SignificandField / SignificandRatio;
            return Sign * ldexp(Significand, Exponent);
        }
    }
}
/* Test that a given set of fields decodes to the expected value and
   print the fields and the decoded value.
*/
static void Demonstrate(
    unsigned SignField, uint32_t ExponentField, uint32_t SignificandField,
    double Expected)
{
    double Observed
        = DecodeCustomFloat(SignField, ExponentField, SignificandField);

    if (! (Observed == Expected) && ! (isnan(Observed) && isnan(Expected)))
    {
        fprintf(stderr,
            "Error, expected (%u, %" PRIu32 ", %" PRIu32 ") to represent "
            "%g (hexadecimal %a) but got %g (hexadecimal %a).\n",
            SignField, ExponentField, SignificandField,
            Expected, Expected,
            Observed, Observed);
        exit(EXIT_FAILURE);
    }

    printf(
        "(%u, %" PRIu32 ", %" PRIu32 ") represents %g (hexadecimal %a).\n",
        SignField, ExponentField, SignificandField, Observed, Observed);
}
int main(void)
{
    Demonstrate(0,   0,        0, +0.);
    Demonstrate(1,   0,        0, -0.);
    Demonstrate(0, 255,        0, +1.);
    Demonstrate(1, 255,        0, -1.);
    Demonstrate(0, 511,        0, +INFINITY);
    Demonstrate(1, 511,        0, -INFINITY);
    Demonstrate(0, 511,        1, +NAN);
    Demonstrate(1, 511,        1, -NAN);
    Demonstrate(0,   0,        1, +0x1p-276);
    Demonstrate(1,   0,        1, -0x1p-276);
    Demonstrate(0, 255,        1, +1. + 0x1p-22);
    Demonstrate(1, 255,        1, -1. - 0x1p-22);
    Demonstrate(0,   1,        0, +0x1p-254);
    Demonstrate(1,   1,        0, -0x1p-254);
    Demonstrate(0, 510, 0x3fffff, +0x1p256 - 0x1p233);
    Demonstrate(1, 510, 0x3fffff, -0x1p256 + 0x1p233);
}
Some notes:
ldexp is a standard C library function. ldexp(x, e) returns x multiplied by 2 to the power of e.
uint32_t is an unsigned 32-bit integer type. It is defined in stdint.h.
"%" PRIu32 provides a printf conversion specification for formatting a uint32_t.
Here is a simple program to illustrate how to break a float into its components and how to compose a float value from a (sign, exponent, mantissa) triplet:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

void dumpbits(uint32_t bits, int n) {
    while (n--)
        printf("%d%c", (bits >> n) & 1, ".|"[!n]);
}

int main(int argc, char *argv[]) {
    unsigned sign = 0;
    unsigned exponent = 127;
    unsigned long mantissa = 0;
    union {
        float f32;
        uint32_t u32;
    } u;

    if (argc == 2) {
        u.f32 = strtof(argv[1], NULL);
        sign = u.u32 >> 31;
        exponent = (u.u32 >> 23) & 0xff;
        mantissa = (u.u32) & 0x7fffff;
        printf("%.8g -> sign:%u, exponent:%u, mantissa:0x%06lx\n",
               (double)u.f32, sign, exponent, mantissa);
        printf("+s+----exponent---+------------------mantissa-------------------+\n");
        printf("|");
        dumpbits(sign, 1);
        dumpbits(exponent, 8);
        dumpbits(mantissa, 23);
        printf("\n");
        printf("+-+---------------+---------------------------------------------+\n");
    } else {
        if (argc > 1) sign = strtol(argv[1], NULL, 0);
        if (argc > 2) exponent = strtol(argv[2], NULL, 0);
        if (argc > 3) mantissa = strtol(argv[3], NULL, 0);
        u.u32 = (sign << 31) | (exponent << 23) | mantissa;
        printf("sign:%u, exponent:%u, mantissa:0x%06lx -> %.8g\n",
               sign, exponent, mantissa, (double)u.f32);
    }
    return 0;
}
Note that contrary to your assignment, the size of the mantissa is 23 bits and the exponent has 8 bits, which correspond to the IEEE 754 Standard for 32-bit aka single-precision float. See the Wikipedia article on Single-precision floating-point format.
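If your assignment's 22/9/1 fields ever arrive packed in a single 32-bit word, the same shift-and-mask approach applies. A minimal sketch, assuming the packing order is sign, then exponent, then mantissa, from most to least significant bit (split_custom is a hypothetical helper, not part of the program above):
#include <inttypes.h>
#include <stdio.h>

// Hypothetical helper: split a word packed as [sign:1 | exponent:9 | mantissa:22].
static void split_custom(uint32_t u,
                         unsigned *sign, uint32_t *exponent, uint32_t *mantissa)
{
    *sign     = u >> 31;            // top bit
    *exponent = (u >> 22) & 0x1ff;  // next 9 bits
    *mantissa = u & 0x3fffff;       // low 22 bits
}

int main(void)
{
    unsigned s;
    uint32_t e, m;
    split_custom(0xbfc00000u, &s, &e, &m);  // arbitrary example word
    printf("sign=%u exponent=%" PRIu32 " mantissa=%" PRIu32 "\n",
           s, e, m);                        // prints sign=1 exponent=255 mantissa=0
}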
The linked question is C++, not C. To convert between datatypes in C while preserving the bits, a tool to use is the union. Something like:
#include <stdint.h>

union float_or_int {
    uint32_t i;
    float f;
};

float to_float(uint32_t mantissa, uint32_t exponent, uint32_t sign)
{
    union float_or_int result;
    result.i = (sign << 31) | (exponent << 22) | mantissa;
    return result.f;  // reinterprets the bits; only meaningful if this
                      // layout matches the machine's float format
}
Sorry for typos, it's been a while since I've coded in C 😙
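As a side note, memcpy is the other common type-punning idiom in C and sidesteps questions about union member access; here is the same routine sketched with memcpy (same caveat as above: this reinterprets the assembled bits as whatever the machine's float format is, normally binary32 with an 8-bit exponent rather than the 9-bit layout packed here):
#include <stdint.h>
#include <string.h>

float to_float_memcpy(uint32_t mantissa, uint32_t exponent, uint32_t sign)
{
    uint32_t bits = (sign << 31) | (exponent << 22) | mantissa;
    float f;
    memcpy(&f, &bits, sizeof f);  // copy the bit pattern into the float
    return f;
}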
I was wondering if you could help explain the process of converting an integer to a float, or a float to an integer. For my class, we are to do this using only bitwise operators, but I think a firm understanding of the casting from type to type will help me more at this stage.
From what I know so far, for int to float, you have to convert the integer into binary, normalize it by finding the significand, exponent, and fraction, and then output the value as a float from there?
As for float to int, you have to separate the value into the significand, exponent, and fraction, and then reverse the steps above to get an int value?
I tried to follow the instructions from this question: Casting float to int (bitwise) in C.
But I was not really able to understand it.
Also, could someone explain why rounding will be necessary for values greater than 23 bits when converting int to float?
First, a paper you should consider reading, if you want to understand floating point foibles better: "What Every Computer Scientist Should Know About Floating Point Arithmetic," http://www.validlab.com/goldberg/paper.pdf
And now to some meat.
The following code is bare bones, and attempts to produce an IEEE-754 single precision float from an unsigned int in the range 0 < value < 2^24. That's the format you're most likely to encounter on modern hardware, and it's the format you seem to reference in your original question.
IEEE-754 single-precision floats are divided into three fields: a single sign bit, 8 bits of exponent, and 23 bits of significand (sometimes called a mantissa). IEEE-754 uses a hidden 1 significand, meaning that the significand is actually 24 bits total. The bits are packed left to right, with the sign bit in bit 31, exponent in bits 30 .. 23, and the significand in bits 22 .. 0. (The Wikipedia article linked below includes a diagram of this layout.)
The exponent has a bias of 127, meaning that the actual exponent associated with the floating point number is 127 less than the value stored in the exponent field. An exponent of 0 therefore would be encoded as 127.
(Note: The full Wikipedia article may be interesting to you. Ref: http://en.wikipedia.org/wiki/Single_precision_floating-point_format )
Therefore, the IEEE-754 number 0x40000000 is interpreted as follows:
Bit 31 = 0: Positive value
Bits 30 .. 23 = 0x80: Exponent = 128 - 127 = 1 (aka. 2^1)
Bits 22 .. 0 are all 0: Significand = 1.00000000_00000000_0000000. (Note I restored the hidden 1).
So the value is 1.0 x 2^1 = 2.0.
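You can verify that decoding with a few lines of C (a minimal sketch; it assumes float is IEEE-754 binary32 on your platform):
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint32_t bits = 0x40000000;   // sign 0, exponent 0x80, significand 0
    float f;
    memcpy(&f, &bits, sizeof f);  // reinterpret the bit pattern as a float
    printf("%g\n", f);            // prints 2
}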
To convert an unsigned int in the limited range given above, then, to something in IEEE-754 format, you might use a function like the one below. It takes the following steps:
Aligns the leading 1 of the integer to the position of the hidden 1 in the floating point representation.
While aligning the integer, records the total number of shifts made.
Masks away the hidden 1.
Using the number of shifts made, computes the exponent and appends it to the number.
Using reinterpret_cast, converts the resulting bit-pattern to a float. This part is an ugly hack, because it uses a type-punned pointer. You could also do this by abusing a union. Some platforms provide an intrinsic operation (such as _itof) to make this reinterpretation less ugly.
There are much faster ways to do this; this one is meant to be pedagogically useful, if not super efficient:
float uint_to_float(unsigned int significand)
{
    // Only support 0 < significand < 1 << 24.
    if (significand == 0 || significand >= 1 << 24)
        return -1.0;  // or abort(); or whatever you'd like here.

    int shifts = 0;

    // Align the leading 1 of the significand to the hidden-1
    // position. Count the number of shifts required.
    while ((significand & (1 << 23)) == 0)
    {
        significand <<= 1;
        shifts++;
    }

    // The number 1.0 has an exponent of 0, and would need to be
    // shifted left 23 times. The number 2.0, however, has an
    // exponent of 1 and needs to be shifted left only 22 times.
    // Therefore, the exponent should be (23 - shifts). IEEE-754
    // format requires a bias of 127, though, so the exponent field
    // is given by the following expression:
    unsigned int exponent = 127 + 23 - shifts;

    // Now merge significand and exponent. Be sure to strip away
    // the hidden 1 in the significand.
    unsigned int merged = (exponent << 23) | (significand & 0x7FFFFF);

    // Reinterpret as a float and return. This is an evil hack.
    return *reinterpret_cast< float* >( &merged );
}
You can make this process more efficient using functions that detect the leading 1 in a number. (These sometimes go by names like clz for "count leading zeros", or norm for "normalize".)
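For example, with GCC or Clang you could replace the shift loop using the __builtin_clz intrinsic (a sketch under the same 0 < significand < 2^24 precondition; it returns the raw bits rather than type-punning to a float):
#include <stdint.h>

// Sketch: same computation as uint_to_float above, but the leading 1 is
// located with __builtin_clz (GCC/Clang) instead of a shift loop.
// Assumes a 32-bit unsigned int and 0 < significand < 1 << 24.
uint32_t uint_to_float_bits(unsigned int significand)
{
    int pos = 31 - __builtin_clz(significand);  // bit index of the leading 1
    uint32_t exponent = 127 + pos;              // equal to 127 + 23 - shifts
    uint32_t fraction =
        ((uint32_t) significand << (23 - pos)) & 0x7FFFFF;  // drop hidden 1
    return (exponent << 23) | fraction;
}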
You can also extend this to signed numbers by recording the sign, taking the absolute value of the integer, performing the steps above, and then putting the sign into bit 31 of the number.
For integers >= 2^24, the entire integer does not fit into the significand field of the 32-bit float format. This is why you need to "round": you lose LSBs in order to make the value fit. Thus, multiple integers will end up mapping to the same floating point pattern. The exact mapping depends on the rounding mode (round toward -Inf, round toward +Inf, round toward zero, round toward nearest even). But the fact of the matter is you can't shove more than 24 bits of value into a 24-bit field without some loss.
You can see this in terms of the code above. It works by aligning the leading 1 to the hidden 1 position. If a value was >= 2^24, the code would need to shift right, not left, and that necessarily shifts LSBs away. Rounding modes just tell you how to handle the bits shifted away.
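You can see the rounding happen with one line of arithmetic: 2^24 + 1 = 16777217 needs 25 bits, so it cannot survive conversion to float (a minimal demonstration):
#include <stdio.h>

int main(void)
{
    float f = 16777217;   // 2^24 + 1 needs 25 bits; a float keeps 24
    printf("%.1f\n", f);  // prints 16777216.0 -- the LSB was rounded away
}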
Have you checked the IEEE 754 floating-point representation?
In 32-bit normalized form, it has a sign bit, an 8-bit exponent (excess-127), and a 23-bit mantissa, with radix 2, not 10. For normalized numbers the leading "1." is implicit and not stored; the stored mantissa bits are the fraction after it. That is: the MSB of the stored mantissa has value 1/2, the next bit 1/4, and so on.
Joe Z's answer is elegant, but the range of input values is highly limited. A 32-bit float can store all integer values from the following range:
[-2^24 ... +2^24] = [-16777216 ... +16777216]
and some other values outside this range.
The whole range would be covered by this:
#include <limits.h>  // for INT_MIN

float int2float(int value)
{
    // handles all values from [-2^24...2^24]
    // outside this range only some integers may be represented exactly
    // this method will use truncation 'rounding mode' during conversion

    // we can safely reinterpret it as 0.0
    if (value == 0) return 0.0;

    if (value == (1U << 31))  // i.e. -2^31
    {
        // -(-2^31) = -2^31, so we'll not be able to handle it below - use const
        // value = 0xCF000000;
        return (float)INT_MIN;  // *((float*)&value); is undefined behaviour
    }

    int sign = 0;

    // handle negative values
    if (value < 0)
    {
        sign = 1U << 31;
        value = -value;
    }

    // although right shift of signed is implementation-defined, all compilers
    // (that I know) do an arithmetic shift (copying the sign into the MSB),
    // hence the unsigned abs_value_copy used for shifting
    unsigned int abs_value_copy = value;

    // find leading one
    int bit_num = 31;
    int shift_count = 0;

    for (; bit_num > 0; bit_num--)
    {
        if (abs_value_copy & (1U << bit_num))
        {
            if (bit_num >= 23)
            {
                // need to shift right
                shift_count = bit_num - 23;
                abs_value_copy >>= shift_count;
            }
            else
            {
                // need to shift left
                shift_count = 23 - bit_num;
                abs_value_copy <<= shift_count;
            }
            break;
        }
    }

    // exponent is biased by 127
    int exp = bit_num + 127;

    // clear leading 1 (bit #23) (it will implicitly be there but not stored)
    int coeff = abs_value_copy & ~(1 << 23);

    // move exp to the right place
    exp <<= 23;

    union
    {
        int rint;
        float rfloat;
    } ret = { sign | exp | coeff };

    return ret.rfloat;
}
Of course there are other means to find the absolute value of an int (branchless). Similarly, counting leading zeros can also be done without a branch, so treat this example as just that: an example ;-).
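A quick way to exercise int2float is to compare it against the compiler's own conversion (a sketch that assumes the function above is in scope; note that values needing more than 24 bits may legitimately differ, because int2float truncates while a cast rounds to nearest):
#include <limits.h>
#include <stdio.h>

int main(void)
{
    int tests[] = { 0, 1, -1, 75, 16777216, -16777216, INT_MAX, INT_MIN };
    for (int i = 0; i < (int) (sizeof tests / sizeof tests[0]); i++)
    {
        float expected = (float) tests[i];     // compiler's conversion
        float observed = int2float(tests[i]);  // the function above
        printf("%11d -> %15.6e %s\n", tests[i], (double) observed,
               observed == expected ? "ok" : "differs (rounding mode)");
    }
    return 0;
}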
I'm sorry if I can't explain this correctly; my English is not very good.
Well, the question is: I have a double var, and I cast it to float, because I need to send exactly 4 bytes, not 8. This doesn't work for me, so I decided to calculate the value directly from the IEEE 754 standard.
I have this code:
union DoubleNumberIEEE754 {
    struct {
        uint64_t mantissa : 52;
        uint64_t exponent : 11;
        uint64_t sign     : 1;
    } raw;
    double d;
    char c[8];
} dnumber;
floatval = (pow((-1), dnumber.raw.sign) * (1 + dnumber.raw.mantissa) * pow(2, (dnumber.raw.exponent - 1023)));
With this code, I can't obtain the correct value.
I am looking at the Linux headers to see the correct order of the components, but I don't know if this code is correct.
I am skeptical that the double-to-float conversion is broken, but, assuming it is:
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Create a mask of n low bits, for n from 0 to 63.
#define Mask(n) (((uint64_t) 1 << (n)) - 1)
/* This routine converts double values to float values:

   float and double must be IEEE-754 binary32 and binary64, respectively.

   The payloads of NaNs are not preserved, and only a quiet NaN is
   returned.

   The double is rounded to the nearest value representable in float, with
   ties rounded to the float with the even low bit in the significand.

   We assume a standard C conversion from double to float is broken for
   unknown reasons but that a conversion from a representable uint32_t to a
   float works.
*/
static float ConvertDoubleToFloat(double x)
{
    // Copy the double into a uint64_t so we can access its representation.
    uint64_t u;
    memcpy(&u, &x, sizeof u);

    // Extract the fields from the representation of a double.
    int SignCode = u >> 63;
    int ExponentCode = u >> 52 & Mask(11);
    uint64_t SignificandCode = u & Mask(52);

    /* Convert the fields to their represented values.

       The sign code merely encodes - or +.

       The exponent code is biased by 1023 from the actual exponent.

       The significand code represents the portion of the significand
       after the radix point. However, since there is some problem
       converting double to float, we will maintain it with an integer
       type, scaled by 2**52 from its represented value.

       The exponent code also represents the portion of the significand
       before the radix point -- 1 if the exponent is non-zero, 0 if the
       exponent is zero. We include that in the significand, scaled by
       2**52.
    */
    float Sign = SignCode ? -1 : +1;
    int Exponent = ExponentCode - 1023;
    uint64_t ScaledSignificand =
        (ExponentCode ? ((uint64_t) 1 << 52) : 0) + SignificandCode;

    // Handle NaNs and infinities.
    if (ExponentCode == Mask(11))
        return Sign * (SignificandCode == 0 ? INFINITY : NAN);

    /* Round the significand:

       If Exponent < -150, all bits of the significand are below 1/2 ULP
       of the least positive float, so they round to zero.

       If -150 <= Exponent < -126, only bits of the significand
       corresponding to exponent -149 remain in the significand, so we
       shift accordingly and round the residue.

       Otherwise, the top 24 bits of the significand remain in the
       significand (except when there is overflow to infinity), so we
       shift accordingly and round the residue.

       Note that the scaling in the new significand is 2**23 instead of
       2**52, since we are shifting it for the float format.
    */
    uint32_t NewScaledSignificand;
    if (Exponent < -150)
        NewScaledSignificand = 0;
    else
    {
        unsigned Shift = 53 - (Exponent < -126 ? Exponent - -150 : 24);
        NewScaledSignificand = ScaledSignificand >> Shift;

        // Clamp the exponent for subnormals.
        if (Exponent < -126)
            Exponent = -126;

        // Examine the residue being lost and round accordingly.
        uint64_t Residue =
            ScaledSignificand - ((uint64_t) NewScaledSignificand << Shift);
        uint64_t Half = (uint64_t) 1 << (Shift - 1);

        // If the residue is greater than 1/2 ULP, round up (in magnitude).
        if (Half < Residue)
            NewScaledSignificand += 1;

        /* If the residue is 1/2 ULP, round 0.1 to 0 and 1.1 to 10.0 (these
           numerals are binary with "." marking the ULP position).
        */
        else if (Half == Residue)
            NewScaledSignificand += NewScaledSignificand & 1;

        /* Otherwise, the residue is less than 1/2, and we have already
           rounded down, in the shift.
        */
    }

    // Combine the components, including removing the significand scaling.
    return Sign * ldexpf(NewScaledSignificand, Exponent - 23);
}
static void TestOneSign(double x)
{
    float Expected = x;
    float Observed = ConvertDoubleToFloat(x);

    if (Observed != Expected && !(isnan(Observed) && isnan(Expected)))
    {
        printf("Error, %a -> %a, but expected %a.\n",
               x, Observed, Expected);
        exit(EXIT_FAILURE);
    }
}

static void Test(double x)
{
    TestOneSign(+x);
    TestOneSign(-x);
}

int main(void)
{
    for (int e = -1024; e < 1024; ++e)
    {
        Test(ldexp(0x1.0p0, e));
        Test(ldexp(0x1.4p0, e));
        Test(ldexp(0x1.8p0, e));
        Test(ldexp(0x1.cp0, e));
        Test(ldexp(0x1.5555540p0, e));
        Test(ldexp(0x1.5555548p0, e));
        Test(ldexp(0x1.5555550p0, e));
        Test(ldexp(0x1.5555558p0, e));
        Test(ldexp(0x1.5555560p0, e));
        Test(ldexp(0x1.5555568p0, e));
        Test(ldexp(0x1.5555570p0, e));
        Test(ldexp(0x1.5555578p0, e));
    }

    Test(3.14);
    Test(0);
    Test(INFINITY);
    Test(NAN);
    Test(1/3.);
    Test(0x1p128);
    Test(0x1p128 - 0x1p104);
    Test(0x1p128 - 0x.9p104);
    Test(0x1p128 - 0x.8p104);
    Test(0x1p128 - 0x.7p104);
}
Problem
I need to multiply a number without using the * or + operators or other libraries, only binary logic.
To multiply a number by two using the IEEE norm, you add one to the exponent, for example:
12 = 0 10000010 100000(...)
So the exponent is: 10000010 (130)
If I want to multiply it by 2, I just add 1 to it and it becomes 10000011 (131).
Question
If I get a float, how do I turn it into binary, then into the IEEE norm? Example:
8.0 = 1000.0 in binary. In IEEE form I need it to have only one digit on the left side, so 1.000 * 2^3. Then how do I add one to the exponent so I multiply it by 2?
I need to get a float, i.e. 6.5
Turn it to binary 110.1
Then to IEEE 754 0 10000001 101000(...)
Extract the exponent 10000001
Add one to it 10000010
Return it to IEEE 754 0 10000010 101000(...)
Then back to float 13
Given that the C implementation is known to use IEEE-754 basic 32-bit binary floating-point for its float type, the following code shows how to take apart the bits that represent a float, adjust the exponent, and reassemble the bits. Only simple multiplications involving normal numbers are handled.
#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
int main(void)
{
    float f = 6.125;

    // Copy the bits that represent the float f into a 32-bit integer.
    uint32_t u;
    assert(sizeof f == sizeof u);
    memcpy(&u, &f, sizeof u);

    // Extract the sign, exponent, and significand fields.
    uint32_t sign = u >> 31;
    uint32_t exponent = (u >> 23) & 0xff;
    uint32_t significand = u & 0x7fffff;

    // Assert the exponent field is in the normal range and will remain so.
    assert(0 < exponent && exponent < 254);

    // Increment the exponent.
    ++exponent;

    // Reassemble the bits and copy them back into f.
    u = sign << 31 | exponent << 23 | significand;
    memcpy(&f, &u, sizeof f);

    // Display the result.
    printf("%g\n", f);
}
Maybe not exactly what you are looking for, but C has a library function ldexp which does exactly what you need:
double x = 6.5;
x = ldexp(x, 1); // now x is 13
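The inverse operation, frexp, is also standard C: it splits a double into a fraction in [0.5, 1) and a power of two, so the whole add-one-to-the-exponent trick can be written without touching any bits (a minimal sketch):
#include <math.h>
#include <stdio.h>

int main(void)
{
    int e;
    double m = frexp(6.5, &e);        // 6.5 = 0.8125 * 2^3, so m = 0.8125, e = 3
    printf("%g\n", ldexp(m, e + 1));  // rebuild with e+1: prints 13
}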
Maybe unions are the tool you need.
#include <iostream>

union fb {
    float f;
    struct b_s {
        // Note: bit-field allocation order is implementation-defined; this
        // layout (fraction in the low bits) matches little-endian x86.
        // The fraction field of an IEEE-754 single is 23 bits, not 22.
        unsigned int mant : 23;
        unsigned int exp  : 8;
        unsigned int sign : 1;
    } b;
};

fb num;

int main() {
    num.f = 3.1415;
    num.b.exp++;  // adding 1 to the exponent doubles the value
    std::cout << num.f << std::endl;  // prints 6.283
    return 0;
}
Assuming a low end microprocessor with no floating point arithmetic, I need to generate an IEEE 754 single precision floating point format number to push out to a file.
I need to write a function that takes three integers being the sign, whole and the fraction and returns a byte array with 4 bytes being the IEEE 754 single precision representation.
Something like:
// Convert 75.65 to 4 byte IEEE 754 single precision representation
char* bytes = convert(0, 75, 65);
Does anybody have any pointers or example C code please? I'm particularly struggling to understand how to convert the mantissa.
You will need to generate the sign (1 bit), the exponent (8 bits, a biased power of 2), and the fraction/mantissa (23 bits).
Bear in mind that the fraction has an implicit leading '1' bit, which means that the most significant leading '1' bit (2^23) is not stored in the IEEE format. For example, given a 24-bit fraction of 0xF55555, the actual bits stored would be 0x755555 (23 bits).
Also bear in mind that the fraction is shifted so that the binary point is immediately to the right of the implicit leading '1' bit. So an IEEE 23-bit fraction of 111 0101 0101... represents the 24-bit binary value 1.111 0101 0101...
This means that the exponent has to be adjusted accordingly.
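In code, stripping the implicit bit is a single mask; a small sketch of the 0xF55555 example above:
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    uint32_t fraction = 0xF55555;           // 24 bits, implicit 1 at bit 23
    uint32_t stored = fraction & 0x7FFFFF;  // mask away the implicit 1
    printf("0x%06" PRIX32 "\n", stored);    // prints 0x755555
}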
Does the value have to be written big endian or little endian? Reversed bit ordering?
If you are free to choose the format, consider writing the value as a string. That way you can easily convert the integer: just write the int part and write "e0" as the exponent (or omit the exponent and write ".0").
For the binary representation, you should have a look at Wikipedia. Best is to first assemble the bit fields into a uint32_t - the structure is given in the linked article. Note that you might have to round if the integer has more than 23 value bits. Remember to normalize the generated value.
The second step will be to serialize the uint32_t to a uint8_t array. Mind the endianness of the result!
Also note to use uint8_t for the result if you really want 8-bit values; you should use an unsigned type. For the intermediate representation, using uint32_t is recommended as that will guarantee you operate on 32-bit values.
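The serialization step might look like this (a sketch; big-endian byte order assumed, so reverse the indices for little-endian output):
#include <stdint.h>

// Write a 32-bit word to a 4-byte buffer, most significant byte first.
void pack_be32(uint32_t word, uint8_t out[4])
{
    out[0] = (uint8_t) (word >> 24);
    out[1] = (uint8_t) (word >> 16);
    out[2] = (uint8_t) (word >> 8);
    out[3] = (uint8_t) word;
}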
You haven't had a go yet, so no giveaways.
Remember you can regard two 32-bit integers a and b, interpreted as a number a.b, as a single 64-bit integer with an exponent of 2^-32 (where ^ denotes exponentiation).
So without doing anything you've got it in the form:
s * m * 2^e
The only problem is that your mantissa is too long and your number isn't normalized.
A bit of shifting and adding/subtracting with a possible rounding step and you're done.
You can use a software floating point compiler/library.
See https://gcc.gnu.org/onlinedocs/gccint/Soft-float-library-routines.html
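For example, that page documents __floatsisf, the libgcc routine that converts a signed int to a float in software; on a soft-float target the compiler emits a call to it for you whenever you write (float)i. A sketch of calling it directly, which is rarely necessary:
// Sketch: libgcc's software int-to-float conversion (see the link above).
// On soft-float targets, (float)i compiles to a call like this anyway.
float __floatsisf(int i);  // provided by the runtime library

float int_to_float(int i)
{
    return __floatsisf(i);
}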
The basic premise is to:
Given a binary32 float.
Form a binary fixed-point representation of the combined whole and fractional (hundredths) parts. This code uses a structure encoding the whole and hundredths fields separately. It is important that the whole field is at least 32 bits.
Shift left/right (*2 and /2) until the MSbit is in the implied bit position, counting the shifts along the way. A robust solution also notes any non-zero bits shifted out.
Form a biased exponent.
Round the mantissa and drop the implied bit.
Form the sign (not done here).
Combine the 3 parts above (biased exponent, mantissa, sign) to form the answer.
As sub-normals, infinities, and Not-a-Number cannot result from whole, hundredths input, generating those float special cases is not addressed here.
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define IMPLIED_BIT 0x00800000L

typedef struct {
    int_least32_t whole;
    int hundredth;
} x_xx;

int_least32_t convert(int whole, int hundredth) {
    assert(whole >= 0 && hundredth >= 0 && hundredth < 100);
    if (whole == 0 && hundredth == 0) return 0;

    x_xx x = { whole, hundredth };
    int_least32_t expo = 0;
    int sticky_bit = 0;  // Note any 1 bits shifted out.

    while (x.whole >= IMPLIED_BIT * 2) {
        expo++;
        sticky_bit |= x.hundredth % 2;
        x.hundredth /= 2;
        x.hundredth += (x.whole % 2) * (100 / 2);
        x.whole /= 2;
    }
    while (x.whole < IMPLIED_BIT) {
        expo--;
        x.hundredth *= 2;
        x.whole *= 2;
        x.whole += x.hundredth / 100;
        x.hundredth %= 100;
    }

    int32_t mantissa = x.whole;

    // Round to nearest - ties to even.
    if (x.hundredth >= 100 / 2
            && (x.hundredth > 100 / 2 || x.whole % 2 || sticky_bit)) {
        mantissa++;
    }
    if (mantissa >= (IMPLIED_BIT * 2)) {
        mantissa /= 2;
        expo++;
    }
    mantissa &= ~IMPLIED_BIT;  // Toss MSbit as it is implied in the final form.
    expo += 24 + 126;          // Bias: 24 bits + binary32 bias.
    expo <<= 23;               // Offset.
    return expo | mantissa;
}
void test_convert(int whole, int hundredths) {
    union {
        uint32_t u32;
        float f;
    } u;
    u.u32 = convert(whole, hundredths);
    volatile float best = whole + hundredths / 100.0;

    printf("%10d.%02d --> %15.6e %15.6e Same:%d\n", whole, hundredths, u.f, best,
           best == u.f);
}

#include <limits.h>
int main(void) {
    test_convert(75, 65);
    test_convert(0, 1);
    test_convert(INT_MAX, 99);
    return 0;
}
Output
75.65 --> 7.565000e+01 7.565000e+01 Same:1
0.01 --> 1.000000e-02 1.000000e-02 Same:1
2147483647.99 --> 2.147484e+09 2.147484e+09 Same:1
Known issues: sign not applied.