Multiply float by a number using bitwise operators - c

I have this function that takes in the bits of a float (f) as a uint32_t. It should use bit operations and + to calculate f * 2048 and should return the bits of this value as a uint32_t.
If the result is too large to be represented as a float, +inf or -inf should be returned; and if f is +0, -0, +inf, -inf, or NaN, it should be returned unchanged.
// Asker's first attempt: shifts the raw bit pattern of f left by 1 and then by 10
// (11 bits in total) and returns the shifted integer.
// NOTE(review): this shifts the sign/exponent/fraction bits as one plain integer
// rather than adding 11 to the exponent field; for the bits of 1.0f every set bit
// is shifted out, which matches the reported result of 0.
uint32_t float_2048(uint32_t f) {
uint32_t a = (f << 1) ;
int result = a << 10;
return result;
}
This is what I have so far but if I give it the value '1' it returns 0 instead of 2048. How do I fix this?
Some example inputs and outputs:
./float_2048 1
2048
./float_2048 3.14159265
6433.98193
./float_2048 -2.718281828e-20
-5.56704133e-17
./float_2048 1e38
inf

As mentioned in the comments, to multiply a floating-point number by a power of 2 (assuming, as is likely, that it is represented in IEEE-754 format), we can just add that power to the (binary) exponent part of the representation.
For a single-precision (32-bit) float value, that exponent is stored in bits 30-23 and the following code shows how to extract those, add the required value (11, because 2048 = 2^11), then replace the exponent bits with that modified value.
/*
 * Multiply an IEEE-754 binary32 value, given as its raw bits, by 2048 (2^11)
 * using only integer operations, and return the raw bits of the product.
 *
 *  - +/-0, +/-Inf and NaN are returned unchanged.
 *  - Subnormal inputs are scaled exactly (the result may become normal).
 *  - Results too large for a float become +/-Inf with the input's sign.
 */
uint32_t fmul2048(uint32_t f)
{
#define EXPONENT 0x7F800000u
#define SIGN_BIT 0x80000000u
    const uint32_t fraction_mask = ~(EXPONENT | SIGN_BIT); /* low 23 bits */
    const uint32_t sign = f & SIGN_BIT;
    uint32_t expon = (f & EXPONENT) >> 23;  /* biased exponent, 0..255 */
    uint32_t frac = f & fraction_mask;

    if (expon == 255) {
        return f;                           /* Inf or NaN: unchanged */
    }

    if (expon == 0) {
        if (frac == 0) {
            return f;                       /* +0 or -0: unchanged */
        }
        /* Subnormal: double 11 times. While still subnormal the fraction is
         * shifted; once the implicit bit (bit 23) appears the value is normal
         * with biased exponent 1, and further doublings bump the exponent. */
        for (int i = 0; i < 11; i++) {
            if (expon != 0) {
                expon++;
            } else {
                frac <<= 1;
                if (frac > fraction_mask) {
                    expon = 1;
                    frac &= fraction_mask;  /* drop the now-implicit 1 */
                }
            }
        }
        return sign | (expon << 23) | frac;
    }

    expon += 11;                            /* adding 11 multiplies by 2^11 */
    if (expon > 254) {
        return EXPONENT | sign;             /* too big: +/- Inf */
    }
    return sign | (expon << 23) | frac;
}
There will, no-doubt, be some "bit trickery" that can be applied to make the code smaller and/or more efficient; but I have avoided doing so in order to keep the code clear. I have also included one error check (for a too large exponent) and the code returns the standard representation for +/- Infinity (all exponent bits set to 1, and keeping the original sign) if that test fails. (I leave other error-checking as an "exercise for the reader".)

To handle all float takes more code.
Do some tests so code can assume the expected float size, matching endian and (IEEE) encoding. C does not require float as 32-bit, matching endian to an integer, not binary32 encoding, even though that is common.
Extract the biased exponent and look for its min and max value.
Max values signify NAN or infinity.
Min values are sub-normals and zero and need special handling. The significand needs to be shifted. If that result is now a normal float, re-encode it.
Biased exponents in between simply need an increment and a test for exceeding FLT_MAX's exponent.
Tested successfully for all float.
#include <assert.h>
#include <stdint.h>
static_assert(sizeof(uint32_t) == sizeof(float), "Unexpected float size");
/* Field masks for the IEEE-754 binary32 layout. */
#define IEEE_MASK_BIASED_EXPO 0x7F800000u
#define IEEE_MASK_BIASED_EXPO_LSB 0x00800000u
#define IEEE_MASK_SIGNIFICAND 0x007FFFFFu
#define IEEE_SIGNIFICAND_MAX 0x00FFFFFFu
#define IEEE_INFINITY 0x7F800000u

/*
 * Scale a binary32 value (passed and returned as raw bits) by 2048.
 * Infinities and NaNs come back unchanged; zeros and subnormals have their
 * significand shifted (and are re-encoded as normals when they grow enough);
 * normals get 11 added to the exponent field, saturating to signed infinity.
 */
uint32_t float_2048(uint32_t f) {
    const uint32_t sign_bits = f & ~(IEEE_MASK_BIASED_EXPO | IEEE_MASK_SIGNIFICAND);
    const uint32_t biased = f & IEEE_MASK_BIASED_EXPO;

    /* Infinity or NaN: nothing to do. */
    if (biased == IEEE_MASK_BIASED_EXPO) {
        return f;
    }

    /* Normal number: bump the exponent field in place. */
    if (biased != 0) {
        const uint32_t scaled = biased + 11 * IEEE_MASK_BIASED_EXPO_LSB;
        if (scaled >= IEEE_MASK_BIASED_EXPO) {
            return sign_bits | IEEE_INFINITY;
        }
        return sign_bits | scaled | (f & IEEE_MASK_SIGNIFICAND);
    }

    /* Zero or subnormal: scale the significand in a wider type. */
    uint64_t frac = (uint64_t)(f & IEEE_MASK_SIGNIFICAND) << 11;
    uint32_t new_expo = 0;
    if (frac > IEEE_MASK_SIGNIFICAND) {
        /* The value became normal: renormalise until it fits in 24 bits. */
        new_expo = IEEE_MASK_BIASED_EXPO_LSB;
        for (; frac > IEEE_SIGNIFICAND_MAX; frac >>= 1) {
            new_expo += IEEE_MASK_BIASED_EXPO_LSB;
        }
    }
    return sign_bits | new_expo | (uint32_t)(frac & IEEE_MASK_SIGNIFICAND);
}
Test code.
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Overlays a float with a 32-bit unsigned integer so the same bytes can be
// read either as a value (f) or as raw bits (u32).
typedef union {
uint32_t u32;
float f;
} fu32;
// Test driver for float_2048(): first spot-checks a handful of boundary values,
// then walks every 32-bit pattern, comparing float_2048() bit-for-bit against
// a floating-point multiply by 2048.0f. Prints any mismatch, then "Done".
int main(void ) {
// Lightweight test to see if endian matches and IEEE encoding
assert((fu32) {.u32 = 0x87654321}.f == -1.72477726182e-34f);
float f[] = {0, FLT_TRUE_MIN, FLT_MIN, 1, FLT_MAX};
size_t n = sizeof f/sizeof f[0];
for (size_t i = 0; i<n; i++) {
fu32 x = { .f = f[i] };
float y0 = x.f * 2048.0f;
fu32 y1 = { .u32 = float_2048(x.u32) };
if (memcmp(&y0, &y1.f, sizeof y0)) {
printf("%.9g %.9g\n", y0, y1.f);
}
}
// Exhaustive sweep: x.u32 counts up from 0 until it wraps back to 0.
fu32 x = { .u32 = 0 };
do {
// For NaN inputs the reference is the input itself (expected unchanged).
fu32 y0 = { .f = isnan(x.f) ? x.f : x.f * 2048.0f };
fu32 y1 = { .u32 = float_2048(x.u32) };
if (memcmp(&y0.f, &y1.f, sizeof y0)) {
printf("%.9g %.9g\n", y0.f, y1.f);
printf("%08lx %08lx %08lx\n", (unsigned long) x.u32,
(unsigned long) y0.u32, (unsigned long) y1.u32);
// Stop at the first mismatch.
break;
}
x.u32++;
} while (x.u32 != 0);
puts("Done");
}

Related

How to correctly implement multiply for floating point numbers (software FP)

My program is about a method which is given floats and in this method I want to multiply or add those floats. But not multiply like a * b, I want to break those floats down to their structure like the bit for the sign, the 8 bit for the exponent and the rest of the bits as the mantissa.
I want to implement / emulate software floating-point add and multiply (to learn more about what FP hardware has to do).
In the head of the program there are the breakdowns:
/*
 * Field accessors for a 32-bit word holding an IEEE-754 binary32 value.
 * Arguments are parenthesized and the expansions carry no trailing ';',
 * so the macros can be used inside larger expressions.
 */
#define SIGN(x) ((x) >> 31)
#define MANT(x) ((x) & 0x7FFFFF)
#define EXPO(x) (((x) >> 23) & 0xFF)
/*
 * SPLIT: unpack x into sign s, mantissa m and biased exponent e.
 * When the exponent is neither 0x00 nor 0xFF the implicit leading 1
 * (bit 23) is made explicit in m.
 */
#define SPLIT(x, s, m, e) do { \
    (s) = SIGN(x); \
    (m) = MANT(x); \
    (e) = EXPO(x); \
    if ((e) != 0x00 && (e) != 0xFF) { \
        (m) |= 0x800000; \
    } \
} while ( 0 )
/*
 * BUILD: pack sign s, biased exponent e and the low 23 bits of m into x.
 */
#define BUILD(x, s, m, e) do { \
    (x) = ((s) << 31) | ((e) << 23) | ((m) & 0x7FFFFF); \
} while ( 0 )
The main looks as follows:
float f = 2.3;
float g = 1.8;
float h = foo(&f, &g);
And the method for the calculation looks like:
float foo(float *a, float *b) {
uint32_t ia = *(unsigned int *)a;
uint32_t ib = *(unsigned int *)b;
uint32_t result = 0;
uint32_t signa, signb, signr;
uint32_t manta, mantb, mantr;
uint32_t expoa, expob, expor;
SPLIT(ia, signa, manta, expoa);
SPLIT(ib, signb, mantb, expob);
I already tried the multiply by adding the exponents and multiply their mantissas as follow:
expor = (expoa -127) + (expob -127) + 127;
mantr = (manta) * (mantb);
signr = signa ^ signb;
The return and rebuild of the new float:
BUILD(result, signr, mantr, expor);
return *(float *)&result;
The problem is now, that the result is wrong. the mantr even takes a very low negative Number (in case if foo gets 1.5 and 2.4 mantr takes -838860800 and the result is 2.0000000).
You can't just truncate the result of the mantissa multiply; you need to take the top 24 bits (after using the low half for rounding) and renormalize (adjust the exponent).
Floating point operations keep the top significand bits. The most significant part of the integer product is the high bits; the low bits are further places after the decimal. (Terminology: it's a "binary point", not "decimal point", because binary floats use radix 2 (binary), not 10 (decimal).)
For normalized inputs, the implicit leading 1 in the input significands means the 32x32 => 64-bit uint64_t product that you use to implement 24 x 24 => 48-bit mantissa multiplication will have its high bit in one of 2 possible locations, so you don't need a bit-scan to find it. A compare or single-bit-test will do.
For subnormal inputs, that's not guaranteed so you need to check where the MSB is, e.g. with GNU C __builtin_clzll. (There are many special cases to handle for one or both inputs being subnormal, and/or the output being subnormal.)
See https://en.wikipedia.org/wiki/Single-precision_floating-point_format for more about the IEEE-754 binary32 format, including the implied leading 1 of the significand.
And see @njuffa's answer for an actual tested + working implementation that does 64-bit operations as two 32-bit halves for some reason, instead of letting C do that efficiently.
Also, return *(float *)&result; violates strict aliasing. It's only safe on MSVC. Use a union or memcpy for type punning in C99 / C11.
Emulating the multiplication of two IEEE-754 (2008) binary32 operands is a bit more complex than the question suggests. In general, we have to distinguish the following operand classes: zeros, subnormals ($0 < |x| < 2^{-126}$), normals ($2^{-126} \le |x| < 2^{128}$), infinities, NaNs. Normals use biased exponents in [1, 254], while any of the special operand classes use biased exponents in {0, 255}. The following assumes we want to implement floating-point multiply with all floating-point exceptions masked, and using the round-to-nearest-or-even rounding mode.
First, we check whether any of the arguments belongs to a special operand class. If so, we check the special cases in sequence. If one of the arguments is a NaN, we turn that NaN into a QNaN and return it. If one of the operands is zero, we return an appropriately signed zero, unless the other argument is an infinity, in which case we return a special QNaN INDEFINITE since this is an invalid operation. After that we check for any argument of infinity, returning an appropriately signed infinity. This leaves subnormals, which we normalize. In case there are two subnormal arguments, we only need to normalize one of them as the result will underflow to zero.
The multiplication of normals proceeds as the asker envisioned in the question. The sign of the result is the exclusive-OR of the signs of the arguments, the exponent of the result is the sum of the exponents of the arguments (adjusted for exponent bias), and the significand of the result is generated from the product of the significands of the arguments. We need the full product for rounding. We can either use a 64-bit type for that, or represent it with a pair of 32-bit numbers. In the code below I have chosen the latter representation. Rounding to nearest-or-even is straightforward: if we have a tie case (the result is exactly in the middle between the two closest binary32 numbers), we need to round up if the least significant bit of the mantissa is 1. Otherwise, we need to round up if the most significant discarded bit (the round bit) is 1.
Three cases need to be considered for the result, based on the result exponent prior to rounding: Exponent is in normal range, result overflows (too large in magnitude), or it underflows (too small in magnitude). In the first case, the result is a normal or infinity if overflow occurs during rounding. In the second case, the result is infinity. In the last case the result is either zero (severe underflow), a subnormal, or the smallest normal (if round-up occurs).
The following code, with a simple framework for light testing via gobs of random test cases and several thousand interesting patterns shows an exemplary ISO-C implementation written in a couple of hours for reasonable clarity and reasonable performance. I let the test framework run for an hour or so on an x64 platform and no errors were reported. If you plan to use the code in production, you would want to construct a more stringent test framework, and may need additional performance tuning.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
/* Constants describing the binary32 layout, as used by fp32_mul_core(). */
#define FLOAT_MANT_BITS (23)
#define FLOAT_EXPO_BITS (8)
#define FLOAT_EXPO_BIAS (127)
#define FLOAT_MANT_MASK (~((~0u) << (FLOAT_MANT_BITS+1))) /* incl. integer bit */
#define EXPO_ADJUST (1) /* adjustment for performance reasons */
#define MIN_NORM_EXPO (1) /* minimum biased exponent of normals */
#define MAX_NORM_EXPO (254) /* maximum biased exponent of normals */
#define INF_EXPO (255) /* biased exponent of infinities */
#define EXPO_MASK (~((~0u) << FLOAT_EXPO_BITS))
#define FLOAT_SIGN_MASK (0x80000000u)
#define FLOAT_IMPLICIT_BIT (1 << FLOAT_MANT_BITS)
/* The round bit is the top bit of the low 32-bit half of the product. */
#define RND_BIT_SHIFT (31)
#define RND_BIT_MASK (1u << RND_BIT_SHIFT)
#define FLOAT_INFINITY (0x7f800000)
#define FLOAT_INDEFINITE (0xffc00000u)
#define MANT_LSB (0x00000001)
#define FLOAT_QNAN_BIT (0x00400000)
/* Upper bound applied to the denormalisation shift in the underflow path. */
#define MAX_SHIFT (FLOAT_MANT_BITS + 2)
/*
 * Multiply two binary32 values given as raw bit patterns and return the raw
 * bits of the product, rounding to nearest-or-even.
 *
 * Special operands are dispatched first (NaN, zero, infinity, subnormal);
 * everything else goes through a 64-bit significand product that is split
 * into a high and a low 32-bit half for normalisation and rounding.
 * Exponents are kept with EXPO_ADJUST subtracted, so a single unsigned
 * comparison detects biased exponents of 0 (wraps around) and 255.
 */
uint32_t fp32_mul_core (uint32_t a, uint32_t b)
{
uint64_t prod;
uint32_t expoa, expob, manta, mantb, shift;
uint32_t r, signr, expor, mantr_hi, mantr_lo;
/* split arguments into sign, exponent, significand */
expoa = ((a >> FLOAT_MANT_BITS) & EXPO_MASK) - EXPO_ADJUST;
expob = ((b >> FLOAT_MANT_BITS) & EXPO_MASK) - EXPO_ADJUST;
manta = (a | FLOAT_IMPLICIT_BIT) & FLOAT_MANT_MASK;
mantb = (b | FLOAT_IMPLICIT_BIT) & FLOAT_MANT_MASK;
/* result sign bit: XOR sign argument signs */
signr = (a ^ b) & FLOAT_SIGN_MASK;
if ((expoa >= (MAX_NORM_EXPO - EXPO_ADJUST)) || /* at least one argument is special */
(expob >= (MAX_NORM_EXPO - EXPO_ADJUST))) {
if ((a & ~FLOAT_SIGN_MASK) > FLOAT_INFINITY) { /* a is NaN */
/* return quietened NaN */
return a | FLOAT_QNAN_BIT;
}
if ((b & ~FLOAT_SIGN_MASK) > FLOAT_INFINITY) { /* b is NaN */
/* return quietened NaN */
return b | FLOAT_QNAN_BIT;
}
if ((a & ~FLOAT_SIGN_MASK) == 0) { /* a is zero */
/* return NaN if b is infinity, else zero */
return (expob != (INF_EXPO - EXPO_ADJUST)) ? signr : FLOAT_INDEFINITE;
}
if ((b & ~FLOAT_SIGN_MASK) == 0) { /* b is zero */
/* return NaN if a is infinity, else zero */
return (expoa != (INF_EXPO - EXPO_ADJUST)) ? signr : FLOAT_INDEFINITE;
}
if (((a & ~FLOAT_SIGN_MASK) == FLOAT_INFINITY) || /* a or b infinity */
((b & ~FLOAT_SIGN_MASK) == FLOAT_INFINITY)) {
return signr | FLOAT_INFINITY;
}
/* only one operand is normalised here: when a is subnormal, b is left as is */
if ((int32_t)expoa < (MIN_NORM_EXPO - EXPO_ADJUST)) { /* a is subnormal */
/* normalize significand of a */
manta = a & FLOAT_MANT_MASK;
expoa++;
do {
manta = 2 * manta;
expoa--;
} while (manta < FLOAT_IMPLICIT_BIT);
} else if ((int32_t)expob < (MIN_NORM_EXPO - EXPO_ADJUST)) { /* b is subnormal */
/* normalize significand of b */
mantb = b & FLOAT_MANT_MASK;
expob++;
do {
mantb = 2 * mantb;
expob--;
} while (mantb < FLOAT_IMPLICIT_BIT);
}
}
/* result exponent: add argument exponents and adjust for biasing */
expor = expoa + expob - FLOAT_EXPO_BIAS + 2 * EXPO_ADJUST;
mantb = mantb << FLOAT_EXPO_BITS; /* preshift to align result signficand */
/* result significand: multiply argument signficands */
prod = (uint64_t)manta * mantb;
mantr_hi = (uint32_t)(prod >> 32);
mantr_lo = (uint32_t)(prod >> 0);
/* normalize significand */
if (mantr_hi < FLOAT_IMPLICIT_BIT) {
mantr_hi = (mantr_hi << 1) | (mantr_lo >> (32 - 1));
mantr_lo = (mantr_lo << 1);
expor--;
}
/* expor is unsigned, so a negative exponent wraps and fails this test */
if (expor <= (MAX_NORM_EXPO - EXPO_ADJUST)) { /* normal, may overflow to infinity during rounding */
/* combine biased exponent, sign and signficand */
r = (expor << FLOAT_MANT_BITS) + signr + mantr_hi;
/* round result to nearest or even; overflow to infinity possible */
r = r + ((mantr_lo == RND_BIT_MASK) ? (mantr_hi & MANT_LSB) : (mantr_lo >> RND_BIT_SHIFT));
} else if ((int32_t)expor > (MAX_NORM_EXPO - EXPO_ADJUST)) { /* overflow */
/* return infinity */
r = signr | FLOAT_INFINITY;
} else { /* underflow */
/* return zero, subnormal, or smallest normal */
shift = 0 - expor;
if (shift > MAX_SHIFT) shift = MAX_SHIFT;
/* denormalize significand */
/* bits shifted out of mantr_hi become the new low half; the old low half collapses to a sticky bit */
mantr_lo = mantr_hi << (32 - shift) | (mantr_lo ? 1 : 0);
mantr_hi = mantr_hi >> shift;
/* combine sign and signficand; biased exponent known to be zero */
r = mantr_hi + signr;
/* round result to nearest or even */
r = r + ((mantr_lo == RND_BIT_MASK) ? (mantr_hi & MANT_LSB) : (mantr_lo >> RND_BIT_SHIFT));
}
return r;
}
/* Return the object representation of a float as a 32-bit unsigned integer. */
uint32_t float_as_uint (float a)
{
    union {
        float    as_float;
        uint32_t as_bits;
    } pun;

    pun.as_float = a;
    return pun.as_bits;
}
/* Return the float whose object representation is the given 32-bit pattern. */
float uint_as_float (uint32_t a)
{
    union {
        uint32_t as_bits;
        float    as_float;
    } pun;

    pun.as_bits = a;
    return pun.as_float;
}
/* Float-typed wrapper: converts both operands to raw bits, multiplies them
   with fp32_mul_core(), and converts the resulting bits back to a float. */
float fp32_mul (float a, float b)
{
    const uint32_t a_bits = float_as_uint (a);
    const uint32_t b_bits = float_as_uint (b);
    const uint32_t product_bits = fp32_mul_core (a_bits, b_bits);

    return uint_as_float (product_bits);
}
/* Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007 */
/* State of the KISS pseudo-random generator; each macro below updates one
   component in place, and KISS combines the three sub-generators. */
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
#define znew (z=36969*(z&0xffff)+(z>>16))
#define wnew (w=18000*(w&0xffff)+(w>>16))
#define MWC ((znew<<16)+wnew)
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
#define CONG (jcong=69069*jcong+13579) /* 2^32 */
#define KISS ((MWC^CONG)+SHR3)
/* True when the float's bits, with the sign shifted out, exceed those of infinity. */
#define ISNAN(x) ((float_as_uint (x) << 1) > 0xff000000)
/* Sets the quiet-NaN bit in a raw bit pattern. */
#define QNAN(x) (x | FLOAT_QNAN_BIT)
/* Test-vector selection for main(). */
#define PURELY_RANDOM (0)
#define PATTERN_BASED (1)
#define TEST_MODE (PURELY_RANDOM)
/* Table of bit patterns filled in by main() for the pattern-based mode. */
uint32_t v[8192];
/*
 * Test harness: fills v[] with several families of bit patterns, then loops
 * forever comparing fp32_mul(a, b) bit-for-bit against the native a * b.
 * Exits with EXIT_FAILURE on the first mismatch that is not an acceptable
 * NaN-choice difference; prints a running count every 0x1000000 iterations.
 */
int main (void)
{
unsigned long long count = 0;
float a, b, res, ref;
uint32_t i, j, patterns, idx = 0, nbrBits = sizeof (uint32_t) * CHAR_BIT;
/* pattern class 1: 2**i */
for (i = 0; i < nbrBits; i++) {
v [idx] = ((uint32_t)1 << i);
idx++;
}
/* pattern class 2: 2**i-1 */
for (i = 0; i < nbrBits; i++) {
v [idx] = (((uint32_t)1 << i) - 1);
idx++;
}
/* pattern class 3: 2**i+1 */
for (i = 0; i < nbrBits; i++) {
v [idx] = (((uint32_t)1 << i) + 1);
idx++;
}
/* pattern class 4: 2**i + 2**j */
for (i = 0; i < nbrBits; i++) {
for (j = 0; j < nbrBits; j++) {
v [idx] = (((uint32_t)1 << i) + ((uint32_t)1 << j));
idx++;
}
}
/* pattern class 5: 2**i - 2**j */
for (i = 0; i < nbrBits; i++) {
for (j = 0; j < nbrBits; j++) {
v [idx] = (((uint32_t)1 << i) - ((uint32_t)1 << j));
idx++;
}
}
/* pattern class 6: MAX_UINT/(2**i+1) rep. blocks of i zeros an i ones */
for (i = 0; i < nbrBits; i++) {
v [idx] = ((~(uint32_t)0) / (((uint32_t)1 << i) + 1));
idx++;
}
patterns = idx;
/* pattern class 7: one's complement of pattern classes 1 through 6 */
for (i = 0; i < patterns; i++) {
v [idx] = ~v [i];
idx++;
}
/* pattern class 8: two's complement of pattern classes 1 through 6 */
for (i = 0; i < patterns; i++) {
v [idx] = ~v [i] + 1;
idx++;
}
patterns = idx;
#if TEST_MODE == PURELY_RANDOM
printf ("using purely random test vectors\n");
#elif TEST_MODE == PATTERN_BASED
printf ("using pattern-based test vectors\n");
printf ("#patterns = %u\n", patterns);
#endif // TEST_MODE
do {
#if TEST_MODE == PURELY_RANDOM
a = uint_as_float (KISS);
b = uint_as_float (KISS);
#elif TEST_MODE == PATTERN_BASED
i = KISS % patterns;
j = KISS % patterns;
/* significand bits come from the pattern table, sign and exponent bits are random */
a = uint_as_float ((v[i] & 0x7fffff) | (KISS & ~0x7fffff));
b = uint_as_float ((v[j] & 0x7fffff) | (KISS & ~0x7fffff));
#endif // TEST_MODE
res = fp32_mul (a, b);
ref = a * b;
/* check for bit pattern mismatch between result and reference */
if (float_as_uint (res) != float_as_uint (ref)) {
/* if both a and b are NaNs, either could be returned quietened */
if (! (ISNAN (a) && ISNAN (b) &&
((QNAN (float_as_uint (a)) == float_as_uint (res)) ||
(QNAN (float_as_uint (b)) == float_as_uint (res))))) {
printf ("err: a=% 15.8e (%08x) b=% 15.8e (%08x) res=% 15.8e (%08x) ref=%15.8e (%08x)\n",
a, float_as_uint(a), b, float_as_uint (b), res, float_as_uint (res), ref, float_as_uint (ref));
return EXIT_FAILURE;
}
}
count++;
if (!(count & 0xffffff)) printf ("\r%llu", count);
} while (1);
return EXIT_SUCCESS;
}
It is much more complicated. Take a look on the source of the softmath library (for example https://github.com/riscv/riscv-pk/blob/master/softfloat/f64_mul.c). Clone it and analyze.

How can I extract the biased exponent from a floating point number in C?

I am having issues finding the biased exponent from a floating point number. I do not understand why I return a 0 from any number that I enter for the parameter. Here is my code:
// Asker's version: shifts x right by 23 and then ANDs with 0x7f800000.
// NOTE(review): after the shift the exponent sits in the low bits, while the
// mask only keeps bits 23..30, so the two steps cancel each other out.
iFloat_t floatGetExp(iFloat_t x)
{
return ((x >> 23) & 0x7f800000);
}
The exponent of a float is in bits 23 to 30, so that is why I am shifting to the right by 23 and have a mask of 0x7f800000.
For example, if I passed it a float of 248.875, the exponent should be 22.
((x >> 23) & 0x7f800000); is shift and masking in the wrong order.
The math library, C has a function that retrieves the exponent. This is the portable way.
/*
 * Portable exponent extraction via frexpf(), which reports the exponent for a
 * significand in [0.5, 1.0). For non-zero x the result is lowered by one;
 * zero keeps the exponent frexpf() reported.
 */
int floatGetExp(float x) {
    int frexp_expo;

    frexpf(x, &frexp_expo);
    return (x != 0) ? frexp_expo - 1 : frexp_expo;
}
Assuming binary32 format for iFloat_t x there are 4 cases to consider:
Infinity/NaN, normal numbers, sub-normals, zero.
/*
 * Return the unbiased binary exponent of a binary32 bit pattern.
 *
 *  - Inf/NaN (biased exponent 0xFF): returns TBD (value left to the caller's
 *    project to define).
 *  - Normal numbers: biased exponent minus 127.
 *  - Zero: returns 0.
 *  - Subnormals: the fraction is shifted left until the implicit-bit position
 *    (bit 23) is set, lowering the exponent from -126 by one per shift.
 */
iFloat_t floatGetExp(iFloat_t x) {
  uint32_t u = (uint32_t) x; // move into known type
  // mask off the biased exponent
  int bias_exp = (int) ((u & 0x7F800000u) >> 23);
  if (bias_exp == 0xFF) return TBD; // inf or NaN
  if (bias_exp > 0) return bias_exp - 127; // the usual
  // == binds tighter than &, so the mask must be parenthesized
  if ((u & 0x7FFFFFFFu) == 0) return 0; // zero
  // handle sub-normal numbers: they share the minimum exponent of -126
  bias_exp = -126;
  while ((u & 0x00800000u) == 0) {
    u <<= 1;
    bias_exp--;
  }
  return bias_exp;
}

Getting the exponent from a floating point in C

I'm writing a function that will get the exponent of a floating point number (IEEE 754 standard) but for some reason when I use the right shift bitwise operator on the number it returns 0
Here is the function
// Asker's version: shifts the int argument right by 21, keeps 8 bits, and
// subtracts the bias of 127.
// NOTE(review): the answers that follow point out that the parameter should
// be a float and that the shift count should be 23, not 21.
int get_exp (int x)
{
return ( ((x >> 21) & 255) -127 );
}
I'm passing it 7.23 so the output should be 2, for some reason the (x >> 21) part returns 0 when it should actually be returning 129. The 255 is the mask I'm using to and (&) with the exponent part of the floating point number.
I'm guessing you're doing some kind of casting hocus-pocus to pass floating point as ints? I would use float frexpf (float x, int* exp); as defined in <math.h>.
#include <math.h>
/* Return the binary exponent of x as reported by frexpf()
   (i.e. for a significand normalised to [0.5, 1.0)). */
int get_exp(float x)
{
    int exponent;

    (void) frexpf(x, &exponent);
    return exponent;
}
It's guaranteed to work regardless of the sizes of the floating point types.
If you want to roll it yourself, you can adapt this code.
#define EXPONENT_BIAS (-127)
/*
 * Return the unbiased exponent field of f, and dump f's bytes to stderr
 * (in memory order) as a debugging aid.
 *
 * The exponent is taken with shift-and-mask from an integer overlay rather
 * than from bit-fields: bit-field allocation order is implementation-defined,
 * so a sign:1 / exponent:8 / mantissa:23 struct does not line up with the
 * IEEE layout on every ABI.
 *
 * Assumes float is IEEE-754 binary32 and unsigned int is 32 bits wide with
 * the same byte order as float.
 */
int get_exp(float f)
{
    unsigned i;
    union {
        // Set here, then read back through bits or c
        float f;
        // Integer view of the same bytes
        unsigned int bits;
        // For debugging purposes
        unsigned char c[sizeof(float)];
    } u;

    u.f = f;
    // Byte dump: shows the in-memory order of the representation
    for (i = 0; i < sizeof(float); i++)
        fprintf(stderr, "%02x%s", u.c[i], (i + 1 < sizeof(float))? " ": "\n");
    // Exponent occupies bits 30..23
    return (int)((u.bits >> 23) & 0xFFu) + EXPONENT_BIAS;
}
This will bite you if sizeof(float) != 4, or if you switch endian-ness.
Main issue is the passing of int rather than float and using 21 vs 23. @dbush
IEEE 754 standard (binary32) has a number of corner cases: Infinity, NaN, sub-normal including zero. So additional code is needed to cope with them.
Assuming proper endian:
#define EXPOSHIFT 23
#define EXPOMASK 255
#define EXPOBIAS 127
/*
 * Unbiased exponent of a binary32 value, covering the corner cases:
 *   zero            -> 0
 *   infinity / NaN  -> INT_MAX
 *   subnormal       -> scaled up by 2^EXPOSHIFT, re-evaluated, then corrected
 *   normal          -> biased exponent field minus EXPOBIAS
 */
int get_exp(float x) {
    assert(sizeof x == sizeof(uint32_t));
    union {
        float x;
        uint32_t u32;
    } pun = { x };

    if (x == 0.0) {
        return 0;
    }

    const int biased = (int) ((pun.u32 >> EXPOSHIFT) & EXPOMASK);
    switch (biased) {
    case EXPOMASK:
        return INT_MAX; // x is infinity or NaN
    case 0:
        // Subnormal: multiply into the normal range and undo the scaling.
        return get_exp(x * (1L << EXPOSHIFT)) - EXPOSHIFT;
    default:
        return biased - EXPOBIAS;
    }
}
Working under the assumption that a float is 32 bit and is laid out as specified here, you have three issues:
Your function needs to accept a float.
You need to point a uint32_t to the address of the float so it sees the same bytes, then perform actions against the dereferenced pointer.
The exponent starts at the 24th (23 if you start from 0) bit, not the 22nd (21 if you start with 0), so you have to shift by 23.
#include <stdio.h>
#include <stdint.h>
/*
 * Return the unbiased exponent field (bits 30..23 minus 127) of x.
 * The bits are read through a union instead of a casted pointer, because
 * dereferencing a uint32_t* that points at a float violates the aliasing rules.
 * Zero and subnormals therefore yield -127, infinities and NaNs yield 128.
 */
int get_exp (float x)
{
    union {
        float f;
        uint32_t u;
    } pun = { .f = x };

    return (int)((pun.u >> 23) & 255u) - 127;
}
// Prints the exponent that get_exp() reports for 7.23.
int main()
{
printf("exp=%d\n",get_exp(7.23));
}
Result:
exp=2
Should performance not be an issue, simply iterate:
/*
 * Return the binary exponent e such that |f| = m * 2^e with 0.5 <= m < 1,
 * found by repeatedly doubling or halving |f|.
 *
 * Zero, infinities and NaN have no such exponent; 0 is returned for them.
 * (Without this guard, doubling 0 or halving infinity never terminates.)
 */
int expof(float f) {
    int expo = 0;

    if (f < 0.0f) {
        f = -f;
    }
    /* f - f is 0 for every finite value and NaN for infinity/NaN. */
    if (f == 0.0f || f - f != 0.0f) {
        return 0;
    }
    while (f < 0.5f) {
        f *= 2.0f;
        expo--;
    }
    while (f >= 1.0f) {
        f *= 0.5f;
        expo++;
    }
    return expo;
}
Does not depend on any particular float implementation other than that the exponent fits in int. It uses no external functions, as commented here.
Same result as from int expo; frexpf(f, &expo); return expo
The parameter list show
int x
and you pass a floating point number. Try to substitute with
float x

cast 32bit-float to 64bit-double on system where sizeof double == sizeof float == 4

I am trying to serialize a float according to the BSON spec which only has support for 64bit double. so i need to cast my float to a double.
On a system where sizeof(double) == 8 i would just do
float f = 3.14;
serialize((double)f);
but since sizeof(double) == 4 on my target system I have to do something like
float f = 3.14;
uint64_t d;
float32_to_float64(f, &d);
serialize(d);
i have written some test code (on a machine where sizeof(double) == 8) trying to correctly converting the float32 to float64 and storing the result as a uint64_t but i am not getting the expected result.
#include <stdio.h>
#include <stdint.h>
// Asker's experiment: builds the bits of a double from the bits of a float by
// copying sign, exponent and fraction into place, then prints the result next
// to the compiler's own float-to-double conversion.
// NOTE(review): this mask is 24 bits wide although the fraction is 23 bits.
#define FLOAT_FRACTION_MSK 0xFFFFFF
#define DOUBLE_FRACTION_S 52 // Fraction is 52 bits
#define DOUBLE_EXPONENT_S 11 // Exponent is 11 bits
#define FLOAT_FRACTION_S 23 // Fraction is 23 bits
#define FLOAT_EXPONENT_S 8 // Exponent is 8 bits
int main(void) {
// float af = 3.14;
float af = 0.15625;
double bd = 0;
//uint8_t buff[sizeof(int64_t)] = {0};
*(uint64_t*)&bd |= (*(uint32_t*)&af & (1UL << 31)) << 32; // check sign bit
// NOTE(review): the exponent field starts at bit 23, but this shifts by 24.
uint8_t exponent32 = (*(uint32_t*)&af & 0x7F800000) >> (FLOAT_FRACTION_S+1);
if (exponent32 == 0xFF) return 1; // Error (infiniti if fraction is zero,
// Nan ortherwise)
printf("exponent32=%.4x\n", exponent32);
int64_t temp = *(uint64_t*)&bd;
// NOTE(review): the float's exponent is inserted as-is, without converting
// from the float bias to the double bias.
*(uint64_t*)&bd |= ((uint64_t)exponent32 << (DOUBLE_FRACTION_S+4)); //& 0x7FF0000000000000; // (33); // 28
printf("exponent64=%llx, %d\n", *(uint64_t*)&bd, (DOUBLE_FRACTION_S+4));
// Do the fraction
{
printf("fraction64=%#.8llx\n", (
(uint64_t)(
(*(uint32_t*)&af & FLOAT_FRACTION_MSK) // + ((exponent32 != 0) ? (1<<24) : 0)
) << (DOUBLE_FRACTION_S-FLOAT_FRACTION_S-4)//((52-22)-1) // 33
) );
*(uint64_t*)&bd |= (
(uint64_t)(
(*(uint32_t*)&af & FLOAT_FRACTION_MSK) // + ((exponent32 != 0) ? (1<<24) : 0)
) << (DOUBLE_FRACTION_S-FLOAT_FRACTION_S)
) ;
}
// Reference value: the compiler's own conversion.
double expected = af;
printf("Original float=%#.4x, converted double=%#.8llx expected=%.8llx,\n", *(uint32_t*)&af, *(uint64_t*)&bd, *(uint64_t*)&expected);
printf("Original float=%f, converted double=%lf\n\n", af, bd);
*(uint64_t*)&bd = temp;
return 0;
}
The output of this gives Original float=0x3e200000, converted double=0x3e04000000000000 expected=3fc4000000000000,
So it seems i am missing something when converting the exponent but i am at a loss to what that is.
fixed denormals, infinites & nans
// Converts a 32-bit float to the bit pattern of the equivalent 64-bit double,
// returned as an unsigned 64-bit integer. Zeros are handled up front;
// denormals are normalised; the all-ones exponent is mapped to the double's
// all-ones exponent.
// NOTE(review): e is unsigned, so the comparisons with -128 and the e--/e++
// arithmetic rely on unsigned wrap-around. The pointer cast on the first line
// reads a float through an unsigned int pointer.
unsigned __int64 Float2Double(float v)
{
unsigned int f = *(unsigned int*)&v; // reinterpret
if ( !(f&0x7fffffff) )
return (unsigned __int64)f<<32; // return +/-0.0
unsigned int s = f>>31; // get sign
unsigned int e = ((f&0x7f800000)>>23) -128; // get exponent and unbias from 128
unsigned int m = f&0x007fffff; // get mantisa
if (e==-128)
{
// handle denormals
// shift until the implicit-bit position (bit 23) is occupied
while ( !(m&0x00800000) )
{
m<<=1;
e--;
}
m&=0x007fffff; // remove implicit 1
e++; //
}
else
if (e==127)
{
// +/-infinity
// (e + 1024 below then yields 2047, the double's all-ones exponent)
e = 1023;
}
unsigned __int64 d = s; // store sign (in lowest bit)
d <<= 11; // make space for exponent
d |= e +1024; // store rebiased exponent
d <<= 23; // add space for 23 most significant bits of mantisa
d |= m; // store 23 bits of mantisa
d <<= 52-23; // trail zeros in place of lower significant bit of mantisa
return d;
}
After-accept answer that works with all float values.
Tested successfully with all float including typical normal finites, sub normals, +/- zero, +/- infinity and NaN.
#include <assert.h>
#include <math.h>
#include <stdint.h>
/* binary32 (float) field layout */
#define F_SIGN_SHIFT (31)
#define F_EXPO_MAX (0xFF)
#define F_EXPO_SHIFT (23)
#define F_EXPO_MASK ((uint32_t) F_EXPO_MAX << F_EXPO_SHIFT)
#define F_EXPO_BIAS (127)
#define F_SFCT_MASK (0x7FFFFF)
#define F_SFCT_IMPLIEDBIT (F_SFCT_MASK + 1)
/* binary64 (double) field layout */
#define D_SIGN_SHIFT (63)
#define D_EXPO_MAX (0x7FF)
#define D_EXPO_SHIFT (52)
#define D_EXPO_MASK ((uint64_t) D_EXPO_MAX << D_EXPO_SHIFT)
#define D_EXPO_BIAS (1023)

/*
 * Widen a binary32 float to the bit pattern of the equal binary64 value.
 * Handles normals, subnormals (renormalised, since every float subnormal is a
 * normal double), zeros, infinities and NaNs; the sign is carried over.
 */
uint64_t IEEEbinary32float_to_IEEEbinary64int(float f) {
    assert(sizeof f == sizeof(uint32_t));
    union {
        float f;
        uint32_t u;
    } in = { f };

    const uint64_t sign64 = (uint64_t) (in.u >> F_SIGN_SHIFT) << D_SIGN_SHIFT;
    unsigned expo64 = (in.u & F_EXPO_MASK) >> F_EXPO_SHIFT;
    uint32_t frac = in.u & F_SFCT_MASK;

    if (expo64 == F_EXPO_MAX) {
        /* Infinity or NaN: all-ones exponent in the wider format too. */
        expo64 = D_EXPO_MAX;
    } else if (expo64 > 0) {
        /* Ordinary normal number: just re-bias. */
        expo64 += D_EXPO_BIAS - F_EXPO_BIAS;
    } else if (frac != 0) {
        /* Subnormal: shift the leading 1 up to the implied-bit position. */
        expo64 = D_EXPO_BIAS - F_EXPO_BIAS + 1;
        for (; (frac & F_SFCT_IMPLIEDBIT) == 0; frac <<= 1) {
            expo64--;
        }
        frac &= F_SFCT_MASK;
    } else {
        /* Zero. */
        expo64 = 0;
    }

    return sign64
         | ((uint64_t) expo64 << D_EXPO_SHIFT)
         | ((uint64_t) frac << (D_EXPO_SHIFT - F_EXPO_SHIFT));
}

Divide by power of 2 resulting in float

I find myself needing to compute 16-bit unsigned integer divided by power of 2, which should result in a 32-bit float (standard IEEE format). This is on embedded system and the routine is repeatedly used so I am looking for something better than (float)x/(float)(1<<n). In addition, C compiler is pretty limited (no math lib, bit field, reinterpret_cast, etc).
If you don't mind some bit twiddling then the obvious way to go is to convert the integer to float and then subtract n from the exponent bits to achieve the division by 2^n:
// Divide x by 2^n: convert to float, then subtract n from the exponent field.
// Only valid for x > 0 and for n small enough that the exponent stays positive.
// NOTE(review): the pointer casts read one type through a pointer to another;
// a union is the safer way to get at the bits where the compiler supports it.
y = (float)x; // convert to float
uint32_t yi = *(uint32_t *)&y; // get float value as bits
uint32_t exponent = yi & 0x7f800000u; // extract exponent bits 30..23
exponent -= (n << 23); // subtract n from exponent
yi = (yi & ~0x7f800000u) | (exponent & 0x7f800000u); // insert modified exponent back into bits 30..23 (masked so a borrow cannot touch the sign bit)
y = *(float *)&yi; // copy bits back to float
Note that this fails for x = 0, so you should check x > 0 before conversion.
Total cost is one int-float conversion plus a handful of integer bitwise/arithmetic operations. If you use a union you can avoid having separate int/float representations and just work directly on the float.
Use ldexpf(x, -n). This function is defined by the C standard to do exactly what you want, return $x \cdot 2^{-n}$, so any decent compiler will provide good code for this. (This requires either part of a math library or a compiler that optimizes this to inline code.)
If n is known at compile time, you can also consider x * (1.f/(1<<n)). A good compiler will compute (1.f/(1<<n)) at compile time, so the executable code will be two operations: Convert x to float and multiply by a constant. That might be faster than the code generated for ldexpf(x, -n) if the compiler does not optimize ldexpf(x, -n) as well as it might.
A quick and easy solution is to precompute a table of float values of $2^{-n}$ for n >= 0 (what's the upper limit for n, around 31?) and then multiply x by the nth element of the table.
This may not be the fastest if your code emulates floating point multiplication because the CPU doesn't support it directly.
You may, however, do it quicker using integer math only.
Example (assuming IEEE-754 32-bit floats):
#include <limits.h>
#include <string.h>
#include <stdio.h>
/* Compile-time assertion: declares an array whose size is -1 (a compile
   error) when expr is false. */
#define C_ASSERT(expr) extern char CAssertExtern[(expr)?1:-1]
/* Platform assumptions relied on by div() below. */
C_ASSERT(CHAR_BIT == 8);
C_ASSERT(sizeof(float) == 4);
C_ASSERT(sizeof(int) == 4);
/* Shift v right by s bits, rounding to nearest with ties to even.
   Assumes a 32-bit unsigned. */
static unsigned shr_rne(unsigned v, unsigned s)
{
    unsigned q, rem, half;

    if (s == 0)
        return v;
    if (s > 32)
        return 0;                   /* v / 2^s < 1/2 */
    if (s == 32)
        return v > 0x80000000u;     /* exactly 1/2 ties to even (0) */
    q = v >> s;
    rem = v & ((1u << s) - 1u);
    half = 1u << (s - 1);
    if (rem > half || (rem == half && (q & 1u)))
        q++;
    return q;
}

/*
 * Compute x / 2^n as an IEEE-754 binary32 float using integer operations only.
 *
 * The result is correctly rounded (round-to-nearest-even) for every int x and
 * every n: magnitudes of 2^24 and above are rounded rather than truncated,
 * INT_MIN is negated without signed overflow, and quotients below the normal
 * range come out as subnormals or zero instead of a wrapped exponent.
 * Assumes 32-bit int/unsigned and a binary32 float.
 */
float div(int x, unsigned n)
{
    float res;
    unsigned sign = x < 0;
    /* Negate in unsigned arithmetic: -x overflows for INT_MIN. */
    unsigned mag = sign ? 0u - (unsigned)x : (unsigned)x;
    unsigned bits = 0;

    if (mag)
    {
        unsigned p = 31;            /* index of the most significant set bit */
        while (!(mag >> p))
            p--;

        if (n <= p + 126)
        {
            /* Normal result: unbiased exponent p - n is at least -126. */
            unsigned e = p + 0x7F - n;
            unsigned sig;
            if (p > 23)
            {
                sig = shr_rne(mag, p - 23);
                if (sig == (1u << 24))  /* rounding carried into a new bit */
                    sig >>= 1, e++;
            }
            else
                sig = mag << (23 - p);
            bits = (e << 23) | (sig & 0x7FFFFFu);   /* drop the implicit 1 */
        }
        else
        {
            /* Subnormal or zero: express the value in units of 2^-149. */
            bits = (n <= 149) ? mag << (149 - n) : shr_rne(mag, n - 149);
        }
        bits |= sign << 31;         /* mix in the sign */
    }
    memcpy(&res, &bits, sizeof bits);
    return res;
}
/* Prints the four bytes of buf as two-digit upper-case hex, highest index
   first, followed by a single space. */
void Print4Bytes(unsigned char buf[4])
{
printf("%02X%02X%02X%02X ", buf[3], buf[2], buf[1], buf[0]);
}
int main(void)
{
int x = 0x35AA53;
int n;
for (n = 0; n < 31; n++)
{
float v1 = (float)x/(1u << n);
float v2 = div(x, n);
Print4Bytes((void*)&v1);
printf("%c= ", "!="[memcmp(&v1, &v2, sizeof v1) == 0]);
Print4Bytes((void*)&v2);
printf("%14.6f %14.6f\n", v1, v2);
}
return 0;
}
Output (ideone):
4A56A94C == 4A56A94C 3517011.000000 3517011.000000
49D6A94C == 49D6A94C 1758505.500000 1758505.500000
4956A94C == 4956A94C 879252.750000 879252.750000
48D6A94C == 48D6A94C 439626.375000 439626.375000
4856A94C == 4856A94C 219813.187500 219813.187500
47D6A94C == 47D6A94C 109906.593750 109906.593750
4756A94C == 4756A94C 54953.296875 54953.296875
46D6A94C == 46D6A94C 27476.648438 27476.648438
4656A94C == 4656A94C 13738.324219 13738.324219
45D6A94C == 45D6A94C 6869.162109 6869.162109
4556A94C == 4556A94C 3434.581055 3434.581055
44D6A94C == 44D6A94C 1717.290527 1717.290527
4456A94C == 4456A94C 858.645264 858.645264
43D6A94C == 43D6A94C 429.322632 429.322632
4356A94C == 4356A94C 214.661316 214.661316
42D6A94C == 42D6A94C 107.330658 107.330658
4256A94C == 4256A94C 53.665329 53.665329
41D6A94C == 41D6A94C 26.832664 26.832664
4156A94C == 4156A94C 13.416332 13.416332
40D6A94C == 40D6A94C 6.708166 6.708166
4056A94C == 4056A94C 3.354083 3.354083
3FD6A94C == 3FD6A94C 1.677042 1.677042
3F56A94C == 3F56A94C 0.838521 0.838521
3ED6A94C == 3ED6A94C 0.419260 0.419260
3E56A94C == 3E56A94C 0.209630 0.209630
3DD6A94C == 3DD6A94C 0.104815 0.104815
3D56A94C == 3D56A94C 0.052408 0.052408
3CD6A94C == 3CD6A94C 0.026204 0.026204
3C56A94C == 3C56A94C 0.013102 0.013102
3BD6A94C == 3BD6A94C 0.006551 0.006551
3B56A94C == 3B56A94C 0.003275 0.003275

Resources