I was thinking about the floor function available in math.h. It is very easy to use it:
#include <stdio.h>
#include <math.h>
int main(void)
{
for (double a = 12.5; a < 13.4; a += 0.1)
printf("floor of %.1lf is %.1lf\n", a, floor(a));
return 0;
}
What if I would like to write my own implementation of it? Would it look simply like this:
#include <stdio.h>
#include <math.h>
double my_floor(double num)
{
return (int)num;
}
int main(void)
{
double a;
for (a = 12.5; a < 13.4; a += 0.1)
printf("floor of %.1lf is %.1lf\n", a, floor(a));
printf("\n\n");
for (a = 12.5; a < 13.4; a += 0.1)
printf("floor of %.1lf is %.1lf\n", a, my_floor(a));
return 0;
}
?
It seems it does not work with negative numbers (my_floor), but the second one seems to be fine (my_floor_2):
#include <stdio.h>
#include <math.h>
double my_floor(double num)
{
return (int)num;
}
double my_floor_2(double num)
{
if(num < 0)
return (int)num - 1;
else
return (int)num;
}
int main(void)
{
double a1 = -12.5;
printf("%lf\n", floor(a1));
printf("%lf\n", my_floor(a1));
printf("%lf\n", my_floor_2(a1));
return 0;
}
program output:
-13.000000
-12.000000
-13.000000
Is one of them eventually correct or not?
Both of your attempts have limitations:
If the double value is outside the range of the int type, converting to int is implementation defined.
If the double value is negative but integral, returning (int)num - 1 is incorrect.
Here is an (almost) portable version that tries to handle all cases:
double my_floor_2(double num) {
if (num >= LLONG_MAX || num <= LLONG_MIN || num != num) {
/* handle large values, infinities and nan */
return num;
}
long long n = (long long)num;
double d = (double)n;
if (d == num || num >= 0)
return d;
else
return d - 1;
}
It should be correct if type long long has more value bits than type double, which is the case on most modern systems.
No, you can't tackle it this way. The best way of writing your own implementation is to take the one from the C Standard Library on your platform. But note that might contain platform specific nuances so might not be portable.
The C Standard Library floor function is typically clever in that it doesn't work by taking a conversion to an integral type. If it did then you'd run the risk of signed integer overflow, the behaviour of which is undefined. (Note that the smallest possible range for an int is -32767 to +32767).
The precise implementation is also dependent on the floating point scheme used on your platform.
For a platform using IEEE754 floating point, and a long long type you could adopt this scheme:
If the magnitude of the number is greater than 253, return it (as it's already integral).
Else, cast to a 64-bit type (long long), and return it back.
In C++ and 32 bit arithmetics it can be done for example like this:
//---------------------------------------------------------------------------
// IEEE 754 double MSW masks
const DWORD _f64_sig =0x80000000; // sign
const DWORD _f64_exp =0x7FF00000; // exponent
const DWORD _f64_exp_sig=0x40000000; // exponent sign
const DWORD _f64_exp_bia=0x3FF00000; // exponent bias
const DWORD _f64_exp_lsb=0x00100000; // exponent LSB
const DWORD _f64_exp_pos= 20; // exponent LSB bit position
const DWORD _f64_man =0x000FFFFF; // mantisa
const DWORD _f64_man_msb=0x00080000; // mantisa MSB
const DWORD _f64_man_bits= 52; // mantisa bits
// IEEE 754 single masks
const DWORD _f32_sig =0x80000000; // sign
const DWORD _f32_exp =0x7F800000; // exponent
const DWORD _f32_exp_sig=0x40000000; // exponent sign
const DWORD _f32_exp_bia=0x3F800000; // exponent bias
const DWORD _f32_exp_lsb=0x00800000; // exponent LSB
const DWORD _f32_exp_pos= 23; // exponent LSB bit position
const DWORD _f32_man =0x007FFFFF; // mantisa
const DWORD _f32_man_msb=0x00400000; // mantisa MSB
const DWORD _f32_man_bits= 23; // mantisa bits
//---------------------------------------------------------------------------
double f64_floor(double x)
{
const int h=1; // may be platform dependent MSB/LSB order
const int l=0;
union _f64 // semi result
{
double f; // 64bit floating point
DWORD u[2]; // 2x32 bit uint
} y;
DWORD m,a;
int sig,exp,sh;
y.f=x;
// extract sign
sig =y.u[h]&_f64_sig;
// extract exponent
exp =((y.u[h]&_f64_exp)>>_f64_exp_pos)-(_f64_exp_bia>>_f64_exp_pos);
// floor bit shift
sh=_f64_man_bits-exp; a=0;
if (exp<0)
{
a=y.u[l]|(y.u[h]&_f64_man);
if (sig) return -1.0;
return 0.0;
}
// LSW
if (sh>0)
{
if (sh<32) m=(0xFFFFFFFF>>sh)<<sh; else m=0;
a=y.u[l]&(m^0xFFFFFFFF); y.u[l]&=m;
}
// MSW
sh-=32;
if (sh>0)
{
if (sh<_f64_exp_pos) m=(0xFFFFFFFF>>sh)<<sh; else m=_f64_sig|_f64_exp;
a|=y.u[h]&(m^0xFFFFFFFF); y.u[h]&=m;
}
if ((sig)&&(a)) y.f--;
return y.f;
}
//---------------------------------------------------------------------------
float f32_floor(float x)
{
union // semi result
{
float f; // 32bit floating point
DWORD u; // 32 bit uint
} y;
DWORD m,a;
int sig,exp,sh;
y.f=x;
// extract sign
sig =y.u&_f32_sig;
// extract exponent
exp =((y.u&_f32_exp)>>_f32_exp_pos)-(_f32_exp_bia>>_f32_exp_pos);
// floor bit shift
sh=_f32_man_bits-exp; a=0;
if (exp<0)
{
a=y.u&_f32_man;
if (sig) return -1.0;
return 0.0;
}
if (sh>0)
{
if (sh<_f32_exp_pos) m=(0xFFFFFFFF>>sh)<<sh; else m=_f32_sig|_f32_exp;
a|=y.u&(m^0xFFFFFFFF); y.u&=m;
}
if ((sig)&&(a)) y.f--;
return y.f;
}
//---------------------------------------------------------------------------
The point is to make mask that will clear out the decimal bits from mantissa and in case of negative input and non zero cleared bits decrement the result. To access individual bits you can convert your floating point value to integral representation with use of union (like in the example) or use pointers instead.
I tested this in simple VCL app like this:
float f32;
double f64;
AnsiString txt="";
// 64 bit
txt+="[double]\r\n";
for (f64=-10.0;f64<=10.0;f64+=0.1)
if (fabs(floor(f64)-f64_floor(f64))>1e-6)
{
txt+=AnsiString().sprintf("%5.3lf %5.3lf %5.3lf\r\n",f64,floor(f64),f64_floor(f64));
f64_floor(f64);
}
for (f64=1;f64<=1e307;f64*=1.1)
{
if (fabs(floor( f64)-f64_floor( f64))>1e-6) { txt+=AnsiString().sprintf("%lf lf lf\r\n", f64,floor( f64),f64_floor( f64));
f64_floor( f64); }
if (fabs(floor(-f64)-f64_floor(-f64))>1e-6) { txt+=AnsiString().sprintf("%lf lf lf\r\n",-f64,floor(-f64),f64_floor(-f64));
f64_floor(-f64); }
}
// 32 bit
txt+="[float]\r\n";
for (f32=-10.0;f32<=10.0;f32+=0.1)
if (fabs(floor(f32)-f32_floor(f32))>1e-6)
{
txt+=AnsiString().sprintf("%5.3lf %5.3lf %5.3lf\r\n",f32,floor(f32),f32_floor(f32));
f32_floor(f32);
}
for (f32=1;f32<=1e37;f32*=1.1)
{
if (fabs(floor( f32)-f32_floor( f32))>1e-6) { txt+=AnsiString().sprintf("%lf lf lf\r\n", f32,floor( f32),f32_floor( f32));
f32_floor( f32); }
if (fabs(floor(-f32)-f32_floor(-f32))>1e-6) { txt+=AnsiString().sprintf("%lf lf lf\r\n",-f32,floor(-f32),f32_floor(-f32));
f32_floor(-f32); }
}
mm_log->Lines->Add(txt);
with no difference result (so in all tested cases it matches math.h floor() values. If you want to give this a shot outside VCL then just change AnsiString to any string type you got at hand and change the output from TMemo::mm_log to anything you got (like console cout or whatever)
The double calling of fxx_floor() in case of difference is for debuging purposes (you can place a breakpoint and step in the error case directly).
[Notes]
Beware the order of words (MSW,LSW) is platform dependent so you should adjust the h,l constants accordingly. This code is not optimized so it is easily understandable so do not expect it will be fast.
When the precision of the floating point type is small enough as compared to a wide integer type, cast to that integer type when the floating point value is in the integer range.
Review the function for values outside the intmax_t range, NAN, infinity and -0.0 and adjust as desired.
#if DBL_MANT_DIG >= 64
#error TBD code
#endif
#include <inttypes.h>
// INTMAX_MAX is not exact as a double, yet INTMAX_MAX + 1 is an exact double
#define INTMAX_MAX_P1 ((INTMAX_MAX/2 + 1)*2.0)
double my_floor(double x) {
if (x >= 0.0) {
if (x < INTMAX_MAX_P1) {
return (double)(intmax_t)x;
}
return x;
} else if (x < 0.0) {
if (x >= INTMAX_MIN) {
intmax_t ix = (intmax_t) x;
return (ix == x) ? x : (double)(ix-1);
}
return x;
}
return x; // NAN
}
Try it online!. Try this as your function:
// we need as much space as possible
typedef long double largestFloat;
largestFloat myFloor(largestFloat x)
{
largestFloat xcopy = (x < 0) ? (x * -1) : x;
unsigned int zeros = 0;
largestFloat n = 1;
// Count digits before the decimal
for (n = 1; xcopy > (n * 10); n *= 10, ++zeros)
;
// Make xcopy follow 0 <= xcopy < 1
for (xcopy -= n; zeros != -1; xcopy -= n) {
if (xcopy < 0) {
xcopy += n;
n /= 10;
--zeros;
}
}
xcopy += n;
// Follow standard floor behavior
if (x < 0)
return (xcopy == 0) ? x : (x + xcopy - 1);
else
return x - xcopy;
}
This is an explanation of the code.
Create xcopy (absolute value of x)
Use the first for loop to figure out the number of digits before the decimal point.
Use that number to continually decrease xcopy until it satisfies 0 <= xcopy < 1
Based on whether x was originally positive or negative, either return x - xcopy or x - (1 - xcopy).
Related
I need to check if the result of a mathematical division is an integer or not.
For example, 8 / 2 = 4
is ok.
but 5 / 2 = 2.5
shouldn't be ok.
I've tried the following:
bool isPrime(int num)
{
/* Checks all the numbers before the given input. If the result of
dividing
the input by one of those numbers is an int, then the input is not a
prime number. */
int i, check;
double result;
for (i=2; i<num; i++) {
result = (double) num / i;
check = (int) result;
if (isdigit(check))
return false;
}
return true;
}
I'm having nightmares about isdigit and how to insert the parameter in the right way. I know it requires an int, but I have a double, so I can't really put those pieces together.
It seems like you're trying to do something like this:
result = (double) num / i;
check = (int) result;
if(check == result) {
...
Logically, this is correct. But it will not work in practice because floats does not have infinite precision.
The proper way to check divisibility is using the modulo operator:
if(num % i == 0) {
// Code to run if num / i is an integer
You want to test if an integer division has no remainder: use the modulo operator % that computes the remainder. The isdigit() function has a different purpose: it tests whether a byte read by getc() is a digit ('0' to '9') or not.
Here is a modified version:
bool isPrime(int num) {
/* Checks all the numbers before the given input. If the result of
dividing the input by one of those numbers is integer, then the
input is not a prime number. */
int i;
for (i = 2; i < num; i++) {
if (num % i == 0)
return false;
}
return true;
}
Note that it would be much quicker for prime numbers to stop the search when i * i > num:
bool isPrime(int num) {
/* Checks all the numbers before the given input. If the result of
dividing the input by one of those numbers is integer, then the
input is not a prime number. */
int i;
for (i = 2; i * i <= num; i++) {
if (num % i == 0)
return false;
}
return true;
}
I don´t understand the logic of your function. But maybe this help you. Compare the result of the % modulo operation by the result of the division with the floored version of the division´s result, to 0. If it equals 0, return true;, if not return false;.
At long long int res_floored = res; happens an implicit conversion from double to long long int - The value gets floored, f.e. 4.7 to 4. No explicit cast is needed.
Before that, we have to check if the floored double value of the result of the division is capable to be hold in an long long int. Therefore, I compare res to the macros LONG_MAX and LONG_INT, header limits.h, which represent the maximum and minimum integral values a long int can hold. If it doens´t fit we return -1; as error.
int div_result_in_int (double dividend, double divisor);
{
double res = dividend / divisor;
if (res > LONG_MAX || res < LONG_MIN)
{
return -1;
}
long long int res_floored = res;
if (res % res_floored == 0)
{
return true;
}
else
{
return false;
}
}
I use double for both parameters, because the division of two floating-point values can result in an integral value.
#include <stdio.h>
#include <limits.h>
#define true 1
#define false 0
int div_result_in_int (double dividend, double divisor)
{
double res = dividend / divisor;
if (res > LONG_MAX || res < LONG_MIN)
{
return -1;
}
long long int res_floored = res;
if (res == res_floored)
{
return true;
}
else
{
return false;
}
}
int main(void)
{
printf("%d\n", div_result_in_int(8,4));
printf("%d\n", div_result_in_int(9,5));
printf("%d\n", div_result_in_int(3,1));
printf("%d\n", div_result_in_int(97,14));
printf("%d\n", div_result_in_int(2,0.5));
}
Output:
1
0
1
0
1
I am trying to code a program that will take a floating point number in base 10 and convert its fractional part in base 2. In the following code, I am intending to call my converting function into a printf, and format the output; the issue I have lies in my fra_binary() where I can't figure out the best way to return an integer made of the result of the conversion at each turn respectively (concatenation). Here is what I have done now (the code is not optimized because I am still working on it) :
#include <stdio.h>
#include <math.h>
int fra_binary(double fract) ;
int main()
{
long double n ;
double fract, deci ;
printf("base 10 :\n") ;
scanf("%Lf", &n) ;
fract = modf(n, &deci) ;
int d = deci ;
printf("base 2: %d.%d\n", d, fra_binary(fract)) ;
return(0) ;
}
int fra_binary(double F)
{
double fl ;
double decimal ;
int array[30] ;
for (int i = 0 ; i < 30 ; i++) {
fl = F * 2 ;
F = modf(fl, &decimal) ;
array[i] = decimal ;
if (F == 0) break ;
}
return array[0] ;
}
Obviously this returns partly the desired output, because I would need the whole array concatenated as one int or char to display the series of 1 and 0s I need. So at each turn, I want to use the decimal part of the number I work on as the binary number to concatenate (1 + 0 = 10 and not 1). How would I go about it?
Hope this makes sense!
return array[0] ; is only the first value of int array[30] set in fra_binary(). Code discards all but the first calculation of the loop for (int i = 0 ; i < 30 ; i++).
convert its fractional part in base 2
OP's loop idea is a good starting point. Yet int array[30] is insufficient to encode the fractional portion of all double into a "binary".
can't figure out the best way to return an integer
Returning an int will be insufficient. Instead consider using a string - or manage an integer array in a likewise fashion.
Use defines from <float.h> to drive the buffer requirements.
#include <stdio.h>
#include <math.h>
#include <float.h>
char *fra_binary(char *dest, double x) {
_Static_assert(FLT_RADIX == 2, "Unexpected FP base");
double deci;
double fract = modf(x, &deci);
fract = fabs(fract);
char *s = dest;
do {
double d;
fract = modf(fract * 2.0, &d);
*s++ = "01"[(int) d];
} while (fract);
*s = '\0';
// For debug
printf("%*.*g --> %.0f and .", DBL_DECIMAL_DIG + 8, DBL_DECIMAL_DIG, x,
deci);
return dest;
}
int main(void) {
// Perhaps 53 - -1021 + 1
char fraction_string[DBL_MANT_DIG - DBL_MIN_EXP + 1];
puts(fra_binary(fraction_string, -0.0));
puts(fra_binary(fraction_string, 1.0));
puts(fra_binary(fraction_string, asin(-1))); // machine pi
puts(fra_binary(fraction_string, -0.1));
puts(fra_binary(fraction_string, DBL_MAX));
puts(fra_binary(fraction_string, DBL_MIN));
puts(fra_binary(fraction_string, DBL_TRUE_MIN));
}
Output
-0 --> -0 and .0
1 --> 1 and .0
3.1415926535897931 --> 3 and .001001000011111101101010100010001000010110100011
-0.10000000000000001 --> -0 and .0001100110011001100110011001100110011001100110011001101
1.7976931348623157e+308 --> 179769313486231570814527423731704356798070600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 and .0
2.2250738585072014e-308 --> 0 and .00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001
4.9406564584124654e-324 --> 0 and .000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001
Also unclear why input is long double, yet processing is with double. Recommend using just one FP type.
Note that your algorithm finds out the binary representation of the fraction most significant bit first.
One way to convert the fractional part to a binary string, would be to supply the function with a string and a string length, and have the function fill it with up to that many binary digits:
/* This function returns the number of chars needed in dst
to describe the fractional part of value in binary,
not including the trailing NUL ('\0').
Returns zero in case of an error (non-finite value).
*/
size_t fractional_bits(char *dst, size_t len, double value)
{
double fraction, integral;
size_t i = 0;
if (!isfinite(value))
return 0;
if (value > 0.0)
fraction = modf(value, &integral);
else
if (value < 0.0)
fraction = modf(-value, &integral);
else {
/* Zero fraction. */
if (len > 1) {
dst[0] = '0';
dst[1] = '\0';
} else
if (len > 0)
dst[0] = '\0';
/* One binary digit was needed for exact representation. */
return 1;
}
while (fraction > 0.0) {
fraction = fraction * 2.0;
if (fraction >= 1.0) {
fraction = fraction - 1.0;
if (i < len)
dst[i] = '1';
} else
if (i < len)
dst[i] = '0';
i++;
}
if (i < len)
dst[i] = '\0';
else
if (len > 0)
dst[len - 1] = '\0';
return i;
}
The above function works very much like snprintf(), except it takes only the double whose fractional bits are to be stored as a string of binary digits (0 or 1). and returns 0 in case of an error (non-finite double value).
Another option is to use an unsigned integer type to hold the bits. For example, if your code is intended to work on architectures where double is an IEEE-754 Binary64 type or similar, the mantissa has up to 53 bits of precision, and an uint64_t would suffice.
Here is an example of that:
uint64_t fractional_bits(const double val, size_t bits)
{
double fraction, integral;
uint64_t result = 0;
if (bits < 1 || bits > 64) {
errno = EINVAL;
return 0;
}
if (!isfinite(val)) {
errno = EDOM;
return 0;
}
if (val > 0.0)
fraction = modf(val, &integral);
else
if (val < 0.0)
fraction = modf(-val, &integral);
else {
errno = 0;
return 0;
}
while (bits-->0) {
result = result << 1;
fraction = fraction * 2.0;
if (fraction >= 1.0) {
fraction = fraction - 1.0;
result = result + 1;
}
}
errno = 0;
return result;
}
The return value is the binary representation of the fractional part: [i]fractional_part[/i] ≈ [i]result[/i] / 2[sup][i]bits[/i][/sup], where [i]bits[/i] is between 1 and 64, inclusive.
In order for the caller to detect an error, the function clears errno to zero if no error occurred. If an error does occur, the function returns zero with errno set to EDOM if the value is not finite, or to EINVAL if bits is less than 1 or greater than 64.
You can combine the two approaches, if you implement an arbitrary-size unsigned integer type, or a bitmap type.
For the robustness reason, I want check if a float number is IEEE-754 +-Inf or IEEE-754 Nan. My code is in the following, I want know if it is correct:
#define PLUS_INFINITE (1.0f/0.0f)
#define MINUS_INFINITE (-1.0f/0.0f)
#define NAN (0.0f/0.0f)
float Local_Var;
/*F is a float numnber.*/
if((unsigned long)(F) == 0x7f800000ul)
{
Local_Var = PLUS_INFINITE;
}
elseif((unsigned long)(F) == 0xff800000ul)
{
Local_Var = MINUS_INFINITE;
}
/*fraction = anything except all 0 bits (since all 0 bits represents infinity).*/
elseif((((unsigned long)(F) & 0x007ffffful) != 0ul )
&&((unsigned long)(F) == 0x7f800000ul))
||
(((unsigned long)(F) & 0x807ffffful) != 0ul )
&&
((unsigned long)(F) == 0xff800000ul))
{
Local_Var = NAN;
}
else{}
C99 has macros for the classification of floating-point numbers:
fpclassify(x) returns one of:
FP_NAN: x is not a number;
FP_INFINITE: x is plus or minus infinite;
FP_ZERO: x is zero;
FP_SUBNORMAL: x is too small to be represented in normalized format or
FP_NORMAL: normal floating-point number, i.e. none of the above.
There are also shortcuts that check for one of these classes, which return non-zero if x is what :
isfinite(x)
isnormal(x)
isnan(x)
isinf(x)
The argument x can be any floating-point expression; the macros detect the type of the argument and work for float and double.
EDIT: Since you don't want to use (or cannot use) <math.h>, you could use other properties of nan and inf to classify your numers:
nan compares false to all numbers, including to itself;
inf is greater than FLT_MAX;
-inf is smaller than -FLT_MAX.
So:
#include <stdlib.h>
#include <stdio.h>
#include <float.h>
int main()
{
float f[] = {
0.0, 1.0, FLT_MAX, 0.0 / 0.0, 1.0/0.0, -1.0/0.0
};
int i;
for (i = 0; i < 6; i++) {
float x = f[i];
int is_nan = (x != x);
int is_inf = (x < -FLT_MAX || x > FLT_MAX);
printf("%20g%4d%4d\n", x, is_nan, is_inf);
}
return 0;
}
In this solution, you must adapt the limits if you want to use double.
Casting floats to longs like that is wrong. It should be either a union, or a type-punned pointer.
Here's a working example from dietlibc (with doubles):
https://github.com/ensc/dietlibc/blob/master/lib/__isinf.c
https://github.com/ensc/dietlibc/blob/master/lib/__isnan.c
Musl has a shorter fpclassify, and also proper constants for floats:
http://git.musl-libc.org/cgit/musl/tree/src/math/__fpclassifyf.c
Best to use the fpclassify() functions of #M Oehm answer
Alternatives:
float F;
if (F <= FLT_MAX) {
if (F >= -FLT_MAX) {
puts("Finite");
} else {
puts("-Infinity");
}
} else {
if (F > 0) {
puts("+Infinity");
} else {
puts("NaN");
}
}
If code wants to mess with the bits and assuming float are in binary32 format:
assert(sizeof (float) == sizeof (uint32_t));
union {
float f;
uint32_t u32;
} x;
x.f = F;
Masks depend on relative endian of float and uint32_t endian. They usually are the same.
// Is F one of the 3 special: +inf, -inf, NaN?
if (x.u32 & 0x7F800000 == 0x7F800000) {
if (x.u32 & 0x007FFFFF) {
puts("NaN");
} else if (x.u32 & 0x80000000) {
puts("-Inf");
} else {
puts("+Inf");
}
}
I am not sure if power by squaring takes care of negative exponent. I implemented the following code which works for only positive numbers.
#include <stdio.h>
int powe(int x, int exp)
{
if (x == 0)
return 1;
if (x == 1)
return x;
if (x&1)
return powe(x*x, exp/2);
else
return x*powe(x*x, (exp-1)/2);
}
Looking at https://en.wikipedia.org/wiki/Exponentiation_by_squaring doesn't help as the following code seems wrong.
Function exp-by-squaring(x, n )
if n < 0 then return exp-by-squaring(1 / x, - n );
else if n = 0 then return 1;
else if n = 1 then return x ;
else if n is even then return exp-by-squaring(x * x, n / 2);
else if n is odd then return x * exp-by-squaring(x * x, (n - 1) / 2).
Edit:
Thanks to amit this solution works for both negative and positive numbers:
float powe(float x, int exp)
{
if (exp < 0)
return powe(1/x, -exp);
if (exp == 0)
return 1;
if (exp == 1)
return x;
if (((int)exp)%2==0)
return powe(x*x, exp/2);
else
return x*powe(x*x, (exp-1)/2);
}
For fractional exponent we can do below (Spektre method):
Suppose you have x^0.5 then you easily calculate square root by this method: start from 0 to x/2 and keep checking x^2 is equal to the result or not in binary search method.
So, in case you have x^(1/3) you have to replace if mid*mid <= n to if mid*mid*mid <= n and you will get the cube root of x.Same thing applies for x^(1/4), x^(1/5) and so on. In the case of x^(2/5) we can do (x^(1/5))^2 and again reduce the problem of finding the 5th root of x.
However by this time you would have realized that this method works only in the case when you can convert the root to 1/x format. So are we stuck if we can't convert? No, we can still go ahead as we have the will.
Convert your floating point to fixed point and then calculate pow(a, b). Suppose the number is 0.6, converting this to (24, 8)floating point yields Floor(0.6*1<<8) = 153(10011001). As you know 153 represents the fractional part so in fixed point this (10011001) represent (2^-1, 0, 0, 2^-3, 2^-4, 0, 0, 2^7).So we can again calculate the pow(a, 0.6) by calculating the 2,3, 4 and 7 root of x in fixed point. After calculating we again need to get the result in floating point by dividing with 1<<8.
Code for above method can be found in the accepted answer.
There is also a log based method:
x^y = exp2(y*log2(x))
Integer examples are for 32 bit int arithmetics, DWORD is 32bit unsigned int
floating pow(x,y)=x^y
Is usually evaluated like this:
How Math.Pow (and so on) actually works
so the fractional exponent can be evaluated: pow(x,y) = exp2(y*log2(x)). This can be done also on fixed point:
fixed point bignum pow
integer pow(a,b)=a^b where a>=0 , b>=0
This is easy (you already have that) done by squaring:
DWORD powuu(DWORD a,DWORD b)
{
int i,bits=32;
DWORD d=1;
for (i=0;i<bits;i++)
{
d*=d;
if (DWORD(b&0x80000000)) d*=a;
b<<=1;
}
return d;
}
integer pow(a,b)=a^b where b>=0
Just add few ifs to handle the negative a
int powiu(int a,DWORD b)
{
int sig=0,c;
if ((a<0)&&(DWORD(b&1)) { sig=1; a=-a; } // negative output only if a<0 and b is odd
c=powuu(a,b); if (sig) c=-c;
return c;
}
integer pow(a,b)=a^b
So if b<0 then it means 1/powiu(a,-b) As you can see the result is not integer at all so either ignore this case or return floating value or add a multiplier variable (so you can evaluate PI equations on pure Integer arithmetics). This is float result:
float powfii(int a,int b)
{
if (b<0) return 1.0/float(powiu(a,-b));
else return powiu(a,b);
}
integer pow(a,b)=a^b where b is fractional
You can do something like this a^(1/bb) where bb is integer. In reality this is rooting so you can use binary search to evaluate:
a^(1/2) is square root(a)
a^(1/bb) is bb_root(a)
so do a binary search for c from MSB to LSB and evaluate if pow(c,bb)<=a then leave the bit as is else clear it. This is sqrt example:
int bits(DWORD p) // count how many bits is p
{
DWORD m=0x80000000; int b=32;
for (;m;m>>=1,b--)
if (p>=m) break;
return b;
}
DWORD sqrt(const DWORD &x)
{
DWORD m,a;
m=(bits(x)>>1);
if (m) m=1<<m; else m=1;
for (a=0;m;m>>=1) { a|=m; if (a*a>x) a^=m; }
return a;
}
so now just change the if (a*a>x) with if (pow(a,bb)>x) where bb=1/b ... so b is fractional exponent you looking for and bb is integer. Also m is the number of bits of the result so change m=(bits(x)>>1); to m=(bits(x)/bb);
[edit1] fixed point sqrt example
//---------------------------------------------------------------------------
const int _fx32_fract=16; // fractional bits count
const int _fx32_one =1<<_fx32_fract;
DWORD fx32_mul(const DWORD &x,const DWORD &y) // unsigned fixed point mul
{
DWORD a=x,b=y; // asm has access only to local variables
asm { // compute (a*b)>>_fx32_fract
mov eax,a // eax=a
mov ebx,b // ebx=b
mul eax,ebx // (edx,eax)=eax*ebx
mov ebx,_fx32_one
div ebx // eax=(edx,eax)>>_fx32_fract
mov a,eax;
}
return a;
}
DWORD fx32_sqrt(const DWORD &x) // unsigned fixed point sqrt
{
DWORD m,a;
if (!x) return 0;
m=bits(x); // integer bits
if (m>_fx32_fract) m-=_fx32_fract; else m=0;
m>>=1; // sqrt integer result is half of x integer bits
m=_fx32_one<<m; // MSB of result mask
for (a=0;m;m>>=1) // test bits from MSB to 0
{
a|=m; // bit set
if (fx32_mul(a,a)>x) // if result is too big
a^=m; // bit clear
}
return a;
}
//---------------------------------------------------------------------------
so this is unsigned fixed point. High 16 bits are integer and low 16 bits are fractional part.
this is fp -> fx conversion: DWORD(float(x)*float(_fx32_one))
this is fp <- fx conversion: float(DWORD(x))/float(_fx32_one))
fx32_mul(x,y) is x*y it uses assembler of 80386+ 32bit architecture (you can rewrite it to karatsuba or whatever else to be platform independent)
fx32_sqrt(x) is sqrt(x)
In fixed point you should be aware of the fractional bit shift for multiplication: (a<<16)*(b<<16)=(a*b<<32) you need to shift back by >>16 to get result (a*b<<16). Also the result can overflow 32 bit therefore I use 64 bit result in assembly.
[edit2] 32bit signed fixed point pow C++ example
When you put all the previous steps together you should have something like this:
//---------------------------------------------------------------------------
//--- 32bit signed fixed point format (2os complement)
//---------------------------------------------------------------------------
// |MSB LSB|
// |integer|.|fractional|
//---------------------------------------------------------------------------
const int _fx32_bits=32; // all bits count
const int _fx32_fract_bits=16; // fractional bits count
const int _fx32_integ_bits=_fx32_bits-_fx32_fract_bits; // integer bits count
//---------------------------------------------------------------------------
const int _fx32_one =1<<_fx32_fract_bits; // constant=1.0 (fixed point)
const float _fx32_onef =_fx32_one; // constant=1.0 (floating point)
const int _fx32_fract_mask=_fx32_one-1; // fractional bits mask
const int _fx32_integ_mask=0xFFFFFFFF-_fx32_fract_mask; // integer bits mask
const int _fx32_sMSB_mask =1<<(_fx32_bits-1); // max signed bit mask
const int _fx32_uMSB_mask =1<<(_fx32_bits-2); // max unsigned bit mask
//---------------------------------------------------------------------------
float fx32_get(int x) { return float(x)/_fx32_onef; }
int fx32_set(float x) { return int(float(x*_fx32_onef)); }
//---------------------------------------------------------------------------
int fx32_mul(const int &x,const int &y) // x*y
{
int a=x,b=y; // asm has access only to local variables
asm { // compute (a*b)>>_fx32_fract
mov eax,a
mov ebx,b
mul eax,ebx // (edx,eax)=a*b
mov ebx,_fx32_one
div ebx // eax=(a*b)>>_fx32_fract
mov a,eax;
}
return a;
}
//---------------------------------------------------------------------------
int fx32_div(const int &x,const int &y) // x/y
{
int a=x,b=y; // asm has access only to local variables
asm { // compute (a*b)>>_fx32_fract
mov eax,a
mov ebx,_fx32_one
mul eax,ebx // (edx,eax)=a<<_fx32_fract
mov ebx,b
div ebx // eax=(a<<_fx32_fract)/b
mov a,eax;
}
return a;
}
//---------------------------------------------------------------------------
int fx32_abs_sqrt(int x) // |x|^(0.5)
{
int m,a;
if (!x) return 0;
if (x<0) x=-x;
m=bits(x); // integer bits
for (a=x,m=0;a;a>>=1,m++); // count all bits
m-=_fx32_fract_bits; // compute result integer bits (half of x integer bits)
if (m<0) m=0; m>>=1;
m=_fx32_one<<m; // MSB of result mask
for (a=0;m;m>>=1) // test bits from MSB to 0
{
a|=m; // bit set
if (fx32_mul(a,a)>x) // if result is too big
a^=m; // bit clear
}
return a;
}
//---------------------------------------------------------------------------
int fx32_pow(int x,int y) // x^y
{
// handle special cases
if (!y) return _fx32_one; // x^0 = 1
if (!x) return 0; // 0^y = 0 if y!=0
if (y==-_fx32_one) return fx32_div(_fx32_one,x); // x^-1 = 1/x
if (y==+_fx32_one) return x; // x^+1 = x
int m,a,b,_y; int sx,sy;
// handle the signs
sx=0; if (x<0) { sx=1; x=-x; }
sy=0; if (y<0) { sy=1; y=-y; }
_y=y&_fx32_fract_mask; // _y fractional part of exponent
y=y&_fx32_integ_mask; // y integer part of exponent
a=_fx32_one; // ini result
// powering by squaring x^y
if (y)
{
for (m=_fx32_uMSB_mask;(m>_fx32_one)&&(m>y);m>>=1); // find mask of highest bit of exponent
for (;m>=_fx32_one;m>>=1)
{
a=fx32_mul(a,a);
if (int(y&m)) a=fx32_mul(a,x);
}
}
// powering by rooting x^_y
if (_y)
{
for (b=x,m=_fx32_one>>1;m;m>>=1) // use only fractional part
{
b=fx32_abs_sqrt(b);
if (int(_y&m)) a=fx32_mul(a,b);
}
}
// handle signs
if (sy) { if (a) a=fx32_div(_fx32_one,a); else a=0; /*Error*/ } // underflow
if (sx) { if (_y) a=0; /*Error*/ else if(int(y&_fx32_one)) a=-a; } // negative number ^ non integer exponent, here could add test if 1/_y is integer instead
return a;
}
//---------------------------------------------------------------------------
I have tested it like this:
float a,b,c0,c1,d;
int x,y;
for (a=0.0,x=fx32_set(a);a<=10.0;a+=0.1,x=fx32_set(a))
for (b=-2.5,y=fx32_set(b);b<=2.5;b+=0.1,y=fx32_set(b))
{
if (!x) continue; // math pow has problems with this
if (!y) continue; // math pow has problems with this
c0=pow(a,b);
c1=fx32_get(fx32_pow(x,y));
d=0.0;
if (fabs(c1)<1e-3) d=c1-c0; else d=(c0/c1)-1.0;
if (fabs(d)>0.1)
d=d; // here add breakpoint to check inconsistencies with math pow
}
a,b are floating point
x,y are closest fixed point representations of a,b
c0 is math pow result
c1 is fx32_pow result
d is difference
hope did not forget something trivial but it seems like it works properly. Do not forget that fixed point has very limited precision so the results will differ a bit ...
P.S. Take a look at this:
How to get a square root for 32 bit input in one clock cycle only?
fixed point log2,exp2
integer C++ pow(x,1/y) implementation
integer C++ pow(a,b),log2(a),logb(a) implementation
real domain pow based on complex domain math
I'm looking to for a reasonably efficient way of determining if a floating point value (double) can be exactly represented by an integer data type (long, 64 bit).
My initial thought was to check the exponent to see if it was 0 (or more precisely 127). But that won't work because 2.0 would be e=1 m=1...
So basically, I am stuck. I have a feeling that I can do this with bit masks, but I'm just not getting my head around how to do that at this point.
So how can I check to see if a double is exactly representable as a long?
Thanks
I think I have found a way to clamp a double into an integer in a standard-conforming fashion (this is not really what the question is about, but it helps a lot). First, we need to see why the obvious code is not correct.
// INCORRECT CODE
uint64_t double_to_uint64 (double x)
{
if (x < 0.0) {
return 0;
}
if (x > UINT64_MAX) {
return UINT64_MAX;
}
return x;
}
The problem here is that in the second comparison, UINT64_MAX is being implicitly converted to double. The C standard does not specify exactly how this conversion works, only that it is to be rounded up or down to a representable value. This means that the second comparison may be false, even if should mathematically be true (which can happen when UINT64_MAX is rounded up, and 'x' is mathematically between UINT64_MAX and (double)UINT64_MAX). As such, the conversion of double to uint64_t can result in undefined behavior in that edge case.
Surprisingly, the solution is very simple. Consider that while UINT64_MAX may not be exactly representable in a double, UINT64_MAX+1, being a power of two (and not too large), certainly is. So, if we first round the input to an integer, the comparison x > UINT64_MAX is equivalent to x >= UINT64_MAX+1, except for possible overflow in the addition. We can fix the overflow by using ldexp instead of adding one to UINT64_MAX. That being said, the following code should be correct.
/* Input: a double 'x', which must not be NaN.
* Output: If 'x' is lesser than zero, then zero;
* otherwise, if 'x' is greater than UINT64_MAX, then UINT64_MAX;
* otherwise, 'x', rounded down to an integer.
*/
uint64_t double_to_uint64 (double x)
{
assert(!isnan(x));
double y = floor(x);
if (y < 0.0) {
return 0;
}
if (y >= ldexp(1.0, 64)) {
return UINT64_MAX;
}
return y;
}
Now, to back to your question: is x is exactly representable in an uint64_t? Only if it was neither rounded nor clamped.
/* Input: a double 'x', which must not be NaN.
* Output: If 'x' is exactly representable in an uint64_t,
* then 1, otherwise 0.
*/
int double_representable_in_uint64 (double x)
{
assert(!isnan(x));
return (floor(x) == x && x >= 0.0 && x < ldexp(1.0, 64));
}
The same algorithm can be used for integers of different size, and also for signed integers with a minor modification. The code that follows does some very basic tests of the uint32_t and uint64_t versions (only false positives can possibly be caught), but is also suitable for manual examination of the edge cases.
#include <inttypes.h>
#include <math.h>
#include <limits.h>
#include <assert.h>
#include <stdio.h>
uint32_t double_to_uint32 (double x)
{
assert(!isnan(x));
double y = floor(x);
if (y < 0.0) {
return 0;
}
if (y >= ldexp(1.0, 32)) {
return UINT32_MAX;
}
return y;
}
uint64_t double_to_uint64 (double x)
{
assert(!isnan(x));
double y = floor(x);
if (y < 0.0) {
return 0;
}
if (y >= ldexp(1.0, 64)) {
return UINT64_MAX;
}
return y;
}
int double_representable_in_uint32 (double x)
{
assert(!isnan(x));
return (floor(x) == x && x >= 0.0 && x < ldexp(1.0, 32));
}
int double_representable_in_uint64 (double x)
{
assert(!isnan(x));
return (floor(x) == x && x >= 0.0 && x < ldexp(1.0, 64));
}
int main ()
{
{
printf("Testing 32-bit\n");
for (double x = 4294967295.999990; x < 4294967296.000017; x = nextafter(x, INFINITY)) {
uint32_t y = double_to_uint32(x);
int representable = double_representable_in_uint32(x);
printf("%f -> %" PRIu32 " representable=%d\n", x, y, representable);
assert(!representable || (double)(uint32_t)x == x);
}
}
{
printf("Testing 64-bit\n");
double x = ldexp(1.0, 64) - 40000.0;
for (double x = 18446744073709510656.0; x < 18446744073709629440.0; x = nextafter(x, INFINITY)) {
uint64_t y = double_to_uint64(x);
int representable = double_representable_in_uint64(x);
printf("%f -> %" PRIu64 " representable=%d\n", x, y, representable);
assert(!representable || (double)(uint64_t)x == x);
}
}
}
Here's one method that could work in most cases. I'm not sure if/how it will break if you give it NaN, INF, very large (overflow) numbers...
(Though I think they will all return false - not exactly representable.)
You could:
Cast it to an integer.
Cast it back to a floating-point.
Compare with original value.
Something like this:
double val = ... ; // Value
if ((double)(long long)val == val){
// Exactly representable
}
floor() and ceil() are also fair game (though they may fail if the value overflows an integer):
floor(val) == val
ceil(val) == val
And here's a messy bit-mask solution:
This uses union type-punning and assumes IEEE double-precision. Union type-punning is only valid in C99 TR2 and later.
int representable(double x){
// Handle corner cases:
if (x == 0)
return 1;
// -2^63 is representable as a signed 64-bit integer, but +2^63 is not.
if (x == -9223372036854775808.)
return 1;
// Warning: Union type-punning is only valid in C99 TR2 or later.
union{
double f;
uint64_t i;
} val;
val.f = x;
uint64_t exp = val.i & 0x7ff0000000000000ull;
uint64_t man = val.i & 0x000fffffffffffffull;
man |= 0x0010000000000000ull; // Implicit leading 1-bit.
int shift = (exp >> 52) - 1075;
// Out of range
if (shift < -52 || shift > 10)
return 0;
// Test mantissa
if (shift < 0){
shift = -shift;
return ((man >> shift) << shift) == man;
}else{
return ((man << shift) >> shift) == man;
}
}
You can use the modf function to split a float into the integer and fraction parts. modf is in the standard C library.
#include <math.h>
#include <limits.h>
double val = ...
double i;
long l;
/* check if fractional part is 0 */
if (modf(val, &i) == 0.0) {
/* val is an integer. check if it can be stored in a long */
if (val >= LONG_MIN && val <= LONG_MAX) {
/* can be exactly represented by a long */
l = val;
}
}
How to check if float can be exactly represented as an integer ?
I'm looking to for a reasonably efficient way of determining if a floating point value double can be exactly represented by an integer data type long, 64 bit.
Range (LONG_MIN, LONG_MAX) and fraction (frexp()) tests needed. Also need to watch out for not-a-numbers.
The usual idea is to test like (double)(long)x == x, but to avoid its direct usage. (long)x, when x is out of range, is undefined behavior (UB).
The valid range of conversion for (long)x is LONG_MIN - 1 < x < LONG_MAX + 1 as code discards any fractional part of x during the conversion. So code needs to test, using FP math, if x is in range.
#include <limits.h>
#include <stdbool.h>
#define DBL_LONG_MAXP1 (2.0*(LONG_MAX/2+1))
#define DBL_LONG_MINM1 (2.0*(LONG_MIN/2-1))
bool double_to_long_exact_possible(double x) {
if (x < DBL_LONG_MAXP1) {
double whole_number_part;
if (frexp(x, &whole_number_part) != 0.0) {
return false; // Fractional part exist.
}
#if -LONG_MAX == LONG_MIN
// rare non-2's complement machine
return x > DBL_LONG_MINM1;
#else
return x - LONG_MIN > -1.0;
#endif
}
return false; // Too large or NaN
}
Any IEEE floating-point double or float value with a magnitude at or above 2^52 or 2^23 will be whole number. Adding 2^52 or 2^23 to a positive number whose magnitude is less than that will cause it to be rounded to a whole number. Subtracting the value that was added will yield a whole number which will equal the original iff the original was a whole number. Note that this algorithm will fail with some numbers larger than 2^52, but it isn't needed for numbers that big.
Could you use the modulus operator to check if the double is divisible by one... or am I completely misunderstanding the question?
double val = ... ; // Value
if(val % 1 == 0) {
// Val is evenly divisible by 1 and is therefore a whole number
}