FLT_EPSILON for an nth root finder with SSE/AVX - C

I'm trying to convert a C function that finds the nth root of a double value, taken from the following link
http://rosettacode.org/wiki/Nth_root#C
so that it finds the nth root of 8 floats at once using AVX.
Part of that code uses DBL_EPSILON * 10. However, when I convert it to use float with AVX I have to use FLT_EPSILON*1000 or the code hangs and does not converge. When I print out FLT_EPSILON I see it is of order 1E-7. But this link, http://www.cplusplus.com/reference/cfloat/
, says it should be 1E-5. When I print out DBL_EPSILON it's 1E-16, but the link says it should only be 1E-9. What's going on?
Here is the code so far (not fully optimized).
#include <stdio.h>
#include <float.h>
#include <immintrin.h> // AVX
inline double abs_(double x) {
return x >= 0 ? x : -x;
}
double pow_(double x, int e)
{
double ret = 1;
for (ret = 1; e; x *= x, e >>= 1) {
if ((e & 1)) ret *= x;
}
return ret;
}
double root(double a, int n)
{
double d, x = 1;
x = a/n;
if (!a) return 0;
//if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */
int cnt = 0;
do {
cnt++;
d = (a / pow_(x, n - 1) - x) / n;
x+= d;
} while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
printf("%d\n", cnt);
return x;
}
__m256 pow_avx(__m256 x, int e) {
__m256 ret = _mm256_set1_ps(1.0f);
for (; e; x = _mm256_mul_ps(x,x), e >>= 1) {
if ((e & 1)) ret = _mm256_mul_ps(x,ret);
}
return ret;
}
inline __m256 abs_avx (__m256 x) {
return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), x), x);
//return x >= 0 ? x : -x;
}
int get_mask(const __m256 d, const __m256 x) {
__m256 ad = abs_avx(d);
__m256 ax = abs_avx(x);
__m256i mask = _mm256_castps_si256(_mm256_cmp_ps(ad, ax, _CMP_GT_OQ));
return _mm_movemask_epi8(_mm256_castsi256_si128(mask)) + _mm_movemask_epi8(_mm256_extractf128_si256(mask,1));
}
__m256 root_avx(__m256 a, int n) {
printf("%e\n", FLT_EPSILON);
printf("%e\n", DBL_EPSILON);
printf("%e\n", FLT_EPSILON*1000.0f);
__m256 d;
__m256 x = _mm256_set1_ps(1.0f);
//if (!a) return 0;
//if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */
__m256 in = _mm256_set1_ps(1.0f/n);
__m256 xtmp;
do {
d = _mm256_rcp_ps(pow_avx(x, n - 1));
d = _mm256_sub_ps(_mm256_mul_ps(a,d),x);
d = _mm256_mul_ps(d,in);
//d = (a / pow_avx(x, n - 1) - x) / n;
x = _mm256_add_ps(x, d); //x+= d;
xtmp =_mm256_mul_ps(x, _mm256_set1_ps(FLT_EPSILON*100.0f));
//} while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
} while (get_mask(d, xtmp));
return x;
}
int main()
{
__m256 d = _mm256_set1_ps(16.0f);
__m256 out = root_avx(d, 4);
float result[8];
int i;
_mm256_storeu_ps(result, out);
for(i=0; i<8; i++) {
printf("%f\n", result[i]);
} printf("\n");
//double x = 16;
//printf("root(%g, 15) = %g\n", x, root(x, 4));
//double x = pow_(-3.14159, 15);
//printf("root(%g, 15) = %g\n", x, root(x, 15));
return 0;
}

_mm256_rcp_ps, which maps to the rcpps instruction, performs only an approximate reciprocal. The Intel 64 and IA-32 Architectures Software Developer's Manual says its relative error may be up to 1.5·2^-12. That error is far larger than 100*FLT_EPSILON, so the root finder cannot converge to that tolerance.
You could use an exact division, such as:
d = pow_avx(x, n-1);
d = _mm256_sub_ps(_mm256_div_ps(a, d), x);
or add some refinement steps for the reciprocal estimate.
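For example, a single Newton-Raphson step on the rcpps estimate roughly doubles its number of correct bits, bringing it close to full single precision. A minimal sketch (the helper name is mine, not from the original code):
#include <immintrin.h>
// Refine the approximate reciprocal r0 = rcpps(d) with one Newton-Raphson step:
// r1 = r0 * (2 - d*r0). Accuracy improves from roughly 12 bits to roughly 23 bits.
static inline __m256 rcp_refined(__m256 d) {
    __m256 r = _mm256_rcp_ps(d);
    r = _mm256_mul_ps(r, _mm256_sub_ps(_mm256_set1_ps(2.0f), _mm256_mul_ps(d, r)));
    return r;
}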
Incidentally, if your compiler supports using regular C operators with SIMD objects, consider using the regular C operators instead:
d = pow_avx(x, n-1);
d = a/d - x;

1e-5 is simply the maximum value the C standard allows an implementation to use for FLT_EPSILON. In practice, you'll be using IEEE-754 single-precision, which has an epsilon of 2^-23, which is approximately 1e-7.
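A quick way to confirm this on your own machine (this assumes IEEE-754 binary32/binary64 formats, which is what essentially all current hardware uses):
#include <stdio.h>
#include <float.h>
#include <math.h>
int main(void)
{
    printf("FLT_EPSILON = %e, 2^-23 = %e\n", FLT_EPSILON, ldexp(1.0, -23));
    printf("DBL_EPSILON = %e, 2^-52 = %e\n", DBL_EPSILON, ldexp(1.0, -52));
    return 0;
}
Both pairs should print the same number.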

Related

Modular exponentiation function generates incorrect result for big input in C

I tried two functions for modular exponentiation with a big base, and both return wrong results.
One of the functions is:
uint64_t modular_exponentiation(uint64_t x, uint64_t y, uint64_t p)
{
uint64_t res = 1; // Initialize result
x = x % p; // Update x if it is more than or
// equal to p
while (y > 0)
{
// If y is odd, multiply x with result
if (y & 1)
res = (res*x) % p;
// y must be even now
y = y>>1; // y = y/2
x = (x*x) % p;
}
return res;
}
For input x = 1103362698, y = 137911680, p = 1217409241131113809,
it returns the value (x^y mod p): 749298230523009574 (incorrect).
The correct value is: 152166603192600961.
The other function I tried gave the same result. What is wrong with these functions?
The other one is :
long int exponentMod(long int A, long int B, long int C)
{
// Base cases
if (A == 0)
return 0;
if (B == 0)
return 1;
// If B is even
long int y;
if (B % 2 == 0) {
y = exponentMod(A, B / 2, C);
y = (y * y) % C;
}
// If B is odd
else {
y = A % C;
y = (y * exponentMod(A, B - 1, C) % C) % C;
}
return (long int)((y + C) % C);
}
With p = 1217409241131113809, this value as well as any intermediate values for res and x will be larger than 32 bits. This means that multiplying two of these numbers could result in a value larger than 64 bits which overflows the datatype you're using.
If you restrict the parameters to 32 bit datatypes and use 64 bit datatypes for intermediate values then the function will work. Otherwise you'll need to use a big number library to get correct output.
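For example, if your compiler provides a 128-bit integer type (GCC and Clang do, as an extension; that is an assumption of this sketch, not something stated in the answer), the intermediate products can be reduced mod p before they overflow:
#include <stdint.h>
// Sketch: 64-bit modular multiplication using a 128-bit intermediate product
uint64_t mulmod_u64(uint64_t a, uint64_t b, uint64_t p)
{
    return (uint64_t)(((unsigned __int128)a * b) % p);
}
uint64_t modular_exponentiation(uint64_t x, uint64_t y, uint64_t p)
{
    uint64_t res = 1;
    x %= p;
    while (y > 0) {
        if (y & 1)
            res = mulmod_u64(res, x, p);  // multiply without overflowing 64 bits
        y >>= 1;
        x = mulmod_u64(x, x, p);
    }
    return res;
}
Since no intermediate product can exceed 128 bits here, this should produce the expected value for the question's inputs.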

Using Heron's formula to calculate square root in C

I have implemented this function:
double heron(double a)
{
double x = (a + 1) / 2;
while (x * x - a > 0.000001) {
x = 0.5 * (x + a / x);
}
return x;
}
This function is working as intended; however, I wish to improve it. It's supposed to use an endless while loop to check if something similar to x * x is a. a is the number the user should input.
So far I have no working function using that method... This is my miserably failed attempt:
double heron(double a)
{
double x = (a + 1) / 2;
while (x * x != a) {
x = 0.5 * (x + a / x);
}
return x;
}
This is my first post so if there is anything unclear or something I should add please let me know.
Failed attempt number 2:
double heron(double a)
{
double x = (a + 1) / 2;
while (1) {
if (x * x == a){
break;
} else {
x = 0.5 * (x + a / x);
}
}
return x;
}
Heron's formula
It's supposed to use an endless while loop to check if something similar to x * x is a
Problems:
Slow convergence
When the initial x is quite wrong, the improved |x - sqrt(a)| error may still be only half as big. Given the wide range of double, this may take hundreds of iterations to get close.
Ref: Heron's formula.
For a novel 1st estimation method: Fast inverse square root.
Overflow
x * x in x * x != a is prone to overflow. x != a/x affords a like test without that range problem. Should overflow occur, x may get "infected" with "infinity" or "not-a-number" and fail to achieve convergence.
Oscillations
Once x is "close" to sqrt(a) (within a factor of 2), the error convergence is quadratic - the number of "right" bits doubles each iteration. This continues until x == a/x or, due to peculiarities of double math, x endlessly oscillates between two values, as does the quotient.
Getting into this oscillation is what causes OP's loop to not terminate.
Putting this together, with a test harness, demonstrates adequate convergence.
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
double rand_finite_double(void) {
union {
double d;
unsigned char uc[sizeof(double)];
} u;
do {
for (unsigned i = 0; i < sizeof u.uc; i++) {
u.uc[i] = (unsigned char) rand();
}
} while (!isfinite(u.d));
return u.d;
}
double sqrt_heron(double a) {
double x = (a + 1) / 2;
double x_previous = -1.0;
for (int i = 0; i < 1000; i++) {
double quotient = a / x;
if (x == quotient || x == x_previous) {
if (x == quotient) {
return x;
}
return ((x + x_previous) / 2);
}
x_previous = x;
x = 0.5 * (x + quotient);
}
// As this code is (should) never be reached, the `for(i)`
// loop "safety" net code is not needed.
assert(0);
}
double test_heron(double xx) {
double x0 = sqrt(xx);
double x1 = sqrt_heron(xx);
if (x0 != x1) {
double delta = fabs(x1 - x0);
double err = delta / x0;
static double emax = 0.0;
if (err > emax) {
emax = err;
printf(" %-24.17e %-24.17e %-24.17e %-24.17e\n", xx, x0, x1, err);
fflush(stdout);
}
}
return 0;
}
int main(void) {
for (int i = 0; i < 100000000; i++) {
test_heron(fabs(rand_finite_double()));
}
return 0;
}
Improvements
sqrt_heron(0.0) works.
Change code for a better initial guess.
double sqrt_heron(double a) {
if (a > 0.0 && a <= DBL_MAX) {
// Better initial guess - halve the exponent of `a`
// Could possibly use bit inspection if `double` format known.
int expo;
double significand = frexp(a, &expo);
double x = ldexp(significand, expo / 2);
double x_previous = -1.0;
for (int i = 0; i < 8; i++) { // Notice limit moved from 1000 down to < 10
double quotient = a / x;
if (x == quotient) {
return x;
}
if (x == x_previous) {
return (0.5 * (x + x_previous));
}
x_previous = x;
x = 0.5 * (x + quotient);
}
assert(0);
}
if (a >= 0.0) return a;
assert(0); // invalid argument.
}

Algorithm to find nth root of a number

I am looking for an efficient algorithm to find the nth root of a number. The answer must be an integer. I have found that Newton's method and the bisection method are popular. Are there any efficient and simple methods for integer output?
#include <math.h>
inline int root(int input, int n)
{
return round(pow(input, 1./n));
}
This works for pretty much the whole integer range (as IEEE754 8-byte doubles can represent the whole 32-bit int range exactly, and those are the representations and sizes used on pretty much every system). And I doubt any integer-based algorithm is faster on non-ancient hardware, including ARM. Embedded controllers (the microwave/washing-machine kind) might not have floating-point hardware, though. But that part of the question was underspecified.
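If you want to convince yourself of that claim, a brute-force spot check over perfect powers is easy to write. This is my own test sketch, not part of the answer:
#include <math.h>
#include <stdio.h>
/* test sketch: verify that round(pow(k^n, 1./n)) recovers k for perfect powers */
static int root_(int input, int n) { return (int)round(pow(input, 1. / n)); }
int main(void)
{
    for (int n = 2; n <= 10; n++) {
        for (long long k = 1; ; k++) {
            long long p = 1;
            for (int i = 0; i < n; i++) p *= k;   /* k^n */
            if (p > 2147483647LL) break;          /* stay within int range */
            if (root_((int)p, n) != (int)k)
                printf("mismatch at %lld^(1/%d)\n", p, n);
        }
    }
    puts("done");
    return 0;
}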
I know this thread is probably dead, but I don't see any answers I like and that bugs me...
int root(int a, int n) {
int v = 1, bit, tp, t;
if (n == 0) return 0; //error: zeroth root is indeterminate!
if (n == 1) return a;
tp = iPow(v,n);
while (tp < a) { // first power of two such that v**n >= a
v <<= 1;
tp = iPow(v,n);
}
if (tp == a) return v; // answer is a power of two
v >>= 1;
bit = v >> 1;
tp = iPow(v, n); // v is highest power of two such that v**n < a
while (a > tp) {
v += bit; // add bit to value
t = iPow(v, n);
if (t > a) v -= bit; // did we add too much?
else tp = t;
if ( (bit >>= 1) == 0) break;
}
return v; // closest integer such that v**n <= a
}
// used by root function...
int iPow(int a, int e) {
int r = 1;
if (e == 0) return r;
while (e != 0) {
if ((e & 1) == 1) r *= a;
e >>= 1;
a *= a;
}
return r;
}
This method will also work with arbitrary precision fixed point math in case you want to compute something like sqrt(2) to 100 decimal places...
I question your use of "algorithm" when speaking of C programs. Programs and algorithms are not the same (an algorithm is mathematical; a C program is expected to be implementing some algorithm).
But on current processors (like in recent x86-64 laptops or desktops) the FPU is doing fairly well. I guess (but did not benchmark) that a fast way of computing the n-th root could be,
inline unsigned root(unsigned x, unsigned n) {
switch (n) {
case 0: return 1;
case 1: return x;
case 2: return (unsigned)sqrt((double)x);
case 3: return (unsigned)cbrt((double)x);
default: return (unsigned) pow (x, 1.0/n);
}
}
(I made a switch because many processors have hardware to compute sqrt and some have hardware to compute cbrt ..., so you should prefer these when relevant...).
I am not sure that the n-th root of a negative number makes sense in general, so my root function takes some unsigned x and returns some unsigned number.
Here is an efficient general implementation in C, using a simplified version of the "shifting nth root algorithm" to compute the floor of the nth root of x:
uint64_t iroot(const uint64_t x, const unsigned n)
{
if ((x == 0) || (n == 0)) return 0;
if (n == 1) return x;
uint64_t r = 1;
for (int s = ((ilog2(x) / n) * n) - n; s >= 0; s -= n)
{
r <<= 1;
r |= (ipow(r|1, n) <= (x >> s));
}
return r;
}
It needs this function to compute the nth power of x (using the method of exponentiation by squaring):
uint64_t ipow(uint64_t x, unsigned n)
{
if (x <= 1) return x;
uint64_t y = 1;
for (; n != 0; n >>= 1, x *= x)
if (n & 1)
y *= x;
return y;
}
and this function to compute the floor of base-2 logarithm of x:
int ilog2(uint64_t x)
{
#if __has_builtin(__builtin_clzll)
return 63 - ((x != 0) * (int)__builtin_clzll(x)) - ((x == 0) * 64);
#else
int y = -(x == 0);
for (unsigned k = 64 / 2; k != 0; k /= 2)
if ((x >> k) != 0)
{ x >>= k; y += k; }
return y;
#endif
}
Note: This assumes that your compiler understands GCC's __has_builtin test and that your compiler's uint64_t type is the same size as an unsigned long long.
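A minimal usage sketch, assuming the iroot/ipow/ilog2 definitions above are in scope (the example values and expected results are mine):
#include <stdio.h>
#include <stdint.h>
int main(void)
{
    printf("%llu\n", (unsigned long long)iroot(15625, 3));        /* expect 25 */
    printf("%llu\n", (unsigned long long)iroot(4096, 6));         /* expect 4 */
    printf("%llu\n", (unsigned long long)iroot(UINT64_MAX, 17));  /* expect 13 */
    return 0;
}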
You can try this C function to get the nth_root of an unsigned integer:
unsigned initial_guess_nth_root(unsigned n, unsigned nth){
unsigned res = 1;
for(; n >>= 1; ++res);
return nth ? 1 << (res + nth - 1) / nth : 0 ;
}
// return a number that, when multiplied by itself nth times, makes N.
unsigned nth_root(const unsigned n, const unsigned nth) {
unsigned a = initial_guess_nth_root(n , nth), b, c, r = nth ? a + (n > 0) : n == 1 ;
for (; a < r; b = a + (nth - 1) * r, a = b / nth)
for (r = a, a = n, c = nth - 1; c && (a /= r); --c);
return r;
}
Example of output :
24 == (int) pow(15625, 1.0/3)
25 == nth_root(15625, 3)
0 == nth_root(0, 0)
1 == nth_root(1, 0)
4 == nth_root(4096, 6)
13 == nth_root(18446744073709551614, 17) // 64-bit 20 digits
11 == nth_root(340282366920938463463374607431768211454, 37) // 128-bit 39 digits
Here is the github source.

Calculating exponents in C without pow()

int main ()
{
int n = 0;
int base = 0;
while(n < 10)
{
int x = 2;
int answer = power(x, n);
float neganswer = negpower(x, n);
printf("%d %d %f\n", base, answer, neganswer);
base++;
n++;
}
return EXIT_SUCCESS;
}
int power(int base, int power)
{
int result, i;
result = 1;
for (i=0; i < power; i++)
{
result *= base;
}
return result;
}
int negpower(int base, int power)
{
float result, i;
result = 1.0;
for (i=0; i < power; i++)
{
result = result / base;
}
return result;
}
So I'm trying to call upon this function that I've made, and I think it's calculating correctly; however, it is only outputting 1.0000000 followed directly by 0.0000000. I think I've got problems with carrying the float value, can anyone chime in?
Thanks
This is because you are returning a float from negpower(), which has a return type of int, and assigning it to a float, neganswer.
Change
int negpower(int base, int power)
to
float negpower(int base, int power)
Side note:
Always add the required header files.
A prototype should be declared if a function definition appears after main().
The answer is much simpler. Your negpower function returns an int, when you actually return a float from it. Change the prototype and it should work alright.
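For concreteness, a minimal corrected sketch of the function (the loop counter is also changed to an int, which is an incidental cleanup):
float negpower(int base, int power)
{
    float result = 1.0f;
    int i;
    for (i = 0; i < power; i++)
    {
        result = result / base;   /* base is converted to float for the division */
    }
    return result;
}
Remember to declare the matching prototype before main(), as the side note above says.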
This is an optimized library, if you are interested:
#ifdef DOCUMENTATION
title pow x raised to power y
index x raised to power y
usage
.s
double x, y, f, pow();
.br
f = pow(x, y);
.s
description
.s
Returns value of x raised to power y
.s
diagnostics
.s
There are three error possible error messages from this function.
.s
If the x argument is negative the message 'pow arg negative',
followed by the value of x, is written to stderr. The value
of pow for |x| is returned.
.s
If x = 0.0 and y <= 0.0 or if result overflows the message 'pow
overflow', followed by the value of y, is written to stderr.
The value of HUGE is returned.
.s
If the result underflows and if warnings are enabled (normally not),
the message 'pow underflow', followed by the value of y, is written
to stderr. The value of 0 is returned.
.s
The suggestion of Cody and Waite, that the domain be reduced to
simplify the overflow test, has been adopted, consequently overflow
is reported if the result would exceed HUGE * 2**(-1/16).
2**(-1/16) is approximately 0.9576.
.s
internal
.s
Algorithm from Cody and Waite pp. 84-124. This algorithm required
two auxiliary programs POWGA1 and POWGA2 to calculate, respectively,
the arrays a1[] and a2[] used to represent the powers of 2**(-1/16)
to more than machine precision.
The source code for these programs are in the files POWGA1.AUX and
POWGA2.AUX. The octal table on page 98 of Cody and Waite is in the
file POWOCT.DAT which is required on stdin by POWGA2.
.s
author
.s
Hamish Ross.
.s
date
.s
27-Jan-85
#endif
#include <math.h>
#define MAXEXP 2031 /* (MAX_EXP * 16) - 1 */
#define MINEXP -2047 /* (MIN_EXP * 16) - 1 */
static double a1[] = {
1.0,
0.95760328069857365,
0.91700404320467123,
0.87812608018664974,
0.84089641525371454,
0.80524516597462716,
0.77110541270397041,
0.73841307296974966,
0.70710678118654752,
0.67712777346844637,
0.64841977732550483,
0.62092890603674203,
0.59460355750136054,
0.56939431737834583,
0.54525386633262883,
0.52213689121370692,
0.50000000000000000
};
static double a2[] = {
0.24114209503420288E-17,
0.92291566937243079E-18,
-0.15241915231122319E-17,
-0.35421849765286817E-17,
-0.31286215245415074E-17,
-0.44654376565694490E-17,
0.29306999570789681E-17,
0.11260851040933474E-17
};
static double p1 = 0.833333333333332114e-1;
static double p2 = 0.125000000005037992e-1;
static double p3 = 0.223214212859242590e-2;
static double p4 = 0.434457756721631196e-3;
static double q1 = 0.693147180559945296e0;
static double q2 = 0.240226506959095371e0;
static double q3 = 0.555041086640855953e-1;
static double q4 = 0.961812905951724170e-2;
static double q5 = 0.133335413135857847e-2;
static double q6 = 0.154002904409897646e-3;
static double q7 = 0.149288526805956082e-4;
static double k = 0.442695040888963407;
double pow(x, y)
double x, y;
{
double frexp(), g, ldexp(), r, u1, u2, v, w, w1, w2, y1, y2, z;
int iw1, m, p;
if (y == 0.0)
return(1.0);
if (x <= 0.0) {
if (x == 0.0) {
if (y > 0.0)
return(x);
cmemsg(FP_POWO, &y);
return(HUGE);
}
else {
cmemsg(FP_POWN, &x);
x = -x;
}
}
g = frexp(x, &m);
p = 0;
if (g <= a1[8])
p = 8;
if (g <= a1[p + 4])
p += 4;
if (g <= a1[p + 2])
p += 2;
p++;
z = ((g - a1[p]) - a2[p / 2]) / (g + a1[p]);
z += z;
v = z * z;
r = (((p4 * v + p3) * v + p2) * v + p1) * v * z;
r += k * r;
u2 = (r + z * k) + z;
u1 = 0.0625 * (double)(16 * m - p);
y1 = 0.0625 * (double)((int)(16.0 * y));
y2 = y - y1;
w = u2 * y + u1 * y2;
w1 = 0.0625 * (double)((int)(16.0 * w));
w2 = w - w1;
w = w1 + u1 * y1;
w1 = 0.0625 * (double)((int)(16.0 * w));
w2 += (w - w1);
w = 0.0625 * (double)((int)(16.0 * w2));
iw1 = 16.0 * (w1 + w);
w2 -= w;
while (w2 > 0.0) {
iw1++;
w2 -= 0.0625;
}
if (iw1 > MAXEXP) {
cmemsg(FP_POWO, &y);
return(HUGE);
}
if (iw1 < MINEXP) {
cmemsg(FP_POWU, &y);
return(0.0);
}
m = iw1 / 16;
if (iw1 >= 0)
m++;
p = 16 * m - iw1;
z = ((((((q7*w2 + q6)*w2 + q5)*w2 + q4)*w2 + q3)*w2 + q2)*w2 + q1)*w2;
z = a1[p] + a1[p] * z;
return(ldexp(z, m));
}
You have all sorts of ints in there. When you do that, the fractional part gets truncated. You should make your power functions return floats, and use a float base.

Speeding up Newton's Method for finding nth root

Let me preface this question with a statement: this code works as intended, but it is slow, very very slow, for what it is. Is there a way to make Newton's method converge faster, or a way to set a __m256 variable to a single float without messing with the float arrays and such?
__m256 nthRoot(__m256 a, int root){
#define aligned __declspec(align(16)) float
// uses the calculation
// n_x+1 = (1/root)*((root-1) * x + a / pow(x,root-1))
//initial numbers
aligned r[8];
aligned iN[8];
aligned mN[8];
//Function I made to fill arrays
/*
template<class T>
void FillArray(T a[],T b)
{
int n = sizeof(a)/sizeof(T);
for(int i = 0; i < n; a[i++] = b);
}*/
//fills the arrays
FillArray(iN,(1.0f/(float)root));
FillArray(mN,(float)(root-1));
FillArray(r,(float)root);
//loads the arrays into the sse componenets
__m256 R = _mm256_load_ps(r);
__m256 Ni = _mm256_load_ps(iN);
__m256 Nm = _mm256_load_ps(mN);
//sets initial guess to 1 / (a * root)
__m256 x = _mm256_rcp_ps(_mm256_mul_ps(R,a));
for(int i = 0; i < 20 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++){
tmpx = _mm256_mul_ps(x,tmpx);
}
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//fmac with Ni*X+tar
tar = _mm256_fmadd_ps(Nm,x,tar);
//Multipled by Ni
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Edit #1
__m256 SSEnthRoot(__m256 a, int root){
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_set1_ps((1.0f)/((float)root));
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,_mm256_rcp_ps(R));
for(int i = 0; i < 10 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++){
tmpx = _mm256_mul_ps(x,tmpx);
}
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//mult Nm by x then add tar because my compiler stopped thinking that fmadd is a valid instruction
tar = _mm256_add_ps(_mm256_mul_ps(Nm,x),tar);
//Multiplied by the inverse of power
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Any tips or pointers (not the memory kind) to make Newton's method converge faster would be appreciated.
Edit #2: removed one _mm256_set1_ps() function call by using _mm256_rcp_ps() instead, because I had already loaded into R the value whose reciprocal I needed.
__m256 SSEnthRoot(__m256 a, int root){
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_rcp_ps(R);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i < 20 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++)
tmpx = _mm256_mul_ps(x,tmpx);
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//fmac with Ni*X+tar
//my compiler believes in fmac again
tar = _mm256_fmadd_ps(Nm,x,tar);
//Multiplied by the inverse of power
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Edit #3
__m256 SSEnthRoot(__m256 a, int root){
__m256 Ni = _mm256_set1_ps(1.0f/(float)root);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i < 20 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++)
tmpx = _mm256_mul_ps(x,tmpx);
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
tar = _mm256_fmadd_ps(Nm,x,tar);
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Your pow function is inefficient.
for(int k = 0 ; k < root -2 ; k++)
tmpx = _mm256_mul_ps(x,tmpx);
In your example you're taking the 29th root. You need pow(x, 29-1) = x^28. Currently you use 27 multiplications for that but it's possible to do that in only six multiplications.
x^28 = (x^4)*(x^8)*(x^16)
x^4 = y -> 2 multiplications
x^8 = y*y = z -> 1 multiplication
x^16 = z^2 = w -> 1 multiplication
y*z*w -> 2 multiplications
6 multiplications in total
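Spelled out with intrinsics, that chain looks something like this (an illustrative sketch; the general pow_avx_fast function below is what the rest of the answer actually uses):
#include <immintrin.h>
// x^28 = x^4 * x^8 * x^16 in 6 multiplications
static inline __m256 pow28_avx(__m256 x) {
    __m256 x2 = _mm256_mul_ps(x, x);     // x^2  (1 multiplication)
    __m256 y  = _mm256_mul_ps(x2, x2);   // x^4  (2)
    __m256 z  = _mm256_mul_ps(y, y);     // x^8  (3)
    __m256 w  = _mm256_mul_ps(z, z);     // x^16 (4)
    return _mm256_mul_ps(_mm256_mul_ps(y, z), w);  // x^4 * x^8 * x^16 = x^28  (6)
}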
Here is an improved version of your code which is about twice as fast on my system. It uses a new pow_avx_fast function I created, which computes x^n for 8 floats at once using AVX. It does e.g. x^28 in 6 multiplications instead of 27. Further down in my answer you will find a version which finds the result within some tolerance xacc. This could be much faster if the convergence happens quickly.
inline __m256 pow_avx_fast(__m256 x, const int n) {
//n must be greater than zero
if(n%2 == 0) {
return pow_avx_fast(_mm256_mul_ps(x, x), n/2);
}
else {
if(n>1) return _mm256_mul_ps(x,pow_avx_fast(_mm256_mul_ps(x, x), (n-1)/2));
return x;
}
}
inline __m256 SSEnthRoot_fast(__m256 a, int root) {
// n_x+1 = (1/root)*((root-1) * x + a / pow(x,root-1))
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_rcp_ps(R);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i < 20 ; i ++) {
__m256 tmpx = pow_avx_fast(x, root-1);
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//fmac with Ni*X+tar
//tar = _mm256_fmadd_ps(Nm,x,tar);
tar = _mm256_add_ps(_mm256_mul_ps(Nm,x),tar);
//Multiplied by the inverse of power
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
For more information on how to write an efficient pow function see these links: http://en.wikipedia.org/wiki/Addition-chain_exponentiation and
http://en.wikipedia.org/wiki/Exponentiation_by_squaring
Also, your initial guess might not be so good. Here is scalar code to find the nth root based on your method (but using the math pow function, which is probably faster than yours). It takes about 50 iterations to solve the 4th root of 16 (which is 2). For the 20 iterations you use, it returns over 4000, which is nowhere close to 2.0. So you will need to adjust your method to do enough iterations to ensure a reasonable answer within some tolerance.
float fx(float a, int n, float x) {
return 1.0f/n * ((n-1)*x + a/pow(x, n-1));
}
float scalar_nthRoot_v2(float a, int root) {
//sets initaial guess to 1 / (a * root)
float x = 1.0f/(a*root);
printf("x0 %f\n", x);
for(int i = 0; i<50; i++) {
x = fx(a, root, x);
printf("x %f\n", x);
}
return x;
}
I got the formula for Newton's method from here.
http://en.wikipedia.org/wiki/Nth_root_algorithm
Here is a version of your function which gives the result within a certain tolerance xacc or quits if no convergence after nmax iterations. This function could be much faster than your method if the convergence happens in less than 20 iterations. It requires that all eight floats converge at once. In other words, if seven converge and one does not then the other seven have to wait for the one that does not converge. That's the problem with SIMD (on the GPU as well) but in general it's still faster than doing it without SIMD.
int get_mask(const __m256 dx, const float xacc) {
__m256i mask = _mm256_castps_si256(_mm256_cmp_ps(dx, _mm256_set1_ps(xacc), _CMP_GT_OQ));
return _mm_movemask_epi8(_mm256_castsi256_si128(mask)) + _mm_movemask_epi8(_mm256_extractf128_si256(mask,1));
}
inline __m256 SSEnthRoot_fast_xacc(const __m256 a, const int root, const int nmax, float xacc) {
// n_x+1 = (1/root)*((root-1) * x + a / pow(x,root-1))
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_rcp_ps(R);
//__m256 Ni = _mm256_set1_ps(1.0f/root);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i <nmax ; i ++) {
__m256 tmpx = pow_avx_fast(x, root-1);
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//tar = _mm256_fmadd_ps(Nm,x,tar);
tar = _mm256_add_ps(_mm256_mul_ps(Nm,x),tar);
tmpx = _mm256_mul_ps(Ni,tar);
__m256 dx = _mm256_sub_ps(tmpx,x);
dx = _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), dx), dx); //fabs(dx)
int cnt = get_mask(dx, xacc);
if(cnt == 0) return x;
x = tmpx;
}
return x; //at least one value out of eight did not converge by nmax.
}
Here is a more general version of the pow function for AVX which works for n<=0 as well.
__m256 pow_avx(__m256 x, const int n) {
if(n<0) {
return pow_avx(_mm256_rcp_ps(x), -n);
}
else if(n == 0) {
return _mm256_set1_ps(1.0f);
}
else if(n == 1) {
return x;
}
else if(n%2 ==0) {
return pow_avx(_mm256_mul_ps(x, x), n/2);
}
else {
return _mm256_mul_ps(x,pow_avx(_mm256_mul_ps(x, x), (n-1)/2));
}
}
Some other suggestions
You can use a SIMD math library which finds the nth root. SIMD math libraries for SSE and AVX
For Intel you can use SVML which is expensive and closed source (Intel's OpenCL driver uses SVML so with that you can get it for free). For AMD you can use LIBM which is free but closed source. There are several open source SIMD math libraries such as http://software-lisc.fbk.eu/avx_mathfun/ and https://bitbucket.org/eschnett/vecmathlib/wiki/Home
To set all elements of an __m256 vector to a single value:
__m256 v = _mm256_set1_ps(1.0f);
or in your specific case:
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_set1_ps((1.0f/(float)root));
__m256 Nm = _mm256_set1_ps((float)(root-1));
Obviously you can get rid of the FillArray stuff once you've made this change.
Perhaps you should be doing this in the log domain:
pow(a, 1.0/root) == exp(log(a)/root)
Julien Pommier has an sse_mathfun.h that has SSE/SSE2 log and exp functions, but I cannot say I've used those in particular. The techniques may be extensible to AVX.
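As a scalar sketch of the idea (a vectorized version would need SIMD exp/log routines such as the ones in sse_mathfun.h, which are not reproduced here):
#include <math.h>
// nth root via the log domain; valid for a > 0
static inline float nth_root_log(float a, int root)
{
    return expf(logf(a) / (float)root);
}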
