Speeding up Newton's Method for finding nth root - c

Let me preface this question with a statement: this code works as intended, but it is very, very slow for what it does. Is there a way to make the Newton's method converge faster, or a way to set a __m256 variable equal to a single float without messing with the float arrays and such?
__m256 nthRoot(__m256 a, int root){
#define aligned __declspec(align(32)) float // note: _mm256_load_ps requires 32-byte alignment
// uses the calculation
// x_(k+1) = (1/root)*((root-1)*x + a / pow(x, root-1))
//initial numbers
aligned r[8];
aligned iN[8];
aligned mN[8];
//Function I made to fill arrays
/*
template<class T>
void FillArray(T a[],T b)
{
int n = sizeof(a)/sizeof(T); // note: a has decayed to a pointer here, so this is not the element count
for(int i = 0; i < n; a[i++] = b);
}*/
//fills the arrays
FillArray(iN,(1.0f/(float)root));
FillArray(mN,(float)(root-1));
FillArray(r,(float)root);
//loads the arrays into the AVX registers
__m256 R = _mm256_load_ps(r);
__m256 Ni = _mm256_load_ps(iN);
__m256 Nm = _mm256_load_ps(mN);
//sets initial guess to 1 / (a * root)
__m256 x = _mm256_rcp_ps(_mm256_mul_ps(R,a));
for(int i = 0; i < 20 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++){
tmpx = _mm256_mul_ps(x,tmpx);
}
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//fmadd: Nm*x + tar
tar = _mm256_fmadd_ps(Nm,x,tar);
//Multiplied by Ni
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Edit #1
__m256 SSEnthRoot(__m256 a, int root){
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_set1_ps((1.0f)/((float)root));
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,_mm256_rcp_ps(R));
for(int i = 0; i < 10 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++){
tmpx = _mm256_mul_ps(x,tmpx);
}
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//multiply Nm by x then add tar, because my compiler stopped thinking that fmadd is a valid instruction
tar = _mm256_add_ps(_mm256_mul_ps(Nm,x),tar);
//Multiplied by the inverse of power
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Any tips or pointers (not the memory kind) to make the Newton's method converge faster would be appreciated.
Edit #2: removed one _mm256_set1_ps() function call by using _mm256_rcp_ps(), because I had already loaded into R the value whose reciprocal I needed
__m256 SSEnthRoot(__m256 a, int root){
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_rcp_ps(R);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i < 20 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++)
tmpx = _mm256_mul_ps(x,tmpx);
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//fmadd: Nm*x + tar
//my compiler believes in fmac again
tar = _mm256_fmadd_ps(Nm,x,tar);
//Multiplied by the inverse of power
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
Edit #3
__m256 SSEnthRoot(__m256 a, int root){
__m256 Ni = _mm256_set1_ps(1.0f/(float)root);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i < 20 ; i ++){
__m256 tmpx = x;
for(int k = 0 ; k < root -2 ; k++)
tmpx = _mm256_mul_ps(x,tmpx);
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
tar = _mm256_fmadd_ps(Nm,x,tar);
x = _mm256_mul_ps(Ni,tar);
}
return x;
}

Your pow function is inefficient.
for(int k = 0 ; k < root -2 ; k++)
tmpx = _mm256_mul_ps(x,tmpx);
In your example you're taking the 29th root. You need pow(x, 29-1) = x^28. Currently you use 27 multiplications for that but it's possible to do that in only six multiplications.
x^28 = (x^4)*(x^8)*(x^16)
x^4 = y -> 2 multiplications
x^8 = y*y = z -> 1 multiplication
x^16 = z^2 = w -> 1 multiplication
y*z*w -> 2 multiplications
6 multiplications in total
Here is an improved version of your code which is about twice as fast on my system. It uses a new pow_avx_fast function I created which computes x^n for 8 floats at once using AVX. It does e.g. x^28 in 6 multiplications instead of 27. Please see further into my answer: there I also give a version which finds the result within some tolerance xacc, which could be much faster if the convergence happens quickly.
inline __m256 pow_avx_fast(__m256 x, const int n) {
//n must be greater than zero
if(n%2 == 0) {
return pow_avx_fast(_mm256_mul_ps(x, x), n/2);
}
else {
if(n>1) return _mm256_mul_ps(x,pow_avx_fast(_mm256_mul_ps(x, x), (n-1)/2));
return x;
}
}
inline __m256 SSEnthRoot_fast(__m256 a, int root) {
// n_x+1 = (1/root)*((root-1) * x + a / pow(x,root-1))
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_rcp_ps(R);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i < 20 ; i ++) {
__m256 tmpx = pow_avx_fast(x, root-1);
//f over f'
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//fmadd: Nm*x + tar
//tar = _mm256_fmadd_ps(Nm,x,tar);
tar = _mm256_add_ps(_mm256_mul_ps(Nm,x),tar);
//Multiplied by the inverse of power
x = _mm256_mul_ps(Ni,tar);
}
return x;
}
For more information on how to write an efficient pow function, see these links: http://en.wikipedia.org/wiki/Addition-chain_exponentiation and
http://en.wikipedia.org/wiki/Exponentiation_by_squaring
Also, your initial guess might not be so good. Here is scalar code to find the nth root based on your method (but using the math pow function, which is probably faster than yours). It takes about 50 iterations to solve the 4th root of 16 (which is 2). For the 20 iterations you use, it returns over 4000, which is nowhere close to 2.0. So you will need to adjust your method to do enough iterations to ensure a reasonable answer within some tolerance.
float fx(float a, int n, float x) {
return 1.0f/n * ((n-1)*x + a/pow(x, n-1));
}
float scalar_nthRoot_v2(float a, int root) {
//sets initial guess to 1 / (a * root)
float x = 1.0f/(a*root);
printf("x0 %f\n", x);
for(int i = 0; i<50; i++) {
x = fx(a, root, x);
printf("x %f\n", x);
}
return x;
}
I got the formula for Newtons method from here.
http://en.wikipedia.org/wiki/Nth_root_algorithm
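A better starting point cuts the iteration count a lot. As a sketch (my own addition, not from the original code; assumes IEEE-754 32-bit floats, a > 0, and n > 0): dividing the float's biased exponent by n approximates a^(1/n) directly, the same family of bit trick as the famous fast inverse square root.
#include <stdint.h>
#include <string.h>
// Rough initial guess for a^(1/n): divide the offset of the float's bits
// from those of 1.0f by n. Hypothetical helper, not part of the original answer.
static float nth_root_guess(float a, int n) {
    uint32_t bits;
    memcpy(&bits, &a, sizeof bits);              // reinterpret float as bits
    int32_t e = (int32_t)(bits - 0x3F800000u);   // offset from 1.0f
    bits = (uint32_t)(e / n + 0x3F800000);       // scale exponent (and mantissa) by 1/n
    memcpy(&a, &bits, sizeof a);
    return a;
}
For a = 16, n = 4 this returns exactly 2.0f; in general it lands within a few percent of the true root, after which a handful of Newton iterations suffice instead of 20 or 50.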
Here is a version of your function which gives the result within a certain tolerance xacc or quits if no convergence after nmax iterations. This function could be much faster than your method if the convergence happens in less than 20 iterations. It requires that all eight floats converge at once. In other words, if seven converge and one does not then the other seven have to wait for the one that does not converge. That's the problem with SIMD (on the GPU as well) but in general it's still faster than doing it without SIMD.
int get_mask(const __m256 dx, const float xacc) {
//sets each lane's bits where dx > xacc; the sum of the two 128-bit movemasks is nonzero iff any lane is still above the tolerance
__m256i mask = _mm256_castps_si256(_mm256_cmp_ps(dx, _mm256_set1_ps(xacc), _CMP_GT_OQ));
return _mm_movemask_epi8(_mm256_castsi256_si128(mask)) + _mm_movemask_epi8(_mm256_extractf128_si256(mask,1));
}
inline __m256 SSEnthRoot_fast_xacc(const __m256 a, const int root, const int nmax, float xacc) {
// x_(k+1) = (1/root)*((root-1)*x + a / pow(x, root-1))
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_rcp_ps(R);
//__m256 Ni = _mm256_set1_ps(1.0f/root);
__m256 Nm = _mm256_set1_ps((float)(root -1));
__m256 x = _mm256_mul_ps(a,Ni);
for(int i = 0; i <nmax ; i ++) {
__m256 tmpx = pow_avx_fast(x, root-1);
__m256 tar = _mm256_mul_ps(a,_mm256_rcp_ps(tmpx));
//tar = _mm256_fmadd_ps(Nm,x,tar);
tar = _mm256_add_ps(_mm256_mul_ps(Nm,x),tar);
tmpx = _mm256_mul_ps(Ni,tar);
__m256 dx = _mm256_sub_ps(tmpx,x);
dx = _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), dx), dx); //fabs(dx)
int cnt = get_mask(dx, xacc);
if(cnt == 0) return x;
x = tmpx;
}
return x; //at least one value out of eight did not converge by nmax.
}
Here is a more general version of the pow function for AVX which works for n<=0 as well.
__m256 pow_avx(__m256 x, const int n) {
if(n<0) {
return pow_avx(_mm256_rcp_ps(x), -n);
}
else if(n == 0) {
return _mm256_set1_ps(1.0f);
}
else if(n == 1) {
return x;
}
else if(n%2 ==0) {
return pow_avx(_mm256_mul_ps(x, x), n/2);
}
else {
return _mm256_mul_ps(x,pow_avx(_mm256_mul_ps(x, x), (n-1)/2));
}
}
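For example (hypothetical values; note that the n < 0 path inherits the limited precision of _mm256_rcp_ps):
__m256 x = _mm256_set1_ps(2.0f);
__m256 y = pow_avx(x, -3); // approximately 0.125f in each lane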
Some other suggestions
You can use a SIMD math library which finds the nth root. SIMD math libraries for SSE and AVX
For Intel you can use SVML which is expensive and closed source (Intel's OpenCL driver uses SVML so with that you can get it for free). For AMD you can use LIBM which is free but closed source. There are several open source SIMD math libraries such as http://software-lisc.fbk.eu/avx_mathfun/ and https://bitbucket.org/eschnett/vecmathlib/wiki/Home

To set all elements of an __m256 vector to a single value:
__m256 v = _mm256_set1_ps(1.0f);
or in your specific case:
__m256 R = _mm256_set1_ps((float)root);
__m256 Ni = _mm256_set1_ps((1.0f/(float)root));
__m256 Nm = _mm256_set1_ps((float)(root-1));
Obviously you can get rid of the FillArray stuff once you've made this change.

Perhaps you should be doing this in the log domain.
pow(a, 1.0/root) == exp(log(a)/root)
Julien Pommier has an sse_mathfun.h that has SSE/SSE2 log and exp functions, but I cannot say I've used those in particular. The techniques may be extensible to AVX.
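As a scalar illustration of that identity using only the standard math library (a vectorized version would swap in the SSE/AVX exp and log from such a library):
#include <math.h>
// a^(1/root) computed in the log domain; valid for a > 0
float nth_root_log(float a, int root) {
    return expf(logf(a) / (float)root);
}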

Related

Logistic regression code stops working above ~43,500 generated observations

Having some difficulty troubleshooting code I wrote in C to perform a logistic regression. While it seems to work on smaller, semi-randomized datasets, it stops working (e.g. assigning proper probabilities of belonging to class 1) at around the point where I pass 43,500 observations (determined by tweaking the number of observations created). When creating the 150 features used in the code, I do create the first two as a function of the number of observations, so I'm not sure if maybe that's the issue here, though I am using double precision. Maybe there's an overflow somewhere in the code?
The below code should be self-contained; it generates m=50,000 observations with n=150 features. Setting m below 43,500 should return "Percent class 1: 0.250000", setting to 44,000 or above will return "Percent class 1: 0.000000", regardless of what max_iter (number of times we sample m observations) is set to.
The first feature is set to 1.0 divided by the total number of observations, if class 0 (first 75% of observations), or the index of the observation divided by the total number of observations otherwise.
The second feature is just index divided by total number of observations.
All other features are random.
The logistic regression is intended to use stochastic gradient descent, randomly selecting an observation index, computing the gradient of the loss with the predicted y using current weights, and updating weights with the gradient and learning rate (eta).
Using the same initialization with Python and NumPy, I still get the proper results, even above 50,000 observations.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
// Compute z = w * x + b
double dlc( int n, double *X, double *coef, double intercept )
{
double y_pred = intercept;
for (int i = 0; i < n; i++)
{
y_pred += X[i] * coef[i];
}
return y_pred;
}
// Compute y_hat = 1 / (1 + e^(-z))
double sigmoid( int n, double alpha, double *X, double *coef, double beta, double intercept )
{
double y_pred;
y_pred = dlc(n, X, coef, intercept);
y_pred = 1.0 / (1.0 + exp(-y_pred));
return y_pred;
}
// Stochastic gradient descent
void sgd( int m, int n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept, int random_seed )
{
double *gradient_coef, *X_i;
double y_i, y_pred, resid;
int idx;
double gradient_intercept = 0.0, alpha = 1.0, beta = 1.0;
X_i = (double *) malloc (n * sizeof(double));
gradient_coef = (double *) malloc (n * sizeof(double));
for ( int i = 0; i < n; i++ )
{
coef[i] = 0.0;
gradient_coef[i] = 0.0;
}
*intercept = 0.0;
srand(random_seed);
for ( int epoch = 0; epoch < max_iter; epoch++ )
{
for ( int run = 0; run < m; run++ )
{
// Randomly sample an observation
idx = rand() % m;
for ( int i = 0; i < n; i++ )
{
X_i[i] = X[n*idx+i];
}
y_i = y[idx];
// Compute y_hat
y_pred = sigmoid( n, alpha, X_i, coef, beta, *intercept );
resid = -(y_i - y_pred);
// Compute gradients and adjust weights
for (int i = 0; i < n; i++)
{
gradient_coef[i] = X_i[i] * resid;
coef[i] -= eta * gradient_coef[i];
}
if ( fit_intercept == 1 )
{
*intercept -= eta * resid;
}
}
}
}
int main(void)
{
double *X, *y, *coef, *y_pred;
double intercept;
double eta = 0.05;
double alpha = 1.0, beta = 1.0;
long m = 50000;
long n = 150;
int max_iter = 20;
long class_0 = (long)(3.0 / 4.0 * (double)m);
double pct_class_1 = 0.0;
clock_t test_start;
clock_t test_end;
double test_time;
printf("Constructing variables...\n");
X = (double *) malloc (m * n * sizeof(double));
y = (double *) malloc (m * sizeof(double));
y_pred = (double *) malloc (m * sizeof(double));
coef = (double *) malloc (n * sizeof(double));
// Initialize classes
for (int i = 0; i < m; i++)
{
if (i < class_0)
{
y[i] = 0.0;
}
else
{
y[i] = 1.0;
}
}
// Initialize observation features
for (int i = 0; i < m; i++)
{
if (i < class_0)
{
X[n*i] = 1.0 / (double)m;
}
else
{
X[n*i] = (double)i / (double)m;
}
X[n*i + 1] = (double)i / (double)m;
for (int j = 2; j < n; j++)
{
X[n*i + j] = (double)(rand() % 100) / 100.0;
}
}
// Fit weights
printf("Running SGD...\n");
test_start = clock();
sgd( m, n, X, y, coef, &intercept, eta, max_iter, 1, 42 );
test_end = clock();
test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
printf("Time taken: %f\n", test_time);
// Compute y_hat and share of observations predicted as class 1
printf("Making predictions...\n");
for ( int i = 0; i < m; i++ )
{
y_pred[i] = sigmoid( n, alpha, &X[i*n], coef, beta, intercept );
}
printf("Printing results...\n");
for ( int i = 0; i < m; i++ )
{
//printf("%f\n", y_pred[i]);
if (y_pred[i] > 0.5)
{
pct_class_1 += 1.0;
}
// Troubleshooting print
if (i < 10 || i > m - 10)
{
printf("%g\n", y_pred[i]);
}
}
printf("Percent class 1: %f", pct_class_1 / (double)m);
return 0;
}
For reference, here is my (presumably) equivalent Python code, which returns the correct percent of identified classes at more than 50,000 observations:
import numpy as np
import time
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class LogisticRegressor:
def __init__(self, eta, init_runs, fit_intercept=True):
self.eta = eta
self.init_runs = init_runs
self.fit_intercept = fit_intercept
def fit(self, x, y):
m, n = x.shape
self.coef = np.zeros((n, 1))
self.intercept = np.zeros((1, 1))
for epoch in range(self.init_runs):
for run in range(m):
idx = np.random.randint(0, m)
x_i = x[idx:idx+1, :]
y_i = y[idx]
y_pred_i = sigmoid(x_i.dot(self.coef) + self.intercept)
gradient_w = -(x_i.T * (y_i - y_pred_i))
self.coef -= self.eta * gradient_w
if self.fit_intercept:
gradient_b = -(y_i - y_pred_i)
self.intercept -= self.eta * gradient_b
def predict_proba(self, x):
m, n = x.shape
y_pred = np.ones((m, 2))
y_pred[:,1:2] = sigmoid(x.dot(self.coef) + self.intercept)
y_pred[:,0:1] -= y_pred[:,1:2]
return y_pred
def predict(self, x):
return np.round(sigmoid(x.dot(self.coef) + self.intercept))
m = 50000
n = 150
class1 = int(3.0 / 4.0 * m)
X = np.random.rand(m, n)
y = np.zeros((m, 1))
for obs in range(m):
if obs < class1:
continue
else:
y[obs,0] = 1
for obs in range(m):
if obs < class1:
X[obs, 0] = 1.0 / float(m)
else:
X[obs, 0] = float(obs) / float(m)
X[obs, 1] = float(obs) / float(m)
logit = LogisticRegressor(0.05, 20)
start_time = time.time()
logit.fit(X, y)
end_time = time.time()
print(round(end_time - start_time, 2))
y_pred = logit.predict(X)
print("Percent:", y_pred.sum() / len(y_pred))
The issue is here:
// Randomly sample an observation
idx = rand() % m;
... in light of the fact that the OP's RAND_MAX is 32767. This is exacerbated by the fact that all of the class 1 observations are at the end.
All samples will be drawn from the first 32768 observations, and when the total number of observations is greater than that, the proportion of class 1 observations among those that can be sampled is less than 0.25. At 43691 total observations, there are no class 1 observations among those that can be sampled.
As a secondary issue, rand() % m does not yield a wholly uniform distribution if m does not evenly divide RAND_MAX + 1, though the effect of this issue will be much more subtle.
Bottom line: you need a better random number generator.
At minimum, you could consider combining the bits from two calls to rand() to yield an integer with sufficient range, but you might want to consider getting a third-party generator. There are several available.
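A minimal sketch of the two-call idea, assuming RAND_MAX is 32767 (15 random bits per call):
// 30 random bits from two 15-bit rand() calls; ample range for m = 50,000
int rand30(void)
{
    return (rand() << 15) | rand();
}
idx = rand30() % m then reaches every observation, leaving only the (much smaller) modulo bias discussed below.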
Note: OP reports "m=50,000 observations with n=150 features", so perhaps this is not the issue for OP, but I'll leave this answer up for reference for when OP tries larger tasks.
A potential issue:
long overflow
m * n * sizeof(double) risks overflow when long is 32-bit and m*n > LONG_MAX (or about 46,341 if m, n are the same).
OP does report m=50,000 and n=150, for which m * n * sizeof(double) is 60,000,000 and does not yet overflow, but larger tasks will.
A first step is to perform the multiplication using size_t math where we gain at least 1 more bit in the calculation.
// m * n * sizeof(double)
sizeof(double) * m * n
Yet unless OP's size_t is more than 32-bit, we still have trouble.
In any case, I recommend using size_t for array sizing and indexing.
Check allocations for failure too.
Since RAND_MAX may be too small and array indexing should be done using size_t math, consider a helper function to generate a random index over the entire size_t range.
// idx = rand() % m;
size_t idx = rand_size_t() % (size_t)m;
If stuck with the standard rand(), below is a helper function to extend its range as needed.
It uses the real nifty IMAX_BITS(m).
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
// https://stackoverflow.com/a/4589384/2410359
/* Number of bits in inttype_MAX, or in any (1<<k)-1 where 0 <= k < 2040 */
#define IMAX_BITS(m) ((m)/((m)%255+1) / 255%255*8 + 7-86/((m)%255+12))
// Test that RAND_MAX is a power of 2 minus 1
_Static_assert((RAND_MAX & 1) && ((RAND_MAX/2 + 1) & (RAND_MAX/2)) == 0, "RAND_MAX is not a Mersenne number");
#define RAND_MAX_WIDTH (IMAX_BITS(RAND_MAX))
#define SIZE_MAX_WIDTH (IMAX_BITS(SIZE_MAX))
size_t rand_size_t(void) {
size_t index = (size_t) rand();
for (unsigned i = RAND_MAX_WIDTH; i < SIZE_MAX_WIDTH; i += RAND_MAX_WIDTH) {
index <<= RAND_MAX_WIDTH;
index ^= (size_t) rand();
}
return index;
}
Further considerations can replace the rand_size_t() % (size_t)m with a more uniform distribution.
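One such replacement (a sketch; assumes rand_size_t() is uniform over [0, SIZE_MAX]) is rejection sampling, which discards the truncated final block so every index is equally likely:
// Unbiased random index in [0, m)
size_t rand_index(size_t m) {
    size_t limit = SIZE_MAX - SIZE_MAX % m; // largest multiple of m
    size_t r;
    do {
        r = rand_size_t();
    } while (r >= limit); // reject the biased tail; rare for small m
    return r % m;
}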
As has been determined elsewhere, the problem is due to the implementation's RAND_MAX value being too small.
Assuming 32-bit ints, a slightly better PRNG function can be implemented in the code, such as this C implementation of the minstd_rand() function from C++:
#define MINSTD_RAND_MAX 2147483646
// Code assumes `int` is at least 32 bits wide.
static unsigned int minstd_seed = 1;
static void minstd_srand(unsigned int seed)
{
seed %= 2147483647;
// zero seed is bad!
minstd_seed = seed ? seed : 1;
}
static int minstd_rand(void)
{
minstd_seed = (unsigned long long)minstd_seed * 48271 % 2147483647;
return (int)minstd_seed;
}
Another problem is that expressions of the form rand() % m produce a biased result when m does not divide (unsigned int)RAND_MAX + 1. Here is an unbiased function that returns a random integer from 0 to le inclusive, making use of the minstd_rand() function defined earlier:
static int minstd_rand_max(int le)
{
int r;
if (le < 0)
{
r = le;
}
else if (le >= MINSTD_RAND_MAX)
{
r = minstd_rand();
}
else
{
int rm = MINSTD_RAND_MAX - le + MINSTD_RAND_MAX % (le + 1);
while ((r = minstd_rand()) > rm)
{
}
r /= (rm / (le + 1) + 1);
}
return r;
}
(Actually, it does still have a very small bias because minstd_rand() will never return 0.)
For example, replace rand() % 100 with minstd_rand_max(99), and replace rand() % m with minstd_rand_max(m - 1). Also replace srand(random_seed) with minstd_srand(random_seed).

Sparse matrix multiplication in Eigen giving wrong result?

I am using Eigen in a project of mine, and I am running into a strange issue. I have complex sparse matrices A and B (1500x1500 or larger), and am multiplying them together with coefficients.
When A = B, and taking vector x of ones, I expect that
(A-B)*x = 0, (A*B-B*A)*x = 0,
(A*A*B*B - B*B*A*A)*x = 0,
etc. and I do get this result for all these cases. (A.isApprox(B) evaluates to 1 and (A-B).norm() = 0).
However, when I multiply the matrices by doubles, as in
(c1*A*c2*A*d1*B*d2*B - d1*B*d2*B*c1*A*c2*A)*x,
I get a nonzero result, which doesn't make sense to me, as scalars should commute with the matrices. In fact, if I do,
(c1*c2*d1*d2*A*A*B*B - d1*d2*c1*c2*B*B*A*A)*x
I get zero. Any time the coefficients are interspersed in the matrix manipulation, I get a nonzero result.
I am not using any compiler optimizations, etc.
What am I doing wrong here?
Edit:
I have worked up a simple example. Maybe I'm missing something dumb, but here it is. This gives me an error of 10^20.
#include <iostream>
#include <cmath>
#include <vector>
#include <Eigen/Sparse>
#include <complex>
typedef std::complex<double> Scalar;
typedef Eigen::SparseMatrix<Scalar, Eigen::RowMajor> SpMat;
typedef Eigen::Triplet<Scalar> trip;
int main(int argc, const char * argv[]) {
double k0 = M_PI;
double dz = 0.01;
double nz = 1500;
std::vector<double> rhos(nz), atten(nz), cp(nz);
for(int i = 0; i < nz; ++i){
if(i < 750){
rhos[i] = 1.5;
cp[i] = 2500;
atten[i] = 0.5;
}
else{
rhos[i] = 1;
cp[i] = 1500;
atten[i] = 0;
}
}
Scalar ci, eta, n, rho, drhodz;
Scalar t1, t2, t3, t4;
ci = Scalar(0,1);
eta = 1.0/(40.0*M_PI*std::log10(std::exp(1.0)));
int Mp = 6;
std::vector<std::vector<trip> > mat_entries_N(Mp), mat_entries_D(Mp);
for(int i = 0; i < nz; ++i){
n = 1500./cp[i] * (1.+ ci * eta * atten[i]);
rho = rhos[i];
if(i > 0 && i < nz-1){
drhodz = (rhos[i+1]-rhos[i-1])/(2*dz);
}
else if(i == 0){
drhodz = (rhos[i+1]-rhos[i])/(dz);
}
else if(i == nz-1){
drhodz = (rhos[i]-rhos[i-1])/(dz);
}
t1 = (n*n - 1.);
t2 = 1./(k0*k0)*(-2./(dz * dz));
t3 = 1./(k0*k0)*(drhodz/rho*2.*dz);
t4 = 1./(k0*k0)*(1/(dz * dz));
/* MATRICES N AND D ARE IDENTICAL EXCEPT FOR COEFFICIENT*/
double c,d;
for(int mp = 0; mp < Mp; ++mp){
c = std::pow(std::sin((mp+1)*M_PI/(2*Mp+1)),2);
d = std::pow(std::cos((mp+1)*M_PI/(2*Mp+1)),2);
mat_entries_N[mp].push_back(trip(i,i,(c*(t1 + t2))));
mat_entries_D[mp].push_back(trip(i,i,(d*(t1 + t2))));
if(i < nz - 1){
mat_entries_N[mp].push_back(trip(i,i+1,(c*(-t3 + t4))));
mat_entries_D[mp].push_back(trip(i,i+1,(d*(-t3 + t4))));
}
if(i > 0){
mat_entries_N[mp].push_back(trip(i,i-1,(c*(t3 + t4))));
mat_entries_D[mp].push_back(trip(i,i-1,(d*(t3 + t4))));
}
}
}
SpMat N(nz,nz), D(nz,nz);
SpMat identity(nz, nz);
std::vector<trip> idcoeffs;
for(int i = 0; i < nz; ++i){
idcoeffs.push_back(trip(i,i,1));
}
identity.setFromTriplets(idcoeffs.begin(), idcoeffs.end());
SpMat temp(nz,nz);
N = identity;
D = identity;
for(int mp = 0; mp < Mp; ++mp){
temp.setFromTriplets(mat_entries_N[mp].begin(), mat_entries_N[mp].end());
N = (temp*N).eval();
temp.setFromTriplets(mat_entries_D[mp].begin(), mat_entries_D[mp].end());
D = (temp*D).eval();
}
std::cout << (N*D - D*N).norm() << std::endl;
return 0;
}
The problem is that without a meaningful reference value defining the expected order of magnitude of a non-zero result, it is impossible to conclude whether 1e20 is a huge or a tiny value.
In your case, the norm of the matrices N and D are about 1e20 and 1e18 respectively, and the norm of N*D is about 1e38. Given that the relative precision of double is about 1e-16, an error of 1e20 can be considered as 0 compared to 1e38.
To summarize, it is most of the time meaningless to look at the absolute error. Instead, you have to look at the relative error:
std::cout << (N*D - D*N).norm()/(N*D).norm() << std::endl;
which gives you about 1e-17. This is indeed smaller than the numerical precision of double.

FFT in ARM Cortex-M0 returns NaN or infinite

I'm working with a 32-bit ARM Cortex-M0 RedBearLab nRF51822 and I am trying to develop a function to calculate the FFTs of a signal.
I can get a 256-point FFT of a signal with this function, but when I try a 512-point FFT (or more), it returns infinite values and NaN. However, when I use my function on a 32-bit ARM Cortex-M4 Teensy 3.1, it works perfectly.
Here I show my code:
void fourier_transform(double samples[], const int n, const int isign) {
int nn,mmax,m,j,istep,i;
float wtemp,wr,wpr,wpi,wi,theta,tempr,tempi;
//if (n<2 || n&(n-1)) throw("n must be power of 2 in four1");
nn = n << 1;
j = 1;
for (i=1;i<nn;i+=2) {
if (j > i) {
double a = samples[j-1];
samples[j-1] = samples[i-1];
samples[i-1] = a;
double b = samples[j];
samples[j] = samples[i];
samples[i] = b;
//SWAP(data[j-1],data[i-1]); // see what this does
//SWAP(data[j],data[i]);
}
m=n;
while (m >= 2 && j > m) {
j -= m;
m >>= 1;
}
j += m;
}
mmax=2;
while (nn > mmax) {
istep=mmax << 1;
theta=isign*(6.28318530717959/mmax);
wtemp=sin(0.5*theta);
wpr = -2.0*wtemp*wtemp;
wpi=sin(theta);
wr=1.0;
wi=0.0;
for (m=1;m<mmax;m+=2) {
for (i=m;i<=nn;i+=istep) {
j=i+mmax;
tempr=wr*samples[j-1]-wi*samples[j];
tempi=wr*samples[j]+wi*samples[j-1];
samples[j-1]=samples[i-1]-tempr;
samples[j]=samples[i]-tempi;
samples[i-1] += tempr;
samples[i] += tempi;
}
wr=(wtemp=wr)*wpr-wi*wpi+wr;
wi=wi*wpr+wtemp*wpi+wi;
}
mmax=istep;
}
}
samples[] is the array with my signal and, in the end of FFT, the result is put here. n is the number of points I use to get my FFT.

FLT_EPSILON for a nth root finder with SSE/AVX

I'm trying to convert a function that finds the nth root in C for a double value from the following link
http://rosettacode.org/wiki/Nth_root#C
to find the nth root for 8 floats at once using AVX.
Part of that code uses DBL_EPSILON * 10. However, when I convert this to use float with AVX I have to use FLT_EPSILON*1000 or the code hangs and does not converge. When I print out FLT_EPSILON I see it is order 1E-7. But this link, http://www.cplusplus.com/reference/cfloat/
, says it should be 1E-5. When I print out DBL_EPSILON it's 1E-16 but the link says it should only be 1E-9. What's going on?
Here is the code so far (not fully optimized).
#include <stdio.h>
#include <float.h>
#include <immintrin.h> // AVX
inline double abs_(double x) {
return x >= 0 ? x : -x;
}
double pow_(double x, int e)
{
double ret = 1;
for (ret = 1; e; x *= x, e >>= 1) {
if ((e & 1)) ret *= x;
}
return ret;
}
double root(double a, int n)
{
double d, x = 1;
x = a/n;
if (!a) return 0;
//if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */
int cnt = 0;
do {
cnt++;
d = (a / pow_(x, n - 1) - x) / n;
x+= d;
} while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
printf("%d\n", cnt);
return x;
}
__m256 pow_avx(__m256 x, int e) {
__m256 ret = _mm256_set1_ps(1.0f);
for (; e; x = _mm256_mul_ps(x,x), e >>= 1) {
if ((e & 1)) ret = _mm256_mul_ps(x,ret);
}
return ret;
}
inline __m256 abs_avx (__m256 x) {
return _mm256_max_ps(_mm256_sub_ps(_mm256_setzero_ps(), x), x);
//return x >= 0 ? x : -x;
}
int get_mask(const __m256 d, const __m256 x) {
__m256 ad = abs_avx(d);
__m256 ax = abs_avx(x);
__m256i mask = _mm256_castps_si256(_mm256_cmp_ps(ad, ax, _CMP_GT_OQ));
return _mm_movemask_epi8(_mm256_castsi256_si128(mask)) + _mm_movemask_epi8(_mm256_extractf128_si256(mask,1));
}
__m256 root_avx(__m256 a, int n) {
printf("%e\n", FLT_EPSILON);
printf("%e\n", DBL_EPSILON);
printf("%e\n", FLT_EPSILON*1000.0f);
__m256 d;
__m256 x = _mm256_set1_ps(1.0f);
//if (!a) return 0;
//if (n < 1 || (a < 0 && !(n&1))) return 0./0.; /* NaN */
__m256 in = _mm256_set1_ps(1.0f/n);
__m256 xtmp;
do {
d = _mm256_rcp_ps(pow_avx(x, n - 1));
d = _mm256_sub_ps(_mm256_mul_ps(a,d),x);
d = _mm256_mul_ps(d,in);
//d = (a / pow_avx(x, n - 1) - x) / n;
x = _mm256_add_ps(x, d); //x+= d;
xtmp =_mm256_mul_ps(x, _mm256_set1_ps(FLT_EPSILON*100.0f));
//} while (abs_(d) >= abs_(x) * (DBL_EPSILON * 10));
} while (get_mask(d, xtmp));
return x;
}
int main()
{
__m256 d = _mm256_set1_ps(16.0f);
__m256 out = root_avx(d, 4);
float result[8];
int i;
_mm256_storeu_ps(result, out);
for(i=0; i<8; i++) {
printf("%f\n", result[i]);
} printf("\n");
//double x = 16;
//printf("root(%g, 15) = %g\n", x, root(x, 4));
//double x = pow_(-3.14159, 15);
//printf("root(%g, 15) = %g\n", x, root(x, 15));
return 0;
}
_mm256_rcp_ps, which maps to the rcpps instruction, performs only an approximate reciprocal. The Intel 64 and IA-32 Architectures Software Developer's Manual says its relative error may be up to 1.5 * 2^-12. This is insufficient to cause the root finder to converge with accuracy 100*FLT_EPSILON.
You could use an exact division, such as:
d = pow_avx(x, n-1);
d = _mm256_sub_ps(_mm256_div_ps(a, d), x);
or add some refinement steps for the reciprocal estimate.
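For the latter, a single Newton-Raphson step roughly squares the relative error of the estimate (from about 1.5 * 2^-12 down to around 2^-23). A sketch:
// One Newton-Raphson refinement of rcpps: r' = r * (2 - d*r)
static inline __m256 rcp_refined(__m256 d) {
    __m256 r = _mm256_rcp_ps(d);
    return _mm256_mul_ps(r, _mm256_sub_ps(_mm256_set1_ps(2.0f), _mm256_mul_ps(d, r)));
}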
Incidentally, if your compiler supports using regular C operators with SIMD objects, consider using the regular C operators instead:
d = pow_avx(x, n-1);
d = a/d - x;
1e-5 is simply the maximum value the C standard allows an implementation to use for FLT_EPSILON. In practice, you'll be using IEEE-754 single-precision, which has an epsilon of 2^-23, which is approximately 1e-7.

Pollard Rho factorization method implementation in C

Can anyone help me out with the Pollard rho implementation? I have implemented this in C. It's working fine for numbers up to 10 digits, but it's not able to handle greater numbers.
Please help me out to improve it to carry out factorization of numbers up to 18 digits. My code is this:
#include<stdio.h>
#include<math.h>
int gcd(int a, int b)
{
if(b==0) return a ;
else
return(gcd(b,a%b)) ;
}
long long int mod(long long int a , long long int b , long long int n )
{
long long int x=1 , y=a ;
while(b>0)
{
if(b%2==1) x = ((x%n)*(y%n))%n ;
y = ((y%n)*(y%n))%n ;
b/=2 ;
}
return x%n ;
}
int isprimes(long long int u)
{
if(u==3)
return 1 ;
int a = 2 , i ;
long long int k , t = 0 , r , p ;
k = u-1 ;
while(k%2==0)
{ k/=2 ; t++ ; }
while(a<=3) /*there are no strong pseudoprimes common to base 2 and base 3*/
{
r = mod(a,k,u) ;
for(i = 1 ; i<=t ; i++)
{
p = ((r%u)*(r%u))%u ;
if((p==1)&&(r!=1)&&(r!=(u-1)))
{ return 0 ; }
r = p ;
}
if(p!=1)
return 0 ;
else
a++ ;
}
if(a==4)
return 1 ;
}
long long int pol(long long int u)
{
long long int x = 2 , k , i , a , y , c , s;
int d = 1 ;
k = 2 ;
i = 1 ;
y = x ;
a = u ;
if(isprimes(u)==1)
{
return 1;
}
c=-1 ;
s = 2 ;
while(1)
{
i++;
x=((x%u)*(x%u)-1)% u ;
d = gcd(abs(y-x),u) ;
if(d!=1&&d!=u)
{ printf("%d ",d);
while(a%d==0) { a=a/d; }
x = 2 ;
k = 2 ;
i = 1 ;
y = x ;
if(a==1)
{ return 0 ; }
if(isprimes(a)!=0)
{ return a ; }
u=a ;
}
if(i==k)
{y = x ; k*=2 ; c = x ;} /*floyd cycle detection*/
if(c==x)
{ x = ++s ; }
}
return ;
}
int main()
{
long long int t ;
long long int i , n , j , k , a , b , u ;
while(scanf("%lld",&n)&&n!=0)
{ u = n ; k = 0 ;
while(u%2==0)
{ u/=2 ; k = 1 ; }
if(k==1) printf("2 ") ;
if(u!=1)
t = pol(u) ;
if(u!=1)
{
if(t==1)
{ printf("%lld",u) ; }
else
if(t!=0)
{ printf("%lld",t) ; }
}
printf("\n");
}
return 0;
}
sorry for the long code ..... I am a new coder.
When you're multiplying two numbers modulo m, the intermediate product can become nearly m^2. So if you use a 64-bit unsigned integer type, the maximal modulus it can handle is 2^32; if the modulus is larger, overflow may happen. It will be rare when the modulus is only slightly larger, but that only makes it less obvious; you cannot rely on being lucky if the modulus allows the possibility of overflow.
You can gain a larger range by a factor of two if you choose a representative of the residue class modulo m of absolute value at most m/2 or something equivalent:
uint64_t mod_mul(uint64_t x, uint64_t y, uint64_t m)
{
int neg = 0;
// if x is too large, choose m-x and note that we need one negation for that at the end
if (x > m/2) {
x = m - x;
neg = !neg;
}
// if y is too large, choose m-y and note that we need one negation for that at the end
if (y > m/2) {
y = m - y;
neg = !neg;
}
uint64_t prod = (x * y) % m;
// if we had negated _one_ factor, and the product isn't 0 (mod m), negate
if (neg && prod) {
prod = m - prod;
}
return prod;
}
So that would allow moduli of up to 2^33 with a 64-bit unsigned type. Not a big step.
The recommended solution to the problem is the use of a big-integer library, for example GMP is available as a distribution package on most if not all Linux distros, and also (relatively) easily installable on Windows.
If that is not an option (really, are you sure?), you can get it to work for larger moduli (up to 2^63 for an unsigned 64-bit integer type) using Russian peasant multiplication:
x * y = 2 * (x * (y/2)) + (x * (y % 2))
so for the calculation, you only need that 2*(m-1) doesn't overflow.
uint64_t mod_mult(uint64_t x, uint64_t y, uint64_t m)
{
if (y == 0) return 0;
if (y == 1) return x % m;
uint64_t temp = mod_mult(x,y/2,m);
temp = (2*temp) % m;
if (y % 2 == 1) {
temp = (temp + x) % m;
}
return temp;
}
Note however that this algorithm needs O(log y) steps, so it's rather slow in practice. For smaller m you can speed it up, if 2^k*(m-1) doesn't overflow, you can proceed in steps of k bits instead of single bits (x*y = ((x * (y >> k)) << k) + (x * (y & ((1 << k)-1)))), which is a good improvement if your moduli are never larger than 48 or 56 bits, say.
Using that variant of modular multiplication, your algorithm will work for larger numbers (but it will be significantly slower). You can also try test for the size of the modulus and/or the factors to determine which method to use, if m < 2^32 or x < (2^64-1)/y, the simple (x * y) % m will do.
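A sketch of that k-bit variant (my naming; it assumes (m-1) << k does not overflow uint64_t, e.g. k = 8 is safe for moduli below 2^56):
// Russian peasant multiplication taking k bits of y per step
uint64_t mod_mult_k(uint64_t x, uint64_t y, uint64_t m, unsigned k)
{
    if (y == 0) return 0;
    uint64_t mask = (1ULL << k) - 1;
    uint64_t hi = mod_mult_k(x, y >> k, m, k); // x * (y >> k) mod m
    hi = (hi << k) % m;                        // shift back; no overflow by assumption
    uint64_t lo = (x % m) * (y & mask) % m;    // contribution of the low k bits of y
    return (hi + lo) % m;
}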
You can try this C implementation of Pollard's rho:
unsigned long long pollard_rho(const unsigned long long N) {
// Require : a composite number N, not a square.
// Ensure : res is a non-trivial factor of N.
// Option : define a timeout, define a rand function.
static const int timeout = 18;
static unsigned long long rand_val = 2994439072U;
rand_val = (rand_val * 1025416097U + 286824428U) % 4294967291LLU;
unsigned long long res = 1, a, b, c, i = 0, j = 1, x = 1, y = 1 + rand_val % (N - 1);
for (; res == 1; ++i) {
if (i == j) {
if (j >> timeout)
break;
j <<= 1;
x = y;
}
a = y, b = y;
// y = (a * b) mod N, computed bit by bit to avoid 64-bit overflow
for (y = 0; a; a & 1 ? b >= N - y ? y -= N : 0, y += b : 0, a >>= 1, (c = b) >= N - b ? c -= N : 0, b += c);
y = (1 + y) % N;
// gcd(N, |y - x|) by the Euclidean algorithm; the loop exits with one of a, b zero
for (a = N, b = y > x ? y - x : x - y; (a %= b) && (b %= a););
res = a | b;
}
return res;
}
Otherwise there is a pure C quadratic sieve which factors numbers of up to 300 bits.
