I'm trying to understand some basics of DFT, some math equations, and try to implement it with C.
Well, this is the function i used from a book (Algorithms for Image Processing And Computer Vision)
void slowft (float *x, COMPLEX *y, int n)
{
COMPLEX tmp, z1, z2, z3, z4;
int m, k;
/* Constant factor -2 pi */
cmplx (0.0, (float)(atan (1.0)/n * -8.0), &tmp);
printf (" constant factor -2 pi %f ", (float)(atan (1.0)/n * -8.0));
for (m = 0; m<=n; m++)
{
NEXT();
cmplx (x[0], 0.0, &(y[m]));
for (k=1; k<=n-1; k++)
{
/* Exp (tmp*k*m) */
cmplx ((float)k, 0.0, &z2);
cmult (tmp, z2, &z3);
cmplx ((float)m, 0.0, &z2);
cmult (z2, z3, &z4);
cexp (z4, &z2);
/* *x[k] */
cmplx (x[k], 0.0, &z3);
cmult (z2, z3, &z4);
/* + y[m] */
csum (y[m], z4, &z2);
y[m].real = z2.real; y[m].imag = z2.imag;
}
}
}
So actually, I'm stuck on the Constant Factor part. I didn't understand:
1-) what it came from(especially arctan(1)) and
2-) what its purpose of it.
This is the equation of DFT:
And these are other functions that i used:
void cexp (COMPLEX z1, COMPLEX *res)
{
COMPLEX x, y;
x.real = exp((double)z1.real);
x.imag = 0.0;
y.real = (float)cos((double)z1.imag);
y.imag = (float)sin((double)z1.imag);
cmult (x, y, res);
}
void cmult (COMPLEX z1, COMPLEX z2, COMPLEX *res)
{
res->real = z1.real*z2.real - z1.imag*z2.imag;
res->imag = z1.real*z2.imag + z1.imag*z2.real;
}
void csum (COMPLEX z1, COMPLEX z2, COMPLEX *res)
{
res->real = z1.real + z2.real;
res->imag = z1.imag + z2.imag;
}
void cmplx (float rp, float ip, COMPLEX *z)
{
z->real = rp;
z->imag = ip;
}
float cnorm (COMPLEX z)
{
return z.real*z.real + z.imag*z.imag;
}
1-) what it came from(especially arctan(1)) and
The code comment immediately above clues you in:
/* Constant factor -2 pi */
... although actually what is being computed is -2 pi / n (in the broader context of producing a complex number with that as the coefficient of its imaginary component). Observe that the tangent has value 1 for angles whose sine and cosine are equal. The angle that has that property and is in the range [0, pi) is pi / 4, so atan(1.0) * -8.0 is (a good approximation to) -2 pi.
2-) what its purpose of it.
It (or actually its additive inverse) appears in the DFT equation you presented, so it is natural that it appears in a function intended to implement that formula.
Here is the code with comments explaining it.
void slowft (float *x, COMPLEX *y, int n)
{
COMPLEX tmp, z1, z2, z3, z4;
int m, k;
/* Constant factor -2 pi */
cmplx (0.0, (float)(atan (1.0)/n * -8.0), &tmp);
/* atan(1) is π/4, so this sets tmp to -2πi/n. Note that the i
factor, the imaginary unit, comes from putting the expression in
the second argument, which gives the imaginary portion of the
complex number being assigned. (It is written as "j" in the
equation displayed in the question. That is because engineers use
"j" for i, having historically already used "i" for other purposes.)
*/
printf (" constant factor -2 pi %f ", (float)(atan (1.0)/n * -8.0));
for (m = 0; m<=n; m++)
{
NEXT();
// Well, that is a frightening thing to see in code. It is cryptic.
cmplx (x[0], 0.0, &(y[m]));
/* This starts to calculate a sum that will be accumulated in y[m].
The sum will be over k from 0 to n-1. For the first term, k is 0,
so -2πiwk/n will be 0. The coefficient is e to the power of that,
and e**0 is 1, so the first term is x[0] * 1, so we just put x[0]
diretly in y[m] with no multiplication.
*/
for (k=1; k<=n-1; k++)
// This adds the rest of the terms.
{
/* Exp (tmp*k*m) */
cmplx ((float)k, 0.0, &z2);
// This sets z2 to k.
cmult (tmp, z2, &z3);
/* This multiplies the -2πi/n from above with k, so it puts
-2πi/n from above, and This computes -2πik/n it in z3.
*/
cmplx ((float)m, 0.0, &z2);
// This sets z2 to m. m corresponds to the ω in the equation.
cmult (z2, z3, &z4);
// This multiplies m by -2πik/n, putting -2πiwk/n in z4.
cexp (z4, &z2);
/* This raises e to the power of -2πiwk/n, finishing the
coefficient of the term in the sum.
*/
/* *x[k] */
cmplx (x[k], 0.0, &z3);
// This sets z3 to x[k].
cmult (z2, z3, &z4);
// This multiplies x[k] by the coefficient, e**(-2πiwk/n).
/* + y[m] */
csum (y[m], z4, &z2);
/* This adds the term (z4) to the sum being accumulated (y[m])
and puts the updated sum in z2.
*/
y[m].real = z2.real; y[m].imag = z2.imag;
/* This moves the updated sum to y[m]. This is not necessary
because csum is passed its operands as values, so they are
copied when calling the function, and it is safe to update its
output. csum(y[m], z4, &y[m]) above would have worked. But
this works too.
*/
}
}
Standard C has support for complex arithmetic, so it would be easier and clearer to include <complex.h> and write code this way:
void slowft(float *x, complex float *y, int n)
{
static const float TwoPi = 0x3.243f6a8885a308d313198a2e03707344ap1f;
float t0 = -TwoPi/n;
for (int m = 0; m <=n; m++)
{
float t1 = t0*m;
y[m] = x[0];
for (int k = 1; k < n; k++)
y[m] += x[k] * cexpf(t1 * k * I);
}
}
Related
Actually, I think that I get the discrete Fourier transform some basics. And now I have some problems with the fast Fourier transform algorithm.
I don't want to share all the functions so as not to complicate the problem. But if you don't understand some parts I can edit the question.
Slow Fourier transform:
void slowft (float *x, COMPLEX *y, int n)
{
COMPLEX tmp, z1, z2, z3, z4;
int m, k;
/* Constant factor -2 pi */
cmplx (0.0, (float)(atan (1.0)/n * -8.0), &tmp);
printf (" constant factor -2 pi %f ", (float)(atan (1.0)/n * -8.0));
for (m = 0; m<=n; m++)
{
cmplx (x[0], 0.0, &(y[m]));
for (k=1; k<=n-1; k++)
{
/* Exp (tmp*k*m) */
cmplx ((float)k, 0.0, &z2);
cmult (tmp, z2, &z3);
cmplx ((float)m, 0.0, &z2);
cmult (z2, z3, &z4);
cexp (z4, &z2);
/* *x[k] */
cmplx (x[k], 0.0, &z3);
cmult (z2, z3, &z4);
/* + y[m] */
csum (y[m], z4, &z2);
y[m].real = z2.real; y[m].imag = z2.imag;
}
}
}
to make clear:
cmplx is creating a complete number, cmult is complex multiplication and cexp is taking exponent. that's all.
And some optimizations:
void newslowft (double *x, COMPLEX *y, int n)
{
COMPLEX tmp, z1, z2, z3, z4, *pre;
long m, k, i, p;
pre = (COMPLEX *)malloc(sizeof(struct cpx)*1024);
/* Constant factor -2 pi */
cmplx (0.0, atan (1.0)/n * -8.0, &z1);
cexp (z1, &tmp);
/* Pre-compute most of the exponential */
cmplx (1.0, 0.0, &z1); /* Z1 = 1.0; */
//n=1024
for (i=0; i<n; i++)
{
cmplx (z1.real, z1.imag, &(pre[i]));
cmult (z1, tmp, &z3);
cmplx (z3.real, z3.imag, &z1);
}
/* Double loop to compute all Y entries */
for (m = 0; m<n; m++)
{
cmplx (x[0], 0.0, &(y[m]));
for (k=1; k<=n-1; k++)
{
/* Exp (tmp*k*m) */
p = (k*m % n);
/* *x[k] */
cmplx (x[k], 0.0, &z3);
cmult (z3, pre[p], &z4);
/* + y[m] */
csum (y[m], z4, &z2);
y[m].real = z2.real;
y[m].imag = z2.imag;
}
}
}
The problem: The first step of the optimization:
"precalculating some exponential inside the loop".
So this is actually what I ask. How does this algorithm calculate the all exponential?
I think we are calculating the following exponentials: e^0 e^1 e^2.... e^1023 So where are the other exponentials?
I mean, in the first algorithm, inside the for loops we are using m (m=0; m<=1024; m ) and k(k=0; k<1023-1; k ) but, where is the e^1000*900?
As far as I understand, the second algorithm takes the mode according to n. I think this is the key point right? But I didn't get how to work?
Thanks in advance masters.
Follow-up question for IEEE 754 conformant sqrt() implementation for double type.
Context: Need to implement IEEE 754 conformant sqrtf() taking into account the following HW restrictions and usage limitations:
Provides a special instruction qseed.f to get an approximation of the reciprocal of the square root (the accuracy of the result is no less than 6.75 bits, and therefore always within ±1% of the accurate result).
Single precision FP:
a. Support by HW (SP FPU): has support;
b. Support by SW (library): has support;
c. Support of subnormal numbers: no support (FLT_HAS_SUBNORM is 0).
Double precision FP:
a. Support by HW (DP FPU): no support;
b. Support by SW (library): has support;
c. Support of subnormal numbers: no support (DBL_HAS_SUBNORM is 0).
I've found one presentation by John Harrison and ended up with this implementation (note that here qseed.f is replaced by rsqrtf()):
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
// https://github.com/nickzman/hyperspace/blob/master/frsqrt.hh
#if 1
float rsqrtf ( float x )
{
const float xhalf = 0.5f * x;
int i = *(int*) & x;
i = 0x5f375a86 - ( i >> 1 );
x = *(float*) & i;
x = x * ( 1.5f - xhalf * x * x );
x = x * ( 1.5f - xhalf * x * x );
x = x * ( 1.5f - xhalf * x * x );
return x;
}
#else
float rsqrtf ( float x )
{
return 1.0f / sqrtf( x );
}
#endif
float sqrtfr_jh( float x, float r )
{
/*
* John Harrison, Formal Verification Methods 5: Floating Point Verification,
* Intel Corporation, 12 December 2002, document name: slides5.pdf, page 14,
* slide "The square root algorithm".
* URL: https://www.cl.cam.ac.uk/~jrh13/slides/anu-09_12dec02/slides5.pdf
*/
double rd, b, z0, s0, d, k, h0, e, t0, s1, c, d1, h1, s;
static const double half = 0.5;
static const double one = 1.0;
static const double three = 3.0;
static const double two = 2.0;
rd = (double)r;
b = half * x;
z0 = rd * rd;
s0 = x * rd;
d = fma( -b, z0, half );
k = fma( x, rd, -s0 );
h0 = half * rd;
e = fma( three / two, d, one );
t0 = fma( d, s0, k );
s1 = fma( e, t0, s0 );
c = fma( d, e, one );
d1 = fma( -s1, s1, x );
h1 = c * h0;
s = fma( d1, h1, s1 );
return (float)s;
}
float my_sqrtf( float x )
{
/* handle special cases */
if (x == 0) {
return x + x;
}
/* handle normal cases */
if ((x > 0) && (x < INFINITY)) {
return sqrtfr_jh( x, rsqrtf( x ) );
}
/* handle special cases */
return (x < 0) ? NAN : (x + x);
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...#gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components, each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC), period (2^121+2^63-1)
Xorshift (XSH), period 2^64-1
Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c, \
kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64 (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
kiss64_y ^= (kiss64_y << 43))
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)
int main (void)
{
const uint64_t N = 10000000000ULL; /* desired number of test cases */
float arg, ref, res;
uint64_t argi64;
uint32_t refi, resi;
uint64_t count = 0;
float spec[] = {0.0f, 1.0f, INFINITY, NAN};
printf ("test a few special cases:\n");
for (int i = 0; i < sizeof (spec)/sizeof(spec[0]); i++) {
printf ("my_sqrt(%a) = %a\n", spec[i], my_sqrtf(spec[i]));
printf ("my_sqrt(%a) = %a\n", -spec[i], my_sqrtf(-spec[i]));
}
printf ("test %lu random cases:\n", N);
do {
argi64 = KISS64;
memcpy (&arg, &argi64, sizeof arg);
if ( fpclassify(arg) == FP_SUBNORMAL )
{
continue;
}
++count;
res = my_sqrtf (arg);
ref = sqrtf (arg);
memcpy (&resi, &res, sizeof resi);
memcpy (&refi, &ref, sizeof refi);
if ( ! ( isnan(res) && isnan(ref) ) )
if (resi != refi) {
printf ("\rerror # arg=%a (%e)\n", arg, arg);
printf ("\rerror # res=%a (%e)\n", res, res);
printf ("\rerror # ref=%a (%e)\n", ref, ref);
return EXIT_FAILURE;
}
if ((count & 0xfffff) == 0) printf ("\r[%lu]", count);
} while (count < N);
printf ("\r[%lu]", count);
printf ("\ntests PASSED\n");
return EXIT_SUCCESS;
}
And it seems to work correctly (at least for some random cases): it reports:
[10000000000]
tests PASSED
Now the question: since the original John Harrison's sqrtf() algorithm uses only single precision computations (i.e. type float), it is possible to reduce the number of operations when using only (except conversions) double precision computations (i.e. type double) and still be IEEE 754 conformant?
P.S. Since users #njuffa and #chux - Reinstate Monica are strong in FP, I invite them to participate. However, all the competent in FP users are welcome.
Computing a single-precision square root via double-precision code is going to be inefficient, especially if the hardware provides no native double-precision operations.
The following assumes hardware that conforms to IEEE-754 (2008), except that subnormals are not supported and flushed to zero. Fused-multiply add (FMA) is supported. It further assumes an ISO-C99 compiler that maps float to IEEE-754 binary32, and that maps the hardware's single-precision FMA instruction to the standard math function fmaf().
From a hardware starting approximation for the reciprocal square root with a maximum relative error of 2-6.75 one can get to a reciprocal square root accurate to 1 single-precision ulp with two Newton-Raphson iterations. Multiplying this with the original argument provides an accurate estimate of the square root. The square of this approximation is subtracted from the orginal argument to compute the approximation error for the square root. This error is then used to apply a correction to the square root approximation, resulting in a correctly-rounded square root.
However, this straightforward algorithm breaks down for arguments that are very small due to underflow or overflow in intermediate computation, in particular when the underlying arithmetic operates in flash-to-zero mode that flushes subnormals to zero. For such arguments we can construct a slowpath code that scales the input towards unity, and scales back the result accordingly once the square root has been computed. Code for handling special operands such as zeros, infinities, NaNs, and negative arguments other than zero is also added to this slowpath code.
The NaN generated by the slowpath code for invalid operations should be adjusted to match the system's existing operations. For example, for x86-based systems this would be a special QNaN called INDEFINITE, with a bit pattern of 0xffc00000, while for a GPU running CUDA it would be the canonical single-precision NaN with a bit pattern of 0x7fffffff.
For performance reasons it may be useful to inline the fastpath code while making the slowpath code a called outlined subroutine. Single-precision math functions with a single argument should always be tested exhaustively against a "golden" reference implementation, which takes just minutes on modern hardware.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
float uint32_as_float (uint32_t);
uint32_t float_as_uint32 (float);
float qseedf (float);
float sqrtf_slowpath (float);
/* Square root computation for IEEE-754 binary32 mapped to 'float' */
float my_sqrtf (float arg)
{
const uint32_t upper = float_as_uint32 (0x1.fffffep+127f);
const uint32_t lower = float_as_uint32 (0x1.000000p-102f);
float rsq, sqt, err;
/* use fastpath computation if argument in [0x1.0p-102, 0x1.0p+128) */
if ((float_as_uint32 (arg) - lower) <= (upper - lower)) {
/* generate low-accuracy approximation to rsqrt(arg) */
rsq = qseedf (arg);
/* apply two Newton-Raphson iterations with quadratic convergence */
rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
/* compute sqrt from rsqrt, round result to nearest or even */
sqt = rsq * arg;
err = fmaf (sqt, -sqt, arg);
sqt = fmaf (0.5f * rsq, err, sqt);
} else {
sqt = sqrtf_slowpath (arg);
}
return sqt;
}
/* interprete bit pattern of 32-bit unsigned integer as IEEE-754 binary32 */
float uint32_as_float (uint32_t a)
{
float r;
memcpy (&r, &a, sizeof r);
return r;
}
/* interprete bit pattern of IEEE-754 binary32 as a 32-bit unsigned integer */
uint32_t float_as_uint32 (float a)
{
uint32_t r;
memcpy (&r, &a, sizeof r);
return r;
}
/* simulate low-accuracy hardware approximation to 1/sqrt(a) */
float qseedf (float a)
{
float r = 1.0f / sqrtf (a);
r = uint32_as_float (float_as_uint32 (r) & ~0x1ffff);
return r;
}
/* square root computation suitable for all IEEE-754 binary32 arguments */
float sqrtf_slowpath (float arg)
{
const float FP32_INFINITY = uint32_as_float (0x7f800000);
const float FP32_QNAN = uint32_as_float (0xffc00000); /* system specific */
const float scale_in = 0x1.0p+26f;
const float scale_out = 0x1.0p-13f;
float rsq, err, sqt;
if (arg < 0.0f) {
return FP32_QNAN;
} else if ((arg == 0.0f) || !(fabsf (arg) < FP32_INFINITY)) { /* Inf, NaN */
return arg + arg;
} else {
/* scale subnormal arguments towards unity */
arg = arg * scale_in;
/* generate low-accuracy approximation to rsqrt(arg) */
rsq = qseedf (arg);
/* apply two Newton-Raphson iterations with quadratic convergence */
rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
/* compute sqrt from rsqrt, round to nearest or even */
sqt = rsq * arg;
err = fmaf (sqt, -sqt, arg);
sqt = fmaf (0.5f * rsq, err, sqt);
/* compensate scaling of argument by counter-scaling the result */
sqt = sqt * scale_out;
return sqt;
}
}
int main (void)
{
uint32_t ai, resi, refi;
float a, res, reff;
double ref;
ai = 0x00000000;
do {
a = uint32_as_float (ai);
res = my_sqrtf (a);
ref = sqrt ((double)a);
reff = (float)ref;
resi = float_as_uint32 (res);
refi = float_as_uint32 (reff);
if (resi != refi) {
printf ("error # %08x %15.8e res=%08x %15.8e ref=%08x %15.8e\n",
ai, a, resi, res, refi, reff);
return EXIT_FAILURE;
}
ai++;
} while (ai);
return EXIT_SUCCESS;
}
I'm working on a minimal ray tracer in C, and I've written a ray tracer a little while ago so I understand the theory behind them, just wanted to do a rewrite for cleanup purposes.
I have the necessary elements for a ray tracer, and nothing more. I've written triangle intersection, transforming pixel space coordinates to NDC (with aspect ratio and FOV accounted for), and writing out the frame buffer.
However, it does not work as expected. The image is entirely black when it should be rendering a single triangle. I've tested writing a single test pixel, and it works fine so I know it isn't an issue with the image writing code.
I've double and triple-checked the code behind the math, and it looks fine to me. Intersection code is basically a duplicate of the source code in the original Moller-Trumbore paper:
/* ray triangle intersection */
bool ray_triangle_intersect(double orig[3], double dir[3], double vert0[3],
double vert1[3], double vert2[3], double* t, double* u, double* v) {
double edge1[3], edge2[3];
double tvec[3], pvec[3], qvec[3];
double det, inv_det;
/* edges */
SUB(edge1, vert1, vert0);
SUB(edge2, vert2, vert0);
/* determinant */
CROSS(pvec, dir, edge2);
/* ray in plane of triangle if near zero */
det = DOT(edge1, pvec);
if(det < EPSILON)
return 0;
SUB(tvec, orig, vert0);
inv_det = 1.0 / det;
/* calculate, check bounds */
*u = DOT(tvec, pvec) * inv_det;
if(*u < 0.0 || *u > 1.0)
return 0;
CROSS(qvec, tvec, edge1);
/* calculate, check bounds */
*v = DOT(dir, qvec) * inv_det;
if(*v < 0.0 || *u + *v > 1.0)
return 0;
*t = DOT(edge2, qvec) * inv_det;
return 1;
}
CROSS, DOT, and SUB are just macros:
#define CROSS(v,v0,v1) \
v[0] = v0[1] * v1[2] - v0[2] * v1[1]; \
v[1] = v0[2] * v1[0] - v0[0] * v1[2]; \
v[2] = v0[0] * v1[1] - v0[1] * v1[0];
#define DOT(v0,v1) (v0[0] * v1[0] + v0[1] * v1[1] + v0[2] * v1[2])
/* v = v0 - v1 */
#define SUB(v,v0,v1) \
v[0] = v0[0] - v1[0]; \
v[1] = v0[1] - v1[1]; \
v[2] = v0[2] - v1[2];
Transformation code is as follows:
double ndc[2];
screen_to_ndc(x, y, &ndc[0], &ndc[1]);
double dir[3];
dir[0] = ndc[0] * ar * tfov;
dir[1] = ndc[1] * tfov;
dir[2] = -1;
norm(dir);
And screen_to_ndc:
void screen_to_ndc(unsigned int x, unsigned int y, double* ndcx, double* ndcy) {
*ndcx = 2 * (((double) x + (1.0 / 2.0)) / (double) WIDTH) - 1;
*ndcy = 1 - 2 * (((double) y + (1.0 / 2.0)) / (double) HEIGHT);
}
Any help would be appreciated.
Try reversing the orientation of your triangle. Your ray-triangle intersection code culls backfaces because it returns early when det is negative.
Important : i'm a beginner in ML and i want to implement, by myself, the algorithms i'm learning, without using ML libraries.
I have a dataset with the price (y) for the number of km (x), and i want to find the function that describe datas.
You can find the dataset and the entire code here : https://wetransfer.com/downloads/034d9918f6d29268f06be45d76e156f420190330174420/6af73b
I'm using a classic gradient descent algorithm : my code works and converge well for some single linear regression problems, but not the one that interest me.
/* Classic gradient descent algorithm */
ft_sum(double *x, double *y, long double theta0, long double theta1, int epoch, int truth)
{
long double result = 0.00;
long double tmp;
int i;
i = 0;
while (epoch--)
{
/* Derivative part of the gradient descent */
tmp = ((x[i] * theta1 + theta0)) - (y[i]);
if (truth == 1)
tmp = tmp * (x[i]);
result += tmp;
i++;
}
return (result);
}
/* Linear regression */
void single_linear_regression(double *x, double *y, double epoch, char *argv)
{
long double theta0 = 0; /* bias */
long double theta1 = 0; /* weight */
long double error = 100; /* Cost of the function */
long double tmp1;
long double tmp2;
double alpha = 0.0000000001; /* with higher learning rate it does not converge */
int i = 0;
while (!(error > -0.4 && error < 0.4)) // it doesn't go below 0.4
{
tmp1 = theta0 - ((alpha * (1.00 / epoch) *
(error = ft_sum(x, y, theta0, theta1, epoch - 1, 0))));
tmp2 = theta1 - ((alpha * (1.00 / epoch) *
(error = ft_sum(x, y, theta0, theta1, epoch - 1, 1))));
theta0 = tmp1;
theta1 = tmp2;
printf("error := %Lf\n", error);
}
printf("error := %Lf | theta0 == %Lf | theta1 == %Lf\n", error, theta0, theta1);
}
At the end, i have :
error := 0.240723 | theta0 == 0.000004 | theta1 == 0.044168
(f(x) = 0.044x + 0.000004)
When the actual function is : -0.02x + 8500...
I have already tried normalizing datas [0-1], changing starting values of the weight and the bias, and i'm really stuck on this.
I am trying to calculate eigenvalues using the TQLI algorithm that I got from the website of the CACS of the University of Southern California. My test script looks like this:
#include <stdio.h>
int main()
{
int i;
i = rand();
printf("My random number: %d\n", i);
float d[4] = {
{1, 2, 3, 4}
};
float e[4] = {
{0, 0, 0, 0}
};
float z[4][4] = {
{1.0, 0.0, 0.0, 0.0} ,
{0.0, 1.0, 0.0, 0.0} ,
{0.0, 0.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 1.0}
};
double *zptr;
zptr = &z[0][0];
printf("Element [2][1] of identity matrix: %f\n", z[2][1]);
printf("Element [2][2] of identity matrix: %f\n", z[2][2]);
tqli(d, e, 4, zptr);
printf("First eigenvalue: %f\n", d[0]);
return 0;
}
When I try to run this script I get a segmentation fault error as you can see in here. At what location does my code produce this segmentation fault. As I believe the code from USC is bug-free I am pretty sure the mistake must be in my call of the function. However I can't see where I made a mistake in my set-up of the arrays as in my opinion I followed the instructions.
Eigenvalue calculation using TQLI algorithm fails with segmentation
fault
Segmentation fault comes from crossing the supplied array boundary. tqli requires specific data preparation.
1) The eigen code from CACS is Fortran based and counts indexes from 1.
2) The tqli expects double pointer for its matrix and double vectors.
/******************************************************************************/
void tqli(double d[], double e[], int n, double **z)
/*******************************************************************************
d, and e should be declared as double.
3) The program needs modification in respect to the data preparation for the above function.
Helper 1-index based vectors have to be created to supply properly formatted data for the tqli:
double z[NP][NP] = { {2, 0, 0}, {0, 4, 0}, {0, 0, 2} } ;
double **a;
double *d,*e,*f;
d=dvector(1,NP); // 1-index based vector
e=dvector(1,NP);
f=dvector(1,NP);
a=dmatrix(1,NP,1,NP); // 1-index based matrix
for (i=1;i<=NP;i++) // loading data from zero besed `ze` to `a`
for (j=1;j<=NP;j++) a[i][j]=z[i-1][j-1];
Complete test program is supplied below. It uses the eigen code from CACS:
/*******************************************************************************
Eigenvalue solvers, tred2 and tqli, from "Numerical Recipes in C" (Cambridge
Univ. Press) by W.H. Press, S.A. Teukolsky, W.T. Vetterling, and B.P. Flannery
*******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define NR_END 1
#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
double **dmatrix(int nrl, int nrh, int ncl, int nch)
/* allocate a double matrix with subscript range m[nrl..nrh][ncl..nch] */
{
int i,nrow=nrh-nrl+1,ncol=nch-ncl+1;
double **m;
/* allocate pointers to rows */
m=(double **) malloc((size_t)((nrow+NR_END)*sizeof(double*)));
m += NR_END;
m -= nrl;
/* allocate rows and set pointers to them */
m[nrl]=(double *) malloc((size_t)((nrow*ncol+NR_END)*sizeof(double)));
m[nrl] += NR_END;
m[nrl] -= ncl;
for(i=nrl+1;i<=nrh;i++) m[i]=m[i-1]+ncol;
/* return pointer to array of pointers to rows */
return m;
}
double *dvector(int nl, int nh)
/* allocate a double vector with subscript range v[nl..nh] */
{
double *v;
v=(double *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(double)));
return v-nl+NR_END;
}
/******************************************************************************/
void tred2(double **a, int n, double d[], double e[])
/*******************************************************************************
Householder reduction of a real, symmetric matrix a[1..n][1..n].
On output, a is replaced by the orthogonal matrix Q effecting the
transformation. d[1..n] returns the diagonal elements of the tridiagonal matrix,
and e[1..n] the off-diagonal elements, with e[1]=0. Several statements, as noted
in comments, can be omitted if only eigenvalues are to be found, in which case a
contains no useful information on output. Otherwise they are to be included.
*******************************************************************************/
{
int l,k,j,i;
double scale,hh,h,g,f;
for (i=n;i>=2;i--) {
l=i-1;
h=scale=0.0;
if (l > 1) {
for (k=1;k<=l;k++)
scale += fabs(a[i][k]);
if (scale == 0.0) /* Skip transformation. */
e[i]=a[i][l];
else {
for (k=1;k<=l;k++) {
a[i][k] /= scale; /* Use scaled a's for transformation. */
h += a[i][k]*a[i][k]; /* Form sigma in h. */
}
f=a[i][l];
g=(f >= 0.0 ? -sqrt(h) : sqrt(h));
e[i]=scale*g;
h -= f*g; /* Now h is equation (11.2.4). */
a[i][l]=f-g; /* Store u in the ith row of a. */
f=0.0;
for (j=1;j<=l;j++) {
/* Next statement can be omitted if eigenvectors not wanted */
a[j][i]=a[i][j]/h; /* Store u/H in ith column of a. */
g=0.0; /* Form an element of A.u in g. */
for (k=1;k<=j;k++)
g += a[j][k]*a[i][k];
for (k=j+1;k<=l;k++)
g += a[k][j]*a[i][k];
e[j]=g/h; /* Form element of p in temporarily unused element of e. */
f += e[j]*a[i][j];
}
hh=f/(h+h); /* Form K, equation (11.2.11). */
for (j=1;j<=l;j++) { /* Form q and store in e overwriting p. */
f=a[i][j];
e[j]=g=e[j]-hh*f;
for (k=1;k<=j;k++) /* Reduce a, equation (11.2.13). */
a[j][k] -= (f*e[k]+g*a[i][k]);
}
}
} else
e[i]=a[i][l];
d[i]=h;
}
/* Next statement can be omitted if eigenvectors not wanted */
d[1]=0.0;
e[1]=0.0;
/* Contents of this loop can be omitted if eigenvectors not
wanted except for statement d[i]=a[i][i]; */
for (i=1;i<=n;i++) { /* Begin accumulation of transformation matrices. */
l=i-1;
if (d[i]) { /* This block skipped when i=1. */
for (j=1;j<=l;j++) {
g=0.0;
for (k=1;k<=l;k++) /* Use u and u/H stored in a to form P.Q. */
g += a[i][k]*a[k][j];
for (k=1;k<=l;k++)
a[k][j] -= g*a[k][i];
}
}
d[i]=a[i][i]; /* This statement remains. */
a[i][i]=1.0; /* Reset row and column of a to identity matrix for next iteration. */
for (j=1;j<=l;j++) a[j][i]=a[i][j]=0.0;
}
}
/******************************************************************************/
void tqli(double d[], double e[], int n, double **z)
/*******************************************************************************
QL algorithm with implicit shifts, to determine the eigenvalues and eigenvectors
of a real, symmetric, tridiagonal matrix, or of a real, symmetric matrix
previously reduced by tred2 sec. 11.2. On input, d[1..n] contains the diagonal
elements of the tridiagonal matrix. On output, it returns the eigenvalues. The
vector e[1..n] inputs the subdiagonal elements of the tridiagonal matrix, with
e[1] arbitrary. On output e is destroyed. When finding only the eigenvalues,
several lines may be omitted, as noted in the comments. If the eigenvectors of
a tridiagonal matrix are desired, the matrix z[1..n][1..n] is input as the
identity matrix. If the eigenvectors of a matrix that has been reduced by tred2
are required, then z is input as the matrix output by tred2. In either case,
the kth column of z returns the normalized eigenvector corresponding to d[k].
*******************************************************************************/
{
double pythag(double a, double b);
int m,l,iter,i,k;
double s,r,p,g,f,dd,c,b;
for (i=2;i<=n;i++) e[i-1]=e[i]; /* Convenient to renumber the elements of e. */
e[n]=0.0;
for (l=1;l<=n;l++) {
iter=0;
do {
for (m=l;m<=n-1;m++) { /* Look for a single small subdiagonal element to split the matrix. */
dd=fabs(d[m])+fabs(d[m+1]);
if ((double)(fabs(e[m])+dd) == dd) break;
}
if (m != l) {
if (iter++ == 30) printf("Too many iterations in tqli");
g=(d[l+1]-d[l])/(2.0*e[l]); /* Form shift. */
r=pythag(g,1.0);
g=d[m]-d[l]+e[l]/(g+SIGN(r,g)); /* This is dm - ks. */
s=c=1.0;
p=0.0;
for (i=m-1;i>=l;i--) { /* A plane rotation as in the original QL, followed by Givens */
f=s*e[i]; /* rotations to restore tridiagonal form. */
b=c*e[i];
e[i+1]=(r=pythag(f,g));
if (r == 0.0) { /* Recover from underflow. */
d[i+1] -= p;
e[m]=0.0;
break;
}
s=f/r;
c=g/r;
g=d[i+1]-p;
r=(d[i]-g)*s+2.0*c*b;
d[i+1]=g+(p=s*r);
g=c*r-b;
/* Next loop can be omitted if eigenvectors not wanted */
for (k=1;k<=n;k++) { /* Form eigenvectors. */
f=z[k][i+1];
z[k][i+1]=s*z[k][i]+c*f;
z[k][i]=c*z[k][i]-s*f;
}
}
if (r == 0.0 && i >= l) continue;
d[l] -= p;
e[l]=g;
e[m]=0.0;
}
} while (m != l);
}
}
/******************************************************************************/
double pythag(double a, double b)
/*******************************************************************************
Computes (a2 + b2)1/2 without destructive underflow or overflow.
*******************************************************************************/
{
double absa,absb;
absa=fabs(a);
absb=fabs(b);
if (absa > absb) return absa*sqrt(1.0+(absb/absa)*(absb/absa));
else return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+(absa/absb)*(absa/absb)));
}
#define NP 3
#define TINY 1.0e-6
double sqrt(double x)
{
union
{
int i;
double x;
} u;
u.x = x;
u.i = (1<<29) + (u.i >> 1) - (1<<22);
return u.x;
}
int main()
{
int i,j,k;
double ze[NP][NP] = { {2, 0, 0}, {0, 4, 0}, {0, 0, 2} } ;
double **a;
double *d,*e,*f;
d=dvector(1,NP);
e=dvector(1,NP);
f=dvector(1,NP);
a=dmatrix(1,NP,1,NP);
for (i=1;i<=NP;i++)
for (j=1;j<=NP;j++) a[i][j]=ze[i-1][j-1];
tred2(a,NP,d,e);
tqli(d,e,NP,a);
printf("\nEigenvectors for a real symmetric matrix:\n");
for (i=1;i<=NP;i++) {
for (j=1;j<=NP;j++) {
f[j]=0.0;
for (k=1;k<=NP;k++)
f[j] += (ze[j-1][k-1]*a[k][i]);
}
printf("%s %3d %s %10.6f\n","\neigenvalue",i," =",d[i]);
printf("%11s %14s %9s\n","vector","mtrx*vect.","ratio");
for (j=1;j<=NP;j++) {
if (fabs(a[j][i]) < TINY)
printf("%12.6f %12.6f %12s\n",
a[j][i],f[j],"div. by 0");
else
printf("%12.6f %12.6f %12.6f\n",
a[j][i],f[j],f[j]/a[j][i]);
}
}
//free_dmatrix(a,1,NP,1,NP);
//free_dvector(f,1,NP);
//free_dvector(e,1,NP);
//free_dvector(d,1,NP);
return 0;
}
Output:
Eigenvectors for a real symmetric matrix:
eigenvalue 1 = 2.000000
vector mtrx*vect. ratio
1.000000 2.000000 2.000000
0.000000 0.000000 div. by 0
0.000000 0.000000 div. by 0
eigenvalue 2 = 4.000000
vector mtrx*vect. ratio
0.000000 0.000000 div. by 0
1.000000 4.000000 4.000000
0.000000 0.000000 div. by 0
eigenvalue 3 = 2.000000
vector mtrx*vect. ratio
0.000000 0.000000 div. by 0
0.000000 0.000000 div. by 0
1.000000 2.000000 2.000000
I hope it finaly helps to clarify confusion regarding the data preparation for tqli.