Logistic regression code stops working above ~43,500 generated observations

Logistic regression code stops working above ~43,500 generated observations - c

Having some difficulty troubleshooting code I wrote in C to perform a logistic regression. While it seems to work on smaller, semi-randomized datasets, it stops working (e.g. assigning proper probabilities of belonging to class 1) at around the point where I pass 43,500 observations (determined by tweaking the number of observations created. When creating the 150 features used in the code, I do create the first two as a function of the number of observations, so I'm not sure if maybe that's the issue here, though I am using double precision. Maybe there's an overflow somewhere in the code?
The below code should be self-contained; it generates m=50,000 observations with n=150 features. Setting m below 43,500 should return "Percent class 1: 0.250000", setting to 44,000 or above will return "Percent class 1: 0.000000", regardless of what max_iter (number of times we sample m observations) is set to.
The first feature is set to 1.0 divided by the total number of observations, if class 0 (first 75% of observations), or the index of the observation divided by the total number of observations otherwise.
The second feature is just index divided by total number of observations.
All other features are random.
The logistic regression is intended to use stochastic gradient descent, randomly selecting an observation index, computing the gradient of the loss with the predicted y using current weights, and updating weights with the gradient and learning rate (eta).
Using the same initialization with Python and NumPy, I still get the proper results, even above 50,000 observations.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
// Compute z = w * x + b
double dlc( int n, double *X, double *coef, double intercept )
{
double y_pred = intercept;
for (int i = 0; i < n; i++)
{
y_pred += X[i] * coef[i];
}
return y_pred;
}
// Compute y_hat = 1 / (1 + e^(-z))
double sigmoid( int n, double alpha, double *X, double *coef, double beta, double intercept )
{
double y_pred;
y_pred = dlc(n, X, coef, intercept);
y_pred = 1.0 / (1.0 + exp(-y_pred));
return y_pred;
}
// Stochastic gradient descent
void sgd( int m, int n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept, int random_seed )
{
double *gradient_coef, *X_i;
double y_i, y_pred, resid;
int idx;
double gradient_intercept = 0.0, alpha = 1.0, beta = 1.0;
X_i = (double *) malloc (n * sizeof(double));
gradient_coef = (double *) malloc (n * sizeof(double));
for ( int i = 0; i < n; i++ )
{
coef[i] = 0.0;
gradient_coef[i] = 0.0;
}
*intercept = 0.0;
srand(random_seed);
for ( int epoch = 0; epoch < max_iter; epoch++ )
{
for ( int run = 0; run < m; run++ )
{
// Randomly sample an observation
idx = rand() % m;
for ( int i = 0; i < n; i++ )
{
X_i[i] = X[n*idx+i];
}
y_i = y[idx];
// Compute y_hat
y_pred = sigmoid( n, alpha, X_i, coef, beta, *intercept );
resid = -(y_i - y_pred);
// Compute gradients and adjust weights
for (int i = 0; i < n; i++)
{
gradient_coef[i] = X_i[i] * resid;
coef[i] -= eta * gradient_coef[i];
}
if ( fit_intercept == 1 )
{
*intercept -= eta * resid;
}
}
}
}
int main(void)
{
double *X, *y, *coef, *y_pred;
double intercept;
double eta = 0.05;
double alpha = 1.0, beta = 1.0;
long m = 50000;
long n = 150;
int max_iter = 20;
long class_0 = (long)(3.0 / 4.0 * (double)m);
double pct_class_1 = 0.0;
clock_t test_start;
clock_t test_end;
double test_time;
printf("Constructing variables...\n");
X = (double *) malloc (m * n * sizeof(double));
y = (double *) malloc (m * sizeof(double));
y_pred = (double *) malloc (m * sizeof(double));
coef = (double *) malloc (n * sizeof(double));
// Initialize classes
for (int i = 0; i < m; i++)
{
if (i < class_0)
{
y[i] = 0.0;
}
else
{
y[i] = 1.0;
}
}
// Initialize observation features
for (int i = 0; i < m; i++)
{
if (i < class_0)
{
X[n*i] = 1.0 / (double)m;
}
else
{
X[n*i] = (double)i / (double)m;
}
X[n*i + 1] = (double)i / (double)m;
for (int j = 2; j < n; j++)
{
X[n*i + j] = (double)(rand() % 100) / 100.0;
}
}
// Fit weights
printf("Running SGD...\n");
test_start = clock();
sgd( m, n, X, y, coef, &intercept, eta, max_iter, 1, 42 );
test_end = clock();
test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
printf("Time taken: %f\n", test_time);
// Compute y_hat and share of observations predicted as class 1
printf("Making predictions...\n");
for ( int i = 0; i < m; i++ )
{
y_pred[i] = sigmoid( n, alpha, &X[i*n], coef, beta, intercept );
}
printf("Printing results...\n");
for ( int i = 0; i < m; i++ )
{
//printf("%f\n", y_pred[i]);
if (y_pred[i] > 0.5)
{
pct_class_1 += 1.0;
}
// Troubleshooting print
if (i < 10 || i > m - 10)
{
printf("%g\n", y_pred[i]);
}
}
printf("Percent class 1: %f", pct_class_1 / (double)m);
return 0;
}
For reference, here is my (presumably) equivalent Python code, which returns the correct percent of identified classes at more than 50,000 observations:
import numpy as np
import time
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class LogisticRegressor:
def __init__(self, eta, init_runs, fit_intercept=True):
self.eta = eta
self.init_runs = init_runs
self.fit_intercept = fit_intercept
def fit(self, x, y):
m, n = x.shape
self.coef = np.zeros((n, 1))
self.intercept = np.zeros((1, 1))
for epoch in range(self.init_runs):
for run in range(m):
idx = np.random.randint(0, m)
x_i = x[idx:idx+1, :]
y_i = y[idx]
y_pred_i = sigmoid(x_i.dot(self.coef) + self.intercept)
gradient_w = -(x_i.T * (y_i - y_pred_i))
self.coef -= self.eta * gradient_w
if self.fit_intercept:
gradient_b = -(y_i - y_pred_i)
self.intercept -= self.eta * gradient_b
def predict_proba(self, x):
m, n = x.shape
y_pred = np.ones((m, 2))
y_pred[:,1:2] = sigmoid(x.dot(self.coef) + self.intercept)
y_pred[:,0:1] -= y_pred[:,1:2]
return y_pred
def predict(self, x):
return np.round(sigmoid(x.dot(self.coef) + self.intercept))
m = 50000
n = 150
class1 = int(3.0 / 4.0 * m)
X = np.random.rand(m, n)
y = np.zeros((m, 1))
for obs in range(m):
if obs < class1:
continue
else:
y[obs,0] = 1
for obs in range(m):
if obs < class1:
X[obs, 0] = 1.0 / float(m)
else:
X[obs, 0] = float(obs) / float(m)
X[obs, 1] = float(obs) / float(m)
logit = LogisticRegressor(0.05, 20)
start_time = time.time()
logit.fit(X, y)
end_time = time.time()
print(round(end_time - start_time, 2))
y_pred = logit.predict(X)
print("Percent:", y_pred.sum() / len(y_pred))

The issue is here:
// Randomly sample an observation
idx = rand() % m;
... in light of the fact that the OP's RAND_MAX is 32767. This is exacerbated by the fact that all of the class 0 observations are at the end.
All samples will be drawn from the first 32768 observations, and when the total number of observations is greater than that, the proportion of class 0 observations among those that can be sampled is less than 0.25. At 43691 total observations, there are no class 0 observations among those that can be sampled.
As a secondary issue, rand() % m does not yield a wholly uniform distribution if m does not evenly divide RAND_MAX + 1, though the effect of this issue will be much more subtle.
Bottom line: you need a better random number generator.
At minimum, you could consider combining the bits from two calls to rand() to yield an integer with sufficient range, but you might want to consider getting a third-party generator. There are several available.

Note: OP reports "m=50,000 observations with n=150 features.", so perhaps this is not the issue for OP, but I'll leave this answer up for reference when OP tries larger tasks.
A potential issue:
long overflow
m * n * sizeof(double) risks overflow when long is 32-bit and m*n > LONG_MAX (or about 46,341 if m, n are the same).
OP does report
A first step is to perform the multiplication using size_t math where we gain at least 1 more bit in the calculation.
// m * n * sizeof(double)
sizeof(double) * m * n
Yet unless OP's size_t is more than 32-bit, we still have trouble.
IAC, I recommend to use size_t for array sizing and indexing.
Check allocations for failure too.

Since RAND_MAX may be too small and array indexing should be done using size_t math, consider a helper function to generate a random index over the entire size_t range.
// idx = rand() % m;
size_t idx = rand_size_t() % (size_t)m;
If stuck with the standard rand(), below is a helper function to extend its range as needed.
It uses the real nifty IMAX_BITS(m).
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
// https://stackoverflow.com/a/4589384/2410359
/* Number of bits in inttype_MAX, or in any (1<<k)-1 where 0 <= k < 2040 */
#define IMAX_BITS(m) ((m)/((m)%255+1) / 255%255*8 + 7-86/((m)%255+12))
// Test that RAND_MAX is a power of 2 minus 1
_Static_assert((RAND_MAX & 1) && ((RAND_MAX/2 + 1) & (RAND_MAX/2)) == 0, "RAND_MAX is not a Mersenne number");
#define RAND_MAX_WIDTH (IMAX_BITS(RAND_MAX))
#define SIZE_MAX_WIDTH (IMAX_BITS(SIZE_MAX))
size_t rand_size_t(void) {
size_t index = (size_t) rand();
for (unsigned i = RAND_MAX_WIDTH; i < SIZE_MAX_WIDTH; i += RAND_MAX_WIDTH) {
index <<= RAND_MAX_WIDTH;
index ^= (size_t) rand();
}
return index;
}
Further considerations can replace the rand_size_t() % (size_t)m with a more uniform distribution.

As has been determined elsewhere, the problem is due to the implementation's RAND_MAX value being too small.
Assuming 32-bit ints, a slightly better PRNG function can be implemented in the code, such as this C implementation of the minstd_rand() function from C++:
#define MINSTD_RAND_MAX 2147483646
// Code assumes `int` is at least 32 bits wide.
static unsigned int minstd_seed = 1;
static void minstd_srand(unsigned int seed)
{
seed %= 2147483647;
// zero seed is bad!
minstd_seed = seed ? seed : 1;
}
static int minstd_rand(void)
{
minstd_seed = (unsigned long long)minstd_seed * 48271 % 2147483647;
return (int)minstd_seed;
}
Another problem is that expressions of the form rand() % m produce a biased result when m does not divide (unsigned int)RAND_MAX + 1. Here is an unbiased function that returns a random integer from 0 to le inclusive, making use of the minstd_rand() function defined earlier:
static int minstd_rand_max(int le)
{
int r;
if (le < 0)
{
r = le;
}
else if (le >= MINSTD_RAND_MAX)
{
r = minstd_rand();
}
else
{
int rm = MINSTD_RAND_MAX - le + MINSTD_RAND_MAX % (le + 1);
while ((r = minstd_rand()) > rm)
{
}
r /= (rm / (le + 1) + 1);
}
return r;
}
(Actually, it does still have a very small bias because minstd_rand() will never return 0.)
For example, replace rand() % 100 with minstd_rand_max(99), and replace rand() % m with minstd_rand_max(m - 1). Also replace srand(random_seed) with minstd_srand(random_seed).

Related

Taylor Series in C (problem with sin(240) and sin(300))

#include <stdio.h>
#include <math.h>
const int TERMS = 7;
const float PI = 3.14159265358979;
int fact(int n) {
return n<= 0 ? 1 : n * fact(n-1);
}
double sine(int x) {
double rad = x * (PI / 180);
double sin = 0;
int n;
for(n = 0; n < TERMS; n++) { // That's Taylor series!!
sin += pow(-1, n) * pow(rad, (2 * n) + 1)/ fact((2 * n) + 1);
}
return sin;
}
double cosine(int x) {
double rad = x * (PI / 180);
double cos = 0;
int n;
for(n = 0; n < TERMS; n++) { // That's also Taylor series!
cos += pow(-1, n) * pow(rad, 2 * n) / fact(2 * n);
}
return cos;
}
int main(void){
int y;
scanf("%d",&y);
printf("sine(%d)= %lf\n",y, sine(y));
printf("cosine(%d)= %lf\n",y, cosine(y));
return 0;
}
The code above was implemented to compute sine and cosine using Taylor series.
I tried testing the code and it works fine for sine(120).
I am getting wrong answers for sine(240) and sine(300).
Can anyone help me find out why those errors occur?

You should calculate the functions in the first quadrant only [0, pi/2). Exploit the properties of the functions to get the values for other angles. For instance, for values of x between [pi/2, pi), sin(x) can be calculated by sin(pi - x).
The sine of 120 degrees, which is 40 past 90 degrees, is the same as 50 degrees: 40 degrees before 90. Sine starts at 0, then rises toward 1 at 90 degrees, and then falls again in a mirror image to zero at 180.
The negative sine values from pi to 2pi are just -sin(x - pi). I'd handle everything by this recursive definition:
sin(x):
cases x of:
[0, pi/2) -> calculate (Taylor or whatever)
[pi/2, pi) -> sin(pi - x)
[pi/2, 2pi) -> -sin(x - pi)
< 0 -> sin(-x)
>= 2pi -> sin(fmod(x, 2pi)) // floating-point remainder
A similar approach for cos, using identity cases appropriate for it.

The key point is:
TERMS is too small to have proper precision. And if you increase TERMS, you have to change fact implementation as it will likely overflow when working with int.
I would use a sign to toggle the -1 power instead of pow(-1,n) overkill.
Then use double for the value of PI to avoid losing too many decimals
Then for high values, you should increase the number of terms (this is the main issue). using long long for your factorial method or you get overflow. I set 10 and get proper results:
#include <stdio.h>
#include <math.h>
const int TERMS = 10;
const double PI = 3.14159265358979;
long long fact(int n) {
return n<= 0 ? 1 : n * fact(n-1);
}
double powd(double x,int n) {
return n<= 0 ? 1 : x * powd(x,n-1);
}
double sine(int x) {
double rad = x * (PI / 180);
double sin = 0;
int n;
int sign = 1;
for(n = 0; n < TERMS; n++) { // That's Taylor series!!
sin += sign * powd(rad, (2 * n) + 1)/ fact((2 * n) + 1);
sign = -sign;
}
return sin;
}
double cosine(int x) {
double rad = x * (PI / 180);
double cos = 0;
int n;
int sign = 1;
for(n = 0; n < TERMS; n++) { // That's also Taylor series!
cos += sign * powd(rad, 2 * n) / fact(2 * n);
sign = -sign;
}
return cos;
}
int main(void){
int y;
scanf("%d",&y);
printf("sine(%d)= %lf\n",y, sine(y));
printf("cosine(%d)= %lf\n",y, cosine(y));
return 0;
}
result:
240
sine(240)= -0.866026
cosine(240)= -0.500001
Notes:
my recusive implementation of pow using successive multiplications is probably not needed, since we're dealing with floating point. It introduces accumulation error if n is big.
fact could be using floating point to allow bigger numbers and better precision. Actually I suggested long long but it would be better not to assume that the size will be enough. Better use standard type like int64_t for that.
fact and pow results could be pre-computed/hardcoded as well. This would save computation time.

const double TERMS = 14;
const double PI = 3.14159265358979;
double fact(double n) {return n <= 0.0 ? 1 : n * fact(n - 1);}
double sine(double x)
{
double rad = x * (PI / 180);
rad = fmod(rad, 2 * PI);
double sin = 0;
for (double n = 0; n < TERMS; n++)
sin += pow(-1, n) * pow(rad, (2 * n) + 1) / fact((2 * n) + 1);
return sin;
}
double cosine(double x)
{
double rad = x * (PI / 180);
rad = fmod(rad,2*PI);
double cos = 0;
for (double n = 0; n < TERMS; n++)
cos += pow(-1, n) * pow(rad, 2 * n) / fact(2 * n);
return cos;
}
int main()
{
printf("sine(240)= %lf\n", sine(240));
printf("cosine(300)= %lf\n",cosine(300));
}

Gravity computations resulting in NaN. No clear reason

I have problem with my C code.
It's force solution on N-bodies problem in 2D.
Sometimes I get NaN as struct instance params value.
My guess is that something is wrong with division. I have analyzed a lot of cases, but still could not find an occuring pattern that results in values being NaN.
Here is my code:
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <stdlib.h>
double G;
int N;
int T;
int COUNT;
typedef struct
{
double rx, ry;
double vx, vy;
double fx, fy;
double mass;
} Body;
void updateBody (Body* bodyInstance, int timestamp) {
bodyInstance->vx += timestamp * bodyInstance->fx / bodyInstance->mass;
bodyInstance->vy += timestamp * bodyInstance->fy / bodyInstance->mass;
bodyInstance->rx += timestamp * bodyInstance->vx;
bodyInstance->ry += timestamp * bodyInstance->vy;
};
void updateBodyForces (Body* bodyA, Body* bodyB) {
double dx, dy, dist, force;
dx = bodyB->rx - bodyA->rx;
dy = bodyB->rx - bodyA->rx;
// collision/same place in spacetime hack
if (bodyB->rx == bodyA->rx && bodyB->ry == bodyA->ry) {
dist = 1;
} else {
dist = sqrt(pow(dx, 2) + pow(dy, 2));
}
force = (G * bodyA->mass * bodyB->mass) / (pow(dist, 2) + 100);
bodyA->fx += force * dx / dist;
bodyA->fy += force * dy / dist;
}
void resetBodyForces (Body* bodyInstance) {
bodyInstance->fx = 0;
bodyInstance->fy = 0;
}
void getRandomBody (Body* bI) {
bI->rx = rand() % 10;
bI->ry = rand() % 10;
bI->vx = rand() % 10;
bI->vy = rand() % 10;
bI->fx = 0;
bI->fy = 0;
bI->mass = 20;
}
int main( int argc, char *argv[] ) {
G = argc >= 2 ? atof(argv[1]) : 0.01;
N = argc >= 3 ? atoi(argv[2]) : 3;
T = argc >= 4 ? atoi(argv[3]) : 1;
COUNT = argc >= 5 ? atoi(argv[4]) : 10;
srand(time(NULL));
Body bodies[N];
for (int i=0; i<N; i++) {
getRandomBody(&bodies[i]);
}
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < N; j++) {
resetBodyForces(&bodies[j]);
for (int k = 0; k < N; k++) {
if (j != k) {
updateBodyForces(&bodies[j], &bodies[k]);
}
}
}
for (int j = 0; j < N; j++) {
updateBody(&bodies[j], T);
}
}
}

In updateBodyForces you test two floating point values for equality. They may differ by as little as the very last bit, about 1/10,000,000.
Right after this you take the square root of their difference squared, and so the result may be 0 (really really zero, 0.0000000...), which is not a problem, but then you divide by that number. That is the source of the NaN.
Replace this part
// collision/same place in spacetime hack
if (bodyB->rx == bodyA->rx && bodyB->ry == bodyA->ry) {
dist = 1;
}
with a more explicit test based on FLT_EPSILON. See Floating point equality and tolerances for a longer explanation.
After some testing: the epsilon value is difficult to guess. Since you are okay with a dist = 1 for corner cases, add this below the test, above the force line, to be sure:
if (dist < 1)
dist = 1;
so you won't get any NaNs for sure. That leads to this simpler function:
void updateBodyForces (Body* bodyA, Body* bodyB) {
double dx, dy, dist, force;
dx = bodyB->rx - bodyA->rx;
dy = bodyB->ry - bodyA->ry;
dist = sqrt(dx*dx + dy*dy);
// collision/same place in spacetime hack
if (dist < 1)
dist = 1;
force = (G * bodyA->mass * bodyB->mass) / (pow(dist, 2) + 100);
bodyA->fx += force * dx / dist;
bodyA->fy += force * dy / dist;
}
You can make the wibbly-wobbly space-time hack a bit less obvious by replacing the 1 with a smaller value as well.

A common (and by far the most likely explanation here) production of the -NaN result is a negative argument to sqrt, due to either (i) the base parameter to pow being negative, or (ii) an accumulation of joke digits in your floating point variables: bodyInstance->vx += &c. will accumulate rounding errors.
Check that case prior to calling sqrt.
You will also get a NaN with an expression like 0.0 / 0.0, but I've never seen a platform yield a negative NaN in that instance.
So the moral here is to prefer dx * dx to pow(dx, 2): the former is more accurate, not vulnerable to unexpected results with negative dx, and certainly not slower. Better still, use hypot from the C standard library.
Reference: http://en.cppreference.com/w/c/numeric/math/hypot

C: Accessing lookup tables faster?

I have a piece of code that traces 4 sines at a time.
My original code was making roughly 12000 sin() function calls per frame and was running at 30 fps.
I tried optimizing it by generating lookup tables. I ended up with 16 different lookup tables. I declared and load them in a separate header file at the top of my program. Each table is declared like so:
static const float d4_lookup[800] {...};
Now, with this new method I actually lost fps?! I'm running at 20 fps now instead of 30. Each frame now only has to do 8 sin / cos calls and 19200 lookup calls vs 12000 sin() calls.
I compile using gcc with -O3 flag on. At the moment, the lookup tables are included at the top and are part of the global scope of the program.
I assume I'm not loading them in the right memory or something to that effect. How can I speed up the lookup time?
** EDIT 1 **
As requested, here's the function that uses the lookup calls, it is called once per frame:
void
update_sines(void)
{
static float c1_sin, c1_cos;
static float c2_sin, c2_cos;
static float c3_sin, c3_cos;
static float c4_sin, c4_cos;
clock_gettime(CLOCK_MONOTONIC, &spec);
s = spec.tv_sec;
ms = spec.tv_nsec * 0.0000001;
etime = concatenate((long)s, ms);
c1_sin = sinf(etime * 0.00525);
c1_cos = cosf(etime * 0.00525);
c2_sin = sinf(etime * 0.007326);
c2_cos = cosf(etime * 0.007326);
c3_sin = sinf(etime * 0.0046);
c3_cos = cosf(etime * 0.0046);
c4_sin = sinf(etime * 0.007992);
c4_cos = cosf(etime * 0.007992);
int k;
for (k = 0; k < 800; ++k)
{
sine1[k] = a1_lookup[k] * ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k])) + d1_lookup[k];
sine2[k] = a2_lookup[k] * ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k])) + d2_lookup[k] + 50;
sine3[k] = a3_lookup[k] * ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k])) + d3_lookup[k];
sine4[k] = a4_lookup[k] * ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k])) + d4_lookup[k] + 50;
}
}
** UPDATE **
For anyone reading this thread, I gave up on this problem. I tried using OpenCL kernels, structs, SIMD instructions as well as all the solutions shown here. In the end the original code that computed the sinf() 12800 per frame worked faster than the lookup tables since the lookup tables didn't fit into the cache. Yet it was still only doing 30 fps. It just had too much going on to keep up with my 60fps expectations. I've decided to take a different direction. Thanks to everyone who contributed to this thread. Most of these solutions would probably work to get some half decent speed improvements but nothing like the 200% speed up I needed here to have the lookup tables work the way I wanted.

Sometimes it's hard to know what's slowing you down, but potentially you are going to ruin your cache hits, you could try a lookup of a struct
typedef struct
{
float bx1_sin;
float bx2_sin;
float bx3_sin;
float bx4_sin;
float bx1_cos;
etc etc
including sine1,2,3,4 as well
} lookup_table
then
lookup_table lookup[800]
now everything at the kth lookup will be in the same small chunk of memory.
also, if you use a macro that takes k as a parameter to do do the contents of the loop lets say SINE_CALC(k), or an inline function...
you can do
for (k = 0; k < 800; ++k)
{
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
}
if you do a macro, make sure the k++ is outside the macro call like shown

Try unrolling your loops like this:
for (k = 0; k < 800; ++k)
{
sine1[k] = a1_lookup[k];
sine2[k] = a2_lookup[k];
sine3[k] = a3_lookup[k];
sine4[k] = a4_lookup[k];
}
for (k = 0; k < 800; ++k)
{
sine1[k] *= ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k]));
sine2[k] *= ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k]));
sine3[k] *= ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k]));
sine4[k] *= ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k]));
}
for (k = 0; k < 800; ++k)
{
sine1[k] += d1_lookup[k];
sine2[k] += d2_lookup[k] + 50;
sine3[k] += d3_lookup[k];
sine4[k] += d4_lookup[k] + 50;
}
By accessing fewer lookup tables in each loop, you should be able to stay in the cache. The middle loop could be split up as well, but you'll need to create an intermediate table for one of the sub-expressions.

Intel processors can predict serial access (and perform prefetch) for up to 4 arrays both for forward and backward traverse. At least this was true in Core 2 Duo days. Split your for in:
for (k = 0; k < 800; ++k)
sine1[k] = a1_lookup[k] * ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k])) + d1_lookup[k];
for (k = 0; k < 800; ++k)
sine2[k] = a2_lookup[k] * ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k])) + d2_lookup[k] + 50;
for (k = 0; k < 800; ++k)
sine3[k] = a3_lookup[k] * ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k])) + d3_lookup[k];
for (k = 0; k < 800; ++k)
sine4[k] = a4_lookup[k] * ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k])) + d4_lookup[k] + 50;
I guess you have more cache load than benchmarks in other answers so this does matters. I recommend you not to unroll loops, compilers do it well.

Using a simple sin lookup table will yields >20% speed increase on my linux machine (vm, gcc, 64bit). Interestingly, the size of lookup table (within reasonable < L1 cache size values) does not influence the speed of execution.
Using a fastsin simple implementation from here I got >45% improvement.
Code:
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#define LOOKUP_SIZE 628
uint64_t currentTimestampUs( void )
{
struct timeval tv;
time_t localTimeRet;
uint64_t timestamp = 0;
//time_t tzDiff = 0;
struct tm when;
int64_t localeOffset = 0;
{
localTimeRet = time(NULL);
localtime_r ( &localTimeRet, &when );
localeOffset = when.tm_gmtoff * 1000000ll;
}
gettimeofday ( &tv, NULL );
timestamp = ((uint64_t)((tv.tv_sec) * 1000000ll) ) + ( (uint64_t)(tv.tv_usec) );
timestamp+=localeOffset;
return timestamp;
}
const double PI = 3.141592653589793238462;
const double PI2 = 3.141592653589793238462 * 2;
static float sinarr[LOOKUP_SIZE];
void initSinArr() {
int a =0;
for (a=0; a<LOOKUP_SIZE; a++) {
double arg = (1.0*a/LOOKUP_SIZE)*((double)PI * 0.5);
float sinval_f = sin(arg); // double computation earlier to avoid losing precision on value
sinarr[a] = sinval_f;
}
}
float sinlookup(float val) {
float normval = val;
while (normval < 0) {
normval += PI2;
}
while (normval > PI2) {
normval -= PI2;
}
int index = LOOKUP_SIZE*(2*normval/PI);
if (index > 3*LOOKUP_SIZE) {
index = -index + 4*LOOKUP_SIZE;//LOOKUP_SIZE - (index-3*LOOKUP_SIZE);
return -sinarr[index];
} else if (index > 2*LOOKUP_SIZE) {
index = index - 2*LOOKUP_SIZE;
return -sinarr[index];
} else if (index > LOOKUP_SIZE) {
index = 2*LOOKUP_SIZE - index;
return sinarr[index];
} else {
return sinarr[index];
}
}
float sin_fast(float x) {
while (x < -PI)
x += PI2;
while (x > PI)
x -= PI2;
//compute sine
if (x < 0)
return 1.27323954 * x + .405284735 * x * x;
else
return 1.27323954 * x - 0.405284735 * x * x;
}
int main(void) {
initSinArr();
int a = 0;
float val = 0;
const int num_tries = 100000;
uint64_t startLookup = currentTimestampUs();
for (a=0; a<num_tries; a++) {
for (val=0; val<PI2; val+=0.01) {
float compval = sinlookup(val);
(void)compval;
}
}
uint64_t startSin = currentTimestampUs();
for (a=0; a<num_tries; a++) {
for (val=0; val<PI2; val+=0.01) {
float compval = sin(val);
(void)compval;
}
}
uint64_t startFastSin = currentTimestampUs();
for (a=0; a<num_tries; a++) {
for (val=0; val<PI2; val+=0.01) {
float compval = sin_fast(val);
(void)compval;
}
}
uint64_t end = currentTimestampUs();
int64_t lookupMs = (startSin - startLookup)/1000;
int64_t sinMs = (startFastSin - startSin)/1000;
int64_t fastSinMs = (end - startFastSin)/1000;
printf(" lookup: %lld ms\n", lookupMs );
printf(" sin: %lld ms\n", sinMs );
printf(" diff: %lld ms\n", sinMs-lookupMs);
printf(" diff%: %lld %\n", 100*(sinMs-lookupMs)/sinMs);
printf("fastsin: %lld ms\n", fastSinMs );
printf(" sin: %lld ms\n", sinMs );
printf(" diff: %lld ms\n", sinMs-fastSinMs);
printf(" diff%: %lld %\n", 100*(sinMs-fastSinMs)/sinMs);
}
Sample result:
lookup: 2276 ms
sin: 3004 ms
diff: 728 ms
diff%: 24 %
fastsin: 1500 ms
sin: 3004 ms
diff: 1504 ms
diff%: 50 %

The outermost for loop does not work as intended

I have been using Ubuntu 12.04 LTS with GCC to compile my the codes for my assignment for a while. However, recently I have run into two issues as follows:
The following code calculates zero for a nonzero value with the second formula is used.
There is a large amount of error in the calculation of the integral of the standard normal distribution from 0 to 5 or larger standard deviations.
How can I remedy these issues? I am especially obsessed with the first one. Any help or suggestion is appreciated. thanks in advance.
The code is as follows:
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <stdlib.h>
#define N 599
long double
factorial(long double n)
{
//Here s is the free parameter which is increased by one in each step and
//pro is the initial product and by setting pro to be 0 we also cover the
//case of zero factorial.
int s = 1;
long double pro = 1;
//Here pro stands for product.
if (n < 0)
printf("Factorial is not defined for a negative number \n");
else {
while (n >= s) {
pro *= s;
s++;
}
return pro;
}
}
int main()
{
// Since the function given is the standard normal distribution
// probability density function we have mean = 0 and variance = 1.
// Hence we also have z = x; while dealing with only positive values of
// x and keeping in mind that the PDF is symmetric around the mean.
long double * summand1 = malloc(N * sizeof(long double));
long double * summand2 = malloc(N * sizeof(long double));
int p = 0, k, z[5] = {0, 3, 5, 10, 20};
long double sum1[5] = {0}, sum2[5] = {0} , factor = 1.0;
for (p = 0; p <= 4; p++)
{
for (k = 0; k <= N; k++)
{
summand1[k] = (1 / sqrtl(M_PI * 2) )* powl(-1, k) * powl(z[p], 2 * k + 1) / ( factorial(k) * (2 * k + 1) * powl(2, k));
sum1[p] += summand1[k];
}
//Wolfamalpha site gives the same value here
for (k = 0; k <= N; k++)
{
factor *= (2 * k + 1);
summand2[k] = ((1 / sqrtl(M_PI * 2) ) * powl(z[p], 2 * k + 1) / factor);
//printf("%Le \n", factor);
sum2[p] += summand2[k];
}
sum2[p] = sum2[p] * expl((-powl(z[p],2)) / 2);
}
for (p = 0; p < 4; p++)
{
printf("The sum obtained for z between %d - %d \
\nusing the first formula is %Lf \n", z[p], z[p+1], sum1[p+1]);
printf("The sum obtained for z between %d - %d \
\nusing the second formula is %Lf \n", z[p], z[p+1], sum2[p+1]);
}
return 0;
}
The working code without the outermost for loop is
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <stdlib.h>
#define N 1200
long double
factorial(long double n)
{
//Here s is the free parameter which is increased by one in each step and
//pro is the initial product and by setting pro to be 0 we also cover the
//case of zero factorial.
int s = 1;
long double pro = 1;
//Here pro stands for product.
if (n < 0)
printf("Factorial is not defined for a negative number \n");
else {
while (n >= s) {
pro *= s;
s++;
}
return pro;
}
}
int main()
{
// Since the function given is the standard normal distribution
// probability density function we have mean = 0 and variance = 1.
// Hence we also have z = x; while dealing with only positive values of
// x and keeping in mind that the PDF is symmetric around the mean.
long double * summand1 = malloc(N * sizeof(long double));
long double * summand2 = malloc(N * sizeof(long double));
int k, z = 3;
long double sum1 = 0, sum2 = 0, pro = 1.0;
for (k = 0; k <= N; k++)
{
summand1[k] = (1 / sqrtl(M_PI * 2) )* powl(-1, k) * powl(z, 2 * k + 1) / ( factorial(k) * (2 * k + 1) * powl(2, k));
sum1 += summand1[k];
}
//Wolfamalpha site gives the same value here
printf("The sum obtained for z between 0-3 using the first formula is %Lf \n", sum1);
for (k = 0; k <= N; k++)
{
pro *= (2 * k + 1);
summand2[k] = ((1 / sqrtl(M_PI * 2) * powl(z, 2 * k + 1) / pro));
//printf("%Le \n", pro);
sum2 += summand2[k];
}
sum2 = sum2 * expl((-powl(z,2)) / 2);
printf("The sum obtained for z between 0-3 using the second formula is %Lf \n", sum2);
return 0;
}

I'm quite certain that the problem is in factor not being set back to 1 in the outer loop..
factor *= (2 * k + 1); (in the loop that calculates sum2.)
In the second version provided the one that works it starts with z=3
However in the first loop since you do not clear it between iterations on p by the time you reach z[2] it already is a huge number.
EDIT: Possible help with precision..
Basically you have a huge number powl(z[p], 2 * k + 1) divided by another huge number factor. huge floating point numbers lose their precision. The way to avoid that is to perform the division as soon as possible..
Instead of first calculating powl(z[p], 2 * k + 1) and dividing by factor :
- (z[p]z[p] ... . * z[p]) / (1*3*5*...(2*k+1))`
rearrange the calculation: (z[p]/1) * (z[p]^2/3) * (z[p]^2/5) ... (z[p]^2/(2*k+1))
You can do this in sumand2 calculation and a similar trick in summand1

fast & efficient least squares fit algorithm in C?

I am trying to implement a linear least squares fit onto 2 arrays of data: time vs amplitude. The only technique I know so far is to test all of the possible m and b points in (y = m*x+b) and then find out which combination fits my data best so that it has the least error. However, I think iterating so many combinations is sometimes useless because it tests out everything. Are there any techniques to speed up the process that I don't know about? Thanks.

Try this code. It fits y = mx + b to your (x,y) data.
The arguments to linreg are
linreg(int n, REAL x[], REAL y[], REAL* b, REAL* m, REAL* r)
n = number of data points
x,y = arrays of data
*b = output intercept
*m = output slope
*r = output correlation coefficient (can be NULL if you don't want it)
The return value is 0 on success, !=0 on failure.
Here's the code
#include "linreg.h"
#include <stdlib.h>
#include <math.h> /* math functions */
//#define REAL float
#define REAL double
inline static REAL sqr(REAL x) {
return x*x;
}
int linreg(int n, const REAL x[], const REAL y[], REAL* m, REAL* b, REAL* r){
REAL sumx = 0.0; /* sum of x */
REAL sumx2 = 0.0; /* sum of x**2 */
REAL sumxy = 0.0; /* sum of x * y */
REAL sumy = 0.0; /* sum of y */
REAL sumy2 = 0.0; /* sum of y**2 */
for (int i=0;i<n;i++){
sumx += x[i];
sumx2 += sqr(x[i]);
sumxy += x[i] * y[i];
sumy += y[i];
sumy2 += sqr(y[i]);
}
REAL denom = (n * sumx2 - sqr(sumx));
if (denom == 0) {
// singular matrix. can't solve the problem.
*m = 0;
*b = 0;
if (r) *r = 0;
return 1;
}
*m = (n * sumxy - sumx * sumy) / denom;
*b = (sumy * sumx2 - sumx * sumxy) / denom;
if (r!=NULL) {
*r = (sumxy - sumx * sumy / n) / /* compute correlation coeff */
sqrt((sumx2 - sqr(sumx)/n) *
(sumy2 - sqr(sumy)/n));
}
return 0;
}
Example
You can run this example online.
int main()
{
int n = 6;
REAL x[6]= {1, 2, 4, 5, 10, 20};
REAL y[6]= {4, 6, 12, 15, 34, 68};
REAL m,b,r;
linreg(n,x,y,&m,&b,&r);
printf("m=%g b=%g r=%g\n",m,b,r);
return 0;
}
Here is the output
m=3.43651 b=-0.888889 r=0.999192
Here is the Excel plot and linear fit (for verification).
All values agree exactly with the C code above (note C code returns r while Excel returns R**2).

There are efficient algorithms for least-squares fitting; see Wikipedia for details. There are also libraries that implement the algorithms for you, likely more efficiently than a naive implementation would do; the GNU Scientific Library is one example, but there are others under more lenient licenses as well.

From Numerical Recipes: The Art of Scientific Computing in (15.2) Fitting Data to a Straight Line:
Linear Regression:
Consider the problem of fitting a set of N data points (xi, yi) to a straight-line model:
Assume that the uncertainty: sigmai associated with each yi and that the xi’s (values of the dependent variable) are known exactly. To measure how well the model agrees with the data, we use the chi-square function, which in this case is:
The above equation is minimized to determine a and b. This is done by finding the derivative of the above equation with respect to a and b, equate them to zero and solve for a and b. Then we estimate the probable uncertainties in the estimates of a and b, since obviously the measurement errors in the data must introduce some uncertainty in the determination of those parameters. Additionally, we must estimate the goodness-of-fit of the data to the
model. Absent this estimate, we have not the slightest indication that the parameters a and b in the model have any meaning at all.
The below struct performs the mentioned calculations:
struct Fitab {
// Object for fitting a straight line y = a + b*x to a set of
// points (xi, yi), with or without available
// errors sigma i . Call one of the two constructors to calculate the fit.
// The answers are then available as the variables:
// a, b, siga, sigb, chi2, and either q or sigdat.
int ndata;
double a, b, siga, sigb, chi2, q, sigdat; // Answers.
vector<double> &x, &y, &sig;
// Constructor.
Fitab(vector<double> &xx, vector<double> &yy, vector<double> &ssig)
: ndata(xx.size()), x(xx), y(yy), sig(ssig), chi2(0.), q(1.), sigdat(0.)
{
// Given a set of data points x[0..ndata-1], y[0..ndata-1]
// with individual standard deviations sig[0..ndata-1],
// sets a,b and their respective probable uncertainties
// siga and sigb, the chi-square: chi2, and the goodness-of-fit
// probability: q
Gamma gam;
int i;
double ss=0., sx=0., sy=0., st2=0., t, wt, sxoss; b=0.0;
for (i=0;i < ndata; i++) { // Accumulate sums ...
wt = 1.0 / SQR(sig[i]); //...with weights
ss += wt;
sx += x[i]*wt;
sy += y[i]*wt;
}
sxoss = sx/ss;
for (i=0; i < ndata; i++) {
t = (x[i]-sxoss) / sig[i];
st2 += t*t;
b += t*y[i]/sig[i];
}
b /= st2; // Solve for a, b, sigma-a, and simga-b.
a = (sy-sx*b) / ss;
siga = sqrt((1.0+sx*sx/(ss*st2))/ss);
sigb = sqrt(1.0/st2); // Calculate chi2.
for (i=0;i<ndata;i++) chi2 += SQR((y[i]-a-b*x[i])/sig[i]);
if (ndata>2) q=gam.gammq(0.5*(ndata-2),0.5*chi2); // goodness of fit
}
// Constructor.
Fitab(vector<double> &xx, vector<double> &yy)
: ndata(xx.size()), x(xx), y(yy), sig(xx), chi2(0.), q(1.), sigdat(0.)
{
// As above, but without known errors (sig is not used).
// The uncertainties siga and sigb are estimated by assuming
// equal errors for all points, and that a straight line is
// a good fit. q is returned as 1.0, the normalization of chi2
// is to unit standard deviation on all points, and sigdat
// is set to the estimated error of each point.
int i;
double ss,sx=0.,sy=0.,st2=0.,t,sxoss;
b=0.0; // Accumulate sums ...
for (i=0; i < ndata; i++) {
sx += x[i]; // ...without weights.
sy += y[i];
}
ss = ndata;
sxoss = sx/ss;
for (i=0;i < ndata; i++) {
t = x[i]-sxoss;
st2 += t*t;
b += t*y[i];
}
b /= st2; // Solve for a, b, sigma-a, and sigma-b.
a = (sy-sx*b)/ss;
siga=sqrt((1.0+sx*sx/(ss*st2))/ss);
sigb=sqrt(1.0/st2); // Calculate chi2.
for (i=0;i<ndata;i++) chi2 += SQR(y[i]-a-b*x[i]);
if (ndata > 2) sigdat=sqrt(chi2/(ndata-2));
// For unweighted data evaluate typical
// sig using chi2, and adjust
// the standard deviations.
siga *= sigdat;
sigb *= sigdat;
}
};
where struct Gamma:
struct Gamma : Gauleg18 {
// Object for incomplete gamma function.
// Gauleg18 provides coefficients for Gauss-Legendre quadrature.
static const Int ASWITCH=100; When to switch to quadrature method.
static const double EPS; // See end of struct for initializations.
static const double FPMIN;
double gln;
double gammp(const double a, const double x) {
// Returns the incomplete gamma function P(a,x)
if (x < 0.0 || a <= 0.0) throw("bad args in gammp");
if (x == 0.0) return 0.0;
else if ((Int)a >= ASWITCH) return gammpapprox(a,x,1); // Quadrature.
else if (x < a+1.0) return gser(a,x); // Use the series representation.
else return 1.0-gcf(a,x); // Use the continued fraction representation.
}
double gammq(const double a, const double x) {
// Returns the incomplete gamma function Q(a,x) = 1 - P(a,x)
if (x < 0.0 || a <= 0.0) throw("bad args in gammq");
if (x == 0.0) return 1.0;
else if ((Int)a >= ASWITCH) return gammpapprox(a,x,0); // Quadrature.
else if (x < a+1.0) return 1.0-gser(a,x); // Use the series representation.
else return gcf(a,x); // Use the continued fraction representation.
}
double gser(const Doub a, const Doub x) {
// Returns the incomplete gamma function P(a,x) evaluated by its series representation.
// Also sets ln (gamma) as gln. User should not call directly.
double sum,del,ap;
gln=gammln(a);
ap=a;
del=sum=1.0/a;
for (;;) {
++ap;
del *= x/ap;
sum += del;
if (fabs(del) < fabs(sum)*EPS) {
return sum*exp(-x+a*log(x)-gln);
}
}
}
double gcf(const Doub a, const Doub x) {
// Returns the incomplete gamma function Q(a, x) evaluated
// by its continued fraction representation.
// Also sets ln (gamma) as gln. User should not call directly.
int i;
double an,b,c,d,del,h;
gln=gammln(a);
b=x+1.0-a; // Set up for evaluating continued fraction
// by modified Lentz’s method with with b0 = 0.
c=1.0/FPMIN;
d=1.0/b;
h=d;
for (i=1;;i++) {
// Iterate to convergence.
an = -i*(i-a);
b += 2.0;
d=an*d+b;
if (fabs(d) < FPMIN) d=FPMIN;
c=b+an/c;
if (fabs(c) < FPMIN) c=FPMIN;
d=1.0/d;
del=d*c;
h *= del;
if (fabs(del-1.0) <= EPS) break;
}
return exp(-x+a*log(x)-gln)*h; Put factors in front.
}
double gammpapprox(double a, double x, int psig) {
// Incomplete gamma by quadrature. Returns P(a,x) or Q(a, x),
// when psig is 1 or 0, respectively. User should not call directly.
int j;
double xu,t,sum,ans;
double a1 = a-1.0, lna1 = log(a1), sqrta1 = sqrt(a1);
gln = gammln(a);
// Set how far to integrate into the tail:
if (x > a1) xu = MAX(a1 + 11.5*sqrta1, x + 6.0*sqrta1);
else xu = MAX(0.,MIN(a1 - 7.5*sqrta1, x - 5.0*sqrta1));
sum = 0;
for (j=0;j<ngau;j++) { // Gauss-Legendre.
t = x + (xu-x)*y[j];
sum += w[j]*exp(-(t-a1)+a1*(log(t)-lna1));
}
ans = sum*(xu-x)*exp(a1*(lna1-1.)-gln);
return (psig?(ans>0.0? 1.0-ans:-ans):(ans>=0.0? ans:1.0+ans));
}
double invgammp(Doub p, Doub a);
// Inverse function on x of P(a,x) .
};
const Doub Gamma::EPS = numeric_limits<Doub>::epsilon();
const Doub Gamma::FPMIN = numeric_limits<Doub>::min()/EPS
and stuct Gauleg18:
struct Gauleg18 {
// Abscissas and weights for Gauss-Legendre quadrature.
static const Int ngau = 18;
static const Doub y[18];
static const Doub w[18];
};
const Doub Gauleg18::y[18] = {0.0021695375159141994,
0.011413521097787704,0.027972308950302116,0.051727015600492421,
0.082502225484340941, 0.12007019910960293,0.16415283300752470,
0.21442376986779355, 0.27051082840644336, 0.33199876341447887,
0.39843234186401943, 0.46931971407375483, 0.54413605556657973,
0.62232745288031077, 0.70331500465597174, 0.78649910768313447,
0.87126389619061517, 0.95698180152629142};
const Doub Gauleg18::w[18] = {0.0055657196642445571,
0.012915947284065419,0.020181515297735382,0.027298621498568734,
0.034213810770299537,0.040875750923643261,0.047235083490265582,
0.053244713977759692,0.058860144245324798,0.064039797355015485
0.068745323835736408,0.072941885005653087,0.076598410645870640,
0.079687828912071670,0.082187266704339706,0.084078218979661945,
0.085346685739338721,0.085983275670394821};
and, finally fuinction Gamma::invgamp():
double Gamma::invgammp(double p, double a) {
// Returns x such that P(a,x) = p for an argument p between 0 and 1.
int j;
double x,err,t,u,pp,lna1,afac,a1=a-1;
const double EPS=1.e-8; // Accuracy is the square of EPS.
gln=gammln(a);
if (a <= 0.) throw("a must be pos in invgammap");
if (p >= 1.) return MAX(100.,a + 100.*sqrt(a));
if (p <= 0.) return 0.0;
if (a > 1.) {
lna1=log(a1);
afac = exp(a1*(lna1-1.)-gln);
pp = (p < 0.5)? p : 1. - p;
t = sqrt(-2.*log(pp));
x = (2.30753+t*0.27061)/(1.+t*(0.99229+t*0.04481)) - t;
if (p < 0.5) x = -x;
x = MAX(1.e-3,a*pow(1.-1./(9.*a)-x/(3.*sqrt(a)),3));
} else {
t = 1.0 - a*(0.253+a*0.12); and (6.2.9).
if (p < t) x = pow(p/t,1./a);
else x = 1.-log(1.-(p-t)/(1.-t));
}
for (j=0;j<12;j++) {
if (x <= 0.0) return 0.0; // x too small to compute accurately.
err = gammp(a,x) - p;
if (a > 1.) t = afac*exp(-(x-a1)+a1*(log(x)-lna1));
else t = exp(-x+a1*log(x)-gln);
u = err/t;
// Halley’s method.
x -= (t = u/(1.-0.5*MIN(1.,u*((a-1.)/x - 1))));
// Halve old value if x tries to go negative.
if (x <= 0.) x = 0.5*(x + t);
if (fabs(t) < EPS*x ) break;
}
return x;
}

Here is my version of a C/C++ function that does simple linear regression. The calculations follow the wikipedia article on simple linear regression. This is published as a single-header public-domain (MIT) library on github: simple_linear_regression. The library (.h file) is tested to work on Linux and Windows, and from C and C++ using -Wall -Werror and all -std versions supported by clang/gcc.
#define SIMPLE_LINEAR_REGRESSION_ERROR_INPUT_VALUE -2
#define SIMPLE_LINEAR_REGRESSION_ERROR_NUMERIC -3
int simple_linear_regression(const double * x, const double * y, const int n, double * slope_out, double * intercept_out, double * r2_out) {
double sum_x = 0.0;
double sum_xx = 0.0;
double sum_xy = 0.0;
double sum_y = 0.0;
double sum_yy = 0.0;
double n_real = (double)(n);
int i = 0;
double slope = 0.0;
double denominator = 0.0;
if (x == NULL || y == NULL || n < 2) {
return SIMPLE_LINEAR_REGRESSION_ERROR_INPUT_VALUE;
}
for (i = 0; i < n; ++i) {
sum_x += x[i];
sum_xx += x[i] * x[i];
sum_xy += x[i] * y[i];
sum_y += y[i];
sum_yy += y[i] * y[i];
}
denominator = n_real * sum_xx - sum_x * sum_x;
if (denominator == 0.0) {
return SIMPLE_LINEAR_REGRESSION_ERROR_NUMERIC;
}
slope = (n_real * sum_xy - sum_x * sum_y) / denominator;
if (slope_out != NULL) {
*slope_out = slope;
}
if (intercept_out != NULL) {
*intercept_out = (sum_y - slope * sum_x) / n_real;
}
if (r2_out != NULL) {
denominator = ((n_real * sum_xx) - (sum_x * sum_x)) * ((n_real * sum_yy) - (sum_y * sum_y));
if (denominator == 0.0) {
return SIMPLE_LINEAR_REGRESSION_ERROR_NUMERIC;
}
*r2_out = ((n_real * sum_xy) - (sum_x * sum_y)) * ((n_real * sum_xy) - (sum_x * sum_y)) / denominator;
}
return 0;
}
Usage example:
#define SIMPLE_LINEAR_REGRESSION_IMPLEMENTATION
#include "simple_linear_regression.h"
#include <stdio.h>
/* Some data that we want to find the slope, intercept and r2 for */
static const double x[] = { 1.47, 1.50, 1.52, 1.55, 1.57, 1.60, 1.63, 1.65, 1.68, 1.70, 1.73, 1.75, 1.78, 1.80, 1.83 };
static const double y[] = { 52.21, 53.12, 54.48, 55.84, 57.20, 58.57, 59.93, 61.29, 63.11, 64.47, 66.28, 68.10, 69.92, 72.19, 74.46 };
int main() {
double slope = 0.0;
double intercept = 0.0;
double r2 = 0.0;
int res = 0;
res = simple_linear_regression(x, y, sizeof(x) / sizeof(x[0]), &slope, &intercept, &r2);
if (res < 0) {
printf("Error: %s\n", simple_linear_regression_error_string(res));
return res;
}
printf("slope: %f\n", slope);
printf("intercept: %f\n", intercept);
printf("r2: %f\n", r2);
return 0;
}

The original example above worked well for me with slope and offset but I had a hard time with the corr coef. Maybe I don't have my parenthesis working the same as the assumed precedence? Anyway, with some help of other web pages I finally got values that match the linear trend-line in Excel. Thought I would share my code using Mark Lakata's variable names. Hope this helps.
double slope = ((n * sumxy) - (sumx * sumy )) / denom;
double intercept = ((sumy * sumx2) - (sumx * sumxy)) / denom;
double term1 = ((n * sumxy) - (sumx * sumy));
double term2 = ((n * sumx2) - (sumx * sumx));
double term3 = ((n * sumy2) - (sumy * sumy));
double term23 = (term2 * term3);
double r2 = 1.0;
if (fabs(term23) > MIN_DOUBLE) // Define MIN_DOUBLE somewhere as 1e-9 or similar
r2 = (term1 * term1) / term23;

as an assignment I had to code in C a simple linear regression using RMSE loss function. The program is dynamic and you can enter your own values and choose your own loss function which is for now limited to Root Mean Square Error. But first here are the algorithms I used:
now the code... you need gnuplot to display the chart, sudo apt install gnuplot
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#define BUFFSIZE 64
#define MAXSIZE 100
static double vector_x[MAXSIZE] = {0};
static double vector_y[MAXSIZE] = {0};
static double vector_predict[MAXSIZE] = {0};
static double max_x;
static double max_y;
static double mean_x;
static double mean_y;
static double teta_0_intercept;
static double teta_1_grad;
static double RMSE;
static double r_square;
static double prediction;
static char intercept[BUFFSIZE];
static char grad[BUFFSIZE];
static char xrange[BUFFSIZE];
static char yrange[BUFFSIZE];
static char lossname_RMSE[BUFFSIZE] = "Simple Linear Regression using RMSE'";
static char cmd_gnu_0[BUFFSIZE] = "set title '";
static char cmd_gnu_1[BUFFSIZE] = "intercept = ";
static char cmd_gnu_2[BUFFSIZE] = "grad = ";
static char cmd_gnu_3[BUFFSIZE] = "set xrange [0:";
static char cmd_gnu_4[BUFFSIZE] = "set yrange [0:";
static char cmd_gnu_5[BUFFSIZE] = "f(x) = (grad * x) + intercept";
static char cmd_gnu_6[BUFFSIZE] = "plot f(x), 'data.temp' with points pointtype 7";
static char const *commands_gnuplot[] = {
cmd_gnu_0,
cmd_gnu_1,
cmd_gnu_2,
cmd_gnu_3,
cmd_gnu_4,
cmd_gnu_5,
cmd_gnu_6,
};
static size_t size;
static void user_input()
{
printf("Enter x,y vector size, MAX = 100\n");
scanf("%lu", &size);
if (size > MAXSIZE) {
printf("Wrong input size is too big\n");
user_input();
}
printf("vector's size is %lu\n", size);
size_t i;
for (i = 0; i < size; i++) {
printf("Enter vector_x[%ld] values\n", i);
scanf("%lf", &vector_x[i]);
}
for (i = 0; i < size; i++) {
printf("Enter vector_y[%ld] values\n", i);
scanf("%lf", &vector_y[i]);
}
}
static void display_vector()
{
size_t i;
for (i = 0; i < size; i++){
printf("vector_x[%lu] = %lf\t", i, vector_x[i]);
printf("vector_y[%lu] = %lf\n", i, vector_y[i]);
}
}
static void concatenate(char p[], char q[]) {
int c;
int d;
c = 0;
while (p[c] != '\0') {
c++;
}
d = 0;
while (q[d] != '\0') {
p[c] = q[d];
d++;
c++;
}
p[c] = '\0';
}
static void compute_mean_x_y()
{
size_t i;
double tmp_x = 0.0;
double tmp_y = 0.0;
for (i = 0; i < size; i++) {
tmp_x += vector_x[i];
tmp_y += vector_y[i];
}
mean_x = tmp_x / size;
mean_y = tmp_y / size;
printf("mean_x = %lf\n", mean_x);
printf("mean_y = %lf\n", mean_y);
}
static void compute_teta_1_grad()
{
double numerator = 0.0;
double denominator = 0.0;
double tmp1 = 0.0;
double tmp2 = 0.0;
size_t i;
for (i = 0; i < size; i++) {
numerator += (vector_x[i] - mean_x) * (vector_y[i] - mean_y);
}
for (i = 0; i < size; i++) {
tmp1 = vector_x[i] - mean_x;
tmp2 = tmp1 * tmp1;
denominator += tmp2;
}
teta_1_grad = numerator / denominator;
printf("teta_1_grad = %lf\n", teta_1_grad);
}
static void compute_teta_0_intercept()
{
teta_0_intercept = mean_y - (teta_1_grad * mean_x);
printf("teta_0_intercept = %lf\n", teta_0_intercept);
}
static void compute_prediction()
{
size_t i;
for (i = 0; i < size; i++) {
vector_predict[i] = teta_0_intercept + (teta_1_grad * vector_x[i]);
printf("y^[%ld] = %lf\n", i, vector_predict[i]);
}
printf("\n");
}
static void compute_RMSE()
{
compute_prediction();
double error = 0;
size_t i;
for (i = 0; i < size; i++) {
error = (vector_predict[i] - vector_y[i]) * (vector_predict[i] - vector_y[i]);
printf("error y^[%ld] = %lf\n", i, error);
RMSE += error;
}
/* mean */
RMSE = RMSE / size;
/* square root mean */
RMSE = sqrt(RMSE);
printf("\nRMSE = %lf\n", RMSE);
}
static void compute_loss_function()
{
int input = 0;
printf("Which loss function do you want to use?\n");
printf(" 1 - RMSE\n");
scanf("%d", &input);
switch(input) {
case 1:
concatenate(cmd_gnu_0, lossname_RMSE);
compute_RMSE();
printf("\n");
break;
default:
printf("Wrong input try again\n");
compute_loss_function(size);
}
}
static void compute_r_square(size_t size)
{
double num_err = 0.0;
double den_err = 0.0;
size_t i;
for (i = 0; i < size; i++) {
num_err += (vector_y[i] - vector_predict[i]) * (vector_y[i] - vector_predict[i]);
den_err += (vector_y[i] - mean_y) * (vector_y[i] - mean_y);
}
r_square = 1 - (num_err/den_err);
printf("R_square = %lf\n", r_square);
}
static void compute_predict_for_x()
{
double x = 0.0;
printf("Please enter x value\n");
scanf("%lf", &x);
prediction = teta_0_intercept + (teta_1_grad * x);
printf("y^ if x = %lf -> %lf\n",x, prediction);
}
static void compute_max_x_y()
{
size_t i;
double tmp1= 0.0;
double tmp2= 0.0;
for (i = 0; i < size; i++) {
if (vector_x[i] > tmp1) {
tmp1 = vector_x[i];
max_x = vector_x[i];
}
if (vector_y[i] > tmp2) {
tmp2 = vector_y[i];
max_y = vector_y[i];
}
}
printf("vector_x max value %lf\n", max_x);
printf("vector_y max value %lf\n", max_y);
}
static void display_model_line()
{
sprintf(intercept, "%0.7lf", teta_0_intercept);
sprintf(grad, "%0.7lf", teta_1_grad);
sprintf(xrange, "%0.7lf", max_x + 1);
sprintf(yrange, "%0.7lf", max_y + 1);
concatenate(cmd_gnu_1, intercept);
concatenate(cmd_gnu_2, grad);
concatenate(cmd_gnu_3, xrange);
concatenate(cmd_gnu_3, "]");
concatenate(cmd_gnu_4, yrange);
concatenate(cmd_gnu_4, "]");
printf("grad = %s\n", grad);
printf("intercept = %s\n", intercept);
printf("xrange = %s\n", xrange);
printf("yrange = %s\n", yrange);
printf("cmd_gnu_0: %s\n", cmd_gnu_0);
printf("cmd_gnu_1: %s\n", cmd_gnu_1);
printf("cmd_gnu_2: %s\n", cmd_gnu_2);
printf("cmd_gnu_3: %s\n", cmd_gnu_3);
printf("cmd_gnu_4: %s\n", cmd_gnu_4);
printf("cmd_gnu_5: %s\n", cmd_gnu_5);
printf("cmd_gnu_6: %s\n", cmd_gnu_6);
/* print plot */
FILE *gnuplot_pipe = (FILE*)popen("gnuplot -persistent", "w");
FILE *temp = (FILE*)fopen("data.temp", "w");
/* create data.temp */
size_t i;
for (i = 0; i < size; i++)
{
fprintf(temp, "%f %f \n", vector_x[i], vector_y[i]);
}
/* display gnuplot */
for (i = 0; i < 7; i++)
{
fprintf(gnuplot_pipe, "%s \n", commands_gnuplot[i]);
}
}
int main(void)
{
printf("===========================================\n");
printf("INPUT DATA\n");
printf("===========================================\n");
user_input();
display_vector();
printf("\n");
printf("===========================================\n");
printf("COMPUTE MEAN X:Y, TETA_1 TETA_0\n");
printf("===========================================\n");
compute_mean_x_y();
compute_max_x_y();
compute_teta_1_grad();
compute_teta_0_intercept();
printf("\n");
printf("===========================================\n");
printf("COMPUTE LOSS FUNCTION\n");
printf("===========================================\n");
compute_loss_function();
printf("===========================================\n");
printf("COMPUTE R_square\n");
printf("===========================================\n");
compute_r_square(size);
printf("\n");
printf("===========================================\n");
printf("COMPUTE y^ according to x\n");
printf("===========================================\n");
compute_predict_for_x();
printf("\n");
printf("===========================================\n");
printf("DISPLAY LINEAR REGRESSION\n");
printf("===========================================\n");
display_model_line();
printf("\n");
return 0;
}

Look at Section 1 of this paper. This section expresses a 2D linear regression as a matrix multiplication exercise. As long as your data is well-behaved, this technique should permit you to develop a quick least squares fit.
Depending on the size of your data, it might be worthwhile to algebraically reduce the matrix multiplication to simple set of equations, thereby avoiding the need to write a matmult() function. (Be forewarned, this is completely impractical for more than 4 or 5 data points!)

The fastest, most efficient way to solve least squares, as far as I am aware, is to subtract (the gradient)/(the 2nd order gradient) from your parameter vector. (2nd order gradient = i.e. the diagonal of the Hessian.)
Here is the intuition:
Let's say you want to optimize least squares over a single parameter. This is equivalent to finding the vertex of a parabola. Then, for any random initial parameter, x0, the vertex of the loss function is located at x0 - f(1) / f(2). That's because adding - f(1) / f(2) to x will always zero out the derivative, f(1).
Side note: Implementing this in Tensorflow, the solution appeared at w0 - f(1) / f(2) / (number of weights), but I'm not sure if that's due to Tensorflow or if it's due to something else..

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

Logistic regression code stops working above ~43,500 generated observations - c

Related

Taylor Series in C (problem with sin(240) and sin(300))

Gravity computations resulting in NaN. No clear reason

C: Accessing lookup tables faster?

The outermost for loop does not work as intended

fast & efficient least squares fit algorithm in C?

Categories

Resources