Query using MPI_Reduce in C

I am trying to use MPI_Reduce() in C to compute a vector called phi with npts elements. To do that, I have allocated a chunk of a long vector (longvec) to each process, summed these chunks separately, and then summed the partial result of each process at the end in process 0 to obtain an estimate of each element of phi.
I am getting very silly results... Can anyone tell me what mistake I am making in the code below?
double phie[npts];
phitemp = (double*) malloc (nprocs * sizeof(double));
for (i = 0; i < npts; i++) {
    phitemp[rank] = 0;
    for (x = rank * 10 + 1; x <= (rank + 1) * 10; x++) {
        phitemp[rank] = phitemp[rank] + longvec[x] * vector[i];
    }
    MPI_Reduce(phitemp, &(((double *) phivec)[i]), 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
}
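One likely culprit (a sketch, not a verified fix; I am assuming the result vector is phi[npts], since the snippet mixes phie and phivec): each rank accumulates its partial sum in phitemp[rank], but the send buffer passed to MPI_Reduce is phitemp, i.e. element 0, which only rank 0 ever writes. Since the count is 1, a plain scalar per rank is enough:

double partial;
for (i = 0; i < npts; i++) {
    partial = 0.0;
    for (x = rank * 10 + 1; x <= (rank + 1) * 10; x++) {
        partial += longvec[x] * vector[i];
    }
    // Every rank contributes its own scalar; rank 0 receives the sum in phi[i].
    MPI_Reduce(&partial, &phi[i], 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
}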

Related

Logistic regression code stops working above ~43,500 generated observations

I'm having some difficulty troubleshooting code I wrote in C to perform a logistic regression. While it seems to work on smaller, semi-randomized datasets, it stops working (i.e., it stops assigning proper probabilities of belonging to class 1) at around the point where I pass 43,500 observations (determined by tweaking the number of observations created). When creating the 150 features used in the code, I do create the first two as a function of the number of observations, so I'm not sure if maybe that's the issue here, though I am using double precision. Maybe there's an overflow somewhere in the code?
The below code should be self-contained; it generates m=50,000 observations with n=150 features. Setting m below 43,500 should return "Percent class 1: 0.250000", setting to 44,000 or above will return "Percent class 1: 0.000000", regardless of what max_iter (number of times we sample m observations) is set to.
The first feature is set to 1.0 divided by the total number of observations, if class 0 (first 75% of observations), or the index of the observation divided by the total number of observations otherwise.
The second feature is just index divided by total number of observations.
All other features are random.
The logistic regression is intended to use stochastic gradient descent, randomly selecting an observation index, computing the gradient of the loss with the predicted y using current weights, and updating weights with the gradient and learning rate (eta).
Using the same initialization with Python and NumPy, I still get the proper results, even above 50,000 observations.
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>

// Compute z = w * x + b
double dlc( int n, double *X, double *coef, double intercept )
{
    double y_pred = intercept;
    for (int i = 0; i < n; i++)
    {
        y_pred += X[i] * coef[i];
    }
    return y_pred;
}

// Compute y_hat = 1 / (1 + e^(-z))
double sigmoid( int n, double alpha, double *X, double *coef, double beta, double intercept )
{
    double y_pred;
    y_pred = dlc(n, X, coef, intercept);
    y_pred = 1.0 / (1.0 + exp(-y_pred));
    return y_pred;
}
// Stochastic gradient descent
void sgd( int m, int n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept, int random_seed )
{
    double *gradient_coef, *X_i;
    double y_i, y_pred, resid;
    int idx;
    double gradient_intercept = 0.0, alpha = 1.0, beta = 1.0;

    X_i = (double *) malloc (n * sizeof(double));
    gradient_coef = (double *) malloc (n * sizeof(double));
    for ( int i = 0; i < n; i++ )
    {
        coef[i] = 0.0;
        gradient_coef[i] = 0.0;
    }
    *intercept = 0.0;

    srand(random_seed);
    for ( int epoch = 0; epoch < max_iter; epoch++ )
    {
        for ( int run = 0; run < m; run++ )
        {
            // Randomly sample an observation
            idx = rand() % m;
            for ( int i = 0; i < n; i++ )
            {
                X_i[i] = X[n*idx+i];
            }
            y_i = y[idx];
            // Compute y_hat
            y_pred = sigmoid( n, alpha, X_i, coef, beta, *intercept );
            resid = -(y_i - y_pred);
            // Compute gradients and adjust weights
            for (int i = 0; i < n; i++)
            {
                gradient_coef[i] = X_i[i] * resid;
                coef[i] -= eta * gradient_coef[i];
            }
            if ( fit_intercept == 1 )
            {
                *intercept -= eta * resid;
            }
        }
    }
}
int main(void)
{
    double *X, *y, *coef, *y_pred;
    double intercept;
    double eta = 0.05;
    double alpha = 1.0, beta = 1.0;
    long m = 50000;
    long n = 150;
    int max_iter = 20;
    long class_0 = (long)(3.0 / 4.0 * (double)m);
    double pct_class_1 = 0.0;
    clock_t test_start;
    clock_t test_end;
    double test_time;

    printf("Constructing variables...\n");
    X = (double *) malloc (m * n * sizeof(double));
    y = (double *) malloc (m * sizeof(double));
    y_pred = (double *) malloc (m * sizeof(double));
    coef = (double *) malloc (n * sizeof(double));

    // Initialize classes
    for (int i = 0; i < m; i++)
    {
        if (i < class_0)
        {
            y[i] = 0.0;
        }
        else
        {
            y[i] = 1.0;
        }
    }

    // Initialize observation features
    for (int i = 0; i < m; i++)
    {
        if (i < class_0)
        {
            X[n*i] = 1.0 / (double)m;
        }
        else
        {
            X[n*i] = (double)i / (double)m;
        }
        X[n*i + 1] = (double)i / (double)m;
        for (int j = 2; j < n; j++)
        {
            X[n*i + j] = (double)(rand() % 100) / 100.0;
        }
    }

    // Fit weights
    printf("Running SGD...\n");
    test_start = clock();
    sgd( m, n, X, y, coef, &intercept, eta, max_iter, 1, 42 );
    test_end = clock();
    test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
    printf("Time taken: %f\n", test_time);

    // Compute y_hat and share of observations predicted as class 1
    printf("Making predictions...\n");
    for ( int i = 0; i < m; i++ )
    {
        y_pred[i] = sigmoid( n, alpha, &X[i*n], coef, beta, intercept );
    }

    printf("Printing results...\n");
    for ( int i = 0; i < m; i++ )
    {
        //printf("%f\n", y_pred[i]);
        if (y_pred[i] > 0.5)
        {
            pct_class_1 += 1.0;
        }
        // Troubleshooting print
        if (i < 10 || i > m - 10)
        {
            printf("%g\n", y_pred[i]);
        }
    }
    printf("Percent class 1: %f", pct_class_1 / (double)m);
    return 0;
}
For reference, here is my (presumably) equivalent Python code, which returns the correct percent of identified classes at more than 50,000 observations:
import numpy as np
import time

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class LogisticRegressor:
    def __init__(self, eta, init_runs, fit_intercept=True):
        self.eta = eta
        self.init_runs = init_runs
        self.fit_intercept = fit_intercept

    def fit(self, x, y):
        m, n = x.shape
        self.coef = np.zeros((n, 1))
        self.intercept = np.zeros((1, 1))
        for epoch in range(self.init_runs):
            for run in range(m):
                idx = np.random.randint(0, m)
                x_i = x[idx:idx+1, :]
                y_i = y[idx]
                y_pred_i = sigmoid(x_i.dot(self.coef) + self.intercept)
                gradient_w = -(x_i.T * (y_i - y_pred_i))
                self.coef -= self.eta * gradient_w
                if self.fit_intercept:
                    gradient_b = -(y_i - y_pred_i)
                    self.intercept -= self.eta * gradient_b

    def predict_proba(self, x):
        m, n = x.shape
        y_pred = np.ones((m, 2))
        y_pred[:,1:2] = sigmoid(x.dot(self.coef) + self.intercept)
        y_pred[:,0:1] -= y_pred[:,1:2]
        return y_pred

    def predict(self, x):
        return np.round(sigmoid(x.dot(self.coef) + self.intercept))

m = 50000
n = 150
class1 = int(3.0 / 4.0 * m)

X = np.random.rand(m, n)
y = np.zeros((m, 1))
for obs in range(m):
    if obs < class1:
        continue
    else:
        y[obs,0] = 1

for obs in range(m):
    if obs < class1:
        X[obs, 0] = 1.0 / float(m)
    else:
        X[obs, 0] = float(obs) / float(m)
    X[obs, 1] = float(obs) / float(m)

logit = LogisticRegressor(0.05, 20)
start_time = time.time()
logit.fit(X, y)
end_time = time.time()
print(round(end_time - start_time, 2))
y_pred = logit.predict(X)
print("Percent:", y_pred.sum() / len(y_pred))
The issue is here:
// Randomly sample an observation
idx = rand() % m;
... in light of the fact that the OP's RAND_MAX is 32767. This is exacerbated by the fact that all of the class 1 observations are at the end.
All samples will be drawn from the first 32768 observations, and when the total number of observations is greater than that, the proportion of class 1 observations among those that can be sampled is less than 0.25. At 43691 total observations, there are no class 1 observations at all among those that can be sampled.
As a secondary issue, rand() % m does not yield a wholly uniform distribution if m does not evenly divide RAND_MAX + 1, though the effect of this issue will be much more subtle.
Bottom line: you need a better random number generator.
At minimum, you could consider combining the bits from two calls to rand() to yield an integer with sufficient range, but you might want to consider getting a third-party generator. There are several available.
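For example (a sketch, not a drop-in fix; it assumes rand() yields at least 15 usable bits per call, i.e. RAND_MAX >= 32767):

// Combine two 15-bit draws into one 30-bit value before reducing modulo m.
// Still slightly biased, but every index in [0, m) is now reachable.
int idx = (int)(((((unsigned)rand() & 0x7FFF) << 15) | ((unsigned)rand() & 0x7FFF)) % (unsigned)m);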
Note: OP reports "m=50,000 observations with n=150 features.", so perhaps this is not the issue for OP, but I'll leave this answer up for reference when OP tries larger tasks.
A potential issue: long overflow.
m * n * sizeof(double) risks overflow when long is 32-bit and m*n > LONG_MAX (or about 46,341 if m, n are the same).
OP does report long m = 50000; and long n = 150;, so m*n is only 7,500,000 here and not yet at risk.
A first step is to perform the multiplication using size_t math where we gain at least 1 more bit in the calculation.
// m * n * sizeof(double)
sizeof(double) * m * n
Yet unless OP's size_t is more than 32-bit, we still have trouble.
In any case, I recommend using size_t for array sizing and indexing.
Check allocations for failure too.
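A sketch of both suggestions combined (switching m and n to size_t, as recommended; variable names follow the question's code):

size_t m = 50000, n = 150;
double *X = malloc(sizeof(double) * m * n); // size_t math from the first operand on
if (X == NULL)
{
    fprintf(stderr, "Allocation failed\n");
    exit(EXIT_FAILURE);
}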
Since RAND_MAX may be too small and array indexing should be done using size_t math, consider a helper function to generate a random index over the entire size_t range.
// idx = rand() % m;
size_t idx = rand_size_t() % (size_t)m;
If stuck with the standard rand(), below is a helper function to extend its range as needed.
It uses the real nifty IMAX_BITS(m).
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>

// https://stackoverflow.com/a/4589384/2410359
/* Number of bits in inttype_MAX, or in any (1<<k)-1 where 0 <= k < 2040 */
#define IMAX_BITS(m) ((m)/((m)%255+1) / 255%255*8 + 7-86/((m)%255+12))

// Test that RAND_MAX is a power of 2 minus 1
_Static_assert((RAND_MAX & 1) && ((RAND_MAX/2 + 1) & (RAND_MAX/2)) == 0, "RAND_MAX is not a Mersenne number");

#define RAND_MAX_WIDTH (IMAX_BITS(RAND_MAX))
#define SIZE_MAX_WIDTH (IMAX_BITS(SIZE_MAX))

size_t rand_size_t(void) {
    size_t index = (size_t) rand();
    for (unsigned i = RAND_MAX_WIDTH; i < SIZE_MAX_WIDTH; i += RAND_MAX_WIDTH) {
        index <<= RAND_MAX_WIDTH;
        index ^= (size_t) rand();
    }
    return index;
}
As a further refinement, the rand_size_t() % (size_t)m can be replaced with a fully uniform distribution over [0, m).
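For example, a minimal rejection-sampling sketch on top of rand_size_t() (my addition, assuming m > 0):

// Return a uniformly distributed index in [0, m) by discarding draws
// from the biased tail of the generator's range.
size_t rand_index(size_t m) {
    size_t limit = SIZE_MAX - SIZE_MAX % m; // largest multiple of m
    size_t r;
    do {
        r = rand_size_t();
    } while (r >= limit);
    return r % m;
}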
As has been determined elsewhere, the problem is due to the implementation's RAND_MAX value being too small.
Assuming 32-bit ints, a slightly better PRNG function can be implemented in the code, such as this C implementation of the minstd_rand() function from C++:
#define MINSTD_RAND_MAX 2147483646

// Code assumes `int` is at least 32 bits wide.

static unsigned int minstd_seed = 1;

static void minstd_srand(unsigned int seed)
{
    seed %= 2147483647;
    // zero seed is bad!
    minstd_seed = seed ? seed : 1;
}

static int minstd_rand(void)
{
    minstd_seed = (unsigned long long)minstd_seed * 48271 % 2147483647;
    return (int)minstd_seed;
}
Another problem is that expressions of the form rand() % m produce a biased result when m does not divide (unsigned int)RAND_MAX + 1. Here is an unbiased function that returns a random integer from 0 to le inclusive, making use of the minstd_rand() function defined earlier:
static int minstd_rand_max(int le)
{
    int r;
    if (le < 0)
    {
        r = le;
    }
    else if (le >= MINSTD_RAND_MAX)
    {
        r = minstd_rand();
    }
    else
    {
        int rm = MINSTD_RAND_MAX - le + MINSTD_RAND_MAX % (le + 1);
        while ((r = minstd_rand()) > rm)
        {
        }
        r /= (rm / (le + 1) + 1);
    }
    return r;
}
(Actually, it does still have a very small bias because minstd_rand() will never return 0.)
For example, replace rand() % 100 with minstd_rand_max(99), and replace rand() % m with minstd_rand_max(m - 1). Also replace srand(random_seed) with minstd_srand(random_seed).

OpenCL kernel to calculate Pi does not produce the correct value

Good day,
I have an OpenCL kernel that uses the Leibniz formula to calculate pi. Currently my issue is that the value I get back isn't pi, but just 4.
__kernel void calculatePi(int numIterations, __global float *outputPi,
                          __local float* local_result, int numWorkers)
{
    __private const uint gid = get_global_id(0);
    __private const uint lid = get_local_id(0);
    __private const uint offset = numIterations*gid*2;
    __private float sum = 0.0f;

    // Have the first worker initialize local_result
    if (gid == 0)
    {
        for (int i = 0; i < numWorkers; i++)
        {
            local_result[i] = 0.0f;
        }
    }
    // Have all workers wait until this is completed
    barrier(CLK_GLOBAL_MEM_FENCE);

    // Have each worker calculate their portion of pi
    // This is a private value
    for (int i = 0; i < numIterations; i++)
    {
        if (i % 2 == 0)
        {
            sum += 1 / (1 + 2*i + offset);
        }
        else
        {
            sum -= 1 / (1 + 2*i + offset);
        }
    }

    // Have each worker move their value to the appropriate
    // local_result slot so that the first worker can see it
    // when reducing next
    local_result[gid] = sum;

    // Make sure all workers complete this task before continuing
    barrier(CLK_LOCAL_MEM_FENCE);

    // Have the first worker add up all of the other worker's values
    // to get the final value
    if (lid == 0)
    {
        outputPi[0] = 0;
        for (int i = 0; i < numWorkers; i++)
        {
            outputPi[0] += local_result[i];
        }
        outputPi[0] *= 4;
    }
}
I've steered all of my inputs to my output to verify that they are what I expect. numIterations is 16 and numWorkers is also 16.
When sum is calculated for the first worker, I would expect it to be
1 - 1/3 + 1/5 - 1/7 + 1/9 - 1/11 + 1/13 - 1/15 + 1/17 - 1/19 + 1/21 - 1/23 + 1/25 - 1/27 + 1/29 - 1/31
Summing the first 16 terms with this calculator, I expect the result to be around 3.2: https://scratch.mit.edu/projects/19546118/
If I modify my last bit of code to be this so that I can look at a worker's calculated value of "sum":
// Have the first worker add up all of the other worker's values
// to get the final value
if (lid == 0)
{
    outputPi[0] = sum * 4;
}
Then the value returned for the first worker is 4 instead of the expected 3.2.
If I change the check to any worker other than lid == 0, all other workers report their sum as 0. So my question is: why is that the calculated value? Am I doing something wrong with my sum variable? It should be a private variable, and as I understand it the for loop runs sequentially within each worker, with many such loops executing in parallel across workers.
Here's a link to my github that has the kernel and main code uploaded.
https://github.com/TreverWagenhals/TreverWagenhals/tree/master/School/Heterogeneous%20Computing/Lab2
Thanks
You are performing integer divisions in your code; they should be floating-point:

if (i % 2 == 0)
{
    sum += 1. / (1 + 2*i + offset); // notice the 1.
}
else
{
    sum -= 1. / (1 + 2*i + offset);
}

This also explains the value you observed: with integer division, every term truncates to 0 except the very first one on the first worker (i == 0 with offset == 0 gives 1/1 == 1), so the reduction produces exactly 1 * 4 = 4.

Shared memory accessing garbage values in CUDA

I am trying to implement a Navier-Stokes solver in 2D using CUDA, with Jacobi's method to solve the system of difference equations. I am dividing the domain into 4x4 blocks of 16x16 threads each. Since every inner point in my matrix (of dimension 64x64) requires its top, bottom, left, and right elements to compute its new value, I create a new 18x18 shared matrix for every block. I read all the values into the matrix in this fashion: the thread with indices (0, 0) writes its value into element (1, 1) of the shared matrix, and also attempts to read the element above it and the one to its left if this access does not exceed the boundary. Once this read is done, I update the values of all the internal points and then write them back into memory.
I end up getting garbage values in the matrix pn, even though all the values are initialized correctly. I honestly cannot see where I'm going wrong. Can someone help me with this?
My kernel -
__global__ void red_psi (float *psi_o, float *psi_n, float *e, float *omega, float l1)
{
    // m = n = 64
    int i1 = blockIdx.x;
    int j1 = blockIdx.y;
    int i2 = threadIdx.x;
    int j2 = threadIdx.y;
    int i = (i1 * blockDim.x) + i2; // Actual row of the element
    int j = (j1 * blockDim.y) + j2; // Actual column of the element
    int l = i * n + j;

    // e_XX --> variables refer to expanded shared memory locations in order to accommodate halo elements
    // Current local ID with radius offset.
    int e_li = i2 + 1;
    int e_lj = j2 + 1;

    // Variables pointing at top and bottom neighbouring locations
    int e_li_prev = e_li - 1;
    int e_li_next = e_li + 1;

    // Variables pointing at left and right neighbouring locations
    int e_lj_prev = e_lj - 1;
    int e_lj_next = e_lj + 1;

    __shared__ float po[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    __shared__ float pn[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    __shared__ float oo[BLOCK_SIZE + 2][BLOCK_SIZE + 2];
    //__shared__ float ee[BLOCK_SIZE + 2][BLOCK_SIZE + 2];

    if (i2 < 1) // copy top and bottom halo
    {
        //Copy Top Halo Element
        if (blockIdx.y > 0) // Boundary check
        {
            po[i2][e_lj] = psi_o[l - n];
            //pn[i2][e_lj] = psi_n[l - n];
            oo[i2][e_lj] = omega[l - n];
            //printf ("i_pn[%d][%d] = %f\n", i2, e_lj, oo[i2][e_lj]);
        }
        //Copy Bottom Halo Element
        if (blockIdx.y < (gridDim.y - 1)) // Boundary check
        {
            po[1 + BLOCK_SIZE][e_lj] = psi_o[l + n];
            //pn[1 + BLOCK_SIZE][e_lj] = psi_n[l + n];
            oo[1 + BLOCK_SIZE][e_lj] = omega[l + n];
            //printf ("j_pn[%d][%d] = %f\n", 1 + BLOCK_SIZE, e_lj, oo[1 + BLOCK_SIZE][e_lj]);
        }
    }
    if (j2 < 1) // copy left and right halo
    {
        if (blockIdx.x > 0) // Boundary check
        {
            po[e_li][j2] = psi_o[l - 1];
            //pn[e_li][j2] = psi_n[l - 1];
            oo[e_li][j2] = omega[l - 1];
            //printf ("k_pn[%d][%d] = %f\n", e_li, j2, oo[e_li][j2]);
        }
        if (blockIdx.x < (gridDim.x - 1)) // Boundary check
        {
            po[e_li][1 + BLOCK_SIZE] = psi_o[l + 1];
            //pn[e_li][1 + BLOCK_SIZE] = psi_n[l + 1];
            oo[e_li][1 + BLOCK_SIZE] = omega[l + 1];
            //printf ("l_pn[%d][%d] = %f\n", e_li, 1 + BLOCK_SIZE, oo[e_li][BLOCK_SIZE + 1]);
        }
    }

    // copy current location
    po[e_li][e_lj] = psi_o[l];
    //pn[e_li][e_lj] = psi_n[l];
    oo[e_li][e_lj] = omega[l];
    //printf ("o_pn[%d][%d] = %f\n", e_li, e_lj, oo[e_li][e_lj]);

    __syncthreads ();

    // Checking whether we have an internal point.
    if ((i >= 1 && i < (m - 1)) && (j >= 1 && j < (n - 1)))
    {
        //printf ("Calculating for - (%d, %d)\n", i, j);
        pn[e_li][e_lj] = 0.25 * (po[e_li_next][e_lj] + po[e_li_prev][e_lj] + po[e_li][e_lj_next] + po[e_li][e_lj_prev] + h*h*oo[e_li][e_lj]);
        //printf ("n_pn[%d][%d] (%d, %d), a(%d, %d) = %f\n", e_li_prev, e_lj, i1, j1, i, j, po[e_li_prev][e_lj]);
        pn[e_li][e_lj] = po[e_li][e_lj] + 1.0 * (pn[e_li][e_lj] - po[e_li][e_lj]);
        __syncthreads ();
        psi_n[l] = pn[e_li][e_lj];
        e[l] = po[e_li][e_lj] - pn[e_li][e_lj];
    }
}
This is how I invoke the kernel -
dim3 threadsPerBlock (4, 4);
dim3 numBlocks (4, 4);
red_psi<<<numBlocks, threadsPerBlock>>> (d_xn, d_xx, d_e, d_w, l1);
(d_xx, d_xn, d_e, d_w are all float arrays of size 4096)
The problem was that I had switched blockDim.x and blockDim.y when copying the top/bottom and the left/right halo elements.
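For reference, a common convention that helps avoid this kind of mix-up (my sketch, not the exact fix above): let threadIdx.x/blockIdx.x select the column, which varies fastest in row-major storage, and threadIdx.y/blockIdx.y select the row, then derive all global and halo indices from those two.

// Hypothetical index setup; m, n are the matrix dimensions as in the question.
int j = blockIdx.x * blockDim.x + threadIdx.x; // column
int i = blockIdx.y * blockDim.y + threadIdx.y; // row
int l = i * n + j;                             // row-major linear index
// Neighbours: top = l - n, bottom = l + n, left = l - 1, right = l + 1.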

The outermost for loop does not work as intended

I have been using Ubuntu 12.04 LTS with GCC to compile the code for my assignments for a while. However, recently I have run into two issues as follows:
1. The following code calculates zero for a nonzero value when the second formula is used.
2. There is a large amount of error in the calculation of the integral of the standard normal distribution from 0 to 5 or more standard deviations.
How can I remedy these issues? I am especially concerned with the first one. Any help or suggestion is appreciated. Thanks in advance.
The code is as follows:
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <stdlib.h>
#define N 599

long double
factorial(long double n)
{
    //Here s is the free parameter which is increased by one in each step and
    //pro is the initial product and by setting pro to be 0 we also cover the
    //case of zero factorial.
    int s = 1;
    long double pro = 1;
    //Here pro stands for product.
    if (n < 0)
        printf("Factorial is not defined for a negative number \n");
    else {
        while (n >= s) {
            pro *= s;
            s++;
        }
        return pro;
    }
}
int main()
{
    // Since the function given is the standard normal distribution
    // probability density function we have mean = 0 and variance = 1.
    // Hence we also have z = x; while dealing with only positive values of
    // x and keeping in mind that the PDF is symmetric around the mean.
    long double * summand1 = malloc(N * sizeof(long double));
    long double * summand2 = malloc(N * sizeof(long double));
    int p = 0, k, z[5] = {0, 3, 5, 10, 20};
    long double sum1[5] = {0}, sum2[5] = {0}, factor = 1.0;

    for (p = 0; p <= 4; p++)
    {
        for (k = 0; k <= N; k++)
        {
            summand1[k] = (1 / sqrtl(M_PI * 2)) * powl(-1, k) * powl(z[p], 2 * k + 1) / (factorial(k) * (2 * k + 1) * powl(2, k));
            sum1[p] += summand1[k];
        }
        //Wolframalpha site gives the same value here
        for (k = 0; k <= N; k++)
        {
            factor *= (2 * k + 1);
            summand2[k] = ((1 / sqrtl(M_PI * 2)) * powl(z[p], 2 * k + 1) / factor);
            //printf("%Le \n", factor);
            sum2[p] += summand2[k];
        }
        sum2[p] = sum2[p] * expl((-powl(z[p], 2)) / 2);
    }
    for (p = 0; p < 4; p++)
    {
        printf("The sum obtained for z between %d - %d \
\nusing the first formula is %Lf \n", z[p], z[p+1], sum1[p+1]);
        printf("The sum obtained for z between %d - %d \
\nusing the second formula is %Lf \n", z[p], z[p+1], sum2[p+1]);
    }
    return 0;
}
The working code without the outermost for loop is
#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <stdlib.h>
#define N 1200

long double
factorial(long double n)
{
    //Here s is the free parameter which is increased by one in each step and
    //pro is the initial product and by setting pro to be 0 we also cover the
    //case of zero factorial.
    int s = 1;
    long double pro = 1;
    //Here pro stands for product.
    if (n < 0)
        printf("Factorial is not defined for a negative number \n");
    else {
        while (n >= s) {
            pro *= s;
            s++;
        }
        return pro;
    }
}

int main()
{
    // Since the function given is the standard normal distribution
    // probability density function we have mean = 0 and variance = 1.
    // Hence we also have z = x; while dealing with only positive values of
    // x and keeping in mind that the PDF is symmetric around the mean.
    long double * summand1 = malloc(N * sizeof(long double));
    long double * summand2 = malloc(N * sizeof(long double));
    int k, z = 3;
    long double sum1 = 0, sum2 = 0, pro = 1.0;

    for (k = 0; k <= N; k++)
    {
        summand1[k] = (1 / sqrtl(M_PI * 2)) * powl(-1, k) * powl(z, 2 * k + 1) / (factorial(k) * (2 * k + 1) * powl(2, k));
        sum1 += summand1[k];
    }
    //Wolframalpha site gives the same value here
    printf("The sum obtained for z between 0-3 using the first formula is %Lf \n", sum1);

    for (k = 0; k <= N; k++)
    {
        pro *= (2 * k + 1);
        summand2[k] = ((1 / sqrtl(M_PI * 2) * powl(z, 2 * k + 1) / pro));
        //printf("%Le \n", pro);
        sum2 += summand2[k];
    }
    sum2 = sum2 * expl((-powl(z,2)) / 2);
    printf("The sum obtained for z between 0-3 using the second formula is %Lf \n", sum2);
    return 0;
}
I'm quite certain that the problem is in factor not being set back to 1 in the outer loop:
factor *= (2 * k + 1); (in the loop that calculates sum2.)
In the second version provided (the one that works), it starts with z = 3. In the first version, however, since you do not clear factor between iterations on p, by the time you reach z[2] it is already a huge number.
EDIT: Possible help with precision.
Basically you have a huge number, powl(z[p], 2 * k + 1), divided by another huge number, factor. Huge floating-point numbers lose their precision. The way to avoid that is to perform the division as soon as possible. Instead of first calculating powl(z[p], 2 * k + 1) and then dividing by factor:
(z[p] * z[p] * ... * z[p]) / (1 * 3 * 5 * ... * (2*k+1))
rearrange the calculation:
(z[p]/1) * (z[p]^2/3) * (z[p]^2/5) * ... * (z[p]^2/(2*k+1))
You can do this in the summand2 calculation, and a similar trick applies to summand1.
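A minimal sketch of that rearrangement for summand2 (my illustration; variable names follow the question's code, with term holding the running product, and the final multiplication by expl(-powl(z[p], 2) / 2) staying as in the original):

long double term = (1 / sqrtl(M_PI * 2)) * z[p]; // k = 0 term: z/1 times the constant
sum2[p] = term;
for (k = 1; k <= N; k++)
{
    // Fold in one more factor z^2/(2k+1) instead of building a huge
    // numerator and a huge denominator separately.
    term *= (long double)z[p] * z[p] / (2 * k + 1);
    sum2[p] += term;
}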

Inverse filtering on OpenCV - accessing DFT values and multiplying DFT matrices

I am trying to perform inverse and pseudo-inverse filtering in the frequency domain.
However, I am having trouble accessing the DFT coefficients and multiplying the DFT matrices afterwards, since I get complex numbers and, therefore, actually two matrices...
Basically, inverse filtering performs
F = G / H,
where F is the restored image, G is the blurred image, and H is the kernel that blurred the image.
The pseudo-inverse needs to access the values in H, since values near 0 should be replaced in order to avoid problems in the restoration. For this we must change H so that:
H(u,v) = 1/H(u,v) if H(u,v) > threshold
H(u,v) = 0 otherwise
(For complex H, the reciprocal is 1/H = conj(H) / |H|^2, which is what the per-pixel loop below computes.)
I have a kernel1 (h_1), and the images imf (restored) and img (blurred). Here is the code:
// compute the DFTs of the kernel (DFT_B) and the blurred image (DFT_A)
cvDFT( dft_A, dft_A, CV_DXT_FORWARD, complexInput1->height );
cvDFT( dft_B, dft_B, CV_DXT_FORWARD, complexInput2->height );

// the first type is the inverse filtering
if (type == 1) {
    printf("...performing inverse filtering\n");
    // dividing the transforms
    cvDiv(dft_A, dft_B, dft_C, 1);
}
// the second type is the pseudo-inverse filtering
else {
    printf("...prepare kernel for pseudo-inverse filtering\n");
    // will try to access the real values in order to see if value is above a threshold
    cvSplit( dft_B, image_Re1, image_Im1, 0, 0 );

    // pointers to access the data into the real and imaginary matrices
    uchar * dRe1 = (uchar *)image_Re1->imageData;
    uchar * dIm1 = (uchar *)image_Im1->imageData;
    int width = image_Re1->width;
    int height = image_Re1->height;
    int step = image_Re1->widthStep;

    image_Re2 = cvCreateImage(cvGetSize(image_Re1), IPL_DEPTH_32F, 1);
    image_Im2 = cvCreateImage(cvGetSize(image_Im2), IPL_DEPTH_32F, 1);

    // pointers to access the data into the real and imaginary matrices
    // it will be the resulting pseudo-inverse filter
    uchar * dRe2 = (uchar *)image_Re2->imageData;
    uchar * dIm2 = (uchar *)image_Im2->imageData;

    printf("...building kernel for pseudo-inverse filtering\n");
    for ( i = 0; i < height; i++ ) {
        for ( j = 0; j < width; j++ ) {
            // generate the 1/H(i,j) value
            if (dRe1[i * step + j] > threshold) {
                float realsq = dRe1[i * step + j]*dRe1[i * step + j];
                float imagsq = dIm1[i * step + j]*dIm1[i * step + j];
                dRe2[i * step + j] = dRe1[i * step + j] / (realsq + imagsq);
                dIm2[i * step + j] = -1 * (dIm1[i * step + j] / (realsq + imagsq));
            }
            else {
                dRe2[i * step + j] = 0;
                dIm2[i * step + j] = 0;
            }
        }
    }

    printf("...merging final kernel\n");
    cvMerge(image_Re2, image_Im2, 0, 0, dft_B);
    printf("...performing pseudo-inverse filtering\n");
    cvMulSpectrums(dft_A, dft_B, dft_C, 1);
}

printf("...performing IDFT\n");
cvDFT(dft_C, dft_H, CV_DXT_INV_SCALE, 1);

printf("...getting size\n");
cvGetSubRect(dft_H, &tmp3, cvRect(0, 0, img->width, img->height));
printf("......(%d, %d) - (%d, %d)\n", tmp3.cols, tmp3.rows, restored->width, restored->height);

cvSplit( &tmp3, image_Re1, image_Im1, 0, 0 );

cvNamedWindow("re", 0);
cvShowImage("re", image_Re2);
cvWaitKey(0);

printf("...copying final image\n");
// error is in the line below
cvCopy(image_Re1, imf, NULL);
I get an error on the last line: OpenCV Error: Assertion failed (src.depth() == dst.depth() && src.size() == dst.size()) in cvCopy, file /build/buildd/opencv-2.1.0/src/cxcore/cxcopy.cpp, line 466.
I know it has to do with the size or depth, but I don't know how to control those. Also, when I try to show image_Re1, it is empty...
Can anyone shed some light on this?
Seems like you didn't initialize your imf picture! cvCopy needs an initialized matrix, so do:
IplImage* imf = cvCreateImage(cvGetSize(image_Re1), IPL_DEPTH_32F, 1);
first, and I think it'll work.
Also, you don't free the image space in this code (cvReleaseImage(&image))
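For completeness, a sketch of that cleanup (which images to release depends on the rest of the program):

// Release every IplImage created with cvCreateImage once it is no longer needed.
cvReleaseImage(&image_Re2);
cvReleaseImage(&image_Im2);
cvReleaseImage(&imf);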
