I am trying to calculate eigenvalues using the TQLI algorithm that I got from the website of the CACS of the University of Southern California. My test script looks like this:
#include <stdio.h>
/* Test driver from the question: builds a diagonal, an off-diagonal and an
   identity matrix with 0-based float storage and hands them to tqli(). */
int main()
{
int i;
/* NOTE(review): rand() is declared in <stdlib.h>; only <stdio.h> is included above. */
i = rand();
printf("My random number: %d\n", i);
/* NOTE(review): d and e are float arrays with a nested brace initializer.
   The tqli() defined later in this file takes double d[] and double e[] and
   indexes them 1..n, so a 4-element 0-based array is too short for n = 4. */
float d[4] = {
{1, 2, 3, 4}
};
float e[4] = {
{0, 0, 0, 0}
};
float z[4][4] = {
{1.0, 0.0, 0.0, 0.0} ,
{0.0, 1.0, 0.0, 0.0} ,
{0.0, 0.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 1.0}
};
/* NOTE(review): a double* is pointed at float storage here, and tqli() expects
   a double** (array of row pointers), not a flat pointer to the first element. */
double *zptr;
zptr = &z[0][0];
printf("Element [2][1] of identity matrix: %f\n", z[2][1]);
printf("Element [2][2] of identity matrix: %f\n", z[2][2]);
tqli(d, e, 4, zptr);
/* d[0] is never written by a 1-based routine; the first eigenvalue would be d[1]. */
printf("First eigenvalue: %f\n", d[0]);
return 0;
}
When I try to run this script I get a segmentation fault. At what location does my code produce this segmentation fault? Since I believe the code from USC is bug-free, I am fairly sure the mistake must be in my call of the function. However, I can't see where I made a mistake in setting up the arrays since, in my opinion, I followed the instructions.
Eigenvalue calculation using TQLI algorithm fails with segmentation
fault
Segmentation fault comes from crossing the supplied array boundary. tqli requires specific data preparation.
1) The eigen code from CACS is Fortran based and counts indexes from 1.
2) The tqli expects double pointer for its matrix and double vectors.
/******************************************************************************/
void tqli(double d[], double e[], int n, double **z)
/*******************************************************************************
d, and e should be declared as double.
3) The program needs to be modified with respect to how the data is prepared for the above function.
Helper 1-index based vectors have to be created to supply properly formatted data for the tqli:
/* Data preparation for tred2()/tqli(): the input lives in an ordinary 0-based
   C array and is copied into 1-based helper storage before the call. */
double z[NP][NP] = { {2, 0, 0}, {0, 4, 0}, {0, 0, 2} } ;
double **a;
double *d,*e,*f;
d=dvector(1,NP); // 1-index based vector
e=dvector(1,NP);
f=dvector(1,NP);
a=dmatrix(1,NP,1,NP); // 1-index based matrix
for (i=1;i<=NP;i++) // copy the zero-based `z` into the one-based `a`
for (j=1;j<=NP;j++) a[i][j]=z[i-1][j-1];
Complete test program is supplied below. It uses the eigen code from CACS:
/*******************************************************************************
Eigenvalue solvers, tred2 and tqli, from "Numerical Recipes in C" (Cambridge
Univ. Press) by W.H. Press, S.A. Teukolsky, W.T. Vetterling, and B.P. Flannery
*******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define NR_END 1
#define SIGN(a,b) ((b) >= 0.0 ? fabs(a) : -fabs(a))
double **dmatrix(int nrl, int nrh, int ncl, int nch)
/* Allocate a double matrix with subscript range m[nrl..nrh][ncl..nch].
   The cells are one contiguous allocation; the returned row-pointer table is
   offset so that m[nrl] is the first row and m[i][ncl] the first cell of row i.
   Returns NULL if either allocation fails (nothing is leaked in that case).
   To release: free(m[nrl] + ncl - NR_END); free(m + nrl - NR_END); */
{
	int i, nrow = nrh - nrl + 1, ncol = nch - ncl + 1;
	double **m;
	double *cells;

	/* allocate pointers to rows */
	m = malloc((size_t)(nrow + NR_END) * sizeof *m);
	if (m == NULL)
		return NULL;

	/* allocate the cells before shifting m, so a failure can free m directly */
	cells = malloc(((size_t)nrow * (size_t)ncol + NR_END) * sizeof *cells);
	if (cells == NULL) {
		free(m);
		return NULL;
	}

	/* shift the row table so that valid row subscripts start at nrl */
	m += NR_END;
	m -= nrl;

	/* first row pointer, shifted so that valid column subscripts start at ncl */
	m[nrl] = cells + NR_END - ncl;

	/* remaining rows follow each other ncol cells apart */
	for (i = nrl + 1; i <= nrh; i++)
		m[i] = m[i - 1] + ncol;

	/* return pointer to array of pointers to rows */
	return m;
}
double *dvector(int nl, int nh)
/* Allocate a double vector with subscript range v[nl..nh].
   Returns NULL if the allocation fails.
   To release: free(v + nl - NR_END); */
{
	double *v = malloc(((size_t)(nh - nl + 1) + NR_END) * sizeof *v);

	/* do not offset a null pointer: the caller could no longer detect the failure */
	if (v == NULL)
		return NULL;
	return v - nl + NR_END;
}
/******************************************************************************/
void tred2(double **a, int n, double d[], double e[])
/*******************************************************************************
Householder reduction of a real, symmetric matrix a[1..n][1..n].
On output, a is replaced by the orthogonal matrix Q effecting the
transformation. d[1..n] returns the diagonal elements of the tridiagonal matrix,
and e[1..n] the off-diagonal elements, with e[1]=0. Several statements, as noted
in comments, can be omitted if only eigenvalues are to be found, in which case a
contains no useful information on output. Otherwise they are to be included.
*******************************************************************************/
{
int l,k,j,i;
double scale,hh,h,g,f;
for (i=n;i>=2;i--) {
l=i-1;
h=scale=0.0;
if (l > 1) {
for (k=1;k<=l;k++)
scale += fabs(a[i][k]);
if (scale == 0.0) /* Skip transformation. */
e[i]=a[i][l];
else {
for (k=1;k<=l;k++) {
a[i][k] /= scale; /* Use scaled a's for transformation. */
h += a[i][k]*a[i][k]; /* Form sigma in h. */
}
f=a[i][l];
g=(f >= 0.0 ? -sqrt(h) : sqrt(h));
e[i]=scale*g;
h -= f*g; /* Now h is equation (11.2.4). */
a[i][l]=f-g; /* Store u in the ith row of a. */
f=0.0;
for (j=1;j<=l;j++) {
/* Next statement can be omitted if eigenvectors not wanted */
a[j][i]=a[i][j]/h; /* Store u/H in ith column of a. */
g=0.0; /* Form an element of A.u in g. */
for (k=1;k<=j;k++)
g += a[j][k]*a[i][k];
for (k=j+1;k<=l;k++)
g += a[k][j]*a[i][k];
e[j]=g/h; /* Form element of p in temporarily unused element of e. */
f += e[j]*a[i][j];
}
hh=f/(h+h); /* Form K, equation (11.2.11). */
for (j=1;j<=l;j++) { /* Form q and store in e overwriting p. */
f=a[i][j];
e[j]=g=e[j]-hh*f;
for (k=1;k<=j;k++) /* Reduce a, equation (11.2.13). */
a[j][k] -= (f*e[k]+g*a[i][k]);
}
}
} else
e[i]=a[i][l];
d[i]=h;
}
/* Next statement can be omitted if eigenvectors not wanted */
d[1]=0.0;
e[1]=0.0;
/* Contents of this loop can be omitted if eigenvectors not
wanted except for statement d[i]=a[i][i]; */
for (i=1;i<=n;i++) { /* Begin accumulation of transformation matrices. */
l=i-1;
if (d[i]) { /* This block skipped when i=1. */
for (j=1;j<=l;j++) {
g=0.0;
for (k=1;k<=l;k++) /* Use u and u/H stored in a to form P.Q. */
g += a[i][k]*a[k][j];
for (k=1;k<=l;k++)
a[k][j] -= g*a[k][i];
}
}
d[i]=a[i][i]; /* This statement remains. */
a[i][i]=1.0; /* Reset row and column of a to identity matrix for next iteration. */
for (j=1;j<=l;j++) a[j][i]=a[i][j]=0.0;
}
}
/******************************************************************************/
void tqli(double d[], double e[], int n, double **z)
/*******************************************************************************
QL algorithm with implicit shifts, to determine the eigenvalues and eigenvectors
of a real, symmetric, tridiagonal matrix, or of a real, symmetric matrix
previously reduced by tred2 sec. 11.2. On input, d[1..n] contains the diagonal
elements of the tridiagonal matrix. On output, it returns the eigenvalues. The
vector e[1..n] inputs the subdiagonal elements of the tridiagonal matrix, with
e[1] arbitrary. On output e is destroyed. When finding only the eigenvalues,
several lines may be omitted, as noted in the comments. If the eigenvectors of
a tridiagonal matrix are desired, the matrix z[1..n][1..n] is input as the
identity matrix. If the eigenvectors of a matrix that has been reduced by tred2
are required, then z is input as the matrix output by tred2. In either case,
the kth column of z returns the normalized eigenvector corresponding to d[k].

All subscripts used below run from 1 to n, and z is dereferenced as an array of
row pointers (z[k][i]); a flat 0-based array does not satisfy this layout.
*******************************************************************************/
{
double pythag(double a, double b);
int m,l,iter,i,k;
double s,r,p,g,f,dd,c,b;
for (i=2;i<=n;i++) e[i-1]=e[i]; /* Convenient to renumber the elements of e. */
e[n]=0.0;
for (l=1;l<=n;l++) {
iter=0;
do {
for (m=l;m<=n-1;m++) { /* Look for a single small subdiagonal element to split the matrix. */
dd=fabs(d[m])+fabs(d[m+1]);
/* e[m] counts as negligible when adding it to dd does not change dd. */
if ((double)(fabs(e[m])+dd) == dd) break;
}
if (m != l) {
/* NOTE(review): this only prints a message (without a trailing newline); it
   neither returns nor aborts, so iteration continues past the 30-step limit. */
if (iter++ == 30) printf("Too many iterations in tqli");
g=(d[l+1]-d[l])/(2.0*e[l]); /* Form shift. */
r=pythag(g,1.0);
g=d[m]-d[l]+e[l]/(g+SIGN(r,g)); /* This is dm - ks. */
s=c=1.0;
p=0.0;
for (i=m-1;i>=l;i--) { /* A plane rotation as in the original QL, followed by Givens */
f=s*e[i]; /* rotations to restore tridiagonal form. */
b=c*e[i];
e[i+1]=(r=pythag(f,g));
if (r == 0.0) { /* Recover from underflow. */
d[i+1] -= p;
e[m]=0.0;
break;
}
s=f/r;
c=g/r;
g=d[i+1]-p;
r=(d[i]-g)*s+2.0*c*b;
d[i+1]=g+(p=s*r);
g=c*r-b;
/* Next loop can be omitted if eigenvectors not wanted */
for (k=1;k<=n;k++) { /* Form eigenvectors. */
f=z[k][i+1];
z[k][i+1]=s*z[k][i]+c*f;
z[k][i]=c*z[k][i]-s*f;
}
}
/* The rotation loop was left early by the underflow break: skip the update
   below and go straight to the while condition. */
if (r == 0.0 && i >= l) continue;
d[l] -= p;
e[l]=g;
e[m]=0.0;
}
} while (m != l);
}
}
/******************************************************************************/
double pythag(double a, double b)
/*******************************************************************************
Computes (a2 + b2)1/2 without destructive underflow or overflow.
*******************************************************************************/
{
double absa,absb;
absa=fabs(a);
absb=fabs(b);
if (absa > absb) return absa*sqrt(1.0+(absb/absa)*(absb/absa));
else return (absb == 0.0 ? 0.0 : absb*sqrt(1.0+(absa/absb)*(absa/absb)));
}
#define NP 3
#define TINY 1.0e-6
/* A private definition of `double sqrt(double)` used to live here. It has been
   removed on purpose:
   - `sqrt` is a standard library function declared by <math.h> (included at
     the top of this program); redefining it is undefined behaviour.
   - The removed body applied a single-precision bit trick to a double through
     an `int` union member, so it only disturbed part of the representation and
     did not return a square root.
   tred2() and pythag() now use the library sqrt() from <math.h>. */
int main()
{
int i,j,k;
double ze[NP][NP] = { {2, 0, 0}, {0, 4, 0}, {0, 0, 2} } ;
double **a;
double *d,*e,*f;
d=dvector(1,NP);
e=dvector(1,NP);
f=dvector(1,NP);
a=dmatrix(1,NP,1,NP);
for (i=1;i<=NP;i++)
for (j=1;j<=NP;j++) a[i][j]=ze[i-1][j-1];
tred2(a,NP,d,e);
tqli(d,e,NP,a);
printf("\nEigenvectors for a real symmetric matrix:\n");
for (i=1;i<=NP;i++) {
for (j=1;j<=NP;j++) {
f[j]=0.0;
for (k=1;k<=NP;k++)
f[j] += (ze[j-1][k-1]*a[k][i]);
}
printf("%s %3d %s %10.6f\n","\neigenvalue",i," =",d[i]);
printf("%11s %14s %9s\n","vector","mtrx*vect.","ratio");
for (j=1;j<=NP;j++) {
if (fabs(a[j][i]) < TINY)
printf("%12.6f %12.6f %12s\n",
a[j][i],f[j],"div. by 0");
else
printf("%12.6f %12.6f %12.6f\n",
a[j][i],f[j],f[j]/a[j][i]);
}
}
//free_dmatrix(a,1,NP,1,NP);
//free_dvector(f,1,NP);
//free_dvector(e,1,NP);
//free_dvector(d,1,NP);
return 0;
}
Output:
Eigenvectors for a real symmetric matrix:
eigenvalue 1 = 2.000000
vector mtrx*vect. ratio
1.000000 2.000000 2.000000
0.000000 0.000000 div. by 0
0.000000 0.000000 div. by 0
eigenvalue 2 = 4.000000
vector mtrx*vect. ratio
0.000000 0.000000 div. by 0
1.000000 4.000000 4.000000
0.000000 0.000000 div. by 0
eigenvalue 3 = 2.000000
vector mtrx*vect. ratio
0.000000 0.000000 div. by 0
0.000000 0.000000 div. by 0
1.000000 2.000000 2.000000
I hope this finally helps to clarify the confusion regarding the data preparation for tqli.
Related
Using Intel MKL's mkl_sparse_d_mv function in our physics solver to perform a sparse matrix-vector multiplication yields a speedup of between -50% and +25% depending on the sparse matrix used in each case, compared with the auto-vectorised version, whose own performance is inversely correlated.
This is not consistent, as it would be desirable to have the same behaviour for (almost) any system configuration and in particular the team expects (sequential) MKL to be overperformant or at least equally performant against auto-vectorisation (using Intel compiler with -O3 on Intel hardware: Xeon Phi's a little bit constrained on memory speed).
Using as a metric the sparsity times the size of the problem shows that the highest mkl_d_sparse_mv performance is for 'the sparsest' system configurations, i.e. mkl performs better when this [*] is relatively small (image attached: ignore the legend, or know that it shows the cubic root of the problem's size).
[*] #rows x (# non-zero-values / # total values)
An example code is below and the question is if the current (i.e. such a variation against O3 auto-vec, or in general a 2x performance variation depending on the sparse matrix's properties: in particular its level of sparsity?) is part of the MKL expected behaviour or is there a not-so-well-known advice that has been missed.
#include <mkl.h>
#include <mkl_spblas.h>
#define ALIGNMENT 64
// An inherited structure in the current SOA-over-AOS paradigm :-/
// (a fixed triple of doubles)
typedef double TriDouble[3];
// Flag for building sparse matrix only once
// (coreFunction sets it to 1 after the first successful build)
static int alreadyBuilt = 0;
// Relevant matrices
// descrA is shared by both matrices; only its `type` field is set in coreFunction.
static struct matrix_descr descrA;
// Expected number of matrix-vector products, passed to the MKL hint call.
static MKL_INT EXPECTED_CALLS = (MKL_INT) 5000000;
static sparse_matrix_t csrA1;
static sparse_matrix_t csrA2;
// Sparse matrix sizes
// (each points to a single MKL_INT obtained from mkl_malloc in coreFunction;
//  m* hold row counts, k* hold column counts)
static MKL_INT *m_var;
static MKL_INT *k_var;
static MKL_INT *m2_var;
static MKL_INT *k2_var;
// Data for sparse matrix 1
// (CSR arrays: values, column indices, row-begin and row-end pointers)
static double *sparseMatrixElements;
static MKL_INT *sparseMatrixCols;
static MKL_INT *sparseMatrixRowsB;
static MKL_INT *sparseMatrixRowsE;
// Data for sparse matrix 2
// NOTE(review): only sparseMatrixElements is allocated in the code shown;
// the other arrays are presumably filled by the elided "Etc" part - confirm.
static double *sparseMatrixElements2;
static MKL_INT *sparseMatrixCols2;
static MKL_INT * sparseMatrixRowsB2;
static MKL_INT * sparseMatrixRowsE2;
struct problemInformation{
// It's not empty, just too complex for StackOverflow
};
// Builds the two CSR matrices on the first call (guarded by alreadyBuilt), then
// performs the two sparse matrix-vector products y = alpha*A*x + beta*y that
// are being benchmarked. `info` is the problem description the matrices are
// derived from. Any MKL setup failure prints a message and exits with status 1.
void coreFunction(problemInformation *info)
{
    if (alreadyBuilt == 0){
        // Allocate memory for columns, values, rows using mkl_malloc
        // e.g.
        // mkl_malloc()
        const int Nrows = 100000;
        const int NvalsPerRow = 16;
        sparseMatrixElements = (double*) mkl_malloc( sizeof(double) * Nrows * NvalsPerRow, ALIGNMENT);
        // Build the matrix using information
        int valueN=0;
        for (valueN=0; valueN<Nrows*NvalsPerRow; valueN++){
            double localVal = 0.0; // Compute it using input information
            sparseMatrixElements[valueN] = localVal;
            // Etc for rows, columns
        }
        const int Ncols = 30000; // Compute it using input information
        m_var = (MKL_INT*) mkl_malloc(sizeof(MKL_INT), ALIGNMENT);
        k_var = (MKL_INT*) mkl_malloc(sizeof(MKL_INT), ALIGNMENT);
        m2_var = (MKL_INT*) mkl_malloc(sizeof(MKL_INT), ALIGNMENT);
        k2_var = (MKL_INT*) mkl_malloc(sizeof(MKL_INT), ALIGNMENT);
        *m_var = (MKL_INT) Nrows;
        *k_var = (MKL_INT) Ncols;
        *m2_var = (MKL_INT) (Nrows / 3);
        *k2_var = (MKL_INT) (Ncols / 3);
        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
        // Create matrix 1
        sparse_status_t result = mkl_sparse_d_create_csr(&csrA1, SPARSE_INDEX_BASE_ZERO, *m_var, *k_var, sparseMatrixRowsB, sparseMatrixRowsE, sparseMatrixCols, sparseMatrixElements);
        if (result != SPARSE_STATUS_SUCCESS) {printf("ERROR IN CREATING MATRIX A1"); fflush(NULL); exit(1);}
        // Create matrix 2
        result = mkl_sparse_d_create_csr(&csrA2, SPARSE_INDEX_BASE_ZERO, *m2_var, *k2_var, sparseMatrixRowsB2, sparseMatrixRowsE2, sparseMatrixCols2, sparseMatrixElements2);
        if (result != SPARSE_STATUS_SUCCESS) {printf("ERROR IN CREATING MATRIX A2"); fflush(NULL); exit(1);}
        // Set memory hint: how many times will it be used for matrix-vector multiplication.
        // The mv hint is the one that matches the mkl_sparse_d_mv calls below; the
        // symgs hint used previously describes symmetric Gauss-Seidel sweeps instead.
        result = mkl_sparse_set_mv_hint(csrA1, SPARSE_OPERATION_NON_TRANSPOSE, descrA, EXPECTED_CALLS);
        if (result != SPARSE_STATUS_SUCCESS) {printf("ERROR IN SETTING MEMORY HINT FOR MATRIX A1"); fflush(NULL); exit(1);}
        result = mkl_sparse_set_mv_hint(csrA2, SPARSE_OPERATION_NON_TRANSPOSE, descrA, EXPECTED_CALLS);
        if (result != SPARSE_STATUS_SUCCESS) {printf("ERROR IN SETTING MEMORY HINT FOR MATRIX A2"); fflush(NULL); exit(1);}
        // Call mkl_sparse_optimize: should we sort CSR column indices or is that included???
        result = mkl_sparse_optimize(csrA1);
        if (result != SPARSE_STATUS_SUCCESS) {printf("ERROR IN MATRIX A1 OPTIMIZATION"); fflush(NULL); exit(1);}
        result = mkl_sparse_optimize(csrA2);
        if (result != SPARSE_STATUS_SUCCESS) {printf("ERROR IN MATRIX A2 OPTIMIZATION"); fflush(NULL); exit(1);}
        alreadyBuilt = 1;
    }
    // y = alpha*A*x + beta*y with alpha = 1, beta = 0, i.e. plain y = A*x.
    // (The second constant was previously a duplicate declaration of `alpha`,
    // leaving the `beta` used below undeclared.)
    const double alpha=1.0;
    const double beta=0.0;
    const int OFFSET1=0; // Computed from input information
    const int OFFSET2=0; // Computed from input information
    const int OFFSET3=0; // Computed from input information
    const int OFFSET4=0; // Computed from input information
    // Observation 1: copying the input variables "u", "T", "grad_u", "grad_T" into a mkl_malloc allocated
    // chunk of 64-aligned memory decreases performance a little bit
    // Clock this
    mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, csrA1, descrA, &u[OFFSET1][OFFSET2], beta, &grad_u[OFFSET3][0][0]);
    mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, alpha, csrA2, descrA, &T[OFFSET4], beta, &grad_T[OFFSET3][0]);
    // Also clock the old version
    //
    // Old version goes here...
    // Print the results of the timer
}
// Benchmark driver: prepares the problem data and calls coreFunction EPOCHS times.
int main (void)
{
    // Start from a defined value instead of passing indeterminate pointers.
    // NOTE(review): build_rhs/build_lhs are not shown; if they take the pointer
    // by value the buffers they allocate never reach this function - confirm.
    TriDouble *rhs = NULL;
    TriDouble *lhs = NULL;
    build_rhs(rhs); // Call a C++ function that allocates memory using
                    // new TriDouble[desired_length]
    build_lhs(lhs); // idem
    problemInformation info; // Initialize the problem's information parsing the command line arguments etc.
    const int EPOCHS = 10000;
    int i=0;
    for (; i< EPOCHS; i++){
        coreFunction(&info); // coreFunction takes a pointer; call it 10000 times
    }
    return 0;
}
The previous code is an excellent minimal example of our current setting, but it does not compile as it lacks several relevant declarations.
A working example using mkl_sparse_d_mv is as follows from the official tutorial, albeit it has not been here tested in performance against automatic vectorisation using the relevant matrices.
/*******************************************************************************
* Copyright 2013-2018 Intel Corporation.
*
* This software and the related documents are Intel copyrighted materials, and
* your use of them is governed by the express license under which they were
* provided to you (License). Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute, disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents are provided as is, with no express
* or implied warranties, other than those that are expressly stated in the
* License.
*******************************************************************************/
/*
* Content : Intel(R) MKL IE Sparse BLAS C example for mkl_sparse_d_mv
*
********************************************************************************
*
* Consider the matrix A (see 'Sparse Storage Formats for Sparse BLAS Level 2
* and Level 3 in the Intel(R) MKL Reference Manual')
*
* | 1 -1 0 -3 0 |
* | -2 5 0 0 0 |
* A = | 0 0 4 6 4 |,
* | -4 0 2 7 0 |
* | 0 8 0 0 -5 |
*
* The matrix A is represented in a zero-based compressed sparse row (CSR) storage
* scheme with three arrays (see 'Sparse Matrix Storage Schemes' in the
* Intel(R) MKL Reference Manual) as follows:
*
* values = ( 1 -1 -3 -2 5 4 6 4 -4 2 7 8 -5 )
* columns = ( 0 1 3 0 1 2 3 4 0 2 3 1 4 )
* rowIndex = ( 0 3 5 8 11 13 )
*
* The test computes the following operations :
*
* A*x = y using mkl_sparse_d_mv
* where A is a general sparse matrix and x and y are vectors
*
********************************************************************************
*/
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include "mkl_spblas.h"
int main() {
//*******************************************************************************
// Declaration and initialization of parameters for sparse representation of
// the matrix A in the compressed sparse row format:
//*******************************************************************************
#define M 5
#define N 5
#define NNZ 13
//*******************************************************************************
// Sparse representation of the matrix A
//*******************************************************************************
double csrVal[NNZ] = { 1.0, -1.0, -3.0,
-2.0, 5.0,
4.0, 6.0, 4.0,
-4.0, 2.0, 7.0,
8.0, -5.0 };
MKL_INT csrColInd[NNZ] = { 0, 1, 3,
0, 1,
2, 3, 4,
0, 2, 3,
1, 4 };
MKL_INT csrRowPtr[M+1] = { 0, 3, 5, 8, 11, 13 };
// Descriptor of main sparse matrix properties
struct matrix_descr descrA;
// // Structure with sparse matrix stored in CSR format
sparse_matrix_t csrA;
//*******************************************************************************
// Declaration of local variables:
//*******************************************************************************
double x[N] = { 1.0, 5.0, 1.0, 4.0, 1.0};
double y[N] = { 0.0, 0.0, 0.0, 0.0, 0.0};
double alpha = 1.0, beta = 0.0;
MKL_INT i;
printf( "\n EXAMPLE PROGRAM FOR mkl_sparse_d_mv \n" );
printf( "---------------------------------------------------\n" );
printf( "\n" );
printf( " INPUT DATA FOR mkl_sparse_d_mv \n" );
printf( " WITH GENERAL SPARSE MATRIX \n" );
printf( " ALPHA = %4.1f BETA = %4.1f \n", alpha, beta );
printf( " SPARSE_OPERATION_NON_TRANSPOSE \n" );
printf( " Input vector \n" );
for ( i = 0; i < N; i++ )
{
printf( "%7.1f\n", x[i] );
};
// Create handle with matrix stored in CSR format
mkl_sparse_d_create_csr ( &csrA, SPARSE_INDEX_BASE_ZERO,
N, // number of rows
M, // number of cols
csrRowPtr,
csrRowPtr+1,
csrColInd,
csrVal );
// Create matrix descriptor
descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
// Analyze sparse matrix; choose proper kernels and workload balancing strategy
mkl_sparse_optimize ( csrA );
// Compute y = alpha * A * x + beta * y
mkl_sparse_d_mv ( SPARSE_OPERATION_NON_TRANSPOSE,
alpha,
csrA,
descrA,
x,
beta,
y );
// Release matrix handle and deallocate matrix
mkl_sparse_destroy ( csrA );
printf( " \n" );
printf( " OUTPUT DATA FOR mkl_sparse_d_mv \n" );
// y should be equal { -16.0, 23.0, 32.0, 26.0, 35.0 }
for ( i = 0; i < N; i++ )
{
printf( "%7.1f\n", y[i] );
};
printf( "---------------------------------------------------\n" );
return 0;
}
I'm trying to understand some basics of DFT, some math equations, and try to implement it with C.
Well, this is the function i used from a book (Algorithms for Image Processing And Computer Vision)
/* Direct (non-fast) discrete Fourier transform of the real samples x[0..n-1].
   For every output index m it accumulates the sum over k of
   x[k] * exp(-2*pi*i*k*m/n) into y[m], using the COMPLEX helper routines. */
void slowft (float *x, COMPLEX *y, int n)
{
COMPLEX tmp, z1, z2, z3, z4;
int m, k;
/* Constant factor -2 pi */
/* atan(1.0) is pi/4, so the imaginary part below is -2*pi/n. */
cmplx (0.0, (float)(atan (1.0)/n * -8.0), &tmp);
printf (" constant factor -2 pi %f ", (float)(atan (1.0)/n * -8.0));
/* NOTE(review): the loop runs m = 0..n inclusive, so y[n] is written as well;
   y must therefore have room for n+1 elements - confirm against the caller. */
for (m = 0; m<=n; m++)
{
/* NOTE(review): NEXT() is not defined in the code shown. */
NEXT();
/* The k = 0 term has coefficient exp(0) = 1, so the sum starts as x[0]. */
cmplx (x[0], 0.0, &(y[m]));
for (k=1; k<=n-1; k++)
{
/* Exp (tmp*k*m) */
cmplx ((float)k, 0.0, &z2);
cmult (tmp, z2, &z3);
cmplx ((float)m, 0.0, &z2);
cmult (z2, z3, &z4);
cexp (z4, &z2);
/* *x[k] */
cmplx (x[k], 0.0, &z3);
cmult (z2, z3, &z4);
/* + y[m] */
csum (y[m], z4, &z2);
y[m].real = z2.real; y[m].imag = z2.imag;
}
}
}
So actually, I'm stuck on the Constant Factor part. I didn't understand:
1-) what it came from(especially arctan(1)) and
2-) what its purpose of it.
This is the equation of DFT:
And these are other functions that i used:
/* Complex exponential: stores exp(z1) in *res, computed as the real factor
   exp(z1.real) multiplied by the unit complex number
   (cos(z1.imag), sin(z1.imag)). */
void cexp (COMPLEX z1, COMPLEX *res)
{
	COMPLEX magnitude, rotation;
	const double angle = (double)z1.imag;

	rotation.imag = (float)sin(angle);
	rotation.real = (float)cos(angle);
	magnitude.imag = 0.0;
	magnitude.real = exp((double)z1.real);
	cmult (magnitude, rotation, res);
}
/* Complex product: stores z1 * z2 in *res. Both operands are passed by value,
   so res may point at the object a caller passed as z1 or z2. */
void cmult (COMPLEX z1, COMPLEX z2, COMPLEX *res)
{
	(*res).imag = z1.real*z2.imag + z1.imag*z2.real;
	(*res).real = z1.real*z2.real - z1.imag*z2.imag;
}
/* Complex sum: stores z1 + z2 in *res, component by component. */
void csum (COMPLEX z1, COMPLEX z2, COMPLEX *res)
{
	(*res).imag = z1.imag + z2.imag;
	(*res).real = z1.real + z2.real;
}
/* Builds a complex number in *z from its real part rp and imaginary part ip. */
void cmplx (float rp, float ip, COMPLEX *z)
{
	(*z).imag = ip;
	(*z).real = rp;
}
/* Returns real^2 + imag^2 of z, i.e. the squared magnitude (no square root
   is taken). */
float cnorm (COMPLEX z)
{
	return z.imag*z.imag + z.real*z.real;
}
1-) what it came from(especially arctan(1)) and
The code comment immediately above clues you in:
/* Constant factor -2 pi */
... although actually what is being computed is -2 pi / n (in the broader context of producing a complex number with that as the coefficient of its imaginary component). Observe that the tangent has value 1 for angles whose sine and cosine are equal. The angle that has that property and is in the range [0, pi) is pi / 4, so atan(1.0) * -8.0 is (a good approximation to) -2 pi.
2-) what its purpose of it.
It (or actually its additive inverse) appears in the DFT equation you presented, so it is natural that it appears in a function intended to implement that formula.
Here is the code with comments explaining it.
/* Annotated copy of the direct discrete Fourier transform: for every output
   index m it accumulates the sum over k of x[k] * exp(-2*pi*i*k*m/n) in y[m]. */
void slowft (float *x, COMPLEX *y, int n)
{
    COMPLEX tmp, z1, z2, z3, z4;
    int m, k;
    /* Constant factor -2 pi */
    cmplx (0.0, (float)(atan (1.0)/n * -8.0), &tmp);
    /* atan(1) is π/4, so this sets tmp to -2πi/n. Note that the i
       factor, the imaginary unit, comes from putting the expression in
       the second argument, which gives the imaginary portion of the
       complex number being assigned. (It is written as "j" in the
       equation displayed in the question. That is because engineers use
       "j" for i, having historically already used "i" for other purposes.)
    */
    printf (" constant factor -2 pi %f ", (float)(atan (1.0)/n * -8.0));
    /* NOTE(review): m runs from 0 to n inclusive, so y[n] is written too and
       y needs room for n+1 elements - confirm against the caller. */
    for (m = 0; m<=n; m++)
    {
        NEXT();
        // Well, that is a frightening thing to see in code. It is cryptic.
        cmplx (x[0], 0.0, &(y[m]));
        /* This starts to calculate a sum that will be accumulated in y[m].
           The sum will be over k from 0 to n-1. For the first term, k is 0,
           so -2πiwk/n will be 0. The coefficient is e to the power of that,
           and e**0 is 1, so the first term is x[0] * 1, so we just put x[0]
           directly in y[m] with no multiplication.
        */
        for (k=1; k<=n-1; k++)
        // This adds the rest of the terms.
        {
            /* Exp (tmp*k*m) */
            cmplx ((float)k, 0.0, &z2);
            // This sets z2 to k.
            cmult (tmp, z2, &z3);
            /* This multiplies the -2πi/n from above by k, putting -2πik/n
               in z3.
            */
            cmplx ((float)m, 0.0, &z2);
            // This sets z2 to m. m corresponds to the ω in the equation.
            cmult (z2, z3, &z4);
            // This multiplies m by -2πik/n, putting -2πiwk/n in z4.
            cexp (z4, &z2);
            /* This raises e to the power of -2πiwk/n, finishing the
               coefficient of the term in the sum.
            */
            /* *x[k] */
            cmplx (x[k], 0.0, &z3);
            // This sets z3 to x[k].
            cmult (z2, z3, &z4);
            // This multiplies x[k] by the coefficient, e**(-2πiwk/n).
            /* + y[m] */
            csum (y[m], z4, &z2);
            /* This adds the term (z4) to the sum being accumulated (y[m])
               and puts the updated sum in z2.
            */
            y[m].real = z2.real; y[m].imag = z2.imag;
            /* This moves the updated sum to y[m]. This is not necessary
               because csum is passed its operands as values, so they are
               copied when calling the function, and it is safe to update its
               output. csum(y[m], z4, &y[m]) above would have worked. But
               this works too.
            */
        }
    }
}   /* closing brace of slowft; it was missing from this listing */
Standard C has support for complex arithmetic, so it would be easier and clearer to include <complex.h> and write code this way:
void slowft(float *x, complex float *y, int n)
{
static const float TwoPi = 0x3.243f6a8885a308d313198a2e03707344ap1f;
float t0 = -TwoPi/n;
for (int m = 0; m <=n; m++)
{
float t1 = t0*m;
y[m] = x[0];
for (int k = 1; k < n; k++)
y[m] += x[k] * cexpf(t1 * k * I);
}
}
I made this implementation of the BSPLINE curve. I Followed the usual definition presented in http://en.wikipedia.org/wiki/B-spline
t is the knot vector.
#include <stdio.h>
/* Cox-de Boor recursion: value at parameter u of the i-th B-spline basis
   function of order k (degree k-1) over the knot vector t.
   Reads t[i] .. t[i+k], so t must hold at least i+k+1 knots.
   A term whose knot span is zero (repeated knots, as in a clamped knot vector)
   contributes 0, following the usual 0/0 := 0 convention, instead of NaN.
   Returns 0 for k < 1, for which no basis function is defined. */
double N(int i, int k, double u, double t[])
{
	double left = 0.0e0, right = 0.0e0;
	double span;

	if (k < 1)
		return 0.0e0;

	/* order 1: indicator function of the half-open knot interval */
	if (k == 1)
		return (u >= t[i] && u < t[i+1]) ? 1.0e0 : 0.0e0;

	span = t[i+k-1] - t[i];
	if (span != 0.0e0)
		left = ((u - t[i])*N(i, k-1, u, t))/span;

	span = t[i+k] - t[i+1];
	if (span != 0.0e0)
		right = ((t[i+k] - u)*N(i+1, k-1, u, t))/span;

	return left + right;
}
/* Evaluates one coordinate of the B-spline curve at parameter u: the sum of
   the n control values x[i], each weighted by its order-k basis function over
   the knot vector t. */
double pu(double u, double x[], int n, int k, double t[])
{
	double sum = 0.0e0;
	int idx = 0;

	while (idx < n) {
		sum += x[idx]*N(idx, k, u, t);
		idx++;
	}
	return sum;
}
/* Samples the curve defined by 6 control points and an order-2 basis and
   prints one "x y" pair per parameter value. */
int main()
{
double t[] = {0.0, 0.5, 1, 2, 3, 4, 4.5, 5}; //knot vector
double x[] = {-30.0, 25.0, 9.0, 20.0, 25.0, 31.0}, y[] = {-5.0, -10.0, 3.0, -10.0, -5.0, 25.0}; //the points
double u;
/* NOTE(review): u sweeps the whole knot range 0..5. With n = 6 and k = 2 the
   basis functions presumably sum to 1 only between t[k-1] and t[n]; outside
   that range the curve is pulled towards the origin - confirm the intended
   domain. The step is accumulated in floating point, so the number of
   samples is not exact. */
for(u = 0.0e0; u < 5.0; u+=0.01e0)
{
printf("%lf %lf\n", pu(u, x, 6, 2, t), pu(u, y, 6, 2, t));
}
return 0;
}
The problem is that when I plot the computed points I see that there is an unexpected behavior at the beginning and end of the curve.
For instance:
I can not understand why this happens, try changing the values of t, but it seems that is not it.
There are two implementations of bspline: uniform and standard. In uniform the first and the last control points are not interpolated and in the standard knot sequence both are interpolated. In uniform, you can have uniform knots usually 1,2,3,... For standard knot sequence if you have order k (degree k-1) you should have k zeros, k ones and fill in the middle with 1/(m-k+2) where m is the number of control points. For example, having 5 control points and order 3, knot sequences are 0, 0, 0, 0.25, 0.5, 0.75, 1, 1, 1.
In addition, using delta functions you can have a much better implementation rather than computing N function. Delta function benefits from the local support of bspline. I suggest you take a look at the course notes that I teach at the University of Calgary:
http://pages.cpsc.ucalgary.ca/~amahdavi/pmwiki-2.2.8/uploads/Site/notes1.pdf
Check page 40 algorithm 3.3.
Hopefully it's helpful.
In general: if you have n control points and you construct a bspline curve of degree k, your knot vector has n+k+1 knots. The domain of the curve (i.e. where the summation of the basis functions are =1) is given in the interval [t_k, ... t_n] (as far as my details are right).
In the loop, your parameter values range from 0.0 to 5.0. This should be t[k] = t[2] = 1.0 to t[n] = t[6] = 4.0.
The wiki examples do it the other way, as the knot vectors given there have multiple values (i.e. k-times) at the beginning and the end. So the shown spline curve starts/ends at the first/last control point. Your construction does not give this property.
I am having trouble utilizing CBLAS to perform an Outer Product. My code is as follows:
//===SET UP===//
// x1 has dx1 = 4 elements and x2 has dx2 = 3; X holds the dx1 x dx2 result
// and is printed below as 4 rows of 3 values (row-major).
double x1[] = {1,2,3,4};
double x2[] = {1,2,3};
int dx1 = 4;
int dx2 = 3;
double X[dx1 * dx2];
for (int i = 0; i < (dx1*dx2); i++) {X[i] = 0.0;}
//===DO THE OUTER PRODUCT===//
// NOTE(review): with CblasRowMajor a leading dimension is the number of
// columns of the stored matrix. x1 is passed with lda = dx1 although it is
// used as a dx1 x 1 matrix, and X is passed with ldc = dx1 although it has
// dx2 columns - this looks like the cause of the wrong result.
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasTrans, dx1, dx2, 1, 1.0, x1, dx1, x2, 1, 0.0, X, dx1);
//===PRINT THE RESULTS===//
printf("\nMatrix X (%d x %d) = x1 (*) x2 is:\n", dx1, dx2);
// NOTE(review): i and j are not declared in this scope in the code shown;
// the i of the initialisation loop above is local to that loop.
for (i=0; i<4; i++) {
for (j=0; j<3; j++) {
printf ("%lf ", X[j+i*3]);
}
printf ("\n");
}
I get:
Matrix X (4 x 3) = x1 (*) x2 is:
1.000000 2.000000 3.000000
0.000000 -1.000000 -2.000000
-3.000000 0.000000 7.000000
14.000000 21.000000 0.000000
But the correct answer is found here:
https://www.sharcnet.ca/help/index.php/BLAS_and_CBLAS_Usage_and_Examples
I have seen: Efficient computation of kronecker products in C
But, it doesn't help me because they don't actually say how to utilize dgemm to actually do this...
Any help? What am I doing wrong here?
You can do it with dgemm, but it would be more stylistically correct to use dger, which is a dedicated outer-product implementation. As such it's somewhat easier to use correctly:
/* Rank-1 update X := 1.0 * x1 * x2' + X. The existing contents of X are added
   to, so X has to be zeroed beforehand to obtain the plain outer product. */
cblas_dger(CblasRowMajor, /* you’re using row-major storage */
dx1, /* the matrix X has dx1 rows ... */
dx2, /* ... and dx2 columns. */
1.0, /* scale factor to apply to x1x2' */
x1,
1, /* stride between elements of x1. */
x2,
1, /* stride between elements of x2. */
X,
dx2); /* leading dimension of matrix X. */
dgemm does have the nice feature that passing \beta = 0 initializes the result matrix for you, which saves you from needing to explicitly zero it out yourself before the call. @Artem Shinkarov’s answer provides a nice description of how to use dgemm.
The interfaces are not very convenient in BLAS, however, let's try to figure it out. First of all, let's say that all our matrices are in RowMajor. Now we have the following set-up
row col
x1: dx1 1 (A)
x2: 1 dx2 (B)
X: dx1 dx2 (C)
Now, we just need to fill the call according to the documentation, which is specified in terms of
C = \alpha A*B + \beta C
So we get:
/* C = 1.0 * A * B + 0.0 * C with A = x1 (dx1 x 1), B = x2 (1 x dx2) and
   C = X (dx1 x dx2), all row-major, so each leading dimension is that
   matrix's number of columns. */
cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans,
(int)dx1, /* rows in A */
(int)dx2, /* columns in B */
(int)1, /* columns in A */
1.0, x1, /* \alpha, A itself */
(int)1, /* Columns in A (leading dimension of A) */
x2, /* B itself */
(int)dx2, /* Columns in B (leading dimension of B) */
0.0, X, /* \beta, C itself */
(int)dx2 /* Columns in C (leading dimension of C) */);
So that should do the job I would hope.
Here is a description of the parameters of dgemm: Link
I have been using CHOLMOD to factorise the matrix A and solve the system Ax = b, for A being the Hessian matrix (printed below) and b = [1, 1, 1] created by the cholmod_ones function.
Unfortunately, the solution for x is incorrect (should be [1.5, 2.0, 1.5]) and to confirm I then multiplied A and x back together and don't get [1, 1, 1]. I don't quite understand what I am doing wrong.
Additionally, I've looked at the factor and the values of the matrix elements don't make sense either.
Output
Hessian:
2.000 -1.000 0.000
-1.000 2.000 -1.000
0.000 -1.000 2.000
Solution:
2.500 0.000 0.000
3.500 0.000 0.000
2.500 0.000 0.000
B vector:
1.500 0.000 0.000
2.000 0.000 0.000
1.500 0.000 0.000
Code
iterate_hessian() is an external function that returns doubles which are read into the CHOLMOD hessian matrix.
The entry point for the code is cholesky_determinant which is called with an argument which gives the dimension of the (square) matrix.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cholmod.h>
// Function prototype that gives the next value of the Hessian
double iterate_hessian();
// Converts the dense, row-major `hessian` (dimension x dimension doubles) into
// a CHOLMOD sparse matrix by collecting its non-zero entries as triplets.
// Returns the new sparse matrix, which the caller releases with
// cholmod_free_sparse, or NULL if CHOLMOD could not allocate it.
cholmod_sparse *cholmod_hessian(double *hessian, size_t dimension, cholmod_common *common) {
    cholmod_triplet *triplet_hessian;
    cholmod_sparse *sparse_hessian;
    size_t loop;

    // Allocate a triplet matrix with room for every entry of the Hessian.
    // NOTE(review): stype is passed as 0, i.e. the matrix is declared
    // unsymmetric even though a Hessian is symmetric; this affects how the
    // later factorization interprets it - confirm the intended stype.
    triplet_hessian = cholmod_allocate_triplet(dimension, dimension, dimension * dimension, 0, CHOLMOD_REAL, common);
    if (triplet_hessian == NULL) {
        return NULL;
    }

    // Record row index, column index and value of every non-zero entry.
    for (loop = 0; loop < (dimension * dimension); loop++) {
        if (hessian[loop] == 0) {
            continue;
        }
        ((int*)triplet_hessian->i)[triplet_hessian->nnz] = (int)(loop / dimension);
        ((int*)triplet_hessian->j)[triplet_hessian->nnz] = (int)(loop % dimension);
        ((double*)triplet_hessian->x)[triplet_hessian->nnz] = hessian[loop];
        triplet_hessian->nnz++;
    }

    // Convert the triplet to a sparse matrix; the triplet form is only an
    // intermediate and is released here instead of being leaked.
    sparse_hessian = cholmod_triplet_to_sparse(triplet_hessian, (dimension * dimension), common);
    cholmod_free_triplet(&triplet_hessian, common);
    return sparse_hessian;
}
// Prints a dense CHOLMOD matrix row by row with three decimals per value.
// At most `dimension` rows and columns are printed, and never more than the
// matrix really has, so a column vector is printed as a single column instead
// of reading past the end of its storage.
void print_matrix(cholmod_dense *matrix, size_t dimension) {
    // matrix->x is a void pointer to column-major doubles; no copy is needed.
    const double *values = matrix->x;
    size_t rows = matrix->nrow < dimension ? matrix->nrow : dimension;
    size_t cols = matrix->ncol < dimension ? matrix->ncol : dimension;
    size_t i, j;

    // Row
    for (i = 0; i < rows; i++) {
        // Column; matrix->d is the distance between consecutive columns
        for (j = 0; j < cols; j++) {
            printf("% 8.3f ", values[i + j * matrix->d]);
        }
        printf("\n");
    }
}
// Factorizes `sparse_hessian` and solves A x = b for b = vector of ones.
// Returns the solution x, which the caller releases with cholmod_free_dense,
// or NULL if the analysis, the factorization or an allocation failed.
cholmod_dense *factorized(cholmod_sparse *sparse_hessian, cholmod_common *common) {
    cholmod_factor *factor;
    cholmod_dense *b;
    cholmod_dense *x = NULL;

    factor = cholmod_analyze(sparse_hessian, common);
    if (factor == NULL) {
        return NULL;
    }

    if (cholmod_factorize(sparse_hessian, factor, common)) {
        b = cholmod_ones(sparse_hessian->nrow, 1, sparse_hessian->xtype, common);
        if (b != NULL) {
            // CHOLMOD_A solves the full system A x = b, applying the
            // fill-reducing permutation; CHOLMOD_LDLt would only solve
            // L D L' x = b for the permuted factor.
            x = cholmod_solve(CHOLMOD_A, factor, b, common);
            // b is only needed for the solve
            cholmod_free_dense(&b, common);
        }
    }

    cholmod_free_factor(&factor, common);
    // Return the solution, x
    return x;
}
// Entry point: reads a (*dimension x *dimension) Hessian through
// iterate_hessian(), prints it, solves A x = 1, prints x and the product A x.
// The determinant itself is not computed yet; the function returns 0.0.
// Every CHOLMOD object and the temporary Hessian copy are released before
// returning, also when a step fails part-way through.
double cholesky_determinant(int *dimension) {
    // Declare variables
    double determinant = 0.0;
    cholmod_sparse *A = NULL;
    cholmod_dense *A_dense = NULL, *B = NULL, *Y = NULL;
    cholmod_common common;
    double *hessian = NULL;
    double alpha[] = {1, 0};
    double beta[] = {0, 0};
    size_t n, count, i;

    if (*dimension < 0) {
        return determinant;
    }
    n = (size_t) *dimension;
    count = n * n;

    // Start CHOLMOD
    cholmod_start (&common);

    // Allocate storage for the hessian (we want to copy it); the element size
    // is that of a double, not of the pointer.
    hessian = malloc(count * sizeof *hessian);
    if (hessian == NULL) {
        goto cleanup;
    }
    // Get the hessian from OPTIM
    for (i = 0; i < count; i++) {
        hessian[i] = iterate_hessian();
    }

    A = cholmod_hessian(hessian, n, &common);
    if (A == NULL) {
        goto cleanup;
    }
    // Keep the dense copy in a variable so that it can be freed afterwards.
    A_dense = cholmod_sparse_to_dense(A, &common);
    if (A_dense == NULL) {
        goto cleanup;
    }
    printf("Hessian:\n");
    print_matrix(A_dense, n);

    B = factorized(A, &common);
    if (B == NULL) {
        goto cleanup;
    }
    printf("Solution:\n");
    print_matrix(B, n);

    // Y = 1 * A * B + 0 * Y, which should reproduce the right-hand side
    Y = cholmod_allocate_dense(n, 1, n, CHOLMOD_REAL, &common);
    if (Y == NULL) {
        goto cleanup;
    }
    cholmod_sdmult(A, 0, alpha, beta, B, Y, &common);
    printf("B vector:\n");
    print_matrix(Y, n);

cleanup:
    // Free up memory and finish CHOLMOD
    free(hessian);
    cholmod_free_dense (&Y, &common);
    cholmod_free_dense (&B, &common);
    cholmod_free_dense (&A_dense, &common);
    cholmod_free_sparse (&A, &common);
    cholmod_finish (&common);
    return determinant;
}
It turns out that I hadn't set the stype for my sparse matrix properly. The stype determines the symmetry (and thus the subsequent behaviour of calls to cholmod_factorize). It was in fact factorising and solving for AA'.