Store result of sparse matrix decomposition in pre-specified memory location, in Eigen - sparse-matrix

I am trying to write a function to perform a sparse Cholesky decomposition using the Eigen library, where I pass in both the pointers to the input matrix data and the pointers to where I want to store the output matrix.
The program is currently
#include <iostream>
#include <Eigen/Dense>
#include <Eigen/SparseCore>
#include <Eigen/SparseCholesky>
using namespace std;
using namespace Eigen;
struct CSC {
int *indptr;
int *indices;
double *data;
int nnz;
};
int cholesky_sparse_d_c(struct CSC *A, struct CSC *L,
int rows, int cols, int nnz) {
// Find sparse Cholesky factorisation of matrix A and store in triangular
// matrix L i.e A = L L.T.
// First we must build the sparse matrix A.
Map<SparseMatrix <double> > A_sp(rows, cols, nnz,
A->indptr, A->indices, A->data);
cout << "A: " << endl << A_sp << endl;
// Now compute the sparse Cholesky decomposition.
SimplicialLLT<SparseMatrix<double> > SLLT;
SLLT.compute(A_sp);
if (SLLT.info() != Success) {
cout << "Decomposition failed";
return -1;
}
cout << "Sparse lower factor of A:" << endl << SLLT.matrixL()
<< endl;
// Put the values back into L. Note I am not sure if we need to create a
// `temp` variable here, as the call `.matrixL()` may be free.
SparseMatrix<double > temp(SLLT.matrixL());
L->indptr = (int *) temp.outerIndexPtr();
L->indices = (int *) temp.innerIndexPtr();
L->data = (double *) temp.valuePtr();
L->nnz = (int) temp.nonZeros();
Map<SparseMatrix <double> > L_sp(rows, cols, L->nnz,
L->indptr, L->indices, L->data);
cout << "L: " << endl << L_sp << endl;
return 0;
}
int main() {
struct CSC A;
int A_indptr[] = {0, 1, 2};
int A_indices[] = {0, 1};
double A_data[] = {1.1, 2.2};
A.indptr = A_indptr;
A.indices = A_indices;
A.data = A_data;
struct CSC L;
cholesky_sparse_d_c(&A, &L, 2, 2, 2);
cout << L.indptr[0] << L.indptr[1] << L.indptr[2] << endl;
cout << L.indices[0] << L.indices[1] << L.indices[2] << endl;
cout << L.data[0] << L.data[1] << L.data[2] << endl;
}
As mentioned in the code, I am not sure if the temp variable is necessary as
L_indptr = SLLT.matrixL().outerIndexPtr();
L_indices = SLLT.matrixL().innerIndexPtr();
L_data = SLLT.matrixL().valuePtr();
may be fine (I am not sure if matrixL() is a free operation).
Regardless, when this function exits the memory that the L pointers were pointing to will now be free'd. I could copy the memory but this is unnecessary and inefficient. What I would ideally like to do is tell SLLT to not create new pointers for
.outerIndexPtr()
.innerIndexPtr()
.valuePtr()
but to use the pointers in the L structure provided.
Is there a way to do this?

If you insist on saving a copy (it should be very cheep compared to the decomposition), then the simplest and safest would be to keep SLLT alive as long as your L, for instance by creating a small structure storing both objects and being responsible for destroying both of them.
Otherwise, you could imagine moving SLLT.matrixL() into L, but then you'll have to free the allocated memories, but you cannot as you don't know how it was allocated. To allocate yourself L and pass it SLLT, you need a way to exactly know the number of non-zeros in L. Actually, this information is computed by the analyzePattern step, but this method also pre-allocate SLLT.matrixL(), so that's too late.

Related

LNK1104 cannot open file 'libfftw3-3.lib'

I am quite fresh in coding C code, trying to use FFTW from the well-known website http://www.fftw.org/ in my Visual Studio 2019.
I followed the tutorial (https://www.youtube.com/watch?v=geYbCA137PU), but an error appeared: LNK1104 cannot open file 'libfftw3-3.lib'
How should I solve the problem? I have googled it, but looks like most of the solution not quite suitable to mine. Almost the last step! Please!
#include <iostream>
#include <fftw3.h>
using namespace std;
//macros for real and imaginary parts
#define REAL 0
#define IMAG 1
//length of complex array
#define N 8
/*Computes the 1-D fast Fourier transform*/
void fft(fftw_complex* in, fftw_complex* out)
{
// creat a DFT plan
fftw_plan plan = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
// execute the plan
fftw_execute(plan);
// do some cleaning
fftw_destroy_plan(plan);
fftw_cleanup();
}
/*Computes the 1-D inverse fast Fourier transform*/
void ifft(fftw_complex* in, fftw_complex* out)
{
// creat a IDFT plan
fftw_plan plan = fftw_plan_dft_1d(N, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);
// execute the plan
fftw_execute(plan);
// do some cleaning
fftw_destroy_plan(plan);
fftw_cleanup();
// scale the output to obtain the exact inverse
for (int i = 0; i < N; ++i) {
out[i][REAL] /= N;
out[i][IMAG] /= N;
}
}
/*Display complex numbers in the form a +/- bi. */
void displayComplex(fftw_complex* y)
{
for (int i = 0; i < N; ++i)
if (y[i][IMAG] < 0)
cout << y[i][REAL] << " - " << abs(y[i][IMAG]) << "i" << endl;
else
cout << y[i][REAL] << " + " << y[i][IMAG] << "i" << endl;
}
/*Display real part of complex number*/
void displayReal(fftw_complex* y)
{
for (int i = 0; i < N; ++i)
cout << y[i][REAL] << endl;
}
/* Test */
int main()
{
// input array
fftw_complex x[N];
// output array
fftw_complex y[N];
// fill the first of some numbers
for (int i = 0; i < N; ++i) {
x[i][REAL] = i + 1; // i.e.{1 2 3 4 5 6 7 8}
x[i][IMAG] = 0;
}
// compute the FFT of x and store the result in y.
fft(x, y);
// display the result
cout << "FFT =" << endl;
displayComplex(y);
// compute the IFFT of x and store the result in y.
ifft(y, x);
// display the result
cout << "\nIFFT =" << endl;
displayReal(x);
}
#HAL9000 Thanks for your remind, I found out that I have converted the wrong name of .def so I generated a "libfftw3-3l.lib". That's why it couldn't open the file, it has been solved now!

The program does not crash neither executes completely

it has been a few hours since I am dealing with this issue. I was wondering if someone could point out what am I doing wrong, and if possible - how to fix it. Essentially, I am simply trying to generate n number of object pairs and store them into a vector<pair<Foo, Foo>>. The algorithm involves random number generator. I use STL <random> and its components like m19937, uniform_real_distribution and uniform_int_distribution. Below is the simplified version of what I am trying to do representing the case I got at hand. The second loop always cuts short. However, I fail to see the reason why. Essentially, I never get to see the program execute completely. The last two messages never show.
program
#include <iostream>
#include <vector>
#include <random>
#include <utility>
// utility
using std::pair;
// random
using std::mt19937;
using std::uniform_int_distribution;
using std::uniform_real_distribution;
// iostream
using std::cout;
using std::endl;
// vector
using std::vector;
class Event{
private:
double x, y;
public:
Event(const double X, const double Y);
};
Event::Event(const double X, const double Y): x(X), y(Y){}
int main(){
cout << "Initializing storage..." << endl;
vector<Event> population;
vector<pair<Event,Event>> selection;
cout << "Initializing necessary member variables..." << endl;
const unsigned int SEED = 14112017;
const unsigned int MAX_ITERATIONS = 10000;
const double MIN = 1;
const double MAX = 10000;
mt19937 engine(SEED);
cout << "Generating the initial population..." << endl;
uniform_real_distribution<> real_distribution(MIN, MAX);
for(unsigned int i = 0; i < MAX_ITERATIONS; ++i){
double x = real_distribution(engine);
double y = real_distribution(engine);
Event event(x, y);
population.push_back(event);
}
cout << "Success! The initial population has been generated successfully" << endl;
population.shrink_to_fit();
cout << "Starting the selection process..." << endl;
unsigned int random = 0;
uniform_int_distribution<> int_distribution(MIN, MAX);
for(unsigned int i = 0; i < MAX_ITERATIONS; ++i){
random = int_distribution(engine);
Event event_x = population.at(random);
random = int_distribution(engine);
Event event_y = population.at(random);
pair<Event, Event> bound(event_x, event_y);
selection.push_back(bound);
}
cout << "Success! The selection process has been completed successfully" << endl;
selection.shrink_to_fit();
cout << "population size: " << population.size() << endl;
cout << "selection size: " << selection.size() << endl;
return 0;
}
I compile the above using cygwins C++ compiler, and I execute the code in command-prompt. The OS is Windows 10 x64. The box has 32 GB memory.
uniform_int_distributions constructor is as follows:
explicit uniform_int_distribution( IntType a = 0,
IntType b = std::numeric_limits<IntType>::max() );
By default, it returns an integer which covers all positive values of that type. The range includes the value of the second parameter. If it wouldn't, it would be cumbersome to specify we want all positive integers.
cppreference.com does not document it, but the C++ standard does: Thanks #Cubbi
This is documented on cppreference.com, or in the C++ standard:
26.5.8.2.1 Class template uniform_int_distribution [rand.dist.uni.int]
1 A uniform_int_distribution random number
distribution produces random integers i, a ≤ i ≤ b, distributed
according to the constant discrete probability function
[...]
// constructors and reset functions
explicit uniform_int_distribution(IntType a = 0, IntType b = numeric_limits<IntType>::max());
Here:
uniform_int_distribution<> int_distribution(MIN, MAX);
for(unsigned int i = 0; i < MAX_ITERATIONS; ++i){
random = int_distribution(engine);
Event event_x = population.at(random);
random = int_distribution(engine);
Event event_y = population.at(random);
random can take the value MAX, which is out of the bounds of the population vector.

A more faster (optimized) solution to image decimation (C++)

I am looking for a more faster way of dealing with the following C code. I have an image of 640x480 and I want to decimate it by a factor of 2 by removing every other rows and columns in the image. I have attached the code in the following. Is there any better way to optimize the code.
#define INPUT_NUM_ROW 480
#define INPUT_NUM_COL 640
#define OUTPUT_NUM_ROW 240
#define OUTPUT_NUM_COL 320
unsigned char inputBuf[INPUT_NUM_ROW* INPUT_NUM_COL];
unsigned char outputBuf[OUTPUT_NUM_ROW* OUTPUT_NUM_COL];
void imageDecimate(unsigned char *outputImage , unsigned char *inputImage)
{
/* Fill in your code here */
for (int p = 0; p< OUTPUT_NUM_ROW; p++) {
for (int q = 0; q < OUTPUT_NUM_COL; q++) {
outputImage[p*OUTPUT_NUM_COL + q] = inputImage[(p*INPUT_NUM_COL+q)*2];
// cout << "The pixel at " << p*OUTPUT_NUM_COL+q << " is " << outputImage[p*OUTPUT_NUM_COL+q] << endl;
}
}
}
Rather than doing the math every time in the inner loop, you could do this:
int outputIndex;
int inputIndex;
for (int p = 0; p< OUTPUT_NUM_ROW; p++) {
inputIndex = p * INPUT_NUM_COL * 2;
outputIndex = p * OUTPUT_NUM_COL;
for (int q = 0; q < OUTPUT_NUM_COL; q++) {
outputImage[outputIndex] = inputImage[inputIndex];
inputIndex += 2;
outputIndex++;
// cout << "The pixel at " << p*OUTPUT_NUM_COL+q << " is " << outputImage[p*OUTPUT_NUM_COL+q] << endl;
}
}
}
You could do the incrementing inline with the copying assignment too, and you could also only assign inputIndex and outputIndex the first time, but it wouldn't get you as much of a performance boost as moving the calculation out of the inner loop. I assume that bulk copying functions don't have this kind of incrementing flexibility, but if they do and they use hardware acceleration that is available on all of your target platforms, then that would be a better choice.
I am also assuming that array access like this compiles down to the most optimized pointer arithmetic that you could use.

Dynamic array and how to check if it has a certain number

I'm having trouble with dynamic array. The code I wrote is suppose to input the # of coins and check if 1 is included. If it is not include in the arrays include 1 to the array. But the array size is "fixed" so i can't change the size of array while keeping the other numbers inputted. How can I do this without messing up with my arrays?
#include <iostream>
using namespace std;
int main()
{
int N,coin;
cout << "Enter the value N to produce: " << endl;
cin >> N;
cout << "Enter number of different coins: " << endl;
cin >> coin;
int *S = new int[coin];
cout << "Enter the denominations to use with a space after it" << endl;
cout << "(1 will be added if necessary): " << endl;
for(int i = 0; i < coin; i++)
{
cin >> S[i];
if(S[i] != 1)
S[coin] = 1; // confused at this part of how to set the last element to 1
cout << S[i] << endl;
}
//system("PAUSE");
return 0;
}
here is pseudo code/comments
bool hasOne;
for(int i = 0; i < coin; i++) {
cin >> S[i];
if(S[i] == 1) hasOne = true;
}
if(!hasOne) {
// create a new array size one more than S
// copy elements from S to the new array
// set the last element to 1 in the new array
// assign the new array to S
}
Do you need to append 1 to the end of the array? If so, I would use std::vector object instead of an array
#include <vector>
vector<int> S;
for(int i = 0; i < coin; i++)
{
cin >> S[i];
if(S[i] != 1)
S.push_back(1);
cout << S[i] << endl;
}
If you must use arrays you need to use malloc() to dynamically allocate memory. But since you are using c++ std::vector is the way to go.

CUSP library called from Fortran not working

I want to repetitively solve the CG/BicGSTAB using CUSP solver, called from Fortran. To avoid transfers I am passing the Fortran data directly to CUSP. The code compiles but breaks at the run time flagging:
terminate called after throwing an instance of 'thrust::system::system_error'
what(): invalid argument
terminate called recursively
Aborted (core dumped)
Let alone the core of the code, even the print stream is not happening. The code of course is in the preliminary stage, but I wonder what is wrong with it.
extern "C" void bicgstab_(int *device_I, int *device_J, float *device_V, float *device_x, float *device_b, int *n, int *nnz){
int N = *n;
int NNZ = *nnz;
std::cout << N << " " << NNZ << " " << *device_I << std::endl;
for(int i=0; i<N;i++)std::cout << device_I[i] << " "; std::cout << std::endl;
for(int i=0; i<NNZ;i++)std::cout << device_J[i] << " "; std::cout << std::endl;
for(int i=0; i<NNZ;i++)std::cout << device_V[i] << " "; std::cout << std::endl;
for(int i=0; i<N;i++)std::cout << device_x[i] << " "; std::cout << std::endl;
for(int i=0; i<N;i++)std::cout << device_b[i] << " "; std::cout << std::endl;
// *NOTE* raw pointers must be wrapped with thrust::device_ptr!
thrust::device_ptr<int> wrapped_device_I(device_I);
thrust::device_ptr<int> wrapped_device_J(device_J);
thrust::device_ptr<float> wrapped_device_V(device_V);
thrust::device_ptr<float> wrapped_device_x(device_x);
thrust::device_ptr<float> wrapped_device_b(device_b);
// use array1d_view to wrap the individual arrays
typedef typename cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView;
typedef typename cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView;
std::cout << wrapped_device_I[3];
/*
DeviceIndexArrayView row_indices (wrapped_device_I, wrapped_device_I + (N+1));
DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + NNZ);
DeviceValueArrayView values (wrapped_device_V, wrapped_device_V + NNZ);
DeviceValueArrayView x (wrapped_device_x, wrapped_device_x + N);
DeviceValueArrayView b (wrapped_device_b, wrapped_device_b + N);
// std::cout << device_x[0] ;
// for(int i=0;i<NNZ;i++)std::cout << column_indices[i] << std::endl;
// combine the three array1d_views into a csr_matrix_view
typedef cusp::csr_matrix_view<DeviceIndexArrayView,
DeviceIndexArrayView,
DeviceValueArrayView> DeviceView;
// construct a csr_matrix_view from the array1d_views
DeviceView A(N, N, NNZ, row_indices, column_indices, values);
// set stopping criteria: // iteration_limit = 100 // relative_tolerance = 1e-5
cusp::verbose_monitor<float> monitor(b, 100, 1e-5);
// solve the linear system A * x = b with the Conjugate Gradient method
// cusp::krylov::bicgstab(A, x, b);*/
}
If this is not feasible, I can move over to another approach,but as I am not sure about the correctness, I am unable to decide. Any help is appreciated.

Resources