How to convert _mm_shuffle_ps SSE intrinsic to NEON intrinsic?

How to convert _mm_shuffle_ps SSE intrinsic to NEON intrinsic? - arm

I am trying to convert codes written in SSE to NEON SIMD and got stuck because of the _mm_shuffle_ps SSE intrinsic. Here is the code:
b = _mm_shuffle_ps(a, b, 136);
a, b, c are all the __m128 registers.
Now I want to use NEON to implement the same function. Assume that there are 3 float32x4_t vectors : x, y, z. I want to assign the 1st and 3rd lane of x to the 1st and 2nd lane of z respectively, and assign the 1st and 3rd lane of y to the 3rd and 4th lane of z respectively.
I can't find an efficient way to implement the function and need some help.

There's no equivalent to _mm_shuffle_ps, but as noted you can use vtbl.
For DirectXMath, I use VTBL for the generic XMVectorSwizzle and XMVectorPermute. I then specialize the template for ARM/ARM64 with some patterns that can be done efficiently in ARM-NEON.
XMVectorSwizzle
// Generic swizzle with run-time lane selectors.
// Returns { V[E0], V[E1], V[E2], V[E3] }; every selector must be in 0..3.
// Implemented with the NEON byte table lookup (VTBL): the 16 bytes of V form
// the table and each selector expands to the four byte indices of one lane.
inline XMVECTOR XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3)
{
    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
    // Byte indices (packed little-endian) that pick one 32-bit lane out of the table.
    static const uint32_t ControlElement[ 4 ] =
    {
        0x03020100, // XM_SWIZZLE_X
        0x07060504, // XM_SWIZZLE_Y
        0x0B0A0908, // XM_SWIZZLE_Z
        0x0F0E0D0C, // XM_SWIZZLE_W
    };
    // VTBL works on bytes. The reinterprets are explicit because the NEON
    // vector types are distinct types on GCC/Clang; implicit conversion between
    // them does not compile there.
    uint8x8x2_t tbl;
    tbl.val[0] = vreinterpret_u8_f32( vget_low_f32(V) );
    tbl.val[1] = vreinterpret_u8_f32( vget_high_f32(V) );
    // Low half of the result: lanes E0 and E1.
    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) );
    const uint8x8_t rL = vtbl2_u8( tbl, vreinterpret_u8_u32(idx) );
    // High half of the result: lanes E2 and E3.
    idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) );
    const uint8x8_t rH = vtbl2_u8( tbl, vreinterpret_u8_u32(idx) );
    return vcombine_f32( vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH) );
}
// Compile-time swizzle. The primary template forwards its four template
// arguments to the run-time overload; the explicit specializations below
// replace that call with shorter NEON sequences for specific patterns.
template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW>
inline XMVECTOR XMVectorSwizzle(FXMVECTOR V)
{
return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW );
}
// Identity.
template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; }
// Broadcast of a single lane (duplicate one lane of the low or high half).
template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); }
template<> inline XMVECTOR XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); }
// Swap the two lanes inside each 64-bit half.
template<> inline XMVECTOR XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); }
// Same 64-bit half repeated twice, optionally with its two lanes swapped.
template<> inline XMVECTOR XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { float32x2_t vt = vget_low_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { float32x2_t vt = vget_high_f32(V); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_low_f32(V) ); return vcombine_f32( vt, vt ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { float32x2_t vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); }
// Recombination of the low and high halves, each optionally lane-swapped.
template<> inline XMVECTOR XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); }
template<> inline XMVECTOR XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); }
// Transpose / zip / unzip of V with itself; .val[0] or .val[1] picks the result.
template<> inline XMVECTOR XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; }
template<> inline XMVECTOR XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; }
template<> inline XMVECTOR XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; }
// Lane rotations: extract four lanes from V:V starting at lane 1, 2 or 3.
template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); }
template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); }
template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); }
XMVectorPermute
// Generic two-source permute with run-time lane selectors.
// Selectors 0..3 pick lanes X..W of V1, selectors 4..7 pick lanes X..W of V2.
// Implemented with the NEON byte table lookup (VTBL) over the 32 bytes V1:V2.
inline XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW)
{
    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
    // Byte indices (packed little-endian) that pick one 32-bit lane out of the table.
    static const uint32_t ControlElement[ 8 ] =
    {
        0x03020100, // XM_PERMUTE_0X
        0x07060504, // XM_PERMUTE_0Y
        0x0B0A0908, // XM_PERMUTE_0Z
        0x0F0E0D0C, // XM_PERMUTE_0W
        0x13121110, // XM_PERMUTE_1X
        0x17161514, // XM_PERMUTE_1Y
        0x1B1A1918, // XM_PERMUTE_1Z
        0x1F1E1D1C, // XM_PERMUTE_1W
    };
    // VTBL works on bytes. The reinterprets are explicit because the NEON
    // vector types are distinct types on GCC/Clang; implicit conversion between
    // them does not compile there.
    uint8x8x4_t tbl;
    tbl.val[0] = vreinterpret_u8_f32( vget_low_f32(V1) );
    tbl.val[1] = vreinterpret_u8_f32( vget_high_f32(V1) );
    tbl.val[2] = vreinterpret_u8_f32( vget_low_f32(V2) );
    tbl.val[3] = vreinterpret_u8_f32( vget_high_f32(V2) );
    // Low half of the result: selectors PermuteX and PermuteY.
    uint32x2_t idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) );
    const uint8x8_t rL = vtbl4_u8( tbl, vreinterpret_u8_u32(idx) );
    // High half of the result: selectors PermuteZ and PermuteW.
    idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) );
    const uint8x8_t rH = vtbl4_u8( tbl, vreinterpret_u8_u32(idx) );
    return vcombine_f32( vreinterpret_f32_u8(rL), vreinterpret_f32_u8(rH) );
}
// Compile-time permute. The primary template forwards its four template
// arguments to the run-time overload; the explicit specializations below
// replace that call with shorter NEON sequences for specific patterns.
// Indices 0..3 refer to lanes of V1, indices 4..7 to lanes of V2.
template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW>
inline XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2)
{
return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW );
}
// Pass-through of one input; the bare (V2)/(V1) expression only references the unused parameter.
template<> inline XMVECTOR XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; }
template<> inline XMVECTOR XMVectorPermute<4,5,6,7>(FXMVECTOR V1, FXMVECTOR V2) { (V1); return V2; }
// Low half of V1 followed by low half of V2, each optionally lane-swapped.
template<> inline XMVECTOR XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); }
template<> inline XMVECTOR XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); }
// High half of V1 followed by high half of V2, each optionally lane-swapped.
template<> inline XMVECTOR XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); }
template<> inline XMVECTOR XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); }
// Low half of V1 followed by high half of V2, each optionally lane-swapped.
template<> inline XMVECTOR XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); }
template<> inline XMVECTOR XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); }
// High half of V1 followed by low half of V2, each optionally lane-swapped.
template<> inline XMVECTOR XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); }
template<> inline XMVECTOR XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); }
template<> inline XMVECTOR XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); }
// Transpose / zip / unzip of V1 with V2; .val[0] or .val[1] picks the result.
template<> inline XMVECTOR XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; }
template<> inline XMVECTOR XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; }
template<> inline XMVECTOR XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; }
template<> inline XMVECTOR XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; }
// (0,2,4,6) is the pattern of _mm_shuffle_ps(a, b, 136) from the question.
template<> inline XMVECTOR XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[0]; }
template<> inline XMVECTOR XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; }
// Sliding window over V1:V2 starting at lane 1, 2 or 3.
template<> inline XMVECTOR XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); }
template<> inline XMVECTOR XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); }
template<> inline XMVECTOR XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); }

Related

Porting to newer OpenCV 'C' interface; cv.h file not found

I have a file, included below, that when I compile, I get the error that cv.h file not found. This, I believe, is because I use opencv4 which doesn't support cv.h. I've seen similar posts recommending one to simply downgrade opencv versions, but I don't want to do that.
My question is this:
How do I find what part of the code is dependent on cv.h, so that I can simply try to update it for opencv4 compatibility?
file:
#ifndef _GUARD_cvl1qc_h_
#define _GUARD_cvl1qc_h_
#include <cv.h>
#include "cvcgsolve.h"
/* Explicit-matrix form: given A and B, writes into X a solution of
 *     minimize ||X||_{L1}  subject to  ||AX - B|| < \epsilon.
 * mu multiplies the barrier weight after every outer iteration; lb_term_crit
 * controls the outer (log-barrier) loop and cg_term_crit is handed to cvCGSolve.
 * Returns -1 when the conjugate-gradient solve for the starting point fails, 0 otherwise. */
int cvL1QCSolve( CvMat* A, CvMat* B, CvMat* X, double epsilon, double mu = 10., CvTermCriteria lb_term_crit = cvTermCriteria( CV_TERMCRIT_EPS, 0, 1e-3 ), CvTermCriteria cg_term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 200, 1e-16 ) );
/* Operator form for large-scale problems: A is never formed. AOps computes AX and
 * AtOps computes A'X; both receive userdata as their last argument.
 * The remaining parameters and the return value are as in the matrix form above. */
int cvL1QCSolve( CvMatOps AOps, CvMatOps AtOps, void* userdata, CvMat* B, CvMat* X, double epsilon, double mu = 10., CvTermCriteria lb_term_crit = cvTermCriteria( CV_TERMCRIT_EPS, 0, 1e-3 ), CvTermCriteria cg_term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 200, 1e-16 ) );
/*
 * Newton iterations for one log-barrier subproblem of
 *     minimize sum(U)  subject to  X - U <= 0,  -X - U <= 0,  ||AX - B||^2 <= epsilon^2
 * with A given as an explicit matrix. X and U are updated in place.
 *
 * Elements are read through data.db and the element loops run X->rows times.
 * NOTE(review): this presumably requires X and U to be CV_64F column vectors - confirm with callers.
 *
 * tau           barrier weight of this subproblem
 * nt_term_crit  max_iter bounds the Newton iterations, epsilon is the threshold
 *               on half the Newton decrement
 * cg_term_crit  passed unchanged to cvCGSolve
 *
 * Returns t when iteration t is abandoned (cvCGSolve result above .5, or more
 * than 32 backtracking steps), t + 1 when the decrement test passes, and
 * nt_term_crit.max_iter when the loop runs to completion.
 */
static int icvL1QCNewton( CvMat* A, CvMat* B, CvMat* X, CvMat* U, double epsilon, double tau, CvTermCriteria nt_term_crit, CvTermCriteria cg_term_crit )
{
    const double alpha = .01; /* sufficient-decrease factor of the line search */
    const double beta = .5;   /* step shrink factor of the line search */
    /* R = AX - B */
    CvMat* R = cvCreateMat( B->rows, B->cols, CV_MAT_TYPE(B->type) );
    cvGEMM( A, X, 1, B, -1, R );
    CvMat* fu1 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* fu2 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* lfu1 = cvCreateMat( fu1->rows, fu1->cols, CV_MAT_TYPE(fu1->type) );
    CvMat* lfu2 = cvCreateMat( fu2->rows, fu2->cols, CV_MAT_TYPE(fu2->type) );
    /* lfu1 = U - X = -fu1 and lfu2 = X + U = -fu2; the l* copies are later replaced by their logs */
    cvSub( U, X, lfu1 );
    cvAdd( X, U, lfu2 );
    cvSubRS( lfu1, cvScalar(0), fu1 );
    cvSubRS( lfu2, cvScalar(0), fu2 );
    double epsilon2 = epsilon * epsilon;
    double tau_inv = 1. / tau;
    double fe = .5 * (cvDotProduct( R, R ) - epsilon2);
    double fe_inv = 1. / fe;
    cvLog( lfu1, lfu1 );
    cvLog( lfu2, lfu2 );
    CvScalar sumU = cvSum( U );
    CvScalar sumfu1 = cvSum( lfu1 );
    CvScalar sumfu2 = cvSum( lfu2 );
    /* barrier objective: sum(U) - (1/tau) * (sum log(-fu1) + sum log(-fu2) + log(-fe)) */
    double f = sumU.val[0] - tau_inv * (sumfu1.val[0] + sumfu2.val[0] + log(-fe));
    CvMat* atr = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* ntgx = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* ntgu = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* sig1211 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* sigx = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* w1 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* AtA = cvCreateMat( A->cols, A->cols, CV_MAT_TYPE(A->type) );
    CvMat* H11 = cvCreateMat( A->cols, A->cols, CV_MAT_TYPE(A->type) );
    CvMat* du = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* pX = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* pU = cvCreateMat( U->rows, U->cols, CV_MAT_TYPE(U->type) );
    CvMat* pR = cvCreateMat( R->rows, R->cols, CV_MAT_TYPE(R->type) );
    CvMat* pfu1 = cvCreateMat( fu1->rows, fu1->cols, CV_MAT_TYPE(fu1->type) );
    CvMat* pfu2 = cvCreateMat( fu2->rows, fu2->cols, CV_MAT_TYPE(fu2->type) );
    CvMat* Adx = cvCreateMat( B->rows, B->cols, CV_MAT_TYPE(B->type) );
    CvMat* dx = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    int result = nt_term_crit.max_iter;
    int t, i;
    /* A'A depends only on A, so it is computed once instead of once per iteration */
    cvGEMM( A, A, 1, NULL, 0, AtA, CV_GEMM_A_T );
    for ( t = 0; t < nt_term_crit.max_iter; ++t )
    {
        /* atr = A'R, H11 = atr * atr' (rank-one part of the Hessian) */
        cvGEMM( A, R, 1, NULL, 0, atr, CV_GEMM_A_T );
        cvGEMM( atr, atr, 1, NULL, 0, H11, CV_GEMM_B_T );
        double* atrp = atr->data.db;
        double* fu1p = fu1->data.db;
        double* fu2p = fu2->data.db;
        double* ntgxp = ntgx->data.db;
        double* ntgup = ntgu->data.db;
        double* sig1211p = sig1211->data.db;
        double* sigxp = sigx->data.db;
        double* w1p = w1->data.db;
        double* dup = du->data.db;
        /* per-element gradient pieces and the reduced (dx-only) system terms */
        for ( i = 0; i < X->rows; ++i, ++atrp, ++fu1p, ++fu2p, ++ntgxp, ++ntgup, ++sig1211p, ++sigxp, ++w1p, ++dup )
        {
            double fu1_inv = 1. / (*fu1p);
            double fu2_inv = 1. / (*fu2p);
            double ntgxv = fu1_inv - fu2_inv + fe_inv * (*atrp);
            double ntguv = -tau - fu1_inv - fu2_inv;
            double sig11 = fu1_inv * fu1_inv + fu2_inv * fu2_inv;
            double sig12 = -fu1_inv * fu1_inv + fu2_inv * fu2_inv;
            *sig1211p = sig12 / sig11;
            *sigxp = sig11 - sig12 * (*sig1211p);
            *w1p = ntgxv - (*sig1211p) * ntguv;
            *ntgxp = -tau_inv * ntgxv;
            *ntgup = -tau_inv * ntguv;
            *dup = ntguv / sig11;
        }
        /* H11 = diag(sigx) - (1/fe) A'A + (1/fe)^2 atr atr'.
           The rank-one term enters with a plus sign: it comes from differentiating
           -log(-fe) twice, and it matches what icvH11Ops applies in the operator form. */
        cvAddWeighted( AtA, -fe_inv, H11, fe_inv * fe_inv, 0, H11 );
        sigxp = sigx->data.db;
        double* H11p = H11->data.db;
        for ( i = 0; i < A->cols; ++i, ++sigxp, H11p += A->cols + 1 )
            *H11p += *sigxp;
        if ( cvCGSolve( H11, w1, dx, cg_term_crit ) > .5 )
        {
            result = t;
            goto clean_up;
        }
        cvMatMul( A, dx, Adx );
        /* back-substitute: du = ntgu / sig11 - (sig12 / sig11) * dx */
        dup = du->data.db;
        sig1211p = sig1211->data.db;
        double* dxp = dx->data.db;
        for ( i = 0; i < X->rows; ++i, ++dup, ++sig1211p, ++dxp )
            *dup -= (*sig1211p) * (*dxp);
        /* minimum step size that stays in the interior */
        double aqe = cvDotProduct( Adx, Adx );
        double bqe = 2. * cvDotProduct( R, Adx );
        double cqe = cvDotProduct( R, R ) - epsilon2;
        /* positive root of aqe*s^2 + bqe*s + cqe = 0; the whole numerator is divided by 2*aqe */
        double smax = MIN( 1, (-bqe + sqrt( bqe * bqe - 4 * aqe * cqe )) / (2 * aqe) );
        dup = du->data.db;
        dxp = dx->data.db;
        fu1p = fu1->data.db;
        fu2p = fu2->data.db;
        for ( i = 0; i < X->rows; ++i, ++dup, ++dxp, ++fu1p, ++fu2p )
        {
            if ( (*dxp) - (*dup) > 0 )
                smax = MIN( smax, -(*fu1p) / ((*dxp) - (*dup)) );
            if ( (*dxp) + (*dup) < 0 )
                smax = MIN( smax, (*fu2p) / ((*dxp) + (*dup)) );
        }
        smax *= .99;
        /* backtracking line search */
        bool suffdec = false;
        int backiter = 0;
        double fep = fe;
        double fp = f;
        double lambda2;
        while (!suffdec)
        {
            cvAddWeighted( X, 1, dx, smax, 0, pX );
            cvAddWeighted( U, 1, du, smax, 0, pU );
            cvAddWeighted( R, 1, Adx, smax, 0, pR );
            cvSub( pU, pX, lfu1 );
            cvAdd( pX, pU, lfu2 );
            cvSubRS( lfu1, cvScalar(0), pfu1 );
            cvSubRS( lfu2, cvScalar(0), pfu2 );
            fep = .5 * (cvDotProduct( pR, pR ) - epsilon2);
            cvLog( lfu1, lfu1 );
            cvLog( lfu2, lfu2 );
            CvScalar sumpU = cvSum( pU );
            /* sum the logged barrier terms (lfu*), exactly as f is formed before the loop */
            CvScalar sumpfu1 = cvSum( lfu1 );
            CvScalar sumpfu2 = cvSum( lfu2 );
            fp = sumpU.val[0] - tau_inv * (sumpfu1.val[0] + sumpfu2.val[0] + log(-fep));
            /* ntgx/ntgu hold the gradient, so lambda2 is the directional derivative here */
            lambda2 = cvDotProduct( ntgx, dx ) + cvDotProduct( ntgu, du );
            double flin = f + alpha * smax * lambda2;
            suffdec = (fp <= flin);
            smax = beta * smax;
            ++backiter;
            if ( backiter > 32 )
            {
                result = t;
                goto clean_up;
            }
        }
        /* set up for next iteration */
        cvCopy( pX, X );
        cvCopy( pU, U );
        cvCopy( pR, R );
        cvCopy( pfu1, fu1 );
        cvCopy( pfu2, fu2 );
        fe = fep;
        fe_inv = 1. / fe;
        f = fp;
        /* half the Newton decrement is the stopping measure */
        lambda2 = -lambda2 * .5;
        if ( lambda2 < nt_term_crit.epsilon )
        {
            result = t + 1;
            break;
        }
    }
clean_up:
    cvReleaseMat( &pfu2 );
    cvReleaseMat( &pfu1 );
    cvReleaseMat( &pR );
    cvReleaseMat( &pU );
    cvReleaseMat( &pX );
    cvReleaseMat( &dx );
    cvReleaseMat( &Adx );
    cvReleaseMat( &du );
    cvReleaseMat( &H11 );
    cvReleaseMat( &AtA );
    cvReleaseMat( &w1 );
    cvReleaseMat( &sigx );
    cvReleaseMat( &sig1211 );
    cvReleaseMat( &ntgu );
    cvReleaseMat( &ntgx );
    cvReleaseMat( &lfu2 );
    cvReleaseMat( &lfu1 );
    cvReleaseMat( &fu2 );
    cvReleaseMat( &fu1 );
    cvReleaseMat( &R );
    return result;
}
/* Explicit-matrix L1QC solver: builds a starting point, then runs the
   log-barrier outer loop, multiplying tau by mu after every Newton solve.
   Returns -1 if the starting-point CG solve reports more than .5, else 0. */
int cvL1QCSolve( CvMat* A, CvMat* B, CvMat* X, double epsilon, double mu, CvTermCriteria lb_term_crit, CvTermCriteria cg_term_crit )
{
    /* starting point: X = A' * W, where W solves (A A') W = B */
    CvMat* gram = cvCreateMat( A->rows, A->rows, CV_MAT_TYPE(A->type) );
    cvGEMM( A, A, 1, NULL, 0, gram, CV_GEMM_B_T );
    CvMat* seed = cvCreateMat( A->rows, 1, CV_MAT_TYPE(X->type) );
    const bool cg_failed = cvCGSolve( gram, B, seed, cg_term_crit ) > .5;
    if ( !cg_failed )
        cvGEMM( A, seed, 1, NULL, 0, X, CV_GEMM_A_T );
    cvReleaseMat( &seed );
    cvReleaseMat( &gram );
    if ( cg_failed )
        return -1;

    /* U = .95 * |X| + .1 * max|X| */
    CvMat* bound = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    cvAbsDiffS( X, bound, cvScalar(0) );
    CvScalar abs_total = cvSum( bound );
    double abs_lo, abs_hi;
    cvMinMaxLoc( bound, &abs_lo, &abs_hi );
    cvConvertScale( bound, bound, .95, abs_hi * .1 );

    double tau = MAX( (2 * X->rows + 1) / abs_total.val[0], 1 );
    /* without an explicit iteration cap, derive the outer iteration count from the tolerance */
    const bool has_iter_cap = (lb_term_crit.type & CV_TERMCRIT_ITER) != 0;
    if ( !has_iter_cap )
        lb_term_crit.max_iter = ceil( (log(2 * X->rows + 1) - log(lb_term_crit.epsilon) - log(tau)) / log(mu) );
    CvTermCriteria nt_term_crit = cvTermCriteria( CV_TERMCRIT_EPS + CV_TERMCRIT_ITER, 50, lb_term_crit.epsilon );

    int outer = 0;
    while ( outer < lb_term_crit.max_iter )
    {
        icvL1QCNewton( A, B, X, bound, epsilon, tau, nt_term_crit, cg_term_crit );
        tau *= mu;
        ++outer;
    }
    cvReleaseMat( &bound );
    return 0;
}
/* Context handed (as userdata) to icvAAtOps and to the operator-form Newton solver. */
typedef struct {
CvMatOps AOps; /* callback applying A */
CvMatOps AtOps; /* callback applying A' */
CvMat* AR; /* caller-provided buffer that receives AOps output in the Newton solver */
CvMat* AtR; /* caller-provided buffer that receives AtOps output in icvAAtOps */
void* userdata; /* forwarded unchanged as the last argument of AOps/AtOps */
} CvAAtOpsData;
/* Operator callback computing Y = A (A' X): AtOps writes into the AtR
   buffer of the context, then AOps maps that buffer into Y. */
static void icvAAtOps( CvMat* X, CvMat* Y, void* userdata )
{
    CvAAtOpsData& ctx = *static_cast<CvAAtOpsData*>( userdata );
    ctx.AtOps( X, ctx.AtR, ctx.userdata );
    ctx.AOps( ctx.AtR, Y, ctx.userdata );
}
/* Context handed (as userdata) to icvH11Ops, which applies the reduced Newton
   system matrix without forming it. */
typedef struct {
CvMatOps AOps; /* callback applying A */
CvMatOps AtOps; /* callback applying A' */
CvMat* AR; /* buffer that receives A X */
CvMat* AtR; /* buffer that receives A' (A X), then the combined A-dependent terms */
CvMat* tX; /* buffer that receives sigx .* X */
CvMat* sigx; /* per-element diagonal term of the system */
CvMat* atr; /* vector used for the rank-one term (set to A'R by the Newton solver) */
double fe_inv; /* 1 / fe */
double fe_inv_2; /* (1 / fe)^2 */
void* userdata; /* forwarded unchanged as the last argument of AOps/AtOps */
} CvH11OpsData;
/* Operator callback computing
       Y = sigx .* X - fe_inv * A'(A X) + fe_inv_2 * (atr . X) * atr
   using the buffers stored in the CvH11OpsData context. */
static void icvH11Ops( CvMat* X, CvMat* Y, void* userdata )
{
    CvH11OpsData& ctx = *static_cast<CvH11OpsData*>( userdata );
    /* AtR = A' (A X) */
    ctx.AOps( X, ctx.AR, ctx.userdata );
    ctx.AtOps( ctx.AR, ctx.AtR, ctx.userdata );
    /* fold in the rank-one term: AtR = -fe_inv * AtR + weight * atr */
    const double projection = cvDotProduct( ctx.atr, X );
    const double weight = ctx.fe_inv_2 * projection;
    cvAddWeighted( ctx.AtR, -ctx.fe_inv, ctx.atr, weight, 0, ctx.AtR );
    /* diagonal term, then the final sum */
    cvMul( ctx.sigx, X, ctx.tX );
    cvAdd( ctx.tX, ctx.AtR, Y );
}
/*
 * Operator form of the Newton solver for one log-barrier subproblem of
 *     minimize sum(U)  subject to  X - U <= 0,  -X - U <= 0,  ||AX - B||^2 <= epsilon^2.
 * A is applied through AAtData.AOps / AAtData.AtOps; the reduced Newton system
 * is solved by cvCGSolve with icvH11Ops as the operator. X and U are updated in place.
 *
 * Elements are read through data.db and the element loops run X->rows times.
 * NOTE(review): this presumably requires X and U to be CV_64F column vectors - confirm with callers.
 * AAtData.AR and AAtData.AtR are used as output buffers of AOps / AtOps.
 *
 * tau           barrier weight of this subproblem
 * nt_term_crit  max_iter bounds the Newton iterations, epsilon is the threshold
 *               on half the Newton decrement
 * cg_term_crit  passed unchanged to cvCGSolve
 *
 * Returns t when iteration t is abandoned (cvCGSolve result above .5, or more
 * than 32 backtracking steps), t + 1 when the decrement test passes, and
 * nt_term_crit.max_iter when the loop runs to completion.
 */
static int icvL1QCNewton( CvAAtOpsData& AAtData, CvMat* B, CvMat* X, CvMat* U, double epsilon, double tau, CvTermCriteria nt_term_crit, CvTermCriteria cg_term_crit )
{
    const double alpha = .01; /* sufficient-decrease factor of the line search */
    const double beta = .5;   /* step shrink factor of the line search */
    /* R = AX - B */
    CvMat* R = cvCreateMat( B->rows, B->cols, CV_MAT_TYPE(B->type) );
    AAtData.AOps( X, AAtData.AR, AAtData.userdata );
    cvSub( AAtData.AR, B, R );
    CvMat* fu1 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* fu2 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* lfu1 = cvCreateMat( fu1->rows, fu1->cols, CV_MAT_TYPE(fu1->type) );
    CvMat* lfu2 = cvCreateMat( fu2->rows, fu2->cols, CV_MAT_TYPE(fu2->type) );
    /* lfu1 = U - X = -fu1 and lfu2 = X + U = -fu2; the l* copies are later replaced by their logs */
    cvSub( U, X, lfu1 );
    cvAdd( X, U, lfu2 );
    cvSubRS( lfu1, cvScalar(0), fu1 );
    cvSubRS( lfu2, cvScalar(0), fu2 );
    double epsilon2 = epsilon * epsilon;
    double tau_inv = 1. / tau;
    double fe = .5 * (cvDotProduct( R, R ) - epsilon2);
    double fe_inv = 1. / fe;
    cvLog( lfu1, lfu1 );
    cvLog( lfu2, lfu2 );
    CvScalar sumU = cvSum( U );
    CvScalar sumfu1 = cvSum( lfu1 );
    CvScalar sumfu2 = cvSum( lfu2 );
    /* barrier objective: sum(U) - (1/tau) * (sum log(-fu1) + sum log(-fu2) + log(-fe)) */
    double f = sumU.val[0] - tau_inv * (sumfu1.val[0] + sumfu2.val[0] + log(-fe));
    CvMat* atr = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* ntgx = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* ntgu = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* sig1211 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* sigx = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* w1 = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* du = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* pX = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* pU = cvCreateMat( U->rows, U->cols, CV_MAT_TYPE(U->type) );
    CvMat* pR = cvCreateMat( R->rows, R->cols, CV_MAT_TYPE(R->type) );
    CvMat* pfu1 = cvCreateMat( fu1->rows, fu1->cols, CV_MAT_TYPE(fu1->type) );
    CvMat* pfu2 = cvCreateMat( fu2->rows, fu2->cols, CV_MAT_TYPE(fu2->type) );
    CvMat* Adx = cvCreateMat( B->rows, B->cols, CV_MAT_TYPE(B->type) );
    CvMat* dx = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    CvMat* tX = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    int result = nt_term_crit.max_iter;
    /* context for icvH11Ops; fe_inv and fe_inv_2 are refreshed every iteration */
    CvH11OpsData H11OpsData;
    H11OpsData.AOps = AAtData.AOps;
    H11OpsData.AtOps = AAtData.AtOps;
    H11OpsData.AR = AAtData.AR;
    H11OpsData.AtR = AAtData.AtR;
    H11OpsData.userdata = AAtData.userdata;
    H11OpsData.tX = tX;
    H11OpsData.atr = atr;
    H11OpsData.sigx = sigx;
    int t, i;
    for ( t = 0; t < nt_term_crit.max_iter; ++t )
    {
        /* atr = A'R */
        AAtData.AtOps( R, atr, AAtData.userdata );
        double* atrp = atr->data.db;
        double* fu1p = fu1->data.db;
        double* fu2p = fu2->data.db;
        double* ntgxp = ntgx->data.db;
        double* ntgup = ntgu->data.db;
        double* sig1211p = sig1211->data.db;
        double* sigxp = sigx->data.db;
        double* w1p = w1->data.db;
        double* dup = du->data.db;
        /* per-element gradient pieces and the reduced (dx-only) system terms */
        for ( i = 0; i < X->rows; ++i, ++atrp, ++fu1p, ++fu2p, ++ntgxp, ++ntgup, ++sig1211p, ++sigxp, ++w1p, ++dup )
        {
            double fu1_inv = 1. / (*fu1p);
            double fu2_inv = 1. / (*fu2p);
            double ntgxv = fu1_inv - fu2_inv + fe_inv * (*atrp);
            double ntguv = -tau - fu1_inv - fu2_inv;
            double sig11 = fu1_inv * fu1_inv + fu2_inv * fu2_inv;
            double sig12 = -fu1_inv * fu1_inv + fu2_inv * fu2_inv;
            *sig1211p = sig12 / sig11;
            *sigxp = sig11 - sig12 * (*sig1211p);
            *w1p = ntgxv - (*sig1211p) * ntguv;
            *ntgxp = -tau_inv * ntgxv;
            *ntgup = -tau_inv * ntguv;
            *dup = ntguv / sig11;
        }
        H11OpsData.fe_inv = fe_inv;
        H11OpsData.fe_inv_2 = fe_inv * fe_inv;
        if ( cvCGSolve( icvH11Ops, &H11OpsData, w1, dx, cg_term_crit ) > .5 )
        {
            result = t;
            goto clean_up;
        }
        AAtData.AOps( dx, Adx, AAtData.userdata );
        /* back-substitute: du = ntgu / sig11 - (sig12 / sig11) * dx */
        dup = du->data.db;
        sig1211p = sig1211->data.db;
        double* dxp = dx->data.db;
        for ( i = 0; i < X->rows; ++i, ++dup, ++sig1211p, ++dxp )
            *dup -= (*sig1211p) * (*dxp);
        /* minimum step size that stays in the interior */
        double aqe = cvDotProduct( Adx, Adx );
        double bqe = 2. * cvDotProduct( R, Adx );
        double cqe = cvDotProduct( R, R ) - epsilon2;
        /* positive root of aqe*s^2 + bqe*s + cqe = 0; the whole numerator is divided by 2*aqe */
        double smax = MIN( 1, (-bqe + sqrt( bqe * bqe - 4 * aqe * cqe )) / (2 * aqe) );
        dup = du->data.db;
        dxp = dx->data.db;
        fu1p = fu1->data.db;
        fu2p = fu2->data.db;
        for ( i = 0; i < X->rows; ++i, ++dup, ++dxp, ++fu1p, ++fu2p )
        {
            if ( (*dxp) - (*dup) > 0 )
                smax = MIN( smax, -(*fu1p) / ((*dxp) - (*dup)) );
            if ( (*dxp) + (*dup) < 0 )
                smax = MIN( smax, (*fu2p) / ((*dxp) + (*dup)) );
        }
        smax *= .99;
        /* backtracking line search */
        bool suffdec = false;
        int backiter = 0;
        double fep = fe;
        double fp = f;
        double lambda2;
        while (!suffdec)
        {
            cvAddWeighted( X, 1, dx, smax, 0, pX );
            cvAddWeighted( U, 1, du, smax, 0, pU );
            cvAddWeighted( R, 1, Adx, smax, 0, pR );
            cvSub( pU, pX, lfu1 );
            cvAdd( pX, pU, lfu2 );
            cvSubRS( lfu1, cvScalar(0), pfu1 );
            cvSubRS( lfu2, cvScalar(0), pfu2 );
            fep = .5 * (cvDotProduct( pR, pR ) - epsilon2);
            cvLog( lfu1, lfu1 );
            cvLog( lfu2, lfu2 );
            CvScalar sumpU = cvSum( pU );
            /* sum the logged barrier terms (lfu*), exactly as f is formed before the loop */
            CvScalar sumpfu1 = cvSum( lfu1 );
            CvScalar sumpfu2 = cvSum( lfu2 );
            fp = sumpU.val[0] - tau_inv * (sumpfu1.val[0] + sumpfu2.val[0] + log(-fep));
            /* ntgx/ntgu hold the gradient, so lambda2 is the directional derivative here */
            lambda2 = cvDotProduct( ntgx, dx ) + cvDotProduct( ntgu, du );
            double flin = f + alpha * smax * lambda2;
            suffdec = (fp <= flin);
            smax = beta * smax;
            ++backiter;
            if ( backiter > 32 )
            {
                result = t;
                goto clean_up;
            }
        }
        /* set up for next iteration */
        cvCopy( pX, X );
        cvCopy( pU, U );
        cvCopy( pR, R );
        cvCopy( pfu1, fu1 );
        cvCopy( pfu2, fu2 );
        fe = fep;
        fe_inv = 1. / fe;
        f = fp;
        /* half the Newton decrement is the stopping measure */
        lambda2 = -lambda2 * .5;
        if ( lambda2 < nt_term_crit.epsilon )
        {
            result = t + 1;
            break;
        }
    }
clean_up:
    cvReleaseMat( &pfu2 );
    cvReleaseMat( &pfu1 );
    cvReleaseMat( &pR );
    cvReleaseMat( &pU );
    cvReleaseMat( &pX );
    cvReleaseMat( &tX );
    cvReleaseMat( &dx );
    cvReleaseMat( &Adx );
    cvReleaseMat( &du );
    cvReleaseMat( &w1 );
    cvReleaseMat( &sigx );
    cvReleaseMat( &sig1211 );
    cvReleaseMat( &ntgu );
    cvReleaseMat( &ntgx );
    cvReleaseMat( &lfu2 );
    cvReleaseMat( &lfu1 );
    cvReleaseMat( &fu2 );
    cvReleaseMat( &fu1 );
    cvReleaseMat( &R );
    return result;
}
/* Operator-form L1QC solver: A and A' are applied only through AOps / AtOps.
   Builds a starting point with CG on A A', then runs the log-barrier outer
   loop, multiplying tau by mu after every Newton solve.
   Returns -1 if the starting-point CG solve reports more than .5, else 0. */
int cvL1QCSolve( CvMatOps AOps, CvMatOps AtOps, void* userdata, CvMat* B, CvMat* X, double epsilon, double mu, CvTermCriteria lb_term_crit, CvTermCriteria cg_term_crit )
{
    CvMat* x_buf = cvCreateMat( X->rows, 1, CV_MAT_TYPE(X->type) );
    CvMat* b_buf = cvCreateMat( B->rows, 1, CV_MAT_TYPE(B->type) );

    CvAAtOpsData ops;
    ops.AOps = AOps;
    ops.AtOps = AtOps;
    ops.AtR = x_buf;
    ops.userdata = userdata;

    /* starting point: X = A' * b_buf, where b_buf solves (A A') b_buf = B */
    const bool cg_failed = cvCGSolve( icvAAtOps, &ops, B, b_buf, cg_term_crit ) > .5;
    if ( cg_failed )
    {
        cvReleaseMat( &b_buf );
        cvReleaseMat( &x_buf );
        return -1;
    }
    AtOps( b_buf, X, userdata );
    /* from here on b_buf is reused as the AOps output buffer */
    ops.AR = b_buf;

    /* U = .95 * |X| + .1 * max|X| */
    CvMat* bound = cvCreateMat( X->rows, X->cols, CV_MAT_TYPE(X->type) );
    cvAbsDiffS( X, bound, cvScalar(0) );
    CvScalar abs_total = cvSum( bound );
    double abs_lo, abs_hi;
    cvMinMaxLoc( bound, &abs_lo, &abs_hi );
    cvConvertScale( bound, bound, .95, abs_hi * .1 );

    double tau = MAX( (2 * X->rows + 1) / abs_total.val[0], 1 );
    /* without an explicit iteration cap, derive the outer iteration count from the tolerance */
    const bool has_iter_cap = (lb_term_crit.type & CV_TERMCRIT_ITER) != 0;
    if ( !has_iter_cap )
        lb_term_crit.max_iter = ceil( (log(2 * X->rows + 1) - log(lb_term_crit.epsilon) - log(tau)) / log(mu) );
    CvTermCriteria nt_term_crit = cvTermCriteria( CV_TERMCRIT_EPS + CV_TERMCRIT_ITER, 50, lb_term_crit.epsilon );

    /* newton_steps is accumulated but not reported to the caller */
    int newton_steps = 0;
    int outer = 0;
    while ( outer < lb_term_crit.max_iter )
    {
        newton_steps += icvL1QCNewton( ops, B, X, bound, epsilon, tau, nt_term_crit, cg_term_crit );
        tau *= mu;
        ++outer;
    }
    cvReleaseMat( &bound );
    cvReleaseMat( &b_buf );
    cvReleaseMat( &x_buf );
    return 0;
}
#endif

This is the C-interface to OpenCV, not C++. Some of the types seem to be still available, though, with "_c.h" header files, if you still want to use the C code and don't want to convert it to the C++ types and interface.
I managed to get most of your code parsed with OpenCV 4.2 and:
#include <opencv2/core.hpp>
#include <opencv2/core/core_c.h>
However except:
CvMatOps, cvCGSolve
( I didn't find them in the source either, I just searched the OpenCV include directories for these strings, the latter seems to be in that file cvcgsolve.h which is something specific, it's found here: https://github.com/liuliu/l1cs/blob/master/src/cvcgsolve.cpp )
Illustrations:

Gaussian elimination without result for acceleration

Good day,
I'm working on a C library (for myself, code: https://github.com/BattlestarSC/matrixLibrary.git) to handle matrix functions. This is mostly a learning/practice activity. One of my challenges is to take the determinant of a matrix efficiently. As my current attempts have failed, I wanted to take a different approach. I was reading through this method from MIT docs: http://web.mit.edu/18.06/www/Spring17/Determinants.pdf and it made a lot of sense. The issue I'm having is how to get to said point. The Gaussian elimination method is good for multi-variable systems of equations, but my matrices are not built from equations and are therefore not part of a system. That is, each equation has no set result and does not fit into the form from this paper here: https://math.oregonstate.edu/home/programs/undergrad/CalculusQuestStudyGuides/vcalc/gauss/gauss.html
From this point, I'm at a loss as far as how to proceed with this method.
It makes a lot of sense to take the pivot point from each set of equations as described in the MIT paper, but how should I set up my matrices to make said result valid?

When you perform a Gaussian elimination, you swap rows and repeatedly subtract a multiple of one row from another to produce an upper triangular form.
When you do this on a system of equations or an "augmented matrix", you do not use any information from the result column. The decisions about which rows to swap and which to subtract with what multiplier would be exactly the same no matter what numbers are in the result column.
Because the "result column" is not used, you can perform the same procedure on a normal square matrix. Since the operations don't change the determinant (if you negate one row whenever you swap), you end up with an upper triangular matrix with the same det as the original.
The MIT author calls a function lu to do this in the example near the start. This does L-U decomposition on the matrix, which returns the Gaussian-eliminated matrix in the 'U' part: https://en.wikipedia.org/wiki/LU_decomposition.
L-U decomposition is pretty cool. It's like doing Gaussian elimination to solve all systems with the same "matrix part" all at once, which again you can do because the process doesn't need to see the result column at all.
Starting with a matrix M, you get L and U such that LU = M. That means, if you want to solve:
Mx = y
... (where x and y are column vectors), you have:
LUx = y
Solve Lv=y, which is easy (just substitution) because L is lower-triangular. Then you have:
Ux = v
... which is easy to solve because U is upper-triangular.

GEM is not very good for computers as it needs to reorder the rows so the algo leads to a valid result that adds relatively big overhead and potential instability (if ordered badly). The GEM is much better suited for humans and paper/pencil as we instinctively reorder/chose rows ...
So you should go with the (sub)determinant approach as you wanted in the first place. It is faster and safer. I know it's a bit tricky to learn it from papers. If it helps, this is my ancient matrix.h class (in C++, though) that I wrote when I was still a rookie (so there might be some hidden bugs I do not know of; I haven't used it for ages):
//--- matrix ver: 2.1 -------------------------------------------------------
#ifndef _matrix_h
#define _matrix_h
//---------------------------------------------------------------------------
// Absolute value: negates x when it compares below zero, otherwise returns it unchanged.
double fabs(double x)
{
    return (x<0) ? -x : x;
}
//---------------------------------------------------------------------------
// Dynamically sized matrix of doubles stored as ys separately allocated rows
// of xs elements each. The object owns that storage and releases it in free().
// Only the inline members are defined here; the remaining members are defined
// outside the class.
class matrix
{
private:
    double **p;     // p[y][x]; NULL after free()
    int xs,ys;      // number of columns (xs) and rows (ys)
    double zeroacc; // non-negative threshold, see set_zeroacc()
public:
    // A new matrix is 1x1; its element is not initialized.
    matrix() { p=NULL; xs=0; ys=0; resize(1,1); zeroacc=1e-10; }
    // Deep copy. Without it the implicit copy would share p between two objects
    // and both destructors would delete the same rows.
    matrix(const matrix &b) { p=NULL; xs=0; ys=0; zeroacc=b.zeroacc; *this=b; }
    ~matrix() { free(); }
    void free();
    int resize(int _xs,int _ys);
    matrix& operator=(const matrix &b);
    matrix& operator+();
    matrix& operator-();
    matrix& operator+(matrix &b);
    matrix& operator-(matrix &b);
    matrix& operator*(matrix &b);
    matrix& operator+=(matrix &b);
    matrix& operator-=(matrix &b);
    matrix& operator*=(matrix &b);
    matrix& operator!();
    double& operator()(int y,int x);
    // Row access without bounds checking: m[y][x].
    double* operator[](int y) { return p[y]; }
    void one();
    // The getters are const so they can be called on const matrices
    // (operator= takes its argument by const reference).
    int get_xs() const { return xs; }
    int get_ys() const { return ys; }
    double get_zeroacc() const { return zeroacc; }
    // Stores the absolute value of _zeroacc.
    void set_zeroacc(double _zeroacc) { zeroacc=_zeroacc; if (zeroacc<0) zeroacc=-zeroacc; }
    void ld(int y,double x0=0.0,double x1=0.0,double x2=0.0,double x3=0.0,double x4=0.0,double x5=0.0,double x6=0.0,double x7=0.0,double x8=0.0,double x9=0.0);
    void prn(TCanvas *scr,int x0,int y0);
    void lxch(int y1,int y2);
    void lcom(int y1,int y2,double k);
    void lmul(int y,double k);
    void ldiv(int y,double k);
    int gaus(matrix &b);
    // Declared without the extra "matrix::" qualification, which standard C++
    // does not allow on a member declaration inside its own class.
    matrix& submatrix(int _x,int _y);
    double determinant();
    double subdeterminant();
    matrix& inv_det();
    matrix& inv_gaus();
};
//---------------------------------------------------------------------------
void matrix::free()
{
int y;
if (p!=NULL)
for (y=0;y<ys;y++)
delete[] p[y];
delete[] p;
p=NULL;
xs=0;
ys=0;
}
//---------------------------------------------------------------------------
// Discards the current contents and allocates a _xs by _ys matrix (columns by
// rows). Sizes below 1 are raised to 1. The new elements are not initialized.
// Returns 1 on success. Returns 0 with the matrix left empty when an
// allocation yields NULL.
// NOTE(review): a standard-conforming plain `new` throws instead of returning
// NULL, so the NULL checks only take effect on toolchains where it does not.
int matrix::resize(int _xs,int _ys)
{
    int y;
    free();
    if (_xs<1) _xs=1;
    if (_ys<1) _ys=1;
    xs=_xs;
    ys=_ys;
    p=new double*[ys];
    if (p==NULL)
    {
        xs=0;
        ys=0;
        return 0;
    }
    for (y=0;y<ys;y++)
    {
        p[y]=new double[xs];
        if (p[y]==NULL)
        {
            // Roll back the rows allocated so far. Everything here came from
            // new[], so it must be released with delete[].
            if (y>0)
                for (y--;y>=0;y--)
                    delete[] p[y];
            delete[] p;
            p=NULL;
            xs=0;
            ys=0;
            return 0;
        }
    }
    return 1;
}
//---------------------------------------------------------------------------
// Deep-copies the dimensions and elements of b into this matrix (zeroacc is left untouched).
// If the reallocation fails the matrix is left as resize() left it.
matrix& matrix::operator=(const matrix &b)
{
    int x,y;
    // Self-assignment must be a no-op: resize() frees the storage before anything could be copied.
    if (this==&b) return *this;
    // Read the members directly; b is const and the dimensions are needed before resize() runs.
    if (!resize(b.xs,b.ys)) return *this;
    if (b.p)
        for (y=0;y<ys;y++)
            for (x=0;x<xs;x++)
                p[y][x]=b.p[y][x];
    return *this;
}
//---------------------------------------------------------------------------
// Unary plus: returns a copy of this matrix held in a function-local static.
// The returned reference is overwritten by the next call of this operator.
matrix& matrix::operator+()
{
    static matrix c;
    int x,y;
    // When invoked on its own previous result (e.g. +(+a)) the copy already is the operand;
    // resizing it would free the data before it could be read.
    if (this==&c) return c;
    c.resize(xs,ys);
    for (y=0;y<ys;y++)
        for (x=0;x<xs;x++)
            c.p[y][x]= p[y][x];
    return c;
}
//---------------------------------------------------------------------------
// Unary minus: returns the element-wise negation held in a function-local static.
// The returned reference is overwritten by the next call of this operator.
matrix& matrix::operator-()
{
    static matrix c;
    int x,y;
    // When invoked on its own previous result (e.g. -(-a)) negate in place;
    // resizing would free the operand before it could be read.
    if (this!=&c) c.resize(xs,ys);
    for (y=0;y<ys;y++)
        for (x=0;x<xs;x++)
            c.p[y][x]=-p[y][x];
    return c;
}
//---------------------------------------------------------------------------
// Element-wise sum this+b, returned as a reference to a function-local static.
// On a dimension mismatch the result is an empty (0x0) matrix.
matrix& matrix::operator+(matrix &b)
{
    static matrix c;
    int x,y;
    if ((xs!=b.get_xs())||(ys!=b.get_ys())) { c.free(); return c; }
    // In chained expressions such as a+b+c one operand is the static result itself.
    // Freeing/resizing it first would destroy that operand, so in that case add in place.
    if ((this!=&c)&&(&b!=&c)) c.resize(xs,ys);
    for (y=0;y<ys;y++)
        for (x=0;x<xs;x++)
            c.p[y][x]=p[y][x]+b.p[y][x];
    return c;
}
//---------------------------------------------------------------------------
// Element-wise difference this-b, returned as a reference to a function-local static.
// On a dimension mismatch the result is an empty (0x0) matrix.
matrix& matrix::operator-(matrix &b)
{
    static matrix c;
    int x,y;
    if ((xs!=b.get_xs())||(ys!=b.get_ys())) { c.free(); return c; }
    // In chained expressions such as a-b-c one operand is the static result itself.
    // Freeing/resizing it first would destroy that operand, so in that case subtract in place.
    if ((this!=&c)&&(&b!=&c)) c.resize(xs,ys);
    for (y=0;y<ys;y++)
        for (x=0;x<xs;x++)
            c.p[y][x]=p[y][x]-b.p[y][x];
    return c;
}
//---------------------------------------------------------------------------
// Matrix product this*b, returned as a reference to a function-local static.
// Requires b to have as many rows as this has columns; otherwise (or when allocation fails)
// the result is an empty (0x0) matrix.
matrix& matrix::operator*(matrix &b)
{
    static matrix c;
    matrix t;                   // product is built here first, see below
    int i,j,k,ii,jj,kk;
    ii=ys;
    jj=b.get_xs();
    kk=b.get_ys();
    if (kk!=xs) { c.free(); return c; }
    if (!t.resize(jj,ii)) { c.free(); return c; }
    for (i=0;i<ii;i++)
        for (j=0;j<jj;j++)
        {
            t.p[i][j]=0.0;
            for (k=0;k<kk;k++)
                t.p[i][j]+=p[i][k]*b.p[k][j];
        }
    // In a chain such as (a*b)*c the left operand IS the static result, so the product must not
    // be written into c while the operands are still being read. Publish it only when complete.
    c=t;
    return c;
}
//---------------------------------------------------------------------------
// Adds b to this matrix element by element.
// If the dimensions differ this matrix is emptied (0x0) instead.
matrix& matrix::operator+=(matrix &b)
{
    const bool same_size=(xs==b.get_xs())&&(ys==b.get_ys());
    if (!same_size)
    {
        free();
        return *this;
    }
    for (int row=0;row<ys;row++)
    {
        for (int col=0;col<xs;col++)
        {
            p[row][col]+=b.p[row][col];
        }
    }
    return *this;
}
//---------------------------------------------------------------------------
// Subtracts b from this matrix element by element.
// If the dimensions differ this matrix is emptied (0x0) instead.
matrix& matrix::operator-=(matrix &b)
{
    const bool same_size=(xs==b.get_xs())&&(ys==b.get_ys());
    if (!same_size)
    {
        free();
        return *this;
    }
    for (int row=0;row<ys;row++)
    {
        for (int col=0;col<xs;col++)
        {
            p[row][col]-=b.p[row][col];
        }
    }
    return *this;
}
//---------------------------------------------------------------------------
// Replaces this matrix with the product this*b.
// The product is built in a local matrix and assigned at the end. When the inner dimensions do
// not match, or the allocation fails, the (emptied) local matrix is assigned instead.
matrix& matrix::operator*=(matrix &b)
{
    matrix product;
    product.free();
    const int rows=ys;
    const int cols=b.get_xs();
    const int inner=b.get_ys();
    if ((inner==xs)&&product.resize(cols,rows))
    {
        for (int r=0;r<rows;r++)
        {
            for (int c=0;c<cols;c++)
            {
                product.p[r][c]=0.0;
                for (int k=0;k<inner;k++)
                    product.p[r][c]+=p[r][k]*b.p[k][c];
            }
        }
    }
    *this=product;
    return *this;
}
//---------------------------------------------------------------------------
// Inverse of this matrix. Gauss elimination (inv_gaus) is the active implementation;
// inv_det() is the determinant-based alternative.
matrix& matrix::operator!()
{
    matrix &inverse=inv_gaus();
    return inverse;
}
//---------------------------------------------------------------------------
// Range-checked element access (row y, column x).
// Out-of-range indices yield a reference to a shared static dummy value instead of an element.
double& matrix::operator()(int y,int x)
{
    static double _null;
    const bool inside=(x>=0)&&(y>=0)&&(x<xs)&&(y<ys);
    if (!inside) return _null;
    return p[y][x];
}
//---------------------------------------------------------------------------
void matrix::one()
{
int x,y;
for (y=0;y<ys;y++)
for (x=0;x<xs;x++)
if (x!=y) p[y][x]=0.0;
else p[y][x]=1.0;
}
//---------------------------------------------------------------------------
void matrix::ld(int y,double x0,double x1,double x2,double x3,double x4,double x5,double x6,double x7,double x8,double x9)
{
int x;
if (y<0) return;
if (y>=ys) return;
x=0;
if (x<xs) p[y][x]=x0; x++;
if (x<xs) p[y][x]=x1; x++;
if (x<xs) p[y][x]=x2; x++;
if (x<xs) p[y][x]=x3; x++;
if (x<xs) p[y][x]=x4; x++;
if (x<xs) p[y][x]=x5; x++;
if (x<xs) p[y][x]=x6; x++;
if (x<xs) p[y][x]=x7; x++;
if (x<xs) p[y][x]=x8; x++;
if (x<xs) p[y][x]=x9; x++;
}
//---------------------------------------------------------------------------
// Prints the matrix onto the canvas as a grid of numbers with 4 decimals.
// (x0,y0) is the position of element [0][0]; cells are 50 units wide and 13 units high.
void matrix::prn(TCanvas *scr,int x0,int y0)
{
    const int cell_w=50;
    const int cell_h=13;
    for (int row=0;row<ys;row++)
    {
        const int py=y0+row*cell_h;
        for (int col=0;col<xs;col++)
        {
            const int px=x0+col*cell_w;
            scr->TextOutA(px,py,AnsiString().sprintf("%.4lf",p[row][col]));
        }
    }
}
//---------------------------------------------------------------------------
void matrix::lxch(int y1,int y2)
{
int x;
double a;
if (y1<0) return;
if (y2<0) return;
if (y1>=ys) return;
if (y2>=ys) return;
for (x=0;x<xs;x++) { a=p[y1][x]; p[y1][x]=p[y2][x]; p[y2][x]=a; }
}
//---------------------------------------------------------------------------
void matrix::lcom(int y1,int y2,double k)
{
int x;
if (y1<0) return;
if (y2<0) return;
if (y1>=ys) return;
if (y2>=ys) return;
for (x=0;x<xs;x++) p[y1][x]+=p[y2][x]*k;
}
//---------------------------------------------------------------------------
void matrix::lmul(int y,double k)
{
int x;
if (y<0) return;
if (y>=ys) return;
for (x=0;x<xs;x++) p[y][x]*=k;
}
//---------------------------------------------------------------------------
void matrix::ldiv(int y,double k)
{
int x;
if (y<0) return;
if (y>=ys) return;
if ((k> zeroacc)||(k<-zeroacc)) k=1.0/k; else k=0.0;
for (x=0;x<xs;x++) p[y][x]*=k;
}
//---------------------------------------------------------------------------
// Gauss-Jordan style elimination performed in place on this matrix.
// Every row operation applied to this matrix is applied to b as well, so when b starts as the
// identity it ends up holding the inverse (this is how inv_gaus() uses it).
// Returns 0 if this matrix is not square, if b has a different number of rows, or if a diagonal
// element stays within zeroacc of zero; returns 1 when all columns were processed.
int matrix::gaus(matrix &b)
{
int x,y;
double a;
if (xs!=ys) return 0;
if (ys!=b.ys) return 0;
for (x=0;x<xs;x++)
{
a=p[x][x]; // is the current diagonal element (x,x) zero?
if (a<0) a=-a;
if (a<=zeroacc)
for (y=0;y<ys;y++) // if so, look for some row with a nonzero entry in the current column (x)
if (x!=y)
{
a=p[y][x];
if (a<0) a=-a;
if (a>=zeroacc) // if one was found, add it to the current row, which removes the zero
{
b.lcom(x,y,1.0);
lcom(x,y,1.0);
break;
}
}
a=p[x][x]; // test once more whether the diagonal element is zero
if (a<0) a=-a;
if (a<=zeroacc) return 0; // if it still is, give up
b.ldiv(x,p[x][x]); // scale the row so the diagonal element becomes 1 (b first: ldiv on this changes p[x][x])
ldiv(x,p[x][x]);
for (y=0;y<ys;y++) // and zero out the remaining rows in column (x)
if (y!=x)
{
b.lcom(y,x,-p[y][x]); // b first: lcom on this changes p[y][x]
lcom(y,x,-p[y][x]);
}
}
return 1;
}
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// Builds the matrix obtained by deleting column _x and row _y.
// The result lives in a function-local static and is overwritten by the next call.
// NOTE(review): _x/_y are not range-checked; an index outside the matrix deletes nothing and
// writes past the (xs-1)x(ys-1) result.
matrix& matrix::submatrix(int _x,int _y)
{
    static matrix c;
    c.resize(xs-1,ys-1);
    int dst_row=0;
    for (int src_row=0;src_row<ys;src_row++)
    {
        if (src_row==_y) continue;
        int dst_col=0;
        for (int src_col=0;src_col<xs;src_col++)
        {
            if (src_col==_x) continue;
            c.p[dst_row][dst_col]=p[src_row][src_col];
            dst_col++;
        }
        dst_row++;
    }
    return c;
}
//---------------------------------------------------------------------------
double matrix::determinant()
{
double D;
matrix a;
int x,y,s;
D=0;
if (xs!=ys) return D;
if (xs==1) { D=p[0][0]; return D; }
y=0;
s=y&1;
for (x=0;x<xs;x++)
{
a=submatrix(x,y);
if (s) D-=a.determinant()*p[y][x];
else D+=a.determinant()*p[y][x];
s=!s;
}
return D;
}
//---------------------------------------------------------------------------
double matrix::subdeterminant()
{
double D;
matrix a,b;
int x,y,s;
D=0;
if (xs!=ys) return D;
if (xs==1) { D=p[0][0]; return D; }
b=this[0];
for (y=0;y<ys;y++)
for (x=0;x<xs;x++)
{
a=b.submatrix(x,y);
p[y][x]=a.determinant();
}
y=0;
s=y&1;
for (x=0;x<xs;x++)
{
if (s) D-=p[y][x]*b.p[y][x];
else D+=p[y][x]*b.p[y][x];
s=!s;
}
return D;
}
//---------------------------------------------------------------------------
// Inverse computed from sub-determinants: inverse[y][x] = (-1)^(x+y) * minor[x][y] / det.
// The result lives in a function-local static and is overwritten by the next call.
// When |det| <= zeroacc the minors are scaled by det itself instead of 1/det.
// For a non-square matrix no inverse is computed and a plain copy of this matrix is returned.
matrix& matrix::inv_det()
{
    int x,y,s;
    double D;
    static matrix a,b;
    a=this[0];
    // subdeterminant() leaves a non-square matrix untouched, and the transposed access
    // b.p[x][y] below would then read outside the rows.
    if (xs!=ys) return a;
    b=this[0];
    D=b.subdeterminant();
    if (fabs(D)>zeroacc) D=1.0/D;
    // subdeterminant() does not replace the element of a 1x1 matrix, so the general formula
    // would give v*(1/v)=1. The only minor of a 1x1 matrix is 1, hence the inverse is D.
    if (xs==1) { a.p[0][0]=D; return a; }
    for (y=0;y<ys;y++)
        for (x=0;x<xs;x++)
        {
            s=(x+y)&1;
            if (s) a.p[y][x]=-b.p[x][y]*D;
            else   a.p[y][x]= b.p[x][y]*D;
        }
    return a;
}
//---------------------------------------------------------------------------
// Inverse by elimination: a working copy of this matrix is reduced with gaus() while the same
// row operations are applied to an identity-pattern matrix, which is then returned.
// The result lives in a function-local static and is overwritten by the next call; the return
// value of gaus() is not checked here.
matrix& matrix::inv_gaus()
{
    static matrix work,result;
    work=*this;
    result.resize(xs,ys);
    result.one();
    work.gaus(result);
    return result;
}
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
Both the GEM (Gauss elimination, inv_gaus) and the (sub)determinant (inv_det) approaches are present, so just extract or compare whatever you need from it.
BTW, lately I needed some math stuff for N-dimensional space, and while I was at it I also coded a square matrix as a template in which the (sub)determinant approach is implemented as a recursive template (nd_math.h):
//--- N-Dimensional math ver: 1.002 -----------------------------------------
#ifndef _ND_math_h
#define _ND_math_h
//---------------------------------------------------------------------------
#include <math.h>
//---------------------------------------------------------------------------
#ifndef _rep4d_h
// Safe division: yields 0.0 instead of dividing when |b| is below 1e-30.
double divide(double a,double b)
{
    return (fabs(b)<1e-30) ? 0.0 : a/b;
}
#endif
//---------------------------------------------------------------------------
template <const DWORD N> class vector
{
public:
double a[N];
vector() {}
vector(vector& a) { *this=a; }
~vector() {}
vector* operator = (const vector<N> *a) { *this=*a; return this; }
//vector* operator = (vector<N> &a) { ...copy... return this; }
double& operator [](const int i) { return a[i]; }
vector<N> operator + () { return *this; } // =+v0
vector<N> operator - () { int i; vector<N> q; for ( i=0;i<N;i++) q.a[i]= -a[i]; return q; } // =-v0
vector<N> operator + (vector<N> &v) { int i; vector<N> q; for ( i=0;i<N;i++) q.a[i]=a[i]+v.a[i]; return q; } // =v0+v1
vector<N> operator - (vector<N> &v) { int i; vector<N> q; for ( i=0;i<N;i++) q.a[i]=a[i]-v.a[i]; return q; } // =v0-v1
double operator * (vector<N> &v) { int i; double q; for (q=0.0,i=0;i<N;i++) q +=a[i]*v.a[i]; return q; } // =(v0.v1) dot product
vector<N> operator + (const double &c) { int i; vector<N> q; for ( i=0;i<N;i++) q.a[i]=a[i]+c; return q; } // =v0+(c,c,c,c,...)
vector<N> operator - (const double &c) { int i; vector<N> q; for ( i=0;i<N;i++) q.a[i]=a[i]-c; return q; } // =v0-(c,c,c,c,...)
vector<N> operator * (const double &c) { int i; vector<N> q; for ( i=0;i<N;i++) q.a[i]=a[i]*c; return q; } // =v0*c
vector<N> operator / ( double c) { int i; vector<N> q; c=divide(1.0,c); for ( i=0;i<N;i++) q.a[i]=a[i]*c; return q; } // =v0/c
vector<N> operator +=(vector<N> &v) { this[0]=this[0]+v; return *this; }; // v0+=v1
vector<N> operator -=(vector<N> &v) { this[0]=this[0]-v; return *this; }; // v0-=v1
vector<N> operator +=(const double &c) { this[0]=this[0]+c; return *this; }; // v0+=(c,c,c,c,...)
vector<N> operator -=(const double &c) { this[0]=this[0]-c; return *this; }; // v0-=(c,c,c,c,...)
vector<N> operator *=(const double &c) { this[0]=this[0]*c; return *this; }; // v0*=c
vector<N> operator /=(const double &c) { this[0]=this[0]/c; return *this; }; // v0/=c
AnsiString str() { int i; AnsiString q; for (q="( ",i=0;i<N;i++) q+=AnsiString().sprintf("%6.3lf ",a[i]); q+=")"; return q; }
double len() { int i; double l; for (l=0.0,i=0;i<N;i++) l+=a[i]*a[i]; return sqrt(l); } // get size
double len2() { int i; double l; for (l=0.0,i=0;i<N;i++) l+=a[i]*a[i]; return l; } // get size^2
void len(double l) { int i; l=divide(l,len()); for (i=0;i<N;i++) a[i]*=l; } // set size
void unit() { len(1.0); } // set unit size
void zero() { int i; for (i=0;i<N;i++) a[i]=0.0; } // set zero vector
void rnd() { int i; for (i=0;i<N;i++) a[i]=(2.0*Random())-1.0; } // set random unit vector
void set(double c) { int i; for (i=0;i<N;i++) a[i]=c; } // (c,c,c,c,...)
// i x j = k | | i j k |
// j x k = i | a x b = det | a0 a1 a2 | = + i*det | a1 a2 | - j*det | a0 a2 | + k*det | a0 a1 |
// k x i = j | | b0 b1 b2 | | b1 b2 | | b0 b2 | | b0 b1 |
void cross(const vector<N> *v)
{
int i,j;
matrix<N> m0;
matrix<N-1> m;
for (i=1;i<N;i++)
for (j=0;j<N;j++)
m0.a[i][j]=v[i-1].a[j];
for (j=0;j<N;j++)
{
m=m0.submatrix(0,j);
if (int(j&1)==0) a[j]=+m.det();
else a[j]=-m.det();
}
}
void cross(vector<N> **v)
{
int i,j;
matrix<N> m0;
matrix<N-1> m;
for (i=1;i<N;i++)
for (j=0;j<N;j++)
m0.a[i][j]=v[i-1]->a[j];
for (j=0;j<N;j++)
{
m=m0.submatrix(0,j);
if (int(j&1)==0) a[j]=+m.det();
else a[j]=-m.det();
}
}
void cross(vector<N> &v0) { vector<N> *v[ 1]={&v0}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1) { vector<N> *v[ 2]={&v0,&v1}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2) { vector<N> *v[ 3]={&v0,&v1,&v2}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3) { vector<N> *v[ 4]={&v0,&v1,&v2,&v3}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3,vector<N> &v4) { vector<N> *v[ 5]={&v0,&v1,&v2,&v3,&v4}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3,vector<N> &v4,vector<N> &v5) { vector<N> *v[ 6]={&v0,&v1,&v2,&v3,&v4,&v5}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3,vector<N> &v4,vector<N> &v5,vector<N> &v6) { vector<N> *v[ 7]={&v0,&v1,&v2,&v3,&v4,&v5,&v6}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3,vector<N> &v4,vector<N> &v5,vector<N> &v6,vector<N> &v7) { vector<N> *v[ 8]={&v0,&v1,&v2,&v3,&v4,&v5,&v6,&v7}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3,vector<N> &v4,vector<N> &v5,vector<N> &v6,vector<N> &v7,vector<N> &v8) { vector<N> *v[ 9]={&v0,&v1,&v2,&v3,&v4,&v5,&v6,&v7,v8}; cross(v); }
void cross(vector<N> &v0,vector<N> &v1,vector<N> &v2,vector<N> &v3,vector<N> &v4,vector<N> &v5,vector<N> &v6,vector<N> &v7,vector<N> &v8,vector<N> &v9) { vector<N> *v[10]={&v0,&v1,&v2,&v3,&v4,&v5,&v6,&v7,v8,v9}; cross(v); }
void ld(const double &a0) { a[0]=a0; }
void ld(const double &a0,const double &a1) { a[0]=a0; a[1]=a1; }
void ld(const double &a0,const double &a1,const double &a2) { a[0]=a0; a[1]=a1; a[2]=a2; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3,const double &a4) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; a[4]=a4; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3,const double &a4,const double &a5) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; a[4]=a4; a[5]=a5; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3,const double &a4,const double &a5,const double &a6) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; a[4]=a4; a[5]=a5; a[6]=a6; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3,const double &a4,const double &a5,const double &a6,const double &a7) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; a[4]=a4; a[5]=a5; a[6]=a6; a[7]=a7; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3,const double &a4,const double &a5,const double &a6,const double &a7,const double &a8) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; a[4]=a4; a[5]=a5; a[6]=a6; a[7]=a7; a[8]=a8; }
void ld(const double &a0,const double &a1,const double &a2,const double &a3,const double &a4,const double &a5,const double &a6,const double &a7,const double &a8,const double &a9) { a[0]=a0; a[1]=a1; a[2]=a2; a[3]=a3; a[4]=a4; a[5]=a5; a[6]=a6; a[7]=a7; a[8]=a8; a[9]=a9; }
};
//---------------------------------------------------------------------------
// Square N x N matrix stored as N row vectors (a[row][column]).
// Determinant, minors, cofactor/adjugate matrices and the inverse are computed through recursive
// submatrix expansion, so the cost grows factorially with N.
template <DWORD N> class matrix // square matrix
    {
public:
    vector<N> a[N];
    matrix()    {}
    matrix(const matrix& a) { *this=a; }    // const& so temporaries (e.g. operator results) can be copied
    ~matrix()   {}
    matrix* operator = (const matrix<N> *a) { *this=*a; return this; }
    //matrix* operator = (matrix<N> &a) { ...copy... return this; }
    vector<N>& operator [](const int i) { return a[i]; }
    matrix<N> operator + ()             { return *this; }
    // The row bound is N (the class has no M; the matrix is square).
    matrix<N> operator - ()             { matrix<N> q; int i,j; for (i=0;i<N;i++) for (j=0;j<N;j++) q[i][j]=-a[i][j]; return q; }   // = -m0
    matrix<N> operator * (const matrix &m)      // = m0*m1
        {
        matrix<N> q;
        int i,j,k;
        for (i=0;i<N;i++)
         for (j=0;j<N;j++)
          for (q.a[i][j]=0.0,k=0;k<N;k++)
           q.a[i].a[j]+=a[i].a[k]*m.a[k].a[j];
        return q;
        }
    vector<N> operator * (vector<N> &v)         // = m0*v0
        {
        vector<N> q;
        int i,j;
        for (i=0;i<N;i++)
         for (q.a[i]=0.0,j=0;j<N;j++)
          q.a[i]+=a[i][j]*v.a[j];
        return q;
        }
    matrix<N> operator * (const double &c)      // = m0*c
        {
        matrix<N> q;
        int i,j;
        for (i=0;i<N;i++)
         for (j=0;j<N;j++)
          q.a[i].a[j]=a[i].a[j]*c;
        return q;
        }
    matrix<N> operator / (const double &c)      // = m0/c (c~0 gives a zero matrix, see divide)
        {
        return this[0]*divide(1.0,c);
        }
    matrix<N> operator *=(matrix<N> &m)     { this[0]=this[0]*m; return *this; };
    // NOTE(review): this overload assigns a vector to the matrix and returns the matrix as a
    // vector, so it cannot compile once instantiated; its intended meaning needs to be decided.
    vector<N> operator *=(vector<N> &v)     { this[0]=this[0]*v; return *this; };
    matrix<N> operator *=(const double &c)  { this[0]=this[0]*c; return *this; };
    matrix<N> operator /=(const double &c)  { this[0]=this[0]/c; return *this; };
    AnsiString str()    { int i,j; AnsiString q; for (q="",i=0;i<N;i++,q+="\r\n") { for (q+="( ",j=0;j<N;j++) q+=AnsiString().sprintf("%6.3lf ",a[i][j]); q+=")"; } return q; }
    void   unit()       { int i,j; for (i=0;i<N;a[i][i]=1.0,i++) for (j=0;j<N;j++) a[i][j]=0.0; }       // set unit matrix
    void   zero()       { int i,j; for (i=0;i<N;i++) for (j=0;j<N;j++) a[i][j]=0.0; }                   // set zero matrix
    void   rnd()        { int i,j; for (i=0;i<N;i++) for (j=0;j<N;j++) a[i][j]=(2.0*Random())-1.0; }    // set random <-1,+1> matrix
    void   set(double c){ int i,j; for (i=0;i<N;i++) for (j=0;j<N;j++) a[i][j]=c; }                     // (c,c,c,c,...)
    void orthonormal()  // convert to orthonormal matrix
        {
        int i,j;
        vector<N> *pV[N],*pp;
        for (i=0;i<N;i++) { a[i].unit(); pV[i]=a+i; }
        for (i=1;i<N;i++)
            {
            // replace the first vector by the cross product of the others, then rotate the list
            pV[0]->cross(pV+1);
            pp=pV[0]; for (j=1;j<N;j++) pV[j-1]=pV[j]; pV[N-1]=pp;
            }
        }
    matrix<N> transpose()
        {
        int i,j;
        matrix<N> M;
        for (i=0;i<N;i++)
         for (j=0;j<N;j++)
          M[i][j]=a[j][i];
        return M;
        }
    matrix<N> inverse()     // adjugate/determinant
        {
        return adjugate()/det();
        }
    matrix<N> adjugate()    // C[j][i] = (-1)^(i+j) * minor(i,j), i.e. the transposed cofactor matrix
        {
        matrix<N> C;
        double s;
        int i,j;
        for (i=0;i<N;i++)
         for (s=(i&1)?-1.0:+1.0,j=0;j<N;j++,s=-s)
          C[j][i]=minor(i,j)*s;
        return C;
        }
    matrix<N> cofactor()    // C[i][j] = (-1)^(i+j) * minor(i,j)
        {
        matrix<N> C;
        double s;
        int i,j;
        for (i=0;i<N;i++)
         // the sign starts at +1 on even rows, exactly as in adjugate(); the opposite start value
         // would produce the negated cofactor matrix
         for (s=(i&1)?-1.0:+1.0,j=0;j<N;j++,s=-s)
          C[i][j]=minor(i,j)*s;
        return C;
        }
    double minor(int i,int j)   // determinant of the matrix without row i and column j
        {
        return submatrix(i,j).det();
        }
    matrix<N-1> submatrix(int i,int j)  // copy of this matrix without row i and column j
        {
        matrix<N-1> m;
        int i0,i1,j0,j1;
        for (i0=0,i1=0;i1<N;i1++)
         if (i1!=i){ for (j0=0,j1=0;j1<N;j1++)
          if (j1!=j){ m.a[i0][j0]=a[i1][j1]; j0++; } i0++; }
        return m;
        }
    double det();
    };
//---------------------------------------------------------------------------
// Closed-form determinants for N=1 and N=2; they terminate the recursive expansion below.
// Explicit specializations need "template <>", and because this is a header they are marked
// inline so that including it from several translation units does not duplicate the definitions.
template <> inline double matrix<1>::det() { return a[0][0]; }
template <> inline double matrix<2>::det() { return (a[0][0]*a[1][1])-(a[0][1]*a[1][0]); }
// General case: expansion along row 0 with alternating signs, recursing into (N-1)x(N-1) submatrices.
template <DWORD N> double matrix<N>::det()
    {
    double d=0.0; int j;
    matrix<N-1> m;
    for (j=0;j<N;j++)
        {
        m=submatrix(0,j);
        if (int(j&1)==0) d+=a[0][j]*m.det();
        else             d-=a[0][j]*m.det();
        }
    return d;
    }
//---------------------------------------------------------------------------
#endif
//---------------------------------------------------------------------------
But as you can see, that code is a bit more complicated to follow, as I am at a different coding level now (look for inverse)...
If you also need the results, compute them from the matrix equation:
A*X = Y
X = inv(A)*Y
Where X are unknowns (vector) , Y are knowns (vector) and A is the matrix.

Transpose SSE2 Vectors

I try to convolve an image for wavelet decomposition using SSE2 and C. This image has 4 channels (Lab + alpha) stored contiguously in memory : [LabA][LabA][LabA]… The alpha channel is irrelevant for what I do here.
Accessing a pixel is then straightforward: load the content of a pointer that is incremented by 4 iteratively:
/* Edge-aware wavelet decomposition, row pass, one pixel (4 floats) per SSE vector.
 * For every pixel the 5-tap filter is applied to the pixel and its neighbours at offsets
 * -2, -1, +1, +2 (ASAN_ROW computes the neighbour offsets); each neighbour's filter
 * coefficient is scaled by weight_sse2(centre, neighbour, sharpen). The normalised sum is stored
 * to tmp (coarse layer) and centre - sum to detail (high-frequency layer).
 * NOTE(review): tmp, filter, mult and max_height_i are not defined in this excerpt, and the
 * parameters out and scale are not used in the part shown here.
 * _mm_load_ps/_mm_stream_ps are the aligned variants, so in/detail/tmp must be 16-byte aligned. */
static void eaw_decompose_sse2(float *const out,
                               const float *const in,
                               float *const detail,
                               const int scale,
                               const float sharpen,
                               const size_t width,
                               const size_t height)
{
  /* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
  for(size_t j = 0; j < height; j++)
  {
    for(size_t i = 0; i < width; i++)
    {
      const size_t inc = (j * width + i) * 4;
      float *pdetail = detail + inc;
      float *pcoarse = tmp + inc;
      // pixel to be convolved
      const __m128 pin0 = _mm_load_ps(in + inc);
      const __m128 w_0 = _mm_set1_ps(filter[2]);
      // neighbours
      const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
      const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
      const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
      const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));
      // neighbours contribution: each weight is computed from the neighbour it is later
      // multiplied with (w_3 <-> pin3, w_4 <-> pin4)
      const __m128 w_1 = _mm_set1_ps(filter[0]) * weight_sse2(pin0, pin1, sharpen);
      const __m128 w_2 = _mm_set1_ps(filter[1]) * weight_sse2(pin0, pin2, sharpen);
      const __m128 w_3 = _mm_set1_ps(filter[3]) * weight_sse2(pin0, pin3, sharpen);
      const __m128 w_4 = _mm_set1_ps(filter[4]) * weight_sse2(pin0, pin4, sharpen);
      // Filter computation (_mm_rcp_ps is an approximate reciprocal)
      const __m128 wgt = w_1 + w_2 + w_3 + w_4 + w_0;
      const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);
      // High frequency layer
      _mm_stream_ps(pdetail, pin0 - sum);
      // Low frequency layer
      _mm_stream_ps(pcoarse, sum);
    }
  }
}
The function ASAN_ROW slides the pointer along the rows ensuring we stay in the bounds, if not, it takes the nearest neighbour. weight_sse2 is a gaussian weight that does complicated bit-shifts because L and a/b have different weightings.
So, instead of operating on 4 Lab SSE vectors, with the last element lost, I feel it would be faster to operate on 3 SSE vectors, each vector being one Lab channel and each of its elements a neighbouring pixel. So that would become:
/* Sketch of the proposed channel-wise variant: instead of one vector per pixel, the L, a and b
 * values of the four neighbours are gathered into one vector per channel, the weights are
 * computed per channel group, and then scattered back into per-pixel weight vectors.
 * This is pseudo-code; the lines marked "// ?" are the parts the author is asking about.
 * NOTE(review): as written it does not compile - see the notes below. */
static void eaw_decompose_sse2(float *const out,
const float *const in,
float *const detail,
const int scale,
const float sharpen,
const size_t width,
const size_t height)
{
/* Convolve rows */
#ifdef _OPENMP
#pragma omp parallel for collapse(2)
#endif
for(size_t j = 0; j < height; j++)
{
for(size_t i = 0; i < width; i++)
{
const size_t inc = (j * width + i) * 4;
float *pdetail = detail + inc;
float *pcoarse = tmp + inc;
// pixel to be convolved
const __m128 pin0 = _mm_load_ps(in + inc);
const __m128 w_0 = _mm_set1_ps(filter[2]);
// neighbours
const __m128 pin1 = _mm_load_ps(in + ASAN_ROW(i, j, -2, mult, max_height_i, width));
const __m128 pin2 = _mm_load_ps(in + ASAN_ROW(i, j, -1, mult, max_height_i, width));
const __m128 pin3 = _mm_load_ps(in + ASAN_ROW(i, j, 1, mult, max_height_i, width));
const __m128 pin4 = _mm_load_ps(in + ASAN_ROW(i, j, 2, mult, max_height_i, width));
// Lab extraction - pixel to be convolved (broadcast one channel of the centre pixel to all lanes)
__m128 L_0 = _mm_set1_ps( pin0[0] ); // ?
__m128 a_0 = _mm_set1_ps( pin0[1] ); // ?
__m128 b_0 = _mm_set1_ps( pin0[2] ); // ?
// Lab extraction - neighbours (one channel of the four neighbours per vector)
// NOTE(review): _mm_set_ps takes four separate float arguments (highest lane first), not a braced list
__m128 L_f = _mm_set_ps ({ pin1[0], pin2[0], pin3[0], pin4[0] }); // ?
__m128 a_f = _mm_set_ps ({ pin1[1], pin2[1], pin3[1], pin4[1] }); // ?
__m128 b_f = _mm_set_ps ({ pin1[2], pin2[2], pin3[2], pin4[2] }); // ?
// neighbours contribution
// NOTE(review): this local named filter hides the array indexed as filter[2] above; filter_coeff is not defined in this excerpt
const __m128 filter = _mm_load_ps(filter_coeff);
const __m128 w_L = filter * weight_sse(L_0, L_f, sharpen);
const __m128 w_c = filter * weight_sse(a_0 + b_0, a_f + b_f, sharpen);
// Filter computation: per-channel sum of weights, laid out like a pixel
const __m128 wgt = _mm_set_ps( { sum_of_elts_sse(w_L),
sum_of_elts_sse(w_c),
sum_of_elts_sse(w_c),
0.0f } );
// scatter the per-channel weights back into one weight vector per neighbour
const __m128 w1 = _mm_set_ps ({ w_L[0], w_c[0], w_c[0], 0.0f }); // ?
const __m128 w2 = _mm_set_ps ({ w_L[1], w_c[1], w_c[1], 0.0f }); // ?
const __m128 w3 = _mm_set_ps ({ w_L[2], w_c[2], w_c[2], 0.0f }); // ?
const __m128 w4 = _mm_set_ps ({ w_L[3], w_c[3], w_c[3], 0.0f }); // ?
// NOTE(review): the weights above are named w1..w4 but this line uses w_1..w_4, and wgt as built above does not include w_0
const __m128 sum = (w_1 * pin1 + w_2 * pin2 + w_3 * pin3 + w_4 * pin4 + w_0 * pin0) * _mm_rcp_ps(wgt);
// High frequency layer
_mm_stream_ps(pdetail, pin0 - sum);
// Low frequency layer
_mm_stream_ps(pcoarse, sum);
}
}
}
What is the most cache-efficient way to switch from the channel-based vectors (pixels vectors pin0 to pin4) to the neighbour-based vectors (L_0, L_f), and the other way around (w_L and w_c to w_1-w_4) ? Would the second version be faster ?

Sierpinsky pyramid recursive algorithm

I am trying to implement a Sierpinsky pyramid, that is like a Sierpinsky triangle but in 3D.
I have this structure to contain all the data about a pyramid:
/* Description of one pyramid.
 * findSubPyramids() copies the first three floats as a group with memcpy, so
 * xUp, yUp, zUp must stay the first three members in this order. */
typedef struct
{
GLfloat xUp; /* presumably the position of the top vertex - confirm against drawPyramid */
GLfloat yUp;
GLfloat zUp;
GLfloat base; /* size of the base */
GLfloat height; /* height; drawSierpinskyPyramid asserts it equals base */
}pyramid;
Then I've written a function that calculates three sub-pyramids:
/* Fills subs[0..2] with three half-size pyramids derived from pyr:
 *   subs[0] keeps pyr's (xUp,yUp,zUp);
 *   subs[1] and subs[2] are placed half a height lower, shifted by +base/4 and -base/4 in x.
 * Both lower pyramids receive the same zUp (pyr.zUp - base/4).
 * NOTE(review): the yUp/xUp/zUp of subs[0] are the only position fields written for it; subs[1]
 * and subs[2] have all three written explicitly. */
void findSubPyramids( pyramid pyr, pyramid subs[3])
{
    int k = 0;
    while(k < 3)
    {
        subs[k].base=pyr.base/2.0;
        subs[k].height=pyr.height/2.0;
        k++;
    }

    /* top sub-pyramid: same position as the parent */
    subs[0].xUp= pyr.xUp;
    subs[0].yUp= pyr.yUp;
    subs[0].zUp= pyr.zUp;

    /* first lower sub-pyramid */
    subs[1].xUp= pyr.xUp+pyr.base/4.0;
    subs[1].yUp= pyr.yUp-pyr.height/2.0;
    subs[1].zUp= pyr.zUp-pyr.base/4.0;

    /* second lower sub-pyramid: mirrored in x, same y and z as the first */
    subs[2].xUp= pyr.xUp-pyr.base/4.0;
    subs[2].yUp= subs[1].yUp;
    subs[2].zUp= subs[1].zUp;
}
But this algorithm implementation is wrong: something is wrong with the zUp coordinate of the two sub-pyramids at bottom: indeed the pyramid is not drawn as I want:
But if I use glOrtho instead of gluPerspective, the pyramid is drawn OK. I know that gluPerspective and the functions I use are right, so it must be the algorithm that is wrong.
This is where I implement the algorithm that calculated all the sub-pyramids:
/* Recursively draws pyr and its sub-pyramids until the base is no larger than 4.0.
 * Each drawn level first picks a new random colour. Asserts that height equals base. */
void drawSierpinskyPyramid (pyramid pyr)
{
    assert(EQUAL(pyr.height, pyr.base));
    if(!(pyr.base > 4.0))
    {
        return;
    }

    setRandomColor();
    pyramid children[3];
    drawPyramid(pyr);
    findSubPyramids(pyr, children);
    int k = 0;
    while(k < 3)
    {
        drawSierpinskyPyramid(children[k]);
        k++;
    }
}
I don't get what's wrong.

Give this a shot:
// gcc -std=c99 main.c -lglut -lGL -lGLU
#include <GL/glut.h>
#include <math.h>
#include <stdlib.h>
/* 3D point. glTriangle() passes a Vec3f* to glVertex3fv() as a GLfloat*,
 * so the members must remain exactly x, y, z in this order. */
typedef struct
{
float x, y, z;
} Vec3f;
/* Emits one triangle (three vertices) in a random colour.
 * Must be called between glBegin( GL_TRIANGLES ) and glEnd(). */
void glTriangle( Vec3f* v0, Vec3f* v1, Vec3f* v2 )
{
    /* Draw the three channels in separate statements: as function arguments the rand() calls
     * have no defined order, so the colours would differ between compilers even with srand(0).
     * % 256 covers the whole 0..255 range of a colour byte (% 255 never yields 255). */
    const GLubyte r = (GLubyte)( rand() % 256 );
    const GLubyte g = (GLubyte)( rand() % 256 );
    const GLubyte b = (GLubyte)( rand() % 256 );
    glColor3ub( r, g, b );
    glVertex3fv( (GLfloat*)v0 );
    glVertex3fv( (GLfloat*)v1 );
    glVertex3fv( (GLfloat*)v2 );
}
// Emits the four faces of a tetrahedron: v0, v1, v2 form the base, v3 is the top.
// The per-face vertex order is taken from the table below.
void glTetrahedron( Vec3f* v0, Vec3f* v1, Vec3f* v2, Vec3f* v3 )
{
    Vec3f* corner[4] = { v0, v1, v2, v3 };
    static const int face[4][3] =
    {
        { 0, 2, 1 },    // base
        { 0, 1, 3 },
        { 1, 2, 3 },
        { 2, 0, 3 },
    };
    for( int f = 0; f < 4; f++ )
    {
        glTriangle( corner[ face[f][0] ], corner[ face[f][1] ], corner[ face[f][2] ] );
    }
}
// Linear interpolation between two points: returns v0 for u = 0 and v1 for u = 1.
Vec3f Lerp( Vec3f* v0, Vec3f* v1, float u )
{
    Vec3f mixed;
    mixed.x = v0->x + ( v1->x - v0->x ) * u;
    mixed.y = v0->y + ( v1->y - v0->y ) * u;
    mixed.z = v0->z + ( v1->z - v0->z ) * u;
    return mixed;
}
// Recursive Sierpinski tetrahedron. At level 0 the tetrahedron itself is emitted; otherwise it is
// replaced by four half-size tetrahedra, one at each corner, built from the edge midpoints.
void glSierpinskiPyramid( Vec3f* v0, Vec3f* v1, Vec3f* v2, Vec3f* v3, unsigned int level )
{
    if( level > 0 )
    {
        const unsigned int next = level-1;

        // midpoints of the six edges
        Vec3f m01 = Lerp( v0, v1, 0.5 );
        Vec3f m12 = Lerp( v1, v2, 0.5 );
        Vec3f m02 = Lerp( v0, v2, 0.5 );
        Vec3f m03 = Lerp( v0, v3, 0.5 );
        Vec3f m13 = Lerp( v1, v3, 0.5 );
        Vec3f m23 = Lerp( v2, v3, 0.5 );

        // one child per original corner
        glSierpinskiPyramid( v0, &m01, &m02, &m03, next );
        glSierpinskiPyramid( &m01, v1, &m12, &m13, next );
        glSierpinskiPyramid( &m02, &m12, v2, &m23, next );
        glSierpinskiPyramid( &m03, &m13, &m23, v3, next );
    }
    else
    {
        glTetrahedron( v0, v1, v2, v3 );
    }
}
// GLUT display callback: sets up a perspective projection, then draws a slowly rotating
// level-3 Sierpinski tetrahedron. The rotation angle advances by one step per call.
void display()
{
    static float angle = 0;
    angle += 1;

    const double winW = glutGet( GLUT_WINDOW_WIDTH );
    const double winH = glutGet( GLUT_WINDOW_HEIGHT );

    glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );

    // projection
    glMatrixMode( GL_PROJECTION );
    glLoadIdentity();
    gluPerspective( 60, winW / winH, 0.1, 100 );

    // camera
    glMatrixMode( GL_MODELVIEW );
    glLoadIdentity();
    glTranslatef( 0, 0, -9 );

    // reseeding keeps the random face colours identical from frame to frame
    srand(0);

    glPushMatrix();
    glScalef( 3, 3, 3 );
    glRotatef( angle/3, 0.2, 1, 0 );

    // corners of the outer tetrahedron: base0..base2 = base, apex = top
    Vec3f base0 = { -1, -1 / sqrtf(3), -1 / sqrtf(6) };
    Vec3f base1 = { 1, -1 / sqrtf(3), -1 / sqrtf(6) };
    Vec3f base2 = { 0, 2 / sqrtf(3), -1 / sqrtf(6) };
    Vec3f apex = { 0, 0, 3 / sqrtf(6) };

    glBegin( GL_TRIANGLES );
    glSierpinskiPyramid( &base0, &base1, &base2, &apex, 3 );
    glEnd();
    glPopMatrix();

    glutSwapBuffers();
}
// GLUT timer callback: requests a redraw and re-arms itself to fire again in 16 ms.
void timer(int extra)
{
    (void)extra;    // the timer value is not used
    glutPostRedisplay();
    glutTimerFunc(16, timer, 0);
}
// Program entry: creates a 640x480 double-buffered window with a depth buffer, registers the
// display and timer callbacks, enables depth testing and face culling, and enters the GLUT loop.
int main( int argc, char **argv )
{
    const unsigned int mode = GLUT_RGBA | GLUT_DEPTH | GLUT_DOUBLE;

    glutInit( &argc, argv );
    glutInitDisplayMode( mode );
    glutInitWindowSize( 640, 480 );
    glutCreateWindow( "Sierpinski Pyramid" );

    // callbacks
    glutDisplayFunc( display );
    glutTimerFunc(0, timer, 0);

    // fixed GL state
    glEnable( GL_DEPTH_TEST );
    glEnable( GL_CULL_FACE );

    glutMainLoop();
    return 0;
}

how to convert OpenGL code using vertex arrays into code using vertex buffer objects?

this is my draw() function written in C, using vertex arrays:
/* Draws the points stored in pts as a white line strip, mapping the data window
 * [x1,x2] x [y1,y2] onto the unit square. pts holds x,y pairs, so pts.size / 2 vertices are drawn. */
void draw(float x1, float x2, float y1, float y2)
{
    const int havePoints = ( pts.size > 0 );

    glPushMatrix();
    glScalef(1.0 / (x2 - x1), 1.0 / (y2 - y1), 1.0);
    glTranslatef(-x1, -y1, 0.0);
    glColor3f(1.0, 1.0, 1.0);
    if( havePoints )
    {
        glEnableClientState( GL_VERTEX_ARRAY );
        glVertexPointer( 2, GL_FLOAT, 0, (float*)pts.data );
        glDrawArrays( GL_LINE_STRIP, 0, pts.size / 2 );
        glDisableClientState( GL_VERTEX_ARRAY );
    }
    glPopMatrix();
}
Before calling draw(), pts gets updated inside the update() function:
/* Rebuilds pts with samples of a function over [x1,x2): x advances in steps of 1/N and each
 * sample appends the pair (x, f(x)) as two floats. m counts the samples written.
 * f is the func argument; only when func is NULL is the global func3 used instead
 * (previously func was ignored and func3 was always called). */
void update(double (* func)(double x), float x1, float x2, int N)
{
    double x, dx = (double)1.0/(double)N;
    vector_cleanup( &pts );
    m = 0;
    for(x = x1; x < x2; x += dx)
    {
        vector_resize( &pts, pts.size + 2 );
        *(float*)vector_get( &pts, pts.size-2 ) = (float)x;
        *(float*)vector_get( &pts, pts.size-1 ) = (float)( func ? func(x) : func3(x) );
        m++;
    }
}
I hope that by converting this code to use VBO, my graphics performance will increase.
EDIT: func3() can be anything, e.g. sin(x) or just some linear mapping. All I'm currently trying to do is, to find out how quickly I can plot a bunch of points.

Using GLEW for extension wrangling:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <GL/glew.h>
#include <GL/glut.h>
typedef struct vector /*growable array of fixed-size elements; here it stores the float coordinates of the plot*/
{
void* data; /*element storage, NULL while nothing is allocated*/
size_t size; /*number of elements currently in use (not bytes)*/
size_t allocsize; /*allocated capacity in bytes*/
unsigned typesize; /*sizeof the type you store in data*/
} vector;
/* Sets the element count of p to size, growing the storage when needed.
 * Growth allocates twice the number of bytes required so repeated appends stay cheap.
 * Returns 1 on success; returns 0 and leaves p untouched if realloc fails. */
static unsigned vector_resize(vector* p, size_t size)
{
  const size_t needed = size * p->typesize;
  if(needed > p->allocsize)
  {
    const size_t grown = needed * 2;
    void* moved = realloc(p->data, grown);
    if(!moved) return 0;
    p->allocsize = grown;
    p->data = moved;
  }
  p->size = size;
  return 1;
}
static void vector_cleanup(void* p)
{
((vector*)p)->size = ((vector*)p)->allocsize = 0;
free(((vector*)p)->data);
((vector*)p)->data = NULL;
}
/* Initialises p as an empty vector whose elements are typesize bytes each. */
static void vector_init(vector* p, unsigned typesize)
{
  p->typesize = typesize;
  p->allocsize = 0;
  p->size = 0;
  p->data = NULL;
}
/* Returns the address of element index (no bounds check). */
static void* vector_get(vector* p, size_t index)
{
  char* base = (char*)p->data;
  const size_t offset = index * p->typesize;
  return base + offset;
}
/* function to calculate each data point */
float func(float x)
{
return (float)sin(x);
}
GLuint vbo = 0;         /* buffer object holding the plot vertices; its name is generated in main() */
GLsizei vertcount = 0;  /* number of vertices currently stored in vbo */
/* Samples func over [x1,x2) in steps of 1/N, uploads the (x, func(x)) pairs to vbo and records
 * the vertex count in vertcount. Does nothing when N is not positive. */
void update(float (* func)(float x), float x1, float x2, int N)
{
  float x, dx;
  vector pts;
  /* N <= 0 would give an infinite or negative step, i.e. a loop that never reaches x2 */
  if( N <= 0 ) return;
  dx = 1.0f/N;
  vector_init( &pts, sizeof( float ) );
  for(x = x1; x < x2; x += dx)
  {
    /* stop sampling if the buffer cannot grow: the size would be unchanged and the two
     * writes below would land outside the allocation */
    if( !vector_resize( &pts, pts.size + 2 ) ) break;
    *(float*)vector_get( &pts, pts.size-2 ) = x;
    *(float*)vector_get( &pts, pts.size-1 ) = func(x);
  }
  vertcount = (GLsizei)( pts.size / 2 );
  glBindBuffer( GL_ARRAY_BUFFER, vbo );
  glBufferData( GL_ARRAY_BUFFER, pts.size * pts.typesize, pts.data, GL_DYNAMIC_DRAW );
  glBindBuffer( GL_ARRAY_BUFFER, 0 );
  /* the data now lives in the buffer object, the client-side copy is no longer needed */
  vector_cleanup( &pts );
}
/* Plotting function: draws the vertcount vertices stored in vbo as a white line strip,
 * mapping the data window [x1,x2] x [y1,y2] onto the unit square. */
void draw(float x1, float x2, float y1, float y2)
{
  const float scaleX = 1.0f / (x2 - x1);
  const float scaleY = 1.0f / (y2 - y1);

  glPushMatrix();
  glScalef( scaleX, scaleY, 1.0f );
  glTranslatef( -x1, -y1, 0.0f );
  glColor3f( 1.0f, 1.0f, 1.0f );

  /* with a buffer bound, the pointer argument of glVertexPointer is an offset into it */
  glBindBuffer( GL_ARRAY_BUFFER, vbo );
  glEnableClientState( GL_VERTEX_ARRAY );
  glVertexPointer( 2, GL_FLOAT, 0, 0 );
  glDrawArrays( GL_LINE_STRIP, 0, vertcount );
  glDisableClientState( GL_VERTEX_ARRAY );
  glBindBuffer( GL_ARRAY_BUFFER, 0 );

  glPopMatrix();
}
/* Data window currently shown: x range and y range. */
float xmin = -10, xmax = 10, ymin = -5, ymax = 5;
/* GLUT display callback: clears to black, draws the plot for the current window, swaps buffers. */
void redraw(void)
{
  glClearColor(0, 0, 0, 0);
  glClear(GL_COLOR_BUFFER_BIT);

  glMatrixMode(GL_MODELVIEW);
  glLoadIdentity();

  /* arguments: x range first, then y range */
  draw(xmin, xmax, ymin, ymax);

  glutSwapBuffers();
}
/* Sampling density handed to update() as N (the step is 1/N). */
int nPoints = 3000;
/* GLUT idle callback: scrolls the x window one unit to the right, resamples the function
 * for the new window and requests a redraw. */
void idle(void)
{
  xmin += 1;
  xmax += 1;
  update(func, xmin, xmax, nPoints);
  glutPostRedisplay();
}
/* Key press processing */
void key(unsigned char c, int x, int y)
{
if(c == 27) exit(0);
};
/* Window reshape callback: the viewport covers the whole window and the projection maps the
 * unit square [0,1] x [0,1] onto it; the matrix mode is left at GL_MODELVIEW. */
void reshape(int w, int h)
{
  glViewport(0, 0, w, h);

  glMatrixMode(GL_PROJECTION);
  glLoadIdentity();
  glOrtho(0.0, 1.0, 0.0, 1.0, -1.0, 1.0);

  glMatrixMode(GL_MODELVIEW);
}
/* Main function */
int main(int argc, char **argv)
{
GLenum err;
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
glutCreateWindow("Graph plotter");
glutReshapeWindow(1024, 800);
// init GLEW and output some GL info
err = glewInit();
printf("GL_VERSION : %s\n", glGetString(GL_VERSION) );
printf("GL_VENDOR : %s\n", glGetString(GL_VENDOR) );
printf("GL_RENDERER : %s\n", glGetString(GL_RENDERER) );
if( GLEW_OK != err )
{
printf("glewInit failed: %s", glewGetErrorString(err));
return EXIT_FAILURE;
}
if( !glewIsSupported("GL_VERSION_1_5") )
{
printf("OpenGL version 1.5 or greater required.\n");
return EXIT_FAILURE;
}
glGenBuffers( 1, &vbo );
/* Register GLUT callbacks. */
glutDisplayFunc(redraw);
glutKeyboardFunc(key);
glutReshapeFunc(reshape);
glutIdleFunc(idle);
/* Init the GL state */
glLineWidth(2.0);
/* Main loop */
glutMainLoop();
return 0;
}

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

How to convert _mm_shuffle_ps SSE intrinsic to NEON intrinsic? - arm

Related

Porting to newer OpenCV 'C' interface; cv.h file not found

Gaussian elimination without result for acceleration

Transpose SSE2 Vectors

Sierpinsky pyramid recursive algorithm

how to convert OpenGL code using vertex arrays into code using vertex buffer objects?

Categories

Resources