How to write the mexFunction of this c file - c

The function is cyclic.c.
void cyclic(float a[], float b[], float c[], float alpha, float beta,
float r[], float x[], unsigned long n)
// Solves for a vector x[1..n] the “cyclic” set of linear equations. a,
//b, c, and r are input vectors, all dimensioned as [1..n], while alpha and beta are //the corner
// entries in the matrix.
I am new for the interface between Matlab and C. And I have not use C for several years.
Last night, I finished it and compile. The last thing is to call it.
#include "mex.h"
#include "nrutil.h"
#define FREE_ARG char*
#define NR_END 1
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#define NR_END 1
#define FREE_ARG char*
void nrerror(char error_text[])
/* Numerical Recipes standard error handler */
{fprintf(stderr,"Numerical Recipes run-time error...\n");
fprintf(stderr,"%s\n",error_text);
fprintf(stderr,"...now exiting to system...\n");
exit(1);
}
float *vector(long nl, long nh)
/* allocate a float vector with subscript range v[nl..nh] */
{
float *v;
v=(float *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(float)));
if (!v) nrerror("allocation failure in vector()");
return v-nl+NR_END;
}
void free_vector(float *v, long nl, long nh)
/* free a float vector allocated with vector() */
{
free((FREE_ARG) (v+nl-NR_END));
}
void tridag(float a[], float b[], float c[], float r[], float u[],
unsigned long n)
{
unsigned long j;
float bet,*gam;
gam=vector(1,n);
if (b[1] == 0.0) nrerror("Error 1 in tridag");
u[1]=r[1]/(bet=b[1]);
for (j=2;j<=n;j++) {
gam[j]=c[j-1]/bet;
bet=b[j]-a[j]*gam[j];
if (bet == 0.0) nrerror("Error 2 in tridag");
u[j]=(r[j]-a[j]*u[j-1])/bet;
}
for (j=(n-1);j>=1;j--)
u[j] -= gam[j+1]*u[j+1];
free_vector(gam,1,n);
}
void cyclic(float a[], float b[], float c[], float alpha, float beta,
float r[], float x[], unsigned long n)
{
void tridag(float a[], float b[], float c[], float r[], float u[],
unsigned long n);
unsigned long i;
float fact,gamma,*bb,*u,*z;
if (n <= 2) nrerror("n too small in cyclic");
bb=vector(1,n);
u=vector(1,n);
z=vector(1,n);
gamma = -b[1]; //Avoid subtraction error in forming bb[1].
bb[1]=b[1]-gamma; //Set up the diagonal of the modified tridiagonal
bb[n]=b[n]-alpha*beta/gamma; //system.
for (i=2;i<n;i++) bb[i]=b[i];
tridag(a,bb,c,r,x,n);// Solve A · x = r.
u[1]=gamma;// Set up the vector u.
u[n]=alpha;
for (i=2;i<n;i++) u[i]=0.0;
tridag(a,bb,c,u,z,n);// Solve A · z = u.
fact=(x[1]+beta*x[n]/gamma)/ //Form v · x/(1 + v · z).
(1.0+z[1]+beta*z[n]/gamma);
for (i=1;i<=n;i++) x[i] -= fact*z[i]; //Nowget the solution vector x.
free_vector(z,1,n);
free_vector(u,1,n);
free_vector(bb,1,n);
}
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
float *a,*b,*c,*x,*r;
float alpha,beta;
unsigned long n = (unsigned long) mxGetScalar(prhs[6]);
// a=mxGetPr(prhs[0]);
// b=mxGetPr(prhs[1]);
// c=mxGetPr(prhs[2]);
// r=mxGetPr(prhs[5]);
a = (float*) mxGetData(prhs[0]);
b = (float*) mxGetData(prhs[1]);
c = (float*) mxGetData(prhs[2]);
r = (float*) mxGetData(prhs[5]);
// alpha=*(mxGetPr(prhs[3]));
// beta=*(mxGetPr(prhs[4]));
alpha = (float) mxGetScalar(prhs[3]);
beta = (float) mxGetScalar(prhs[4]);
plhs[0]= mxCreateDoubleMatrix(n, 1, mxREAL);
x = mxGetPr(plhs[0]);
mexPrintf("%f ",alpha);
mexPrintf("\n");
mexPrintf("%f ",beta);
mexPrintf("\n");
mexPrintf("%d ",n);
mexPrintf("\n");
cyclic(a,b,c, alpha, beta,r,x,n) ;
mexPrintf("%d ",n);
mexPrintf("\n");
}
Finally I successfully compile itcyclic(a,b,c, alpha, beta,r,x,n) ;. But the answer is not right. I thing this is because r is an imaginary vector. So my question is how should I transform r between C and Matlab?

The C function cyclic expects arrays of floats, but mexFunction is passing a double*. Without changing cyclic.c, you have two options:
Convert the data to single in MATLAB and get a float* with mxGetData.
In mexFunction:
float *a = (float*) mxGetData(prhs[0]);
In MATLAB:
mexFunction(single(a),...)
Convert (copy, not cast!) the data in mexFunction.
In mexFunction, allocate new float arrays, and copy each element from the double input array (mxGetPr(prhs[0])) into the temporary float array.
Call mexFunction with a normal double array in MATLAB.
It's probably easier to do the former.
Under no circumstances should you simply cast the pointer, not that you were planning to do that.
Also, the scalars alpha, beta and n need to be read from prhs as scalars and passed to cyclic as scalars. In mexFunction, use:
float alpha = (float) mxGetScalar(prhs[...]);
float beta = (float) mxGetScalar(prhs[...]);
unsigned long n = (unsigned long) mxGetScalar(prhs[...]);
You've entirely forgotten c and r in mexFunction.

Related

error when solving ordinary differential equation numerically in C - error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token

I get following error when I compile my C code. I am using Numerical Recipes 2nd ed. functions rk4() for solving a first order differential equation.
I am not expert in this field. Any help will be highly appreciated.
Error is:
first_order_DE_RK4_example1.c:75: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘{’ token
code is:
#include "nrutil.h"
#include <stdio.h>
#include <math.h>
void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
void (*derivs)(float, float [], float []));
void (*derivs)(float, float[], float[]);
int main()
{
int n; float h; float x;
float y[1];
float dydx[1];
n=1;
h=0.2;
x=0;
y[0] = 1;
dydx[0] = 5.0;
void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
void (*derivs)(float, float [], float []));
return 0;
}
void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
void (*derivs)(float, float [], float []))
{
int i;
float xh,hh,h6,*dym,*dyt,*yt;
dym=vector(1,n);
dyt=vector(1,n);
yt=vector(1,n);
hh=h*0.5;
h6=h/6.0;
xh=x+hh;
for (i=1;i<=n;i++)
{
yt[i]=y[i]+hh*dydx[i];
(*derivs)(xh,yt,dyt);
}
for (i=1;i<=n;i++)
{ yt[i]=y[i]+hh*dyt[i];
(*derivs)(xh,yt,dym);
}
for (i=1;i<=n;i++)
{
yt[i]=y[i]+h*dym[i];
dym[i] += dyt[i];
}
(*derivs)(x+h,yt,dyt);
for (i=1;i<=n;i++)
{
yout[i]=y[i]+h6*(dydx[i]+dyt[i]+2.0*dym[i]);
}
free_vector(yt,1,n);
free_vector(dyt,1,n);
free_vector(dym,1,n);
}
void (*derivs)(float x, float y, float dydx)
{
float rhs;
rhs = 1-x+4*y;
}
You appear to be using function pointers for no reason and confusing prototypes and declarations with calls and definitions.
At the top, this change:
void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
void (*derivs)(float, float [], float []));
void (*derivs)(float, float[], float[]);
void rk4(float y[], float dydx[], int n, float x, float h, float yout[]);
void derivs(float, float[], float[]);
In main, this:
void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
void (*derivs)(float, float [], float []));
rk4(y, dydx, n, x, h, yout); // with some appropriate declared yout
In the definition of rk4,
void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
void (*derivs)(float, float [], float []))
void rk4(float y[], float dydx[], int n, float x, float h, float yout[])
For calls to derivs in rk4,
(*derivs)(xh,yt,dyt);
derivs(xh, yt, dyt);
In the definition of derivs,
void (*derivs)(float x, float y, float dydx)
void derivs(float x, float y[], float dydx[])
and then fix the computation of rhs.
If you find yourself writing sometype (*name in the future without intending to use function pointers, stop. =)
After correcting to a compiling version, I ran diff to report the changes in a systematic manner. These changes with their reasons are:
8c8
< void (*derivs)(float, float[], float[]);
---
> void myprime(float, float[], float[]);
To avoid confusion, I will use a different name for the parameter and the user-defined function that will fill that parameter.
The user-defined derivative function is just an ordinary function (procedure, subroutine, method), the function pointer declaration is restricted to the places where actually a variable is declared as a function pointer.
14a15
> float yout[1];
Add the missing variable declaration. This will help the program compile, but not run, as you are mixing two different index ranges. The standard range of float arr[N] is 0..N-1. The vector helper function of "Numerical recipes" uses pointer arithmetic so that the index range for the valid, allocated part of float *arr=vector(L,H) is L..H. In your present code you are mixing ranges 0..0 and 1..1 and thus accessing memory outside the allocated range. Switch everything to vector and the index range 1..n.
23,24c24
< void rk4(float y[], float dydx[], int n, float x, float h, float yout[],
< void (*derivs)(float, float [], float []));
---
> rk4(y, dydx, n, x, h, yout, myprime);
To call a function looks different than to declare that function. In the call you just use the variable names for the parameters.
44,47c44,47
< {
< yt[i]=y[i]+hh*dydx[i];
< (*derivs)(xh,yt,dyt);
< }
---
> {
> yt[i]=y[i]+hh*dydx[i];
> }
> derivs(xh,yt,dyt);
First you run the loop to compute the next state vector, then you call the derivative computation once.
The idea to pass the function pointer as a parameter is that you then use it like it were a function. No need to add any dereferencing etc., the C mechanism is constructed like that.
Both points also apply to the other two derivative computations. By the way, you are missing the computation of the derivative in dydx. The idea may be that this value is also used outside the routine for other purposes, so passing it avoids double computation.
73c74
< void (*derivs)(float x, float y, float dydx)
---
> void myprime(float x, float y[], float dydx[])
Implement the user-defined derivative function with the name change.
75,76c76
< float rhs;
< rhs = 1-x+4*y;
---
> dydx[0] = 1-x+4*y[0];
The passed arguments for y and dydx are vectors, they have to be treated as vectors with appropriate de-referencing or element addressing even in the scalar case.
You need to implement the intent behind the function declaration. The result of the derivative computation is to be stored in dydx.

lapack dgels_ segmentation fault 11

I am trying to use LAPACK's dgels_ in C to solve a linear least squares problem. I have to read the matrix A (assumed to have full rank and m>=n) and a vector b from 2 text files. I can easily compile my code, but when i try to run it I get a "segmentation fault 11", but I can't really see why. It is my first time using LAPACK so I don't know if maybe I am using the dgels_ function wrong?? The way I get it the solution x will be overwritten in the vector b? :
lssolve.c:
#include <stdlib.h>
#include <stdio.h>
#include "linalg.h"
/* C prototype for LAPACK routine DGELS */
void dgels_(const char * trans, const int * m, const int * n, const int *
nrhs, double * A, const int * lda, double * B, const int * ldb, double * work,
int * lwork,int * info);
int main(int argc, char * argv[]) {
vector_t * b_t = NULL;
matrix_t * A_t = NULL;
char trans = 'N';
int m, n, nrhs, mb, lda, ldb, info, lwork;
double optwork;
double * work;
// we read the matrix A and the vector b:
b_t = read_vector("b.txt");
A_t = read_matrix("A.txt");
m = A_t-> m; //number of rows in A
n = A_t-> n; //number of columns in A
nrhs = 1; //number of columns in B (will always be 1, since we read b_t with read_vector)
mb = b_t -> n; //number of rows in B
if (mb != m ) { //end program if A and B doesn't have the same number of rows
free(A_t);
free(b_t);
fprintf(stderr, "Sorry, but the matrix A and the vector b have incompatible dimensions. Good Bye!\n");
exit(EXIT_FAILURE);
}
//We make A and B into the wanted input form for the dgels_-function:
double * B = b_t -> v;
double ** A = A_t ->A;
lda = m;
ldb = mb;
//we calculate the optimal size of the work array:
lwork = -1;
dgels_(&trans, &n, &m, &nrhs, *A, &lda, B, &ldb, &optwork, &lwork, &info);
lwork = (int)optwork;
//we allocate space for the work array:
work = (double*)malloc( lwork*sizeof(double));
//solving the least squares problem:
dgels_(&trans, &n, &m, &nrhs, *A, &lda, B, &ldb, work, &lwork, &info);
//Check whether there was an successful exit:
if (info > 0){
fprintf(stderr, "Sorry, but illegal arguments were used, and therefore a least square solution cannot be computes. Good Bye!\n");
exit(EXIT_FAILURE);
} else if(info < 0){
fprintf(stderr, "Sorry, but A doesn't have full rank, and therefore a least square solution cannot be computed. Good Bye!\n");
exit(EXIT_FAILURE);
}
//Saving the least square problem as a vector_t:
vector_t * x = NULL;
x->n = mb;
x->v = B;
print_vector(x);
//Free memory
free_vector(b_t);
free_matrix(A_t);
free_vector(x);
return(EXIT_SUCCESS);
}
I am using the functions read_matrix, read_vector, print_vector, print_matrix and free_vector, which is why I use the struct vector_t and matrix_t:
typedef struct vector {
unsigned long n; /* length of vector */
double * v; /* pointer to array of length n */
} vector_t;
typedef struct matrix {
unsigned long m; /* number of rows */
unsigned long n; /* number of columns */
double ** A; /* pointer to two-dimensional array */
} matrix_t;
I don't think that anything is wrong with read_vector and read_matrix because I can easily do this and use print_vector or print_matrix before I do all of the other operations.
You dereference a NULL pointer here, causing the segfault:
//Saving the least square problem as a vector_t:
vector_t * x = NULL;
x->n = mb;
x->v = B;
Maybe you should use/create a new vector_t instead of just a pointer to a vector_t?

On entry to DGEEV parameter number 9 had an illegal value

I am trying for the first time to use LAPACK from C to diagonalize a matrix and I am stuck.
I have been trying to modify this example http://rcabreral.blogspot.co.uk/2010/05/eigenvalues-clapack.html from zgeev to dgeev. I have looked at the DGEEV input parameters, http://www.netlib.org/lapack/explore-html/d9/d28/dgeev_8f.html but it seems I don't understand the well enough.
Hence, the code below produces:
**** On entry to DGEEV parameter number 9 had an illegal value**
EDIT: The error occurs in the call of dgeev spanning lines 48 to (including) 53.
EDIT: Note that the arguments differ from the specifications here
http://www.netlib.org/lapack/explore-html/d9/d28/dgeev_8f.html
in that they have been translated to pointers. That is necessary when using these Fortran routines in C, as explained here:
http://www.physics.orst.edu/~rubin/nacphy/lapack/cprogp.html
#include <stdio.h>
#include <math.h>
#include <complex.h>
#include <stdlib.h>
//.........................................................................
void dgeTranspose( double *Transposed, double *M ,int n) {
int i,j;
for(i=0;i<n;i++)
for(j=0;j<n;j++)
Transposed[i+n*j] = M[i*n+j];
}
//.........................................................................
// MatrixComplexEigensystem: computes the eigenvectors and eigenValues of input matrix A
// The eigenvectors are stored in columns
//.........................................................................
void MatrixComplexEigensystem( double *eigenvectorsVR, double *eigenvaluesW, double *A, int N){
int i;
double *AT = (double *) malloc( N*N*sizeof(double ) );
dgeTranspose( AT, A , N);
char JOBVL ='N'; // Compute Right eigenvectors
char JOBVR ='V'; // Do not compute Left eigenvectors
double VL[1];
int LDVL = 1;
int LDVR = N;
int LWORK = 4*N;
double *WORK = (double *)malloc( LWORK*sizeof(double));
double *RWORK = (double *)malloc( 2*N*sizeof(double));
int INFO;
dgeev_( &JOBVL, &JOBVR, &N, AT , &N , eigenvaluesW ,
VL, &LDVL,
eigenvectorsVR, &LDVR,
WORK,
&LWORK, RWORK, &INFO );
dgeTranspose( AT, eigenvectorsVR , N);
for(i=0;i<N*N;i++) eigenvectorsVR[i]=AT[i];
free(WORK);
free(RWORK);
free(AT);
}
int main(){
int i,j;
const int N = 3;
double A[] = { 1.+I , 2. , 3 , 4. , 5.+I , 6. , 7., 8., 9. + I};
double eigenVectors[N*N];
double eigenValues[N];
MatrixComplexEigensystem( eigenVectors, eigenValues, A, N);
printf("\nEigenvectors\n");
for(i=0;i<N;i++){
for(j=0;j<N;j++) printf("%e", eigenVectors[i*N + j]);
printf("\n");
}
printf("\nEigenvalues \n");
for(i=0;i<N;i++) printf("%e", eigenValues[i] );
printf("\n------------------------------------------------------------\n");
return 0;
}
You can not port directly from zgeev to dgeev. The zgeev gets a complex matrix and computes complex eigenvalues. While dgeev gets a real matrix and computes complex eigenvalues. In order to be consistent LAPACK uses WR and WI which is used for the real and imaginary part of each eigenvalue.
So note that dgeev definition is
void dgeev_(char* JOBVL, char* JOBVR, int* N, double* A, int* LDA, double* WR, double* WI, double* VL, int* LDVL, double* VR, int* LDVR, double* WORK, int* LWORK, int* INFO);
My suggestion for your example is to remove:
#include <complex.h>
remove I's from matrix of doubles:
double A[] = { 1. , 2. , 3 , 4. , 5. , 6. , 7., 8., 9.};
then double the size of eigenvalues vector:
double eigenValues[2*N];
and call dgeev using WR and WI:
double *eigenvaluesWR = eigenvaluesW;
double *eigenvaluesWI = eigenvaluesW+N;
dgeev_(&JOBVL, &JOBVR, &N, AT, &N,
eigenvaluesWR, eigenvaluesWI,
VL, &LDVL,
eigenvectorsVR, &LDVR,
WORK, &LWORK, &INFO);

How do I parallelize this triple loop in an efficient way?

I'm trying to parallelize a function which takes as input three arrays (x, y, and prb) and one scalar, and outputs three arrays (P1, Pt1, and Px).
The original c code is here (the outlier and E are inconsequential):
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B) ((A) > (B) ? (A) : (B))
#define min(A, B) ((A) < (B) ? (A) : (B))
void cpd_comp(
double* x,
double* y,
double* prb,
double* sigma2,
double* outlier,
double* P1,
double* Pt1,
double* Px,
double* E,
int N,
int M,
int D
)
{
int n, m, d;
double ksig, diff, razn, outlier_tmp, sp;
double *P, *temp_x;
P = (double*) calloc(M, sizeof(double));
temp_x = (double*) calloc(D, sizeof(double));
ksig = -2.0 * *sigma2;
for (n=0; n < N; n++) {
sp=0;
for (m=0; m < M; m++) {
razn=0;
for (d=0; d < D; d++) {
diff=*(x+n+d*N)-*(y+m+d*M); diff=diff*diff;
razn+=diff;
}
*(P+m)=exp(razn/ksig) ;
sp+=*(P+m);
}
*(Pt1+n)=*(prb+n);
for (d=0; d < D; d++) {
*(temp_x+d)=*(x+n+d*N)/ sp;
}
for (m=0; m < M; m++) {
*(P1+m)+=((*(P+m)/ sp) **(prb+n));
for (d=0; d < D; d++) {
*(Px+m+d*M)+= (*(temp_x+d)**(P+m)**(prb+n));
}
}
*E += -log(sp);
}
*E +=D*N*log(*sigma2)/2;
free((void*)P);
free((void*)temp_x);
return;
}
Here is my attempt at parallelizing it:
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
/*headers*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Square of sigma
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in
);
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M);
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M);
/*implementations*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Scalar
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in y
){
//X is generatedPointPos
//Y is points
float
*P,
*P1timessp,
*Pxtimessp,
ksig = -2.0 * (*sigma2),
*h_sumofP = new float[N], //sum of P, on host
*d_sumofP; //sum of P, on device
cudaMalloc((void**)&P, sizeof(float)*M*N);
cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
cudaMalloc((void**)&d_sumofP, sizeof(float)*N);
cudaMalloc((void**)P1, sizeof(float)*M);
cudaMalloc((void**)Px, sizeof(float)*M*3);
cudaMalloc((void**)Pt1, sizeof(float)*N);
d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);
for(int n=0; n<N; n++){
thrust::device_ptr<float>dev_ptr(P);
h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
}
cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);
d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);
cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);
cudaFree(P);
cudaFree(P1timessp);
cudaFree(Pxtimessp);
cudaFree(d_sumofP);
delete[]h_sumofP;
}
/*kernels*/
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M){
//thread configuration: <<<dim3(N,M/1024+1),1024>>>
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
float
x1 = x[3*n],
x2 = x[3*n+1],
x3 = x[3*n+2],
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
razn = diff1*diff1+diff2*diff2+diff3*diff3,
Pm = __expf(razn/ksig), //fast exponentiation
prbn = prb[n];
P[M*n+m] = Pm;
__syncthreads();
P1[N*m+n] = Pm*prbn;
Px[3*(N*m+n)+0] = x1*Pm*prbn;
Px[3*(N*m+n)+1] = x2*Pm*prbn;
Px[3*(N*m+n)+2] = x3*Pm*prbn;
}
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M){
//computes P1 and Px
//thread configuration: <<<M/1024+1,1024>>>
int m = threadIdx.x+blockIdx.x*blockDim.x;
if(m>=M) return;
float
P1m = 0,
Pxm1 = 0,
Pxm2 = 0,
Pxm3 = 0;
for(int n=0; n<N; n++){
float spn = 1/sp[n];
P1m += P1timessp[N*m+n]*spn;
Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
}
P1[m] = P1m;
Px[3*m+0] = Pxm1;
Px[3*m+1] = Pxm2;
Px[3*m+2] = Pxm3;
}
However, to my horror, it runs much, much slower than the original version. How do I make it run faster? Please explain things thoroughly since I am very new to CUDA and parallel programming and have no experience in algorithms.
Do note that the c version has column-major ordering and the CUDA version has row-major. I have done several tests to make sure that the result is correct. It's just extremely slow and takes up a LOT of memory.
Any help is greatly appreciated!
EDIT: More information: N and M are on the order of a few thousand (say, 300-3000) and D is always 3. The CUDA version expects arrays to be device memory, except for variables prefixed with h_.
Before trying any CUDA-specific optimizations, profile your code to see where time is being spent.
Try and arrange your array reads/writes so that each CUDA thread uses a strided access pattern. For example, currently you have
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
So thread 1 will read from y[0],y[1],y[2] etc. Instead, rearrange your data so that thread 1 reads from y[0],y[M],y[2*M] and thread 2 reads from y[1],y[M+1],y[2*M+1] etc. You should follow this access pattern for other arrays.
Also, you may want to consider whether you can avoid the use of __syncthreads(). I don't quite follow why it's necessary in this algorithm, it might be worth removing it to see if it improves performance ( even if it produces incorrect results ).
The key to good CUDA performance is almost always to make as near to optimal memory access as possible. Your memory access pattern looks very similar to matrix multiplication. I would start with a good CUDA matrix multiplication implementation, being sure to understand why it's implemented the way it is, and then modify that to suit your needs.

Computing the reciprocal condition number with lapack (i.e. rcond(x))

I wish to do exactly what rcond does in MATLAB/Octave using LAPACK from C.
The MATLAB manual tells me dgecon is used, and that is uses a 1-based norm.
I wrote a simple test program for an extremely simple case; [1,1; 1,0]
For this input matlab and octave gives me 0.25 using rcond and 1/cond(x,1), but in the case using LAPACK, this sample program prints 0.0. For other cases, such as identity, it prints the correct value.
Since MATLAB is supposely actually using this routine with success, what am I doing wrong?
I'm trying to decipher what Octave does, with little success as its wrapped in
#include <stdio.h>
extern void dgecon_(const char *norm, const int *n, const double *a,
const int *lda, const double *anorm, double *rcond, double *work,
int *iwork, int *info, int len_norm);
int main()
{
int i, info, n, lda;
double anorm, rcond;
double w[8] = { 0,0,0,0,0,0,0,0 };
int iw[2] = { 0,0 };
double x[4] = { 1, 1, 1, 0 };
anorm = 2.0; /* maximum column sum, computed manually */
n = 2;
lda = 2;
dgecon_("1", &n, x, &lda, &anorm, &rcond, w, iw, &info, 1);
if (info != 0) fprintf(stderr, "failure with error %d\n", info);
printf("%.5e\n", rcond);
return 0;
}
Compiled with cc testdgecon.c -o testdgecon -llapack; ./testdgecon
I found the answer to me own question.
The matrix is must be LU-decomposed before it is sent to dgecon. This seems very logical since one often wants to solve the system after checking the condition, in which case there is no need to decompose the matrix twice. The same idea goes for the norm which is computed separately.
The following code is all the necessary parts the compute the reciprocal condition number with LAPACK.
#include "stdio.h"
extern int dgecon_(const char *norm, const int *n, double *a, const int *lda, const double *anorm, double *rcond, double *work, int *iwork, int *info, int len);
extern int dgetrf_(const int *m, const int *n, double *a, const int *lda, int *lpiv, int *info);
extern double dlange_(const char *norm, const int *m, const int *n, const double *a, const int *lda, double *work, const int norm_len);
int main()
{
int i, info, n, lda;
double anorm, rcond;
int iw[2];
double w[8];
double x[4] = {7,3,-9,2 };
n = 2;
lda = 2;
/* Computes the norm of x */
anorm = dlange_("1", &n, &n, x, &lda, w, 1);
/* Modifies x in place with a LU decomposition */
dgetrf_(&n, &n, x, &lda, iw, &info);
if (info != 0) fprintf(stderr, "failure with error %d\n", info);
/* Computes the reciprocal norm */
dgecon_("1", &n, x, &lda, &anorm, &rcond, w, iw, &info, 1);
if (info != 0) fprintf(stderr, "failure with error %d\n", info);
printf("%.5e\n", rcond);
return 0;
}

Resources