How do I parallelize this triple loop in an efficient way? - c

I'm trying to parallelize a function which takes as input three arrays (x, y, and prb) and one scalar, and outputs three arrays (P1, Pt1, and Px).
The original C code is here (the outlier and E arguments are inconsequential):
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define max(A, B) ((A) > (B) ? (A) : (B))
#define min(A, B) ((A) < (B) ? (A) : (B))
void cpd_comp(
    double* x,
    double* y,
    double* prb,
    double* sigma2,
    double* outlier,
    double* P1,
    double* Pt1,
    double* Px,
    double* E,
    int N,
    int M,
    int D
    )
{
    int n, m, d;
    double ksig, diff, razn, outlier_tmp, sp;
    double *P, *temp_x;

    P      = (double*) calloc(M, sizeof(double));
    temp_x = (double*) calloc(D, sizeof(double));

    ksig = -2.0 * *sigma2;

    for (n = 0; n < N; n++) {
        sp = 0;
        for (m = 0; m < M; m++) {
            razn = 0;
            for (d = 0; d < D; d++) {
                diff = *(x + n + d*N) - *(y + m + d*M);
                diff = diff * diff;
                razn += diff;
            }
            *(P + m) = exp(razn / ksig);
            sp += *(P + m);
        }
        *(Pt1 + n) = *(prb + n);
        for (d = 0; d < D; d++) {
            *(temp_x + d) = *(x + n + d*N) / sp;
        }
        for (m = 0; m < M; m++) {
            *(P1 + m) += ((*(P + m) / sp) * *(prb + n));
            for (d = 0; d < D; d++) {
                *(Px + m + d*M) += (*(temp_x + d) * *(P + m) * *(prb + n));
            }
        }
        *E += -log(sp);
    }
    *E += D * N * log(*sigma2) / 2;
    free((void*)P);
    free((void*)temp_x);
    return;
}
Here is my attempt at parallelizing it:
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/device_ptr.h>
#include <thrust/reduce.h>
/*headers*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Square of sigma
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in y
);
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M);
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M);
/*implementations*/
void cpd_comp(
float * x, //Points to register [N*D]
float * y, //Points to be registered [M*D]
float * prb, //Vector of probabilities [N]
float * sigma2, //Scalar
float ** P1, //P1, output, [M]
float ** Pt1, //Pt1, output, [N]
float ** Px, //Px, output, [M*3]
int N, //Number of points, i.e. rows, in x
int M //Number of points, i.e. rows, in y
){
//X is generatedPointPos
//Y is points
float
*P,
*P1timessp,
*Pxtimessp,
ksig = -2.0 * (*sigma2),
*h_sumofP = new float[N], //sum of P, on host
*d_sumofP; //sum of P, on device
cudaMalloc((void**)&P, sizeof(float)*M*N);
cudaMalloc((void**)&P1timessp,sizeof(float)*M*N);
cudaMalloc((void**)&Pxtimessp,sizeof(float)*M*N*3);
cudaMalloc((void**)&d_sumofP, sizeof(float)*N);
cudaMalloc((void**)P1, sizeof(float)*M);
cudaMalloc((void**)Px, sizeof(float)*M*3);
cudaMalloc((void**)Pt1, sizeof(float)*N);
d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M);
for(int n=0; n<N; n++){
thrust::device_ptr<float>dev_ptr(P);
h_sumofP[n] = thrust::reduce(dev_ptr+M*n,dev_ptr+M*(n+1),0.0f,thrust::plus<float>());
}
cudaMemcpy(d_sumofP,h_sumofP,sizeof(float)*N,cudaMemcpyHostToDevice);
d_sumP<<<M/1024+1,M>1024?1024:M>>>(d_sumofP,P1timessp,Pxtimessp,*P1,*Px,N,M);
cudaMemcpy(*Pt1,prb,sizeof(float)*N,cudaMemcpyDeviceToDevice);
cudaFree(P);
cudaFree(P1timessp);
cudaFree(Pxtimessp);
cudaFree(d_sumofP);
delete[]h_sumofP;
}
/*kernels*/
__global__ void d_computeP(
float * P,
float * P1,
float * Px,
float * ProbabilityMatrix,
float * x,
float * y,
float * prb,
float ksig,
const int N,
const int M){
//thread configuration: <<<dim3(N,M/1024+1),1024>>>
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
float
x1 = x[3*n],
x2 = x[3*n+1],
x3 = x[3*n+2],
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
razn = diff1*diff1+diff2*diff2+diff3*diff3,
Pm = __expf(razn/ksig), //fast exponentiation
prbn = prb[n];
P[M*n+m] = Pm;
__syncthreads();
P1[N*m+n] = Pm*prbn;
Px[3*(N*m+n)+0] = x1*Pm*prbn;
Px[3*(N*m+n)+1] = x2*Pm*prbn;
Px[3*(N*m+n)+2] = x3*Pm*prbn;
}
__global__ void d_sumP(
float * sp,
float * P1timessp,
float * Pxtimessp,
float * P1,
float * Px,
const int N,
const int M){
//computes P1 and Px
//thread configuration: <<<M/1024+1,1024>>>
int m = threadIdx.x+blockIdx.x*blockDim.x;
if(m>=M) return;
float
P1m = 0,
Pxm1 = 0,
Pxm2 = 0,
Pxm3 = 0;
for(int n=0; n<N; n++){
float spn = 1/sp[n];
P1m += P1timessp[N*m+n]*spn;
Pxm1 += Pxtimessp[3*(N*m+n)+0]*spn;
Pxm2 += Pxtimessp[3*(N*m+n)+1]*spn;
Pxm3 += Pxtimessp[3*(N*m+n)+2]*spn;
}
P1[m] = P1m;
Px[3*m+0] = Pxm1;
Px[3*m+1] = Pxm2;
Px[3*m+2] = Pxm3;
}
However, to my horror, it runs much, much slower than the original version. How do I make it run faster? Please explain things thoroughly since I am very new to CUDA and parallel programming and have no experience in algorithms.
Do note that the C version has column-major ordering and the CUDA version has row-major. I have done several tests to make sure that the result is correct. It's just extremely slow and takes up a LOT of memory.
Any help is greatly appreciated!
EDIT: More information: N and M are on the order of a few thousand (say, 300-3000) and D is always 3. The CUDA version expects arrays to be device memory, except for variables prefixed with h_.

Before trying any CUDA-specific optimizations, profile your code to see where time is being spent.
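For example, one way to time an individual kernel from the host is with CUDA events; this is only a sketch wrapped around the d_computeP launch from the question's host code, and nvprof or Nsight will give the same information with less effort:
// Timing sketch (drops into the existing host function; printf needs <stdio.h>).
cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
d_computeP<<<dim3(N,M/1024+1),M>1024?1024:M>>>(P,P1timessp,Pxtimessp,NULL,x,y,prb,ksig,N,M); // launch under test
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);               // wait for the kernel to finish
cudaEventElapsedTime(&ms, start, stop);   // elapsed time in milliseconds
printf("d_computeP took %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);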
Try to arrange your array reads/writes so that each CUDA thread uses a strided access pattern, i.e. so that consecutive threads in a warp read consecutive memory addresses and the loads coalesce. For example, currently you have
int m = threadIdx.x+blockIdx.y*blockDim.x;
int n = blockIdx.x;
if(m>=M || n>=N) return;
diff1 = x1 - y[3*m],
diff2 = x2 - y[3*m+1],
diff3 = x3 - y[3*m+2],
So thread 1 will read from y[0],y[1],y[2] etc. Instead, rearrange your data so that thread 1 reads from y[0],y[M],y[2*M] and thread 2 reads from y[1],y[M+1],y[2*M+1] etc. You should follow this access pattern for other arrays.
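For illustration, here is a sketch of what the kernel's loads would look like with that layout, i.e. with x stored so that component d of point n is at x[n + d*N] and y likewise at y[m + d*M] (this is the column-major layout the C version already uses; the rest of the code would have to be adjusted to match, which is not shown):
// Sketch: assumes x is laid out as x[n + d*N] and y as y[m + d*M] on the
// device. For a fixed component d, threads with consecutive m then read
// consecutive addresses of y, so the loads coalesce into few transactions.
float x1 = x[n],      x2 = x[n + N],      x3 = x[n + 2*N];
float diff1 = x1 - y[m],
      diff2 = x2 - y[m + M],
      diff3 = x3 - y[m + 2*M];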
Also, you may want to consider whether you can avoid the use of __syncthreads(). I don't quite follow why it's necessary in this algorithm; it might be worth removing it to see if it improves performance (even if it produces incorrect results).

The key to good CUDA performance is almost always to make as near to optimal memory access as possible. Your memory access pattern looks very similar to matrix multiplication. I would start with a good CUDA matrix multiplication implementation, being sure to understand why it's implemented the way it is, and then modify that to suit your needs.
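For illustration, here is a rough sketch of that idea applied to d_computeP: one thread per point n, with the block cooperatively staging tiles of y in shared memory the way a tiled matrix multiply stages its tiles. The kernel name, launch configuration, and the decision to keep the question's row-major layouts are assumptions made for the sketch; it only computes P, and the P1/Px accumulation is not shown.
// Sketch: one thread per point n; the block stages a tile of y in shared
// memory so each y element is read from global memory once per block.
__global__ void d_computeP_tiled(float *P, const float *x, const float *y,
                                 float ksig, int N, int M)
{
    extern __shared__ float ytile[];            // blockDim.x * 3 floats
    int n = blockIdx.x * blockDim.x + threadIdx.x;
    float x1 = 0.f, x2 = 0.f, x3 = 0.f;
    if (n < N) { x1 = x[3*n]; x2 = x[3*n+1]; x3 = x[3*n+2]; }

    for (int mBase = 0; mBase < M; mBase += blockDim.x) {
        int m = mBase + threadIdx.x;
        if (m < M) {                            // load one tile of y
            ytile[3*threadIdx.x + 0] = y[3*m + 0];
            ytile[3*threadIdx.x + 1] = y[3*m + 1];
            ytile[3*threadIdx.x + 2] = y[3*m + 2];
        }
        __syncthreads();
        int tile = blockDim.x;
        if (mBase + tile > M) tile = M - mBase;
        if (n < N) {
            for (int t = 0; t < tile; t++) {
                float d1 = x1 - ytile[3*t + 0];
                float d2 = x2 - ytile[3*t + 1];
                float d3 = x3 - ytile[3*t + 2];
                P[(size_t)n*M + mBase + t] = __expf((d1*d1 + d2*d2 + d3*d3) / ksig);
            }
        }
        __syncthreads();
    }
}
// Launch sketch: d_computeP_tiled<<<(N+255)/256, 256, 256*3*sizeof(float)>>>(P, x, y, ksig, N, M);
Note that the writes to P here are still strided across threads (thread n writes row n of P); transposing P, or assigning a 2D tile of threads as a real tiled matrix multiply does, would be the next step.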

Related

Aggregate many small arrays in fewer large arrays by basic function

I have many small 2D arrays (e.g. M x 32 x 40) and fewer larger 2D arrays (e.g. N x 200 x 300).
I would like to 'put' the smaller matrices at indices n,i,j in the larger arrays (upper left index of the array at batch index n). These small arrays could overlap and should be aggregated by functions that are associative and commutative say plus, multiply, etc.
I figure this is a pretty basic scenario that many people should have come across, right? Is there a cuda implementation that supports this in an efficient way?
Typical values M = 10^6, N = 10^4
This is a reduction operation.
In addition to what is expressed in the comments, I'll make the assumption that the distribution of the M matrices, in terms of which of the N matrices they belong to, is relatively uniform, i.e. evenly distributed. For the dimensions given, this means there will be approximately 100 of the M matrices intended to update N matrix 0, 100 for N matrix 1, and so on. Furthermore, if we inspect the n array, we would observe a uniformly random pattern of indices (i.e. no clumping or grouping).
Given that, in what may be a first for me, I'll suggest a lock/critical section algorithm, using the plumbing from here. Each threadblock will take one of the M arrays, and attempt to acquire a lock so that it can update the appropriate N array. When finished, release the lock.
I considered other approaches as well, some of which are evident in the code. In any event, for the stated conditions, the lock based approach had a kernel runtime of about 40ms on my V100 GPU, which was the best I observed.
I would also note that the stated dimensions result in a data working set of ~8GB. Not that that is a problem, just be aware if running this code as-is on your laptop GPU.
Here's an example:
$ cat t34.cu
#include <iostream>
#include <cstdlib>
const int N = 10000;
const int M = 1000000;
const int Mx = 32;
const int My = 40;
const int Nx = 200;
const int Ny = 300;
const int nTPB = 256;
template <typename T>
__host__ __device__
T reduction_op(T &a, const T &b){ return a+b;}
template <typename T>
__global__ void k(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M){
for (int ii = 0; ii < num_M; ii++){
if (n[ii] == blockIdx.x) {
for (int jj = threadIdx.x; jj < Mx*My; jj += blockDim.x){
int y = jj/Mx;
int x = jj - (y*Mx);
N[blockIdx.x*Nx*Ny + i[ii] + (j[ii]+y)*Nx + x] = reduction_op(
N[blockIdx.x*Nx*Ny + i[ii] + (j[ii]+y)*Nx + x], M[ii*Mx*My + y*Mx + x]);}
}
__syncthreads();}
}
// assumes Ny is whole-number divisible by sl
template <typename T>
__global__ void ki(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M, const int sl){
extern __shared__ T s[];
for (int c = 0; c < Ny; c+=sl){ // process per chunk of N array
// load shared
for (int t = threadIdx.x; t < sl*Nx; t += blockDim.x) s[t] = N[blockIdx.x*Nx*Ny + c*Nx + t];
__syncthreads();
// process chunk stack
for (int ii = 0; ii < num_M; ii++){ // iterate through "stack"
if ((n[ii] == blockIdx.x) && (j[ii] < (c+sl)) && ((j[ii]+My) > c)) {
for (int jj = threadIdx.x; jj < sl*Mx; jj += blockDim.x){
int y = jj/Mx;
int x = jj - (y*Mx);
//y += c;
if ((y+c >= j[ii]) && (y+c < (j[ii]+My)))
s[y*Nx+x+i[ii]] = reduction_op(s[y*Nx+x+i[ii]], M[ii*Mx*My + (y+c-j[ii])*Mx + x]);}
}
__syncthreads();}
// save shared
for (int t = threadIdx.x; t < sl*Nx; t += blockDim.x) N[blockIdx.x*Nx*Ny + c*Nx + t] = s[t];
}
}
template <typename T>
__global__ void ka(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M){
int x = threadIdx.x;
for (int y = threadIdx.y; y < My; y += blockDim.y)
atomicAdd(N+n[blockIdx.x]*Nx*Ny+(j[blockIdx.x]+y)*Nx+i[blockIdx.x]+x, M[blockIdx.x*Mx*My+y*Mx+x]);
}
__device__ void acquire_semaphore(volatile int *lock){
while (atomicCAS((int *)lock, 0, 1) != 0);
}
__device__ void release_semaphore(volatile int *lock){
*lock = 0;
__threadfence();
}
template <typename T>
__global__ void kl(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M, int * __restrict__ locks){
if ((threadIdx.x == 0) && (threadIdx.y == 0))
acquire_semaphore(locks+n[blockIdx.x]);
__syncthreads();
//begin critical section
int x = threadIdx.x;
for (int y = threadIdx.y; y < My; y += blockDim.y){
N[n[blockIdx.x]*Nx*Ny + i[blockIdx.x] + (j[blockIdx.x]+y)*Nx + x] = reduction_op(
N[n[blockIdx.x]*Nx*Ny + i[blockIdx.x] + (j[blockIdx.x]+y)*Nx + x], M[blockIdx.x*Mx*My + y*Mx + x]);}
// end critical section
__threadfence(); // not strictly necessary for the lock, but to make any global updates in the critical section visible to other threads in the grid
__syncthreads();
if ((threadIdx.x == 0) && (threadIdx.y == 0))
release_semaphore(locks+n[blockIdx.x]);
}
typedef float mt;
int main(){
mt *d_M, *h_M, *d_N, *h_N, *r1, *r2;
int *d_n, *h_n, *d_i, *h_i, *d_j, *h_j;
h_M = new mt[M*Mx*My];
h_N = new mt[N*Nx*Ny];
r1 = new mt[N*Nx*Ny];
r2 = new mt[N*Nx*Ny];
h_n = new int[M];
h_i = new int[M];
h_j = new int[M];
cudaMalloc(&d_M, M*Mx*My*sizeof(mt));
cudaMalloc(&d_N, N*Nx*Ny*sizeof(mt));
cudaMalloc(&d_n, M*sizeof(int));
cudaMalloc(&d_i, M*sizeof(int));
cudaMalloc(&d_j, M*sizeof(int));
for (int i = 0; i < M; i++){
h_n[i] = rand()%N;
h_i[i] = rand()%(Nx - Mx);
h_j[i] = rand()%(Ny - My);}
for (int i = 0; i < N*Nx*Ny; i++) h_N[i] = (mt)(i%3);
for (int i = 0; i < M*Mx*My; i++) h_M[i] = (mt)((i%3)+1);
cudaMemcpy(d_M, h_M, M*Mx*My*sizeof(mt), cudaMemcpyHostToDevice);
cudaMemcpy(d_N, h_N, N*Nx*Ny*sizeof(mt), cudaMemcpyHostToDevice);
cudaMemcpy(d_n, h_n, M*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_i, h_i, M*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_j, h_j, M*sizeof(int), cudaMemcpyHostToDevice);
#ifdef USE_SINGLE_N
cudaMemset(d_n, 0, M*sizeof(int));
#endif
#if 0
const int sl = 40;
const int sb = sl * Nx * sizeof(mt);
ki<<<N, nTPB, sb>>>(d_M, d_N, d_n, d_i, d_j, M, sl);
cudaMemcpy(r2, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
#endif
dim3 block(Mx, 8);
#if 0
ka<<<M, block>>>(d_M, d_N, d_n, d_i, d_j, M);
cudaMemcpy(r2, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
#endif
int *d_locks;
cudaMalloc(&d_locks, N*sizeof(int));
cudaMemset(d_locks, 0, N*sizeof(int));
kl<<<M, block>>>(d_M, d_N, d_n, d_i, d_j, M, d_locks);
cudaMemcpy(r2, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
cudaMemcpy(d_N, h_N, N*Nx*Ny*sizeof(mt), cudaMemcpyHostToDevice);
k<<<N, nTPB>>>(d_M, d_N, d_n, d_i, d_j, M);
cudaMemcpy(r1, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < N*Nx*Ny; i++) if (r1[i] != r2[i]) {std::cout << "mismatch at: " << i << " was: " << r2[i] << " should be: " << r1[i] << std::endl; return 0;}
}
$ nvcc -o t34 t34.cu -O3 -lineinfo
$ nvprof ./t34
==17970== NVPROF is profiling process 17970, command: ./t34
==17970== Profiling application: ./t34
==17970== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 34.57% 3.09036s 2 1.54518s 1.54294s 1.54742s [CUDA memcpy DtoH]
33.18% 2.96615s 1 2.96615s 2.96615s 2.96615s void k<float>(float const *, float*, int const *, int const *, int const *, int)
31.81% 2.84401s 6 474.00ms 1.4255ms 1.27035s [CUDA memcpy HtoD]
0.45% 39.949ms 1 39.949ms 39.949ms 39.949ms void kl<float>(float const *, float*, int const *, int const *, int const *, int, int*)
0.00% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memset]
API calls: 96.13% 8.94558s 8 1.11820s 1.9203ms 4.51030s cudaMemcpy
3.60% 334.59ms 6 55.765ms 277.58us 330.37ms cudaMalloc
0.15% 13.752ms 8 1.7190ms 1.3268ms 2.2025ms cuDeviceTotalMem
0.11% 10.472ms 808 12.959us 172ns 728.50us cuDeviceGetAttribute
0.01% 997.81us 8 124.73us 100.93us 176.73us cuDeviceGetName
0.00% 69.047us 2 34.523us 32.349us 36.698us cudaLaunchKernel
0.00% 68.013us 1 68.013us 68.013us 68.013us cudaMemset
0.00% 46.172us 8 5.7710us 1.8940us 23.025us cuDeviceGetPCIBusId
0.00% 8.5060us 16 531ns 260ns 1.5030us cuDeviceGet
0.00% 3.7870us 8 473ns 229ns 881ns cuDeviceGetUuid
0.00% 3.3980us 3 1.1320us 610ns 2.0780us cuDeviceGetCount
$
Extended discussion:
On performance:
This is a memory-bound algorithm. Therefore, we can estimate optimal kernel performance by determining the minimum number of memory reads and writes needed to perform the operation, then dividing by the available memory bandwidth, to get a lower bound on kernel duration. Unfortunately, the minimum number of reads and writes depends on the positioning of the M matrices, so it cannot easily be determined in general without inspecting the n, i, and j arrays.
However, we can estimate another way: observe that each M matrix element update will require reading two values and writing one value. If we use that as our estimate, we come up with M*Mx*My*3*sizeof(element_of_M)/GPU_memory_bandwidth. On my V100 (~700GB/s BW) this works out to a lower bound of about 20ms on kernel duration.
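Plugging in the float case from the example code: 10^6 * 32 * 40 elements * 3 accesses * 4 bytes is roughly 15.4 GB of traffic, and 15.4 GB / 700 GB/s is roughly 22 ms, which is where the ~20 ms figure comes from.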
On approaches considered:
"naive" approach, kernel k: Each threadblock will be responsible for one of the N matrices, and will iterate through the M matrices, inspecting n to determine if the M matrices will update the assigned N matrix. This gives a non-optimal run time of ~3s but seems to be mostly invariant performance-wise based on the distribution of n, and can use an "arbitrary" reduction op.
attempt at "optimal" approach, kernel ki: Each threadblock will be responsible for one of the N matrices, but will only load a chunk of that matrix at a time. It will then proceed through the M matrices updating that chunk, similar to the k kernel. This necessitates more loops through the matrices, but should "almost" only load or save each global memory item the minimum number of times necessary. Nevertheless, the run time is really long, ~40s.
atomic approach, kernel ka: Each threadblock will be responsible for one of the M matrices, and will atomically update the relevant N matrix. Simplicity. And the runtime is "fast" at ~40ms. (The atomic approach may be even faster than this for non-uniform n distributions. I witnessed kernel runtimes as low as 8ms!) However this is not readily generalizable to operations that don't have an atomic equivalent, such as multiply.
lock based approach, kernel kl: Like the atomic approach, each threadblock will be responsible for one of the M matrices, and will first acquire a lock on the relevant N matrix. The lock means that atomics are not necessary. For the uniformly distributed n case presented, it has about the same performance as the atomic case. It has the benefit that it can handle other reduction ops, such as multiply, readily. A disadvantage is that in the presence of non-uniformly-random distribution in n the performance can suffer, with a worst case in the ballpark of the naive kernel (3-5s).
Overall, if the requirement for an arbitrary reduction operator can be dropped (e.g. only addition is needed), then the atomic method may be best.

On entry to DGEEV parameter number 9 had an illegal value

I am trying for the first time to use LAPACK from C to diagonalize a matrix and I am stuck.
I have been trying to modify this example http://rcabreral.blogspot.co.uk/2010/05/eigenvalues-clapack.html from zgeev to dgeev. I have looked at the DGEEV input parameters, http://www.netlib.org/lapack/explore-html/d9/d28/dgeev_8f.html, but it seems I don't understand them well enough.
Hence, the code below produces:
** On entry to DGEEV parameter number 9 had an illegal value
EDIT: The error occurs in the call of dgeev spanning lines 48 to (including) 53.
EDIT: Note that the arguments differ from the specifications here
http://www.netlib.org/lapack/explore-html/d9/d28/dgeev_8f.html
in that they have been translated to pointers. That is necessary when using these Fortran routines in C, as explained here:
http://www.physics.orst.edu/~rubin/nacphy/lapack/cprogp.html
#include <stdio.h>
#include <math.h>
#include <complex.h>
#include <stdlib.h>
//.........................................................................
void dgeTranspose( double *Transposed, double *M ,int n) {
int i,j;
for(i=0;i<n;i++)
for(j=0;j<n;j++)
Transposed[i+n*j] = M[i*n+j];
}
//.........................................................................
// MatrixComplexEigensystem: computes the eigenvectors and eigenValues of input matrix A
// The eigenvectors are stored in columns
//.........................................................................
void MatrixComplexEigensystem( double *eigenvectorsVR, double *eigenvaluesW, double *A, int N){
int i;
double *AT = (double *) malloc( N*N*sizeof(double ) );
dgeTranspose( AT, A , N);
char JOBVL ='N'; // Compute Right eigenvectors
char JOBVR ='V'; // Do not compute Left eigenvectors
double VL[1];
int LDVL = 1;
int LDVR = N;
int LWORK = 4*N;
double *WORK = (double *)malloc( LWORK*sizeof(double));
double *RWORK = (double *)malloc( 2*N*sizeof(double));
int INFO;
dgeev_( &JOBVL, &JOBVR, &N, AT , &N , eigenvaluesW ,
VL, &LDVL,
eigenvectorsVR, &LDVR,
WORK,
&LWORK, RWORK, &INFO );
dgeTranspose( AT, eigenvectorsVR , N);
for(i=0;i<N*N;i++) eigenvectorsVR[i]=AT[i];
free(WORK);
free(RWORK);
free(AT);
}
int main(){
int i,j;
const int N = 3;
double A[] = { 1.+I , 2. , 3 , 4. , 5.+I , 6. , 7., 8., 9. + I};
double eigenVectors[N*N];
double eigenValues[N];
MatrixComplexEigensystem( eigenVectors, eigenValues, A, N);
printf("\nEigenvectors\n");
for(i=0;i<N;i++){
for(j=0;j<N;j++) printf("%e", eigenVectors[i*N + j]);
printf("\n");
}
printf("\nEigenvalues \n");
for(i=0;i<N;i++) printf("%e", eigenValues[i] );
printf("\n------------------------------------------------------------\n");
return 0;
}
You cannot port directly from zgeev to dgeev. zgeev takes a complex matrix and computes complex eigenvalues, while dgeev takes a real matrix and still computes (possibly complex) eigenvalues. To be consistent, LAPACK uses WR and WI for the real and imaginary parts of each eigenvalue.
So note that dgeev definition is
void dgeev_(char* JOBVL, char* JOBVR, int* N, double* A, int* LDA, double* WR, double* WI, double* VL, int* LDVL, double* VR, int* LDVR, double* WORK, int* LWORK, int* INFO);
My suggestion for your example is to remove:
#include <complex.h>
remove I's from matrix of doubles:
double A[] = { 1. , 2. , 3 , 4. , 5. , 6. , 7., 8., 9.};
then double the size of the eigenvalues vector:
double eigenValues[2*N];
and call dgeev using WR and WI:
double *eigenvaluesWR = eigenvaluesW;
double *eigenvaluesWI = eigenvaluesW+N;
dgeev_(&JOBVL, &JOBVR, &N, AT, &N,
eigenvaluesWR, eigenvaluesWI,
VL, &LDVL,
eigenvectorsVR, &LDVR,
WORK, &LWORK, &INFO);
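With eigenValues sized 2*N as above, the first N entries hold the real parts and the next N the imaginary parts, so the printing loop in main could then report both, for example (a sketch using the names from the modified code):
/* Sketch: eigenValues[0..N) = real parts, eigenValues[N..2N) = imaginary parts. */
for (i = 0; i < N; i++)
    printf("%e %+e i\n", eigenValues[i], eigenValues[N + i]);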

Returning an array of structs from a function - C programming

So I'm trying to write a function that will return an array of several values. At the moment, it is running correctly but only outputting the final calculated value. How would I make it so the output includes all calculated values?
My code looks like this:
//Practice to output an array of structs
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
struct boat_params {
double V, Uc, Vc;
};
struct boat_params submerged_volume(double L1, double L2, double Lavg, double H) {
struct boat_params volume;
double V_sub, Uc_sub, Vc_sub;
V_sub = 0;
//Boat description
double C, delta;
double theta, theta_rad, theta_min, theta_min2, theta_lim, theta_lim2, theta_lim_deg;
double Ug1, Ug2, Vg1, Vg2, V1, V2;
double pi;
pi = 4*atan(1);
C = sqrt(L1*L1 + L2*L2);
delta = acos(L1/C);
theta_lim = asin(H/L1);
theta_lim_deg = (theta_lim/pi) * 180.0;
theta_min = asin(H/C) - delta;
theta_min2 = 0;
//Calculating the submerged volume and centre of gravity for each different angle
for (theta = 0; theta <= 10; theta ++) {
//**Note: I've taken out the actual calculations of V_sub, Uc_sub, and Vc_sub for brevity**
volume.V = V_sub;
volume.Uc = Uc_sub;
volume.Vc = Vc_sub;
}
return volume;
}
int main () {
double L1, L2, Lavg, H;
struct boat_params volume;
L1 = 17.6;
L2 = 3;
Lavg = 4;
H = 4.5;
volume = submerged_volume(L1, L2, Lavg, H);
printf("V = %lf\nUc = %lf\nVc = %lf\n", volume.V, volume.Uc, volume.Vc);
return 0;
}
I can get it to correctly output the last calculated value (for theta = 10), but that's the only value I'm getting. How would I calculate V_sub, Uc_sub, and Vc_sub for each theta value and output each one? I'm assuming this means turning the struct into an array and filling each element of the array with the values of the struct for that theta, but I don't know how to do this!
I really appreciate any help and thank you in advance.
Also: If possible I'd like to avoid pointers but understand this may not be possible! I'm still very new and not good at using them!
You are quite right, you will need to have an array for that. If the number of elements in the array is constant, you could also create a struct that contains exactly that number of elements, but please don't do that.
To operate on arrays you will - unfortunately - need pointers. A very common way to do this in C is not to return a pointer, but to pass a 'result' pointer in. This means that it is up to the caller of the function to allocate the space and free it, and the caller can also use array syntax. In your code the number of values seems to be constant, which makes this solution possible. Alternatively you could allocate space on the heap (using malloc) and return a pointer, but that means the caller needs to free memory they never allocated, which is counter-intuitive and might result in memory leaks if they forget to do so. Consider the following solution:
void submerged_volume(double L1, double L2, double Lavg, double H, struct boat_params *result) {
// your calculations here
for (theta = 0; theta <= 10; theta ++) {
(result+theta)->V = V_sub;
(result+theta)->Uc = Uc_sub;
(result+theta)->Vc = Vc_sub;
}
}
// somewhere in your code where you want to use your function
struct boat_params values[11];
unsigned char i = 0;
submerged_volume(/* parameters */, values);
for (; i <= 10; ++i) {
printf("V = %lf\nUc = %lf\nVc = %lf\n", values[i].V, values[i].Uc, values[i].Vc);
}
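The heap-allocating alternative mentioned above would look roughly like this (a sketch; the 11 comes from theta running 0..10, the function name is illustrative, and the caller must free the result):
struct boat_params *submerged_volume_alloc(double L1, double L2, double Lavg, double H) {
    /* Sketch of the malloc-and-return variant: the caller must free() the result. */
    struct boat_params *result = malloc(11 * sizeof *result);
    if (result == NULL)
        return NULL;
    // your calculations here, filling result[theta] for theta = 0, ..., 10
    return result;
}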
Try this, just add your logic to the loop and maths:
#include <stdio.h>
#include <stdlib.h>
#define ARRSIZE 100
typedef struct boat_params {
double V, Uc, Vc;
} Volume;
void submerged_volume(double L1, double L2, double Lavg, double H, Volume *volumes[]) {
double theta;
int i = 0; /* only example, change as needed */
Volume *p;
for (theta = 0; theta <= 10; theta ++) {
p = malloc(sizeof(* p));
if (p == NULL) {
printf("malloc failed to allocate a new space");
exit(0);
}
p->V = 1; //V_sub;
p->Uc = 2; //Uc_sub;
p->Vc = 3; //Vc_sub;
volumes[i] = p;
i++;
}
}
int main () {
double L1, L2, Lavg, H;
L1 = 17.6;
L2 = 3;
Lavg = 4;
H = 4.5;
Volume *volumes[ARRSIZE];
submerged_volume(L1, L2, Lavg, H, volumes);
printf("V = %lf\nUc = %lf\nVc = %lf\n", volumes[0]->V, volumes[0]->Uc, volumes[0]->Vc); /* first element for example */
return 0;
}
If you don't know the size of the volumes array in advance, you should consider using a linked list.
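A minimal sketch of that approach (the node type and push_boat are illustrative names, not part of the code above):
struct boat_node {
    struct boat_params value;
    struct boat_node *next;
};

/* Prepend one result; returns the new head, or the old head if malloc fails. */
struct boat_node *push_boat(struct boat_node *head, struct boat_params v) {
    struct boat_node *node = malloc(sizeof *node);
    if (node == NULL)
        return head;
    node->value = v;
    node->next = head;
    return node;
}
Inside the loop you would then call head = push_boat(head, volume); and walk the list (freeing each node) when you are done.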

How to write the mexFunction of this c file

The function is cyclic.c.
void cyclic(float a[], float b[], float c[], float alpha, float beta,
            float r[], float x[], unsigned long n)
// Solves for a vector x[1..n] the "cyclic" set of linear equations. a, b, c,
// and r are input vectors, all dimensioned as [1..n], while alpha and beta
// are the corner entries in the matrix.
I am new to the interface between MATLAB and C, and I have not used C for several years.
Last night I finished it and compiled it. The last thing is to call it.
#include "mex.h"
#include "nrutil.h"
#define FREE_ARG char*
#define NR_END 1
#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#define NR_END 1
#define FREE_ARG char*
void nrerror(char error_text[])
/* Numerical Recipes standard error handler */
{fprintf(stderr,"Numerical Recipes run-time error...\n");
fprintf(stderr,"%s\n",error_text);
fprintf(stderr,"...now exiting to system...\n");
exit(1);
}
float *vector(long nl, long nh)
/* allocate a float vector with subscript range v[nl..nh] */
{
float *v;
v=(float *)malloc((size_t) ((nh-nl+1+NR_END)*sizeof(float)));
if (!v) nrerror("allocation failure in vector()");
return v-nl+NR_END;
}
void free_vector(float *v, long nl, long nh)
/* free a float vector allocated with vector() */
{
free((FREE_ARG) (v+nl-NR_END));
}
void tridag(float a[], float b[], float c[], float r[], float u[],
unsigned long n)
{
unsigned long j;
float bet,*gam;
gam=vector(1,n);
if (b[1] == 0.0) nrerror("Error 1 in tridag");
u[1]=r[1]/(bet=b[1]);
for (j=2;j<=n;j++) {
gam[j]=c[j-1]/bet;
bet=b[j]-a[j]*gam[j];
if (bet == 0.0) nrerror("Error 2 in tridag");
u[j]=(r[j]-a[j]*u[j-1])/bet;
}
for (j=(n-1);j>=1;j--)
u[j] -= gam[j+1]*u[j+1];
free_vector(gam,1,n);
}
void cyclic(float a[], float b[], float c[], float alpha, float beta,
float r[], float x[], unsigned long n)
{
void tridag(float a[], float b[], float c[], float r[], float u[],
unsigned long n);
unsigned long i;
float fact,gamma,*bb,*u,*z;
if (n <= 2) nrerror("n too small in cyclic");
bb=vector(1,n);
u=vector(1,n);
z=vector(1,n);
gamma = -b[1]; //Avoid subtraction error in forming bb[1].
bb[1]=b[1]-gamma; //Set up the diagonal of the modified tridiagonal
bb[n]=b[n]-alpha*beta/gamma; //system.
for (i=2;i<n;i++) bb[i]=b[i];
tridag(a,bb,c,r,x,n);// Solve A · x = r.
u[1]=gamma;// Set up the vector u.
u[n]=alpha;
for (i=2;i<n;i++) u[i]=0.0;
tridag(a,bb,c,u,z,n);// Solve A · z = u.
fact=(x[1]+beta*x[n]/gamma)/ //Form v · x/(1 + v · z).
(1.0+z[1]+beta*z[n]/gamma);
for (i=1;i<=n;i++) x[i] -= fact*z[i]; //Now get the solution vector x.
free_vector(z,1,n);
free_vector(u,1,n);
free_vector(bb,1,n);
}
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
float *a,*b,*c,*x,*r;
float alpha,beta;
unsigned long n = (unsigned long) mxGetScalar(prhs[6]);
// a=mxGetPr(prhs[0]);
// b=mxGetPr(prhs[1]);
// c=mxGetPr(prhs[2]);
// r=mxGetPr(prhs[5]);
a = (float*) mxGetData(prhs[0]);
b = (float*) mxGetData(prhs[1]);
c = (float*) mxGetData(prhs[2]);
r = (float*) mxGetData(prhs[5]);
// alpha=*(mxGetPr(prhs[3]));
// beta=*(mxGetPr(prhs[4]));
alpha = (float) mxGetScalar(prhs[3]);
beta = (float) mxGetScalar(prhs[4]);
plhs[0]= mxCreateDoubleMatrix(n, 1, mxREAL);
x = mxGetPr(plhs[0]);
mexPrintf("%f ",alpha);
mexPrintf("\n");
mexPrintf("%f ",beta);
mexPrintf("\n");
mexPrintf("%d ",n);
mexPrintf("\n");
cyclic(a,b,c, alpha, beta,r,x,n) ;
mexPrintf("%d ",n);
mexPrintf("\n");
}
Finally, I successfully compiled it and called cyclic(a, b, c, alpha, beta, r, x, n);. But the answer is not right. I think this is because r is an imaginary vector. So my question is: how should I transform r between C and MATLAB?
The C function cyclic expects arrays of floats, but mexFunction is passing a double*. Without changing cyclic.c, you have two options:
Convert the data to single in MATLAB and get a float* with mxGetData.
In mexFunction:
float *a = (float*) mxGetData(prhs[0]);
In MATLAB:
mexFunction(single(a),...)
Convert (copy, not cast!) the data in mexFunction.
In mexFunction, allocate new float arrays, and copy each element from the double input array (mxGetPr(prhs[0])) into the temporary float array.
Call mexFunction with a normal double array in MATLAB.
It's probably easier to do the former.
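If you do choose the second option, the copy could look roughly like this inside mexFunction (a sketch for the a input only; error checking and the matching copy-back for any output are omitted):
/* Sketch of option 2: copy the double input into a temporary float buffer. */
mwSize len = mxGetNumberOfElements(prhs[0]);
double *a_in = mxGetPr(prhs[0]);
float *a = (float*) mxMalloc(len * sizeof(float));
mwSize k;
for (k = 0; k < len; k++)
    a[k] = (float) a_in[k];
/* ... pass a to cyclic(), then mxFree(a) when finished ... */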
Under no circumstances should you simply cast the pointer, not that you were planning to do that.
Also, the scalars alpha, beta and n need to be read from prhs as scalars and passed to cyclic as scalars. In mexFunction, use:
float alpha = (float) mxGetScalar(prhs[...]);
float beta = (float) mxGetScalar(prhs[...]);
unsigned long n = (unsigned long) mxGetScalar(prhs[...]);
You've entirely forgotten c and r in mexFunction.

Model using Euler method and pointer arithmetic not functioning

I'm new to C, and quite unfamiliar with writing any program larger than a few lines.
I'm trying to write a model for an object in freefall acted upon by gravity and drag. It uses Eulers method to solve two first order differential equations, one for position and one for velocity.
So we have: F = m dv/dt = -mg - k|v|v and dy/dt = v
These are solved by: Vn+1 = Vn - (delta t*(g+(k/m)|Vn|Vn)) and Yn+1 = Yn + (delta t * Vn)
(In this Vn+1 is the n+1th term etc.)
In my program I've tried to have two functions, one for position and one for velocity, which work by passing pointers to the Y and V values between them and the main function; it should then loop until Y=0 and print the values at each step.
When I run it it comes up with something like this: http://imgur.com/DNHIhHI
Could anyone tell me either what is wrong with this, or if I need to use a different approach completely?
Many Thanks, Code below
#include <stdio.h>
void Velocity(double *ptr, double m, double k, double t);
void Position(double *pst, double *ptr, double t );
int main()
{
double k = 18833.5608;
double t = 0;
double m;
double speed = 0;
double *ptr = &speed;
double y = 1000;
double *pst = &y;
printf("Enter mass of object: \n");
scanf("%f" , &m);
do
{
Velocity( ptr, m, k, t );
printf("Velocity at time %f is: %f\n" , t, speed);
Position( pst, ptr, t);
printf("Position at time %f is: %f\n" , t , y);
t++;
}
while((y>0));
return 0;
}
void Velocity(double *velo, double m, double k, double t)
{
double g = 9.80665;
*velo = *velo - (t*(g+((k/m)*fabs(*velo)**(velo))));
}
void Position(double *Y , double *velo, double t )
{
*Y = *Y+(t*(*velo));
}
When writing programs that do calculations -- in any language, not just C -- try to make the code that does the computation take arguments and return results but not mutate variables. That is, do not write:
void do_calculation( double * result, double x, double y)
{
*result = x + y;
}
...
double r;
do_calculation(&r, 123, 456);
instead write
double do_calculation(double x, double y)
{
return x + y;
}
...
double r = do_calculation(123, 456);
Make sense?
If you want to modify an existing value, again, don't pass it in as a variable to be mutated. Instead of
void do_calculation(double * accumulator, double x, double y)
{
*accumulator = *accumulator + x + y;
}
...
double r = 10;
do_calculation(&r, 123, 456);
instead say
double do_calculation(double original, double x, double y)
{
return original + x + y;
}
...
double r = 10;
r = do_calculation(r, 123, 456);
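Applied to your problem, the two update steps could then be written as pure functions of the previous state. This is only a sketch, using the question's variable names and assuming a small time step dt rather than the whole elapsed time t:
#include <math.h>

/* One Euler step for the velocity: v_next = v - dt*(g + (k/m)*|v|*v) */
double next_velocity(double v, double m, double k, double dt) {
    const double g = 9.80665;
    return v - dt * (g + (k / m) * fabs(v) * v);
}

/* One Euler step for the position: y_next = y + dt*v */
double next_position(double y, double v, double dt) {
    return y + dt * v;
}
The main loop then reads naturally: speed = next_velocity(speed, m, k, dt); y = next_position(y, speed, dt); t += dt;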
Now, once you've got your program architected more sensibly, you need to learn how to debug small programs. Some good advice on that subject can be found here:
http://ericlippert.com/2014/03/05/how-to-debug-small-programs/
A misconception: I believe you're trying to solve the equations by using small increments of time. Nothing wrong with that; just make the time increment small, and correct the formulas:
#include <stdio.h>
#include <math.h>
void Velocity(double *velocity, double m, double k, double t)
{
double g = 9.80665;
double velo = *(velocity);
velo = velo - (t*(g+((k/m)*fabs(velo)*(velo))));
*(velocity)=velo;
}
void Position(double *position , double *velocity, double t )
{
double Y = *(position);
double velo = *(velocity);
Y = Y+(t*(velo));
*(position)=Y;
}
int main()
{
double k = 18833.5608;
double t = 0;
double dt = 0.001; //making a small increment of time
double m=100;
double speed = 0;
double y = 1000;
//printf("Enter mass of object: \n");
//scanf("%f" , &m);
do
{
Velocity( &speed, m, k, dt );
printf("Velocity at time %f is: %f\n" , t, speed);
Position( &y, &speed, dt);
printf("Position at time %f is: %f\n" , t , y);
t+=dt; //increment time by delta t
}
while((y>0));
return 0;
}
