I'm attempting to implement block matrix multiplication and to parallelize it more effectively.
This is my code:
int i, j, jj, k, kk;
float sum;
int en = 4 * (2048/4);
#pragma omp parallel for collapse(2)
for (i = 0; i < 2048; i++) {
    for (j = 0; j < 2048; j++) {
        C[i][j] = 0;
    }
}
for (kk = 0; kk < en; kk += 4) {
    for (jj = 0; jj < en; jj += 4) {
        for (i = 0; i < 2048; i++) {
            for (j = jj; j < jj + 4; j++) {
                sum = C[i][j];
                for (k = kk; k < kk + 4; k++) {
                    sum += A[i][k] * B[k][j];
                }
                C[i][j] = sum;
            }
        }
    }
}
I've been playing around with OpenMP, but still have had no luck figuring out the best way to get this done in the least amount of time.
Getting good performance from matrix multiplication is a big job. Since "The best code is the code I don't have to write", a much better use of your time would be to understand how to use a BLAS library.
If you are using X86 processors, the Intel Math Kernel Library (MKL) is available free, and includes optimized, parallelized, matrix multiplication operations.
https://software.intel.com/en-us/articles/free-mkl
(FWIW, I work for Intel, but not on MKL :-))
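To see how little code that takes: a minimal sketch, assuming 2048x2048 row-major float matrices like the ones in the question (with MKL, include mkl.h; any other CBLAS implementation exposes the same cblas_sgemm through cblas.h):

#include <mkl.h>  // or <cblas.h> with a generic CBLAS implementation

// C = 1.0*A*B + 0.0*C for 2048x2048 row-major float matrices.
// If A, B, C are declared as float[2048][2048], pass &A[0][0] etc.
void matmul_blas(const float *A, const float *B, float *C)
{
    const int n = 2048;
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                n, n, n,     // M, N, K
                1.0f, A, n,  // alpha, A, lda
                B, n,        // B, ldb
                0.0f, C, n); // beta, C, ldc
}

MKL's sgemm is threaded internally, so no OpenMP pragmas are needed around the call.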
I recently started looking into dense matrix multiplication (GEMM) again. It turns out the Clang compiler is really good at optimizing GEMM without needing any intrinsics (GCC still needs intrinsics). The following code gets 60% of the peak FLOPS of my four core/eight hardware thread Skylake system. It uses block matrix multiplication.
Hyper-threading gives worse performance, so make sure you use only as many threads as you have physical cores, and bind the threads to prevent thread migration.
export OMP_PROC_BIND=true
export OMP_NUM_THREADS=4
Then compile like this
clang -Ofast -march=native -fopenmp -Wall gemm_so.c
The code
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <omp.h>
#include <x86intrin.h>

#define SM 80

typedef __attribute((aligned(64))) float * restrict fast_float;

static void reorder2(fast_float a, fast_float b, int n) {
    for(int i=0; i<SM; i++) memcpy(&b[i*SM], &a[i*n], sizeof(float)*SM);
}

static void kernel(fast_float a, fast_float b, fast_float c, int n) {
    for(int i=0; i<SM; i++) {
        for(int k=0; k<SM; k++) {
            for(int j=0; j<SM; j++) {
                c[i*n + j] += a[i*n + k]*b[k*SM + j];
            }
        }
    }
}

void gemm(fast_float a, fast_float b, fast_float c, int n) {
    int bk = n/SM;
    #pragma omp parallel
    {
        float *b2 = _mm_malloc(sizeof(float)*SM*SM, 64);
        // collapse only i and j: all k blocks for a given (i,j) must run on the
        // same thread, otherwise concurrent updates of the same C block race
        #pragma omp for collapse(2)
        for(int i=0; i<bk; i++) {
            for(int j=0; j<bk; j++) {
                for(int k=0; k<bk; k++) {
                    reorder2(&b[SM*(k*n + j)], b2, n);
                    kernel(&a[SM*(i*n+k)], b2, &c[SM*(i*n+j)], n);
                }
            }
        }
        _mm_free(b2);
    }
}
static int doublecmp(const void *x, const void *y) {
    return *(double*)x < *(double*)y ? -1 : *(double*)x > *(double*)y;
}

double median(double *x, int n) {
    qsort(x, n, sizeof(double), doublecmp);
    return 0.5*(x[n/2] + x[(n-1)/2]);
}

int main(void) {
    int cores = 4;
    double frequency = 3.1; // i7-6700HQ turbo 4 cores
    double peak = 32*cores*frequency;
    int n = SM*10*2;
    int mem = sizeof(float) * n * n;
    float *a = _mm_malloc(mem, 64);
    float *b = _mm_malloc(mem, 64);
    float *c = _mm_malloc(mem, 64);
    memset(a, 1, mem);
    memset(b, 1, mem);

    printf("%dx%d matrix\n", n, n);
    printf("memory of matrices: %.2f MB\n", 3.0*mem*1E-6);
    printf("peak SP GFLOPS %.2f\n", peak);
    puts("");

    while(1) {
        int r = 10;
        double times[r];
        for(int j=0; j<r; j++) {
            times[j] = -omp_get_wtime();
            gemm(a, b, c, n);
            times[j] += omp_get_wtime();
        }
        double flop = 2.0*1E-9*n*n*n; //GFLOP
        double time_mid = median(times, r);
        double flops_low = flop/times[r-1], flops_mid = flop/time_mid, flops_high = flop/times[0];
        printf("%.2f %.2f %.2f %.2f\n", 100*flops_low/peak, 100*flops_mid/peak, 100*flops_high/peak, flops_high);
    }
}
This does GEMM 10 times per iteration of an infinite loop and prints the low, median, and high ratios of FLOPS to peak FLOPS, and finally the high FLOPS value itself.
You will need to adjust the following lines
int cores = 4;
double frequency = 3.1; // i7-6700HQ turbo 4 cores
double peak = 32*cores*frequency;
to the number of physical cores, the frequency for all cores (with turbo, if enabled), and the number of single-precision floating point operations per cycle per core, which is 16 for Core2 through Ivy Bridge, 32 for Haswell through Kaby Lake, and 64 for the Xeon Phi Knights Landing. For example, 32 * 4 cores * 3.1 GHz = 396.8 peak SP GFLOPS, the value the code above computes.
This code may be less efficient on NUMA systems. It does not do nearly as well on Knights Landing (I just started looking into this).
I just learned GPU programming and now I have a task: find the minimum value of a 100x100 matrix in parallel with CUDA. I have tried this code, but it doesn't show the answer; instead it shows my initial value hmin = 9999999. Can anyone give me the right code? The code is in C.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define size (100*100)

//Kernel Functions & Variable
__global__ void FindMin(int* mat[100][100], int* kmin){
    int b = blockIdx.x + threadIdx.x*blockDim.x;
    int k = blockIdx.y + threadIdx.y*blockDim.y;
    if(mat[b][k] < kmin){
        kmin = mat[b][k];
    }
}

int main(int argc, char *argv[]) {
    //Declare Variables
    int i, j, hmaks=0, hmin=9999999, hsumin, hsumax;     //Host Variable
    int *da[100][100], *dmin, *dmaks, *dsumin, *dsumax;  //Device Variable
    FILE *baca;   //for opening txt file
    char buf[4];  //used for fscanf
    int ha[100][100], b;  //matrix shall be filled by "b"

    //1: Read txt File
    baca = fopen("MatrixTubes1.txt", "r");
    if (!baca){
        printf("Hey, it doesn't even exist");  //Checking file: is it there?
    }
    i = 0; j = 0;     //Matrix index initialization
    if(!feof(baca)){  //if not end of file then do
        for(i = 0; i < 100; i++){
            for(j = 0; j < 100; j++){
                fscanf(baca, "%s", buf);  //read max 4 char
                b = atoi(buf);            //parse from string to integer
                ha[i][j] = b;             //save it to my matrix
            }
        }
    }
    fclose(baca);
    //all of the file has been read
    //time to close the file

    //Session 2: Allocate data on the GPU
    cudaMalloc((void **)&da, size*sizeof(int));
    cudaMalloc((void **)&dmin, sizeof(int));
    cudaMalloc((void **)&dmaks, sizeof(int));
    cudaMalloc((void **)&dsumin, sizeof(int));
    cudaMalloc((void **)&dsumax, sizeof(int));

    //Session 3: Copy data to Device
    cudaMemcpy(da, &ha, size*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dmin, &hmin, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dmaks, &hmaks, sizeof(int), cudaMemcpyHostToDevice);

    //Session 4: Call Kernel
    FindMin<<<100,100,1>>>(da, dmin);

    //5: Copy from Device to Host
    cudaMemcpy(&hmin, dmin, sizeof(int), cudaMemcpyDeviceToHost);

    //6: Print that value
    printf("Minimum Value = %i \n", hmin);
    system("pause");
    return 0;
}
This is my result:
Minimum Value = 9999999
Press any key to continue . . .
I saw a few issues in your code.
As mentioned in the comments from MayurK, you got the indexing wrong.
Also as MayurK said, you are comparing two pointers and not the values they point to.
Your kernel invocation asks for a 100 x 100 x 1 grid, with each block containing just one thread. This is very bad in terms of efficiency, and because of it your b and k will only range from 0 to 99, as threadIdx.x will always be zero.
Finally, all threads run in parallel, resulting in a race condition on kmin = mat[b][k] (which should be *kmin, by the way). Once you fix the indexing problem, all threads in the same block will write to the same location in global memory at the same time. You should use atomicMin() or a parallel reduction to find the minimum value in parallel.
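For illustration, a minimal sketch of the atomicMin() route, assuming the matrix is flattened to one contiguous int array on the device (the flattening and launch geometry are my choices, not the question's; dmin should be initialized to a large value on the device, as in the original code):

#define N 100

// one thread per element; atomicMin serializes the concurrent
// updates of *kmin so no candidate minimum is lost
__global__ void FindMin(const int *mat, int *kmin)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N * N)
        atomicMin(kmin, mat[idx]);
}

// launch with enough threads to cover all 100*100 elements:
// FindMin<<<(N*N + 255) / 256, 256>>>(da, dmin);

A parallel reduction would scale better on large inputs, but atomicMin is the smallest change that yields a correct answer here.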
I have been trying to perform a QR decomposition of many small matrices in parallel with CUDA.
I therefore used the cublasDgeqrfBatched function in cuBLAS. I couldn't find a working example of this function, and I found some ambiguity in the documentation for calling it.
In fact, I tried to test cublasDgeqrfBatched on the example in the Householder reflections section on Wikipedia, since that is the same method used by cublasDgeqrfBatched. The two small input matrices are identical and are the following:
A =  12  -51    4
      6  167  -68
     -4   24  -41
According to the documentation, Aarray is an array of pointers to matrices with dimensions m x n, and TauArray is an array of pointers to vectors of dimension at least max(1, min(m, n)).
cublasDgeqrfBatched performs the QR factorization of each Aarray[i] for
i =0, ...,batchSize-1
Each matrix Q[i] is stored in the lower part of each Aarray[i]
I used the following code to call this function:
#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>
#include <cublas.h>
#include "cublas_v2.h"
#include "Utilities.cuh"
#include <helper_cuda.h>
/********/
/* MAIN */
/********/
int main(){

    //mxn: size of Aarray[i]
    const int m = 3;
    const int n = 3;

    double h_A[3*3*2] = {12, -51, 4, 6, 167, -68, -4, 24, -41,
                         12, -51, 4, 6, 167, -68, -4, 24, -41}; // two identical 3x3 matrices for the test
    const int batchSize = 2; //2 small matrices
    const int ltau = 3;      //ltau = max(1,min(m,n))

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasStatus_t stat;
    cublasSafeCall(cublasCreate(&cublas_handle));

    // --- CUDA batched QR initialization
    double *d_A, *d_TAU;
    checkCudaErrors(cudaMalloc((void**)&d_A, m*n*batchSize*sizeof(double)));
    checkCudaErrors(cudaMalloc((void**)&d_TAU, ltau*batchSize*sizeof(double)));
    checkCudaErrors(cudaMemcpy(d_A, h_A, m*n*batchSize*sizeof(double), cudaMemcpyHostToDevice));

    double *d_Aarray[batchSize], *d_TauArray[batchSize];
    for (int i = 0; i < batchSize; i++)
    {
        d_Aarray[i] = d_A + i*m*n;
        d_TauArray[i] = d_TAU + i*ltau;
    }

    int lda = 3;
    int info;

    stat = cublasDgeqrfBatched(cublas_handle, m, n, d_Aarray, lda, d_TauArray, &info, batchSize);
    if (stat != CUBLAS_STATUS_SUCCESS)
        printf("\n cublasDgeqrfBatched failed");

    double *A0, *A1;
    A0 = (double*)malloc(m*n*batchSize*sizeof(double));
    A1 = (double*)malloc(m*n*sizeof(double));
    checkCudaErrors(cudaMemcpy(A0, d_Aarray[0], m*n*sizeof(double), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(A1, d_Aarray[1], m*n*sizeof(double), cudaMemcpyDeviceToHost));
}
But, got an error "CUDA error batched_QR/kernel.cu:64 code=4(cudaErrorLaunchFailure) "cudaMemcpy(A0,d_Aarray[0],m*n*sizeof(double),cudaMemcpyDeviceToHost)"
I think there is an error in the use of pointers but I can't correct it. Where is the problem please?
Edit:
To make d_Aarray and d_TauArray device arrays, as talonmies proposed, I added the following:
double *d_A, *d_TAU;
checkCudaErrors(cudaMalloc((void**)&d_A, m*n*batchSize*sizeof(*d_A)));
checkCudaErrors(cudaMalloc((void**)&d_TAU, ltau*batchSize*sizeof(*d_TAU)));
checkCudaErrors(cudaMemcpy(d_A,h_A,m*n*batchSize*sizeof(double),cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(d_TAU, 0, ltau*batchSize* sizeof(*d_TAU)));
But I always get the same error when copying the result back to the host.
I think there is an error in the use of pointers
You are correct. The arrays of device pointers you are passing to cublasDgeqrfBatched are host arrays not device arrays:
double *d_Aarray[batchSize], *d_TauArray[batchSize];
for (int i = 0; i < batchSize; i++)
{
    d_Aarray[i] = d_A + i*m*n;
    d_TauArray[i] = d_TAU + i*ltau;
}
You must copy d_Aarray and d_TauArray to the device and pass the address of the device copies to cublasDgeqrfBatched for this to work correctly. Something like this:
double *d_Aarray[batchSize], *d_TauArray[batchSize];
for (int i = 0; i < batchSize; i++)
{
    d_Aarray[i] = d_A + i*m*n;
    d_TauArray[i] = d_TAU + i*ltau;
}

double **d_Aarray_, **d_TauArray_;
cudaMalloc((void **)&d_Aarray_, sizeof(d_Aarray));
cudaMalloc((void **)&d_TauArray_, sizeof(d_TauArray));
cudaMemcpy(d_Aarray_, d_Aarray, sizeof(d_Aarray), cudaMemcpyHostToDevice);
cudaMemcpy(d_TauArray_, d_TauArray, sizeof(d_TauArray), cudaMemcpyHostToDevice);

stat = cublasDgeqrfBatched(cublas_handle, m, n, d_Aarray_, lda, d_TauArray_, &info, batchSize);
[disclaimer: written in Browser]
Here d_Aarray_ and d_TauArray_ are device memory copies of d_Aarray and d_TauArray.
Thanks to @hubs: when calling cublasSgemv, one should notice that CUBLAS_OP_T also transposes the operand, so the row and column arguments must be swapped.
/* I have been learning CUDA and cuBLAS for a month, and I want to test the
   performance of cuBLAS for further use. But in my matrix-vector multiplication
   using cublasSgemv, the answer is wrong.
   I initialize matrix A and vector x in row-major order. I send them to the
   device using cudaMemcpy and call cublasSgemv; because A is row-major, I
   transpose it using the parameter CUBLAS_OP_T. */
// the row count is 50 and the col count is 10; A[i]=i, x[i]=1, and A is row-major.
// the answer I got was 45, 545, ..., 4545, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0
#include <iostream>
#include <cuda_runtime.h>
#include "cublas_v2.h"
using namespace std;

int GpuVec(const float* A, const float* x, float* y, const int row, const int col);

int main(){
    int row = 50;
    int col = 10;
    int N = row*col;
    float* A = new float[N];
    float* y_gpu = new float[50];
    for (int i = 0; i < N; i++)
    {
        A[i] = (float)i;
    }
    float* x = new float[10];
    for (int i = 0; i < 10; i++)
    {
        x[i] = 1;
    }
    GpuVec(A, x, y_gpu, row, col); //call the function
    for (int i = 0; i < 50; i++){
        cout << " " << y_gpu[i] << endl;
    }
    return 0;
}
int GpuVec(const float* A, const float* x, float* y, const int row, const int col){
    cudaError_t cudastat;
    cublasStatus_t stat;
    int size = row*col;
    cublasHandle_t handle;
    float* d_A; //device matrix
    float* d_x; //device vector
    float* d_y; //device result
    cudastat = cudaMalloc((void**)&d_A, size*sizeof(float));
    cudastat = cudaMalloc((void**)&d_x, col*sizeof(float));
    cudastat = cudaMalloc((void**)&d_y, row*sizeof(float)); // when I copy y to d_y, can I cout d_y?
    cudaMemcpy(d_A, A, sizeof(float)*size, cudaMemcpyHostToDevice); //copy A to device d_A
    cudaMemcpy(d_x, x, sizeof(float)*col, cudaMemcpyHostToDevice);  //copy x to device d_x
    float alf = 1.0;
    float beta = 0;
    stat = cublasCreate(&handle);
    stat = cublasSgemv(handle, CUBLAS_OP_T, col, row, &alf, d_A, col, d_x, 1, &beta, d_y, 1); //swap col and row
    cudaMemcpy(y, d_y, sizeof(float)*row, cudaMemcpyDeviceToHost); // copy device result to host
    cudaFree(d_A);
    cudaFree(d_x);
    cudaFree(d_y);
    cublasDestroy(handle);
    return 0;
}
To use two-dimensional arrays stored in row-major order with cublas (which works in column-major order), you can call gemv this way:
stat = cublasSgemv(handle, CUBLAS_OP_T, col, row, &alf, d_A, col, d_x, 1, &beta, d_y, 1);
You have to swap m (rows) and n (columns) in the call, too, to perform y = A * x, but it allows you to use the cublas call without transposing the original array. The row-major A looks to cublas like the col x row column-major matrix A^T, so requesting CUBLAS_OP_T makes it multiply by (A^T)^T = A.
I would like to be able to compute the inverse of a general NxN matrix in C/C++ using lapack.
My understanding is that the way to do an inversion in lapack is by using the dgetri function, however, I can't figure out what all of its arguments are supposed to be.
Here is the code I have:
void dgetri_(int* N, double* A, int* lda, int* IPIV, double* WORK, int* lwork, int* INFO);
int main(){
    double M[9] = {
        1, 2, 3,
        4, 5, 6,
        7, 8, 9
    };
    return 0;
}
How would you complete it to obtain the inverse of the 3x3 matrix M using dgetri_?
Here is the working code for computing the inverse of a matrix using lapack in C/C++:
#include <cstdio>

extern "C" {
    // LU decomposition of a general matrix
    void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO);

    // generate inverse of a matrix given its LU decomposition
    void dgetri_(int* N, double* A, int* lda, int* IPIV, double* WORK, int* lwork, int* INFO);
}

void inverse(double* A, int N)
{
    int *IPIV = new int[N];
    int LWORK = N*N;
    double *WORK = new double[LWORK];
    int INFO;

    dgetrf_(&N, &N, A, &N, IPIV, &INFO);
    dgetri_(&N, A, &N, IPIV, WORK, &LWORK, &INFO);

    delete[] IPIV;
    delete[] WORK;
}

int main(){
    double A[2*2] = {
        1, 2,
        3, 4
    };

    inverse(A, 2);

    printf("%f %f\n", A[0], A[1]);
    printf("%f %f\n", A[2], A[3]);

    return 0;
}
First, M has to be a two-dimensional array, like double M[3][3]. Your array is, mathematically speaking, a 1x9 vector, which is not invertible.
N is a pointer to an int for the order of the matrix; in this case, N=3.

A is a pointer to the LU factorization of the matrix, which you can get by running the LAPACK routine dgetrf.

LDA is an integer for the "leading dimension" of the matrix, which lets you pick out a subset of a bigger matrix if you want to invert just a little piece. If you want to invert the whole matrix, LDA should just be equal to N.

IPIV holds the pivot indices of the matrix; in other words, it's a list of instructions of which rows to swap in order to invert the matrix. IPIV should be generated by the LAPACK routine dgetrf.

LWORK and WORK are the "workspaces" used by LAPACK. If you are inverting the whole matrix, LWORK should be an int equal to N^2, and WORK should be a double array with LWORK elements.

INFO is just a status variable to tell you whether the operation completed successfully. Since not all matrices are invertible, I would recommend that you send this to some sort of error-checking system, as sketched below. INFO=0 for successful operation, INFO=-i if the i'th argument had an incorrect input value, and INFO>0 if the matrix is not invertible.
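A minimal sketch of such a check, usable right after each LAPACK call (the message wording is mine; the INFO conventions are the ones just described):

if (INFO < 0) {
    fprintf(stderr, "argument %d had an illegal value\n", -INFO);
} else if (INFO > 0) {
    fprintf(stderr, "matrix is singular: U(%d,%d) is exactly zero\n", INFO, INFO);
}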
So, for your code, I would do something like this:
int main(){
    int N = 3;
    int LWORK = 9;
    double M[3][3] = { {1 , 2 , 3},
                       {4 , 5 , 6},
                       {7 , 8 , 9} };
    int pivotArray[3]; //since our matrix has three rows
    int errorHandler;
    double lapackWorkspace[9];

    // dgetrf(M,N,A,LDA,IPIV,INFO) means invert LDA columns of an M by N matrix
    // called A, sending the pivot indices to IPIV, and spitting error
    // information to INFO.
    // also don't forget (like I did) that when you pass a two-dimensional array
    // to a function you need to specify the number of "rows"
    dgetrf_(&N, &N, M[0], &N, pivotArray, &errorHandler);
    //some sort of error check

    dgetri_(&N, M[0], &N, pivotArray, lapackWorkspace, &LWORK, &errorHandler);
    //another error check
}
Here is a working version of the above using the OpenBLAS interface to LAPACKE.
Link with the openblas library (LAPACKE is already included).
#include <stdio.h>
#include "cblas.h"
#include "lapacke.h"
// inplace inverse n x n matrix A.
// matrix A is Column Major (i.e. first line, second line ... *not* C[][] order)
// returns:
//   ret = 0 on success
//   ret < 0 illegal argument value
//   ret > 0 singular matrix
lapack_int matInv(double *A, unsigned n)
{
    int ipiv[n+1];
    lapack_int ret;

    ret = LAPACKE_dgetrf(LAPACK_COL_MAJOR, n, n, A, n, ipiv);
    if (ret != 0)
        return ret;

    ret = LAPACKE_dgetri(LAPACK_COL_MAJOR, n, A, n, ipiv);
    return ret;
}
int main()
{
    double A[] = {
        0.378589, 0.971711, 0.016087, 0.037668, 0.312398,
        0.756377, 0.345708, 0.922947, 0.846671, 0.856103,
        0.732510, 0.108942, 0.476969, 0.398254, 0.507045,
        0.162608, 0.227770, 0.533074, 0.807075, 0.180335,
        0.517006, 0.315992, 0.914848, 0.460825, 0.731980
    };

    for (int i=0; i<25; i++) {
        if ((i%5) == 0) putchar('\n');
        printf("%+12.8f ", A[i]);
    }
    putchar('\n');

    matInv(A, 5);

    for (int i=0; i<25; i++) {
        if ((i%5) == 0) putchar('\n');
        printf("%+12.8f ", A[i]);
    }
    putchar('\n');
}
Example:
% g++ -I [OpenBlas path]/include/ example.cpp [OpenBlas path]/lib/libopenblas.a
% a.out
+0.37858900 +0.97171100 +0.01608700 +0.03766800 +0.31239800
+0.75637700 +0.34570800 +0.92294700 +0.84667100 +0.85610300
+0.73251000 +0.10894200 +0.47696900 +0.39825400 +0.50704500
+0.16260800 +0.22777000 +0.53307400 +0.80707500 +0.18033500
+0.51700600 +0.31599200 +0.91484800 +0.46082500 +0.73198000
+0.24335255 -2.67946180 +3.57538817 +0.83711880 +0.34704217
+1.02790497 -1.05086895 -0.07468137 +0.71041070 +0.66708313
-0.21087237 -4.47765165 +1.73958308 +1.73999641 +3.69324020
-0.14100897 +2.34977565 -0.93725915 +0.47383541 -2.15554470
-0.26329660 +6.46315378 -4.07721533 -3.37094863 -2.42580445
Here is a working version of Spencer Nelson's example above. One mystery about it is that the input matrix is in row-major order, even though it appears to call the underlying fortran routine dgetri. I am led to believe that all the underlying fortran routines require column-major order, but I am no expert on LAPACK; in fact, I'm using this example to help me learn it. But, that one mystery aside:
The input matrix in the example is singular. LAPACK tries to tell you that by returning a 3 in the errorHandler. I changed the 9 in that matrix to a 19, getting an errorHandler of 0 signalling success, and compared the result to that from Mathematica. The comparison was also successful and confirmed that the matrix in the example should be in row-major order, as presented.
Here is the working code:
#include <stdio.h>
#include <stddef.h>
#include <lapacke.h>
int main() {
    int N = 3;
    int NN = 9;
    double M[3][3] = { {1 , 2 , 3},
                       {4 , 5 , 6},
                       {7 , 8 , 9} };
    int pivotArray[3]; //since our matrix has three rows
    int errorHandler;
    double lapackWorkspace[9];

    // dgetrf(M,N,A,LDA,IPIV,INFO) means invert LDA columns of an M by N matrix
    // called A, sending the pivot indices to IPIV, and spitting error information
    // to INFO. also don't forget (like I did) that when you pass a two-dimensional
    // array to a function you need to specify the number of "rows"
    dgetrf_(&N, &N, M[0], &N, pivotArray, &errorHandler);
    printf("dgetrf eh, %d, should be zero\n", errorHandler);

    dgetri_(&N, M[0], &N, pivotArray, lapackWorkspace, &NN, &errorHandler);
    printf("dgetri eh, %d, should be zero\n", errorHandler);

    for (size_t row = 0; row < N; ++row) {
        for (size_t col = 0; col < N; ++col) {
            printf("%g", M[row][col]);
            if (N-1 != col) { printf(", "); }
        }
        if (N-1 != row) { printf("\n"); }
    }
    return 0;
}
I built and ran it as follows on a Mac:
gcc main.c -llapacke -llapack
./a.out
I did an nm on the LAPACKE library and found the following:
liblapacke.a(lapacke_dgetri.o):
U _LAPACKE_dge_nancheck
0000000000000000 T _LAPACKE_dgetri
U _LAPACKE_dgetri_work
U _LAPACKE_xerbla
U _free
U _malloc
liblapacke.a(lapacke_dgetri_work.o):
U _LAPACKE_dge_trans
0000000000000000 T _LAPACKE_dgetri_work
U _LAPACKE_xerbla
U _dgetri_
U _free
U _malloc
and it looks like there is a LAPACKE [sic] wrapper that would presumably relieve us of having to take addresses everywhere for fortran's convenience, but I am probably not going to get around to trying it because I have a way forward.
EDIT
Here is a working version that bypasses LAPACKE [sic], using LAPACK fortran routines directly. I do not understand why a row-major input produces correct results, but I confirmed it again in Mathematica.
#include <stdio.h>
#include <stddef.h>
int main() {
    int N = 3;
    int NN = 9;
    double M[3][3] = { {1 , 2 , 3},
                       {4 , 5 , 6},
                       {7 , 8 , 19} };
    int pivotArray[3]; //since our matrix has three rows
    int errorHandler;
    double lapackWorkspace[9];

    /* from http://www.netlib.no/netlib/lapack/double/dgetrf.f
       SUBROUTINE DGETRF( M, N, A, LDA, IPIV, INFO )
       *
       *  -- LAPACK routine (version 3.1) --
       *     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
       *     November 2006
       *
       *     .. Scalar Arguments ..
             INTEGER            INFO, LDA, M, N
       *     ..
       *     .. Array Arguments ..
             INTEGER            IPIV( * )
             DOUBLE PRECISION   A( LDA, * )
    */
    extern void dgetrf_ (int * m, int * n, double * A, int * LDA, int * IPIV,
                         int * INFO);

    /* from http://www.netlib.no/netlib/lapack/double/dgetri.f
       SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
       *
       *  -- LAPACK routine (version 3.1) --
       *     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd..
       *     November 2006
       *
       *     .. Scalar Arguments ..
             INTEGER            INFO, LDA, LWORK, N
       *     ..
       *     .. Array Arguments ..
             INTEGER            IPIV( * )
             DOUBLE PRECISION   A( LDA, * ), WORK( * )
    */
    extern void dgetri_ (int * n, double * A, int * LDA, int * IPIV,
                         double * WORK, int * LWORK, int * INFO);

    // dgetrf(M,N,A,LDA,IPIV,INFO) means invert LDA columns of an M by N matrix
    // called A, sending the pivot indices to IPIV, and spitting error information
    // to INFO. also don't forget (like I did) that when you pass a two-dimensional
    // array to a function you need to specify the number of "rows"
    dgetrf_(&N, &N, M[0], &N, pivotArray, &errorHandler);
    printf("dgetrf eh, %d, should be zero\n", errorHandler);

    dgetri_(&N, M[0], &N, pivotArray, lapackWorkspace, &NN, &errorHandler);
    printf("dgetri eh, %d, should be zero\n", errorHandler);

    for (size_t row = 0; row < N; ++row) {
        for (size_t col = 0; col < N; ++col) {
            printf("%g", M[row][col]);
            if (N-1 != col) { printf(", "); }
        }
        if (N-1 != row) { printf("\n"); }
    }
    return 0;
}
built and run like this:
$ gcc foo.c -llapack
$ ./a.out
dgetrf eh, 0, should be zero
dgetri eh, 0, should be zero
-1.56667, 0.466667, 0.1
1.13333, 0.0666667, -0.2
0.1, -0.2, 0.1
EDIT
The mystery no longer appears to be a mystery. I think the computations are being done in column-major order, as they must be, but I am both inputting and printing the matrices as if they were in row-major order. I have two bugs that cancel each other out, so things look row-ish even though they're column-ish.
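In symbols, the cancellation is just the identity

(M^T)^{-1} = (M^{-1})^T

Handing a row-major M to a column-major routine means the routine sees M^T; it returns (M^T)^{-1} = (M^{-1})^T in column-major order, and reading that buffer back as row-major transposes it once more, which is exactly M^{-1}.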