Matrix multiplication with MKL - c

I have the COO coordinates of a matrix, which I convert to CSR:
/* alloc space for COO matrix */
int *coo_rows = (int*) malloc(K.n_rows * sizeof(int));
int *coo_cols = (int*) malloc(K.n_rows * sizeof(int));
double *coo_vals = (double*) malloc(K.n_rows * sizeof(double));
/*Load coo values*/
int *rowptrs = (int*) malloc((N_unique+1)*sizeof(int));
int *colinds = (int*) malloc(K.n_rows *sizeof(int));
double *vals = (double*) malloc(K.n_rows *sizeof(double));
/* take csr values */
int job[] = {
2, // job(1)=2: convert COO->CSR, with column indices sorted in increasing order within each row
0, // job(2)=0: zero-based indexing for the CSR matrix
0, // job(3)=0: zero-based indexing for the COO matrix
0, // unused
n1, // job(5)=nnz: number of non-zero elements
0 // job(6)=0: all output arrays (acsr, ja, ia) are filled
};
int info;
mkl_dcsrcoo(job, &n, vals, colinds, rowptrs, &n1, coo_vals, coo_rows, coo_cols, &info);
assert(info == 0 && "Converted COO->CSR");
Now I want to apply the mkl_dcsrmm function to compute C := alpha*A*B + beta*C with beta = 0;
/* function declaration */
void mkl_dcsrmm (char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, double *alpha, char *matdescra, double *val, MKL_INT *indx, MKL_INT *pntrb, MKL_INT *pntre, double *b, MKL_INT *ldb, double *beta, double *c, MKL_INT *ldc);
So far I have:
int A_rows = ..., A_cols = ..., C_cols = ...
double alpha = 1.0;
mkl_dcsrmm ((char*)"N", &A_rows, &C_cols, &A_cols, &alpha, char *matdescra, vals, coo_cols, rowptrs, colinds , double *b, MKL_INT *ldb, double *beta, double *c, MKL_INT *ldc);
I am having some difficulty filling in the inputs. Could you please help me fill in the rest?
One input I would like to understand in more detail is matdescra. I borrowed the following code from the cspblas_ccsr example
char matdescra[6];
matdescra[0] = 'g';
matdescra[1] = 'l';
matdescra[2] = 'n';
matdescra[3] = 'c';
but I have some questions about it. The matrix A I am working with is not triangular, yet initializing this char array seems to force such a declaration. How should I configure the entries of the matdescra array?

Here is what I use, and what works for me.
char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'};
/* https://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-34C8DB79-0139-46E0-8B53-99F3BEE7B2D4.htm#TBL2-6
G: General. D: Diagonal
L/U Lower/Upper triangular (ignored with G)
N: non-unit diagonal (ignored with G)
C: zero-based indexing.
*/
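To tie this back to the variables in the question, here is a minimal sketch of the full call (an assumption-laden illustration, not tested code): it assumes the zero-based CSR arrays vals, colinds, rowptrs produced by the conversion above, LP64 MKL so that MKL_INT is plain int, and dense row-major arrays B (A_cols x C_cols) and C (A_rows x C_cols) that you have already allocated.
char transa = 'N';
char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'}; /* general matrix, zero-based indexing */
double alpha = 1.0, beta = 0.0;
MKL_INT ldb = C_cols, ldc = C_cols; /* row-major leading dimensions when matdescra[3] == 'c' */
mkl_dcsrmm(&transa, &A_rows, &C_cols, &A_cols, &alpha, matdescra,
           vals, colinds, rowptrs, rowptrs + 1, /* pntrb = rowptrs, pntre = rowptrs + 1 */
           B, &ldb, &beta, C, &ldc);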
Complete Example
Here is a complete example. I first create a random matrix by filling a dense matrix with a specified density of non-zero elements. Then I convert it to a sparse matrix in CSR format. Finally, I do the multiplication using mkl_dcsrmm. As a possible check (the comparison itself is not done here), I do the same multiplication using the cblas_dgemm function on the dense matrix.
#include "mkl.h"
#include "mkl_spblas.h"
#include <stddef.h> // For NULL
#include <stdlib.h> // for rand()
#include <assert.h>
#include <stdio.h>
#include <limits.h>
// Compute C = A * B; where A is sparse and B is dense.
int main() {
MKL_INT m=10, n=5, k=11;
const double sparsity = 0.9; ///< Values below this threshold (sampled from a uniform(0,1) distribution) are set to zero.
double *A_dense;
double *B;
double *C;
double alpha = 1.0;
double beta = 0.0;
const int allignment = 64;
// Seed the RNG to always be the same
srand(42);
// Allocate memory to matrices
A_dense = (double *)mkl_malloc( m*k*sizeof( double ), allignment);
B = (double *)mkl_malloc( k*n*sizeof( double ), allignment);
C = (double *)mkl_malloc( m*n*sizeof( double ), allignment);
if (A_dense == NULL || B == NULL || C == NULL) {
printf("ERROR: Can't allocate memory for matrices. Aborting... \n\n");
mkl_free(A_dense);
mkl_free(B);
mkl_free(C);
return 1;
}
// Initializing matrix data
int i;
int nzmax = 0;
for (i = 0; i < (m*k); i++) {
double val = rand() / (double)RAND_MAX;
if ( val < sparsity ) {
A_dense[i] = 0.0;
} else {
A_dense[i] = val;
nzmax++;
}
}
for (i = 0; i < (k*n); i++) {
B[i] = rand();
}
for (i = 0; i < (m*n); i++) {
C[i] = 0.0;
}
// Convert A to a sparse matrix in CSR format.
// INFO: https://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-AD67DD8D-4C22-4232-8D3F-AF97DC2ABBC8.htm#GUID-AD67DD8D-4C22-4232-8D3F-AF97DC2ABBC8
MKL_INT job[6];
job[0] = 0; // convert TO CSR.
job[1] = 0; // Zero-based indexing for input.
job[2] = 0; // Zero-based indexing for output.
job[3] = 2; // adns is a whole matrix A.
job[4] = nzmax; // Maximum number of non-zero elements allowed.
job[5] = 3; // all 3 arrays (acsr, ja, ia) are generated for the output.
/* JOB: conversion parameters
* m: number of rows of A.
* k: number of columns of A.
* adns: (input/output). Array containing non-zero elements of the matrix A.
* lda: specifies the leading dimension of adns. must be at least max(1, m).
* acsr: (input/output) array containing non-zero elements of the matrix A.
* ja: array containing the column indices.
* ia length m+1, rowIndex.
* OUTPUT:
* info: 0 if successful. i if interrupted at i-th row because of lack of space.
*/
int info = -1;
printf("nzmax:\t %d\n", nzmax);
double *A_sparse = mkl_malloc(nzmax * sizeof(double), allignment);
if (A_sparse == NULL) {
printf("ERROR: Could not allocate enough space to A_sparse.\n");
return 1;
}
MKL_INT *A_sparse_cols = mkl_malloc(nzmax * sizeof(MKL_INT), allignment);
if (A_sparse_cols == NULL) {
printf("ERROR: Could not allocate enough space to A_sparse_cols.\n");
return 1;
}
MKL_INT *A_sparse_rowInd = mkl_malloc((m+1) * sizeof(MKL_INT), allignment);
if (A_sparse_rowInd == NULL) {
printf("ERROR: Could not allocate enough space to A_sparse_rowInd.\n");
return 1;
}
mkl_ddnscsr(job, &m, &k, A_dense, &k, A_sparse, A_sparse_cols, A_sparse_rowInd, &info);
if(info != 0) {
printf("WARNING: info=%d, expected 0.\n", info);
}
assert(info == 0);
char transa = 'n';
MKL_INT ldb = n, ldc=n;
char matdescra[6] = {'g', 'l', 'n', 'c', 'x', 'x'};
/* https://software.intel.com/sites/products/documentation/hpc/mkl/mklman/GUID-34C8DB79-0139-46E0-8B53-99F3BEE7B2D4.htm#TBL2-6
G: General. D: Diagonal
L/U Lower/Upper triangular (ignored with G)
N: non-unit diagonal (ignored with G)
C: zero-based indexing.
*/
mkl_dcsrmm(&transa, &m, &n, &k, &alpha, matdescra, A_sparse, A_sparse_cols,
A_sparse_rowInd, &(A_sparse_rowInd[1]), B, &ldb, &beta, C, &ldc);
// The same computation in dense format
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, k, alpha, A_dense, k, B, n, beta, C, n);
mkl_free(A_dense);
mkl_free(A_sparse);
mkl_free(A_sparse_cols);
mkl_free(A_sparse_rowInd);
mkl_free(B);
mkl_free(C);
return 0;
}
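For completeness, one possible way to build this example (an assumption: the classic Intel C compiler with its bundled MKL; newer compilers use -qmkl instead of -mkl, and the exact link line depends on the MKL version and threading layer, for which Intel's Link Line Advisor gives the authoritative flags) is:
icc example.c -mkl -o example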

Related

C malloc giving pointer to already used memory

I have a struct in my code that gets its value changed even though I don't reference it at all. The code is:
layer l = linear(2, 3);
neural_network network = create_network();
add_layer(&network, &l);
printf("Before: %llu\n", network.layers[0].weights.column_size);
matrix i = create_matrix(2, 1);
printf("After: %llu\n", network.layers[0].weights.column_size);
And the output to this code is:
Before: 2
After: 0
This doesn't make sense to me as create_matrix is defined as:
matrix create_matrix(uint64_t row_size, uint64_t column_size) {
matrix mat;
mat.row_size = row_size;
mat.column_size = column_size;
mat.array = malloc(sizeof(double) * mat.row_size * mat.column_size);
return mat;
}
I guessed that malloc was messing up for some reason so I printed the addresses of each one and got this:
Address of network.layers[0].weights.column_size: 0x600000b4c010
Before: 2
Address of mat.array: 0x600000b4c010
After: 0
So C is somehow allocating heap memory that should already be used. I'm not really sure why this is going on as I never freed any of the memory. The relevant structs are defined as:
typedef struct layer {
uint64_t neurons;
matrix weights;
matrix biases;
matrix (*compute_activations)(struct layer *l, matrix *activations);
} layer;
typedef struct {
uint16_t number_of_layers;
layer *layers;
} neural_network;
typedef struct {
uint64_t row_size;
uint64_t column_size;
double *array;
} matrix;
Minimal Reproducible Example:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cblas.h>
#include <string.h>
typedef struct {
uint64_t row_size;
uint64_t column_size;
double *array;
} matrix;
typedef struct layer{
uint64_t neurons;
matrix weights;
matrix biases;
matrix (*compute_activations)(struct layer *l, matrix *activations);
} layer;
typedef struct {
uint16_t number_of_layers;
layer *layers;
} neural_network;
matrix create_matrix(uint64_t row_size, uint64_t column_size) {
matrix mat;
mat.row_size = row_size;
mat.column_size = column_size;
mat.array = malloc(sizeof(double) * mat.row_size * mat.column_size);
return mat;
}
matrix matrix_m_multiply(matrix *A, matrix *B, matrix *C, double alpha, double beta) {
matrix C_copy = create_matrix(C->row_size, C->column_size);
memcpy(C_copy.array, C->array, C->row_size * C->column_size * sizeof(double));
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
A->row_size, B->column_size, A->column_size, alpha,
A->array, A->column_size, B->array, B->column_size, beta,
C_copy.array, C->column_size);
return C_copy;
}
matrix matrix_v_multiply(matrix *A, matrix *B, matrix *C, double alpha, double beta) {
matrix C_copy = create_matrix(C->row_size, C->column_size);
memcpy(C_copy.array, C->array, C->row_size * C->column_size * sizeof(double));
cblas_dgemv(CblasRowMajor, CblasNoTrans,
A->row_size, A->column_size, alpha,
A->array, A->column_size, B->array, 1, beta,
C_copy.array, 1);
return C_copy;
}
neural_network create_network() {
neural_network network = { .number_of_layers = 0 };
network.layers = malloc(sizeof(layer) * network.number_of_layers);
return network;
}
void add_layer(neural_network *network, layer *l) {
network->layers = realloc(network->layers, ++network->number_of_layers);
network->layers[network->number_of_layers - 1] = *l;
}
matrix forward_pass(neural_network *network, matrix *inputs) {
matrix activations = *inputs;
for (int i = 0; i < network->number_of_layers; i++) {
activations = network->layers[i].compute_activations(&network->layers[i], &activations);
}
return activations;
}
matrix linear_function(layer *linear_layer, matrix *activations) {
return matrix_v_multiply(&linear_layer->weights, activations, &linear_layer->biases, 1.0, 1.0);
}
layer linear(uint64_t in, uint64_t out) {
layer linear_layer;
linear_layer.neurons = out;
linear_layer.weights = create_matrix(out, in); // Don't have to transpose matrix when doing vector product with inputs
linear_layer.biases = create_matrix(out, 1);
linear_layer.compute_activations = linear_function;
return linear_layer;
}
int main(int argc, char *argv[]) {
srand(0);
layer l = linear(2, 3);
neural_network network = create_network();
add_layer(&network, &l);
printf("Before: %llu\n", network.layers[0].weights.column_size);
matrix i = create_matrix(2, 1);
printf("After: %llu\n", network.layers[0].weights.column_size);
matrix output = forward_pass(&network, &i);
return 0;
}
There are at least two problems in your code:
in function create_network(), you allocate 0 bytes with malloc(), which has implementation-defined behavior. You might instead initialize the layers member to a null pointer:
neural_network create_network(void) {
neural_network network = { .number_of_layers = 0, .layers = NULL };
return network;
}
the size reallocated by add_layer() is incorrect: you forgot to multiply the new number of elements by the element size. This problem is the likely explanation for your observations. You should write:
void add_layer(neural_network *network, layer *l) {
network->layers = realloc(network->layers,
sizeof(*network->layers) *
(network->number_of_layers + 1));
network->layers[network->number_of_layers++] = *l;
}
you do not check for memory allocation failure anywhere in your code. I would recommend using malloc, calloc, strdup and realloc wrappers to test for unlikely yet possible allocation failure and exit with an informative message.
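As an illustration of such a wrapper (a minimal sketch; the name xrealloc is made up here and is not part of the question's code):
#include <stdio.h>
#include <stdlib.h>

/* Like realloc, but exits with an informative message on allocation failure. */
void *xrealloc(void *ptr, size_t size) {
    void *p = realloc(ptr, size);
    if (p == NULL && size != 0) {
        fprintf(stderr, "fatal: out of memory (requested %zu bytes)\n", size);
        exit(EXIT_FAILURE);
    }
    return p;
}
Then add_layer() can call xrealloc() instead of realloc() and simply assume the returned pointer is valid.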

lapack dgels_ segmentation fault 11

I am trying to use LAPACK's dgels_ in C to solve a linear least squares problem. I have to read the matrix A (assumed to have full rank and m >= n) and a vector b from two text files. I can easily compile my code, but when I try to run it I get a "segmentation fault 11", and I can't really see why. It is my first time using LAPACK, so I don't know if maybe I am using the dgels_ function wrong? The way I understand it, the solution x will be overwritten into the vector b?
lssolve.c:
#include <stdlib.h>
#include <stdio.h>
#include "linalg.h"
/* C prototype for LAPACK routine DGELS */
void dgels_(const char * trans, const int * m, const int * n, const int *
nrhs, double * A, const int * lda, double * B, const int * ldb, double * work,
int * lwork,int * info);
int main(int argc, char * argv[]) {
vector_t * b_t = NULL;
matrix_t * A_t = NULL;
char trans = 'N';
int m, n, nrhs, mb, lda, ldb, info, lwork;
double optwork;
double * work;
// we read the matrix A and the vector b:
b_t = read_vector("b.txt");
A_t = read_matrix("A.txt");
m = A_t-> m; //number of rows in A
n = A_t-> n; //number of columns in A
nrhs = 1; //number of columns in B (will always be 1, since we read b_t with read_vector)
mb = b_t -> n; //number of rows in B
if (mb != m ) { //end program if A and B doesn't have the same number of rows
free(A_t);
free(b_t);
fprintf(stderr, "Sorry, but the matrix A and the vector b have incompatible dimensions. Good Bye!\n");
exit(EXIT_FAILURE);
}
//We make A and B into the wanted input form for the dgels_-function:
double * B = b_t -> v;
double ** A = A_t ->A;
lda = m;
ldb = mb;
//we calculate the optimal size of the work array:
lwork = -1;
dgels_(&trans, &n, &m, &nrhs, *A, &lda, B, &ldb, &optwork, &lwork, &info);
lwork = (int)optwork;
//we allocate space for the work array:
work = (double*)malloc( lwork*sizeof(double));
//solving the least squares problem:
dgels_(&trans, &n, &m, &nrhs, *A, &lda, B, &ldb, work, &lwork, &info);
//Check whether there was an successful exit:
if (info > 0){
fprintf(stderr, "Sorry, but illegal arguments were used, and therefore a least square solution cannot be computes. Good Bye!\n");
exit(EXIT_FAILURE);
} else if(info < 0){
fprintf(stderr, "Sorry, but A doesn't have full rank, and therefore a least square solution cannot be computed. Good Bye!\n");
exit(EXIT_FAILURE);
}
//Saving the least square problem as a vector_t:
vector_t * x = NULL;
x->n = mb;
x->v = B;
print_vector(x);
//Free memory
free_vector(b_t);
free_matrix(A_t);
free_vector(x);
return(EXIT_SUCCESS);
}
I am using the functions read_matrix, read_vector, print_vector, print_matrix and free_vector, which is why I use the struct vector_t and matrix_t:
typedef struct vector {
unsigned long n; /* length of vector */
double * v; /* pointer to array of length n */
} vector_t;
typedef struct matrix {
unsigned long m; /* number of rows */
unsigned long n; /* number of columns */
double ** A; /* pointer to two-dimensional array */
} matrix_t;
I don't think that anything is wrong with read_vector and read_matrix because I can easily do this and use print_vector or print_matrix before I do all of the other operations.
You dereference a NULL pointer here, causing the segfault:
//Saving the least square problem as a vector_t:
vector_t * x = NULL;
x->n = mb;
x->v = B;
Maybe you should use/create a new vector_t instead of just a pointer to a vector_t?
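For example (a minimal sketch following that suggestion; it assumes print_vector takes a vector_t * and that dgels_ has overwritten the leading n entries of B with the least-squares solution):
vector_t x;        /* a real object, not a NULL pointer */
x.n = n;           /* the solution has n entries (the question's code used mb here) */
x.v = B;           /* B was overwritten in place by dgels_ */
print_vector(&x);
/* do not also free_vector(&x): x.v still belongs to b_t */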

Sparse matrix addition in CUDA

I'm considering using CUDA C for a particular problem involving sparse matrix addition.
The docs seem to discuss only operations between a sparse and a dense object.
This leads me to think either: sparse-sparse addition is so trivial it may just be a case of using '+' or similar; or sparse-sparse addition is not implemented. Which is correct, and where can I find the docs?
CUSPARSE has some routines that can operate on two operands that are both sparse matrices, for addition and multiplication.
You can do sparse matrix - sparse matrix addition with CUSPARSE using the cusparse<t>csrgeam function:
This function performs the following matrix-matrix operation
C = α*A + β*B
where A, B, and C are m×n sparse matrices (defined in CSR storage format ...
Although dense matrix addition is fairly trivial (could be about 3 lines of code, whether in serial or parallel), I personally would not put sparse addition of two CSR matrices at the same level of triviality, especially if the goal is to perform it in parallel. You could try writing your own routine; I wouldn't.
Sparse-sparse addition is surprisingly tricky unless the matrices are the same sparsity pattern. (If they are, just add the elements of the data vectors and call it a day). You'll probably note that even calling the csrgeam method takes a couple of steps - one to calculate the size of the resulting matrix, and then another to do the operation. The reason is that the resulting matrix contains the union of the two nonzero patterns.
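To make that special case concrete, here is a minimal host-side sketch in plain C (the function name and signature are made up for illustration): when A and B share exactly the same rowPtr and colInd arrays, C inherits that pattern and only the value arrays need combining.
/* C = alpha*A + beta*B when A and B have identical CSR sparsity patterns. */
void csr_add_same_pattern(int nnz, float alpha, const float *valA,
                          float beta, const float *valB, float *valC)
{
    for (int i = 0; i < nnz; i++)
        valC[i] = alpha * valA[i] + beta * valB[i];
}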
If this wasn't tricky enough, let's talk the parallel case, which you're obviously interested in since you're talking about CUDA. If you're in the CSR format, you could parallelize by rows (something like 1 CUDA thread per matrix row as a first pass). You would want to do a first pass, possibly single-threaded to compute the row pointers and column indices, and then a parallel pass to actually run the computation.
Following Robert Crovella's answer, here is a fully worked example of how to sum two sparse matrices in CUDA:
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
_cusparseGetErrorEnum(err));
assert(0);
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main() {
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
// --- Initialize matrix descriptors
cusparseMatDescr_t descrA, descrB, descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
const int M = 5; // --- Number of rows
const int N = 6; // --- Number of columns
const int nnz1 = 10; // --- Number of non-zero elements of matrix A
const int nnz2 = 8; // --- Number of non-zero elements of matrix B
// --- Host vectors defining the first sparse matrix (CSR)
float *h_csrValA = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrA = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndA = (int *)malloc(nnz1 * sizeof(int));
// --- Host vectors defining the second sparse matrix (CSR)
float *h_csrValB = (float *)malloc(nnz2 * sizeof(float));
int *h_csrRowPtrB = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndB = (int *)malloc(nnz2 * sizeof(int));
h_csrValA[0] = 1.f;
h_csrValA[1] = 7.f;
h_csrValA[2] = 1.f;
h_csrValA[3] = 3.f;
h_csrValA[4] = -1.f;
h_csrValA[5] = 10.f;
h_csrValA[6] = 1.f;
h_csrValA[7] = -4.f;
h_csrValA[8] = 1.f;
h_csrValA[9] = 3.f;
h_csrRowPtrA[0] = 0;
h_csrRowPtrA[1] = 3;
h_csrRowPtrA[2] = 5;
h_csrRowPtrA[3] = 6;
h_csrRowPtrA[4] = 8;
h_csrRowPtrA[5] = 10;
h_csrColIndA[0] = 0;
h_csrColIndA[1] = 3;
h_csrColIndA[2] = 5;
h_csrColIndA[3] = 2;
h_csrColIndA[4] = 4;
h_csrColIndA[5] = 1;
h_csrColIndA[6] = 0;
h_csrColIndA[7] = 3;
h_csrColIndA[8] = 3;
h_csrColIndA[9] = 5;
h_csrValB[0] = 3.f;
h_csrValB[1] = 1.f;
h_csrValB[2] = -1.f;
h_csrValB[3] = 1.f;
h_csrValB[4] = -4.f;
h_csrValB[5] = -3.f;
h_csrValB[6] = -2.f;
h_csrValB[7] = 10.f;
h_csrRowPtrB[0] = 0;
h_csrRowPtrB[1] = 2;
h_csrRowPtrB[2] = 4;
h_csrRowPtrB[3] = 5;
h_csrRowPtrB[4] = 7;
h_csrRowPtrB[5] = 8;
h_csrColIndB[0] = 0;
h_csrColIndB[1] = 4;
h_csrColIndB[2] = 0;
h_csrColIndB[3] = 1;
h_csrColIndB[4] = 3;
h_csrColIndB[5] = 0;
h_csrColIndB[6] = 1;
h_csrColIndB[7] = 3;
// --- Device vectors defining the sparse matrices in CSR format
float *d_csrValA; gpuErrchk(cudaMalloc(&d_csrValA, nnz1 * sizeof(float)));
int *d_csrRowPtrA; gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
int *d_csrColIndA; gpuErrchk(cudaMalloc(&d_csrColIndA, nnz1 * sizeof(int)));
float *d_csrValB; gpuErrchk(cudaMalloc(&d_csrValB, nnz2 * sizeof(float)));
int *d_csrRowPtrB; gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
int *d_csrColIndB; gpuErrchk(cudaMalloc(&d_csrColIndB, nnz2 * sizeof(int)));
gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnz1 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnz1 * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnz2 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnz2 * sizeof(int), cudaMemcpyHostToDevice));
// --- Summing the two matrices
int baseC, nnz3;
// --- nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnz3;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
int *d_csrRowPtrC; gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
cusparseSafeCall(cusparseXcsrgeamNnz(handle, M, N, descrA, nnz1, d_csrRowPtrA, d_csrColIndA, descrB, nnz2, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC, nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) {
nnz3 = *nnzTotalDevHostPtr;
}
else{
gpuErrchk(cudaMemcpy(&nnz3, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
nnz3 -= baseC;
}
int *d_csrColIndC; gpuErrchk(cudaMalloc(&d_csrColIndC, nnz3 * sizeof(int)));
float *d_csrValC; gpuErrchk(cudaMalloc(&d_csrValC, nnz3 * sizeof(float)));
float alpha = 1.f, beta = 1.f;
cusparseSafeCall(cusparseScsrgeam(handle, M, N, &alpha, descrA, nnz1, d_csrValA, d_csrRowPtrA, d_csrColIndA, &beta, descrB, nnz2, d_csrValB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));
// --- Transforming csr to dense format
float *d_C; gpuErrchk(cudaMalloc(&d_C, M * N * sizeof(float)));
cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_C, M));
float *h_C = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
// --- m is row index, n column index
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
printf("%f ", h_C[m + n * M]);
}
printf("\n");
}
return 0;
}
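For reference, one possible way to compile the example above (assuming the CUDA toolkit is installed and the file is saved as csr_add.cu; the file name is arbitrary) is:
nvcc -o csr_add csr_add.cu -lcusparse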

error C2102: '&' requires l-value

The code line: gsl_blas_daxpy(-a,&gsl_matrix_column(D, q).vector,y);
cause the error
error C2102: '&' requires l-value
The problem is that I have no control over the GSL functions, so I don't know how to fix this (removing the "&" didn't work).
Afterwards I get
error C2198: 'gsl_blas_daxpy' : too few arguments for call
I'm using Visual studio 2010.
GSL_EXPORT int gsl_blas_daxpy (double alpha,
const gsl_vector * X,
gsl_vector * Y);
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_blas.h>
#define M (10) // Number of columns in dictionary */
#define N ((int)(M/2)) // Number of rows in dictionary */
int K = 0.07*M; //Number of non-zero elements in signal - the sparsity
int P=1; //number of signals
double epsilon = 1.0e-7; // Residual error
int numOfIterations = N; /* Max num of iterations - same as num of elements in signal */
double sign(double x){return (x>=0) - (x<0);} // Sign function
int main(int argc, char** argv)
{
int n, m, k, iter, q;
double normi, normf, tmp , norm=sqrt(N), htime;
gsl_matrix *D; // A random dictionary used for encoding the sparse signal NxM
gsl_vector *x; // Sparse info signal (encoder input) MxP
gsl_vector *z; // Evaluated Sparse info signal (decoder output) MxP
gsl_vector *r; // Residual error vector MxP
gsl_vector *y; // Sparse representation of signal (encoder output) NxP
gsl_vector_view v;
clock_t start; //for measuring performance
printf("\nDictionary is:NxM=%dx%d,and the signal sparsity is K=%d", N, M, K);
srand(time(NULL)); //Initialize srand
start =clock(); //Initialize clock
/* Initiallize D as a Bernoulli random dictionary */
D = gsl_matrix_alloc (N, M);
for(m=0; m<M; m++)
{
for(n=0; n<N; n++)
{
tmp=sign(2.0*rand()/(double)RAND_MAX-1.0)/norm;
gsl_matrix_set (D, n, m, tmp); //D[n,m]=tmp
}
}
/* Create a random K-sparse info signal */
x = gsl_vector_alloc(M);
for(k=0; k<K; k++)
{
gsl_vector_set(x, rand()%M, 2.0*rand()/(float)RAND_MAX - 1.0); //put random values at k random positions
}
/* Allocate memory for solution (evaluated signal) */
z = gsl_vector_calloc(M);
/* Allocate memory for residual vector */
r = gsl_vector_calloc(M);
/* Allocate memory for the encoded signal vector (its representation) */
y = gsl_vector_alloc(N);
htime=((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime data allocation: %f", htime);
/* Encoding the signal (x to y) */
start = clock();
gsl_blas_dgemv(CblasNoTrans, 1, D, x, 0, y); // y = Dx
htime=((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime for encoding: %f", htime);
/* Decoding the signal */
start = clock();
normi = gsl_blas_dnrm2(y); // ||y|| (L2 norm)
epsilon = sqrt(epsilon * normi);
normf = normi;
iter = 0;
/*iterate till the computational error is small enough*/
while(normf > epsilon && iter < numOfIterations)
{
gsl_blas_dgemv(CblasTrans, 1, D, y, 0, r); // r=D'*y
q = gsl_blas_idamax(r); //index of max element in residual vector
tmp = gsl_vector_get(r, q); //the max element in r
gsl_vector_set(z, q, gsl_vector_get(z, q)+tmp); // z[q]=z[q]+ tmp
v=gsl_matrix_column(D, q); // choose the dictrionary's atom (coloum) with the index of largest element in r
gsl_blas_daxpy(-tmp,&v.vector,y); // y = y-tmp*v
normf = gsl_blas_dnrm2(y); // ||y|| (L2 norm)
iter++;
}
htime = ((double)clock()-start)/CLOCKS_PER_SEC;
printf("\nTime for decoding: %f", htime);
tmp = 100.0*(normf*normf)/(normi*normi); // the error at end of algorithm
printf("\nComputation residual error: %f",tmp);
/* Check the solution (evaluated signal) against the original signal */
printf("\nSolution (first column),Reference (second column):");
getchar(); // wait for pressing a key
for(m=0; m<M; m++)
{
printf("\n%.3f\t%.3f", gsl_vector_get(x, m),gsl_vector_get(z, m));
}
normi = gsl_blas_dnrm2(x);
gsl_blas_daxpy(-1.0, x, z); // z = z-x
normf = gsl_blas_dnrm2(z); // ||z|| (L2 norm)
tmp = 100.0*(normf*normf)/(normi*normi); //final error
printf("\nSolution residual error: %f\n",tmp);
/* Memory clean up and shutdown*/
gsl_vector_free(y); gsl_vector_free(r);
gsl_vector_free(z); gsl_vector_free(x);
gsl_matrix_free(D);
getchar();
return EXIT_SUCCESS;
}
gsl_matrix_column(D, q).vector is an R-value. You can't take its address. You need an L-value, so assign it to a named variable first, then pass the address of that variable to the function.
If you make a more permanent home for the return value of gsl_matrix_column, (this particular) problem will go away.
Here is some simplified code that illustrates how one might capture a return value in an addressable slot:
struct _foo {
int i;
};
struct _foo bar () {
struct _foo result = { 5 };
return result;
}
/* won't compile; 'lvalue required as unary & operand */
void qux () {
int *j = &bar().i;
}
/* compiles OK */
void qal () {
struct _foo result = bar();
int* j = &result.i;
}
gsl_vector_view c=gsl_matrix_column(D, q);
gsl_blas_daxpy(-a,&c.vector,y);
Introducing a temporary variable lets you pass a pointer to it to the function.
EDIT: Well, trying to understand the problem, I wanted to know what the function expect:
int gsl_blas_daxpy (double alpha, const gsl_vector * x, gsl_vector * y)
and
gsl_vector_view gsl_matrix_column (gsl_matrix * m, size_t j)
with some explanation:
A vector view can be passed to any subroutine which takes a vector
argument just as a directly allocated vector would be, using
&view.vector.
and an example:
for (j = 0; j < 10; j++)
{
gsl_vector_view column = gsl_matrix_column (m, j);
double d;
d = gsl_blas_dnrm2 (&column.vector);
printf ("matrix column %d, norm = %g\n", j, d);
}
Now we have another problem:
Here is another answer:
Are you aware that int K = 0.07*M evaluates to K = 0? (0.07*10 is 0.7, which truncates to 0 when converted to int.)
#define M (10) // Number of columns in dictionary */
int K = 0.07*M; //Number of non-zero elements in signal - the sparsity
gsl_vector_alloc does not initialize the vector x, so x will contain garbage values, not 0. Did you mean x = gsl_vector_calloc(M), with a c? That sets x to 0.
/* Create a random K-sparse info signal */
x = gsl_vector_alloc(M);
for(k=0; k<K; k++) // K=0, so the loop is skipped and x is never modified.
{
gsl_vector_set(x, rand()%M, 2.0*rand()/(float)RAND_MAX - 1.0); //put random values at k random positions
}
(And even then you will set at most K random values, possibly fewer, since rand()%M can pick the same position more than once.)
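Putting those two observations together, a minimal sketch of the fix (keeping the question's variable names; forcing K up to at least 1 is one possible choice, not the only one) could look like:
int K = (int)(0.07*M); // 0.07*10 truncates to 0 ...
if (K < 1) K = 1; // ... so force at least one non-zero element
x = gsl_vector_calloc(M); // calloc: every entry starts at 0
for(k=0; k<K; k++)
{
gsl_vector_set(x, rand()%M, 2.0*rand()/(double)RAND_MAX - 1.0);
}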

Getting value from a dynamic allocated 2d array by pointers

I have filled a dynamically allocated float multi-array in a function.
A second function has to read the values of that array through the pointer to its first element, which was defined in the former function.
The second function does not access the correct memory locations, so it doesn't work, but it does work if the multi-array is defined statically.
Does somebody know why?
eval_cell should get values defined in div_int
float f_imp(float x, float y){
return pow(x,2)+pow(y,2)-1;
}
int eval_cell(float* p){
int s[4];
s[0] = f_imp(*p, *(p+1)) <= 0;
printf("%f %f\n",*p, *(p+1));
s[1] = f_imp(*(p+3), *(p+4)) <= 0;
printf("%f %f\n",*(p+3), *(p+4));
s[2] = f_imp(*(p+9), *(p+10)) <= 0;
printf("%f %f\n",*(p+9), *(p+10));
s[3] = f_imp(*(p+6), *(p+7)) <= 0;
printf("%f %f\n",*(p+6), *(p+7));
printf("%d%d%d%d\n",s[0],s[1],s[2],s[3]);
return s[0];
}
void div_int(float* x1, float* y1,float* x2,float* y2,
float* f0, float* f2,float* f6,float* f8){
int i,j,m;
float* p;
float** a_cell; // array 9x3 contente coordinate vertici e valore funzione
*a_cell = (float**) malloc(9*sizeof(float*));
for (i=0;i<9;i++){
a_cell[i] = (float*) malloc(3*sizeof(float));
}
a_cell[0][0] = *x1;
a_cell[0][1] = *y1;
a_cell[0][2] = *f0;
a_cell[2][0] = *x2;
a_cell[2][1] = *y1;
a_cell[2][2] = *f2;
a_cell[6][0] = *x1;
a_cell[6][1] = *y2;
a_cell[6][2] = *f6;
a_cell[8][0] = *x2;
a_cell[8][1] = *y2;
a_cell[8][2] = *f8;
/*** calcolo dei valori incogniti di a_cell ***/
a_cell[1][0] = (*x1+*x2)/2;
a_cell[1][1] = *y1;
a_cell[1][2] = f_imp(a_cell[1][0], a_cell[1][1]);
a_cell[3][0] = *x1;
a_cell[3][1] = (*y1+*y2)/2;
a_cell[3][2] = f_imp(a_cell[3][0], a_cell[3][1]);;
a_cell[4][0] = (*x2+*x1)/2;
a_cell[4][1] = (*y2+*y1)/2;
a_cell[4][2] = f_imp(a_cell[4][0], a_cell[4][1]);
a_cell[5][0] = *x2;
a_cell[5][1] = (*y2+*y1)/2;
a_cell[5][2] = f_imp(a_cell[5][0], a_cell[5][1]);
a_cell[7][0] = (*x1+*x2)/2;
a_cell[7][1] = *y2;
a_cell[7][2] = f_imp(a_cell[7][0], a_cell[7][1]);
for (j=0;j<2;j++){
m = j*3;
for(i=0;i<2;i++){
m += i;
eval_cell(&a_cell[m][0]);
}
}
p = *a_cell;
for (i=0;i<9;i++){
for (j=0;j<3;j++){
printf("%f \n",*(p+3*i+j));
printf("%f \n",a_cell[i][j]);
printf("\n");
}
}
free(a_cell);
return;
}
It's because you are using the pointers incorrectly.
a_cell is a pointer to a dynamic array of 9 pointers, each of which points to a dynamic array of 3 floats.
So when you do eval_cell(&a_cell[m][0]) (or just eval_cell(a_cell[m]), which is the same thing) you get a pointer to an array of 3 floats. And after that you do:
int eval_cell(float* p){
...
s[2] = f_imp(*(p+9), *(p+10)) <= 0;
*(p+9) tries to read element 9 of an array that holds only 3 floats, so this is incorrect.
It works with a static array because a static multi-dimensional array is laid out in memory as one contiguous one-dimensional array, and the compiler generates the multi-dimensional indexing for you. That's why, in the static case, you happen to address a valid memory area.
If you want a completely dynamic matrix (2d array), you have to make your own element access function:
double *
make_array (unsigned int rows, unsigned int cols)
{
return malloc (rows * cols * sizeof (double));
}
double *
array_element (double *a, unsigned int cols, unsigned int i, unsigned int j)
{
return a + i * cols + j;
}
#define A(i,j) (*array_element ((a), (cols), (i), (j)))
double *a;
unsigned int rows = 5, cols = 6; /* example dimensions */
a = make_array (rows, cols);
A(3,4) = 3.14;
printf ("%f\n:" A(3,4));
EDIT:
In your program
*a_cell = (float**) malloc(9*sizeof(float*));
should be
a_cell = (float**) malloc(9*sizeof(float*));
And likewise for
p = *a_cell;
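Putting the fix together, here is a minimal sketch of just the allocation and cleanup, keeping the question's names (the row-by-row free is an addition not discussed above, but every malloc needs a matching free):
float **a_cell = malloc(9 * sizeof(float *));   /* note: no leading '*' */
for (int i = 0; i < 9; i++)
    a_cell[i] = malloc(3 * sizeof(float));
/* ... fill and use a_cell[i][j] ... */
for (int i = 0; i < 9; i++)
    free(a_cell[i]);   /* free each row ... */
free(a_cell);          /* ... then the array of row pointers */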
