I am familiarizing myself with CUDA by writing a dot product calculator. I wanted to test it with large array sizes to do a timing study to test two different ways of collecting the vector sum. However, when the size of the array is above 1024 I get errors. I am not so sure where the problem is coming from. The card is a GTX460M with 1.5GB of ram. I am using the card for display (this is a laptop). Aside that I am not sure where the issue could be coming from.
Here is the nvcc compile line:
nvcc D:\Research\CUDA\TestCode\test_dotProduct_1.cu --use_fast_math --gpu-architecture sm_13 --compiler-bindir="D:\Programming\VisualStudio\2010express\VC\bin" --machine 32 -o multi_dot.exe
I also seem to have trouble with compiling in 64 bit but that is another issue
Here is the output for an array of size 1024:
HOST CALCULATION: 357389824.000000
DEV PARA CALCULATION: 357389824.000000
DEV SERI CALCULATION: 357389824.000000
Here is the output for an array of size 2048:
HOST CALCULATION: 2861214720.000000
DEV PARA CALCULATION: -1.#INF00
DEV SERI CALCULATION: -1.#INF00
Here is my code:
/*Code for a CUDA test project doing a basic dot product with doubles
*
*
*
*/
#include <stdio.h>
#include <cuda.h>
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *array_c){
array_c[threadIdx.x] = array_a[threadIdx.x] * array_b[threadIdx.x];
}
__global__ void GPU_parallelSumVector(double *vector, double *sum, int base){
sum[threadIdx.x + blockIdx.x] = vector[blockIdx.x + threadIdx.x * base] + vector[blockIdx.x + threadIdx.x * base + 1];
}
__global__ void GPU_serialSumVector(double *vector, double *sum, int dim){
for(int i = 0; i < dim; ++i){
sum[0] += vector[i];
}
}
__host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){
for(int i=0; i<dim; ++i){
dot[0] += first[i] * second[i];
}
}
__host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){
for(int i=0; i<dim; ++i){
vector[i] = start + i * incrSize;
}
}
int main(){
//define array size to be used
//int i,j;
const int VECTOR_LENGTH = 2048;
int SUM_BASE = 2;
int SUM_ROUNDS = VECTOR_LENGTH / SUM_BASE;
int ELEMENT_SIZE = sizeof(double);
// int currentSize = VECTOR_LENGTH;
//arrays for dot product
//host
double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product_parallel = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product_serial = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double host_dot_product = 0.0;
//fill with values
CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0);
CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0);
CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH);
//device
double *dev_array_a;
double *dev_array_b;
double *dev_array_c;
double *dev_dot_serial;
double *dev_dot_parallel;
//allocate cuda memory
cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_c, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot_parallel, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot_serial, ELEMENT_SIZE * VECTOR_LENGTH);
//copy to from host to device
cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot_parallel, &dev_dot_product_parallel, ELEMENT_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot_serial, &dev_dot_product_serial, ELEMENT_SIZE, cudaMemcpyHostToDevice);
//perform CUDA dot product
GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_array_c);
//condense a second vector in serial to compare speed up of tree condensing
GPU_serialSumVector<<<1,1>>>(dev_array_c, dev_dot_serial, VECTOR_LENGTH);
//condense vector (parallel)
for(int i=SUM_ROUNDS; i>1; i/=SUM_BASE){
GPU_parallelSumVector<<<1,i>>>(dev_array_c, dev_array_c, SUM_BASE);
}
GPU_parallelSumVector<<<1,1>>>(dev_array_c, dev_array_c, SUM_BASE);
//get computed product back to the machine
cudaMemcpy(dev_dot_product_parallel, dev_array_c, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(dev_dot_product_serial, dev_dot_serial, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost);
FILE *output = fopen("test_dotProduct_1.txt", "w");
fprintf(output, "HOST CALCULATION: %f \n", host_dot_product);
fprintf(output, "DEV PARA CALCULATION: %f \n", dev_dot_product_parallel[0]);
fprintf(output, "DEV SERI CALCULATION: %f \n", dev_dot_product_serial[0]);
/*
fprintf(output, "VALUES OF DEV_ARRAY_C VEC: \n");
for(int i=0; i<VECTOR_LENGTH; ++i){
fprintf(output, "value %i is: %f \n", i, dev_dot_product_parallel[i]);
}
*/
free(array_a);
free(array_b);
//free(host_dot_product);
cudaFree(dev_array_a);
cudaFree(dev_array_b);
cudaFree(dev_array_c);
cudaFree(dev_dot_parallel);
cudaFree(dev_dot_serial);
return(0);
}
The maximum number of threads for a block for your card is 1024, which is why you are getting an error (for some older cards its 512). You either need to split up your blocks to use multiple dimensions (again limited to 1024 in a direction for x,y,z on your card) or use more than one block in your grid.
Related
This question already has answers here:
What is an undefined reference/unresolved external symbol error and how do I fix it?
(39 answers)
Closed 6 years ago.
I am working on a project to train multi-layer neural networks using CUDA C. The problem is that when I try to complile the program I get this error:
facetrain.o: In function `backprop_face':
facetrain.c:(.text+0x127): undefined reference to `bpnn_train_kernel'
backprop_kernel.o: In function `bpnn_train_kernel(BPNN*, float*, float*)':
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x6e6): undefined reference to `bpnn_layerforward(float*, float*, float**, int, int)'
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x703): undefined reference to `bpnn_output_error(float*, float*, float*, int, float*)'
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x72a): undefined reference to `bpnn_hidden_error(float*, int, float*, int, float**, float*, float*)'
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x745): undefined reference to `bpnn_adjust_weights(float*, int, float*, int, float**, float**)'
backprop_kernel.o: In function `main':
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x9a5): undefined reference to `setup(int, char**)'
collect2: ld returned 1 exit status
make: *** [backprop] Error 1
Here is the code of backdrop_kernel.cu:
////////////////////////////////////////////////////////////////////////////////
extern void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
extern void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
extern void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err);
extern void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
extern int setup(int argc, char** argv);
extern float **alloc_2d_dbl(int m, int n);
extern float squash(float x);
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
unsigned int num_threads = 0;
unsigned int num_blocks = 0;
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
setup(argc, argv);
}
void bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
int in, hid, out;
float out_err, hid_err;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
#ifdef GPU
int m = 0;
float *input_hidden_cuda;
float *input_cuda;
float *output_hidden_cuda;
float *partial_sum;
float *hidden_partial_sum;
float *hidden_delta_cuda;
float *input_prev_weights_cuda;
float sum;
float *input_weights_one_dim;
float *input_weights_prev_one_dim;
num_blocks = in / 16;
dim3 grid( 1 , num_blocks);
dim3 threads(16 , 16);
input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
// this preprocessing stage is added to correct the bugs of wrong memcopy using two-dimensional net->inputweights
for (int k = 0; k <= in; k++) {
for (int j = 0; j <= hid; j++) {
input_weights_one_dim[m] = net->input_weights[k][j];
input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
m++;
}
}
cudaMalloc((void**) &input_cuda, (in + 1) * sizeof(float));
cudaMalloc((void**) &output_hidden_cuda, (hid + 1) * sizeof(float));
cudaMalloc((void**) &input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
cudaMalloc((void**) &hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
#endif
#ifdef CPU
printf("Performing CPU computation\n");
bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
#endif
#ifdef GPU
printf("Performing GPU computation\n");
//printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
bpnn_layerforward_CUDA<<< grid, threads >>>(input_cuda,
output_hidden_cuda,
input_hidden_cuda,
hidden_partial_sum,
in,
hid);
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
cudaMemcpy(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 1; j <= hid; j++) {
sum = 0.0;
for (int k = 0; k < num_blocks; k++) {
sum += partial_sum[k * hid + j-1] ;
}
sum += net->input_weights[0][j];
net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
}
#endif
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
#ifdef CPU
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);
#endif
#ifdef GPU
cudaMalloc((void**) &hidden_delta_cuda, (hid + 1) * sizeof(float));
cudaMalloc((void**) &input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
bpnn_adjust_weights_cuda<<< grid, threads >>>(hidden_delta_cuda,
hid,
input_cuda,
in,
input_hidden_cuda,
input_prev_weights_cuda
);
cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(input_cuda);
cudaFree(output_hidden_cuda);
cudaFree(input_hidden_cuda);
cudaFree(hidden_partial_sum);
cudaFree(input_prev_weights_cuda);
cudaFree(hidden_delta_cuda);
free(partial_sum);
free(input_weights_one_dim);
free(input_weights_prev_one_dim);
#endif
}
If you need any more info or code let me know. Thanks in advance!
It's a linker error. ld is the linker, so if you get an error message ending with "ld returned 1 exit status", that tells you that it's a linker error.
The error message tells you that none of the object files you're linking against contains a definition for bpnn_layerforward(), bpnn_output_error(),.
The reason for that is that the function you've defined is called bpnn_layerforward(), bpnn_output_error(),. (in other words: you misspelled the function name when calling the function (and presumably in the header file as well - otherwise you'd have gotten a different error at compile time)).
I think these functions are part of library, you have to include that library when you make executable.
I am pretty new to CUDA and I'm very struggling with converting a C code to CUDA C, it builds successfully but it keeps crashing. Triple loop function is wrong for sure and I have no idea what should I change.
Function call:
for (z=0;z<=max;z++)
{
correlationsum=coefficient(x, n, dim, z);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
Function
long coefficient(int vctr[40000], long numberofpoints, int coefficientrow, int epsilon)
{
long i, j, k, sum, numberofpairs;
long sq_epsilon;
sq_epsilon=epsilon*epsilon;
numberofpairs=0;
for (i=1;i<=numberofpoints-coefficientrow;i++)
{
sum=0;
for (j=i+1;j<=numberofpoints+1-coefficientrow;j++)
{
for (k=0;k<coefficientrow;k++)
{
sum=sum+(vctr[i+k]-vctr[j+k])*(vctr[i+k]-vctr[j+k]);
}
if(sum<sq_epsilon)
{
numberofpairs++;
sum=0;
}
}
}
return (numberofpairs);
}
I have problems limiting the function in GPU part, so it doesn't go out of bounds (e.g. k is less than coefficientrow above). I saw that it is possible to assign block/threadids and use if function. I have tried it but in triple for loop it is kinda... strange.
Here is almost full code.
#define THREADS 1024
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
int sum;
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
numbofpairs++;
sum = 0;
}}}}
int main()
{
int n, dim, max, z;
int *d_n, *d_dim, *d_z, *d_x, *d_numbofpairs;
int x[40000], correlation_sum = 0;
n=10;
max=10;
dim=3;
cudaMalloc((void **)&d_n, sizeof(int));
cudaMalloc((void **)&d_dim, sizeof(int));
cudaMalloc((void **)&d_z, sizeof(int));
cudaMalloc((void **)&d_x, sizeof(int));
cudaMalloc((void **)&d_numbofpairs, sizeof(int));
cudaMemcpy(d_n, &n, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_dim, &dim, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
for (z = 0; z <= max; z++)
{
cudaMemcpy(d_z, &z, sizeof(int), cudaMemcpyHostToDevice);
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
cudaFree(d_n);
cudaFree(d_dim);
cudaFree(d_z);
cudaFree(d_x);
cudaFree(d_numbofpairs);
return 0;
}
I would like some help or tips what to change, what is wrong and why it keeps crashing so I could fix it. Thank you!
EDIT: I completed some parts, sorry my bad. As for threads and blocks, I am very confused, GPU shows 1024 threads per block, and I'm not sure whether it's it or not.
So the "crash" is a seg fault. A seg fault is a problem in host code, not kernel code (although it could be in your usage of the CUDA API).
Your code has a variety of problems.
This might cause trouble:
int x[40000]
this creates a large stack-based allocation. Instead I suggest doing a dynamic allocation:
int *x = (int *)malloc(40000*sizeof(int));
dynamic allocations have much higher size limits.
It's fairly clear from your kernel usage that you intend to use the whole x vector. Therefore, this allocation on the device for d_x is not correct:
cudaMalloc((void **)&d_x, sizeof(int));
we need the same size allocation on the device as what we have on the host:
cudaMalloc((void **)&d_x, 40000*sizeof(int));
Corresponding to 2, you probably would want to copy the entire x vector to the device (it's not really clear since your code doesn't show the initialization of x), and you have incorrectly taken the address of x here, but x is already a pointer:
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
so we want something like this instead:
cudaMemcpy(d_x, x, 40000*sizeof(int), cudaMemcpyHostToDevice);
Your other kernel parameters appear to be scalar parameters. You're mostly handling those incorrectly as well:
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
for a parameter like numberofpoints specified as above (one-way pass to function), we simply pass by value the host quantity we want when calling the kernel, just like we would with an ordinary C function. So this kernel invocation is not correct (even though it appears to compile):
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
instead we want to pass just the host variables, by value:
coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
since d_numbofpairs is going both ways, your usage is correct there.
I would also recommend adding proper cuda error checking to your code.
Here is a fully worked example with the above errors fixed. I think the results are bogus of course because the input data (e.g. x) is not initialized.
$ cat t724.cu
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#define THREADS 1024
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
int sum;
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
numbofpairs++;
sum = 0;
}}}}
int main()
{
int n, dim, max, z;
int *d_x, *d_numbofpairs;
int correlation_sum = 0;
int *x = (int *)malloc(40000*sizeof(int));
if (x == NULL) {printf("malloc fail\n"); return -1;}
n=10;
max=10;
dim=3;
cudaMalloc((void **)&d_x, sizeof(int));
cudaCheckErrors("cudaMalloc 1 fail");
cudaMalloc((void **)&d_numbofpairs, sizeof(int));
cudaCheckErrors("cudaMalloc 2 fail");
cudaMemcpy(d_x, x, sizeof(int), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
for (z = 0; z <= max; z++)
{
coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
cudaFree(d_x);
cudaFree(d_numbofpairs);
return 0;
}
$ nvcc -o t724 t724.cu
$ ./t724
result for epsilon 0 returns 3
result for epsilon 1 returns 3
result for epsilon 2 returns 3
result for epsilon 3 returns 3
result for epsilon 4 returns 3
result for epsilon 5 returns 3
result for epsilon 6 returns 3
result for epsilon 7 returns 3
result for epsilon 8 returns 3
result for epsilon 9 returns 3
result for epsilon 10 returns 3
$
Note that I didn't make any changes to your kernel code.
I'm considering using CUDA C for a particular problem involving sparse matrix addition.
The docs seem to discuss only operations between a sparse and a dense object.
This leads me to think either: sparse-sparse addition is so trivial it may just be a case of using '+' or similar; or sparse-sparse addition is not implemented. Which is correct, and where can I find the docs?
CUSPARSE has some routines that can operate on two operands that are both sparse matrices, for addition and multiplication.
You can do sparse matrix - sparse matrix addition with CUSPARSE using the cusparse<t>csrgeam function:
This function performs following matrix-matrix operation
C=α∗A+β∗B
where A, B, and C are m×n sparse matrices (defined in CSR storage format ...
Although dense matrix addition is fairly trivial (could be about 3 lines of code, whether in serial or parallel), I personally would not put sparse addition of two CSR matrices at the same level of triviality, especially if the goal is to perform it in parallel. You could try writing your own routine; I wouldn't.
Sparse-sparse addition is surprisingly tricky unless the matrices are the same sparsity pattern. (If they are, just add the elements of the data vectors and call it a day). You'll probably note that even calling the csrgeam method takes a couple of steps - one to calculate the size of the resulting matrix, and then another to do the operation. The reason is that the resulting matrix contains the union of the two nonzero patterns.
If this wasn't tricky enough, let's talk the parallel case, which you're obviously interested in since you're talking about CUDA. If you're in the CSR format, you could parallelize by rows (something like 1 CUDA thread per matrix row as a first pass). You would want to do a first pass, possibly single-threaded to compute the row pointers and column indices, and then a parallel pass to actually run the computation.
Following Robert Crovella's answer, here is a fully worked example on how summing up two sparse matrices in CUDA:
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", __FILE__, __LINE__, \
_cusparseGetErrorEnum(err)); \
assert(0); \
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main() {
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
// --- Initialize matrix descriptors
cusparseMatDescr_t descrA, descrB, descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
const int M = 5; // --- Number of rows
const int N = 6; // --- Number of columns
const int nnz1 = 10; // --- Number of non-zero blocks for matrix A
const int nnz2 = 8; // --- Number of non-zero blocks for matrix A
// --- Host vectors defining the first block-sparse matrix
float *h_csrValA = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrA = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndA = (int *)malloc(nnz1 * sizeof(int));
// --- Host vectors defining the second block-sparse matrix
float *h_csrValB = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrB = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndB = (int *)malloc(nnz1 * sizeof(int));
h_csrValA[0] = 1.f;
h_csrValA[1] = 7.f;
h_csrValA[2] = 1.f;
h_csrValA[3] = 3.f;
h_csrValA[4] = -1.f;
h_csrValA[5] = 10.f;
h_csrValA[6] = 1.f;
h_csrValA[7] = -4.f;
h_csrValA[8] = 1.f;
h_csrValA[9] = 3.f;
h_csrRowPtrA[0] = 0;
h_csrRowPtrA[1] = 3;
h_csrRowPtrA[2] = 5;
h_csrRowPtrA[3] = 6;
h_csrRowPtrA[4] = 8;
h_csrRowPtrA[5] = 10;
h_csrColIndA[0] = 0;
h_csrColIndA[1] = 3;
h_csrColIndA[2] = 5;
h_csrColIndA[3] = 2;
h_csrColIndA[4] = 4;
h_csrColIndA[5] = 1;
h_csrColIndA[6] = 0;
h_csrColIndA[7] = 3;
h_csrColIndA[8] = 3;
h_csrColIndA[9] = 5;
h_csrValB[0] = 3.f;
h_csrValB[1] = 1.f;
h_csrValB[2] = -1.f;
h_csrValB[3] = 1.f;
h_csrValB[4] = -4.f;
h_csrValB[5] = -3.f;
h_csrValB[6] = -2.f;
h_csrValB[7] = 10.f;
h_csrRowPtrB[0] = 0;
h_csrRowPtrB[1] = 2;
h_csrRowPtrB[2] = 4;
h_csrRowPtrB[3] = 5;
h_csrRowPtrB[4] = 7;
h_csrRowPtrB[5] = 8;
h_csrColIndB[0] = 0;
h_csrColIndB[1] = 4;
h_csrColIndB[2] = 0;
h_csrColIndB[3] = 1;
h_csrColIndB[4] = 3;
h_csrColIndB[5] = 0;
h_csrColIndB[6] = 1;
h_csrColIndB[7] = 3;
// --- Device vectors defining the block-sparse matrices
float *d_csrValA; gpuErrchk(cudaMalloc(&d_csrValA, nnz1 * sizeof(float)));
int *d_csrRowPtrA; gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
int *d_csrColIndA; gpuErrchk(cudaMalloc(&d_csrColIndA, nnz1 * sizeof(int)));
float *d_csrValB; gpuErrchk(cudaMalloc(&d_csrValB, nnz2 * sizeof(float)));
int *d_csrRowPtrB; gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
int *d_csrColIndB; gpuErrchk(cudaMalloc(&d_csrColIndB, nnz2 * sizeof(int)));
gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnz1 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnz1 * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnz2 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnz2 * sizeof(int), cudaMemcpyHostToDevice));
// --- Summing the two matrices
int baseC, nnz3;
// --- nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnz3;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
int *d_csrRowPtrC; gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
cusparseSafeCall(cusparseXcsrgeamNnz(handle, M, N, descrA, nnz1, d_csrRowPtrA, d_csrColIndA, descrB, nnz2, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC, nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) {
nnz3 = *nnzTotalDevHostPtr;
}
else{
gpuErrchk(cudaMemcpy(&nnz3, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
nnz3 -= baseC;
}
int *d_csrColIndC; gpuErrchk(cudaMalloc(&d_csrColIndC, nnz3 * sizeof(int)));
float *d_csrValC; gpuErrchk(cudaMalloc(&d_csrValC, nnz3 * sizeof(float)));
float alpha = 1.f, beta = 1.f;
cusparseSafeCall(cusparseScsrgeam(handle, M, N, &alpha, descrA, nnz1, d_csrValA, d_csrRowPtrA, d_csrColIndA, &beta, descrB, nnz2, d_csrValB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));
// --- Transforming csr to dense format
float *d_C; gpuErrchk(cudaMalloc(&d_C, M * N * sizeof(float)));
cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_C, M));
float *h_C = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
// --- m is row index, n column index
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
printf("%f ", h_C[m + n * M]);
}
printf("\n");
}
return 0;
}
I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex];
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
....
To add a bit of context. We have an array called in which as length say N. We then have another array out which has length N+(2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in, into array out but to place the array in in position 3 from the beginning of array out i.e out = [RADIUS][in][RADIUS], see slide for graphical representation.
The confusion comes in on the following line:
temp[lindex - RADIUS] = in[gindex - RADIUS];
If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.
The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.
To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.
(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const unsigned int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}
You are assuming that in array points to the first position of the memory that has been allocated for this array. However, if you see slide 47, the in array has a halo (orange boxes) of three elements before and after of the data (represented as green cubes).
My assumption is (I have not done the workshop) that the input array is first initialized with an halo and then the pointer is moved in the kernel call. Something like:
stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);
So, in the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the array.
There are already good answers, but to focus on the actual point that caused the confusion:
In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.
For example, consider a pointer like this:
int* data= ... // Points to some memory
When you then write a statement like
data[3] = 42;
you are just accessing a memory location that is "three entries behind the original data pointer". So you could also have written
int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]
and consequently,
dataWithOffset[-3] = 123; // This will write into data[0]
In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)
I can compile #JackOLantern's code, but there is an warning: "pointless comparison of unsigned integer with zero":
And when run, it will abort like:
I have modified the code to the following and the warning disappeared and it can get right result:
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* KERNEL */
/**********/
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 55 + 2 * RADIUS;
const unsigned int constant = 4;
thrust::device_vector<unsigned int> d_in(N, constant);
thrust::device_vector<unsigned int> d_out(N);
moving_average<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
thrust::host_vector<unsigned int> h_out = d_out;
for (int i=0; i<N; i++)
printf("Element i = %i; h_out = %i\n", i, h_out[i]);
return 0;
}
The result is like this:
I have the following code in cuda_computation.cu
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <assert.h>
void checkCUDAError(const char *msg);
__global__ void euclid_kernel(float *x, float* y, float* f)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
int i = blockIdx.x;
int j = threadIdx.x;
f[idx] = sqrt((x[i]-x[j])*(x[i]-x[j]) + (y[i]-y[j])*(y[i]-y[j]));
}
int main()
{
float *xh;
float *yh;
float *fh;
float *xd;
float *yd;
float *fd;
size_t n = 256;
size_t numBlocks = n;
size_t numThreadsPerBlock = n;
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(float);
xh = (float *) malloc(n * sizeof(float));
yh = (float *) malloc(n * sizeof(float));
fh = (float *) malloc(memSize);
for(int ii(0); ii!=n; ++ii)
{
xh[ii] = ii;
yh[ii] = ii;
}
cudaMalloc( (void **) &xd, n * sizeof(float) );
cudaMalloc( (void **) &yd, n * sizeof(float) );
cudaMalloc( (void **) &fd, memSize );
for(int run(0); run!=10000; ++run)
{
//change value to avoid optimizations
xh[0] = ((float)run)/10000.0;
cudaMemcpy( xd, xh, n * sizeof(float), cudaMemcpyHostToDevice );
checkCUDAError("cudaMemcpy");
cudaMemcpy( yd, yh, n * sizeof(float), cudaMemcpyHostToDevice );
checkCUDAError("cudaMemcpy");
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
euclid_kernel<<< dimGrid, dimBlock >>>( xd, yd, fd );
cudaThreadSynchronize();
checkCUDAError("kernel execution");
cudaMemcpy( fh, fd, memSize, cudaMemcpyDeviceToHost );
checkCUDAError("cudaMemcpy");
}
cudaFree(xd);
cudaFree(yd);
cudaFree(fd);
free(xh);
free(yh);
free(fh);
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(-1);
}
}
It takes about 6" to run on an FX QUADRO 380, while the corresponding serial version using just one i7-870 core takes just about 3". Do I miss something? Is the code under optimised in some ways? Or is it just expected behaviour that for simple calculations (like this all-pairs Euclidean distance) the overhead needed to move memory exceeds the computational gain?
I think you are being killed by the time to move the data.
Especially since you are calling the CUDA kernel with individual values, it might be quicker to upload a large set of values as a 1D array and operate on them.
Also sqrt isn't done in HW on Cuda (at least not on my GPU) whereas the CPU has optimized FPU HW for this and is probably 10x faster than the GPU, and for a small job like this is probably keeping all the results in cache between the timign runs.
Reduce your global memory reads since they are expensive.
You have 4 global memory reads per thread which can be reduced to 2 using shared memory.
__global__ void euclid_kernel(const float * inX_g, const float* inY_g, float * outF_g)
{
const unsigned int threadId = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float xBlock_s;
__shared__ float yBlock_s;
if(threadIdx.x == 0)
{
xBlock_s = inX_g[blockIdx.x];
yBlock_s = inY_g[blockIdx.x];
}
__syncthreads();
float xSub = xBlock_s - inX_g[threadIdx.x];
float ySub = yBlock_s - inY_g[threadIdx.x];
outF_g[threadId] = sqrt(xSub * xSub + ySub * ySub);
}
You should also test with different block sizes (aslong you have 100% occupancy).
You are splitting the problem so that each block is responsible for a single i vs all 256 j's. This is bad locality, as those 256 j's have to be reloaded for every block, for a total of 2*256*(256 + 1) loads. Instead, split your grid so that each block is responsible for a range of, say, 16 i's and 16 j's, which is still 256 blocks*256 threads. But each block now loads only 2*(16+16) values, for a total or 2*256*32 total loads. The idea is, reuse each loaded value as many times as possible. This may not have a huge impact with 256x256, but becomes more and more important as the size scales.
This optimization is used for efficient matrix multiplies, which have a similar locality problem. See http://en.wikipedia.org/wiki/Loop_tiling, or google for "optimized matrix multiply" for more details. And perhaps the matrix multiplication kernel in the NVIDIA SDK gives some details and ideas.