undefined reference to `setup(int, char**)' [duplicate] - c

This question already has answers here:
What is an undefined reference/unresolved external symbol error and how do I fix it?
(39 answers)
Closed 6 years ago.
I am working on a project to train multi-layer neural networks using CUDA C. The problem is that when I try to compile the program I get this error:
facetrain.o: In function `backprop_face':
facetrain.c:(.text+0x127): undefined reference to `bpnn_train_kernel'
backprop_kernel.o: In function `bpnn_train_kernel(BPNN*, float*, float*)':
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x6e6): undefined reference to `bpnn_layerforward(float*, float*, float**, int, int)'
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x703): undefined reference to `bpnn_output_error(float*, float*, float*, int, float*)'
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x72a): undefined reference to `bpnn_hidden_error(float*, int, float*, int, float**, float*, float*)'
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x745): undefined reference to `bpnn_adjust_weights(float*, int, float*, int, float**, float**)'
backprop_kernel.o: In function `main':
tmpxft_0002fa78_00000000-4_backprop_kernel.cudafe1.cpp:(.text+0x9a5): undefined reference to `setup(int, char**)'
collect2: ld returned 1 exit status
make: *** [backprop] Error 1
Here is the code of backprop_kernel.cu:
////////////////////////////////////////////////////////////////////////////////
// Declarations of functions that are called from this file but defined in
// other translation units.
// NOTE(review): the linker output for this file quotes full parameter lists
// (e.g. `setup(int, char**)`), i.e. it is looking for C++ symbols. If the
// definitions live in C sources, these declarations presumably need to be
// wrapped in extern "C" { ... } - confirm against the build.
extern void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
extern void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
extern void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err);
extern void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
extern int setup(int argc, char** argv);
extern float **alloc_2d_dbl(int m, int n);
extern float squash(float x);
// Returns the current wall-clock time from gettimeofday() as seconds,
// with the microsecond part folded into the fraction.
double gettime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double whole_seconds = now.tv_sec;
    const double fraction = now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
/* File-scope launch bookkeeping; both counters start at zero. */
unsigned int num_blocks = 0;
unsigned int num_threads = 0;
/* -------------------------------------------------------------------------- */
/* Program entry point: forwards the command line to setup().                 */
/* -------------------------------------------------------------------------- */
int main(int argc, char** argv)
{
    setup(argc, argv);
    return 0;
}
// Runs one forward/backward pass over the network `net`.
// The input->hidden forward pass and the input-weight adjustment run on the
// CPU when CPU is defined and through CUDA kernels when GPU is defined; the
// hidden->output forward pass, both error computations and the hidden-weight
// adjustment always go through the host bpnn_* helpers.
// Parameters:
//   net    - network whose units, weights and deltas are read and updated.
//   eo, eh - not referenced anywhere in this function body.
// NOTE(review): facetrain.c reports `bpnn_train_kernel' as undefined at link
// time; presumably the C caller expects an unmangled symbol, which would
// require extern "C" on this definition - confirm.
void bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
int in, hid, out;
float out_err, hid_err;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
#ifdef GPU
int m = 0;
float *input_hidden_cuda;
float *input_cuda;
float *output_hidden_cuda;
float *partial_sum;
float *hidden_partial_sum;
float *hidden_delta_cuda;
float *input_prev_weights_cuda;
float sum;
float *input_weights_one_dim;
float *input_weights_prev_one_dim;
// One block per 16 input units; integer division, so a remainder of in / 16
// gets no block of its own.
num_blocks = in / 16;
// Grid is 1 x num_blocks blocks, each block is 16 x 16 threads.
dim3 grid( 1 , num_blocks);
dim3 threads(16 , 16);
// Host staging buffers: flattened copies of the two (in+1) x (hid+1) weight
// matrices, plus room for the per-block partial sums read back from the device.
input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
// this preprocessing stage is added to correct the bugs of wrong memcopy using two-dimensional net->inputweights
// Flatten both 2-D weight tables row by row into the 1-D staging buffers.
for (int k = 0; k <= in; k++) {
for (int j = 0; j <= hid; j++) {
input_weights_one_dim[m] = net->input_weights[k][j];
input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
m++;
}
}
// Device buffers for the forward pass. The return codes of these calls are
// not checked.
cudaMalloc((void**) &input_cuda, (in + 1) * sizeof(float));
cudaMalloc((void**) &output_hidden_cuda, (hid + 1) * sizeof(float));
cudaMalloc((void**) &input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float));
cudaMalloc((void**) &hidden_partial_sum, num_blocks * WIDTH * sizeof(float));
#endif
#ifdef CPU
printf("Performing CPU computation\n");
bpnn_layerforward(net->input_units, net->hidden_units,net->input_weights, in, hid);
#endif
#ifdef GPU
printf("Performing GPU computation\n");
//printf("in= %d, hid = %d, numblocks = %d\n", in, hid, num_blocks);
// Upload the input units and the flattened input->hidden weights.
cudaMemcpy(input_cuda, net->input_units, (in + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
bpnn_layerforward_CUDA<<< grid, threads >>>(input_cuda,
output_hidden_cuda,
input_hidden_cuda,
hidden_partial_sum,
in,
hid);
// Wait for the kernel, then abort if the runtime recorded an error.
// NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime in
// favour of cudaDeviceSynchronize().
cudaThreadSynchronize();
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) {
printf("bpnn kernel error: %s\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
// Finish the reduction on the host: for each hidden unit j, add up the
// per-block partial sums, add input_weights[0][j] (presumably the bias
// weight - confirm) and apply the logistic function 1 / (1 + exp(-sum)).
cudaMemcpy(partial_sum, hidden_partial_sum, num_blocks * WIDTH * sizeof(float), cudaMemcpyDeviceToHost);
for (int j = 1; j <= hid; j++) {
sum = 0.0;
for (int k = 0; k < num_blocks; k++) {
sum += partial_sum[k * hid + j-1] ;
}
sum += net->input_weights[0][j];
net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
}
#endif
// Hidden->output forward pass, error terms and hidden-weight update (host).
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
#ifdef CPU
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in, net->input_weights, net->input_prev_weights);
#endif
#ifdef GPU
// Input-weight update on the device: upload the hidden deltas and both
// flattened weight tables, then launch with the same grid/block shape.
cudaMalloc((void**) &hidden_delta_cuda, (hid + 1) * sizeof(float));
cudaMalloc((void**) &input_prev_weights_cuda, (in + 1) * (hid + 1) * sizeof(float));
cudaMemcpy(hidden_delta_cuda, net->hidden_delta, (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_prev_weights_cuda, input_weights_prev_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(input_hidden_cuda, input_weights_one_dim, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyHostToDevice);
bpnn_adjust_weights_cuda<<< grid, threads >>>(hidden_delta_cuda,
hid,
input_cuda,
in,
input_hidden_cuda,
input_prev_weights_cuda
);
// Blocking copies back to the host.
// NOTE(review): the updated weights land in input_weights_one_dim, which is
// freed below without being copied into net->input_weights inside this
// function - confirm that this is intended.
cudaMemcpy(net->input_units, input_cuda, (in + 1) * sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy(input_weights_one_dim, input_hidden_cuda, (in + 1) * (hid + 1) * sizeof(float), cudaMemcpyDeviceToHost);
// Release every device and host buffer allocated above.
cudaFree(input_cuda);
cudaFree(output_hidden_cuda);
cudaFree(input_hidden_cuda);
cudaFree(hidden_partial_sum);
cudaFree(input_prev_weights_cuda);
cudaFree(hidden_delta_cuda);
free(partial_sum);
free(input_weights_one_dim);
free(input_weights_prev_one_dim);
#endif
}
If you need any more info or code let me know. Thanks in advance!

It's a linker error. ld is the linker, so if you get an error message ending with "ld returned 1 exit status", that tells you that it's a linker error.
The error message tells you that none of the object files you're linking against contains a definition for bpnn_train_kernel(), bpnn_layerforward(), bpnn_output_error(), bpnn_hidden_error(), bpnn_adjust_weights() or setup().
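The reason for that is that the names the linker is looking for do not match the names under which the functions were actually defined (in other words: the name used when calling the function - and presumably in the header file as well, otherwise you'd have gotten a different error at compile time - differs from the name in the definition, for example because it was misspelled, or because a function defined in a C file is declared in C++ code without extern "C").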
I think these functions are part of a library; you have to link against that library when you build the executable.

Related

CUDA triple loop

I am pretty new to CUDA and I'm really struggling with converting C code to CUDA C. It builds successfully but it keeps crashing. The triple-loop function is surely wrong, and I have no idea what I should change.
Function call:
/* Evaluate the correlation sum for every epsilon from 0 to max and print it.
   The result is stored in the same variable that is printed. */
for (z=0;z<=max;z++)
{
correlation_sum=coefficient(x, n, dim, z);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
Function
/* Counts index pairs (i, j), i < j, whose running squared distance over a
 * window of `coefficientrow` consecutive entries is below epsilon^2.
 *
 * vctr             data, addressed from index 1 up to numberofpoints
 * numberofpoints   number of data points
 * coefficientrow   window length
 * epsilon          distance threshold (squared internally)
 *
 * The running sum is cleared at the start of each i and after every counted
 * pair; otherwise it keeps accumulating across successive j.
 * Returns the number of counted pairs. */
long coefficient(int vctr[40000], long numberofpoints, int coefficientrow, int epsilon)
{
    const long threshold = epsilon * epsilon;
    const long last_start = numberofpoints - coefficientrow;
    long pairs = 0;
    long first, second, offset;

    for (first = 1; first <= last_start; ++first) {
        long running = 0;
        for (second = first + 1; second <= last_start + 1; ++second) {
            for (offset = 0; offset < coefficientrow; ++offset) {
                const int delta = vctr[first + offset] - vctr[second + offset];
                running += delta * delta;
            }
            if (running < threshold) {
                ++pairs;
                running = 0;
            }
        }
    }
    return pairs;
}
I have problems limiting the function in GPU part, so it doesn't go out of bounds (e.g. k is less than coefficientrow above). I saw that it is possible to assign block/threadids and use if function. I have tried it but in triple for loop it is kinda... strange.
Here is almost full code.
// Number of threads per block used when launching the kernel.
#define THREADS 1024
// Kernel attempt at the serial coefficient() routine: each thread derives one
// (i, j, k) triple from its block/thread indices in x, y and z and evaluates
// at most one squared-difference term.
//   vctr            - input data
//   numberofpoints  - number of points
//   coefficient_row - window length
//   epsilon         - threshold, squared below
//   numbofpairs     - intended output counter (see the notes below)
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
// sum is a per-thread local; nothing is combined across threads.
int sum;
// NOTE: this assigns 0 to the thread's local copy of the pointer parameter;
// it does not store 0 through the pointer.
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
// NOTE: this increments the local pointer, not the integer it pointed to,
// so no count is ever written to memory the caller can read.
numbofpairs++;
sum = 0;
}}}}
// Host driver from the question: allocates device storage, launches the
// coefficient kernel once per epsilon in [0, max] and prints each result.
int main()
{
int n, dim, max, z;
int *d_n, *d_dim, *d_z, *d_x, *d_numbofpairs;
// 40000 ints on the stack; x is not initialized anywhere in the code shown.
int x[40000], correlation_sum = 0;
n=10;
max=10;
dim=3;
cudaMalloc((void **)&d_n, sizeof(int));
cudaMalloc((void **)&d_dim, sizeof(int));
cudaMalloc((void **)&d_z, sizeof(int));
// NOTE: room for a single int only, although x holds 40000.
cudaMalloc((void **)&d_x, sizeof(int));
cudaMalloc((void **)&d_numbofpairs, sizeof(int));
cudaMemcpy(d_n, &n, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_dim, &dim, sizeof(int), cudaMemcpyHostToDevice);
// NOTE: copies sizeof(int) bytes, i.e. one element, starting at &x.
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
for (z = 0; z <= max; z++)
{
cudaMemcpy(d_z, &z, sizeof(int), cudaMemcpyHostToDevice);
// NOTE: *d_n, *d_dim and *d_z dereference pointers obtained from cudaMalloc
// in host code. The launch uses one block of THREADS threads in x only.
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
cudaFree(d_n);
cudaFree(d_dim);
cudaFree(d_z);
cudaFree(d_x);
cudaFree(d_numbofpairs);
return 0;
}
I would like some help or tips what to change, what is wrong and why it keeps crashing so I could fix it. Thank you!
EDIT: I completed some parts, sorry my bad. As for threads and blocks, I am very confused, GPU shows 1024 threads per block, and I'm not sure whether it's it or not.
So the "crash" is a seg fault. A seg fault is a problem in host code, not kernel code (although it could be in your usage of the CUDA API).
Your code has a variety of problems.
This might cause trouble:
int x[40000]
this creates a large stack-based allocation. Instead I suggest doing a dynamic allocation:
int *x = (int *)malloc(40000*sizeof(int));
dynamic allocations have much higher size limits.
It's fairly clear from your kernel usage that you intend to use the whole x vector. Therefore, this allocation on the device for d_x is not correct:
cudaMalloc((void **)&d_x, sizeof(int));
we need the same size allocation on the device as what we have on the host:
cudaMalloc((void **)&d_x, 40000*sizeof(int));
Corresponding to the previous point, you probably want to copy the entire x vector to the device (it's not really clear, since your code doesn't show the initialization of x). You have also incorrectly taken the address of x here, although x already decays to a pointer to its first element:
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
so we want something like this instead:
cudaMemcpy(d_x, x, 40000*sizeof(int), cudaMemcpyHostToDevice);
Your other kernel parameters appear to be scalar parameters. You're mostly handling those incorrectly as well:
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
for a parameter like numberofpoints specified as above (one-way pass to function), we simply pass by value the host quantity we want when calling the kernel, just like we would with an ordinary C function. So this kernel invocation is not correct (even though it appears to compile):
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
instead we want to pass just the host variables, by value:
coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
since d_numbofpairs is going both ways, your usage is correct there.
I would also recommend adding proper cuda error checking to your code.
Here is a fully worked example with the above errors fixed. I think the results are bogus of course because the input data (e.g. x) is not initialized.
$ cat t724.cu
#include <stdio.h>
/* Queries the CUDA runtime's last error. On failure prints `msg`, the CUDA
   error string and the file/line where the macro was expanded, then exits
   with status 1. */
#define cudaCheckErrors(msg)                                          \
    do {                                                              \
        const cudaError_t last_status_ = cudaGetLastError();          \
        if (cudaSuccess != last_status_) {                            \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",        \
                    msg, cudaGetErrorString(last_status_),            \
                    __FILE__, __LINE__);                              \
            fprintf(stderr, "*** FAILED - ABORTING\n");               \
            exit(1);                                                  \
        }                                                             \
    } while (0)
// Number of threads per block used when launching the kernel.
#define THREADS 1024
// Kernel reproduced without changes from the question. Each thread derives
// one (i, j, k) triple from its block/thread indices and evaluates at most
// one squared-difference term.
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
// Per-thread local accumulator; nothing is combined across threads.
int sum;
// NOTE: sets the thread's local copy of the pointer to 0; it does not store
// 0 through the pointer.
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
// NOTE: advances the local pointer; the caller's counter is never written.
numbofpairs++;
sum = 0;
}}}}
// Host driver for the worked example.
// Allocates the input vector on the host (heap) and on the device, copies it
// over, launches the coefficient kernel once for every epsilon in [0, max]
// and prints the value read back from d_numbofpairs.
// x is never initialized here, so the printed values are not meaningful.
// Returns 0 on success, -1 if the host allocation fails; CUDA failures abort
// through cudaCheckErrors.
int main()
{
    const int num_elements = 40000;                      // length of x on host and device
    const size_t x_bytes = num_elements * sizeof(int);
    int n, dim, max, z;
    int *d_x, *d_numbofpairs;
    int correlation_sum = 0;
    int *x = (int *)malloc(x_bytes);
    if (x == NULL) {printf("malloc fail\n"); return -1;}
    n=10;
    max=10;
    dim=3;
    // The device buffer has to hold the whole vector, not a single int.
    cudaMalloc((void **)&d_x, x_bytes);
    cudaCheckErrors("cudaMalloc 1 fail");
    cudaMalloc((void **)&d_numbofpairs, sizeof(int));
    cudaCheckErrors("cudaMalloc 2 fail");
    // Copy all elements; x is already a pointer, so no address-of here.
    cudaMemcpy(d_x, x, x_bytes, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy 1 fail");
    for (z = 0; z <= max; z++)
    {
        // Scalar arguments are passed by value straight from the host variables.
        coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
        // The blocking copy also surfaces any error raised by the kernel.
        cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy 2/kernel fail");
        printf("result for epsilon %d returns %d\n", z, correlation_sum);
    }
    cudaFree(d_x);
    cudaFree(d_numbofpairs);
    free(x);
    return 0;
}
$ nvcc -o t724 t724.cu
$ ./t724
result for epsilon 0 returns 3
result for epsilon 1 returns 3
result for epsilon 2 returns 3
result for epsilon 3 returns 3
result for epsilon 4 returns 3
result for epsilon 5 returns 3
result for epsilon 6 returns 3
result for epsilon 7 returns 3
result for epsilon 8 returns 3
result for epsilon 9 returns 3
result for epsilon 10 returns 3
$
Note that I didn't make any changes to your kernel code.

Sparse matrix addition in CUDA

I'm considering using CUDA C for a particular problem involving sparse matrix addition.
The docs seem to discuss only operations between a sparse and a dense object.
This leads me to think either: sparse-sparse addition is so trivial it may just be a case of using '+' or similar; or sparse-sparse addition is not implemented. Which is correct, and where can I find the docs?
CUSPARSE has some routines that can operate on two operands that are both sparse matrices, for addition and multiplication.
You can do sparse matrix - sparse matrix addition with CUSPARSE using the cusparse<t>csrgeam function:
This function performs following matrix-matrix operation
C=α∗A+β∗B
where A, B, and C are m×n sparse matrices (defined in CSR storage format ...
Although dense matrix addition is fairly trivial (could be about 3 lines of code, whether in serial or parallel), I personally would not put sparse addition of two CSR matrices at the same level of triviality, especially if the goal is to perform it in parallel. You could try writing your own routine; I wouldn't.
Sparse-sparse addition is surprisingly tricky unless the matrices are the same sparsity pattern. (If they are, just add the elements of the data vectors and call it a day). You'll probably note that even calling the csrgeam method takes a couple of steps - one to calculate the size of the resulting matrix, and then another to do the operation. The reason is that the resulting matrix contains the union of the two nonzero patterns.
If this wasn't tricky enough, let's talk the parallel case, which you're obviously interested in since you're talking about CUDA. If you're in the CSR format, you could parallelize by rows (something like 1 CUDA thread per matrix row as a first pass). You would want to do a first pass, possibly single-threaded to compute the row pointers and column indices, and then a parallel pass to actually run the computation.
Following Robert Crovella's answer, here is a fully worked example on how summing up two sparse matrices in CUDA:
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Integer division of a by b, bumped up by one whenever a remainder is left
// over (used to size a grid so that every element gets a thread).
int iDivUp(int a, int b)
{
    const int quotient = a / b;
    if (a % b == 0) {
        return quotient;
    }
    return quotient + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
// Prints the CUDA error string together with the given source location when
// `code` is not cudaSuccess, and exits with `code` as the status unless
// `abort` is false.
//   code  - status returned by a CUDA runtime call
//   file  - source file to report
//   line  - source line to report
//   abort - terminate the process on failure (default: true)
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}
// A macro rather than a function, so that __FILE__ and __LINE__ expand at the
// call site and the report points at the failing call instead of at a wrapper.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
// Maps a cuSPARSE status code to the name of its enumerator.
// Codes not listed below yield "<unknown>".
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
    const char *label = "<unknown>";
    switch (error)
    {
    case CUSPARSE_STATUS_SUCCESS:
        label = "CUSPARSE_STATUS_SUCCESS";
        break;
    case CUSPARSE_STATUS_NOT_INITIALIZED:
        label = "CUSPARSE_STATUS_NOT_INITIALIZED";
        break;
    case CUSPARSE_STATUS_ALLOC_FAILED:
        label = "CUSPARSE_STATUS_ALLOC_FAILED";
        break;
    case CUSPARSE_STATUS_INVALID_VALUE:
        label = "CUSPARSE_STATUS_INVALID_VALUE";
        break;
    case CUSPARSE_STATUS_ARCH_MISMATCH:
        label = "CUSPARSE_STATUS_ARCH_MISMATCH";
        break;
    case CUSPARSE_STATUS_MAPPING_ERROR:
        label = "CUSPARSE_STATUS_MAPPING_ERROR";
        break;
    case CUSPARSE_STATUS_EXECUTION_FAILED:
        label = "CUSPARSE_STATUS_EXECUTION_FAILED";
        break;
    case CUSPARSE_STATUS_INTERNAL_ERROR:
        label = "CUSPARSE_STATUS_INTERNAL_ERROR";
        break;
    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
        label = "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
        break;
    case CUSPARSE_STATUS_ZERO_PIVOT:
        label = "CUSPARSE_STATUS_ZERO_PIVOT";
        break;
    }
    return label;
}
// Checks the status of a cuSPARSE call. On failure prints the status name and
// the source location passed in by the caller, then trips assert(0).
//   err  - status returned by the cuSPARSE call
//   file - source file to report
//   line - source line to report
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
    if (CUSPARSE_STATUS_SUCCESS != err) {
        // Report the location handed in by the caller, not this helper's own.
        fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
                _cusparseGetErrorEnum(err));
        assert(0);
    }
}
// Convenience wrapper. Because this is a function, the __FILE__/__LINE__ it
// forwards are those of this definition rather than of the call being checked.
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
// Adds two 5 x 6 sparse matrices held in CSR format with cuSPARSE
// (C = alpha * A + beta * B with alpha = beta = 1), converts the sum to a
// dense column-major matrix and prints it row by row.
// All device buffers, the host result buffer and the cuSPARSE objects are
// released before returning.
int main() {
    // --- Initialize cuSPARSE
    cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
    // --- Initialize matrix descriptors
    cusparseMatDescr_t descrA, descrB, descrC;
    cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseCreateMatDescr(&descrB));
    cusparseSafeCall(cusparseCreateMatDescr(&descrC));
    const int M = 5;        // --- Number of rows
    const int N = 6;        // --- Number of columns
    const int nnz1 = 10;    // --- Number of non-zero entries of matrix A
    const int nnz2 = 8;     // --- Number of non-zero entries of matrix B
    // --- Host CSR description of matrix A
    const float h_csrValA[nnz1]     = { 1.f, 7.f, 1.f, 3.f, -1.f, 10.f, 1.f, -4.f, 1.f, 3.f };
    const int   h_csrRowPtrA[M + 1] = { 0, 3, 5, 6, 8, 10 };
    const int   h_csrColIndA[nnz1]  = { 0, 3, 5, 2, 4, 1, 0, 3, 3, 5 };
    // --- Host CSR description of matrix B (sized with nnz2, not nnz1)
    const float h_csrValB[nnz2]     = { 3.f, 1.f, -1.f, 1.f, -4.f, -3.f, -2.f, 10.f };
    const int   h_csrRowPtrB[M + 1] = { 0, 2, 4, 5, 7, 8 };
    const int   h_csrColIndB[nnz2]  = { 0, 4, 0, 1, 3, 0, 1, 3 };
    // --- Device copies of both CSR matrices
    float *d_csrValA;    gpuErrchk(cudaMalloc(&d_csrValA, nnz1 * sizeof(float)));
    int *d_csrRowPtrA;   gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
    int *d_csrColIndA;   gpuErrchk(cudaMalloc(&d_csrColIndA, nnz1 * sizeof(int)));
    float *d_csrValB;    gpuErrchk(cudaMalloc(&d_csrValB, nnz2 * sizeof(float)));
    int *d_csrRowPtrB;   gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
    int *d_csrColIndB;   gpuErrchk(cudaMalloc(&d_csrColIndB, nnz2 * sizeof(int)));
    gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnz1 * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnz1 * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnz2 * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnz2 * sizeof(int), cudaMemcpyHostToDevice));
    // --- Summing the two matrices, step 1: row pointers and non-zero count of C.
    // With the host pointer mode the count is written directly into nnz3, so no
    // device read-back of the row pointers is needed.
    int nnz3 = 0;
    cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
    int *d_csrRowPtrC;   gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
    cusparseSafeCall(cusparseXcsrgeamNnz(handle, M, N, descrA, nnz1, d_csrRowPtrA, d_csrColIndA, descrB, nnz2, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC, &nnz3));
    // --- Step 2: allocate C's column indices and values, then do the addition
    int *d_csrColIndC;   gpuErrchk(cudaMalloc(&d_csrColIndC, nnz3 * sizeof(int)));
    float *d_csrValC;    gpuErrchk(cudaMalloc(&d_csrValC, nnz3 * sizeof(float)));
    float alpha = 1.f, beta = 1.f;
    cusparseSafeCall(cusparseScsrgeam(handle, M, N, &alpha, descrA, nnz1, d_csrValA, d_csrRowPtrA, d_csrColIndA, &beta, descrB, nnz2, d_csrValB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));
    // --- Transforming csr to dense format (column-major, leading dimension M)
    float *d_C;          gpuErrchk(cudaMalloc(&d_C, M * N * sizeof(float)));
    cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_C, M));
    float *h_C = (float *)malloc(M * N * sizeof(float));
    if (h_C == NULL) { fprintf(stderr, "malloc fail\n"); return 1; }
    gpuErrchk(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
    // --- m is row index, n column index
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            printf("%f ", h_C[m + n * M]);
        }
        printf("\n");
    }
    // --- Release host, device and cuSPARSE resources
    free(h_C);
    gpuErrchk(cudaFree(d_C));
    gpuErrchk(cudaFree(d_csrValC));
    gpuErrchk(cudaFree(d_csrColIndC));
    gpuErrchk(cudaFree(d_csrRowPtrC));
    gpuErrchk(cudaFree(d_csrValB));
    gpuErrchk(cudaFree(d_csrRowPtrB));
    gpuErrchk(cudaFree(d_csrColIndB));
    gpuErrchk(cudaFree(d_csrValA));
    gpuErrchk(cudaFree(d_csrRowPtrA));
    gpuErrchk(cudaFree(d_csrColIndA));
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroyMatDescr(descrB));
    cusparseSafeCall(cusparseDestroyMatDescr(descrC));
    cusparseSafeCall(cusparseDestroy(handle));
    return 0;
}

Negative array indexing in shared memory based 1d stencil CUDA implementation

I'm currently working with CUDA programming and I'm trying to learn off of slides from a workshop I found online, which can be found here. The problem I am having is on slide 48. The following code can be found there:
__global__ void stencil_1d(int *in, int *out) {
__shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
int gindex = threadIdx.x + blockIdx.x * blockDim.x;
int lindex = threadIdx.x + RADIUS;
// Read input elements into shared memory
temp[lindex] = in[gindex];
if (threadIdx.x < RADIUS) {
temp[lindex - RADIUS] = in[gindex - RADIUS];
temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}
....
To add a bit of context: we have an array called in which has length, say, N. We then have another array out which has length N+(2*RADIUS), where RADIUS has a value of 3 for this particular example. The idea is to copy array in into array out, but to place array in at position 3 from the beginning of array out, i.e. out = [RADIUS][in][RADIUS]; see the slide for a graphical representation.
The confusion comes in on the following line:
temp[lindex - RADIUS] = in[gindex - RADIUS];
If gindex is 0 then we have in[-3]. How can we read from a negative index in an array? Any help would really be appreciated.
The answer by pQB is correct. You are supposed to offset the input array pointer by RADIUS.
To show this, I'm providing below a full worked example. Hope it would be beneficial to other users.
(I would say you would need a __syncthreads() after the shared memory loads. I have added it in the below example).
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Quotient a / b, plus one when the division leaves a remainder.
int iDivUp(int a, int b)
{
    int blocks = a / b;
    const bool has_remainder = (a % b) != 0;
    if (has_remainder) {
        blocks = blocks + 1;
    }
    return blocks;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// gpuErrchk(call): passes the status of `call` to gpuAssert together with the
// file and line where the macro is used.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Does nothing for cudaSuccess. Otherwise prints the CUDA error string and
// the given location to stderr and, unless `abort` is false, exits with the
// error code as the process status.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (cudaSuccess == code) {
        return;
    }
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (!abort) {
        return;
    }
    exit(code);
}
/**********/
/* KERNEL */
/**********/
// 1-D stencil over `in`: every thread stages its element (plus, for the first
// RADIUS threads of a block, the halo on both sides) in shared memory and then
// writes the sum of the 2 * RADIUS + 1 staged neighbours to `out`. No division
// is performed, so the stored value is a sum, not an average.
//   in  - input, N elements
//   out - output, written at every global thread index (see the note below)
//   N   - number of input elements
// The shared array is sized with BLOCKSIZE, so the indexing assumes the
// kernel is launched with blockDim.x == BLOCKSIZE.
__global__ void moving_average(unsigned int *in, unsigned int *out, unsigned int N) {
__shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
unsigned int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int lindexx = threadIdx.x + RADIUS;
// --- Read input elements into shared memory
temp[lindexx] = (gindexx < N)? in[gindexx] : 0;
if (threadIdx.x < RADIUS) {
// NOTE: gindexx is unsigned, so (gindexx - RADIUS) >= 0 is always true; for
// gindexx < RADIUS the subtraction wraps around and in[] is read far out of
// range.
temp[threadIdx.x] = (((gindexx - RADIUS) >= 0)&&(gindexx <= N)) ? in[gindexx - RADIUS] : 0;
temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = ((gindexx + BLOCKSIZE) < N)? in[gindexx + BLOCKSIZE] : 0;
}
// All staged values must be visible to the whole block before they are read.
__syncthreads();
// --- Apply the stencil
unsigned int result = 0;
for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
result += temp[lindexx + offset];
}
// --- Store the result
// NOTE: there is no gindexx < N guard on this store, so threads past the end
// of the data also write to out[].
out[gindexx] = result;
}
/********/
/* MAIN */
/********/
// Fills a device vector of N elements with the value 4, runs moving_average
// over it with BLOCKSIZE threads per block, checks for launch and execution
// errors, and prints every output element.
int main() {
    const unsigned int N = 55 + 2 * RADIUS;
    const unsigned int constant = 4;
    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);
    unsigned int *in_ptr = thrust::raw_pointer_cast(d_in.data());
    unsigned int *out_ptr = thrust::raw_pointer_cast(d_out.data());
    const int num_blocks = iDivUp(N, BLOCKSIZE);
    moving_average<<<num_blocks, BLOCKSIZE>>>(in_ptr, out_ptr, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    // Copy the result back to the host for printing.
    thrust::host_vector<unsigned int> h_out = d_out;
    for (int idx = 0; idx < N; ++idx) {
        printf("Element i = %i; h_out = %i\n", idx, h_out[idx]);
    }
    return 0;
}
You are assuming that in array points to the first position of the memory that has been allocated for this array. However, if you see slide 47, the in array has a halo (orange boxes) of three elements before and after of the data (represented as green cubes).
My assumption is (I have not done the workshop) that the input array is first initialized with an halo and then the pointer is moved in the kernel call. Something like:
stencil_1d<<<dimGrid, dimBlock>>>(in + RADIUS, out);
So, in the kernel, it's safe to do in[-3] because the pointer is not at the beginning of the array.
There are already good answers, but to focus on the actual point that caused the confusion:
In C (not only in CUDA, but in C in general), when you access an "array" by using the [ brackets ], you are actually doing pointer arithmetic.
For example, consider a pointer like this:
int* data= ... // Points to some memory
When you then write a statement like
data[3] = 42;
you are just accessing a memory location that is "three entries behind the original data pointer". So you could also have written
int* data= ... // Points to some memory
int* dataWithOffset = data+3;
dataWithOffset[0] = 42; // This will write into data[3]
and consequently,
dataWithOffset[-3] = 123; // This will write into data[0]
In fact, you can say that data[i] is the same as *(data+i), which is the same as *(i+data), which in turn is the same as i[data], but you should not use this in real programs...)
I can compile #JackOLantern's code, but there is a warning: "pointless comparison of unsigned integer with zero":
And when run, it will abort like:
I have modified the code to the following and the warning disappeared and it can get right result:
#include <thrust/device_vector.h>
#define RADIUS 3
#define BLOCKSIZE 32
/*******************/
/* iDivUp FUNCTION */
/*******************/
// Divides a by b and adds one to the quotient if a is not a multiple of b.
int iDivUp(int a, int b)
{
    if ((a % b) == 0) {
        return a / b;
    }
    return a / b + 1;
}
/********************/
/* CUDA ERROR CHECK */
/********************/
// Wraps a CUDA runtime call: its status goes to gpuAssert along with the
// source location of the macro use.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a non-success status on stderr (error string, file, line) and
// exits with that status as the exit code when `abort` is true.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    const bool failed = (code != cudaSuccess);
    if (failed) {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    }
    if (failed && abort) {
        exit(code);
    }
}
/**********/
/* KERNEL */
/**********/
// 1-D stencil: out[g] = sum of in[g - RADIUS .. g + RADIUS], treating elements
// outside [0, N) as 0. The value is a windowed sum; no division is performed.
//   in  - input, N elements
//   out - output, N elements
//   N   - number of elements
// Launch layout: 1-D grid of 1-D blocks with blockDim.x == BLOCKSIZE (the
// shared tile is sized with BLOCKSIZE) and at least N threads in total.
// Shared memory: (BLOCKSIZE + 2 * RADIUS) unsigned ints, statically allocated.
__global__ void moving_average(unsigned int *in, unsigned int *out, int N) {
    __shared__ unsigned int temp[BLOCKSIZE + 2 * RADIUS];
    int gindexx = threadIdx.x + blockIdx.x * blockDim.x;
    int lindexx = threadIdx.x + RADIUS;
    // --- Read input elements into shared memory
    temp[lindexx] = (gindexx < N) ? in[gindexx] : 0;
    if (threadIdx.x < RADIUS) {
        // The halo loads are validated by the index that is actually read, so a
        // halo element is kept whenever it lies inside the data, even if the
        // loading thread's own element does not.
        const int left = gindexx - RADIUS;
        const int right = gindexx + BLOCKSIZE;
        temp[threadIdx.x] = (left >= 0 && left < N) ? in[left] : 0;
        temp[threadIdx.x + (RADIUS + BLOCKSIZE)] = (right < N) ? in[right] : 0;
    }
    // Every thread of the block reaches the barrier before anyone leaves.
    __syncthreads();
    // Threads past the end of the data only helped with staging; they must not
    // write to out.
    if (gindexx >= N) {
        return;
    }
    // --- Apply the stencil
    unsigned int result = 0;
    for (int offset = -RADIUS ; offset <= RADIUS ; offset++) {
        result += temp[lindexx + offset];
    }
    // --- Store the result
    out[gindexx] = result;
}
/********/
/* MAIN */
/********/
// Builds a constant input of N elements on the device, applies moving_average
// with BLOCKSIZE threads per block, verifies that launch and execution
// succeeded and prints each output element.
int main() {
    const int N = 55 + 2 * RADIUS;
    const unsigned int constant = 4;
    thrust::device_vector<unsigned int> d_in(N, constant);
    thrust::device_vector<unsigned int> d_out(N);
    unsigned int *src = thrust::raw_pointer_cast(d_in.data());
    unsigned int *dst = thrust::raw_pointer_cast(d_out.data());
    const int grid_size = iDivUp(N, BLOCKSIZE);
    moving_average<<<grid_size, BLOCKSIZE>>>(src, dst, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    // Bring the results to the host before printing.
    thrust::host_vector<unsigned int> h_out = d_out;
    int pos = 0;
    while (pos < N) {
        printf("Element i = %i; h_out = %i\n", pos, h_out[pos]);
        ++pos;
    }
    return 0;
}
The result is like this:

Does "more threads" mean more speed?

I've got a Jacobi implementation on CUDA, but the problem is:
I assign threads at this way:
// Minimum of two values. NOTE(review): the arguments are not parenthesized in
// the expansion, so passing expressions that contain lower-precedence
// operators would not group as expected.
#define imin(a,b) (a < b ? a : b)
int dimBlocks, dimThreads;
// Threads per block.
dimThreads = 256;
// Number of blocks: dim / dimThreads rounded up, capped at 32.
dimBlocks = imin(32, (dimThreads + dim - 1)/dimThreads);
But if I use 32 threads it's faster than using 256 threads or more...
I've got these results:
Sequential times:
9900 5.882000
9900 6.071000
Parallel times:
9900 1.341000 //using 32
9900 1.626000 //using 256
Where 9900 is matrix WIDTH... And we can see the following:
5.882 / 1.34 = 4.39
6.07 / 1.62 = 3.74
So 32 threads is more efficient than 256?
Sorry, I don't know if I should upload the code(since they are a bit long), if you request it I will do it.
EDIT:
//Based on doubletony algorithm
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "Jacobi.cuh"
#include "thrust\host_vector.h"
#include "thrust\device_vector.h"
#include "thrust\extrema.h"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <ctime>
#define imin(a,b) (a < b ? a : b)
// FUNCTION: __copy_vector
// PURPOSE:
//   Copies the first `dim` elements of `source` into `dest`.
//
// PARAMETERS:
//   source  double*  in   vector to be copied
//   dest    double*  out  receives the copy
//   dim     int      in   number of elements
//
// Each thread starts at its global index and advances by the total number of
// launched threads (gridDim.x * blockDim.x) until it passes `dim`.
// RETURN VALUE: none
//
__global__ void __copy_vector(double *source, double *dest, const int dim)
{
    const unsigned int step = gridDim.x * blockDim.x;
    for (int pos = blockDim.x * blockIdx.x + threadIdx.x; pos < dim; pos += step) {
        dest[pos] = source[pos];
    }
}
// name OF FUNCTION: __Jacobi_sum
// PURPOSE:
// For every row r < dim, computes the off-diagonal part of the
// matrix-vector product: resul[r] = sum over i != r of A[r*dim + i] * B[i].
// Every thread starts at its global index and advances by the total
// number of launched threads, so any 1D launch covers all rows.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value dim x dim matrix, indexed as A[row*dim + col]
// B double* value vector of dim entries
// resul double* reference receives the off-diagonal sums (dim entries)
// dim int value matrix/vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
// (none)
__global__ void __Jacobi_sum(const double *A,
                             const double *B,
                             double *resul,
                             const int dim)
{
    const unsigned int step = gridDim.x * blockDim.x;
    for(int row = blockIdx.x * blockDim.x + threadIdx.x; row < dim; row += step){
        // Widen before multiplying so row*dim cannot overflow int for large dim.
        const double *rowPtr = A + (size_t)row * dim;
        // Accumulate in a register and store once, instead of a global
        // read-modify-write on resul[row] in every iteration.
        double acc = 0;
        for(int i = 0; i < dim; i++)
            if(row != i)
                acc += rowPtr[i] * B[i];
        resul[row] = acc;
    }
    // The original ended with `__syncthreads;` (no call parentheses), which is an
    // expression with no effect. No barrier is needed: threads share no data here.
}
// name OF FUNCTION: __substract
// PURPOSE:
// Element-wise difference of two vectors: C[i] = A[i] - B[i] for i < dim.
// Every thread starts at its global index and advances by the total
// number of launched threads.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value minuend vector
// B double* value subtrahend vector
// C double* reference receives A - B
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
// (none)
__global__ void __substract(const double *A,
                            const double *B,
                            double *C,
                            const int dim)
{
    const unsigned int step = gridDim.x * blockDim.x;
    for(int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dim; pos += step){
        const double lhs = A[pos];
        const double rhs = B[pos];
        C[pos] = lhs - rhs;
    }
}
// name OF FUNCTION: __divide
// PURPOSE:
// Performs the Jacobi division step in place: B[i] /= A[i*dim + i]
// for every i < dim, i.e. each entry is divided by the matching
// diagonal element of A.
// There is no guard against a zero diagonal element.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* value dim x dim matrix, indexed as A[row*dim + col]
// B double* reference vector divided in place
// dim int value matrix/vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
// (none)
__global__ void __divide(const double *A, double *B, const int dim)
{
    const unsigned int step = blockDim.x * gridDim.x;
    for(int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dim; pos += step){
        const double diagonal = A[pos * dim + pos];
        B[pos] /= diagonal;
    }
}
// name OF FUNCTION: __absolute
// PURPOSE:
// Replaces every negative entry of A with its negation, in place,
// for the first `dim` entries. Non-negative entries are not written.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// A double* reference vector updated in place to |A[i]|
// dim int value vector dimension
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
// (none)
__global__ void __absolute(double *A, const int dim)
{
    const unsigned int step = blockDim.x * gridDim.x;
    for(int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dim; pos += step){
        const double value = A[pos];
        if(value < 0)
            A[pos] = -value;
    }
}
// name OF FUNCTION: Jacobi_Cuda
// PURPOSE:
// The function will calculate a X solution for a linear system
// using Jacobi's Method.
//
// PARAMETERS:
// name type value/reference description
// ---------------------------------------------------------------------
// Matrix_A double* value Matrix A(coefficients)
// Vector_B double* value Vector B
// Vector_X double* reference Solution
// dim int value Matrix Dimension
// e double value Error allowed
// maxIter int value Maximum iterations allowed
// RETURN VALUE:
// name type description
// ---------------------------------------------------------------------
//
void Jacobi_Cuda(const double *Matrix_A,
const double *Vector_B,
double *Vector_X,
const int dim,
const double e,
const int maxIter,
double *t)
{
/** Host variables **/
int iter = 0; // iter counter
double err = 1; // error between X^k and X^k-1
double *tmp; // temporary for thrust norm
double *norm; // Vector norm
tmp = (double *) malloc(sizeof(double) * dim);
norm = (double *) malloc(sizeof(double));
int dimBlocks, dimThreads;
dimThreads = 64;
dimBlocks = imin(32, (dim + dimThreads - 1)/dimThreads);
/** ************** **/
/** Device variables **/
double *d_Matrix_A, *d_Vector_B, *d_Vector_X, *d_Vector_Y, *d_Vector_Resul;
cudaMalloc((void**)&d_Matrix_A, sizeof(double) * dim * dim);
cudaMalloc((void**)&d_Vector_B, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_X, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_Y, sizeof(double) * dim);
cudaMalloc((void**)&d_Vector_Resul, sizeof(double) * dim);
/** **************** **/
/** Initialize **/
cudaMemcpy(d_Matrix_A, Matrix_A, sizeof(double) * dim * dim,
cudaMemcpyHostToDevice);
cudaMemcpy(d_Vector_B, Vector_B, sizeof(double) * dim, cudaMemcpyHostToDevice);
cudaMemcpy(d_Vector_X, Vector_X, sizeof(double) * dim, cudaMemcpyHostToDevice);
/** ********** **/
clock_t start,finish;
double totaltime;
start = clock();
/** Jacobi **/
while(err > e && iter < maxIter){
__copy_vector<<<dimBlocks, dimThreads>>>(d_Vector_X, d_Vector_Y, dim);
__Jacobi_sum<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_Y,
d_Vector_Resul, dim);
__substract<<<dimBlocks, dimThreads>>>(d_Vector_B, d_Vector_Resul,
d_Vector_X, dim);
__divide<<<dimBlocks, dimThreads>>>(d_Matrix_A, d_Vector_X, dim);
__substract<<<dimBlocks, dimThreads>>>(d_Vector_Y, d_Vector_X,
d_Vector_Resul, dim);
__absolute<<<dimBlocks, dimThreads>>>(d_Vector_Resul, dim);
cudaMemcpy(tmp, d_Vector_Resul, sizeof(double) * dim, cudaMemcpyDeviceToHost);
double *t = thrust::max_element(tmp, tmp + dim); //vector norm
err = *t;
iter++;
}
finish = clock();
totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
*t = totaltime;
cudaMemcpy(Vector_X, d_Vector_X, sizeof(double) * dim,
cudaMemcpyDeviceToHost);
if(iter == maxIter)
puts("Jacobi has reached maxIter!");
/** ****** **/
/** Free memory **/
cudaFree(d_Matrix_A);
cudaFree(d_Vector_B);
cudaFree(d_Vector_X);
cudaFree(d_Vector_Y);
cudaFree(d_Vector_Resul);
free(tmp);
free(norm);
/** *********** **/
}
It depends on your algorithm. Some algorithms are by definition non-parallelizable (calculating the Fibonacci series, for example). But here's a parallelizable Jacobi algorithm courtesy of Brown. Note that systems of equations CAN be solved either in serial or in parallel; it's just a matter of writing the code.
In short, it's impossible to know whether or not more threads = more speed unless you show us (or at least explain) the algorithm. As far as thread synchronization goes, CUDA is very (very) good at normalizing synchronization costs so (if your algorithm is proper), more threads should almost always yield more speed.
Fewer threads might be more efficient if the workload is small enough that the overheads of managing many threads cause the performance degradation.
...but without seeing your code it's hard to say. Personally I'm more inclined to believe it's just a bug in your code.

Issue With Large Array Sizes in CUDA

I am familiarizing myself with CUDA by writing a dot product calculator. I wanted to test it with large array sizes to do a timing study comparing two different ways of collecting the vector sum. However, when the size of the array is above 1024 I get errors. I am not sure where the problem is coming from. The card is a GTX460M with 1.5GB of RAM. I am using the card for display (this is a laptop). Aside from that, I am not sure where the issue could be coming from.
Here is the nvcc compile line:
nvcc D:\Research\CUDA\TestCode\test_dotProduct_1.cu --use_fast_math --gpu-architecture sm_13 --compiler-bindir="D:\Programming\VisualStudio\2010express\VC\bin" --machine 32 -o multi_dot.exe
I also seem to have trouble with compiling in 64 bit but that is another issue
Here is the output for an array of size 1024:
HOST CALCULATION: 357389824.000000
DEV PARA CALCULATION: 357389824.000000
DEV SERI CALCULATION: 357389824.000000
Here is the output for an array of size 2048:
HOST CALCULATION: 2861214720.000000
DEV PARA CALCULATION: -1.#INF00
DEV SERI CALCULATION: -1.#INF00
Here is my code:
/*Code for a CUDA test project doing a basic dot product with doubles
*
*
*
*/
#include <stdio.h>
#include <cuda.h>
// Element-wise product: array_c[i] = array_a[i] * array_b[i], one element per thread.
// The index is the global thread index, so a multi-block launch covers arrays
// longer than one block. The original used threadIdx.x alone and could never
// reach past the first block's worth of elements. Single-block launches behave
// exactly as before.
// There is no bounds check: launch exactly as many threads as there are elements.
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *array_c){
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    array_c[idx] = array_a[idx] * array_b[idx];
}
// One pairwise reduction step: thread t of block b adds the two neighbouring
// inputs vector[b + t*base] and vector[b + t*base + 1] and stores the result in
// sum[t + b].
// Both operands are read into registers and the block is synchronised before
// any thread writes. This makes in-place use (sum == vector) safe within a
// single block; the original let thread t overwrite an element that thread t/2
// had not yet read. The barrier does not order different blocks, so in-place
// reduction across several blocks is still not safe.
__global__ void GPU_parallelSumVector(double *vector, double *sum, int base){
    const int src = blockIdx.x + threadIdx.x * base;
    const double left = vector[src];
    const double right = vector[src + 1];
    __syncthreads(); // every thread reaches this; all reads finish before any write
    sum[threadIdx.x + blockIdx.x] = left + right;
}
// Adds the first `dim` entries of `vector`, in index order, onto sum[0].
// sum[0] is accumulated into, not overwritten, so it must hold the desired
// starting value on entry. Every launched thread runs the whole loop.
__global__ void GPU_serialSumVector(double *vector, double *sum, int dim){
    int pos = 0;
    while(pos < dim){
        *sum += vector[pos];
        ++pos;
    }
}
// Host reference dot product: adds first[i] * second[i] for i in [0, dim) onto
// *dot, in index order. *dot is accumulated into, so initialise it (normally
// to 0) before calling.
__host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){
    int pos = 0;
    while(pos < dim){
        *dot += first[pos] * second[pos];
        ++pos;
    }
}
// Fills vector[0..dim) with the arithmetic sequence start, start + incrSize,
// start + 2*incrSize, ... Each value is computed in int arithmetic and then
// stored as a double.
__host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){
    int pos = 0;
    while(pos < dim){
        const int value = start + pos * incrSize;
        vector[pos] = value;
        ++pos;
    }
}
int main(){
//define array size to be used
//int i,j;
const int VECTOR_LENGTH = 2048;
int SUM_BASE = 2;
int SUM_ROUNDS = VECTOR_LENGTH / SUM_BASE;
int ELEMENT_SIZE = sizeof(double);
// int currentSize = VECTOR_LENGTH;
//arrays for dot product
//host
double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product_parallel = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product_serial = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double host_dot_product = 0.0;
//fill with values
CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0);
CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0);
CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH);
//device
double *dev_array_a;
double *dev_array_b;
double *dev_array_c;
double *dev_dot_serial;
double *dev_dot_parallel;
//allocate cuda memory
cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_c, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot_parallel, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot_serial, ELEMENT_SIZE * VECTOR_LENGTH);
//copy to from host to device
cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot_parallel, &dev_dot_product_parallel, ELEMENT_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot_serial, &dev_dot_product_serial, ELEMENT_SIZE, cudaMemcpyHostToDevice);
//perform CUDA dot product
GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_array_c);
//condense a second vector in serial to compare speed up of tree condensing
GPU_serialSumVector<<<1,1>>>(dev_array_c, dev_dot_serial, VECTOR_LENGTH);
//condense vector (parallel)
for(int i=SUM_ROUNDS; i>1; i/=SUM_BASE){
GPU_parallelSumVector<<<1,i>>>(dev_array_c, dev_array_c, SUM_BASE);
}
GPU_parallelSumVector<<<1,1>>>(dev_array_c, dev_array_c, SUM_BASE);
//get computed product back to the machine
cudaMemcpy(dev_dot_product_parallel, dev_array_c, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(dev_dot_product_serial, dev_dot_serial, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost);
FILE *output = fopen("test_dotProduct_1.txt", "w");
fprintf(output, "HOST CALCULATION: %f \n", host_dot_product);
fprintf(output, "DEV PARA CALCULATION: %f \n", dev_dot_product_parallel[0]);
fprintf(output, "DEV SERI CALCULATION: %f \n", dev_dot_product_serial[0]);
/*
fprintf(output, "VALUES OF DEV_ARRAY_C VEC: \n");
for(int i=0; i<VECTOR_LENGTH; ++i){
fprintf(output, "value %i is: %f \n", i, dev_dot_product_parallel[i]);
}
*/
free(array_a);
free(array_b);
//free(host_dot_product);
cudaFree(dev_array_a);
cudaFree(dev_array_b);
cudaFree(dev_array_c);
cudaFree(dev_dot_parallel);
cudaFree(dev_dot_serial);
return(0);
}
The maximum number of threads per block for your card is 1024, which is why you are getting an error (for some older cards it's 512). You either need to split up your blocks to use multiple dimensions (the total across x, y and z is still limited to 1024 threads per block on your card) or use more than one block in your grid.

Resources