CUDA triple loop in C

I am pretty new to CUDA and I'm really struggling with converting C code to CUDA C. It builds successfully but keeps crashing. The triple loop function is wrong for sure and I have no idea what I should change.
Function call:
for (z=0;z<=max;z++)
{
correlationsum=coefficient(x, n, dim, z);
printf("result for epsilon %d returns %ld\n", z, correlationsum);
}
Function
long coefficient(int vctr[40000], long numberofpoints, int coefficientrow, int epsilon)
{
long i, j, k, sum, numberofpairs;
long sq_epsilon;
sq_epsilon=epsilon*epsilon;
numberofpairs=0;
for (i=1;i<=numberofpoints-coefficientrow;i++)
{
sum=0;
for (j=i+1;j<=numberofpoints+1-coefficientrow;j++)
{
for (k=0;k<coefficientrow;k++)
{
sum=sum+(vctr[i+k]-vctr[j+k])*(vctr[i+k]-vctr[j+k]);
}
if(sum<sq_epsilon)
{
numberofpairs++;
sum=0;
}
}
}
return (numberofpairs);
}
I have problems limiting the function in the GPU part so it doesn't go out of bounds (e.g. keeping k less than coefficientrow above). I saw that it is possible to assign block/thread IDs and use an if statement. I have tried it, but with a triple for loop it is kinda... strange.
Here is almost full code.
#define THREADS 1024
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
int sum;
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
numbofpairs++;
sum = 0;
}}}}
int main()
{
int n, dim, max, z;
int *d_n, *d_dim, *d_z, *d_x, *d_numbofpairs;
int x[40000], correlation_sum = 0;
n=10;
max=10;
dim=3;
cudaMalloc((void **)&d_n, sizeof(int));
cudaMalloc((void **)&d_dim, sizeof(int));
cudaMalloc((void **)&d_z, sizeof(int));
cudaMalloc((void **)&d_x, sizeof(int));
cudaMalloc((void **)&d_numbofpairs, sizeof(int));
cudaMemcpy(d_n, &n, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_dim, &dim, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
for (z = 0; z <= max; z++)
{
cudaMemcpy(d_z, &z, sizeof(int), cudaMemcpyHostToDevice);
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
cudaFree(d_n);
cudaFree(d_dim);
cudaFree(d_z);
cudaFree(d_x);
cudaFree(d_numbofpairs);
return 0;
}
I would like some help or tips what to change, what is wrong and why it keeps crashing so I could fix it. Thank you!
EDIT: I completed some parts, sorry, my bad. As for threads and blocks, I am very confused: my GPU reports 1024 threads per block, and I'm not sure whether that's the number I should be using.

So the "crash" is a seg fault. A seg fault is a problem in host code, not kernel code (although it could be in your usage of the CUDA API).
Your code has a variety of problems.
1. This might cause trouble:
int x[40000]
this creates a large stack-based allocation. Instead I suggest doing a dynamic allocation:
int *x = (int *)malloc(40000*sizeof(int));
dynamic allocations have much higher size limits.
2. It's fairly clear from your kernel usage that you intend to use the whole x vector. Therefore, this allocation on the device for d_x is not correct:
cudaMalloc((void **)&d_x, sizeof(int));
we need the same size allocation on the device as what we have on the host:
cudaMalloc((void **)&d_x, 40000*sizeof(int));
3. Corresponding to 2, you probably want to copy the entire x vector to the device (it's not really clear, since your code doesn't show the initialization of x), and you have incorrectly taken the address of x here, but x is already a pointer:
cudaMemcpy(d_x, &x, sizeof(int), cudaMemcpyHostToDevice);
so we want something like this instead:
cudaMemcpy(d_x, x, 40000*sizeof(int), cudaMemcpyHostToDevice);
4. Your other kernel parameters appear to be scalar parameters. You're mostly handling those incorrectly as well:
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
for a parameter like numberofpoints specified as above (one-way pass to function), we simply pass by value the host quantity we want when calling the kernel, just like we would with an ordinary C function. So this kernel invocation is not correct (even though it appears to compile):
coefficient << <1, THREADS >> >(d_x, *d_n, *d_dim, *d_z, d_numbofpairs);
instead we want to pass just the host variables, by value:
coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
since d_numbofpairs is going both ways, your usage is correct there.
I would also recommend adding proper cuda error checking to your code.
Here is a fully worked example with the above errors fixed. I think the results are bogus, of course, because the input data (e.g. x) is not initialized.
$ cat t724.cu
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
#define THREADS 1024
__global__ void coefficient(int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int k = blockIdx.z * blockDim.z + threadIdx.z;
int sum;
numbofpairs = 0;
int sq_epsilon = epsilon*epsilon;
if (i <= numberofpoints - coefficient_row)
{
sum = 0;
if (j <= numberofpoints + 1 - coefficient_row)
{
if (k < coefficient_row)
sum = sum + (vctr[i + k] - vctr[j + k])*(vctr[i + k] - vctr[j + k]);
if (sum < sq_epsilon){
numbofpairs++;
sum = 0;
}}}}
int main()
{
int n, dim, max, z;
int *d_x, *d_numbofpairs;
int correlation_sum = 0;
int *x = (int *)malloc(40000*sizeof(int));
if (x == NULL) {printf("malloc fail\n"); return -1;}
n=10;
max=10;
dim=3;
cudaMalloc((void **)&d_x, 40000*sizeof(int));
cudaCheckErrors("cudaMalloc 1 fail");
cudaMalloc((void **)&d_numbofpairs, sizeof(int));
cudaCheckErrors("cudaMalloc 2 fail");
cudaMemcpy(d_x, x, 40000*sizeof(int), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy 1 fail");
for (z = 0; z <= max; z++)
{
coefficient << <1, THREADS >> >(d_x, n, dim, z, d_numbofpairs);
cudaMemcpy(&correlation_sum, d_numbofpairs, sizeof(int), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
printf("result for epsilon %d returns %d\n", z, correlation_sum);
}
cudaFree(d_x);
cudaFree(d_numbofpairs);
return 0;
}
$ nvcc -o t724 t724.cu
$ ./t724
result for epsilon 0 returns 3
result for epsilon 1 returns 3
result for epsilon 2 returns 3
result for epsilon 3 returns 3
result for epsilon 4 returns 3
result for epsilon 5 returns 3
result for epsilon 6 returns 3
result for epsilon 7 returns 3
result for epsilon 8 returns 3
result for epsilon 9 returns 3
result for epsilon 10 returns 3
$
Note that I didn't make any changes to your kernel code.
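One additional note (my own, beyond the original answer): the kernel as posted still has logic bugs that the host-side fixes don't address. numbofpairs = 0 and numbofpairs++ operate on the pointer itself, not on the value it points to, so *numbofpairs is never written, and many threads incrementing one counter requires an atomic operation anyway. A minimal sketch of what the kernel logic might look like, with each thread handling one (i, j) pair and k kept as a serial per-thread loop:
__global__ void coefficient_fixed(const int *vctr, int numberofpoints, int coefficient_row, int epsilon, int *numbofpairs){
    // mirror the serial loops: i runs from 1, j from i+1
    int i = blockIdx.x * blockDim.x + threadIdx.x + 1;
    int j = blockIdx.y * blockDim.y + threadIdx.y + 2;
    int sq_epsilon = epsilon*epsilon;
    if (i <= numberofpoints - coefficient_row && j > i && j <= numberofpoints + 1 - coefficient_row){
        int sum = 0;
        for (int k = 0; k < coefficient_row; k++){   // per-pair distance accumulation
            int d = vctr[i + k] - vctr[j + k];
            sum += d * d;
        }
        if (sum < sq_epsilon)
            atomicAdd(numbofpairs, 1);               // update the value, not the pointer
    }
}
The host would need to zero *numbofpairs before each launch (e.g. with cudaMemset) and launch a 2D grid large enough to cover all (i, j) pairs.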

Related

Aggregate many small arrays in fewer large arrays by basic function

I have many small 2D arrays (e.g. M x 32 x 40) and fewer, larger 2D arrays (e.g. N x 200 x 300).
I would like to 'put' the smaller matrices at indices n,i,j in the larger arrays (upper left index of the array at batch index n). These small arrays could overlap and should be aggregated by functions that are associative and commutative say plus, multiply, etc.
I figure this is a pretty basic scenario that many people should have come across, right? Is there a CUDA implementation that supports this in an efficient way?
Typical values M = 10^6, N = 10^4
This is a reduction operation.
In addition to what is expressed in the comments, I'll make the assumption that the distribution of the M matrices, in terms of which of the N matrices they belong to, is relatively uniform, i.e. evenly distributed. This means, for the dimensions given, that there will be approximately 100 of the M matrices intended to update N matrix 0, 100 for N matrix 1, and so on. Furthermore, if we inspect the n array, we would observe a uniformly random pattern of indices (i.e. no clumping or grouping).
Given that, in what may be a first for me, I'll suggest a lock/critical section algorithm, using the plumbing from here. Each threadblock will take one of the M arrays, and attempt to acquire a lock so that it can update the appropriate N array. When finished, release the lock.
I considered other approaches as well, some of which are evident in the code. In any event, for the stated conditions, the lock based approach had a kernel runtime of about 40ms on my V100 GPU, which was the best I observed.
I would also note that the stated dimensions result in a data working set of ~8GB. Not that that is a problem, just be aware if running this code as-is on your laptop GPU.
Here's an example:
$ cat t34.cu
#include <iostream>
#include <cstdlib>
const int N = 10000;
const int M = 1000000;
const int Mx = 32;
const int My = 40;
const int Nx = 200;
const int Ny = 300;
const int nTPB = 256;
template <typename T>
__host__ __device__
T reduction_op(T &a, const T &b){ return a+b;}
template <typename T>
__global__ void k(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M){
for (int ii = 0; ii < num_M; ii++){
if (n[ii] == blockIdx.x) {
for (int jj = threadIdx.x; jj < Mx*My; jj += blockDim.x){
int y = jj/Mx;
int x = jj - (y*Mx);
N[blockIdx.x*Nx*Ny + i[ii] + (j[ii]+y)*Nx + x] = reduction_op(
N[blockIdx.x*Nx*Ny + i[ii] + (j[ii]+y)*Nx + x], M[ii*Mx*My + y*Mx + x]);}
}
__syncthreads();}
}
// assumes Ny is whole-number divisible by sl
template <typename T>
__global__ void ki(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M, const int sl){
extern __shared__ T s[];
for (int c = 0; c < Ny; c+=sl){ // process per chunk of N array
// load shared
for (int t = threadIdx.x; t < sl*Nx; t += blockDim.x) s[t] = N[blockIdx.x*Nx*Ny + c*Nx + t];
__syncthreads();
// process chunk stack
for (int ii = 0; ii < num_M; ii++){ // iterate through "stack"
if ((n[ii] == blockIdx.x) && (j[ii] < (c+sl)) && ((j[ii]+My) > c)) {
for (int jj = threadIdx.x; jj < sl*Mx; jj += blockDim.x){
int y = jj/Mx;
int x = jj - (y*Mx);
//y += c;
if ((y+c >= j[ii]) && (y+c < (j[ii]+My)))
s[y*Nx+x+i[ii]] = reduction_op(s[y*Nx+x+i[ii]], M[ii*Mx*My + (y+c-j[ii])*Mx + x]);}
}
__syncthreads();}
// save shared
for (int t = threadIdx.x; t < sl*Nx; t += blockDim.x) N[blockIdx.x*Nx*Ny + c*Nx + t] = s[t];
}
}
template <typename T>
__global__ void ka(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M){
int x = threadIdx.x;
for (int y = threadIdx.y; y < My; y += blockDim.y)
atomicAdd(N+n[blockIdx.x]*Nx*Ny+(j[blockIdx.x]+y)*Nx+i[blockIdx.x]+x, M[blockIdx.x*Mx*My+y*Mx+x]);
}
__device__ void acquire_semaphore(volatile int *lock){
while (atomicCAS((int *)lock, 0, 1) != 0);
}
__device__ void release_semaphore(volatile int *lock){
*lock = 0;
__threadfence();
}
template <typename T>
__global__ void kl(const T * __restrict__ M, T * __restrict__ N, const int * __restrict__ n, const int * __restrict__ i, const int * __restrict__ j, const int num_M, int * __restrict__ locks){
if ((threadIdx.x == 0) && (threadIdx.y == 0))
acquire_semaphore(locks+n[blockIdx.x]);
__syncthreads();
//begin critical section
int x = threadIdx.x;
for (int y = threadIdx.y; y < My; y += blockDim.y){
N[n[blockIdx.x]*Nx*Ny + i[blockIdx.x] + (j[blockIdx.x]+y)*Nx + x] = reduction_op(
N[n[blockIdx.x]*Nx*Ny + i[blockIdx.x] + (j[blockIdx.x]+y)*Nx + x], M[blockIdx.x*Mx*My + y*Mx + x]);}
// end critical section
__threadfence(); // not strictly necessary for the lock, but to make any global updates in the critical section visible to other threads in the grid
__syncthreads();
if ((threadIdx.x == 0) && (threadIdx.y == 0))
release_semaphore(locks+n[blockIdx.x]);
}
typedef float mt;
int main(){
mt *d_M, *h_M, *d_N, *h_N, *r1, *r2;
int *d_n, *h_n, *d_i, *h_i, *d_j, *h_j;
h_M = new mt[M*Mx*My];
h_N = new mt[N*Nx*Ny];
r1 = new mt[N*Nx*Ny];
r2 = new mt[N*Nx*Ny];
h_n = new int[M];
h_i = new int[M];
h_j = new int[M];
cudaMalloc(&d_M, M*Mx*My*sizeof(mt));
cudaMalloc(&d_N, N*Nx*Ny*sizeof(mt));
cudaMalloc(&d_n, M*sizeof(int));
cudaMalloc(&d_i, M*sizeof(int));
cudaMalloc(&d_j, M*sizeof(int));
for (int i = 0; i < M; i++){
h_n[i] = rand()%N;
h_i[i] = rand()%(Nx - Mx);
h_j[i] = rand()%(Ny - My);}
for (int i = 0; i < N*Nx*Ny; i++) h_N[i] = (mt)(i%3);
for (int i = 0; i < M*Mx*My; i++) h_M[i] = (mt)((i%3)+1);
cudaMemcpy(d_M, h_M, M*Mx*My*sizeof(mt), cudaMemcpyHostToDevice);
cudaMemcpy(d_N, h_N, N*Nx*Ny*sizeof(mt), cudaMemcpyHostToDevice);
cudaMemcpy(d_n, h_n, M*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_i, h_i, M*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_j, h_j, M*sizeof(int), cudaMemcpyHostToDevice);
#ifdef USE_SINGLE_N
cudaMemset(d_n, 0, M*sizeof(int));
#endif
#if 0
const int sl = 40;
const int sb = sl * Nx * sizeof(mt);
ki<<<N, nTPB, sb>>>(d_M, d_N, d_n, d_i, d_j, M, sl);
cudaMemcpy(r2, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
#endif
dim3 block(Mx, 8);
#if 0
ka<<<M, block>>>(d_M, d_N, d_n, d_i, d_j, M);
cudaMemcpy(r2, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
#endif
int *d_locks;
cudaMalloc(&d_locks, N*sizeof(int));
cudaMemset(d_locks, 0, N*sizeof(int));
kl<<<M, block>>>(d_M, d_N, d_n, d_i, d_j, M, d_locks);
cudaMemcpy(r2, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
cudaMemcpy(d_N, h_N, N*Nx*Ny*sizeof(mt), cudaMemcpyHostToDevice);
k<<<N, nTPB>>>(d_M, d_N, d_n, d_i, d_j, M);
cudaMemcpy(r1, d_N, N*Nx*Ny*sizeof(mt), cudaMemcpyDeviceToHost);
for (int i = 0; i < N*Nx*Ny; i++) if (r1[i] != r2[i]) {std::cout << "mismatch at: " << i << " was: " << r2[i] << " should be: " << r1[i] << std::endl; return 0;}
}
$ nvcc -o t34 t34.cu -O3 -lineinfo
$ nvprof ./t34
==17970== NVPROF is profiling process 17970, command: ./t34
==17970== Profiling application: ./t34
==17970== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 34.57% 3.09036s 2 1.54518s 1.54294s 1.54742s [CUDA memcpy DtoH]
33.18% 2.96615s 1 2.96615s 2.96615s 2.96615s void k<float>(float const *, float*, int const *, int const *, int const *, int)
31.81% 2.84401s 6 474.00ms 1.4255ms 1.27035s [CUDA memcpy HtoD]
0.45% 39.949ms 1 39.949ms 39.949ms 39.949ms void kl<float>(float const *, float*, int const *, int const *, int const *, int, int*)
0.00% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memset]
API calls: 96.13% 8.94558s 8 1.11820s 1.9203ms 4.51030s cudaMemcpy
3.60% 334.59ms 6 55.765ms 277.58us 330.37ms cudaMalloc
0.15% 13.752ms 8 1.7190ms 1.3268ms 2.2025ms cuDeviceTotalMem
0.11% 10.472ms 808 12.959us 172ns 728.50us cuDeviceGetAttribute
0.01% 997.81us 8 124.73us 100.93us 176.73us cuDeviceGetName
0.00% 69.047us 2 34.523us 32.349us 36.698us cudaLaunchKernel
0.00% 68.013us 1 68.013us 68.013us 68.013us cudaMemset
0.00% 46.172us 8 5.7710us 1.8940us 23.025us cuDeviceGetPCIBusId
0.00% 8.5060us 16 531ns 260ns 1.5030us cuDeviceGet
0.00% 3.7870us 8 473ns 229ns 881ns cuDeviceGetUuid
0.00% 3.3980us 3 1.1320us 610ns 2.0780us cuDeviceGetCount
$
Extended discussion:
On performance:
This is a memory bound algorithm. Therefore, we can estimate optimal kernel performance by determining the minimum number of memory reads and writes needed to perform the operation, then dividing by the available memory bandwidth, to determine the optimal or lower-bound kernel duration. Unfortunately, the minimum number of reads and writes depends on the positioning of the M matrices, so it cannot easily be determined in general without inspecting the n, i, and j arrays.
However, we can look for another way to estimate. Another approach would be to observe that each M matrix element update requires reading 2 values and writing one value. If we then use that as our estimate, we come up with M*Mx*My*3*sizeof(element_of_M)/GPU_memory_bandwidth. On my V100 (~700GB/s BW) this works out to about a 20ms lower bound on kernel duration.
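Plugging in the stated sizes (float elements, as in the example code) makes that estimate concrete:
bytes moved ≈ M * Mx * My * 3 * sizeof(float) = 10^6 * 32 * 40 * 3 * 4 bytes ≈ 15.4 GB
lower bound ≈ 15.4 GB / 700 GB/s ≈ 22 ms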
On approaches considered:
"naive" approach, kernel k: Each threadblock will be responsible for one of the N matrices, and will iterate through the M matrices, inspecting n to determine if the M matrices will update the assigned N matrix. This gives a non-optimal run time of ~3s but seems to be mostly invariant performance-wise based on the distribution of n, and can use an "arbitrary" reduction op.
attempt at "optimal" approach, kernel ki: Each threadblock will be responsible for one of the N matrices, but will only load a chunk of that matrix at a time. It will then proceed through the M matrices updating that chunk, similar to the k kernel. This necessitates more loops through the matrices, but should "almost" only load or save each global memory item the minimum number of times necessary. Nevertheless, the run time is really long, ~40s.
atomic approach, kernel ka: Each threadblock will be responsible for one of the M matrices, and will atomically update the relevant N matrix. Simplicity. And the runtime is "fast" at ~40ms. (The atomic approach may be even faster than this for non-uniform n distributions. I witnessed kernel runtimes as low as 8ms!) However this is not readily generalizable to operations that don't have an atomic equivalent, such as multiply.
lock based approach, kernel kl: Like the atomic approach, each threadblock will be responsible for one of the M matrices, and will first acquire a lock on the relevant N matrix. The lock means that atomics are not necessary. For the uniformly distributed n case presented, it has about the same performance as the atomic case. It has the benefit that it can handle other reduction ops, such as multiply, readily. A disadvantage is that in the presence of non-uniformly-random distribution in n the performance can suffer, with a worst case in the ballpark of the naive kernel (3-5s).
Overall, if the requirement for an arbitrary reduction operator can be dropped (e.g. only addition is needed), then the atomic method may be best.
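As an aside on the multiply limitation: an operation without a hardware atomic can be emulated with an atomicCAS retry loop. This is my own sketch, not part of the answer above, and under heavy contention it can be slow, which is part of why the lock approach is attractive:
__device__ float atomicMul(float *address, float val){
    int *address_as_int = (int *)address;
    int old = *address_as_int, assumed;
    do {
        assumed = old;
        // try to swap in the product; retry if another thread changed the value first
        old = atomicCAS(address_as_int, assumed, __float_as_int(val * __int_as_float(assumed)));
    } while (assumed != old);
    return __int_as_float(old);
}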

Sparse matrix addition in CUDA

I'm considering using CUDA C for a particular problem involving sparse matrix addition.
The docs seem to discuss only operations between a sparse and a dense object.
This leads me to think either: sparse-sparse addition is so trivial it may just be a case of using '+' or similar; or sparse-sparse addition is not implemented. Which is correct, and where can I find the docs?
CUSPARSE has some routines that can operate on two operands that are both sparse matrices, for addition and multiplication.
You can do sparse matrix - sparse matrix addition with CUSPARSE using the cusparse<t>csrgeam function:
This function performs following matrix-matrix operation
C=α∗A+β∗B
where A, B, and C are m×n sparse matrices (defined in CSR storage format ...
Although dense matrix addition is fairly trivial (could be about 3 lines of code, whether in serial or parallel), I personally would not put sparse addition of two CSR matrices at the same level of triviality, especially if the goal is to perform it in parallel. You could try writing your own routine; I wouldn't.
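For contrast, the dense version really is about that trivial; a sketch (mine, for illustration):
__global__ void denseAdd(const float *A, const float *B, float *C, int n){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) C[i] = A[i] + B[i];   // one thread per element, nothing to merge
}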
Sparse-sparse addition is surprisingly tricky unless the matrices have the same sparsity pattern. (If they do, just add the elements of the data vectors and call it a day.) You'll probably note that even calling the csrgeam method takes a couple of steps - one to calculate the size of the resulting matrix, and then another to do the operation. The reason is that the resulting matrix contains the union of the two nonzero patterns.
If this wasn't tricky enough, let's talk about the parallel case, which you're obviously interested in since you're talking about CUDA. If you're in the CSR format, you could parallelize by rows (something like 1 CUDA thread per matrix row as a first pass). You would want to do a first pass, possibly single-threaded, to compute the row pointers and column indices, and then a parallel pass to actually run the computation.
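To make that per-row approach concrete, here is a rough sketch (my own illustration; all names are hypothetical). One thread per row merges the sorted column lists of A and B; run it once with countOnly = true to get per-row output sizes, exclusive-scan those into rowPtrC, then run it again to fill in the values:
#include <limits.h>
__global__ void csr_add_rows(int m,
        const int *rowPtrA, const int *colA, const float *valA,
        const int *rowPtrB, const int *colB, const float *valB,
        const int *rowPtrC, int *colC, float *valC,
        int *rowCount, bool countOnly)
{
    int r = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= m) return;
    int a = rowPtrA[r], aEnd = rowPtrA[r + 1];
    int b = rowPtrB[r], bEnd = rowPtrB[r + 1];
    int out = countOnly ? 0 : rowPtrC[r];
    while (a < aEnd || b < bEnd) {               // merge two sorted column lists
        int ca = (a < aEnd) ? colA[a] : INT_MAX;
        int cb = (b < bEnd) ? colB[b] : INT_MAX;
        int c; float v;
        if (ca == cb)     { c = ca; v = valA[a++] + valB[b++]; }  // overlapping entry: add
        else if (ca < cb) { c = ca; v = valA[a++]; }
        else              { c = cb; v = valB[b++]; }
        if (!countOnly) { colC[out] = c; valC[out] = v; }
        out++;
    }
    if (countOnly) rowCount[r] = out;            // union size of this row
}
Note the load imbalance (row lengths vary) and the extra scan pass; this is exactly the sort of bookkeeping that makes csrgeam the sensible choice.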
Following Robert Crovella's answer, here is a fully worked example of how to sum two sparse matrices in CUDA:
#include <stdio.h>
#include <assert.h>
#include <cusparse.h>
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) { exit(code); }
}
}
void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); }
/***************************/
/* CUSPARSE ERROR CHECKING */
/***************************/
static const char *_cusparseGetErrorEnum(cusparseStatus_t error)
{
switch (error)
{
case CUSPARSE_STATUS_SUCCESS:
return "CUSPARSE_STATUS_SUCCESS";
case CUSPARSE_STATUS_NOT_INITIALIZED:
return "CUSPARSE_STATUS_NOT_INITIALIZED";
case CUSPARSE_STATUS_ALLOC_FAILED:
return "CUSPARSE_STATUS_ALLOC_FAILED";
case CUSPARSE_STATUS_INVALID_VALUE:
return "CUSPARSE_STATUS_INVALID_VALUE";
case CUSPARSE_STATUS_ARCH_MISMATCH:
return "CUSPARSE_STATUS_ARCH_MISMATCH";
case CUSPARSE_STATUS_MAPPING_ERROR:
return "CUSPARSE_STATUS_MAPPING_ERROR";
case CUSPARSE_STATUS_EXECUTION_FAILED:
return "CUSPARSE_STATUS_EXECUTION_FAILED";
case CUSPARSE_STATUS_INTERNAL_ERROR:
return "CUSPARSE_STATUS_INTERNAL_ERROR";
case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
case CUSPARSE_STATUS_ZERO_PIVOT:
return "CUSPARSE_STATUS_ZERO_PIVOT";
}
return "<unknown>";
}
inline void __cusparseSafeCall(cusparseStatus_t err, const char *file, const int line)
{
if (CUSPARSE_STATUS_SUCCESS != err) {
fprintf(stderr, "CUSPARSE error in file '%s', line %d, error %s\nterminating!\n", file, line,
_cusparseGetErrorEnum(err));
assert(0);
}
}
extern "C" void cusparseSafeCall(cusparseStatus_t err) { __cusparseSafeCall(err, __FILE__, __LINE__); }
/********/
/* MAIN */
/********/
int main() {
// --- Initialize cuSPARSE
cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));
// --- Initialize matrix descriptors
cusparseMatDescr_t descrA, descrB, descrC;
cusparseSafeCall(cusparseCreateMatDescr(&descrA));
cusparseSafeCall(cusparseCreateMatDescr(&descrB));
cusparseSafeCall(cusparseCreateMatDescr(&descrC));
const int M = 5; // --- Number of rows
const int N = 6; // --- Number of columns
const int nnz1 = 10; // --- Number of non-zero elements for matrix A
const int nnz2 = 8; // --- Number of non-zero elements for matrix B
// --- Host vectors defining the first block-sparse matrix
float *h_csrValA = (float *)malloc(nnz1 * sizeof(float));
int *h_csrRowPtrA = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndA = (int *)malloc(nnz1 * sizeof(int));
// --- Host vectors defining the second block-sparse matrix
float *h_csrValB = (float *)malloc(nnz2 * sizeof(float));
int *h_csrRowPtrB = (int *)malloc((M + 1) * sizeof(int));
int *h_csrColIndB = (int *)malloc(nnz2 * sizeof(int));
h_csrValA[0] = 1.f;
h_csrValA[1] = 7.f;
h_csrValA[2] = 1.f;
h_csrValA[3] = 3.f;
h_csrValA[4] = -1.f;
h_csrValA[5] = 10.f;
h_csrValA[6] = 1.f;
h_csrValA[7] = -4.f;
h_csrValA[8] = 1.f;
h_csrValA[9] = 3.f;
h_csrRowPtrA[0] = 0;
h_csrRowPtrA[1] = 3;
h_csrRowPtrA[2] = 5;
h_csrRowPtrA[3] = 6;
h_csrRowPtrA[4] = 8;
h_csrRowPtrA[5] = 10;
h_csrColIndA[0] = 0;
h_csrColIndA[1] = 3;
h_csrColIndA[2] = 5;
h_csrColIndA[3] = 2;
h_csrColIndA[4] = 4;
h_csrColIndA[5] = 1;
h_csrColIndA[6] = 0;
h_csrColIndA[7] = 3;
h_csrColIndA[8] = 3;
h_csrColIndA[9] = 5;
h_csrValB[0] = 3.f;
h_csrValB[1] = 1.f;
h_csrValB[2] = -1.f;
h_csrValB[3] = 1.f;
h_csrValB[4] = -4.f;
h_csrValB[5] = -3.f;
h_csrValB[6] = -2.f;
h_csrValB[7] = 10.f;
h_csrRowPtrB[0] = 0;
h_csrRowPtrB[1] = 2;
h_csrRowPtrB[2] = 4;
h_csrRowPtrB[3] = 5;
h_csrRowPtrB[4] = 7;
h_csrRowPtrB[5] = 8;
h_csrColIndB[0] = 0;
h_csrColIndB[1] = 4;
h_csrColIndB[2] = 0;
h_csrColIndB[3] = 1;
h_csrColIndB[4] = 3;
h_csrColIndB[5] = 0;
h_csrColIndB[6] = 1;
h_csrColIndB[7] = 3;
// --- Device vectors defining the block-sparse matrices
float *d_csrValA; gpuErrchk(cudaMalloc(&d_csrValA, nnz1 * sizeof(float)));
int *d_csrRowPtrA; gpuErrchk(cudaMalloc(&d_csrRowPtrA, (M + 1) * sizeof(int)));
int *d_csrColIndA; gpuErrchk(cudaMalloc(&d_csrColIndA, nnz1 * sizeof(int)));
float *d_csrValB; gpuErrchk(cudaMalloc(&d_csrValB, nnz2 * sizeof(float)));
int *d_csrRowPtrB; gpuErrchk(cudaMalloc(&d_csrRowPtrB, (M + 1) * sizeof(int)));
int *d_csrColIndB; gpuErrchk(cudaMalloc(&d_csrColIndB, nnz2 * sizeof(int)));
gpuErrchk(cudaMemcpy(d_csrValA, h_csrValA, nnz1 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrA, h_csrRowPtrA, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndA, h_csrColIndA, nnz1 * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrValB, h_csrValB, nnz2 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrRowPtrB, h_csrRowPtrB, (M + 1) * sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_csrColIndB, h_csrColIndB, nnz2 * sizeof(int), cudaMemcpyHostToDevice));
// --- Summing the two matrices
int baseC, nnz3;
// --- nnzTotalDevHostPtr points to host memory
int *nnzTotalDevHostPtr = &nnz3;
cusparseSafeCall(cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST));
int *d_csrRowPtrC; gpuErrchk(cudaMalloc(&d_csrRowPtrC, (M + 1) * sizeof(int)));
cusparseSafeCall(cusparseXcsrgeamNnz(handle, M, N, descrA, nnz1, d_csrRowPtrA, d_csrColIndA, descrB, nnz2, d_csrRowPtrB, d_csrColIndB, descrC, d_csrRowPtrC, nnzTotalDevHostPtr));
if (NULL != nnzTotalDevHostPtr) {
nnz3 = *nnzTotalDevHostPtr;
}
else{
gpuErrchk(cudaMemcpy(&nnz3, d_csrRowPtrC + M, sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(&baseC, d_csrRowPtrC, sizeof(int), cudaMemcpyDeviceToHost));
nnz3 -= baseC;
}
int *d_csrColIndC; gpuErrchk(cudaMalloc(&d_csrColIndC, nnz3 * sizeof(int)));
float *d_csrValC; gpuErrchk(cudaMalloc(&d_csrValC, nnz3 * sizeof(float)));
float alpha = 1.f, beta = 1.f;
cusparseSafeCall(cusparseScsrgeam(handle, M, N, &alpha, descrA, nnz1, d_csrValA, d_csrRowPtrA, d_csrColIndA, &beta, descrB, nnz2, d_csrValB, d_csrRowPtrB, d_csrColIndB, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC));
// --- Transforming csr to dense format
float *d_C; gpuErrchk(cudaMalloc(&d_C, M * N * sizeof(float)));
cusparseSafeCall(cusparseScsr2dense(handle, M, N, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, d_C, M));
float *h_C = (float *)malloc(M * N * sizeof(float));
gpuErrchk(cudaMemcpy(h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost));
// --- m is row index, n column index
for (int m = 0; m < M; m++) {
for (int n = 0; n < N; n++) {
printf("%f ", h_C[m + n * M]);
}
printf("\n");
}
return 0;
}

Unspecified launch failure - parallel scan in CUDA

I am using a GeForce GT 520 (compute capability 2.1) to run a program that performs the scan operation on an array of int elements. Here's the code:
/*
This is an implementation of the parallel scan algorithm.
Only a single block of threads is used. Maximum array size = 2048
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define errorCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s, file: %s line: %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void blelloch_scan(int* d_in, int* d_out, int n)
{
extern __shared__ int temp[];// allocated on invocation
int thid = threadIdx.x;
int offset = 1;
temp[2*thid] = d_in[2*thid]; // load input into shared memory
temp[2*thid+1] = d_in[2*thid+1];
// build sum in place up the tree
for (int d = n>>1; d > 0; d >>= 1)
{
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
temp[bi] += temp[ai];
}
offset *= 2;
}
// clear the last element
if (thid == 0)
temp[n - 1] = 0;
__syncthreads();
// traverse down tree & build scan
for (int d = 1; d < n; d *= 2)
{
offset >>= 1;
__syncthreads();
if (thid < d)
{
int ai = offset*(2*thid+1)-1;
int bi = offset*(2*thid+2)-1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
__syncthreads();
d_out[2*thid] = temp[2*thid]; // write results to device memory
d_out[2*thid+1] = temp[2*thid+1];
}
int main(int argc, char **argv)
{
int ARRAY_SIZE;
if(argc != 2)
{
printf("Input Syntax: ./a.out <number-of-elements>\nProgram terminated.\n");
exit (1);
}
else
ARRAY_SIZE = (int) atoi(*(argv+1));
int *h_in, *h_out, *d_in, *d_out, i;
h_in = (int *) malloc(sizeof(int) * ARRAY_SIZE);
h_out = (int *) malloc(sizeof(int) * ARRAY_SIZE);
cudaSetDevice(0);
cudaDeviceProp devProps;
if (cudaGetDeviceProperties(&devProps, 0) == 0)
{
printf("Using device %d:\n", 0);
printf("%s; global mem: %dB; compute v%d.%d; clock: %d kHz\n",
devProps.name, (int)devProps.totalGlobalMem,
(int)devProps.major, (int)devProps.minor,
(int)devProps.clockRate);
}
for(i = 0; i < ARRAY_SIZE; i++)
{
h_in[i] = i;
}
errorCheck(cudaMalloc((void **) &d_in, sizeof(int) * ARRAY_SIZE));
errorCheck(cudaMalloc((void **) &d_out, sizeof(int) * ARRAY_SIZE));
errorCheck(cudaMemcpy(d_in, h_in, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice));
blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>> (d_in, d_out, ARRAY_SIZE);
cudaDeviceSynchronize();
errorCheck(cudaGetLastError());
errorCheck(cudaMemcpy(h_out, d_out, ARRAY_SIZE * sizeof(int), cudaMemcpyDeviceToHost));
printf("Results:\n");
for(i = 0; i < ARRAY_SIZE; i++)
{
printf("h_in[%d] = %d, h_out[%d] = %d\n", i, h_in[i], i, h_out[i]);
}
return 0;
}
On compiling using nvcc -arch=sm_21 parallel-scan.cu -o parallel-scan, I get an error:
GPUassert: unspecified launch failure, file: parallel-scan-single-block.cu line: 106
Line 106 is the line after kernel launch when we check for errors using errorCheck.
This is what I am planning to implement:
From the kernel, it can be seen that if a block has 1000 threads, it can operate on 2000 elements. Therefore, blockSize = ARRAY_SIZE / 2.
And, shared memory = sizeof(int) * ARRAY_SIZE
Everything is loaded into shared mem. Then, up sweep is done, with last element being set to 0. Finally, down sweep is done to give an exclusive scan of the elements.
I have used this file as the reference to write this code. I do not understand what the mistake in my code is. Any help would be greatly appreciated.
You are launching the kernel like so
blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * ARRAY_SIZE>>>
meaning that within the kernel, 0 <= thid < int(ARRAY_SIZE/2).
However, your kernel requires a minimum of (2 * int(ARRAY_SIZE/2)) + 1 words of available shared memory to work correctly, otherwise this:
temp[2*thid+1] = d_in[2*thid+1];
will produce an out-of-bounds shared memory access.
If my integer mathematical skillz are not too rusty, this should mean that the code will be safe if ARRAY_SIZE is odd, because ARRAY_SIZE == (2 * int(ARRAY_SIZE/2)) + 1 for any odd integer. However, if ARRAY_SIZE is even, then ARRAY_SIZE < (2 * int(ARRAY_SIZE/2)) + 1 and you have a problem.
It might be that shared memory page size granularity saves you for some even values of ARRAY_SIZE which should theoretically fail, because the hardware will always round up the dynamic shared memory allocation to the next page size larger than the request size. But there should be a number of even values of ARRAY_SIZE for which this fails.
I can't comment on whether the rest of the kernel is correct or not, but using a shared memory size of sizeof(int) * size_t(1 + ARRAY_SIZE) should make this particular problem go away.
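Concretely, that fix is a one-line change to the kernel launch in main:
blelloch_scan <<<1, ARRAY_SIZE / 2, sizeof(int) * size_t(1 + ARRAY_SIZE)>>> (d_in, d_out, ARRAY_SIZE);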

Non-square matrix multiplication in CUDA

For my GPU programming class, we've been tasked with completing certain parts of a non-square matrix multiplication program. Specifically, the kernel function and initializing the thread block and kernel grid dimensions.
I've based my code on the CUDA C Programming Guide's matrix multiplication code, but instead of using structs as they do, I have modified mine to use only the parameters given (since we're not allowed to change parameters). We are provided with the 3 matrices A, B, and C, as well as their dimensions: m x k, k x n, and m x n, respectively. Where the struct used A.height, I've used dimension m; where it used B.width, I've used dimension n; and so on.
I've run into several problems, the first of which is that my program doesn't pass the included test, which verifies the correctness of the product matrix C. I assume that there is something wrong in my matrix multiplication code, then, and that the issue probably arises from my adapting the struct code.
#include <stdio.h>
__global__ void mysgemm(int m, int n, int k, const float *A, const float *B,
float* C) {
/********************************************************************
*
* Compute C = A x B
* where A is a (m x k) matrix
* where B is a (k x n) matrix
* where C is a (m x n) matrix
*
********************************************************************/
// INSERT KERNEL CODE HERE
// Each thread computes one element of C
// by accumulating results into Cvalue
float Cvalue = 0;
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
for (int e = 0; e < k; ++e){
Cvalue += (A[row * k + e]) * (B[e * n + col]);
}
C[row * n + col] = Cvalue;
}
My other problem, which I'm even less sure about, involves the code to initialize the thread block and kernel grid dimensions.
// Initialize thread block and kernel grid dimensions ---------------------
const unsigned int BLOCK_SIZE = 16; // Use 16x16 thread blocks
//INSERT CODE HERE
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(n / dimBlock.x, m / dimBlock.y);
// Invoke CUDA kernel -----------------------------------------------------
//INSERT CODE HERE
mysgemm<<<dimGrid, dimBlock>>>(m, n, k, A, B, C);
I understand dimBlock, but I don't understand dimGrid, and don't have a proper idea of what to use as parameters for it. When I run the code as is, the kernel won't even launch if the matrix I pass in doesn't have a dimension that is a power of 2. And if I do use a power of 2, the test still fails.
I apologize if I've been too wordy. This is my first post and I wanted to give as many details as possible. Hopefully someone can help walk me through these issues.
The kernel I'm posting below is a variant of the one I posted in
CUDA: Tiled matrix-matrix multiplication with shared memory and matrix size which is non-multiple of the block size
in that it does not use shared memory.
__global__ void MatMulNoShared(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols) {
float CValue = 0;
int Row = blockIdx.y*TILE_DIM + threadIdx.y;
int Col = blockIdx.x*TILE_DIM + threadIdx.x;
for (int k = 0; k < (TILE_DIM + ACols - 1)/TILE_DIM; k++) {
for (int n = 0; n < TILE_DIM; ++n)
if ((k*TILE_DIM + n < ACols && Row < ARows) && (k*TILE_DIM + n < BRows && Col < BCols))
CValue += A[Row*ACols + k*TILE_DIM + n] * B[(k*TILE_DIM + n)*BCols + Col];
}
if (Row < CRows && Col < CCols) C[((blockIdx.y * blockDim.y + threadIdx.y)*CCols)+(blockIdx.x*blockDim.x)+threadIdx.x]=CValue;
}
The two if statements in the kernel are the if statements mentioned in the answer by Eric.
For the sake of your convenience, I'm posting the full code below:
#include <stdio.h>
#include <math.h>
#define TILE_DIM 16 // Tile dimension
#define DIMX 373
#define DIMY 242
#define DIMZ 533
__global__ void MatMulNoShared(float* A, float* B, float* C, int ARows, int ACols, int BRows, int BCols, int CRows, int CCols) {
float CValue = 0;
int Row = blockIdx.y*TILE_DIM + threadIdx.y;
int Col = blockIdx.x*TILE_DIM + threadIdx.x;
for (int k = 0; k < (TILE_DIM + ACols - 1)/TILE_DIM; k++) {
for (int n = 0; n < TILE_DIM; ++n)
if ((k*TILE_DIM + n < ACols && Row < ARows) && (k*TILE_DIM + n < BRows && Col < BCols))
CValue += A[Row*ACols + k*TILE_DIM + n] * B[(k*TILE_DIM + n)*BCols + Col];
}
if (Row < CRows && Col < CCols) C[((blockIdx.y * blockDim.y + threadIdx.y)*CCols)+(blockIdx.x*blockDim.x)+threadIdx.x]=CValue;
}
int main() {
int CCols = DIMZ, CRows=DIMX, ACols=DIMY, ARows=DIMX, BCols=DIMZ, BRows=DIMY;
dim3 dimBlock(TILE_DIM, TILE_DIM, 1);
dim3 dimGrid;
dimGrid.x = (CCols + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
float *deviceA, *deviceB, *deviceC;
float* hostA = (float*)malloc(DIMX*DIMY*sizeof(float));
float* hostB = (float*)malloc(DIMY*DIMZ*sizeof(float));
float* hostC = (float*)malloc(DIMX*DIMZ*sizeof(float));
float* hostCp = (float*)malloc(DIMX*DIMZ*sizeof(float));
// --- A is DIMX x DIMY and B is DIMY x DIMZ, so each needs its own fill loop
for (int x = 0; x<DIMX; x++)
for (int y = 0; y<DIMY; y++)
hostA[x*DIMY+y] = rand()/(float)RAND_MAX;
for (int y = 0; y<DIMY; y++)
for (int z = 0; z<DIMZ; z++)
hostB[y*DIMZ+z] = rand()/(float)RAND_MAX;
cudaMalloc((void **)&deviceA, DIMX*DIMY*sizeof(float));
cudaMalloc((void **)&deviceB, DIMY*DIMZ*sizeof(float));
cudaMalloc((void **)&deviceC, DIMX*DIMZ*sizeof(float));
cudaMemcpy(deviceA, hostA, DIMX*DIMY*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(deviceB, hostB, DIMY*DIMZ*sizeof(float), cudaMemcpyHostToDevice);
MatMulNoShared<<<dimGrid , dimBlock>>>(deviceA , deviceB , deviceC , ARows , ACols, BRows ,BCols , CRows , CCols);
cudaMemcpy(hostC, deviceC, DIMX*DIMZ*sizeof(float), cudaMemcpyDeviceToHost);
return 0;
}
Note that the two instructions
dimGrid.x = (CCols + dimBlock.x - 1)/dimBlock.x;
dimGrid.y = (CRows + dimBlock.y - 1)/dimBlock.y;
ensure a full tiled coverage of the matrices, as mentioned at point 1. of Eric's answer.
Your code currently only works when m and n are multiples of 16, which is your block size.
Two things you can do now to make it work on arbitrary sizes.
1. Make the grid size large enough to cover the whole matrix C. Instead of using the floor of n/blockdim.x as you have done, you could use the ceil of that value:
(n+blockdim.x-1)/blockdim.x
2. After you have done step 1, the grid you are launching will be a little bit larger than C because of the ceiling operation. You could then limit the computation to the exact size of the result matrix C by adding an if clause in the kernel.
Please refer to the CUDA docs for more details, especially the programming guide.
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
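Putting both steps together and applying them to the kernel from the question gives something like this (my sketch, not the assignment's reference solution):
__global__ void mysgemm(int m, int n, int k, const float *A, const float *B, float *C) {
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < m && col < n) {                 // step 2: skip threads that fall outside C
        for (int e = 0; e < k; ++e)
            Cvalue += A[row * k + e] * B[e * n + col];
        C[row * n + col] = Cvalue;
    }
}
with the host-side grid rounded up (step 1):
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((n + dimBlock.x - 1) / dimBlock.x, (m + dimBlock.y - 1) / dimBlock.y);
mysgemm<<<dimGrid, dimBlock>>>(m, n, k, A, B, C);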

Issue With Large Array Sizes in CUDA

I am familiarizing myself with CUDA by writing a dot product calculator. I wanted to test it with large array sizes to do a timing study comparing two different ways of collecting the vector sum. However, when the size of the array is above 1024 I get errors. The card is a GTX460M with 1.5GB of RAM, and I am using the card for display (this is a laptop); aside from that, I am not sure where the issue could be coming from.
Here is the nvcc compile line:
nvcc D:\Research\CUDA\TestCode\test_dotProduct_1.cu --use_fast_math --gpu-architecture sm_13 --compiler-bindir="D:\Programming\VisualStudio\2010express\VC\bin" --machine 32 -o multi_dot.exe
I also seem to have trouble compiling in 64 bit, but that is another issue.
Here is the output for an array of size 1024:
HOST CALCULATION: 357389824.000000
DEV PARA CALCULATION: 357389824.000000
DEV SERI CALCULATION: 357389824.000000
Here is the output for an array of size 2048:
HOST CALCULATION: 2861214720.000000
DEV PARA CALCULATION: -1.#INF00
DEV SERI CALCULATION: -1.#INF00
Here is my code:
/*Code for a CUDA test project doing a basic dot product with doubles
*
*
*
*/
#include <stdio.h>
#include <cuda.h>
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *array_c){
array_c[threadIdx.x] = array_a[threadIdx.x] * array_b[threadIdx.x];
}
__global__ void GPU_parallelSumVector(double *vector, double *sum, int base){
sum[threadIdx.x + blockIdx.x] = vector[blockIdx.x + threadIdx.x * base] + vector[blockIdx.x + threadIdx.x * base + 1];
}
__global__ void GPU_serialSumVector(double *vector, double *sum, int dim){
for(int i = 0; i < dim; ++i){
sum[0] += vector[i];
}
}
__host__ void CPU_serialDot(double *first, double *second, double *dot, int dim){
for(int i=0; i<dim; ++i){
dot[0] += first[i] * second[i];
}
}
__host__ void CPU_serialSetupVector(double *vector, int dim, int incrSize, int start){
for(int i=0; i<dim; ++i){
vector[i] = start + i * incrSize;
}
}
int main(){
//define array size to be used
//int i,j;
const int VECTOR_LENGTH = 2048;
int SUM_BASE = 2;
int SUM_ROUNDS = VECTOR_LENGTH / SUM_BASE;
int ELEMENT_SIZE = sizeof(double);
// int currentSize = VECTOR_LENGTH;
//arrays for dot product
//host
double *array_a = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *array_b = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product_parallel = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double *dev_dot_product_serial = (double*) malloc(VECTOR_LENGTH * ELEMENT_SIZE);
double host_dot_product = 0.0;
//fill with values
CPU_serialSetupVector(array_a, VECTOR_LENGTH, 1, 0);
CPU_serialSetupVector(array_b, VECTOR_LENGTH, 1, 0);
CPU_serialDot(array_a, array_b, &host_dot_product, VECTOR_LENGTH);
//device
double *dev_array_a;
double *dev_array_b;
double *dev_array_c;
double *dev_dot_serial;
double *dev_dot_parallel;
//allocate cuda memory
cudaMalloc((void**)&dev_array_a, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_b, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_array_c, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot_parallel, ELEMENT_SIZE * VECTOR_LENGTH);
cudaMalloc((void**)&dev_dot_serial, ELEMENT_SIZE * VECTOR_LENGTH);
//copy to from host to device
cudaMemcpy(dev_array_a, array_a, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_array_b, array_b, ELEMENT_SIZE * VECTOR_LENGTH, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot_parallel, &dev_dot_product_parallel, ELEMENT_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(dev_dot_serial, &dev_dot_product_serial, ELEMENT_SIZE, cudaMemcpyHostToDevice);
//perform CUDA dot product
GPU_parallelDotProduct<<<1, VECTOR_LENGTH>>>(dev_array_a, dev_array_b, dev_array_c);
//condense a second vector in serial to compare speed up of tree condensing
GPU_serialSumVector<<<1,1>>>(dev_array_c, dev_dot_serial, VECTOR_LENGTH);
//condense vector (parallel)
for(int i=SUM_ROUNDS; i>1; i/=SUM_BASE){
GPU_parallelSumVector<<<1,i>>>(dev_array_c, dev_array_c, SUM_BASE);
}
GPU_parallelSumVector<<<1,1>>>(dev_array_c, dev_array_c, SUM_BASE);
//get computed product back to the machine
cudaMemcpy(dev_dot_product_parallel, dev_array_c, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost);
cudaMemcpy(dev_dot_product_serial, dev_dot_serial, VECTOR_LENGTH * ELEMENT_SIZE, cudaMemcpyDeviceToHost);
FILE *output = fopen("test_dotProduct_1.txt", "w");
fprintf(output, "HOST CALCULATION: %f \n", host_dot_product);
fprintf(output, "DEV PARA CALCULATION: %f \n", dev_dot_product_parallel[0]);
fprintf(output, "DEV SERI CALCULATION: %f \n", dev_dot_product_serial[0]);
/*
fprintf(output, "VALUES OF DEV_ARRAY_C VEC: \n");
for(int i=0; i<VECTOR_LENGTH; ++i){
fprintf(output, "value %i is: %f \n", i, dev_dot_product_parallel[i]);
}
*/
free(array_a);
free(array_b);
//free(host_dot_product);
cudaFree(dev_array_a);
cudaFree(dev_array_b);
cudaFree(dev_array_c);
cudaFree(dev_dot_parallel);
cudaFree(dev_dot_serial);
return(0);
}
The maximum number of threads per block for your card is 1024, which is why you are getting an error (for some older cards it's 512). You either need to split up your block to use multiple dimensions (limited to 1024 in the x and y directions and 64 in z on your card) or use more than one block in your grid.
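For example, the element-wise product kernel from the question could be launched over a grid of smaller blocks; a sketch (the length parameter and bounds check are my additions):
__global__ void GPU_parallelDotProduct(double *array_a, double *array_b, double *array_c, int n){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                                   // guard the rounded-up grid
        array_c[i] = array_a[i] * array_b[i];
}
// host side:
int threads = 256;
int blocks = (VECTOR_LENGTH + threads - 1) / threads;   // ceiling division
GPU_parallelDotProduct<<<blocks, threads>>>(dev_array_a, dev_array_b, dev_array_c, VECTOR_LENGTH);
The sum kernels need the same treatment, since each of their launches is also limited to 1024 threads per block.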
