CUDA error: Invalid configuration argument - c

Hi I have the following CUDA C code:
kernel.cu:
/******************************************************************************
*cr
******************************************************************************/
#include <stdio.h>
#define TILE_SIZE 16
#define BLOCK_SIZE 512
/****************************************************************/
// Kernel for matrix multiplication:
// A: m x n matrix
// B: n x k matrix
// C = A x B: m x k matrix
__global__ void mysgemm(int m, int n, int k, const double *A, const double *B, double* C) {
__shared__ float ds_A[TILE_SIZE][TILE_SIZE];
__shared__ float ds_B[TILE_SIZE][TILE_SIZE];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = (by*TILE_SIZE+ty);//%m;
int col = (bx*TILE_SIZE+tx);//%n;
float pvalue = 0;
for(int i=0;i<(k-1)/TILE_SIZE+1;++i)
{
if((i*TILE_SIZE +tx < k) && (row < m))
ds_A[ty][tx] = A[row*k+i*TILE_SIZE+tx];
else ds_A[ty][tx] = 0;
if((i*TILE_SIZE+ty < k) && (col < n))
ds_B[ty][tx] = B[(i*TILE_SIZE+ty)*n+col]; // Load data into shared memory
else ds_B[ty][tx] = 0;
__syncthreads();
if(row < m && col < n)
{
for(int j=0;j<TILE_SIZE;++j)
{
//if(j < k)
pvalue += ds_A[ty][j]*ds_B[j][tx];
}
}
__syncthreads();
}
if(row < m && col < n)
C[row*n+col] = pvalue;
}
/****************************************************************/
/****************************************************************/
// Kernel to multiply each element in A by the corresponding element in B and store
// the result to the corresponding element in C. All vectors should be of length m
__global__ void elem_mul(int m, const double *A, const double *B, double* C)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
C[i] = A[i]*B[i];
}
/****************************************************************/
/****************************************************************/
// Kernel for parallel sum
__global__ void reduction(double *out, double *in, unsigned size)
{
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int t = threadIdx.x;
unsigned int start = 2*blockIdx.x*blockDim.x;
if(start + t >= size)
partialSum[t] = 0;
else partialSum[t] = in[start+t];
if(start + blockDim.x+t>= size)
partialSum[blockDim.x+t] = 0;
else partialSum[blockDim.x+t] = in[start + blockDim.x+t];
for(unsigned int stride = 1; stride <=blockDim.x; stride*=2)
{
__syncthreads();
if(t % stride ==0)
partialSum[2*t]+=partialSum[2*t+stride];
}
__syncthreads();
out[blockIdx.x] = partialSum[0];
}
/****************************************************************/
/****************************************************************/
// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
dim3 dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
reduction<<<dimGrid,dimBlock>>>(out,temp,m);
}
/****************************************************************/
// Kernel to multiply each element in the matrix out in the following manner:
// out(i,j) = in(i) - in(j)
__global__ void fill(int m, const double *in, double *out)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = tx+bx*blockDim.x;
int j = ty+by*blockDim.y;
if((i < m) && (j < m))
out[i*m+j] = in[i]-in[j];
}
// Kernel to fill the matrix out with the formula out(i,j) = exp(-omega*T(i.j))
__global__ void fill_E(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = exp(-coeff * in[i]);
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_mul(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff * in[i];
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_add(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff + in[i];
}
/****************************************************************/
// Kernel to update vector p2
__global__ void update_p2(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = coeff/in[i];
}
/****************************************************************/
/****************************************************************/
// Kernel to update matrix p
__global__ void update_p(int m, double* p2, double *denom, double *num, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
// loop through columns j
for(int j=0; j<m; ++j)
{
if(i == j)
out[i*m + j] = p2[i];
else if(i < m)
out[i*m + j] = num[i*m+j]/denom[i];
}
}
/****************************************************************/
/****************************************************************/
// Kernel to update the error, counter, and parameter variables
__global__ void update(int* counter, double* error, double *mu, double *mu_temp, double* alpha, double* alpha_temp, double* omega, double* omega_temp)
{
*counter = *counter + 1;
*error = (mu - mu_temp)*(mu - mu_temp) + (alpha-alpha_temp)*(alpha-alpha_temp) + (omega-omega_temp)*(omega-omega_temp);
mu = mu_temp;
alpha = alpha_temp;
omega = omega_temp;
}
/****************************************************************/
/****************************************************************/
// Kernel to assign old * coeff + inc to new
__global__ void assign(double* n, double* old, double coeff, double inc)
{
//*n = (*old)*coeff + inc;
*n = 5.0;
}
/****************************************************************/
/******************************************************************************************************/
// Function to calibrate the 1-D Hawke's process. Does so via an iterative procedure. Variables:
// int size: length of the Time-series vectors. Also the number of rows and columns in input matrices
// double mu: One of three parameters calibrated
// double alpha: One of three parameters calibrated
// double omega: One of three parameters calibrated
// double* A: A matrix filled out and used to calibrate
// double* T: A distance matrix T(i,j) = Times[i]-Times[j]
// double* Delta: A dissimilarity matrix Delta(i,j) = 1 if i > j, 0 otherwise
// double* E: A matrix filled out and used to calibrate--E(i,j) = exp(-omega*T(i,j))
// double* p: A probability matrix of cross excitations
// double* p2: A vector of self-excitation probabilities
// double* ones: A (size x 1) vector of 1's used in inner products and identity transformations
// double* Times: A (size x 1) vector of time series data to be calibrated
// int MAX_ITER: The maximum number of iterations allowed in the calibration
// double* TOL: The error tolerance or accuracy allowed in the calibration
// double* temp_1: A (size x 1) temporary vector used in intermediate calculations
// double* temp_2: A temporary matrix used in intermediate calculations
// double* temp_3: A temporary scalar used in intermediate calculations
/******************************************************************************************************/
void calibrate(int size, double *mu, double *mu_t, double *alpha, double *alpha_t, double *omega, double *omega_t, double *A, double *T, double *Delta, double *E, double *p, double *p2, double *D, double* ones, double *Times, int *ctr, double *err, double* temp_1, double* temp_2, double* temp_3)
{
//1) (a) Perform inner product to start initial values of mu, alpha, and omega
inner_product(temp_3, size, Times, ones, temp_1); // Inner product of Time series
dim3 dimGrid(1,1,1);
dim3 dimBlock(1,1,1);
//assign<<<dimGrid,dimBlock>>>(mu_t,temp_3,1.1,0); // Assign mu_t to be temp_3*(1/size) (the average)
//assign<<<dimGrid,dimBlock>>>(alpha_t,temp_3,1.1,0); // Assign mu_t to be temp_3*(1/size) (the average)
//assign<<<dimGrid,dimBlock>>>(omega_t,temp_3,1.1,0); // Assign mu_t to be temp_3*(1/size) (the average)
/*
//1) (b) Fill out matrix T of time differences
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill<<<dimGrid,dimBlock>>>(size, Times, T);
// 2) Fill out matrix E
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill_E<<<dimGrid,dimBlock>>>(size, omega, T, E);
// 3) Update matrix A
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
scal_mult<<<dimGrid,dimBlock>>>(size,size, alpha, delta, A);
scal_mult<<<dimGrid,dimBlock>>>(size,size, omega, A, A);
dim3 dimGrid((n-1)/TILE_SIZE+1,(m-1)/TILE_SIZE+1,1);
dim3 dimBlock(TILE_SIZE,TILE_SIZE,1);
mysgemm<<<dimGrid,dimBlock>>>(size,size,size,A,E,A)
// 4) Update matrix D
mysgemm<<<dimGrid,dimBlock>>>(size,size,1,A,ones,D);
scal_add<<<dimGrid,dimBlock>>>(size,size, mu, D, D);
// 5) Update matrix p and vector p2
update_p2<<<dimGrid,dimBlock>>>(size,mu, D, p2);
update_p<<<dimGrid,dimBlock>>>(size,p2, D, A, p);
// 6) Update parameters mu, alpha, omega
inner_product(mu_t, size, p2, ones, temp_1);
mu_t /=Times[size-1];
reduction<<<dimGrid,dimBlock>>>(alpha_t,p,size*size);
alpha_t/= size;
// Treat T and p as very long vectors and calculate the inner product
inner_product(omega_t, size*size, T, p, temp_2);
omega_t = alpha_t/omega_t;
*/
// 7) Update error
dim3 g(100,100,1);
dim3 b(100,100,1);
//update<<<g,b>>>(ctr,err,mu,mu_t,alpha,alpha_t,omega,omega_t);
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("CUDA error: %s\n",cudaGetErrorString(error));
exit(-1);
}
}
And the following file launches the host code that contains all the kernel calls.
main.cu (I do not use support.h yet):
/******************************************************************************
*cr
*cr
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "kernel.cu"
#include "support.h"
int main (int argc, char *argv[])
{
Timer timer;
cudaError_t cuda_ret;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem...\n"); fflush(stdout);
startTime(&timer);
double* A_h, *T_h, *Delta_h, *E_h, *p_h, *p2_h, *D_h, *Times_h, *ones_h;
double* A_d, *T_d, *Delta_d, *E_d, *p_d, *p2_d, *D_d, *Times_d, *ones_d, *temp_1, *temp_2, *temp_3;
double* mu_h, *alpha_h, *omega_h; // hawkes parameters on host
double* mu_d, *alpha_d, *omega_d; // hawkes parameters on device
double* mu_t_d, *alpha_t_d, *omega_t_d; // hawkes temporary parameters on device
double* err_h, *err_d; // Iterative variables for hohst and device
int* ctr_h, *ctr_d;
int N;
unsigned int mat_size, vec_size;
// Import data
FILE *fp;
char str[60];
unsigned int count=0;
double d;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
++count;
// Stick with a limited subset of the data for now to avoid using too much host memory
N = 1000;
fclose(fp);
printf("Count is %u \n",count);
mat_size = N*N;
vec_size = N;
dim3 dim_grid, dim_block;
// Fill matrices with 0's
A_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { A_h[i] = 0; }
T_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { T_h[i] = 0; }
Delta_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { Delta_h[i] = 0; }
E_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { E_h[i] = 0; }
p_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { p_h[i] = 0; }
// Fill vectors with 0's, except the 1's vector
p2_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { p2_h[i] = 0; }
Times_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { Times_h[i] = 0; }
D_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { D_h[i] = 0; }
ones_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { ones_h[i] = 0; }
// Start constants as zero
mu_h = (double*) malloc( sizeof(double));
alpha_h = (double*) malloc( sizeof(double));
omega_h = (double*) malloc( sizeof(double));
err_h = (double*) malloc( sizeof(double));
ctr_h = (int*) malloc( sizeof(int));
*mu_h = 0;
*alpha_h = 0;
*omega_h = 0;
*err_h = 0;
*ctr_h = 0;
// Import data
count=0;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
{
sscanf(str, "%lf", &d);
if(count < vec_size)
Times_h[count] = d;
++count;
}
fclose(fp);
/*printf("TIMES VECTOR: \n");
for (unsigned int i=0; i < vec_size; ++i)
{
printf("TIMES_H[ %u ] is ",i);
printf("%f \n", Times_h[i]);
}*/
printf("Count is %u \n",count);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cudaMalloc((void**) &A_d, mat_size*sizeof(double)); // Create device variable for matrix A
cudaMalloc((void**) &T_d, mat_size*sizeof(double)); // Create device variable for matrix T
cudaMalloc((void**) &Delta_d, mat_size*sizeof(double)); // Create device variable for matrix Delta
cudaMalloc((void**) &E_d, mat_size*sizeof(double)); // Create device variable for matrix E
cudaMalloc((void**) &p_d, mat_size*sizeof(double)); // Create device variable for matrix p
cudaMalloc((void**) &p2_d, vec_size*sizeof(double)); // Create device variable for vector p2
cudaMalloc((void**) &D_d, vec_size*sizeof(double)); // Create device variable for vector D
cudaMalloc((void**) &Times_d, vec_size*sizeof(double)); // Create device variable for vector Times
cudaMalloc((void**) &ones_d, vec_size*sizeof(double)); // Create device variable for vector ones
// Parameters and intermediate parameters
cudaMalloc((void**) &mu_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_d, sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &mu_t_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_t_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_t_d, sizeof(double)); // Create device variable for constant omega
// Temporary variables
cudaMalloc((void**) &temp_1, vec_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_2, mat_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_3, sizeof(double)); // Create device variable for constant omega
// Iteration variables
cudaMalloc((void**) &err_d, sizeof(double)); // Create device variable for iterative counters
cudaMalloc((void**) &ctr_d, sizeof(int));
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cudaMemcpy(A_d,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(T_d,T_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Delta_d,Delta_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(E_d,E_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p_d,p_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p2_d,p2_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(D_d,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ones_d,ones_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Times_d,Times_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
// Parameters and intermediate parameters
cudaMemcpy(mu_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(mu_t_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_t_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_t_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
// Temporary variables
cudaMemcpy(temp_1,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_2,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_3,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
// Iteration variables
cudaMemcpy(err_d,err_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ctr_d,ctr_h,sizeof(int), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel using standard sgemm interface ---------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
int MAX_ITER = 100;
double TOL = .001;
//while(ctr_h < MAX_ITER && err_h < TOL)
//{
calibrate(vec_size,mu_d, mu_t_d, alpha_d, alpha_t_d, omega_d, omega_t_d, A_d, T_d, Delta_d, E_d, p_d,
p2_d, D_d, ones_d, Times_d, ctr_d, err_d, temp_1, temp_2, temp_3);
// cudaMemcpy(err_h,err_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
// cudaMemcpy(ctr_h,ctr_d,sizeof(int), cudaMemcpyDeviceToHost); // Copy from device var to host var
//}
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host...\n"); fflush(stdout);
startTime(&timer);
cudaMemcpy(mu_h,mu_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(alpha_h,alpha_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(omega_h,omega_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
printf("mu is %f: \n",*mu_h);
printf("alpha is %f: \n",*alpha_h);
printf("omega is %f: \n",*omega_h);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Free memory ------------------------------------------------------------
free(A_h);
free(T_h);
free(Delta_h);
free(E_h);
free(p_h);
free(p2_h);
free(D_h);
free(ones_h);
free(Times_h);
free(mu_h);
free(alpha_h);
free(omega_h);
cudaFree(A_d);
cudaFree(T_d);
cudaFree(Delta_d);
cudaFree(E_d);
cudaFree(p_d);
cudaFree(p2_d);
cudaFree(D_d);
cudaFree(ones_d);
cudaFree(Times_d);
cudaFree(mu_d);
cudaFree(alpha_d);
cudaFree(omega_d);
return 0;
}
I am getting the error CUDA error: invalid configuration argument from the cudaGetLastError() call at the end of kernel.cu. Since everything is commented out except the inner_product host function that calls two kernels, I assume the problem is coming from there:
/****************************************************************/
// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
dim3 dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
reduction<<<dimGrid,dimBlock>>>(out,temp,m);
}
/****************************************************************/
Since the m passed is 1000 and BLOCK_SIZE is 512, this should produce dimGrid(1,1,1) and dimBlock(512,512,1), but this is giving the above error. Even when I changed this to dimBlock(256,256,1) I got the same error. I am pretty sure block sizes of up to 1024 threads are allowed for this device.

Your block dimenion dimBlock(512,512) is to big! There is a maximum count of threads per thread block that can be launched, depending on the compute architecture of your gpu.
There are several ways how to find out what the maximum block dimensions are.
A fast way is to use from the cuda sdk samples the deviceQuery program. That lists all informations of your cuda-enabled gpus, such like the maximum block dimensions.
Or you use cuda occupancy calculator and try to input your kernel parameters. In your example there will be a error.
A third way is to read the cuda programming guide where you can find all the informations that you need.

This might help to dynamically define the number of threads you want to use.
struct cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, device);
cout<<"using "<<properties.multiProcessorCount<<" multiprocessors"<<endl;
cout<<"max threads per processor: "<<properties.maxThreadsPerMultiProcessor<<endl<<endl;
You can use any configuration of:
dim3(iThreadsPerBlock)
dim3(iThreadsPerBlockX, iThreadsPerBlockY)
dim3(iThreadsPerBlockX, iThreadsPerBlockY, iThreadsPerBlockZ)
where the volume of your block does not exceed properties.maxThreadsPerMultiProcessor

Related

cuda magma matrix-matrix addition kernel

I tried using similar format as magmablas_sgeadd_q kernel, however I am not getting proper outputs, moreover every time I run it, I get a different output.
The code that I used is given below:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
if ( full )
{
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for( int j=0; j < BLK_Y && iby+j < n; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main ( void )
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < n ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads( BLK_X, 1 );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
//Error checkng
printf("Matrix C ");
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
Output I got:
Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
CUDA kernel launch with 4 blocks of 2 threads
Copy output data from the CUDA device to the host memory
A is 0.000000, B is 0.364784, C is 0.364784
A is 0.000000, B is 0.952230, C is 0.952230
A is 0.000000, B is 0.000000, C is 0.000000
A is 0.000000, B is 0.000000, C is 0.000000
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452
Matrix A
0.840188 0.783099
0.911647 0.335223
0.277775 0.477397
0.364784 0.952230
Matrix B
0.394383 0.798440
0.197551 0.768230
0.553970 0.628871
0.000000 0.000000
Matrix C
0.0000000.000000
0.0000000.000000
0.0000000.000000
0.0000000.000000
Let me know if you find something wrong with the code.
Thank you
There were two coding errors that I found:
When you use this method to "bump" the base pointer for matrices dA and dB in your kernel, you must also do the same for the base pointer for matrix dC:
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
dC += ind + iby*lddb; // add this line
Your host-code nested for loops are not indexing correctly. The outer loop is intended to index across the rows, and you have n rows, but you are allowing the outer loop to index across m rows:
for (int i = 0; i < m; ++i)
{
for (int j = 0; j < n ; j ++)
so then when you do the actual indexing calculation here:
h_A[i*m+j] = rand()/(float)RAND_MAX;
you are indexing out-of-bounds. (i*m exceeds the matrix size, for some values of i) This problem is repeated in all of your nested for-loops in host code. The fix is to reverse the m, n bounds on your i, j loops.
The following code has those errors fixed (plus some additions for variable definitions you left out - err and flag are undefined in the code you have posted currently - which create compile errors.) It seems to run correctly and produce the correct result:
$ cat t1213.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#define BLK_X 2
#define BLK_Y 1
__global__ void matrixAdd2( const float *dA, const float *dB, float *dC, int m, int n)
{
int ldda = m;
int lddb = m;
int ind = blockIdx.x*BLK_X + threadIdx.x;
int iby = blockIdx.y*BLK_Y;
/* check if full block-column */
bool full = (iby + BLK_Y <= n);
/* do only rows inside matrix */
if ( ind < m ) {
dA += ind + iby*ldda;
dB += ind + iby*lddb;
dC += ind + iby*lddb;
if ( full )
{
// full block-column
#pragma unroll
for( int j=0; j < BLK_Y; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
else
{
// partial block-column
for( int j=0; j < BLK_Y && iby+j < n; ++j )
{
dC[j*lddb] = dA[j*ldda] + dB[j*lddb];
printf("parital: A is %f, B is %f, C is %f \n",dA[j*ldda],dB[j*lddb],dC[j*lddb]);
}
}
}
}
int main ( void )
{
int m = 4; // a - mxn matrix
int n = 2; // b - mxn matrix
size_t size = m * n * sizeof(float);
printf("Matrix addition of %d rows and %d columns \n", m, n);
// allocate matrices on the host
float *h_A = (float *)malloc(size); // a- mxn matrix on the host
float *h_B = (float *)malloc(size); // b- mxn matrix on the host
float *h_C = (float *)malloc(size); // b- mxn matrix on the host
// Initialize the host input matrixs
for (int i = 0; i < n; ++i)
{
for (int j = 0; j < m ; j ++)
{
h_A[i*m+j] = rand()/(float)RAND_MAX;
h_B[i*m+j] = rand()/(float)RAND_MAX;
}
}
// Allocate the device input matrix A
float *d_A = NULL;
cudaError_t err = cudaMalloc((void **)&d_A, size);; // d_a - mxn matrix a on the device
// Allocate the device input matrix B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
// Allocate the device output matrix C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
// Copy the host input matrixs A and B in host memory to the device input matrixs in device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
// defining number of threads and blocks
dim3 threads( BLK_X, BLK_Y );
dim3 grid((int)ceil(m/BLK_X),(int)ceil(n/BLK_Y) );
// Launching kernel
matrixAdd2<<<grid, threads, 0>>>(d_A, d_B, d_C, m, n);
// Copy the device result matrix in device memory to the host result matrix in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
//print A matrix
printf("Matrix A");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf(" %f", h_A[i*m+j]);
}
printf("\n");
}
// print B matrix if required
printf("Matrix B");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf(" %f", h_B[i*m+j]);
}
printf("\n");
}
int flag = 0;
//Error checkng
printf("Matrix C ");
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf("%f", h_C[i*m+j]);
if(h_C[i*m+j] == h_A[i*m+j] + h_B[i*m+j] )
{
flag = flag + 1;
}
}
printf("\n");
}
if(flag==m*n)
{
printf("Test PASSED\n");
}
// Free device global memory
err = cudaFree(d_A);
err = cudaFree(d_B);
err = cudaFree(d_C);
// Free host memory
free(h_A);
free(h_B);
free(h_C);
err = cudaDeviceReset();
printf("Done\n");
return 0;
}
$ nvcc -o t1213 t1213.cu
$ cuda-memcheck ./t1213
========= CUDA-MEMCHECK
Matrix addition of 4 rows and 2 columns
Copy input data from the host memory to the CUDA device
Copy output data from the CUDA device to the host memory
A is 0.277775, B is 0.553970, C is 0.831745
A is 0.477397, B is 0.628871, C is 1.106268
A is 0.364784, B is 0.513401, C is 0.878185
A is 0.952230, B is 0.916195, C is 1.868425
A is 0.911647, B is 0.197551, C is 1.109199
A is 0.335223, B is 0.768230, C is 1.103452
A is 0.840188, B is 0.394383, C is 1.234571
A is 0.783099, B is 0.798440, C is 1.581539
Matrix A 0.840188 0.783099 0.911647 0.335223
0.277775 0.477397 0.364784 0.952230
Matrix B 0.394383 0.798440 0.197551 0.768230
0.553970 0.628871 0.513401 0.916195
Matrix C 1.2345711.5815391.1091991.103452
0.8317451.1062680.8781851.868425
Test PASSED
Done
========= ERROR SUMMARY: 0 errors
$

CUDA Array set to 0 after kernel call

I have a simple program with 3 array, that count how much the third array is 0 and the first and second has same values. when it's true increment another array index.
The problems are:
If kernel has only the first if() then function the array A is ever 0
If I insert if() then else function the values of array A is set to 0 after index = 2 and don't count the state when A,B,C=0
this is the code
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, float *b, float *c, float *res)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (a[idx]=b[idx] && c[idx]==0) {
res[0]++;
}
else if (a[idx]=b[idx] && c[idx]==1){
res[1]++;
}
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
float *b_h, *b_d; // Pointer to host & device arrays
float *c_h, *c_d; // Pointer to host & device arrays
float *res_h, *res_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
//size_t size_s = 4 * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
b_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &b_d, size); // Allocate array on device
c_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &c_d, size); // Allocate array on device
res_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &res_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
// for (int i=0; i<N; i++) a_h[i] = (float)i;
for (int i=0; i<N; i++) a_h[i] = (float)i;
for (int i=0; i<N; i++) b_h[i] = (float)i;
for (int i=0; i<N; i++) c_h[i] = (float)i;
for (int i=0; i<4; i++) res_h[i] = 0;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(c_d, c_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(res_d, res_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 8;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, b_d, c_d, res_d);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(b_h, b_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(c_h, c_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(res_h, res_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++){
printf("%f A \n", a_h[i]);
}
for (int i=0; i<N; i++){
printf("%f B \n", b_h[i]);
}
for (int i=0; i<N; i++){
printf("%f C \n", c_h[i]);
}
for (int i=0; i<4; i++){
printf("%f res \n", res_h[i]);
}
// Cleanup
free(a_h); cudaFree(a_d);
free(b_h); cudaFree(b_d);
free(c_h); cudaFree(c_d);
free(res_h); cudaFree(res_d);
}
Aside from the = in if (a[idx]=b[idx] && c[idx]==0) { that should be == as you already found (and same goes for the following if statement), there are at least two other issues in your code:
You don't check that the thread index doesn't go over the limit of the arrays. So since you are using 2 block of 8 threads, you have 16 threads accessing 10 elements arrays. To avoid the issue, you need to pass N as parameter for your kernel and add a if ( idx < N ) somewhere.
You accumulate in res in parallel without any sort of protection, leading to all kinds of race conditions. This is a very typical histogram issue that is explained aplenty in the literature (web, books, CUDA examples...). A quick fix for you (albeit probably not the most effective one) would be to use atomic operations, such as atomicAdd. In you case, the line res[0]++; would become atomicAdd( &res[0], 1 );, and res[1]++; would become (as you guessed) atomicAdd( &res[1], 1 );. The support of this for float implies you compile your code while using compute capability at least 2.0.
HTH
Sorry, I solved the problem.It was a mistake typing control = and not true ==

Dereferencing pointer in CUDA C

I am a novice C programmer and was a bit confused about this segmentation fault. I have worked with pointers before and this doesn't make sense. This code is being done on an NVIDIA GPU but I am not using any of the CUDA API functions yet (commented them out to isolate the error).
I get the error when de-referencing the pointer *mu on the GPU (see code below) in the function calibrate. That is, the error is a segmentation fault.
My host code is:
/******************************************************************************
*cr
*cr
******************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "kernel.cu"
#include "support.h"
int main (int argc, char *argv[])
{
Timer timer;
cudaError_t cuda_ret;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem...\n"); fflush(stdout);
startTime(&timer);
double* A_h, *T_h, *Delta_h, *E_h, *p_h, *p2_h, *D_h, *Times_h, *ones_h;
double* A_d, *T_d, *Delta_d, *E_d, *p_d, *p2_d, *D_d, *Times_d, *ones_d, *temp_1, *temp_2;
double* mu_h, *alpha_h, *omega_h;
double* mu_d, *alpha_d, *omega_d;
int N;
unsigned int mat_size, vec_size;
// Import data
FILE *fp;
char str[60];
unsigned int count=0;
double d;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
++count;
// Stick with a limited subset of the data for now
N = 2000;
fclose(fp);
printf("Count is %u \n",count);
mat_size = N*N;
vec_size = N;
dim3 dim_grid, dim_block;
// Fill matrices with 0's
A_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { A_h[i] = 0; }
T_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { T_h[i] = 0; }
Delta_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { Delta_h[i] = 0; }
E_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { E_h[i] = 0; }
p_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { p_h[i] = 0; }
// Fill vectors with 0's, except the 1's vector
p2_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { p2_h[i] = 0; }
Times_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { Times_h[i] = 0; }
D_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { D_h[i] = 0; }
ones_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { ones_h[i] = 0; }
// Start constants as zero
mu_h = (double*) malloc( sizeof(double));
alpha_h = (double*) malloc( sizeof(double));
omega_h = (double*) malloc( sizeof(double));
*mu_h = 0;
*alpha_h = 0;
*omega_h = 0;
// Import data
count=0;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
{
sscanf(str, "%lf", &d);
if(count < vec_size)
Times_h[count] = d;
++count;
}
fclose(fp);
/*printf("TIMES VECTOR: \n");
for (unsigned int i=0; i < vec_size; ++i)
{
printf("TIMES_H[ %u ] is ",i);
printf("%f \n", Times_h[i]);
}*/
printf("Count is %u \n",count);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cudaMalloc((void**) &A_d, mat_size*sizeof(double)); // Create device variable for matrix A
cudaMalloc((void**) &T_d, mat_size*sizeof(double)); // Create device variable for matrix T
cudaMalloc((void**) &Delta_d, mat_size*sizeof(double)); // Create device variable for matrix Delta
cudaMalloc((void**) &E_d, mat_size*sizeof(double)); // Create device variable for matrix E
cudaMalloc((void**) &p_d, mat_size*sizeof(double)); // Create device variable for matrix p
cudaMalloc((void**) &p2_d, vec_size*sizeof(double)); // Create device variable for vector p2
cudaMalloc((void**) &D_d, vec_size*sizeof(double)); // Create device variable for vector D
cudaMalloc((void**) &Times_d, vec_size*sizeof(double)); // Create device variable for vector Times
cudaMalloc((void**) &ones_d, vec_size*sizeof(double)); // Create device variable for vector ones
cudaMalloc((void**) &mu_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_d, sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_1, vec_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_2, mat_size*sizeof(double)); // Create device variable for constant omega
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cudaMemcpy(A_d,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(T_d,T_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Delta_d,Delta_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(E_d,E_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p_d,p_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p2_d,p2_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(D_d,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ones_d,ones_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(mu_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_1,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_2,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel using standard sgemm interface ---------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
int MAX_ITER = 100;
double TOL = .001;
calibrate(vec_size,mu_d, alpha_d, omega_d, A_d, T_d, Delta_d, E_d, p_d, p2_d, D_d, ones_d, Times_d,
MAX_ITER, TOL, temp_1, temp_2);
//tiledSgemm('N', 'N', matArow, matBcol, matBrow, 1.0f, \
// A_d, matArow, B_d, matBrow, 0.0f, C_d, matBrow); // A1_d, B1_d);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host...\n"); fflush(stdout);
startTime(&timer);
cudaMemcpy(mu_h,mu_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(alpha_h,alpha_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(omega_h,omega_d,sizeof(float), cudaMemcpyDeviceToHost); // Copy from device var to host var
printf("mu is %f: \n",mu_h);
printf("alpha is %f: \n",alpha_h);
printf("omega is %f: \n",omega_h);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Free memory ------------------------------------------------------------
free(A_h);
free(T_h);
free(Delta_h);
free(E_h);
free(p_h);
free(p2_h);
free(D_h);
free(ones_h);
free(Times_h);
free(mu_h);
free(alpha_h);
free(omega_h);
cudaFree(A_d);
cudaFree(T_d);
cudaFree(Delta_d);
cudaFree(E_d);
cudaFree(p_d);
cudaFree(p2_d);
cudaFree(D_d);
cudaFree(ones_d);
cudaFree(Times_d);
cudaFree(mu_d);
cudaFree(alpha_d);
cudaFree(omega_d);
return 0;
}
The Kernel code on the GPU is:
/*****************************************************************************************/
#include <stdio.h>
#define TILE_SIZE 16
#define BLOCK_SIZE 512
__global__ void mysgemm(int m, int n, int k, const double *A, const double *B, double* C) {
__shared__ float ds_A[TILE_SIZE][TILE_SIZE];
__shared__ float ds_B[TILE_SIZE][TILE_SIZE];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = (by*TILE_SIZE+ty);//%m;
int col = (bx*TILE_SIZE+tx);//%n;
float pvalue = 0;
for(int i=0;i<(k-1)/TILE_SIZE+1;++i)
{
if((i*TILE_SIZE +tx < k) && (row < m))
ds_A[ty][tx] = A[row*k+i*TILE_SIZE+tx];
else ds_A[ty][tx] = 0;
if((i*TILE_SIZE+ty < k) && (col < n))
ds_B[ty][tx] = B[(i*TILE_SIZE+ty)*n+col]; // Load data into shared memory
else ds_B[ty][tx] = 0;
__syncthreads();
if(row < m && col < n)
{
for(int j=0;j<TILE_SIZE;++j)
{
//if(j < k)
pvalue += ds_A[ty][j]*ds_B[j][tx];
}
}
__syncthreads();
}
if(row < m && col < n)
C[row*n+col] = pvalue;
}
// Kernel to multiply each element in A by the corresponding element in B and store
// the result to the corresponding element in C. All vectors should be of length m
__global__ void elem_mul(int m, const double *A, const double *B, double* C)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
C[i] = A[i]*B[i];
}
// Kernel for parallel sum
__global__ void reduction(double *out, double *in, unsigned size)
{
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int t = threadIdx.x;
unsigned int start = 2*blockIdx.x*blockDim.x;
if(start + t >= size)
partialSum[t] = 0;
else partialSum[t] = in[start+t];
if(start + blockDim.x+t>= size)
partialSum[blockDim.x+t] = 0;
else partialSum[blockDim.x+t] = in[start + blockDim.x+t];
for(unsigned int stride = 1; stride <=blockDim.x; stride*=2)
{
__syncthreads();
if(t % stride ==0)
partialSum[2*t]+=partialSum[2*t+stride];
}
__syncthreads();
out[blockIdx.x] = partialSum[0];
}
// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
dim3 dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
reduction<<<dimGrid,dimBlock>>>(out,temp,m);
}
// Kernel to multiply each element in the matrix out in the following manner:
// out(i,j) = in(i) - in(j)
__global__ void fill(int m, const double *in, double *out)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = tx+bx*blockDim.x;
int j = ty+by*blockDim.y;
if((i < m) && (j < m))
out[i*m+j] = in[i]-in[j];
}
// Kernel to fill the matrix out with the formula out(i,j) = exp(-omega*T(i.j))
__global__ void fill_E(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = exp(-coeff * in[i]);
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_mul(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff * in[i];
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_add(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff + in[i];
}
// Kernel to update vector p2
__global__ void update_p2(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = coeff/in[i];
}
// Kernel to update matrix p
__global__ void update_p(int m, double* p2, double *denom, double *num, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
// loop through columns j
for(int j=0; j<m; ++j)
{
if(i == j)
out[i*m + j] = p2[i];
else if(i < m)
out[i*m + j] = num[i*m+j]/denom[i];
}
}
/*****************************************************************************************/
// int size: length of the Time-series vectors. Also the number of rows and columns in input matrices
// double mu: One of three parameters calibrated
// double alpha: One of three parameters calibrated
// double omega: One of three parameters calibrated
// double* A: A matrix filled out and used to calibrate
// double* T: A distance matrix T(i,j) = Times[i]-Times[j]
// double* Delta: A dissimilarity matrix Delta(i,j) = 1 if i > j, 0 otherwise
// double* E: A matrix filled out and used to calibrate--E(i,j) = exp(-omega*T(i,j))
// double* p: A probability matrix of cross excitations
// double* p2: A vector of self-excitation probabilities
// double* ones: A (size x 1) vector of 1's used in inner products and identity transformations
// double* Times: A (size x 1) vector of time series data to be calibrated
// int MAX_ITER: The maximum number of iterations allowed in the calibration
// double* TOL: The error tolerance or accuracy allowed in the calibration
// double* temp_1: A (size x 1) temporary vector used in intermediate calculations
// double* temp_2: A temporary matrix used in intermediate calculations
/*****************************************************************************************/
void calibrate(int size, double *mu, double *alpha, double *omega, double *A, double *T, double *Delta, double *E, double *p, double *p2, double *D, double* ones, double *Times, int MAX_ITER, double TOL, double* temp_1, double* temp_2)
{
//1) (a) Perform inner product to start initial values of mu, alpha, and omega
*mu = .11; // ERROR IS HERE!!
/*
inner_product(mu, size, Times, ones, temp_1);
double a = *(mu);
a = a/size;
*mu = .11;
/*
/size;
*alpha = *mu;
*omega = *mu;
double mu_t = 0;
double alpha_t = 0;
double omega_t = 0;
double err = 0;
int ctr = 0;
//1) (b) Fill out matrix T of time differences
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill<<<dimGrid,dimBlock>>>(size, Times, T);
while(ctr < MAX_ITER && err < TOL)
{
// 2) Fill out matrix E
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill_E<<<dimGrid,dimBlock>>>(size, omega, T, E);
// 3) Update matrix A
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
scal_mult<<<dimGrid,dimBlock>>>(size,size, alpha, delta, A);
scal_mult<<<dimGrid,dimBlock>>>(size,size, omega, A, A);
dim3 dimGrid((n-1)/TILE_SIZE+1,(m-1)/TILE_SIZE+1,1);
dim3 dimBlock(TILE_SIZE,TILE_SIZE,1);
mysgemm<<<dimGrid,dimBlock>>>(size,size,size,A,E,A)
// 4) Update matrix D
mysgemm<<<dimGrid,dimBlock>>>(size,size,1,A,ones,D);
scal_add<<<dimGrid,dimBlock>>>(size,size, mu, D, D);
// 5) Update matrix p and vector p2
update_p2<<<dimGrid,dimBlock>>>(size,mu, D, p2);
update_p<<<dimGrid,dimBlock>>>(size,p2, D, A, p);
// 6) Update parameters mu, alpha, omega
inner_product(mu_t, size, p2, ones, temp_1);
mu_t /=Times[size-1];
reduction<<<dimGrid,dimBlock>>>(alpha_t,p,size*size);
alpha_t/= size;
// Treat T and p as very long vectors and calculate the inner product
inner_product(omega_t, size*size, T, p, temp_2);
omega_t = alpha_t/omega_t;
// 7) Update error
ctr++;
err = (mu - mu_t)*(mu - mu_t) + (alpha-alpha_t)*(alpha-alpha_t) + (omega-omega_t)*(omega-omega_t);
mu = mu_t;
alpha = alpha_t;
omega = omega_t;
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("CUDA error: %s\n",cudaGetErrorString(error));
exit(-1);
}
}
*/
}
However, I think 99% of this code isn't relevant to the issue (I use nothing from "support.h" at the moment. Basically, I get an error de-referencing the pointer on the GPU, even though it is presumably not null. Thanks!
If you do proper cuda error checking you'll discover another problem with your code, this line:
cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice);
should be something like this:
cudaMemcpy(Times_d,Times_h,vec_size*sizeof(double), cudaMemcpyHostToDevice);
However that's not the crux of the issue. It took me a while to figure out that you are not making any kernel calls. If you call a kernel, all the parameters you pass to that kernel must be accessible by the device. So if you pass a pointer, the pointer must point to device memory. You are doing this with mu_d which is a device pointer:
calibrate(vec_size,mu_d,...
But your calibrate is not a kernel!!
It's an ordinary host function running on the host (CPU). So when you try and dereference the device pointer mu_d in host code:
*mu = .11; // ERROR IS HERE!!
You get a seg fault. I'm not sure why you're trying to debug this way, but simply converting kernel calls to host routines, while leaving all the parameters the same, is not a valid way to debug.
Fundamental CUDA rules (ignoring cuda 6 Unified Memory):
you cannot dereference a host pointer in device code
you cannot dereference a device pointer in host code
Your code is a violation of the 2nd rule above.

error: expected a ")" on CUDA kernel code

Hi I am writing some code in CUDA C that uses many kernels. The kernel.cu and main.cu are pictured below.
kernel.cu: (almost everything is commented out in the host code at the bottom)
/******************************************************************************
*cr
******************************************************************************/
#include <stdio.h>
#define TILE_SIZE 16
#define BLOCK_SIZE 512
/****************************************************************/
// Kernel for matrix multiplication:
// A: m x n matrix
// B: n x k matrix
// C = A x B: m x k matrix
__global__ void mysgemm(int m, int n, int k, const double *A, const double *B, double* C) {
__shared__ float ds_A[TILE_SIZE][TILE_SIZE];
__shared__ float ds_B[TILE_SIZE][TILE_SIZE];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int row = (by*TILE_SIZE+ty);//%m;
int col = (bx*TILE_SIZE+tx);//%n;
float pvalue = 0;
for(int i=0;i<(k-1)/TILE_SIZE+1;++i)
{
if((i*TILE_SIZE +tx < k) && (row < m))
ds_A[ty][tx] = A[row*k+i*TILE_SIZE+tx];
else ds_A[ty][tx] = 0;
if((i*TILE_SIZE+ty < k) && (col < n))
ds_B[ty][tx] = B[(i*TILE_SIZE+ty)*n+col]; // Load data into shared memory
else ds_B[ty][tx] = 0;
__syncthreads();
if(row < m && col < n)
{
for(int j=0;j<TILE_SIZE;++j)
{
//if(j < k)
pvalue += ds_A[ty][j]*ds_B[j][tx];
}
}
__syncthreads();
}
if(row < m && col < n)
C[row*n+col] = pvalue;
}
/****************************************************************/
/****************************************************************/
// Kernel to multiply each element in A by the corresponding element in B and store
// the result to the corresponding element in C. All vectors should be of length m
__global__ void elem_mul(int m, const double *A, const double *B, double* C)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
C[i] = A[i]*B[i];
}
/****************************************************************/
// Kernel for parallel sum
__global__ void reduction(double *out, double *in, unsigned size)
{
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int t = threadIdx.x;
unsigned int start = 2*blockIdx.x*blockDim.x;
if(start + t >= size)
partialSum[t] = 0;
else partialSum[t] = in[start+t];
if(start + blockDim.x+t>= size)
partialSum[blockDim.x+t] = 0;
else partialSum[blockDim.x+t] = in[start + blockDim.x+t];
for(unsigned int stride = 1; stride <=blockDim.x; stride*=2)
{
__syncthreads();
if(t % stride ==0)
partialSum[2*t]+=partialSum[2*t+stride];
}
__syncthreads();
out[blockIdx.x] = partialSum[0];
}
// Uses several kernels to compute the inner product of A and B
void inner_product(double *out, int m, const double *A, const double* B, double* temp)
{
dim3 dimGrid((m-1)/BLOCK_SIZE+1,(m-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
elem_mul<<<dimGrid,dimBlock>>>(m,A,B,temp);
reduction<<<dimGrid,dimBlock>>>(out,temp,m);
}
// Kernel to multiply each element in the matrix out in the following manner:
// out(i,j) = in(i) - in(j)
__global__ void fill(int m, const double *in, double *out)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
int i = tx+bx*blockDim.x;
int j = ty+by*blockDim.y;
if((i < m) && (j < m))
out[i*m+j] = in[i]-in[j];
}
// Kernel to fill the matrix out with the formula out(i,j) = exp(-omega*T(i.j))
__global__ void fill_E(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = exp(-coeff * in[i]);
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_mul(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff * in[i];
}
// Kernel for scalar multiplication for an mxk matirx and a coefficient coeff
__global__ void scal_add(int m, int k, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m*k)
out[i] = coeff + in[i];
}
/****************************************************************/
// Kernel to update vector p2
__global__ void update_p2(int m, double coeff, double *in, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
if(i < m)
out[i] = coeff/in[i];
}
/****************************************************************/
/****************************************************************/
// Kernel to update matrix p
__global__ void update_p(int m, double* p2, double *denom, double *num, double *out)
{
int bx = blockIdx.x;
int tx = threadIdx.x;
int i = tx+bx*blockDim.x;
// loop through columns j
for(int j=0; j<m; ++j)
{
if(i == j)
out[i*m + j] = p2[i];
else if(i < m)
out[i*m + j] = num[i*m+j]/denom[i];
}
}
/****************************************************************/
/****************************************************************/
// Kernel to update the error, counter, and parameter variables
__global__ void update(int* counter, double* error, double *mu, double *mu_temp, double* alpha, double* alpha_temp, double* omega, double* omega_temp)
{
counter = counter + 1;
*error = (mu - mu_temp)*(mu - mu_temp) + (alpha-alpha_temp)*(alpha-alpha_temp) + (omega-omega_temp)*(omega-omega_temp);
mu = mu_temp;
alpha = alpha_temp;
omega = omega_temp;
}
/****************************************************************/
/****************************************************************/
// Kernel to assign old * coeff + inc to new
__global__ void assign(double* new, double* old, double coeff, double inc)
{
*new = 5.0;
}
/****************************************************************/
/******************************************************************************************************/
// Does so via an iterative procedure. Variables:
// int size: length of the Time-series vectors. Also the number of rows and columns in input matrices
// double mu: One of three parameters calibrated
// double alpha: One of three parameters calibrated
// double omega: One of three parameters calibrated
// double* A: A matrix filled out and used to calibrate
// double* T: A distance matrix T(i,j) = Times[i]-Times[j]
// double* Delta: A dissimilarity matrix Delta(i,j) = 1 if i > j, 0 otherwise
// double* E: A matrix filled out and used to calibrate--E(i,j) = exp(-omega*T(i,j))
// double* p: A probability matrix of cross excitations
// double* p2: A vector of self-excitation probabilities
// double* ones: A (size x 1) vector of 1's used in inner products and identity transformations
// double* Times: A (size x 1) vector of time series data to be calibrated
// int MAX_ITER: The maximum number of iterations allowed in the calibration
// double* TOL: The error tolerance or accuracy allowed in the calibration
// double* temp_1: A (size x 1) temporary vector used in intermediate calculations
// double* temp_2: A temporary matrix used in intermediate calculations
// double* temp_3: A temporary scalar used in intermediate calculations
/******************************************************************************************************/
void calibrate(int size, double *mu, double *mu_t, double *alpha, double *alpha_t, double *omega, double *omega_t, double *A, double *T, double *Delta, double *E, double *p, double *p2, double *D, double* ones, double *Times, int *ctr, double *err, double* temp_1, double* temp_2, double* temp_3)
{
//1) (a) Perform inner product to start initial values of mu, alpha, and omega
inner_product(temp_3, size, Times, ones, temp_1); // Inner product of Time series
dim3 dimGrid(1,1,1);
dim3 dimBlock(1,1,1);
assign<<<dimGrid,dimBlock>>>(mu_t,temp_3,1.1,0); // Assign mu_t to be temp_3*(1/size) (the average)
assign<<<dimGrid,dimBlock>>>(alpha_t,temp_3,1.1,0); // Assign mu_t to be temp_3*(1/size) (the average)
assign<<<dimGrid,dimBlock>>>(omega_t,temp_3,1.1,0); // Assign mu_t to be temp_3*(1/size) (the average)
/*
//1) (b) Fill out matrix T of time differences
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill<<<dimGrid,dimBlock>>>(size, Times, T);
// 2) Fill out matrix E
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
fill_E<<<dimGrid,dimBlock>>>(size, omega, T, E);
// 3) Update matrix A
dim3 dimGrid((size-1)/BLOCK_SIZE+1,(size-1)/BLOCK_SIZE+1,1);
dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE,1);
scal_mult<<<dimGrid,dimBlock>>>(size,size, alpha, delta, A);
scal_mult<<<dimGrid,dimBlock>>>(size,size, omega, A, A);
dim3 dimGrid((n-1)/TILE_SIZE+1,(m-1)/TILE_SIZE+1,1);
dim3 dimBlock(TILE_SIZE,TILE_SIZE,1);
mysgemm<<<dimGrid,dimBlock>>>(size,size,size,A,E,A)
// 4) Update matrix D
mysgemm<<<dimGrid,dimBlock>>>(size,size,1,A,ones,D);
scal_add<<<dimGrid,dimBlock>>>(size,size, mu, D, D);
// 5) Update matrix p and vector p2
update_p2<<<dimGrid,dimBlock>>>(size,mu, D, p2);
update_p<<<dimGrid,dimBlock>>>(size,p2, D, A, p);
// 6) Update parameters mu, alpha, omega
inner_product(mu_t, size, p2, ones, temp_1);
mu_t /=Times[size-1];
reduction<<<dimGrid,dimBlock>>>(alpha_t,p,size*size);
alpha_t/= size;
// Treat T and p as very long vectors and calculate the inner product
inner_product(omega_t, size*size, T, p, temp_2);
omega_t = alpha_t/omega_t;
// 7) Update error
dim3 dimGrid(1,1,1);
dim3 dimBlock(1,1,1);
update<<<dimGrid,dimBlock>>>(ctr,err,mu,mu_t,alpha,alpha_t,omega,omega_t)
cudaError_t error = cudaGetLastError();
if(error != cudaSuccess)
{
printf("CUDA error: %s\n",cudaGetErrorString(error));
exit(-1);
}
*/
}
main.cu (i don't use support.h yet)
/********************************************
*cr
cr
*********************************************/
#include <stdio.h>
#include <stdlib.h>
#include "kernel.cu"
#include "support.h"
int main (int argc, char *argv[])
{
Timer timer;
cudaError_t cuda_ret;
// Initialize host variables ----------------------------------------------
printf("\nSetting up the problem...\n"); fflush(stdout);
startTime(&timer);
double* A_h, *T_h, *Delta_h, *E_h, *p_h, *p2_h, *D_h, *Times_h, *ones_h;
double* A_d, *T_d, *Delta_d, *E_d, *p_d, *p2_d, *D_d, *Times_d, *ones_d, *temp_1, *temp_2, *temp_3;
double* mu_h, *alpha_h, *omega_h; // hawkes parameters on host
double* mu_d, *alpha_d, *omega_d; // hawkes parameters on device
double* mu_t_d, *alpha_t_d, *omega_t_d; // hawkes temporary parameters on device
double* err_h, *err_d; // Iterative variables for hohst and device
int* ctr_h, *ctr_d;
int N;
unsigned int mat_size, vec_size;
// Import data
FILE *fp;
char str[60];
unsigned int count=0;
double d;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
++count;
// Stick with a limited subset of the data for now to avoid using too much host memory
N = 2000;
fclose(fp);
printf("Count is %u \n",count);
mat_size = N*N;
vec_size = N;
dim3 dim_grid, dim_block;
// Fill matrices with 0's
A_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { A_h[i] = 0; }
T_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { T_h[i] = 0; }
Delta_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { Delta_h[i] = 0; }
E_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { E_h[i] = 0; }
p_h = (double*) malloc( sizeof(double)*mat_size );
for (unsigned int i=0; i < mat_size; ++i) { p_h[i] = 0; }
// Fill vectors with 0's, except the 1's vector
p2_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { p2_h[i] = 0; }
Times_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { Times_h[i] = 0; }
D_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { D_h[i] = 0; }
ones_h = (double*) malloc( sizeof(double)*vec_size );
for (unsigned int i=0; i < vec_size; ++i) { ones_h[i] = 0; }
// Start constants as zero
mu_h = (double*) malloc( sizeof(double));
alpha_h = (double*) malloc( sizeof(double));
omega_h = (double*) malloc( sizeof(double));
err_h = (double*) malloc( sizeof(double));
ctr_h = (int*) malloc( sizeof(int));
*mu_h = 0;
*alpha_h = 0;
*omega_h = 0;
*err_h = 0;
*ctr_h = 0;
// Import data
count=0;
/* opening file for reading */
fp = fopen("AAPL_data.txt","r");
if(fp == NULL) {
perror("Error opening file");
return(-1);
}
while(fgets (str, 60, fp)!=NULL)
{
sscanf(str, "%lf", &d);
if(count < vec_size)
Times_h[count] = d;
++count;
}
fclose(fp);
/*printf("TIMES VECTOR: \n");
for (unsigned int i=0; i < vec_size; ++i)
{
printf("TIMES_H[ %u ] is ",i);
printf("%f \n", Times_h[i]);
}*/
printf("Count is %u \n",count);
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cudaMalloc((void**) &A_d, mat_size*sizeof(double)); // Create device variable for matrix A
cudaMalloc((void**) &T_d, mat_size*sizeof(double)); // Create device variable for matrix T
cudaMalloc((void**) &Delta_d, mat_size*sizeof(double)); // Create device variable for matrix Delta
cudaMalloc((void**) &E_d, mat_size*sizeof(double)); // Create device variable for matrix E
cudaMalloc((void**) &p_d, mat_size*sizeof(double)); // Create device variable for matrix p
cudaMalloc((void**) &p2_d, vec_size*sizeof(double)); // Create device variable for vector p2
cudaMalloc((void**) &D_d, vec_size*sizeof(double)); // Create device variable for vector D
cudaMalloc((void**) &Times_d, vec_size*sizeof(double)); // Create device variable for vector Times
cudaMalloc((void**) &ones_d, vec_size*sizeof(double)); // Create device variable for vector ones
// Parameters and intermediate parameters
cudaMalloc((void**) &mu_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_d, sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &mu_t_d, sizeof(double)); // Create device variable for constant mu
cudaMalloc((void**) &alpha_t_d, sizeof(double)); // Create device variable for constant alpha
cudaMalloc((void**) &omega_t_d, sizeof(double)); // Create device variable for constant omega
// Temporary variables
cudaMalloc((void**) &temp_1, vec_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_2, mat_size*sizeof(double)); // Create device variable for constant omega
cudaMalloc((void**) &temp_3, sizeof(double)); // Create device variable for constant omega
// Iteration variables
cudaMalloc((void**) &err_d, sizeof(double)); // Create device variable for iterative counters
cudaMalloc((void**) &ctr_d, sizeof(int));
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cudaMemcpy(A_d,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(T_d,T_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Delta_d,Delta_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(E_d,E_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p_d,p_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(p2_d,p2_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(D_d,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ones_d,ones_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(Times_d,Times_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
// Parameters and intermediate parameters
cudaMemcpy(mu_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(mu_t_d,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(alpha_t_d,alpha_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(omega_t_d,omega_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
// Temporary variables
cudaMemcpy(temp_1,D_h,vec_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_2,A_h,mat_size*sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(temp_3,mu_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
// Iteration variables
cudaMemcpy(err_d,err_h,sizeof(double), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaMemcpy(ctr_d,ctr_h,sizeof(int), cudaMemcpyHostToDevice); // Copy from host var to device var
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel using standard sgemm interface ---------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
int MAX_ITER = 100;
double TOL = .001;
//while(ctr_h < MAX_ITER && err_h < TOL)
//{
calibrate(vec_size,mu_d, mu_t_d, alpha_d, alpha_t_d, omega_d, omega_t_d, A_d, T_d, Delta_d, E_d, p_d,
p2_d, D_d, ones_d, Times_d, ctr_d, err_d, temp_1, temp_2, temp_3);
// cudaMemcpy(err_h,err_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
// cudaMemcpy(ctr_h,ctr_d,sizeof(int), cudaMemcpyDeviceToHost); // Copy from device var to host var
//}
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host...\n"); fflush(stdout);
startTime(&timer);
cudaMemcpy(mu_h,mu_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(alpha_h,alpha_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
cudaMemcpy(omega_h,omega_d,sizeof(double), cudaMemcpyDeviceToHost); // Copy from device var to host var
printf("mu is %f: \n",*mu_h);
printf("alpha is %f: \n",*alpha_h);
printf("omega is %f: \n",*omega_h);
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Free memory ------------------------------------------------------------
free(A_h);
free(T_h);
free(Delta_h);
free(E_h);
free(p_h);
free(p2_h);
free(D_h);
free(ones_h);
free(Times_h);
free(mu_h);
free(alpha_h);
free(omega_h);
cudaFree(A_d);
cudaFree(T_d);
cudaFree(Delta_d);
cudaFree(E_d);
cudaFree(p_d);
cudaFree(p2_d);
cudaFree(D_d);
cudaFree(ones_d);
cudaFree(Times_d);
cudaFree(mu_d);
cudaFree(alpha_d);
cudaFree(omega_d);
return 0;
}
I want to avoid copying CUDA memory variables back to the host to do simple arithmetic on them, so I made the kernel
global void assign(double* new, double* old, double coeff, double inc)
in kernel.cu to calculate *new = (*old)*coeff + inc on the GPU. However, right now I am just having it store 5 to new. I cannot even compile yet:
kernel.cu(211): error: expected a ")" and have no clue why.
new is a reserved word in C++ and CUDA C. Don't use it as a variable name.

atomicAdd causing error Unable to Launch/Execute Kernel

I have the following CUDA C code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
which causes my program to crash with the error: "unable to launch/execute kernel"
Here buffer is an input array of integers to this function of size elements and histo_private is an array of integers in shared memory of histo_size elements.
I know this isn't an index out of bounds error because when I use the code:
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
int a = histo_private[buffer[i]];
i+=stride;
}
So I gather that there is something wrong with the atomicAdd function and/or the memory address of this 32-bit int array.
The kernel.cu file contains the following code:
// Define your kernels in this file you may use more than one kernel if you
// need to
// INSERT KERNEL(S) HERE
__global__ void histo_kernel(unsigned int* buffer, unsigned int size, int* histo, unsigned int histo_size)
{
extern __shared__ int histo_private[];
if(threadIdx.x < histo_size)
histo_private[threadIdx.x] = 0;
__syncthreads();
// compute block's histogram
int i = threadIdx.x + blockIdx.x*blockDim.x;
int stride = blockDim.x*gridDim.x;
while(i < size)
{
//int a = histo_private[buffer[i]];
atomicAdd(&(histo_private[buffer[i]]),1);
i+=stride;
}
// store to global histogram
__syncthreads();
//if(threadIdx.x < histo_size)
// atomicAdd(&(histo[threadIdx.x]),histo_private[threadIdx.x]);
}
// ensures that no bins contains more than 255 elements
__global__ void enforce_saturation(int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
{
if(histo[i] > 255) // this will be necessary to prevent data loss
histo[i] = 255; // when converting from int to uint8_t
}
}
__global__ void construct_histo(uint8_t* histo_unpacked, int* histo, unsigned int histo_size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < histo_size)
histo_unpacked[i] = histo[i];
}
// unpacks the input array into an output array with 'spaces'
__global__ void unpack(uint8_t* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[4*i] = in[i];
out[4*i+1] = 0;
out[4*i+2] = 0;
out[4*i+3] = 0;
}
}
// converts the input uint8_t array to an int array
__global__ void convert(uint8_t* in, int* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (int) in[4*i];
}
}
// converts the input int array to a uint8_t array
__global__ void convert_back(int* in, uint8_t* out, unsigned int size)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if(i < size)
{
out[i] = (uint8_t) in[i];
}
}
void histogram(unsigned int* input, uint8_t* bins, unsigned int num_elements, unsigned int num_bins)
{
int BLOCK_SIZE = (int) num_bins;
BLOCK_SIZE = 512;
dim3 dim_grid, dim_block;
dim_block.x = BLOCK_SIZE; dim_block.y = dim_block.z = 1;
dim_grid.x = 1+(num_elements-1)/BLOCK_SIZE; dim_grid.y = dim_grid.z = 1;
// create an array of uint8_t to be converted into an array of int
uint8_t* bins_unpacked;
cudaMalloc((void**)&bins_unpacked, 4 * num_bins * sizeof(uint8_t));
// unpack the input uint8_t array
unpack<<<dim_grid,dim_block>>>(bins, bins_unpacked, num_bins);
// need an int version of bins_d
int* bins_int_d;
cudaMalloc((void**)&bins_int_d, num_bins * sizeof(int));
// convert the uint8_t array to an int array
convert<<<dim_grid,dim_block>>>(bins_unpacked, bins_int_d, num_bins);
// run kernel and enforce saturation requirements
int histo_private_size = num_bins;
histo_kernel<<<dim_grid,dim_block,histo_private_size>>>(input, num_elements, bins_int_d, num_bins);
enforce_saturation<<<dim_grid,dim_block>>>(bins_int_d,num_bins);
// convert the int array back to uint8_t
convert_back<<<dim_grid,dim_block>>>(bins_int_d, bins, num_bins);
}
While the function that calls this last histogram function is in main.cu (I did NOT make this second file--it was provided to me--also, I have been testing this on consistent data by compiling via make test-mode):
#include <stdio.h>
#include <stdint.h>
#include "support.h"
#include "kernel.cu"
int main(int argc, char* argv[])
{
Timer timer;
// Initialize host variables ----------------------------------------------
#if TEST_MODE
printf("\n***Running in test mode***\n"); fflush(stdout);
#endif
printf("\nSetting up the problem..."); fflush(stdout);
startTime(&timer);
unsigned int *in_h;
uint8_t* bins_h;
unsigned int *in_d;
uint8_t* bins_d;
unsigned int num_elements, num_bins;
cudaError_t cuda_ret;
if(argc == 1) {
num_elements = 1000000;
num_bins = 4096;
} else if(argc == 2) {
num_elements = atoi(argv[1]);
num_bins = 4096;
} else if(argc == 3) {
num_elements = atoi(argv[1]);
num_bins = atoi(argv[2]);
} else {
printf("\n Invalid input parameters!"
"\n Usage: ./histogram # Input: 1,000,000, Bins: 4,096"
"\n Usage: ./histogram <m> # Input: m, Bins: 4,096"
"\n Usage: ./histogram <m> <n> # Input: m, Bins: n"
"\n");
exit(0);
}
initVector(&in_h, num_elements, num_bins);
bins_h = (uint8_t*) malloc(num_bins*sizeof(uint8_t));
// TESTING
for(unsigned int i = 0; i < num_bins; ++i)
{
bins_h[i] = i;
//printf("uint8_t Element %u: is %u \n", i, bins_h[i]);
}
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
printf(" Input size = %u\n Number of bins = %u\n", num_elements,
num_bins);
// Allocate device variables ----------------------------------------------
printf("Allocating device variables..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMalloc((void**)&in_d, num_elements * sizeof(unsigned int));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cuda_ret = cudaMalloc((void**)&bins_d, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy host variables to device ------------------------------------------
printf("Copying data from host to device..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(in_d, in_h, num_elements * sizeof(unsigned int),
cudaMemcpyHostToDevice);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cuda_ret = cudaMemset(bins_d, 0, num_bins * sizeof(uint8_t));
if(cuda_ret != cudaSuccess) FATAL("Unable to set device memory");
// TESTING
//cuda_ret = cudaMemcpy(bins_d, bins_h, num_bins * sizeof(uint8_t),
// cudaMemcpyHostToDevice);
//if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to the device");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
startTime(&timer);
histogram(in_d, bins_d, num_elements, num_bins);
cuda_ret = cudaDeviceSynchronize();
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
// Copy device variables from host ----------------------------------------
printf("Copying data from device to host..."); fflush(stdout);
startTime(&timer);
cuda_ret = cudaMemcpy(bins_h, bins_d, num_bins * sizeof(uint8_t),
cudaMemcpyDeviceToHost);
if(cuda_ret != cudaSuccess) FATAL("Unable to copy memory to host");
cudaDeviceSynchronize();
stopTime(&timer); printf("%f s\n", elapsedTime(timer));
#if TEST_MODE
printf("\nResult:\n");
for(unsigned int binIdx = 0; binIdx < num_bins; ++binIdx) {
printf("Bin %u: %u elements\n", binIdx, bins_h[binIdx]);
}
printf("\nElements Vec:\n");
for(unsigned int i = 0; i < num_elements; ++i) {
printf("Element %u: %u is \n", i, in_h[i]);
}
#endif
// Verify correctness -----------------------------------------------------
printf("Verifying results..."); fflush(stdout);
verify(in_h, bins_h, num_elements, num_bins);
// Free memory ------------------------------------------------------------
cudaFree(in_d); cudaFree(bins_d);
free(in_h); free(bins_h);
return 0;
}
Turns out that this was just an index out of bounds error. The element buffer[i] was greater than the length of histo_private. As another poster mentioned, this was not obvious due to the following artifact of the c compiler:
The compiler is permitted to assume every access is within bounds. That line of my test code did nothing if the access is within bounds and therefore the compiler is permitted to assume that line of code does nothing. Thus it didn't require an access so the successful run of the test code was misleading. Once that line was changed to where the variable hist_private was modified at buffer[i], runtime errors came about.

Resources