cublasStrsmBatched - execution failed - c

I can't run cublasStrsmBatched (line 113) without CUBLAS_STATUS_EXECUTION_FAILED (13) output. To simplify, all matrix values and alpha are 1.0, all matrices are square and lda, ldb, m and n are equal.
I am able to run cublasSgemmBatched and cublasStrsm in the same way, with no error. cublasStrsmBatched should be the same, but it is not, not for me.
Please tell me if you have any idea what I am doing wrong in this code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
cublasHandle_t handle;
void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);
void TestCublasStrsmBatched(int size, int numOfLinSys);
int main()
{
    /* Bring up the shared cuBLAS context; the checker exits on failure. */
    CheckCublasCreate(cublasCreate(&handle));

    /* First argument: order of the square matrices.
       Second argument: number of linear systems in the batch. */
    TestCublasStrsmBatched(2, 2);

    /* Tear the context down again and verify that it went cleanly. */
    CheckCublasDestroy(cublasDestroy(handle));
}
/*
 * Builds `numOfLinSys` independent size x size systems (every entry of A and
 * B is 1.0f), uploads them to the device and issues a single
 * cublasStrsmBatched call over the whole batch (left side, lower fill mode,
 * no transpose, non-unit diagonal, alpha = 1).
 *
 * size        - order of each square matrix; also used for lda and ldb.
 * numOfLinSys - number of systems in the batch.
 *
 * Any failing step terminates the process through the Check* helpers.
 * All host and device buffers are released before returning.
 */
void TestCublasStrsmBatched(int size, int numOfLinSys)
{
    cublasStatus_t status;
    cudaError_t error;
    float **h_A;    /* host table of host-resident A matrices          */
    float **h_B;    /* host table of host-resident B matrices          */
    float **hd_A;   /* host table of device pointers to the A matrices */
    float **hd_B;   /* host table of device pointers to the B matrices */
    float **d_A;    /* device copy of hd_A, passed to the batched call */
    float **d_B;    /* device copy of hd_B, passed to the batched call */
    float alpha = 1.0f;
    const int n = size;
    const int m = size;
    const int lda = m;
    const int ldb = m;
    const int matA_numOfElem = m * m;
    const int matB_numOfElem = m * n;
    int i, j;

    h_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_A);
    h_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(h_B);

    /* Host matrices, all entries set to one. */
    for (j = 0; j < numOfLinSys; j++) {
        h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
        CheckAllocateHost(h_A[j]);   /* check the matrix, not the table */
        for (i = 0; i < matA_numOfElem; i++)
            h_A[j][i] = 1.0f;
        h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
        CheckAllocateHost(h_B[j]);   /* check the matrix, not the table */
        for (i = 0; i < matB_numOfElem; i++)
            h_B[j][i] = 1.0f;
    }

    hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_A);
    hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
    CheckAllocateHost(hd_B);

    /* One device buffer per matrix, filled from its host counterpart. */
    for (j = 0; j < numOfLinSys; j++) {
        error = cudaMalloc((void **)&hd_A[j],
                           matA_numOfElem * sizeof(float));
        CheckCudaMalloc(error);
        error = cudaMalloc((void **)&hd_B[j],
                           matB_numOfElem * sizeof(float));
        CheckCudaMalloc(error);
        status = cublasSetMatrix(m, m, sizeof(float),
                                 h_A[j], lda, hd_A[j], lda);
        CheckCublasSetGetMatrix(status);
        status = cublasSetMatrix(m, n, sizeof(float),
                                 h_B[j], ldb, hd_B[j], ldb);
        CheckCublasSetGetMatrix(status);
    }

    /* Upload the pointer tables themselves. */
    error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);
    error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
    CheckCudaMalloc(error);
    error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*),
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);
    error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*),
                       cudaMemcpyHostToDevice);
    CheckCudaMemcpy(error);

    /*After cublasStrsmBatched call
    status changes to CUBLAS_STATUS_EXECUTION_FAILED (13)*/
    status = cublasStrsmBatched(handle,
                                CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                                m, n, &alpha, d_A, lda, d_B, ldb, numOfLinSys);
    CheckKernelExecution(status);

    /* Release everything that was allocated above. */
    cudaFree(d_A);
    cudaFree(d_B);
    for (j = 0; j < numOfLinSys; j++) {
        cudaFree(hd_A[j]);
        cudaFree(hd_B[j]);
        free(h_A[j]);
        free(h_B[j]);
    }
    free(hd_A);
    free(hd_B);
    free(h_A);
    free(h_B);
}
/* Exits with a diagnostic on stderr unless `status` is CUBLAS_STATUS_SUCCESS. */
void CheckCublasCreate( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! CUBLAS initialization error \n");
    exit(EXIT_FAILURE);
}
/* Exits with a diagnostic on stderr when `h_pointer` is a null pointer. */
void CheckAllocateHost( void* h_pointer )
{
    if (h_pointer != 0)
        return;
    fprintf(stderr, "!!!! host memory allocation error \n");
    exit(EXIT_FAILURE);
}
/* Exits, printing the runtime's error string, unless `error` is cudaSuccess. */
void CheckCudaMalloc( cudaError_t error )
{
    if (error == cudaSuccess)
        return;
    fprintf(stderr,
            "!!!! device memory allocation error (error code %s)\n",
            cudaGetErrorString(error));
    exit(EXIT_FAILURE);
}
/* Exits, printing the runtime's error string, unless `error` is cudaSuccess. */
void CheckCudaMemcpy( cudaError_t error )
{
    if (error == cudaSuccess)
        return;
    fprintf(stderr, "!!!! data copy error (error code %s)\n",
            cudaGetErrorString(error));
    exit(EXIT_FAILURE);
}
/* Exits with a diagnostic on stderr unless `status` is CUBLAS_STATUS_SUCCESS. */
void CheckCublasSetGetMatrix( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! device access error \n");
    exit(EXIT_FAILURE);
}
/* Exits with a diagnostic on stderr unless `status` is CUBLAS_STATUS_SUCCESS. */
void CheckKernelExecution( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! kernel execution error.\n");
    exit(EXIT_FAILURE);
}
/* Exits with a diagnostic on stderr unless `status` is CUBLAS_STATUS_SUCCESS. */
void CheckCublasDestroy( cublasStatus_t status )
{
    if (status == CUBLAS_STATUS_SUCCESS)
        return;
    fprintf(stderr, "!!!! shutdown error \n");
    exit(EXIT_FAILURE);
}
Using Linux, CUDA 5.5, T10 and Windows, CUDA 5.5, GTX285
Thanks!

The batched triangular backsolver is something I hadn't tried before in CUBLAS, so I was interested to take a look and see what might be going on. Your code is rather complex, so I didn't bother trying to understand it, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:
$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
========= Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
========= Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
========= Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
========= Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
========= Host Frame:[0x1]
(This is an OS X machine with a compute 1.2 GPU and CUDA 5.0). Error 8 is cudaErrorInvalidDeviceFunction, which usually only comes up when a library or fatbinary doesn't have an architecture which matches or can't be JIT recompiled into something your GPU can run.
Intrigued, I wrote my own much simpler repro case from scratch:
#include <iostream>
#include <cublas_v2.h>
int main(void)
{
const int Neq = 5, Nrhs = 2, Nsys = 4;
float Atri[Neq][Neq] =
{ { 1, 6, 11, 16, 21},
{ 0, 7, 12, 17, 22},
{ 0, 0, 13, 18, 23},
{ 0, 0, 0, 19, 24},
{ 0, 0, 0, 0, 25} };
float B[Nrhs][Neq] =
{ { 1, 27, 112, 290, 595},
{ 2, 40, 148, 360, 710} };
float *syslhs[Nsys], *sysrhs[Nsys];
float *A_, *B_, **syslhs_, **sysrhs_;
size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);
cudaMalloc((void **)(&A_), Asz);
cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));
cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
for(int i=0; i<Nsys; i++) {
syslhs[i] = A_;
sysrhs[i] = (float*)((char *)B_ + i*Bsz);
cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
}
size_t syssz = sizeof(float *) * (size_t)Nsys;
cudaMalloc((void **)&syslhs_, syssz);
cudaMalloc((void **)&sysrhs_, syssz);
cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);
const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
const cublasOperation_t trans = CUBLAS_OP_N;
float alpha = 1.f;
cublasHandle_t handle;
cublasCreate(&handle);
cublasStrsmBatched(
handle,
side, ulo, trans, diag,
Neq, Nrhs,
&alpha,
syslhs_, Neq,
sysrhs_, Neq,
Nsys
);
for(int k=0; k<Nsys; k++) {
cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
for(int i=0; i<Nrhs; i++) {
for(int j=0; j<Neq; j++) {
std::cout << B[i][j] << ",";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
return 0;
}
This also fails the same way as your code. At first inspection, this really does seem to be a CUBLAS internal problem, although it is very difficult to say what. About the only thing I can think of is that these solvers are not supported on compute 1.x devices (I originally guessed they were only supported on compute capability 3.5 devices), but the documentation fails to mention it. Between us we have tested compute 1.2 and compute 1.3 devices (I originally also listed compute 3.0, which was an error on my part: I read K10 rather than T10 in your question), so there isn't much else left.....
All I can suggest is trying to run your code with cuda-memcheck and see whether it reports the same error. if it does, I see a bug report to NVIDIA in your future.
EDIT: I flagrantly disregarded the EULA and used cuobjdump to explore the cubin payloads in the CUDA 5 cublas library. For the single precision batched trsm routines I found cubins for
32 bit sm_20
32 bit sm_30
32 bit sm_35
64 bit sm_20
64 bit sm_30
64 bit sm_35
There are clearly no sm_1x cubins in the library, so my compute_12 device should produce the runtime library error I see. It also explains your error with the GTX 285 and Tesla T10, which are both compute_13.
EDIT2:
As suspected, my repro code runs perfectly on a linux system with a compute_30 device under both CUDA 5.0 and CUDA 5.5 release libraries.

Related

cudaLaunchKernel failed to launch kernel

I am trying to launch a kernel function using the runtime API. For some reason, I am not able to call cudaLaunchKernel directly. Instead, I have to call a function that calls cudaLaunchKernel inside it. Here is an example, which simply prints a message from the device:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
// Kernel with no parameters: every thread that runs it prints one greeting line.
__global__
void hello()
{
    printf("hello from kernel. \n");
}
// Launches `kernel` (which must take no arguments) through cudaLaunchKernel.
//
// kernel - the kernel function; it is cast to void* for the runtime call.
// grid   - three grid extents (x, y, z), narrowed to unsigned int.
// block  - three block extents (x, y, z), narrowed to unsigned int.
//
// Returns 0 when cudaLaunchKernel reports success, otherwise prints a generic
// message and returns -1.
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
    cudaError_t res;
    dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
    dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
    // No kernel arguments, no dynamic shared memory, default stream.
    res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
    if (res != CUDA_SUCCESS)
    {
        char msg[256];
        printf ("error during kernel launch \n");
        return -1;
    }
    return 0;
}
// Demonstrates two ways of launching the hello kernel: a direct
// cudaLaunchKernel call (option 1, commented out) and the launchKernel
// wrapper (option 2, active).
int main(void)
{
float *hx, *dx;
hx = (float*)malloc(32 * sizeof(float));
cudaMalloc(&dx, 32 * sizeof(float));
unsigned int threads = 32;
unsigned int blocks = 1;
///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
//cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
//////////////////////////////////////////////////////////////////////////////////
///////// option 2: call a function which further calls cudaLaunchKernel /////////
// Unlike dim3(blocks) / dim3(threads) in option 1, where the unspecified y and
// z extents default to 1, these arrays spell the y and z extents out as 0.
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
// The wrapper's int return value is not inspected here.
launchKernel (hello , grid3d , block3d);
//////////////////////////////////////////////////////////////////////////////////
// hello takes no arguments, so dx is never passed to the kernel; this copies
// back whatever the freshly allocated device buffer contains.
cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dx);
free(hx);
return 0;
}
Option 1, which directly calls the cudaLaunchKernel, works. However, option 2, which indirectly invokes the cudaLaunchKernel, does not work. Using option 2, no message was printed from the device, and the return value is not equal to CUDA_SUCCESS.
I was wondering if anyone has any insights into this problem.
Thank you in advance for your help and time.
grid and block dimensions cannot be zero:
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
the reason your two launches behave differently is that you are creating the grid and block dimension variables differently.
If you change to:
const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};
it will work for either case.
By the way, you're not doing yourself any favors with this sort of error trapping:
// Reports only that the launch failed; the error code held in res is not shown.
if (res != CUDA_SUCCESS)
{
    char msg[256];
    printf ("error during kernel launch \n");
    return -1;
}
This would be a lot more instructive:
if (res != cudaSuccess)
{
    // Include the runtime's description of the error code in the message.
    printf ("error during kernel launch: %s \n", cudaGetErrorString(res));
    return -1;
}

C CUDA send struct array to host from device

I have this structure
// Record with two int fields and one float field; the kernel and host code
// that follow operate on arrays of it.
// NOTE(review): the declaration of x has no terminating semicolon.
struct Data {
int x
int y;
float z;
};
I sent it without problems to kernel
// Kernel that stores x = 1, y = 1 and z = 1.0 into the element of d_data
// selected by myCounter.
// NOTE(review): myCounter is not declared anywhere in this kernel.
__global__ void calculate(Data *d_data) {
d_data[myCounter].x = 1;
d_data[myCounter].y = 1;
d_data[myCounter].z = 1.0;
}
// Number of Data elements allocated on the host and on the device.
#define MAX_SIZE 100
// Allocates MAX_SIZE Data elements on host and device, launches calculate on
// one block of `elements` threads, copies the buffer back and prints the
// first `elements` entries.
int main() {
Data * data = (Data *)malloc(MAX_SIZE * sizeof(Data));
Data *d_data;
const int DATA_BYTES = MAX_SIZE * sizeof(Data);
int elements = 20;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void **)&d_data, DATA_BYTES);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(d_data, data, DATA_BYTES, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
calculate << < 1, elements >> > (d_data);
// The result of this copy is not assigned to cudaStatus, so the check below
// still tests the status of the earlier host-to-device copy.
cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaDeviceSynchronize();
// NOTE(review): i is not declared in this function; the loop reads through
// the device pointer d_data rather than the host copy in data; and the
// format string pairs %2.1f with the int y and %s with the float z.
for (i = 0; i < elements; i++) {
printf("%2d %2.1f %s\n", d_data[i].x, d_data[i].y,
d_data[i].z); // this prints nothing
}
cudaStatus = cudaDeviceReset();
}
When I tested the struct array inside the kernel function calculate, it printed the right results, but when I try to send the data from device to host using cudaMemcpy, the program crashes with no errors and prints nothing. How can I transfer this struct array from the device?
There are several problems with the code you have shown.
You are missing a semicolon in your struct definition.
No definition is provided in the kernel code for the variable myCounter
No definition is provided for the variable i in main
You are attempting to print from the device variable d_data instead of the host variable data. This is illegal in CUDA. After copying to the host variable data, print from there.
You are using incorrect printf format specifiers. The data types in your struct are an int, an int and a float. You were using %2d %2.1f %s which would match an int, a float, and a string variable (null-terminated array of characters), but is incorrect for your struct.
The following code has the above issues addressed and seems to run correctly for me:
$ cat t430.cu
#include <stdio.h>
// Plain record handled by the kernel: two integers followed by one float.
struct Data {
    int   x;   // was missing semicolon
    int   y;
    float z;
};
// Each thread writes (1, 1, 1.0f) into the element of d_data indexed by its
// threadIdx.x. The kernel performs no bounds check and ignores blockIdx, so
// it must be launched as a single block whose thread count does not exceed
// the number of elements behind d_data.
__global__ void calculate(Data *d_data) {
    int myCounter = threadIdx.x; // this line was missing
    d_data[myCounter].x = 1;
    d_data[myCounter].y = 1;
    d_data[myCounter].z = 1.0f;  // float literal: z is a float
}
#define MAX_SIZE 100
int main() {
Data * data = (Data *)malloc(MAX_SIZE * sizeof(Data));
Data *d_data;
int i; // this line was missing
const int DATA_BYTES = MAX_SIZE * sizeof(Data);
int elements = 20;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void **)&d_data, DATA_BYTES);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(d_data, data, DATA_BYTES, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
calculate << < 1, elements >> > (d_data);
cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaDeviceSynchronize();
for (i = 0; i < elements; i++) {
printf("%2d %2d %2.1f\n", data[i].x, data[i].y,
data[i].z); // this was trying to print from d_data
}
cudaStatus = cudaDeviceReset();
}
$ nvcc -arch=sm_61 -o t430 t430.cu
$ cuda-memcheck ./t430
========= CUDA-MEMCHECK
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
========= ERROR SUMMARY: 0 errors
$
You should add this macro to your code
// Evaluates a CUDA runtime call once; if it does not return cudaSuccess,
// prints the file, line and error string to stderr and exits.
// Requires <stdio.h> and <stdlib.h>. The do { } while (0) wrapper makes the
// macro behave as a single statement.
#define CUDA_SAFE_CALL(call)                                                  \
    do {                                                                      \
        cudaError_t err = (call);                                             \
        if (cudaSuccess != err) {                                             \
            fprintf (stderr, "Cuda error in file '%s' in line %i : %s.",      \
                     __FILE__, __LINE__, cudaGetErrorString(err) );           \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
then:
CUDA_SAFE_CALL(cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
btw, your myCounter seems not right. Could you provide some details on the value of myCounter in the code above?

CUDA program gives cudaErrorIllegalAddress on sm_35 Kepler GPUs, but runs on fine on other GPUs

I'm having a very weird problem with my program. Essentially I'm doing a matrix multiplication on part of a matrix. The program apparently runs fine on most cards but crashes on sm_35 Kepler (=GK110) cards.
The initial program was written in PyCUDA, but I've since managed to boil it down to the following minimal example written in C:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
/*
 * Minimal reproduction: five iterations, each allocating B (k x m), C (m x m)
 * and A (m x n) in that order and running cublasSgemm on the slice of A that
 * starts at element offset 128*m. Only the buffers from iteration 0 are
 * freed, and only during iteration 1. The allocation and free order is
 * deliberate and must not be rearranged.
 *
 * Returns EXIT_FAILURE after reporting the first failing call, 0 otherwise.
 */
int main(int argc, char **argv)
{
    cudaError_t status;
    cublasStatus_t status_blas;
    CUresult status_drv;
    float *A = 0;
    float *B = 0;
    float *C = 0;
    float alpha = 1.0f;
    float beta = 0.0f;
    float *oldA, *oldB, *oldC;
    cublasHandle_t handle;
    int n = 131;
    int m = 2483;
    int k = 3;
    int i;
    CUcontext ctx;

    status_drv = cuInit(0);
    if (status_drv != CUDA_SUCCESS) {
        fprintf(stderr, "!!!! Driver initialization error: %d\n", (int)status_drv);
        return EXIT_FAILURE;
    }
    status_drv = cuCtxCreate(&ctx, 0, 0);
    if (status_drv != CUDA_SUCCESS) {
        /* Report the driver status; `status` has not been assigned yet. */
        fprintf(stderr, "!!!! Context creation error: %d\n", (int)status_drv);
        return EXIT_FAILURE;
    }
    status_blas = cublasCreate(&handle);
    if (status_blas != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    for (i = 0; i < 5; ++i) {
        printf("Iteration %d\n", i);
        if (cudaMalloc((void **)&B, m * k * sizeof(B[0])) != cudaSuccess) {
            fprintf(stderr, "!!!! allocation error (allocate B)\n");
            return EXIT_FAILURE;
        }
        if (cudaMalloc((void **)&C, m * m * sizeof(C[0])) != cudaSuccess) {
            fprintf(stderr, "!!!! allocation error (allocate C)\n");
            return EXIT_FAILURE;
        }
        if (cudaMalloc((void **)&A, n * m * sizeof(A[0])) != cudaSuccess) {
            fprintf(stderr, "!!!! allocation error (allocate A)\n");
            return EXIT_FAILURE;
        }

        /* Multiply the m x s slice of A beginning at column 128 by B. */
        int s = 3;
        float * A_slice = A + 128*m;
        status_blas = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, s,
                                  &alpha, A_slice, m, B, k, &beta, C, m);
        if (status_blas != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "!!!! kernel execution error.\n");
            return EXIT_FAILURE;
        }

        if (i == 0) {
            /* Remember the first iteration's buffers ... */
            oldA = A;
            oldB = B;
            oldC = C;
        } else if (i == 1) {
            /* ... and release them one iteration later. */
            status = cudaFree(oldA);
            if (status != cudaSuccess) {
                fprintf(stderr, "!!!! allocation error (free A, %d)\n", status);
                return EXIT_FAILURE;
            }
            if (cudaFree(oldB) != cudaSuccess) {
                fprintf(stderr, "!!!! allocation error (free B)\n");
                return EXIT_FAILURE;
            }
            if (cudaFree(oldC) != cudaSuccess) {
                fprintf(stderr, "!!!! allocation error (free C)\n");
                return EXIT_FAILURE;
            }
        }
    }

    cublasDestroy(handle);
    cuCtxDestroy(ctx);
    return 0;
}
I only free memory in the 2nd iteration of the for loop to mimic the behavior of the original python program. The program will crash in the 2nd iteration of the for-loop when it tries to free "A", with cudaFree returning a cudaErrorIllegalAddress error.
Concretely, this was tried on the following cards:
NVS 5400M -> no issues
GTX560Ti -> no issues
Tesla S2050 -> no issues
unknown sm_30 card (see comments to this post) -> no issues
K40c -> CRASH
GTX 780 -> CRASH
K20m -> CRASH
I used a number of Linux machines with different distributions, some of them using CUDA 5.5 and some using CUDA 6.0. At least on the machines I have direct control over, all cards were using the 331 nvidia driver series.
There are several things to note here:
the order of the malloc calls matters. If I allocate A before B things run fine
the numerical constants matter a bit. For some values (e.g. n=30) no crash occurs, for others there is a crash
The order of the free/malloc calls matter. If I free the memory in the same iteration where I allocate, everything works just fine
At this point I'm pretty desperate. I don't see why or where I'm doing anything wrong. If anyone could help me, I'd really appreciate it.
EDIT: as pointed out in the comments, apparently it only fails to run on sm_35 (i.e., GK110 cards), but runs fine on sm_30 Kepler cards.
This issue should be fixed in the CUDA 6.5 production release package, now available for download from http://www.nvidia.com/getcuda

CUDA atomicAdd() is giving wrong results

I am using atomicAdd() to add 1 to each of the elements of an array c = {0,0,0,0,0} using two different schemes
c[i] = c[i] + 1;
result - c = {1,1,1,1,1}
c[i] = atomicAdd(&(c[i]),1);
result c = {0,0,0,0,0}
I am totally clueless as to why I am getting such results, here is the small code I am using to get the results.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
void addWithCuda(int *c, int size);
// Adds 1 to c[threadIdx.x] for every thread whose index lies below size.
__global__ void addKernel(int *c, int size)
{
    int i = threadIdx.x;
    // Threads whose index is not below size leave the array untouched.
    if (!(i < size))
        return;
    c[i] += 1;
    //c[i] = atomicAdd(&(c[i]),(int)1);
}
// Runs addWithCuda on a five-element zero array, waits three seconds, then
// prints the array.
int main()
{
    const int arraySize = 5;
    int c[arraySize] = {0};   // all five entries start at zero

    // Increment every element on the GPU.
    addWithCuda(c, arraySize);

    Sleep(3000);
    printf("result = {%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
void addWithCuda(int *c, int size)
{
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, size);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
}
c[i] = atomicAdd(&(c[i]),(int)1);
should be
atomicAdd(&(c[i]),(int)1);
atomicAdd already adds the 1 in place, through the pointer &(c[i]). Its return value is the old value that was stored at that address (0 here), so assigning that return value back to c[i] overwrites the freshly incremented element with zero.

cuda matrix multiplication size

I am new to CUDA C. I wrote a basic matrix multiplication program using shared memory, but the problem is that I cannot increase the matrix size beyond 288; if I do, I get a stack overflow error. I have an NVIDIA GTX 480 GPU. Could anyone please tell me how to increase the size and what mistakes I'm making?
#define tile_width 16
#define width 288
void mat_mul_kernel1(int *a,int *b,int *c)
{
int row= blockIdx.y*blockDim.y + threadIdx.y;
int col= blockIdx.x*blockDim.x + threadIdx.x;
int pvalue=0;
__shared__ int sha[tile_width*tile_width];
__shared__ int shb[tile_width*tile_width];
for (int m=0;m<width/tile_width;m++)
{
sha[threadIdx.y*tile_width+threadIdx.x]=a[row*width+(m*tile_width)+threadIdx.x];
shb[threadIdx.y*tile_width+threadIdx.x]=b[(m*tile_width+threadIdx.y)*width+col];
__syncthreads();
for (int k=0;k<tile_width;k++)
pvalue+=sha[threadIdx.y*tile_width+k]*shb[k*tile_width+threadIdx.x];
__syncthreads();
}
c[row*width+col]=pvalue;
}
// Fills two width x width matrices with ones, multiplies them on the GPU with
// mat_mul_kernel1, times the launch with CUDA events and prints the elapsed
// time.
int main()
{
// a, b and c are local (automatic) arrays of width*width ints each.
int a[width*width],b[width*width],c[width*width];
int *deva,*devb,*devc;
float etime;
for (int i=0;i<width;i++)
{
for(int j=0;j<width;j++)
{
a[i*width+j]=1;
b[i*width+j]=1;
}
}
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// One tile_width x tile_width block per output tile; the grid extent uses
// integer division of width by tile_width.
dim3 dimGrid((int)(width)/tile_width,(int)(width)/tile_width);
dim3 dimBlock(tile_width,tile_width);
cudaError_t error;
error=cudaMalloc((void**)&deva,width*width*sizeof(int));
if(error!= cudaSuccess)
{
printf("error at a allocation");
exit(EXIT_FAILURE);
}
error=cudaMemcpy(deva,a,width*width*sizeof(int),cudaMemcpyHostToDevice);
if(error!= cudaSuccess)
{
printf("error at a copying");
exit(EXIT_FAILURE);
}
error=cudaMalloc((void**)&devb,width*width*sizeof(int));
if(error!= cudaSuccess)
{
printf("error at b allocation");
exit(EXIT_FAILURE);
}
error=cudaMemcpy(devb,b,width*width*sizeof(int),cudaMemcpyHostToDevice);
if(error!= cudaSuccess)
{
printf("error at b copying");
exit(EXIT_FAILURE);
}
error=cudaMalloc((void**)&devc,width*width*sizeof(int));
if(error!= cudaSuccess)
{
printf("error at c allocation");
exit(EXIT_FAILURE);
}
cudaEventRecord(start,0);
// The third launch parameter requests tile_width*tile_width*sizeof(int) bytes
// of dynamic shared memory; the kernel's own tiles are declared statically.
mat_mul_kernel1<<<dimGrid,dimBlock,tile_width*tile_width*sizeof(int)>>>(deva,devb,devc);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&etime,start,stop);
error=cudaMemcpy(c,devc,width*width*sizeof(int),cudaMemcpyDeviceToHost);
if(error!= cudaSuccess)
{
// Unlike the earlier checks, a failure here only prints; the exit is disabled.
printf("error at c copying");
//exit(EXIT_FAILURE);
}
cudaFree(deva);
cudaFree(devb);
cudaFree(devc);
printf("ElapsedTime %f milliseconds",etime);
}
The problem you see has nothing to do with CUDA. The problems are your arrays a, b, c. They are allocated on the stack. Together they have a size of 288 x 288 x sizeof(int) x 3, which comes to 972 kB (sizeof(int) = 4 bytes). So I assume you're hitting the default maximum stack size, which, as far as I know, is around 1 MB.
Try to allocate your arrays dynamically on the heap
int* a = (int*) malloc(width * width * sizeof(int));
and free the memory at the end
free(a);

Resources