CUDA matrix multiplication size - C

I am new to CUDA C. I wrote a basic matrix multiplication program using shared memory, but the problem is that I cannot increase the matrix size beyond 288; if I do, I get a stack overflow error. I have an NVIDIA GTX 480 GPU. Could anyone please tell me how to increase the size and what mistakes I'm making?
#define tile_width 16
#define width 288
__global__ void mat_mul_kernel1(int *a,int *b,int *c)
{
int row= blockIdx.y*blockDim.y + threadIdx.y;
int col= blockIdx.x*blockDim.x + threadIdx.x;
int pvalue=0;
__shared__ int sha[tile_width*tile_width];
__shared__ int shb[tile_width*tile_width];
for (int m=0;m<width/tile_width;m++)
{
sha[threadIdx.y*tile_width+threadIdx.x]=a[row*width+(m*tile_width)+threadIdx.x];
shb[threadIdx.y*tile_width+threadIdx.x]=b[(m*tile_width+threadIdx.y)*width+col];
__syncthreads();
for (int k=0;k<tile_width;k++)
pvalue+=sha[threadIdx.y*tile_width+k]*shb[k*tile_width+threadIdx.x];
__syncthreads();
}
c[row*width+col]=pvalue;
}
int main()
{
int a[width*width],b[width*width],c[width*width];
int *deva,*devb,*devc;
float etime;
for (int i=0;i<width;i++)
{
for(int j=0;j<width;j++)
{
a[i*width+j]=1;
b[i*width+j]=1;
}
}
cudaEvent_t start,stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
dim3 dimGrid((int)(width)/tile_width,(int)(width)/tile_width);
dim3 dimBlock(tile_width,tile_width);
cudaError_t error;
error=cudaMalloc((void**)&deva,width*width*sizeof(int));
if(error!= cudaSuccess)
{
printf("error at a allocation");
exit(EXIT_FAILURE);
}
error=cudaMemcpy(deva,a,width*width*sizeof(int),cudaMemcpyHostToDevice);
if(error!= cudaSuccess)
{
printf("error at a copying");
exit(EXIT_FAILURE);
}
error=cudaMalloc((void**)&devb,width*width*sizeof(int));
if(error!= cudaSuccess)
{
printf("error at b allocation");
exit(EXIT_FAILURE);
}
error=cudaMemcpy(devb,b,width*width*sizeof(int),cudaMemcpyHostToDevice);
if(error!= cudaSuccess)
{
printf("error at b copying");
exit(EXIT_FAILURE);
}
error=cudaMalloc((void**)&devc,width*width*sizeof(int));
if(error!= cudaSuccess)
{
printf("error at c allocation");
exit(EXIT_FAILURE);
}
cudaEventRecord(start,0);
mat_mul_kernel1<<<dimGrid,dimBlock,tile_width*tile_width*sizeof(int)>>>(deva,devb,devc);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&etime,start,stop);
error=cudaMemcpy(c,devc,width*width*sizeof(int),cudaMemcpyDeviceToHost);
if(error!= cudaSuccess)
{
printf("error at c copying");
//exit(EXIT_FAILURE);
}
cudaFree(deva);
cudaFree(devb);
cudaFree(devc);
printf("ElapsedTime %f milliseconds",etime);
}

The problem you see has nothing to do with CUDA. The problem is your arrays a, b and c: they are allocated on the stack. Together they have a size of 288 x 288 x sizeof(int) x 3, which comes to about 972 kB (sizeof(int) = 4 bytes). So I assume you're hitting the default maximum stack size, which, as far as I know, lies around 1 MB.
Try to allocate your arrays dynamically on the heap
int* a = (int*) malloc(width * width * sizeof(int));
and free the memory at the end
free(a);
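For completeness, a minimal sketch of what that looks like for all three matrices, reusing the names from the question (the error handling style follows the rest of your main):
int *a = (int*) malloc(width * width * sizeof(int));
int *b = (int*) malloc(width * width * sizeof(int));
int *c = (int*) malloc(width * width * sizeof(int));
if (a == NULL || b == NULL || c == NULL)
{
printf("host allocation failed");
exit(EXIT_FAILURE);
}
/* ... initialize a and b, launch the kernel, copy the result into c ... */
free(a);
free(b);
free(c);
With the matrices on the heap, width is limited by available host and device memory rather than by the roughly 1 MB stack.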

Related

Trying to run a sorting C program which takes in input via the command line (Mac Terminal)

If I manually input data the program works. If I input data from the command line (i.e. time ./hw2 mergesort < 10000.txt ) I get this error:
hw2(1368,0x7fffcf79b3c0) malloc: *** mach_vm_map(size=18446744065119617024) failed (error code=3)
*** error: can't allocate region
*** set a breakpoint in malloc_error_break to debug
ERROR: malloc failed for size: -2147483648
real 1m41.341s
user 1m38.316s
sys 0m2.406s
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define DEFAULT_SIZE 1024
int merge_sort(int arr[],int low,int high);
int merge(int arr[],int l,int m,int h);
int* read_input(int* size)
// read input from stdin into array a; return size of array in size parameter
{
int *a = NULL;
int next = 0;
int sz = DEFAULT_SIZE;
a = malloc(sizeof(int) * sz);
if (a == NULL)
{
fprintf(stderr, "ERROR: malloc failed for size: %d\n", sz);
exit(1);
}
while (!feof (stdin))
{
int i = 0;
if (scanf ("%d", &i) == EOF)
{
break;
}
a[next++] = i;
// reached end of array--double size and allocate again;
if (next == sz)
{
sz = 2 * sz;
a = realloc(a, sizeof(int) * sz);
if (a == NULL)
{
fprintf(stderr, "ERROR: malloc failed for size: %d\n", sz);
exit(1);
}
}
}
*size = next;
printf("READ %d elements into array\n", next);
return a;
}
int merge_sort(int arr[],int low,int high)
{
int mid;
if(low<high)
{
mid=(low+high)/2;
// Divide and Conquer
merge_sort(arr,low,mid);
merge_sort(arr,mid+1,high);
// Combine
merge(arr,low,mid,high);
}
return 0;
}
int merge(int arr[],int l,int m,int h)
{
int arr1[10],arr2[10]; // Two temporary arrays to
// hold the two arrays to be merged
int n1,n2,i,j,k;
n1=m-l+1;
n2=h-m;
for(i=0;i<n1;i++)
arr1[i]=arr[l+i];
for(j=0;j<n2;j++)
arr2[j]=arr[m+j+1];
arr1[i]=9999; // To mark the end of each temporary array
arr2[j]=9999;
i=0;j=0;
for(k=l;k<=h;k++) //process of combining two sorted arrays
{
if(arr1[i]<=arr2[j])
arr[k]=arr1[i++];
else
arr[k]=arr2[j++];
}
return 0;
}
int do_merge_sort(int a[], int size)
{
printf("BEGIN merge_sort...\n");
merge_sort(a,0, size);
printf("END merge_sort...\n");
return 0;
}
int do_heap_sort(int a[], int size)
// heapsort driver function
{
printf("BEGIN heap_sort...\n");
// TO BE FILLED IN
printf("END heap_sort...\n");
return 0;
}
// qiocksort driver function
int do_quick_sort(int a[], int size)
{
printf("BEGIN quick_sort...\n");
// TO BE FILLED IN
printf("END quick_sort...\n");
return 0;
}
int usage()
{
char *usage_str =
"./hw2 [-h] mergesort|heapsort|quicksort\n"
"\n"
"Driver program to test different sort algorithn performance.\n"
"\n"
"Example\n"
"\n"
"./hw2 mergesort\n"
"\n"
"will test mergesrt\n"
;
fprintf(stderr, "%s\n\n", usage_str);
exit(1);
}
int main(int argc, char *argv[])
// driver function
{
int *a = NULL;
int size;
int ret = 0;
if (argc < 2)
{
fprintf(stderr, "ERROR: at least one argument needed\n");
usage();
}
// read the input into array;
a = read_input(&size);
if (strcmp(argv[1], "mergesort") == 0) {
do_merge_sort(a, size);
}
else if (strcmp(argv[1], "heapsort") == 0) {
do_heap_sort(a, size);
}
else if (strcmp(argv[1], "quicksort") == 0) {
do_quick_sort(a, size);
}
else {
fprintf(stderr, "ERROR: BAD argument\n");
usage();
}
// free allocated memory
if (a) {
free(a);
}
exit(0);
}
I use this python code to generate random number data:
#! usr/env/bin python
# to generate random data:
# python ./gen_data.py 1 1000000 > 1000000.dat
#
# you can verify that data by
# cat 1000000.dat | sort -g >1000000s.dat
# vi 1000000s.dat
#
import sys
import random
start_num = int(sys.argv[1])
end_num = int(sys.argv[2])
data = range(start_num, end_num)
random.shuffle(data)
for x in range(len(data)):
# print(str(data[x]) + '\n')
print(data[x])
It's possible that your loop's end condition for no more input isn't being met. That results in sz being doubled repeatedly until, as an int, it wraps around to a negative value. The allocator then complains that it can't carry out a request for a block of memory of negative size.
The malloc call is telling you that the given value is negative (read as unsigned, -1 becomes a huge number, which is why the mach_vm_map size looks so enormous). You should pass a size_t when calling a function that expects a size_t.
Consider using valgrind to debug your program and see where the "strange call" comes from; we can't help just from "it does not work, please help".
Thus, I think the value you give to malloc exceeds its type's range: the loop runs too many times, and each pass executes sz = sz * 2. The value gets bigger and bigger until, at some point, malloc is not able to satisfy such a request.
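For what it's worth, a minimal sketch of a read loop that avoids the feof() pitfall and caps the doubling (the 1 << 30 limit is an arbitrary assumption, not something from the original code):
int *a = NULL;
size_t sz = DEFAULT_SIZE, next = 0;
a = malloc(sizeof(int) * sz);
if (a == NULL) { perror("malloc"); exit(1); }
int v;
while (scanf("%d", &v) == 1) { /* 1 item converted; EOF or bad input ends the loop */
if (next == sz) {
if (sz > ((size_t)1 << 30)) { fprintf(stderr, "input too large\n"); exit(1); }
sz *= 2;
int *tmp = realloc(a, sizeof(int) * sz);
if (tmp == NULL) { perror("realloc"); exit(1); }
a = tmp;
}
a[next++] = v;
}
Driving the loop off scanf's return value means a trailing newline or stray non-numeric text ends the loop cleanly instead of leaving feof() unset, and using size_t for sz avoids the signed wrap-around.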

C CUDA send struct array to host from device

I have this structure
struct Data {
int x
int y;
float z;
};
I sent it to the kernel without problems:
__global__ void calculate(Data *d_data) {
d_data[myCounter].x = 1;
d_data[myCounter].y = 1;
d_data[myCounter].z = 1.0;
}
#DEFINE MAX_SIZE 100
int main() {
Data * data = (Data *)malloc(MAX_SIZE * sizeof(Data));
Data *d_data;
const int DATA_BYTES = MAX_SIZE * sizeof(Data);
int elements = 20;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void **)&d_data, DATA_BYTES);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(d_data, data, DATA_BYTES, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
calculate << < 1, elements >> > (d_data);
cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaDeviceSynchronize();
for (i = 0; i < elements; i++) {
printf("%2d %2.1f %s\n", d_data[i].x, d_data[i].y,
d_data[i].z); // this prints nothing
}
cudaStatus = cudaDeviceReset();
}
When I tested the struct array inside the kernel function calculate it printed the right results, but when I try to send the data from device to host using cudaMemcpy the program crashes with no errors and prints nothing. How can I transfer this struct array from the device?
There are several problems with the code you have shown.
You are missing a semicolon in your struct definition.
No definition is provided in the kernel code for the variable myCounter
No definition is provided for the variable i in main
You are attempting to print from the device variable d_data instead of the host variable data. This is illegal in CUDA. After copying to the host variable data, print from there.
You are using incorrect printf format specifiers. The data types in your struct are an int, an int, and a float. You were using %2d %2.1f %s, which would match an int, a float, and a string variable (null-terminated array of characters), but that is incorrect for your struct.
The following code has the above issues addressed and seems to run correctly for me:
$ cat t430.cu
#include <stdio.h>
struct Data {
int x; // was missing semicolon
int y;
float z;
};
__global__ void calculate(Data *d_data) {
int myCounter = threadIdx.x; // this line was missing
d_data[myCounter].x = 1;
d_data[myCounter].y = 1;
d_data[myCounter].z = 1.0;
}
#define MAX_SIZE 100
int main() {
Data * data = (Data *)malloc(MAX_SIZE * sizeof(Data));
Data *d_data;
int i; // this line was missing
const int DATA_BYTES = MAX_SIZE * sizeof(Data);
int elements = 20;
cudaError_t cudaStatus;
cudaStatus = cudaMalloc((void **)&d_data, DATA_BYTES);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(d_data, data, DATA_BYTES, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
calculate << < 1, elements >> > (d_data);
cudaStatus = cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost); // capture the status so the check below tests this copy
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
cudaDeviceSynchronize();
for (i = 0; i < elements; i++) {
printf("%2d %2d %2.1f\n", data[i].x, data[i].y,
data[i].z); // this was trying to print from d_data
}
cudaStatus = cudaDeviceReset();
}
$ nvcc -arch=sm_61 -o t430 t430.cu
$ cuda-memcheck ./t430
========= CUDA-MEMCHECK
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
1 1 1.0
========= ERROR SUMMARY: 0 errors
$
You should add this macro to your code
#define CUDA_SAFE_CALL(call) \
do { \
cudaError_t err = call; \
if (cudaSuccess != err) { \
fprintf (stderr, "Cuda error in file '%s' in line %i : %s.", \
__FILE__, __LINE__, cudaGetErrorString(err) ); \
exit(EXIT_FAILURE); \
} \
} while (0)
then:
CUDA_SAFE_CALL(cudaMemcpy(data, d_data, DATA_BYTES, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
By the way, your myCounter doesn't seem right. Could you provide some details on the value of myCounter in the code above?

CUDA program gives cudaErrorIllegalAddress on sm_35 Kepler GPUs, but runs on fine on other GPUs

I'm having a very weird problem with my program. Essentially I'm doing a matrix multiplication on part of a matrix. The program apparently runs fine on most cards, but crashes on sm_35 Kepler (= GK110) cards.
The initial program was written in PyCUDA, but I've since managed to boil it down to the following minimal example written in C:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
int main(int argc, char **argv)
{
cudaError_t status;
cublasStatus_t status_blas;
CUresult status_drv;
float *A = 0;
float *B = 0;
float *C = 0;
float alpha = 1.0f;
float beta = 0.0f;
float *oldA, *oldB, *oldC;
cublasHandle_t handle;
int n = 131;
int m = 2483;
int k = 3;
int i;
CUcontext ctx;
cuInit(0);
status_drv = cuCtxCreate(&ctx, 0, 0);
if (status_drv != CUDA_SUCCESS) {
fprintf(stderr, "!!!! Context creation error: %d\n", status);
return EXIT_FAILURE;
}
status_blas = cublasCreate(&handle);
if (status_blas != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! CUBLAS initialization error\n");
return EXIT_FAILURE;
}
for (i = 0; i < 5; ++i) {
printf("Iteration %d\n", i);
if (cudaMalloc((void **)&B, m * k * sizeof(B[0])) != cudaSuccess) {
fprintf(stderr, "!!!! allocation error (allocate B)\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void **)&C, m * m * sizeof(C[0])) != cudaSuccess) {
fprintf(stderr, "!!!! allocation error (allocate C)\n");
return EXIT_FAILURE;
}
if (cudaMalloc((void **)&A, n * m * sizeof(A[0])) != cudaSuccess) {
fprintf(stderr, "!!!! allocation error (allocate A)\n");
return EXIT_FAILURE;
}
int s = 3;
float * A_slice = A + 128*m;
status_blas = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, m, s,
&alpha, A_slice, m, B, k, &beta, C, m);
if (status_blas != CUBLAS_STATUS_SUCCESS) {
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
if (i == 0) {
oldA = A;
oldB = B;
oldC = C;
} else if (i == 1) {
status = cudaFree(oldA);
if (status != cudaSuccess) {
fprintf(stderr, "!!!! allocation error (free A, %d)\n", status);
return EXIT_FAILURE;
}
if (cudaFree(oldB) != cudaSuccess) {
fprintf(stderr, "!!!! allocation error (free B)\n");
return EXIT_FAILURE;
}
if (cudaFree(oldC) != cudaSuccess) {
fprintf(stderr, "!!!! allocation error (free C)\n");
return EXIT_FAILURE;
}
}
}
cublasDestroy(handle);
cuCtxDestroy(ctx);
return 0;
}
I only free memory in the 2nd iteration of the for loop to mimic the behavior of the original python program. The program will crash in the 2nd iteration of the for-loop when it tries to free "A", with cudaFree returning a cudaErrorIllegalAddress error.
Concretely, this was tried on the following cards:
NVS 5400M -> no issues
GTX560Ti -> no issues
Tesla S2050 -> no issues
unknown sm_30 card (see comments to this post) -> no issues
K40c -> CRASH
GTX 780 -> CRASH
K20m -> CRASH
I used a number of Linux machines with different distributions, some of them using CUDA 5.5 and some using CUDA 6.0. At least on the machines I have direct control over, all cards were using the 331 nvidia driver series.
There are several things to note here:
the order of the malloc calls matters. If I allocate A before B things run fine
the numerical constants matter a bit. For some values (e.g. n=30) no crash occurs, for others there is a crash
The order of the free/malloc calls matters. If I free the memory in the same iteration where I allocate, everything works just fine
At this point I'm pretty desperate. I don't see why or where I'm doing anything wrong. If anyone could help me, I'd really appreciate it.
EDIT: as pointed out in the comments, apparently it only fails to run on sm_35 (i.e., GK110 cards), but runs fine on sm_30 Kepler cards.
This issue should be fixed in the CUDA 6.5 production release package, now available for download from http://www.nvidia.com/getcuda
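If you need to confirm which runtime a given machine actually has, cudaRuntimeGetVersion() reports it at run time; here is a minimal sketch (versions are encoded as major*1000 + minor*10, so 6050 means CUDA 6.5):
#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
int rt = 0, drv = 0;
cudaRuntimeGetVersion(&rt); /* version of the runtime this binary is linked against */
cudaDriverGetVersion(&drv); /* maximum version supported by the installed driver */
printf("runtime %d, driver %d\n", rt, drv);
if (rt < 6050)
printf("runtime is older than CUDA 6.5; this fix will not be present\n");
return 0;
}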

CUDA atomicAdd() is giving wrong results

I am using atomicAdd() to add 1 to each of the elements of an array c = {0,0,0,0,0} using two different schemes
c[i] = c[i] + 1;
result - c = {1,1,1,1,1}
c[i] = atomicAdd(&(c[i]),1);
result c = {0,0,0,0,0}
I am totally clueless as to why I am getting such results; here is the small code I am using:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
void addWithCuda(int *c, int size);
__global__ void addKernel(int *c, int size)
{
int i = threadIdx.x;
if (i < size)
c[i] = c[i] + 1;
//c[i] = atomicAdd(&(c[i]),(int)1);
}
int main()
{
const int arraySize = 5;
int c[arraySize] = {0,0,0,0,0};
// Add vectors in parallel.
addWithCuda(c, arraySize);
Sleep(3000);
printf("result = {%d,%d,%d,%d,%d}\n",
c[0], c[1], c[2], c[3], c[4]);
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
void addWithCuda(int *c, int size)
{
int *dev_c = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
}
cudaStatus = cudaMemcpy(dev_c, c, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<1, size>>>(dev_c, size);
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
}
}
c[i] = atomicAdd(&(c[i]),(int)1);
should be
atomicAdd(&(c[i]),(int)1);
Basically the &(c[i]), the reference call is used to add the +1 directly in the array. the atomicAdd is returning 0 ; and you are putting the zero inside the array.

cublasStrsmBatched - execution failed

I can't run cublasStrsmBatched (line 113) without getting CUBLAS_STATUS_EXECUTION_FAILED (13). To simplify, all matrix values and alpha are 1.0, all matrices are square, and lda, ldb, m and n are equal.
I am able to run cublasSgemmBatched and cublasStrsm in the same way, with no error. cublasStrsmBatched should be the same, but it is not, not for me.
Please tell me if you have any idea about what I am doing wrong in this code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
cublasHandle_t handle;
void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);
void TestCublasStrsmBatched(int size, int numOfLinSys);
int main()
{
cublasStatus_t status = cublasCreate(&handle);
CheckCublasCreate(status);
/*arguments are size of square matrix
and number of linear systems*/
TestCublasStrsmBatched(2,2);
status = cublasDestroy(handle);
CheckCublasDestroy(status);
}
void TestCublasStrsmBatched(int size, int numOfLinSys)
{
cublasStatus_t status;
cudaError_t error;
float **h_A;
float **d_A;
float **h_B;
float **d_B;
float **hd_A;
float **hd_B;
float *alpha;
const int n = size;
const int m = size;
const int lda=m;
const int ldb=m;
const int matA_numOfElem = m*m;
const int matB_numOfElem = m*n;
int i,j;
h_A = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(h_A);
h_B = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(h_B);
alpha=(float *)malloc(sizeof(float));
*alpha = 1.0;
for (j=0; j<numOfLinSys; j++){
h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
CheckAllocateHost(h_A);
for (i=0; i < matA_numOfElem; i++)
h_A[j][i] = 1.0;
h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
CheckAllocateHost(h_B);
for (i=0; i < matB_numOfElem; i++)
h_B[j][i] = 1.0;
}
hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(hd_A);
hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(hd_B);
for (j=0; j<numOfLinSys; j++){
error = cudaMalloc((void **)&hd_A[j],
matA_numOfElem * sizeof(float));
CheckCudaMalloc(error);
error = cudaMalloc((void **)&hd_B[j],
matB_numOfElem * sizeof(float));
CheckCudaMalloc(error);
status = cublasSetMatrix(m, m, sizeof(float),
h_A[j], lda, hd_A[j], lda);
CheckCublasSetGetMatrix(status);
status = cublasSetMatrix(m, n, sizeof(float),
h_B[j], ldb, hd_B[j], ldb);
CheckCublasSetGetMatrix(status);
}
error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
CheckCudaMalloc(error);
error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
CheckCudaMalloc(error);
error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*),
cudaMemcpyHostToDevice);
CheckCudaMemcpy(error);
error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*),
cudaMemcpyHostToDevice);
CheckCudaMemcpy(error);
/*After cublasStrsmBatched call
status changes to CUBLAS_STATUS_EXECUTION_FAILED (13)*/
status = cublasStrsmBatched(handle,
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
CheckKernelExecution(status);
}
void CheckCublasCreate( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr,
"!!!! CUBLAS initialization error \n");
exit(EXIT_FAILURE);
}
}
void CheckAllocateHost( void* h_pointer )
{
if (h_pointer == 0){
fprintf(stderr,
"!!!! host memory allocation error \n");
exit(EXIT_FAILURE);
}
}
void CheckCudaMalloc( cudaError_t error )
{
if (error != cudaSuccess){
fprintf(stderr,
"!!!! device memory allocation error (error code %s)\n",
cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
}
void CheckCudaMemcpy( cudaError_t error )
{
if (error != cudaSuccess){
fprintf(stderr, "!!!! data copy error (error code %s)\n",
cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
}
void CheckCublasSetGetMatrix( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "!!!! device access error \n");
exit(EXIT_FAILURE);
}
}
void CheckKernelExecution( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "!!!! kernel execution error.\n");
exit(EXIT_FAILURE);
}
}
void CheckCublasDestroy( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "!!!! shutdown error \n");
exit(EXIT_FAILURE);
}
}
Using Linux, CUDA 5.5, T10 and Windows, CUDA 5.5, GTX285
Thanks!
The batched triangular backsolver is something I hadn't tried before in CUBLAS, so I was interested to take a look and see what might be going on. Your code is rather complex, so I didn't bother trying to understand it, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:
$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
========= Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
========= Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
========= Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
========= Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
========= Host Frame:[0x1]
(This is an OS X machine with a compute 1.2 GPU and CUDA 5.0). Error 8 is cudaErrorInvalidDeviceFunction, which usually only comes up when a library or fatbinary doesn't have an architecture which matches or can't be JIT recompiled into something your GPU can run.
Intrigued, I wrote my own much simpler repro case from scratch:
#include <iostream>
#include <cublas_v2.h>
int main(void)
{
const int Neq = 5, Nrhs = 2, Nsys = 4;
float Atri[Neq][Neq] =
{ { 1, 6, 11, 16, 21},
{ 0, 7, 12, 17, 22},
{ 0, 0, 13, 18, 23},
{ 0, 0, 0, 19, 24},
{ 0, 0, 0, 0, 25} };
float B[Nrhs][Neq] =
{ { 1, 27, 112, 290, 595},
{ 2, 40, 148, 360, 710} };
float *syslhs[Nsys], *sysrhs[Nsys];
float *A_, *B_, **syslhs_, **sysrhs_;
size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);
cudaMalloc((void **)(&A_), Asz);
cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));
cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
for(int i=0; i<Nsys; i++) {
syslhs[i] = A_;
sysrhs[i] = (float*)((char *)B_ + i*Bsz);
cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
}
size_t syssz = sizeof(float *) * (size_t)Nsys;
cudaMalloc((void **)&syslhs_, syssz);
cudaMalloc((void **)&sysrhs_, syssz);
cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);
const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
const cublasOperation_t trans = CUBLAS_OP_N;
float alpha = 1.f;
cublasHandle_t handle;
cublasCreate(&handle);
cublasStrsmBatched(
handle,
side, ulo, trans, diag,
Neq, Nrhs,
&alpha,
syslhs_, Neq,
sysrhs_, Neq,
Nsys
);
for(int k=0; k<Nsys; k++) {
cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
for(int i=0; i<Nrhs; i++) {
for(int j=0; j<Neq; j++) {
std::cout << B[i][j] << ",";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
return 0;
}
This also fails in the same way as your code. At first inspection, this really does seem to be a CUBLAS internal problem, although it is very difficult to say what. About the only thing I can think of is that these solvers are not supported on compute 1.x devices, but the documentation fails to mention it. Between us we have tested compute 1.2, compute 1.3, and compute 3.0 [error on my part, I read K10 rather than T10 in your question] devices, so there isn't much else left.....
All I can suggest is trying to run your code with cuda-memcheck and seeing whether it reports the same error. If it does, I see a bug report to NVIDIA in your future.
EDIT: I flagrantly disregarded the EULA and used cuobjdump to explore the cubin payloads in the CUDA 5 cublas library. For the single precision batched trsm routines I found cubins for
32 bit sm_20
32 bit sm_30
32 bit sm_35
64 bit sm_20
64 bit sm_30
64 bit sm_35
There are clearly no sm_1x cubins in the library, so my compute_12 device should produce the runtime library error I see. It also explains your error with the GTX 285 and Tesla T10, which are both compute_13.
EDIT2:
As suspected, my repro code runs perfectly on a linux system with a compute_30 device under both CUDA 5.0 and CUDA 5.5 release libraries.
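A practical takeaway: if you have to run on mixed hardware, you can check the device's compute capability before calling the batched routines. A sketch (the major < 2 test reflects the sm_20/sm_30/sm_35 cubins found above):
#include <stdio.h>
#include <cuda_runtime.h>
int main(void)
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0); /* query device 0 */
printf("device 0: compute capability %d.%d\n", prop.major, prop.minor);
if (prop.major < 2)
printf("no sm_1x code in this cublas library; batched trsm will fail here\n");
return 0;
}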
