cudaLaunchKernel failed to launch kernel - c

I am trying to launch kernel function using the runtime API. For some reason, I am not able the directly call cudaLaunchKernel. Instead, I have call a function that calls cudaLaunchKernel inside it. Here is an example, which simply just print a message from the device:
#include<stdio.h>
#include<cuda.h>
#include<cuda_runtime.h>
__global__
void hello()
{
printf(“hello from kernel. \n”);
}
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
cudaError_t res;
dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
if (res != CUDA_SUCCESS)
{
char msg[256];
printf (“error during kernel launch \n”);
return -1;
}
return 0;
}
int main(void)
{
float *hx, *dx;
hx = (float*)malloc(32 * sizeof(float));
cudaMalloc(&dx, 32 * sizeof(float));
unsigned int threads = 32;
unsigned int blocks = 1;
///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
//cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
//////////////////////////////////////////////////////////////////////////////////
///////// option 2: call a function which further calls cudaLaunchKernel /////////
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
launchKernel (hello , grid3d , block3d);
//////////////////////////////////////////////////////////////////////////////////
cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dx);
free(hx);
return 0;
}
Option 1, which directly calls the cudaLaunchKernel, works. However, option 2, which indirectly invokes the cudaLaunchKernel, does not work. Using option 2, no message was printed from the device, and the return value is not equal to CUDA_SUCCESS.
I was wondering if anyone has any insights into this problem.
Thank you in advance for your help and time.

grid and block dimensions cannot be zero:
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
the reason your two launches behave differently is that you are creating the grid and block dimension variables differently.
If you change to:
const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};
it will work for either case.
By the way, you're not doing yourself any favors with this sort of error trapping:
if (res != CUDA_SUCCESS)
{
char msg[256];
printf (“error during kernel launch \n”);
return -1;
}
This would be a lot more instructive:
if (res != cudaSuccess)
{
printf (“error during kernel launch: %s \n”, cudaGetErrorString(res));
return -1;
}

Related

How to pass array and int to opencl kernel in same time

I've an openCL code to do some calculate.
Can OpenCL pass array and int to kernel in the same time?
I want to pass 'myint' to kernel not via buffer.
int myint = 100;
cl_mem memObjects[3] = {0, 0, 0};
memObjects[0] = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(int) * ARR_SIZE, data, NULL);
memObjects[1] = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(int), new int[]{myint}, NULL);
memObjects[2] = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE,
sizeof(float) * ARR_SIZE, NULL, NULL);
opencl_errNum = clSetKernelArg(opencl_kernel, 0, sizeof(cl_mem), &memObjects[0]);
opencl_errNum |= clSetKernelArg(opencl_kernel, 1, sizeof(cl_mem), &memObjects[1]);
opencl_errNum |= clSetKernelArg(opencl_kernel, 2, sizeof(cl_mem), &memObjects[2]);
//Kernel code
const char *opencl_code = "__kernel void opencl_code(__global const int *src,\n"
" __global const int *value,\n"
" __global float *result) {\n"
" int gid = get_global_id(0);\n"
" result[gid] = src[gid] - value[0];\n"
"\n"
"}";
Want to change be like this.
const char *opencl_code = "__kernel void opencl_code(__global const int *src,\n"
" __global const int value,\n"
" __global float *result) {\n"
" int gid = get_global_id(0);\n"
" result[gid] = src[gid] - value;\n"
"\n"
"}";
Yes you can pass either 1-dimensional arrays or constants as kernel arguments. The syntax is as follows:
int myint = 100;
cl_mem memObjects[2] = {0, 0};
memObjects[0] = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ARR_SIZE, data, NULL);
memObjects[1] = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE, sizeof(float) * ARR_SIZE, NULL, NULL);
opencl_errNum = clSetKernelArg(opencl_kernel, 0, sizeof(cl_mem), &memObjects[0]); // pass an array as kernel parameter
opencl_errNum |= clSetKernelArg(opencl_kernel, 1, sizeof(int), (void*)&myint); // pass a constant as kernel parameter
opencl_errNum |= clSetKernelArg(opencl_kernel, 2, sizeof(cl_mem), &memObjects[1]); // pass an array as kernel parameter
// note: the second parameter is just "const int value", without the "__global" keyword or "*" pointer
const char *opencl_code = "__kernel void opencl_code(__global const int *src,\n"
" const int value,\n"
" __global float *result) {\n"
" int gid = get_global_id(0);\n"
" result[gid] = src[gid] - value;\n" // note: result is type float, and you write an int to it here
"}\n";
To make this all a lot easier, I created this OpenCL-Wrapper. With that, you can directly pass arrays (Memory objects) or numbers as kernel parameters, and the C++ code would look like this:
int main() {
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
int myint = 100;
Memory<int> src(device, ARR_SIZE); // allocate memory on both host and device
Memory<float> result(device, ARR_SIZE);
Kernel opencl_kernel(device, ARR_SIZE, "opencl_code", src, myint, result); // kernel that runs on the device
// initialize src, for example with "src[5] = 3;"
src.write_to_device(); // copy data from host memory to device memory
opencl_kernel.run(); // run kernel
result.read_from_device(); // copy data from device memory to host memory
// read result with for example "float test = result[5];"
// ...
return 0;
}

function parameter works from an initiated char*, but not if converted from char[]

I have a mqtt function that subscribes to a topic using a char* pointer. Since I want to subscribe only to my own devices id, I need to create part of this pointer dynamicly. And I really cannot manage to succeed.
I have tried to create an array[] dynamicly and then convert the entire array to a pointer array*, with no success. It only works if I staticly define the pointer in the form char*.
This is what do work well:
char* topic="/mqtt_topic/myID/";
mqtt_subscribe(module_inst, topic, 0, SubscribeHandler);
The code below compiles and are looking allright, the function is also subscribing to the topic but it does not react on data sent over mqtt. The string is also looking identical with the example above.
char topick []="/mqtt_topic/myID/";
char* topic=topick;
mqtt_subscribe(module_inst, topic, 0, SubscribeHandler);
The mqtt_subscribe function looks like below;
int mqtt_subscribe(struct mqtt_module *module, const char *topic, uint8_t qos, messageHandler msgHandler)
{
int rc;
rc = MQTTSubscribe(module->client, topic, qos, msgHandler);
if(module->callback)
module->callback(module, MQTT_CALLBACK_SUBSCRIBED, NULL);
return rc;
}
Which calls the following function.
int MQTTSubscribe(MQTTClient* c, const char* topicFilter, enum QoS qos, messageHandler msgHandler)
{
int rc = FAILURE;
Timer timer;
int len = 0;
MQTTString topic = MQTTString_initializer;
int Qoss = (int) qos;
topic.cstring = (char *)topicFilter;
#if defined(MQTT_TASK)
MutexLock(&c->mutex);
#endif
if (!c->isconnected)
goto exit;
TimerInit(&timer);
TimerCountdownMS(&timer, c->command_timeout_ms);
len = MQTTSerialize_subscribe(c->buf, c->buf_size, 0, getNextPacketId(c), 1, &topic, (int*)&Qoss);
// len = MQTTSerialize_subscribe(c->buf, c->buf_size, 0, getNextPacketId(c), 1, &topic, qos);
if (len <= 0)
goto exit;
if ((rc = sendPacket(c, len, &timer)) != SUCCESS) // send the subscribe packet
goto exit; // there was a problem
if (waitfor(c, SUBACK, &timer) == SUBACK) // wait for suback
{
int count = 0, grantedQoS = -1;
unsigned short mypacketid;
if (MQTTDeserialize_suback(&mypacketid, 1, &count, &grantedQoS, c->readbuf, c->readbuf_size) == 1)
rc = grantedQoS; // 0, 1, 2 or 0x80
if (rc != 0x80)
{
int i;
for (i = 0; i < MAX_MESSAGE_HANDLERS; ++i)
{
if (c->messageHandlers[i].topicFilter == 0)
{
c->messageHandlers[i].topicFilter = topicFilter;
c->messageHandlers[i].fp = msgHandler;
rc = 0;
break;
}
}
}
}
else
rc = FAILURE;
exit:
#if defined(MQTT_TASK)
MutexUnlock(&c->mutex);
#endif
return rc;
}
Is this the expected results? Is there any way of solving this?
You do not show us enough. However, I assume:
void myFunction(...)
{
char topick []="/mqtt_topic/myID/";
char* topic=topick;
mqtt_subscribe(module_inst, topic, 0, SubscribeHandler);
//...
}
or something like this, i.e. topick is declared inside a function. Then it is a local variable that ceases to exist when the function returns. The pointer to a string you passed does no longer point to a valid string.
On the other hand:
char* topic="/mqtt_topic/myID/";
mqtt_subscribe(module_inst, topic, 0, SubscribeHandler);
Here topic points to a literal and the literal remains to exist after the function returns. So the mqtt_.. function receives a valid string that also exists after the caller returns.
Your answer is completely correct and I am indeed declaring the array locally. Declaring the topick[64] globally solved this issue.
Thanks!
/Mikael

Segmentation fault only in realease configuration when manipulating strings in C

below is the function fast_integer_output that converts input_integer from base 2 into output_base.
my_str* fast_integer_output(bigint* input_integer, int output_base)
{
bigint** integer_to_binary_array = create_integer_to_binary_array();
bigint* base = integer_to_binary_array[output_base];
my_str* result = 0;
if(less_than(input_integer, base))
{
char* p_char = (char*) get_memory(sizeof(char));
p_char[0] = int_to_char(*(input_integer->number));
result = create_string(p_char, 1);
}
else
{
long k = find_k(input_integer, base);
bigint* base_to_k = power(base, k);
bigint* quotient;
bigint* remainder;
divide(input_integer, base_to_k, &quotient, &remainder);
delete_bigint(&base_to_k);
my_str* low = fast_integer_output(remainder, output_base);
delete_bigint(&remainder);
my_str* high = fast_integer_output(quotient, output_base);
delete_bigint(&quotient);
result = concatenate(low, k - low->length, high);
delete_string(&low);
delete_string(&high);
}
release_integer_to_binary_array(integer_to_binary_array);
return result;
}
Here are bigint and my_str structs and create_string function (bitarray is just a pointer to long)
my_str* create_string(char* c_str, long length)
{
my_str* str = (my_str*) get_memory(sizeof(my_str));
str->c_str = c_str;
str->length = length;
}
typedef struct
{
char* c_str;
long length; // logical
} my_str;
typedef struct
{
bitarray* number;
long length; // logical
long size; // physical
} bigint;
And this functions take care of memory management. Right now they are just wrapping functions to free and malloc but I would like to implement some kind of memory pool if it will be slow in operation.
void init_memory(void* memory, size_t size)
{
unsigned char* tmp = (unsigned char*) memory;
unsigned char c = 0;
while (size > 0)
{
*tmp = c;
tmp++;
size--;
}
}
void* get_memory(size_t size)
{
void* memory = malloc(size);
init_memory(memory, size);
return memory;
}
void release_memory(void** memory)
{
free(*memory);
*memory = 0;
}
Problem is that everything runs fine on debug configuration, but on release configuration the function fast_integer_output fails on the line :
result = concatenate(low, k - low->length, high);
The problem is that
my_str* low = fast_integer_output(remainder, output_base);
which returns from this piece of code
if(less_than(input_integer, base))
{
char* p_char = (char*) get_memory(sizeof(char));
p_char[0] = int_to_char(*(input_integer->number));
result = create_string(p_char, 1);
}
returns trash and thus it fails on segmentation fault.
I had the same problem with this code:
QString buffer; // function parameter in real code
instance_data_t data = {(unsigned char*) buffer.toStdString().c_str(), 0, 0, 0, 0, 0, 0, {0, 0}, {0, 0}, cur_state};
but I managed to get it working by changing it into following:
unsigned char c_buffer[1024];
memset(c_buffer, '\0', sizeof(c_buffer));
strcpy((char *) c_buffer, buffer.toStdString().c_str());
instance_data_t data = {c_buffer, 0, 0, 0, 0, 0, 0, {0, 0}, {0, 0}, cur_state};
Important note is that I cannot use any other functions than write, read, malloc and free (so no strcpy, above code is from test which I will not deliver)
This is not a homework, it is an assignment for a job (I would like to get).
I've searched and read about 15-20 questions, so I will not list them all, but the problem is that mostly, if the question is not very specific, the segmentation fault is not because of string manipulation and if it is, then it is mostly because of index out of bounds.
Your allocated string is not null ('\0') terminated.
I would do it this way :
char* p_char = (char*) get_memory(sizeof(char) * 2);
p_char[0] = int_to_char(*(input_integer->number));
p_char[1] = 0;
result = create_string(p_char, 1);
As EOF pointed out (see comments),
The create_string function as no return statement.
Thus, garbage is indeed returned.
my_str* create_string(char* c_str, long length)
{
my_str* str = (my_str*) get_memory(sizeof(my_str));
str->c_str = c_str;
str->length = length;
return str;
}

cublasStrsmBatched - execution failed

I can't run cublasStrsmBatched (line 113) without CUBLAS_STATUS_EXECUTION_FAILED (13) output. To simplify, all matrix values and alpha are 1.0, all matrices are square and lda, ldb, m and n are equal.
I am able to run cublasSgemmBatched and cublasStrsm in the same way, with no error. cublasStrsmBatched should be the same, but it is not, not for me.
Please tell me if you have any idea about what am I doing wrong in this code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
cublasHandle_t handle;
void CheckCublasCreate(cublasStatus_t status);
void CheckAllocateHost(void* h_pointer);
void CheckCudaMalloc(cudaError_t d_allocStatus);
void CheckCudaMemcpy( cudaError_t error );
void CheckCublasSetGetMatrix(cublasStatus_t status);
void CheckKernelExecution(cublasStatus_t status);
void CheckCublasDestroy(cublasStatus_t status);
void TestCublasStrsmBatched(int size, int numOfLinSys);
int main()
{
cublasStatus_t status = cublasCreate(&handle);
CheckCublasCreate(status);
/*arguments are size of square matrix
and number of linear systems*/
TestCublasStrsmBatched(2,2);
status = cublasDestroy(handle);
CheckCublasDestroy(status);
}
void TestCublasStrsmBatched(int size, int numOfLinSys)
{
cublasStatus_t status;
cudaError_t error;
float **h_A;
float **d_A;
float **h_B;
float **d_B;
float **hd_A;
float **hd_B;
float *alpha;
const int n = size;
const int m = size;
const int lda=m;
const int ldb=m;
const int matA_numOfElem = m*m;
const int matB_numOfElem = m*n;
int i,j;
h_A = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(h_A);
h_B = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(h_B);
alpha=(float *)malloc(sizeof(float));
*alpha = 1.0;
for (j=0; j<numOfLinSys; j++){
h_A[j] = (float *)malloc(matA_numOfElem * sizeof(float));
CheckAllocateHost(h_A);
for (i=0; i < matA_numOfElem; i++)
h_A[j][i] = 1.0;
h_B[j] = (float *)malloc(matB_numOfElem * sizeof(float));
CheckAllocateHost(h_B);
for (i=0; i < matB_numOfElem; i++)
h_B[j][i] = 1.0;
}
hd_A = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(hd_A);
hd_B = (float **)malloc(numOfLinSys * sizeof(float*));
CheckAllocateHost(hd_B);
for (j=0; j<numOfLinSys; j++){
error = cudaMalloc((void **)&hd_A[j],
matA_numOfElem * sizeof(float));
CheckCudaMalloc(error);
error = cudaMalloc((void **)&hd_B[j],
matB_numOfElem * sizeof(float));
CheckCudaMalloc(error);
status = cublasSetMatrix(m, m, sizeof(float),
h_A[j], lda, hd_A[j], lda);
CheckCublasSetGetMatrix(status);
status = cublasSetMatrix(m, n, sizeof(float),
h_B[j], ldb, hd_B[j], ldb);
CheckCublasSetGetMatrix(status);
}
error = cudaMalloc((void **)&d_A, numOfLinSys * sizeof(float*));
CheckCudaMalloc(error);
error = cudaMalloc((void **)&d_B, numOfLinSys * sizeof(float*));
CheckCudaMalloc(error);
error = cudaMemcpy(d_A, hd_A, numOfLinSys * sizeof(float*),
cudaMemcpyHostToDevice);
CheckCudaMemcpy(error);
error = cudaMemcpy(d_B, hd_B, numOfLinSys * sizeof(float*),
cudaMemcpyHostToDevice);
CheckCudaMemcpy(error);
/*After cublasStrsmBatched call
status changes to CUBLAS_STATUS_EXECUTION_FAILED (13)*/
status = cublasStrsmBatched(handle,
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
m, n, alpha, d_A, lda, d_B, ldb, numOfLinSys);
CheckKernelExecution(status);
}
void CheckCublasCreate( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr,
"!!!! CUBLAS initialization error \n");
exit(EXIT_FAILURE);
}
}
void CheckAllocateHost( void* h_pointer )
{
if (h_pointer == 0){
fprintf(stderr,
"!!!! host memory allocation error \n");
exit(EXIT_FAILURE);
}
}
void CheckCudaMalloc( cudaError_t error )
{
if (error != cudaSuccess){
fprintf(stderr,
"!!!! device memory allocation error (error code %s)\n",
cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
}
void CheckCudaMemcpy( cudaError_t error )
{
if (error != cudaSuccess){
fprintf(stderr, "!!!! data copy error (error code %s)\n",
cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
}
void CheckCublasSetGetMatrix( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "!!!! device access error \n");
exit(EXIT_FAILURE);
}
}
void CheckKernelExecution( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "!!!! kernel execution error.\n");
exit(EXIT_FAILURE);
}
}
void CheckCublasDestroy( cublasStatus_t status )
{
if (status != CUBLAS_STATUS_SUCCESS){
fprintf(stderr, "!!!! shutdown error \n");
exit(EXIT_FAILURE);
}
}
Using Linux, CUDA 5.5, T10 and Windows, CUDA 5.5, GTX285
Thanks!
The batched triangular backsolver is something I hadn't tried before in CUBLAS, so I was interested to take a look and see what might be going on. Your code is rather complex, so I didn't bother trying to understand it, but when I ran it, it appeared to be failing with an internal CUBLAS launch failure:
$ cuda-memcheck ./a.out
========= CUDA-MEMCHHECK
!!!! kernel execution error.
========= Program hit error 8 on CUDA API call to cudaLaunch
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/Library/Frameworks/CUDA.framework/Versions/A/Libraries/libcuda_256.00.35.dylib (cudbgGetAPIVersion + 0x27bd7) [0x4538e7]
========= Host Frame:/usr/local/cuda/lib/libcudart.dylib (cudaLaunch + 0x26c) [0x45c8c]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasZgetrfBatched + 0x1e34) [0x196ae4]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0x64d) [0x1974cd]
========= Host Frame:/usr/local/cuda/lib/libcublas.dylib (cublasCtrsmBatched + 0xacb) [0x19794b]
========= Host Frame:/Users/talonmies/./a.out (_Z22TestCublasStrsmBatchedii + 0x3c1) [0x1b28]
========= Host Frame:/Users/talonmies/./a.out (main + 0x3d) [0x1b7d]
========= Host Frame:/Users/talonmies/./a.out (start + 0x35) [0x14e9]
========= Host Frame:[0x1]
(This is an OS X machine with a compute 1.2 GPU and CUDA 5.0). Error 8 is cudaErrorInvalidDeviceFunction, which usually only comes up when a library or fatbinary doesn't have an architecture which matches or can't be JIT recompiled into something your GPU can run.
Intrigued, I wrote my own much simpler repro case from scratch:
#include <iostream>
#include <cublas_v2.h>
int main(void)
{
const int Neq = 5, Nrhs = 2, Nsys = 4;
float Atri[Neq][Neq] =
{ { 1, 6, 11, 16, 21},
{ 0, 7, 12, 17, 22},
{ 0, 0, 13, 18, 23},
{ 0, 0, 0, 19, 24},
{ 0, 0, 0, 0, 25} };
float B[Nrhs][Neq] =
{ { 1, 27, 112, 290, 595},
{ 2, 40, 148, 360, 710} };
float *syslhs[Nsys], *sysrhs[Nsys];
float *A_, *B_, **syslhs_, **sysrhs_;
size_t Asz = sizeof(float) * (size_t)(Neq * Neq);
size_t Bsz = sizeof(float) * (size_t)(Neq * Nrhs);
cudaMalloc((void **)(&A_), Asz);
cudaMalloc((void **)(&B_), Bsz * size_t(Nsys));
cudaMemcpy(A_, Atri, Asz, cudaMemcpyHostToDevice);
for(int i=0; i<Nsys; i++) {
syslhs[i] = A_;
sysrhs[i] = (float*)((char *)B_ + i*Bsz);
cudaMemcpy(sysrhs[i], B, Bsz, cudaMemcpyHostToDevice);
}
size_t syssz = sizeof(float *) * (size_t)Nsys;
cudaMalloc((void **)&syslhs_, syssz);
cudaMalloc((void **)&sysrhs_, syssz);
cudaMemcpy(syslhs_, syslhs, syssz, cudaMemcpyHostToDevice);
cudaMemcpy(sysrhs_, sysrhs, syssz, cudaMemcpyHostToDevice);
const cublasSideMode_t side = CUBLAS_SIDE_LEFT;
const cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
const cublasFillMode_t ulo = CUBLAS_FILL_MODE_LOWER;
const cublasOperation_t trans = CUBLAS_OP_N;
float alpha = 1.f;
cublasHandle_t handle;
cublasCreate(&handle);
cublasStrsmBatched(
handle,
side, ulo, trans, diag,
Neq, Nrhs,
&alpha,
syslhs_, Neq,
sysrhs_, Neq,
Nsys
);
for(int k=0; k<Nsys; k++) {
cudaMemcpy(B, sysrhs[k], Bsz, cudaMemcpyDeviceToHost);
for(int i=0; i<Nrhs; i++) {
for(int j=0; j<Neq; j++) {
std::cout << B[i][j] << ",";
}
std::cout << std::endl;
}
std::cout << std::endl;
}
return 0;
}
This also fails the same way as your code. At first inspection, this really does seem to be a CUBLAS internal problem, although it is very difficult to say what. About the only thing I can think of is that these solvers are only supported on compute capability 3.5 devices not supported on compute 1.x devices, but the documentation fails to mention it. Between us we have tested compute 1.2, compute 1.3, and compute 3.0[error on my part, I read K10 not T10 in your question] devices, so there isn't much else left.....
All I can suggest is trying to run your code with cuda-memcheck and see whether it reports the same error. if it does, I see a bug report to NVIDIA in your future.
EDIT: I flagrantly disregarded the EULA and used cuobjdump to explore the cubin payloads in the CUDA 5 cublas library. For the single precision batched trsm routines I found cubins for
32 bit sm_20
32 bit sm_30
32 bit sm_35
64 bit sm_20
64 bit sm_30
64 bit sm_35
There are clearly no sm_1x cubins in the library, so my compute_12 device should produce the runtime library error I see. It also explains your error with the GTX 285 and Telsa T10, which are both compute_13.
EDIT2:
As suspected, my repro code runs perfectly on a linux system with a compute_30 device under both CUDA 5.0 and CUDA 5.5 release libraries.

Passing struct to GPU with OpenCL that contains an array of floats

I currently have some data that I would like to pass to my GPU and the multiply it by 2.
I have created a struct which can be seen here:
struct GPUPatternData
{
cl_int nInput,nOutput,patternCount, offest;
cl_float* patterns;
};
This struct should contain an array of floats. The array of floats I will not know untill run time as it is specified by the user.
The host code:
typedef struct GPUPatternDataContatiner
{
int nodeInput,nodeOutput,patternCount, offest;
float* patterns;
} GPUPatternData;
__kernel void patternDataAddition(__global GPUPatternData* gpd,__global GPUPatternData* output)
{
int index = get_global_id(0);
if(index < gpd->patternCount)
{
output.patterns[index] = gpd.patterns[index]*2;
}
}
Here is the Host code:
GPUPattern::GPUPatternData gpd;
gpd.nodeInput = ptSet->getInputCount();
gpd.nodeOutput = ptSet->getOutputCount();
gpd.offest = gpd.nodeInput+gpd.nodeOutput;
gpd.patternCount = ptSet->getCount();
gpd.patterns = new cl_float [gpd.patternCount*gpd.offest];
GPUPattern::GPUPatternData gridC;
gridC.nodeInput = ptSet->getInputCount();
gridC.nodeOutput = ptSet->getOutputCount();
gridC.offest = gpd.nodeInput+gpd.nodeOutput;
gridC.patternCount = ptSet->getCount();
gridC.patterns = new cl_float [gpd.patternCount*gpd.offest];
All the data is initialized then initialized with values and then passed to the GPU
int elements = gpd.patternCount;
size_t ofsdf = sizeof(gridC);
size_t dataSize = sizeof(GPUPattern::GPUPatternData)+ (sizeof(cl_float)*elements);
cl_mem bufferA = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
//Copy the buffer to the device
err = clEnqueueWriteBuffer(queue,bufferA,CL_TRUE,0,dataSize,(void*)&gpd,0,NULL,NULL);
//This buffer is being written to only
cl_mem bufferC = clCreateBuffer(gpu.context,CL_MEM_WRITE_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
err = clEnqueueWriteBuffer(queue,bufferC,CL_TRUE,0,dataSize,(void*)&gridC,0,NULL,NULL);
Everything is built which I check just watching the error which stays at 0
cl_program program = clCreateProgramWithSource(gpu.context,1, (const char**) &kernelSource,NULL,&err);
////Build program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
char build[2048];
clGetProgramBuildInfo(program, gpu.device, CL_PROGRAM_BUILD_LOG, 2048, build, NULL);
////Create kernal
cl_kernel kernal = clCreateKernel(program, "patternDataAddition",&err);
////Set kernal arguments
err = clSetKernelArg(kernal, 0, sizeof(cl_mem), &bufferA);
err |= clSetKernelArg(kernal, 1, sizeof(cl_mem), &bufferC);
It is then kicked off
size_t globalWorkSize = 1024;
size_t localWorkSize = 512;
err = clEnqueueNDRangeKernel(queue, kernal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
clFinish(queue);
Its at this point it all goes wrong
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, dataSize, &gridC, 0, NULL, NULL);
clFinish(queue);
The error in this case is -5 (CL_OUT_OF_RESOURCES).
Also if I change the line:
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, dataSize, &gridC, 0, NULL,
to:
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, dataSize*1000, &gridC, 0, NULL, NULL);
I get the error -30 (CL_INVALID_VALUE).
So my question is why am i getting the errors I am when reading back the buffer. Also I am not sure if I am unable to use a pointer to my float array as could this be giving me the wrong sizeof() used for datasize which gives me the wrong buffer size.
You cannot pass a struct that contains pointers into OpenCL
http://www.khronos.org/registry/cl/specs/opencl-1.2.pdf (Section 6.9)
You can either correct as Eric Bainville pointed out or if you are not very restrict on memory you can do something like
struct GPUPatternData
{
cl_int nInput,nOutput,patternCount, offest;
cl_float patterns[MAX_SIZE];
};
EDIT: OK if memory is an issue. Since you only use the patterns and patternCount you can copy the patterns from the struct and pass them to the kernel separately.
struct GPUPatternData
{
cl_int nInput,nOutput,patternCount, offest;
cl_float patterns*;
};
copy patterns to GPU from gpd and allocate space for patterns in gridC on GPU.
then
You can pass the buffers separately
__kernel void patternDataAddition(int gpd_patternCount,
__global const float * gpd_Patterns,
__global float * gridC_Patterns) {
int index = get_global_id(0);
if(index < gpd_patternCount)
{
gridC_Patterns[index] = gpd_Patterns[index]*2;
}
}
when you come back from the kernel copy the data back to gridC.patterns directly
One more :
You don't have to change your CPU struct. It stays the same. However this part
size_t dataSize = sizeof(GPUPattern::GPUPatternData)+ (sizeof(cl_float)*elements);
cl_mem bufferA = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
//Copy the buffer to the device
err = clEnqueueWriteBuffer(queue,bufferA,CL_TRUE,0,dataSize,(void*)&gpd,0,NULL,NULL);
should be changed to something like
size_t dataSize = (sizeof(cl_float)*elements); // HERE
float* gpd_dataPointer = gpd.patterns; // HERE
cl_mem bufferA = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
// Now use the gpd_dataPointer
err = clEnqueueWriteBuffer(queue,bufferA,CL_TRUE,0,dataSize,(void*)&(gpd_dataPointer),0,NULL,NULL);
Same thing goes for the gridC
And when you copy back, copy it to gridC_dataPointer AKA gridC.dataPointer
And then continue using the struct as if nothing happened.
The problem is probably with the pointer inside your struct.
In this case, I would suggest to pass nInput,nOutput,patternCount,offset as kernel args, and the patterns as a buffer of float:
__kernel void patternDataAddition(int nInput,int nOutput,
int patternCount,int offset,
__global const float * inPatterns,
__global float * outPatterns)
I know that it is not actual now, but i passed this problem in other way:
Your code for allocation memory for struct with data stay same, but struct should bu changed to
typedef struct GPUPatternDataContatiner
{
int nodeInput, nodeOutput, patternCount, offest;
float patterns[0];
} GPUPatternData;
Using this "feature" i have created vectors for OpenCL

Resources