OpenCL Invalid Command Queue when calling clFinish() - c

I'm having a problem with some openCL code I'm writing.
I've written a collection of utility functions to remove some boilerplate code from where I'm using it. The test method runs at the beginning and works absolutely fine, the code being below:
void openCLtest(char *arg_program, char *arg_device)
{
cl_int ret;
cl_device_id device_id = getDeviceId(atoi(arg_program), atoi(arg_device));
cl_context context = get_cl_context(&device_id);
cl_command_queue queue = get_cl_command_queue(&context, &device_id);
cl_kernel kernel = compileCLkernel(&context, &device_id, "src/hello.cl", "hello");
cl_mem memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(char), NULL, &ret);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Allocate Buffer\n");
exit(1);
}
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to set kernel Arg\n");
exit(1);
}
ret = clEnqueueTask(queue, kernel, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Enqueue Task\n");
exit(1);
}
ret = clFinish(queue);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to wait for finish\n");
exit(1);
}
char string[MEM_SIZE];
ret = clEnqueueReadBuffer(queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(char), string, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to read buffer\n");
exit(1);
}
printf("CL Produced: %s\n", string);
ret = clFlush(queue);
ret = clFinish(queue);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Wait for test queue to finish\n");
exit(1);
}
ret = clReleaseKernel(kernel);
ret = clReleaseMemObject(memobj);
ret = clReleaseCommandQueue(queue);
ret = clReleaseContext(context);
}
This code works fine, and I then extracted the code into more functions which can be used for the real openCL I'm writing.
The same principle has been applied in the rest of the code, but this time, it doesn't work.
main:
openCLtest(argv[2], argv[3]); //This is the code above and works great
cl_device_id device_id = getDeviceId(atoi(argv[2]), atoi(argv[3]));
cl_context context = get_cl_context(&device_id);
cl_command_queue queue = get_cl_command_queue(&context, &device_id);
....
double *coords_3D = cl_extrude_coords(&device_id, &context, &queue, coords_2D, nodes, LAYERS, LAYER_HEIGHT);
cl_extrude_coords:
double *cl_extrude_coords(cl_device_id* device_id, cl_context* context, cl_command_queue* queue, double *coords, int nodes, int layers, double layer_height)
{
cl_int ret;
cl_kernel extrude_coords = compileCLkernel(context, device_id, "src/OpenCL_Kernels/extrude_coords.cl", "extrude_coords");
cl_mem coords_2d = clCreateBuffer(*context, CL_MEM_READ_ONLY, sizeof(coords) / sizeof(coords[0]), NULL, &ret);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Create coords_2d CL Buffer %d\n", ret);
exit(1);
}
cl_mem result = clCreateBuffer(*context, CL_MEM_WRITE_ONLY, sizeof(double) * nodes * 3 * layers, NULL, &ret);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Create result CL Buffer %d\n", ret);
exit(1);
}
ret = clEnqueueWriteBuffer(*queue, coords_2d, CL_TRUE, 0, sizeof(coords) / sizeof(coords[0]), (const void *)&coords, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed enqueue coords_2d write to buffer %d\n", ret);
exit(1);
}
ret = clSetKernelArg(extrude_coords, 0, sizeof(cl_mem), (void *)&coords_2d);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Set kernel argument coords_2d %d\n", ret);
exit(1);
}
ret = clSetKernelArg(extrude_coords, 1, sizeof(cl_mem), (void *)&result);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Set kernel argument result CL Buffer %d\n", ret);
exit(1);
}
ret = clSetKernelArg(extrude_coords, 2, sizeof(double), (void *)&layer_height);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Set kernel argument layers %d\n", ret);
exit(1);
}
size_t gWorkSize[] = {nodes, layers};
cl_event clEvent;
ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, &clEvent);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Enqueue Extrude Coordinates Kernel\n");
exit(1);
}
double *res = (double *)malloc(sizeof(double) * nodes * 3 * layers);
ret = clFinish(*queue);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to wait for queue to finish in extrude_coords %d\n", ret);
exit(1);
}
ret = clEnqueueReadBuffer(*queue, result, CL_TRUE, 0, sizeof(double) * nodes * 3 * layers, (void *)res, 1, &clEvent, NULL);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to Enqueue the extrude_coords result buffer read %d\n", ret);
exit(1);
}
ret = clReleaseKernel(extrude_coords);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to release kernel\n");
exit(1);
}
ret = clReleaseMemObject(coords_2d);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to release result memory object\n");
exit(1);
}
ret = clReleaseMemObject(result);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to release result memory object\n");
exit(1);
}
return res;
}
cl kernel:
#pragma OPENCL EXTENSION cl_khr_fp64: enable
__kernel void extrude_coords(__global const double * coords, __global double * res, const double layer_height){
uint i=get_global_id(0);
uint j=get_global_id(1);
uint layers=get_global_size(0);
res[3*(i*layers + j)] = coords[2*i];
res[3*(i*layers + j) + 1] = coords[2*i + 1];
res[3*(i*layers + j) + 2] = layer_height * j;
}
This function however, does not work, throwing the error below when clFinish(queue) is called.
Failed to wait for queue to finish in extrude_coords -36
Looking this up, I can see -36 is CL_INVALID_COMMAND_QUEUE. If I don't exit here, I then get an error thrown at the buffer read, error code -5, CL_OUT_OF_RESOURCES.
I'm not sure what is going wrong. The values of nodes and layers when this code is being tested are 151731 and 101 respectively. I'm not sure if that has something to do with it.
Does anyone have any ideas on what could be the issue and how to fix it, or even any suggestions on whether this structure for the code is a good idea. The plan was by passing the queue, context and device ID, each function can produce and execute its own kernel(s) to do something with the queue etc being released at the end of the program when they're no longer needed.
Any help would be appreciated, I've been stumped on this for several hours now.
EDIT:
I have since tried changinging the calling convention of clEnqueueNDRange in extrude_coords to
ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize[0], NULL, 0, NULL, &clEvent);
as suggested in an answer but this does not work. Testing with printf("%d\n", &gWorkSize == &gWorkSize[0]); shows that the two pointers are functionally the same, so this is not the issue.
I then went on to modify the test openCL code to use clEnqueueNDRange instead of clEnqueueTask as follows:
size_t gWorkSize[] = {1, 1};
// ret = clEnqueueTask(queue, kernel, 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, NULL);
This still all works correctly, so something else is clearly wrong... I'm still not sure what...

The sizeof(coords) / sizeof(coords[0]) will not give the array size in C/C++. Best to use sizeof(coords)*elementsInCoords and pass in elementsInCoords. Alternatively, setup coords to be a std::vector<> and pass that around since you can get a data pointer out of it and the size as well.

Look at code:
size_t gWorkSize[] = {nodes, layers};
cl_event clEvent;
ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, (const size_t *)&gWorkSize, NULL, 0, NULL, &clEvent);
&gWorkSize is of type size_t (*)[2], while argument must be of type const size_t*
Try this:
ret = clEnqueueNDRangeKernel(*queue, extrude_coords, 2, NULL, &gWorkSize[0], NULL, 0, NULL, &clEvent);

Related

Get cifs service ticket for the user credential using krb5

I am trying to get the krb5 service ticket for the cifs server using the below code. I am able to get the intial creds for the smb user. But when I try to get the cifs service ticket for the smb server, I am getting error PRINCIPAL_UNKNOWN error. In the packet trace, I noticed that TGS_REQ is sent with sname "krbtgt\cifs_server_name.domain_name" instead of cifs\cifs_server_name.domain_name. I am not sure what mistake I am making.. I constructed the service principal correctly( line 12-14) Not sure why TGS-REQ is send with sname krbtgt instead.
krb5_creds credentials;
krb5_creds* service_credentials;
krb5_principal user_principal = NULL;
krb5_principal service_principal = NULL;
krb5_ccache ccache;
krb5_get_init_creds_opt *options;
memset(&credentials, 0, sizeof(credentials));
char buf[100];
sprintf(buf, "%s#%s", smb2->user, smb2->domain);
ret = krb5_parse_name(context, buf, &user_principal);
sprintf(buf, "%s#%s", "cifs", smb2->target_name);
fprintf(stderr, "buff %s\n", buf);
ret = krb5_parse_name(context, buf, &service_principal);
if (ret != 0) {
fprintf(stderr, "krb5_parse_name %d\n", ret );
exit(-1);
}
ret = krb5_cc_default(context, &ccache);
if (ret != 0) {
fprintf(stderr, "krb5_parse_name %d\n", ret );
exit(-1);
}
ret = krb5_cc_initialize (context, ccache, user_principal);
if (ret != 0) {
fprintf(stderr, "krb5_cc_initialize %d\n", ret );
exit(-1);
}
ret = krb5_get_init_creds_opt_alloc(context, &options);
if (ret != 0) {
fprintf(stderr, "krb5_get_init_creds_opt_alloc %d\n", ret );
exit(-1);
}
ret = krb5_get_init_creds_opt_set_out_ccache(context, options, ccache);
if (ret != 0) {
fprintf(stderr, "krb5_get_init_creds_opt_set_out_ccache %d\n", ret );
exit(-1);
}
// Gets the realm name for the hostname
ret = krb5_get_init_creds_password(context, &credentials, user_principal,
smb2->password, NULL,
NULL, 0, NULL, options);
fprintf(stderr, "krb5_get_init_creds_password %d\n", ret);
if (ret != 0) {
fprintf(stderr, "krb5_get_init_creds_password %d\n", ret );
//exit(-1);
}
credentials.server = service_principal;
credentials.client = user_principal;
// krb5_tkt_creds_init(context, ccache, credentials, options, &)
ret = krb5_get_credentials(context, 0, ccache, &credentials, &service_credentials);
if (ret != 0) {
fprintf(stderr, "krb5_get_credentials %d\n", ret );
exit(-1);
}
fprintf(stderr, "----------------------------------------------------------krb5_get_credentials %d----------------------------------------------------------\n", ret);
Please help to resolve this issue.
Thanks
I found the problem after trial and error method. service principle should be cifs/cifs_server_name.domain_name not cifs#cifs_server_name.domain_name. Only user principle should be user#domain_name. After editing the below line, krb5_get_credentials() was able to get service ticket/
sprintf(buf, "%s/%s", "cifs", smb2->target_name);

Running OpenCL using Visual Studio 2015

I'm a newbie in OpenCL, so far I referred Dr. Dobbs tutorials for OpenCL and few others and ran it on Ubuntu which worked very well but those same codes won't/refuse to work on windows using Visual Studio with all required Environment Variables set correctly.
I'm using 980M with CUDA SDK 8 on vs 2015. I have two files, one in C and another is a kernel(cl) file. Whenever I add both .c & .cl files, the program refuses to run by throwing errors like Can't find program files and things like that. However, if I write the kernel file within C file, it works sometimes say 1 out of 3. The same program works fine on my PC running Ubuntu 16 and another PC with AMD card running on Ubuntu 16.
The program I'm trying to run is vector addition written in C.
I've attached the link to the code.
OpenCL vector addition
add_numbers.c
#define PROGRAM_FILE "add_numbers.cl"
#define KERNEL_FUNC "vecAdd"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef __linux
#include <unistd.h>
#include <fcntl.h>
#endif // __linux
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
return dev;
}
/* Create program from a file and compile it */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if(err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main() {
/* OpenCL structures */
cl_device_id device;
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_device_type dev_type; //new
// cl_int i, j, err;
size_t local_size, global_size;
// vector add pgm
// Length of vectors
unsigned int n = 1000000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host
h_a = (double *)malloc(bytes);
h_b = (double *)malloc(bytes);
h_c = (double *)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
/*h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);*/
h_a[i] = i+1;
h_b[i] = i+2;
}
// size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
local_size = 64;
// Number of total work items - localSize must be devisor
global_size = (n/local_size)*local_size;
/* Create device and context */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build program */
program = build_program(context, device, PROGRAM_FILE);
/* Create data buffer */
input_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err);
sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), sum, &err);
if(err < 0) {
perror("Couldn't create a buffer");
exit(1);
};*/
//input buffers
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
/* Create a command queue */
queue = clCreateCommandQueue(context, device, 0, &err);
if(err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
/* Create a kernel */
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
// Newer args under test
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
if(err < 0) {
perror("Couldn't create a kernel argument");
exit(1);
}
/* Enqueue kernel */
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size,
&local_size, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't enqueue the kernel");
exit(1);
}
/* Read the kernel's output */
err = clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
if(err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
//cl_device_type dev_type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
if (dev_type == CL_DEVICE_TYPE_GPU) {
puts("I'm 100% sure this device is a GPU");
}
else
puts("Device is CPU\n");
// Sum calc.
double sum = 0;
for(i=0; i<n; i++)
//sum += h_c[i];
sum = h_a[i] + h_b[i];
printf("final result: %lf\n",(sum/n));
/* Deallocate resources */
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
add_number.cl kernel file
__kernel void vecAdd( __global double *a,
__global double *b,
__global double *c,
const unsigned int n)
{
#pragma OPENCL EXTENSION cl_khr_fp64 :enable
//Get our global thread ID
int id = get_global_id(0);
//Make sure we do not go out of bounds
if (id < n)
c[id] = a[id] + b[id];
}

OpenCL clCreateBuffer() crashes the program

I have taken up OpenCL programming as part of a university project and I have a bit of a problem when I try to input data to a buffer object during the clCreateBuffer() routine.
The program is a simple two-dimensional matrix addition. The code is as follows:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "add_kernel.cl"
#define ADD_FUNC "add_matrix"
#define MATRIX_DIM 256
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Find a GPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a GPU */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if(err < 0) {
perror("Couldn't access any GPU type");
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if(program_handle == NULL) {
perror("Couldn't find the program file");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if(err < 0) {
perror("Couldn't create the program");
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if(err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
int main(){
/* Host/device data structures */
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel add_kernel;
size_t global_size;
cl_ulong mem_size;
cl_int i, j, err, check;
/* Data and buffers */
cl_uint matrix_dim;
float a_mat[MATRIX_DIM][MATRIX_DIM], b_mat[MATRIX_DIM][MATRIX_DIM],
c_mat[MATRIX_DIM][MATRIX_DIM], check_mat[MATRIX_DIM][MATRIX_DIM];
cl_mem a_buffer, b_buffer, c_buffer;
/* Initialize A, B, and check matrices */
srand((unsigned int)time(0));
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
a_mat[i][j] = (float)rand()/RAND_MAX;
}
}
srand((unsigned int)time(0));
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
b_mat[i][j] = (float)rand()/RAND_MAX;
check_mat[i][j] = 0.0f;
}
}
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
check_mat[i][j] += a_mat[i][j] + b_mat[i][j];
}
}
/* Create a device and context */
device = create_device();
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");
exit(1);
}
/* Build the program */
program = build_program(context, device, PROGRAM_FILE);
add_kernel = clCreateKernel(program, ADD_FUNC, &err);
if(err < 0) {
perror("Couldn't create a kernel");
exit(1);
};
/* Create buffers */
a_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(a_mat), a_mat, &err);
if(err < 0) {
perror("Couldn't create buffer A");
exit(1);
};
b_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(b_mat), b_mat, &err);
if(err < 0) {
perror("Couldn't create buffer B");
exit(1);
};
c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(c_mat), NULL, &err);
if(err < 0) {
perror("Couldn't create buffer C");
exit(1);
};
/* Create a command queue */
queue = clCreateCommandQueue(context, device, 0, &err);
if(err < 0) {
perror("Couldn't create a command queue");
exit(1);
};
/* Create arguments for multiplication kernel */
err = clSetKernelArg(add_kernel, 0, sizeof(a_buffer), &a_buffer);
err |= clSetKernelArg(add_kernel, 1, sizeof(b_buffer), &b_buffer);
err |= clSetKernelArg(add_kernel, 2, sizeof(c_buffer), &c_buffer);
global_size = MATRIX_DIM * MATRIX_DIM;
//printf("%lu\n", global_size);
err = clEnqueueNDRangeKernel(queue, add_kernel, 1, NULL, &global_size,
NULL, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't enqueue the addition kernel");
exit(1);
}
/* Read output buffer */
err = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0,
sizeof(c_mat), c_mat, 0, NULL, NULL);
if(err < 0) {
perror("Couldn't read the buffer");
exit(1);
}
/* Check result */
check = 1;
for(i=0; i<MATRIX_DIM; i++) {
for(j=0; j<MATRIX_DIM; j++) {
if(c_mat[i][j] != check_mat[i][j]){
check = 0;
break;
}
}
}
if(check)
printf("Addition check succeeded.\n");
else
printf("Addition check failed.\n");
/* Deallocate resources */
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseMemObject(c_buffer);
clReleaseKernel(add_kernel);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}
The kernel code is the following:
__kernel void add_matrix(__global float* matrix_a,
__global float* matrix_b,
__global float* result) {
int i = get_global_id(0);
result[i] = matrix_a[i] + matrix_b[i];
}
Now, it works great for dimensions up to 358x358, but as soon as I put 359 in the MATRIX_DIM it crashes. It shows the usual "foo.exe has stopped working". I know it has to do something with the clCreateBuffer() command because if I remove the code from the first clCreateBuffer() and below, it runs and terminates fine, but as soon as I add even one it crashes.
The CL_DEVICE_MAX_MEM_ALLOC_SIZE option shows a number of 512MB of available memory and the data I am trying to pass is much less than that.
Is there anything I can do to increase the ammount of data I can process?
My GPU is a Radeon Sapphire HD5770
EDIT: After a suggestion in the comments I ran the debugger which yielded the following message:
Program received signal SIGSEGV, Segmentation fault.
In amdocl!_aclHsaLoader () (C:\WINDOWS\SysWOW64\amdocl.dll)
#15 0x00401355 in create_device () at C:\test\testcl.c:26
C:\test\testcl.c:26:503:beg:0x401355
I am really not sure what this means though. Any ideas?
The main problem is, that you allocate to much memory on the stack at these code lines, so that, you got a stack overflow:
float a_mat[MATRIX_DIM][MATRIX_DIM], b_mat[MATRIX_DIM][MATRIX_DIM],
c_mat[MATRIX_DIM][MATRIX_DIM], check_mat[MATRIX_DIM][MATRIX_DIM];
In my test here, the execution didn't even entered the main method. You have to allocate these matrices on the heap with:
float *a_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*a_mat));
float *b_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*b_mat));
float *c_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*c_mat));
float *check_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*check_mat));
But now, you have only a 1-dimensional (1D) data-buffer for each matrix, so that, you have to change every 2D index [i][j] into the corresponding 1D index [i*MATRIX_DIM][j], e.g.:
a_mat[i*MATRIX_DIM+j] = (float)rand()/RAND_MAX;
EDIT: You have to also update the calls to clCreateBuffer und clEnqueueReadBuffer. The matrix size cannot be determined with sizeof(matrix_name) anymore (where matrix_name is one of a_mat, b_mat, ...). You have to replace every such sizeof (there are 4 of some) with MATRIX_DIM*MATRIX_DIM*sizeof(*matrix_name). Don't forget the derefence before the matrix_name, e.g.:
a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
MATRIX_DIM*MATRIX_DIM*sizeof(*a_mat), a_mat, &err);
(End of Edit).
Don't forget to release the data-buffers at the end:
free(a_mat);
free(b_mat);
free(c_mat);
free(check_mat);
To get even the kernel to run, I had even to fix the reading of the kernel program. The return value of ftell was always a little bit too large. The actual number of bytes is instead returned by fread. Thus, change these lines
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
to
program_size = fread(program_buffer, sizeof(char), program_size, program_handle); // changed
program_buffer[program_size] = '\0'; // moved here

my sqlite3 c function is leaking memory

Any idea why this function is leaking memory every time I call it?
char *getData(sqlite3 *db)
{
char *ret;
sqlite3_stmt *res;
int rc = sqlite3_prepare_v2(db, SELECT_STATEMENT_SQL, -1, &res, 0);
if (rc != SQLITE_OK) {
sprintf(stderr, "%s\n", sqlite3_errmsg(db));
return stderr;
}
rc = sqlite3_step(res);
if (rc == SQLITE_ROW) {
ret = sqlite3_column_text(res, 0);
} else {
ret = "error!";
}
sqlite3_free(res);
return ret;
}
You need to call sqlite3_finalize() to properly release memory, allocated for the statement.

Simple OpenCL program compiles and runs but output is incorrect

I wrote a simply OpenCL program based off the SDK and it compiles and runs, however the output is wrong. Is there something I'm doing wrong?
Any suggestions for learning to debug C and OpenCL is much appreciated. I'm quite new to the platform.
Code is below.
The output in array c is all zeros.
Thanks.
test_opencl.h
#ifndef _TEST_OPENCL_H_
#define _TEST_OPENCL_H_
int main( int argc, const char** argv);
int runTest( int argc, const char** argv);
#endif
test_opencl.cl
// simple test of adding a[i] to b[i] to get c[i]
__kernel void add_array(__global float *a, __global float *b, __global float *c)
{
int xid = get_global_id(0);
c[xid] = a[xid] + b[xid];
}
test_opencl.cpp
// standard utility and system includes
#include <oclUtils.h>
#include "test_opencl.h"
// OpenCL error catcher
cl_int err = 0;
// Main Program
// *********************************************************************
int main( int argc, const char** argv)
{
// set logfile name and start logs
shrSetLogFileName ("test_opencl.txt");
shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]);
// run the main test
int result = runTest(argc, argv);
shrCheckError(result, 0);
// finish
shrEXIT(argc, argv);
}
//! Run a simple test for OPENCL
// *********************************************************************
int runTest( int argc, const char** argv)
{
cl_context gpu_context;
cl_command_queue cmd_queue;
cl_program program;
cl_kernel test_kernel;
const size_t szGlobalWorkSize = 10;
const size_t szLocalWorkSize = 10;
// size of memory required to store the array
const unsigned int mem_size = sizeof(int) * 10;
// create the OpenCL context on a GPU device
gpu_context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
shrCheckError(err, CL_SUCCESS);
// get devices
cl_device_id device;
if( shrCheckCmdLineFlag(argc, argv, "device") ) {
int device_nr = 0;
shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
device = oclGetDev(gpu_context, device_nr);
} else {
device = oclGetMaxFlopsDev(gpu_context);
}
// create a command-queue
cmd_queue = clCreateCommandQueue(gpu_context, device, 0, &err);
shrCheckError(err, CL_SUCCESS);
// allocate and initalize host memory
int a[10], b[10], c[10];
for (int i = 0; i < 10; i++) {
a[i] = i;
b[i] = i * i;
}
// create buffers on device
cl_mem vol_a = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, a, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_b = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, b, &err);
shrCheckError(err, CL_SUCCESS);
cl_mem vol_c = clCreateBuffer(gpu_context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, c, &err);
shrCheckError(err, CL_SUCCESS);
// copy data from host to device
err = clEnqueueWriteBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(cmd_queue, vol_b, CL_TRUE, 0, mem_size, b, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
// Program Setup
size_t program_length;
char* source_path = shrFindFilePath("test_opencl.cl", argv[0]);
shrCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
shrCheckError(source != NULL, shrTRUE);
// create the program
program = clCreateProgramWithSource(gpu_context, 1, (const char **)&source, &program_length, &err);
shrCheckError(err, CL_SUCCESS);
// build the program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLog(LOGBOTH | ERRORMSG, err, STDERROR);
return(EXIT_FAILURE);
}
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Starting kernel operation...\n\n", argv[0]);
// create the test kernel
test_kernel = clCreateKernel(program, "add_array", &err);
shrCheckError(err, CL_SUCCESS);
// set the args values for the kernel
err = clSetKernelArg(test_kernel, 0, sizeof(cl_mem), (void *) &vol_a);
err |= clSetKernelArg(test_kernel, 1, sizeof(cl_mem), (void *) &vol_b);
err |= clSetKernelArg(test_kernel, 2, sizeof(cl_mem), (void *) &vol_c);
shrCheckError(err, CL_SUCCESS);
err = clEnqueueNDRangeKernel(cmd_queue, test_kernel, 1, NULL, &szGlobalWorkSize, NULL, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
// copy result from device to host
err = clEnqueueReadBuffer(cmd_queue, vol_c, CL_TRUE, 0, mem_size, c, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
int d[10];
err = clEnqueueReadBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, d, 0, NULL, NULL);
shrCheckError(err, CL_SUCCESS);
clFinish(cmd_queue);
shrLog(LOGBOTH, 0, "%s Finished kernel operation...\n\n", argv[0]);
bool passed = true;
for (int i = 0; i < 10; i++) {
if (c[i] != i + i * i)
passed = false;
shrLog(LOGBOTH, 0, "c = %d d = %d\n", c[i], d[i]);
}
if (passed)
shrLog(LOGBOTH, 0, "%s Test Passed\n\n", argv[0]);
else
shrLog(LOGBOTH, 0, "%s Test Failed\n\n", argv[0]);
// cleanup OpenCL
clReleaseMemObject(vol_a);
clReleaseMemObject(vol_b);
clReleaseMemObject(vol_c);
clReleaseKernel(test_kernel);
clReleaseProgram(program);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(gpu_context);
return 0;
}
The problems in the code and the solution can be found here.

Resources