How do I fix a CL_INVALID_MEM_OBJECT when I pass a buffer to an OpenCL kernel and the buffer is from clCreateFromGLTexture? - c

I need to do some math on the pixels of a GL texture. I have a working OpenCL kernel. I can convert the GL texture to an OpenCL buffer (at least, it doesn't give an error). But when I try to set that buffer as an argument to the kernel I get error -38 (CL_INVALID_MEM_OBJECT).
I originally tried it using rust, but when that was failing, I switched to C just to see if the problem existed independent of rust's wrappers.
This is my C app that is duct taped together from a couple of examples (c++ -g -o check2 check2.cxx -lOpenCL -lGL -lX11):
/*
https://www.khronos.org/opengl/wiki/Tutorial:_OpenGL_3.0_Context_Creation_(GLX)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>
#include <GL/gl.h>
#include <GL/glx.h>
#include <CL/cl.h>
#include <CL/cl_gl.h>
#define GLX_CONTEXT_MAJOR_VERSION_ARB 0x2091
#define GLX_CONTEXT_MINOR_VERSION_ARB 0x2092
typedef GLXContext (*glXCreateContextAttribsARBProc)(Display*, GLXFBConfig, GLXContext, Bool, const int*);
// Helper to check for extension string presence. Adapted from:
// http://www.opengl.org/resources/features/OGLextensions/
static bool isExtensionSupported(const char *extList, const char *extension)
{
const char *start;
const char *where, *terminator;
/* Extension names should not have spaces. */
where = strchr(extension, ' ');
if (where || *extension == '\0')
return false;
/* It takes a bit of care to be fool-proof about parsing the
OpenGL extensions string. Don't be fooled by sub-strings,
etc. */
for (start=extList;;) {
where = strstr(start, extension);
if (!where)
break;
terminator = where + strlen(extension);
if ( where == start || *(where - 1) == ' ' )
if ( *terminator == ' ' || *terminator == '\0' )
return true;
start = terminator;
}
return false;
}
static bool ctxErrorOccurred = false;
static int ctxErrorHandler( Display *dpy, XErrorEvent *ev )
{
ctxErrorOccurred = true;
return 0;
}
void test_kernel(cl_mem a_mem_obj, cl_mem b_mem_obj, cl_mem c_mem_obj, cl_kernel kernel, int LIST_SIZE, cl_command_queue command_queue)
{
int err;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
printf("err? %d\tset kernel arg 0\n", err);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
printf("err? %d\n", err);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
printf("err? %d\n", err);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
printf("err? %d\t(kernel diff)\n", err);
// Read the memory buffer C on the device to the local variable C
long long *C;
C= (long long*)malloc(sizeof(*C)*LIST_SIZE);
err = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(*C), C, 0, NULL, NULL);
printf("err? %d(copy result)\n", err);
for (int i=0; i<LIST_SIZE && i<10; i++) {
printf("%lld\n", C[i]);
}
free(C);
}
int main(int argc, char* argv[])
{
Display *display = XOpenDisplay(NULL);
if (!display)
{
printf("Failed to open X display\n");
exit(1);
}
// Get a matching FB config
static int visual_attribs[] =
{
GLX_X_RENDERABLE , True,
GLX_DRAWABLE_TYPE , GLX_WINDOW_BIT,
GLX_RENDER_TYPE , GLX_RGBA_BIT,
GLX_X_VISUAL_TYPE , GLX_TRUE_COLOR,
GLX_RED_SIZE , 8,
GLX_GREEN_SIZE , 8,
GLX_BLUE_SIZE , 8,
GLX_ALPHA_SIZE , 8,
GLX_DEPTH_SIZE , 24,
GLX_STENCIL_SIZE , 8,
GLX_DOUBLEBUFFER , True,
//GLX_SAMPLE_BUFFERS , 1,
//GLX_SAMPLES , 4,
None
};
int glx_major, glx_minor;
// FBConfigs were added in GLX version 1.3.
if ( !glXQueryVersion( display, &glx_major, &glx_minor ) ||
( ( glx_major == 1 ) && ( glx_minor < 3 ) ) || ( glx_major < 1 ) )
{
printf("Invalid GLX version");
exit(1);
}
printf( "Getting matching framebuffer configs\n" );
int fbcount;
GLXFBConfig* fbc = glXChooseFBConfig(display, DefaultScreen(display), visual_attribs, &fbcount);
if (!fbc)
{
printf( "Failed to retrieve a framebuffer config\n" );
exit(1);
}
printf( "Found %d matching FB configs.\n", fbcount );
// Pick the FB config/visual with the most samples per pixel
printf( "Getting XVisualInfos\n" );
int best_fbc = -1, worst_fbc = -1, best_num_samp = -1, worst_num_samp = 999;
int i;
for (i=0; i<fbcount; ++i)
{
XVisualInfo *vi = glXGetVisualFromFBConfig( display, fbc[i] );
if ( vi )
{
int samp_buf, samples;
glXGetFBConfigAttrib( display, fbc[i], GLX_SAMPLE_BUFFERS, &samp_buf );
glXGetFBConfigAttrib( display, fbc[i], GLX_SAMPLES , &samples );
printf( " Matching fbconfig %d, visual ID 0x%2lx: SAMPLE_BUFFERS = %d,"
" SAMPLES = %d\n",
i, vi -> visualid, samp_buf, samples );
if ( best_fbc < 0 || samp_buf && samples > best_num_samp )
best_fbc = i, best_num_samp = samples;
if ( worst_fbc < 0 || !samp_buf || samples < worst_num_samp )
worst_fbc = i, worst_num_samp = samples;
}
XFree( vi );
}
GLXFBConfig bestFbc = fbc[ best_fbc ];
// Be sure to free the FBConfig list allocated by glXChooseFBConfig()
XFree( fbc );
#if 0
// Get a visual
XVisualInfo *vi = glXGetVisualFromFBConfig( display, bestFbc );
printf( "Chosen visual ID = 0x%lx\n", vi->visualid );
printf( "Creating colormap\n" );
XSetWindowAttributes swa;
Colormap cmap;
swa.colormap = cmap = XCreateColormap( display,
RootWindow( display, vi->screen ),
vi->visual, AllocNone );
swa.background_pixmap = None ;
swa.border_pixel = 0;
swa.event_mask = StructureNotifyMask;
printf( "Creating window\n" );
Window win = XCreateWindow( display, RootWindow( display, vi->screen ),
0, 0, 100, 100, 0, vi->depth, InputOutput,
vi->visual,
CWBorderPixel|CWColormap|CWEventMask, &swa );
if ( !win )
{
printf( "Failed to create window.\n" );
exit(1);
}
// Done with the visual info data
XFree( vi );
XStoreName( display, win, "GL 3.0 Window" );
printf( "Mapping window\n" );
XMapWindow( display, win );
#endif
// Get the default screen's GLX extension list
const char *glxExts = glXQueryExtensionsString( display,
DefaultScreen( display ) );
// NOTE: It is not necessary to create or make current to a context before
// calling glXGetProcAddressARB
glXCreateContextAttribsARBProc glXCreateContextAttribsARB = 0;
glXCreateContextAttribsARB = (glXCreateContextAttribsARBProc)
glXGetProcAddressARB( (const GLubyte *) "glXCreateContextAttribsARB" );
GLXContext ctx = 0;
// Install an X error handler so the application won't exit if GL 3.0
// context allocation fails.
//
// Note this error handler is global. All display connections in all threads
// of a process use the same error handler, so be sure to guard against other
// threads issuing X commands while this code is running.
ctxErrorOccurred = false;
int (*oldHandler)(Display*, XErrorEvent*) =
XSetErrorHandler(&ctxErrorHandler);
// Check for the GLX_ARB_create_context extension string and the function.
// If either is not present, use GLX 1.3 context creation method.
if ( !isExtensionSupported( glxExts, "GLX_ARB_create_context" ) ||
!glXCreateContextAttribsARB )
{
printf( "glXCreateContextAttribsARB() not found"
" ... using old-style GLX context\n" );
ctx = glXCreateNewContext( display, bestFbc, GLX_RGBA_TYPE, 0, True );
}
// If it does, try to get a GL 3.0 context!
else
{
int context_attribs[] =
{
GLX_CONTEXT_MAJOR_VERSION_ARB, 3,
GLX_CONTEXT_MINOR_VERSION_ARB, 0,
//GLX_CONTEXT_FLAGS_ARB , GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB,
None
};
printf( "Creating context\n" );
ctx = glXCreateContextAttribsARB( display, bestFbc, 0,
True, context_attribs );
// Sync to ensure any errors generated are processed.
XSync( display, False );
if ( !ctxErrorOccurred && ctx )
printf( "Created GL 3.0 context\n" );
else
{
// Couldn't create GL 3.0 context. Fall back to old-style 2.x context.
// When a context version below 3.0 is requested, implementations will
// return the newest context version compatible with OpenGL versions less
// than version 3.0.
// GLX_CONTEXT_MAJOR_VERSION_ARB = 1
context_attribs[1] = 1;
// GLX_CONTEXT_MINOR_VERSION_ARB = 0
context_attribs[3] = 0;
ctxErrorOccurred = false;
printf( "Failed to create GL 3.0 context"
" ... using old-style GLX context\n" );
ctx = glXCreateContextAttribsARB( display, bestFbc, 0,
True, context_attribs );
}
}
// Sync to ensure any errors generated are processed.
XSync( display, False );
// Restore the original error handler
XSetErrorHandler( oldHandler );
if ( ctxErrorOccurred || !ctx )
{
printf( "Failed to create an OpenGL context\n" );
exit(1);
}
// Verifying that context is a direct context
if ( ! glXIsDirect ( display, ctx ) )
{
printf( "Indirect GLX rendering context obtained\n" );
}
else
{
printf( "Direct GLX rendering context obtained\n" );
}
printf( "Making context current\n" );
glXMakeCurrent( display, 0, ctx );
#define IMAGE_DIAM 512
#define LIST_SIZE (IMAGE_DIAM*IMAGE_DIAM*3)
GLuint texture_id;
{
glGenTextures(1, &texture_id);
printf("err? %d\n", glGetError());
glBindTexture(GL_TEXTURE_2D, texture_id);
printf("err? %d\t(bind texture)\n", glGetError());
unsigned char random_stack_crap[LIST_SIZE];
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB8, IMAGE_DIAM, IMAGE_DIAM, 0, GL_RGB, GL_UNSIGNED_BYTE, random_stack_crap);
printf("err? %d\n", glGetError());
}
//
//
//
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
printf("device\t%p\n", device);
printf ("context\t%p\t%p\n", ctx, glXGetCurrentContext());
printf ("display\t%p\t%p\n", display, glXGetCurrentDisplay());
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR,
(cl_context_properties) glXGetCurrentContext(),
CL_GLX_DISPLAY_KHR,
(cl_context_properties) glXGetCurrentDisplay(),
CL_CONTEXT_PLATFORM,
(cl_context_properties) platform,
0
};
int err=0;
cl_context cl = clCreateContext(props, 1, &device, NULL, NULL, &err);
printf("err? %d\n", err);
printf("cl context %p\n", cl);
//
// https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/
//
cl_command_queue command_queue = clCreateCommandQueue(cl, device, 0, &err);
printf("err? %d\n", err);
printf("cl queue %p\n", command_queue);
// Create memory buffers on the device for each vector
cl_mem z_mem_obj = clCreateFromGLTexture( cl,
CL_MEM_READ_WRITE,
GL_TEXTURE_2D,
0,
texture_id,
&err);
cl_mem a_mem_obj = clCreateBuffer(cl, CL_MEM_READ_ONLY,
LIST_SIZE , NULL, &err);
printf("err? %d\n", err);
cl_mem b_mem_obj = clCreateBuffer(cl, CL_MEM_READ_ONLY,
LIST_SIZE , NULL, &err);
printf("err? %d\n", err);
cl_mem c_mem_obj = clCreateBuffer(cl, CL_MEM_WRITE_ONLY,
LIST_SIZE*8 , NULL, &err);
printf("err? %d\n", err);
// Copy the lists A and B to their respective memory buffers
{
unsigned char *A = (unsigned char*) malloc(LIST_SIZE);
for (int i=0; i<LIST_SIZE; i++) {
A[i] = i+2;
}
unsigned char *B = (unsigned char*) malloc(LIST_SIZE);
for (int i=0; i<LIST_SIZE; i++) {
B[i] = 2*i;
}
err = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE, A, 0, NULL, NULL);
printf("err? %d\tcopy A\n", err);
err = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE, B, 0, NULL, NULL);
printf("err? %d\tcopy B\n", err);
}
const char* source_str = "__kernel void diff(__global uchar* rgb_a, __global uchar* rgb_b, __global ulong * diff_out)\n\
{\n\
int idx = get_global_id(0);\n\
diff_out[idx] = abs((int)rgb_a[idx] - (int)rgb_b[idx]);\n\
}";
size_t source_size = strlen(source_str);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(cl, 1,
(const char **)&source_str, (const size_t *)&source_size, &err);
printf("err? %d\t(create program)\n", err);
// Build the program
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
printf("err? %d\n", err);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "diff", &err);
printf("err? %d\n", err);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
printf("err? %d\n", err);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
printf("err? %d\n", err);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
printf("err? %d\n", err);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
printf("err? %d\t(kernel diff)\n", err);
// Read the memory buffer C on the device to the local variable C
long long *C;
C= (long long*)malloc(sizeof(*C)*LIST_SIZE);
err = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(*C), C, 0, NULL, NULL);
printf("err? %d(copy resul)\n", err);
for (int i=0; i<LIST_SIZE && i<10; i++) {
printf("%lld\n", C[i]);
}
free(C);
test_kernel(a_mem_obj, b_mem_obj, c_mem_obj, kernel, LIST_SIZE, command_queue);
test_kernel(z_mem_obj, b_mem_obj, c_mem_obj, kernel, LIST_SIZE, command_queue);
return 0;
}
The output on my linux/gentoo box with mesa-progs-8.4.0 is
Getting matching framebuffer configs
Found 10 matching FB configs.
Getting XVisualInfos
Matching fbconfig 0, visual ID 0x24: SAMPLE_BUFFERS = 0, SAMPLES = 0
Matching fbconfig 1, visual ID 0x7a: SAMPLE_BUFFERS = 0, SAMPLES = 0
Matching fbconfig 2, visual ID 0x38: SAMPLE_BUFFERS = 1, SAMPLES = 2
Matching fbconfig 3, visual ID 0x8e: SAMPLE_BUFFERS = 1, SAMPLES = 2
Matching fbconfig 4, visual ID 0x3a: SAMPLE_BUFFERS = 1, SAMPLES = 4
Matching fbconfig 5, visual ID 0x90: SAMPLE_BUFFERS = 1, SAMPLES = 4
Matching fbconfig 6, visual ID 0x44: SAMPLE_BUFFERS = 1, SAMPLES = 8
Matching fbconfig 7, visual ID 0x9a: SAMPLE_BUFFERS = 1, SAMPLES = 8
Matching fbconfig 8, visual ID 0x4c: SAMPLE_BUFFERS = 1, SAMPLES = 16
Matching fbconfig 9, visual ID 0xa2: SAMPLE_BUFFERS = 1, SAMPLES = 16
Creating context
Created GL 3.0 context
Direct GLX rendering context obtained
Making context current
err? 0
err? 0 (bind texture)
err? 0
device 0x557c1755cef0
context 0x557c173e0b68 0x557c173e0b68
display 0x557c172ee950 0x557c172ee950
err? 0
cl context 0x557c17559340
err? 0
cl queue 0x557c177dd080
err? 0
err? 0
err? 0
err? 0 copy A
err? 0 copy B
err? 0 (create program)
err? 0
err? 0
err? 0
err? 0
err? 0
err? 0 (kernel diff)
err? 0(copy resul)
2
1
0
1
2
3
4
5
6
7
err? 0 set kernel arg 0
err? 0
err? 0
err? 0 (kernel diff)
err? 0(copy result)
2
1
0
1
2
3
4
5
6
7
err? -38 set kernel arg 0
err? 0
err? 0
err? 0 (kernel diff)
err? 0(copy result)
2
1
0
1
2
3
4
5
6
7
Notice the err? -38 set kernel arg 0 which corresponds to the final test where I pass z_mem_obj to use the buffer converted from a GL texture.
clinfo | grep khr lists cl_khr_gl_sharing in the Platform Extensions and Device Extensions.

Related

why doesn't the OpenCL kernel execute even though there are no errors? (c, nvidia, kubuntu)

I'm learning opencl and for some reason the kernel does nothing:
#include <stdlib.h>
#include <stdio.h>
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
int err = 0;
#define PRINTERR() fprintf(stderr, "Error at line %u.\n", __LINE__)
#define CHECKERR(x) if(x){PRINTERR();return __LINE__;}
#define CHECKNOTERR(x) if(!x){PRINTERR();return __LINE__;}
const char *KernelSource =
"__kernel void square( \n" \
" __global float* input, \n" \
" __global float* output, \n" \
" const unsigned int count) \n" \
"{ \n" \
" int i = get_global_id(0); \n" \
" if(i == 0) printf(\"test\\n\"); \n" \
" if(i < count) \n" \
" output[i] = input[i] * input[i]; \n" \
"} \n" ;
#define DATA_SIZE 1024
int main(){
float data[DATA_SIZE];
float results[DATA_SIZE];
size_t global;
size_t local;
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_mem input;
cl_mem output;
unsigned int i = 0;
unsigned int count = DATA_SIZE;
for(i = 0; i < count; ++i)
//data[i] = rand() / (float)RAND_MAX;
data[i] = 2.f;
int gpu = 1;
err = clGetPlatformIDs (1, &platform_id, NULL); CHECKERR(err)
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); CHECKERR(err)
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); CHECKERR(!context)
commands = clCreateCommandQueueWithProperties(context, device_id, NULL, &err); CHECKERR(err)
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, &err); CHECKERR(err)
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, &err); CHECKERR(err)
CHECKERR(!input || !output)
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); CHECKERR(err)
program = clCreateProgramWithSource(context, 1, &KernelSource, NULL, &err); CHECKERR(err)
err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); CHECKERR(err)
kernel = clCreateKernel(program, "square", &err); CHECKERR(err)
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
CHECKERR(err)
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); CHECKERR(err)
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); CHECKERR(err)
err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); CHECKERR(err)
clFlush(commands);
clFinish(commands);
unsigned int correct = 0;
for(i = 0; i < count; ++i)
printf("%f\n",results[i]);
printf("Computed '%d/%d' correct values!\n", correct, count);
// free
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
i want it to do things, but it doesn't.
i tried reading the input instead of the output and it goes fine. the printf in the kernel does nothing and if i run it clEnqueueReadBuffer gives just 0. i have an amd, so i can't test it on the cpu.
i tried another example and it worked. (the one here)
help appreciated.
global is 0, so the program runs 0 times.

OpenCL/C: results always zero

I’m using Ubntu 20.04 to run my Opencl program with C but whatever the function i execute the results is always zero or other
illogical results, it’s like kernel not working,
Please if anyone has OpenCL, please try this program i want to know if the problem is in the program or in the installation of OpenCL.
This is the programme i use.
#define CL_USE_DEPRECATED_OPENCL_1_2APIS
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 10;
int* A = (int*)malloc(sizeof(int)*LIST_SIZE);
int* B = (int*)malloc(sizeof(int)*LIST_SIZE);
int* C = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
C[i] = 0;
}
// Load the kernel source code into the array source_str
/*FILE *fp;
char *source_str;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
size_t source_size;
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );*/
//size_t source_size;
const char* source_str =
"__kernel void vector_add(__global int *A, __global int *B, __global int *C) {\n"
" int i = get_global_id(0);\n"
" if(i>=10) return;\n"
" C[i] = A[i]+B[i];\n"
"}"
;
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, NULL, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t local_item_size = 64; // Divide work items into groups of 64
size_t global_item_size = ((LIST_SIZE+local_item_size-1)/local_item_size)*local_item_size; // Process the entire lists
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
clFinish(command_queue);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
The code looks oddly familiar to me...
With my PC it works as intended. I suppose the issue is that on your system no OpenCL device is detected. With this code snippet, you can select a platform and device:
// Get platform and device information
const int select_platform = 0;
const int select_device = 0;
cl_platform_id* platform_ids = NULL;
cl_uint nr_platforms = 0;
cl_device_id* device_ids = NULL;
cl_uint nr_devices = 0;
cl_device_id device_id = NULL;
cl_int ret = clGetPlatformIDs(0, NULL, &nr_platforms);
if(ret!=CL_SUCCESS || nr_platforms==0) std::cerr << "no OpenCL platforms found" << std::endl;
platform_ids = (cl_platform_id*)malloc(nr_platforms * sizeof(*platform_ids));
ret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms);
ret = clGetDeviceIDs(platform_ids[select_platform], CL_DEVICE_TYPE_ALL, 0, device_ids, &nr_devices);
if(ret!=CL_SUCCESS || nr_devices==0) std::cerr << "no OpenCL devices found on that platform" << std::endl;
device_ids = (cl_device_id*)malloc(nr_devices * sizeof(*device_ids));
ret = clGetDeviceIDs(platform_ids[select_platform], CL_DEVICE_TYPE_ALL, nr_devices, device_ids, &nr_devices);
device_id = device_ids[select_device];
char name[1024];
clGetDeviceInfo(device_id, CL_DEVICE_NAME, 1024, name, NULL);
std::cout << "selected device: " << name << std::endl;
Check different values of select_platform and select_device. If there is no device found on any platform, make sure to have compatible hardware and the latest graphics drivers or CPU runtime installed.
To make OpenCL development much less painful, consider this OpenCL-Wrapper. This automatically selects the fastest available OpenCL device for you in 1 line of code:
Device device(select_device_with_most_flops());
Alternatively, you can autiomatically select the device with most memory
Device device(select_device_with_most_memory());
or a device with specified ID:
Device device(select_device_with_id(1));

vector addition in OPENCL /C

I hope you are well, I have a problem with an opencl program, I execute the following program of the vector addition
#define CL_USE_DEPRECATED_OPENCL_1_2APIS
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 10;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
With the following kernel:
__kernel void vector_add(__global int *A, __global int *B, __global int *C) {
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
printf("calcule effectué");
}
While running a C program, I'm getting the following result and I can't seem to figure out why
0 + 10 = 714121520
1 + 9 = 21995
2 + 8 = 0
3 + 7 = 0
4 + 6 = 1852255608
5 + 5 = 1768697717
6 + 4 = 1932425826
7 + 3 = 3223151
8 + 2 = 1919885413
9 + 1 = 1953459744
I don't know what is the problem, please any help !!!!
I also get wrong results, and moreover the results are random for each execution. This means that the values in the C array are never overwritten, and the uninitialized, random values in that allocated memory location are printed.
First step to debug is to print ret and see where things go wrong:
After clGetDeviceIDs I got error -1 (device not found). CL_DEVICE_TYPE_ALL cleared that; then the selected device was my Nvidia GPU.
After clCreateProgramWithSource I got error -6 (out of host memory). To fix this, use cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, NULL, &ret);.
After clEnqueueNDRangeKernel I got error -54 (invalid workgroup size). To fix this, make global_item_size a multiple of local_item_size: size_t global_item_size = ((LIST_SIZE+local_item_size-1)/local_item_size)*local_item_size;. Since your intended global size (or rather size of allocated buffers) is not a multiple if the workgroup size, you should also put a guard clause if(i>=10) return; in the kernel; otherwise the kernel may write values in undefined memory space which can cause crashes.
Then, everything works as intended. Here is the entire fixed code:
#define CL_USE_DEPRECATED_OPENCL_1_2APIS
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <CL/cl.h>
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 10;
int* A = (int*)malloc(sizeof(int)*LIST_SIZE);
int* B = (int*)malloc(sizeof(int)*LIST_SIZE);
int* C = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
C[i] = 0;
}
// Load the kernel source code into the array source_str
size_t source_size;
const char* source_str =
"__kernel void vector_add(__global int *A, __global int *B, __global int *C) {\n"
" int i = get_global_id(0);\n"
" if(i>=10) return;\n"
" C[i] = A[i]+B[i];\n"
"}"
;
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
std::cout << "context " << ret << std::endl;
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
std::cout << "queue " << ret << std::endl;
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
std::cout << "buffer " << ret << std::endl;
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
std::cout << "write " << ret << std::endl;
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, NULL, &ret);
//cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t *)&source_size, &ret);
std::cout << "program " << ret << std::endl;
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
std::cout << "kernel " << ret << std::endl;
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&c_mem_obj);
std::cout << "args " << ret << std::endl;
// Execute the OpenCL kernel on the list
size_t local_item_size = 64; // Divide work items into groups of 64
size_t global_item_size = ((LIST_SIZE+local_item_size-1)/local_item_size)*local_item_size; // make global range a multiple of local range
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
std::cout << "run " << ret << std::endl;
clFinish(command_queue);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
std::cout << "read " << ret << std::endl;
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
Debugging that was quite a hassle. Do yourself a favor and use this OpenCL-Wrapper with C++. This eliminates all the code overhead and the countless possibilities for errors that come with it.

When or why does clEnqueueNDRangeKernel return Null as event?

Hi I am trying to exectue example code from the book 7 Concurreny Models in 7 weeks. The Author uses a macbook, while I am using a a dell xps with windows 10.
My Program crashes because the timing_event is still null after I call the function clEnqueueNDRangeKernel().
cl_event timing_event;
size_t work_units = NUM_ELEMENTS;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
NULL, 0, NULL,&timing_event);
The docs state that the follwing about the event parameter
event
Returns an event object that identifies this particular kernel
execution instance. Event objects are unique and can be used to
identify a particular kernel execution instance later on. If event is
NULL, no event will be created for this kernel execution instance and
therefore it will not be possible for the application to query or
queue a wait for this particular kernel execution instance.
Can sombody provide some explanation why this happens on my dell and not on the macbook of the author?
I found the solution. The problem did not come from clEnqueueNDRangeKernel()it happened earlier in clBuildProgram(program, 0, NULL, NULL, NULL, NULL);. I retrieved the build Information with clGetProgramBuildInfo(). The problem was that my multiply_arrays.cl file wasn't utf 8 encoded
For everyone who is new to opencl. Every opencl function returns a status integer which is mapped to an specific error code. If a function does return does not return an status code, it is possible to pass a pointer to the function. See the example functions I linked below. This is very helpful to debug your program.
Returns Status Code
Status Code by Reference
main.cpp
/***
* Excerpted from "Seven Concurrency Models in Seven Weeks",
* published by The Pragmatic Bookshelf.
* Copyrights apply to this code. It may not be used to create training material,
* courses, books, articles, and the like. Contact us if you are in doubt.
* We make no guarantees that this code is fit for any purpose.
* Visit http://www.pragmaticprogrammer.com/titles/pb7con for more book information.
***/
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <mach/mach_time.h>
#else
#include <CL/cl.h>
#include <Windows.h>
#endif
#include <stdio.h>
#include<iostream>
#include <inttypes.h>
#include <chrono>
#define NUM_ELEMENTS (100000)
char* read_source(const char* filename) {
FILE *h = fopen(filename, "r");
fseek(h, 0, SEEK_END);
size_t s = ftell(h);
rewind(h);
char* program = (char*)malloc(s + 1);
fread(program, sizeof(char), s, h);
program[s] = '\0';
fclose(h);
return program;
}
void random_fill(cl_float array[], size_t size) {
for (int i = 0; i < size; ++i)
array[i] = (cl_float)rand() / RAND_MAX;
}
int main() {
//Status for Errorhandling
cl_int status;
//Identify Platform
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
//Get Id of GPU
cl_device_id device;
cl_uint num_devices = 0;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);
// Create Context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
//Use context to create Command Queue
//Que enables us to send commands to the gpu device
cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);
//Load Kernel
char* source = read_source("multiply_arrays.cl");
cl_program program = clCreateProgramWithSource(context, 1,
(const char**)&source, NULL, &status);
free(source);
// Build Program
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
size_t len;
char *buffer;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
buffer = (char *) malloc(len);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
printf("%s\n", buffer);
//Create Kernel
cl_kernel kernel = clCreateKernel(program, "multiply_arrays", &status);
// Create Arrays with random Numbers
cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
random_fill(a, NUM_ELEMENTS);
random_fill(b, NUM_ELEMENTS);
//uint64_t startGPU = mach_absolute_time();
auto start = std::chrono::high_resolution_clock::now();
//Create Readonly input Buffers with value from a and b
cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
//Create Output buffer write Only
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
//set Kernel Arguments
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
cl_event timing_event;
size_t work_units = NUM_ELEMENTS;
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
NULL, 0, NULL,&timing_event);
cl_float results[NUM_ELEMENTS];
//Calculate Results and copy from output buffer to results
clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
results, 0, NULL, NULL);
//uint64_t endGPU = mach_absolute_time();
auto finish = std::chrono::high_resolution_clock::now();
//printf("Total (GPU): %lu ns\n\n", (unsigned long)(endGPU - startGPU));
std::cout << "Total(GPU) :"<< std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
cl_ulong starttime;
clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &starttime, NULL);
cl_ulong endtime;
clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endtime, NULL);
printf("Elapsed (GPU): %lu ns\n\n", (unsigned long)(endtime - starttime));
clReleaseEvent(timing_event);
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//uint64_t startCPU = mach_absolute_time();
start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_ELEMENTS; ++i)
results[i] = a[i] * b[i];
//uint64_t endCPU = mach_absolute_time();
finish = std::chrono::high_resolution_clock::now();
//printf("Elapsed (CPU): %lu ns\n\n", (unsigned long)(endCPU - startCPU));
std::cout << "Elapsed (CPU) :" << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
return 0;
}
multiply_arrays.cl
__kernel void multiply_arrays(__global const float* inputA,
__global const float* inputB,
__global float* output) {
int i = get_global_id(0);
output[i] = inputA[i] * inputB[i];
}
//ö

getting wrong output for Parallel Floyd Warshall Algorithm in OpenCL

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
/*#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else*/
#include <CL/cl.h>
//#endif
#define DATA_SIZE 16
using namespace std;
const char *ProgramSource =
"__kernel void floydWarshallPass(__global uint * pathDistanceBuffer,const unsigned int numNodes, __global uint * result, const unsigned int pass)\n"\
"{\n"\
"int xValue = get_global_id(0);\n"\
"int yValue = get_global_id(1);\n"\
"int k = pass;\n"\
"int oldWeight = pathDistanceBuffer[yValue * 4 + xValue];\n"\
"int tempWeight = (pathDistanceBuffer[yValue * 4 + k] + pathDistanceBuffer[k * 4 + xValue]);\n"\
"if (tempWeight < oldWeight)\n"\
"{\n"\
"pathDistanceBuffer[yValue * 4 + xValue] = tempWeight;\n"\
"result[yValue * 4 + xValue] = tempWeight;\n"\
"}\n"\
"}\n"\
"\n";
int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem inputA, inputB, output;
cl_int numNodes;
size_t global;
float inputDataA[16] = {0,2,3,4,5,0,7,8,9,10,0,12,13,14,15,0};
float results[16]={0};
int i,j;
numNodes = 16;
if(clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS)
{
printf("Unable to get platform id\n");
return 1;
}
// try to get a supported GPU device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
return 1;
}
// context properties list - must be terminated with 0
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
// create a context with the GPU device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
// create command queue using the context and device
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "floydWarshallPass", &err);
// create buffers for the input and ouput
inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
global=DATA_SIZE;
// enqueue the kernel command for execution
for(cl_uint sh=0; sh<16; sh++)
{
clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *)&sh);
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float)*DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
//clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes);
//clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
clFinish(command_queue);
}
clFinish(command_queue);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL);
// print the results
printf("output: ");
for(i=0;i<16; i++)
{
printf("%f ",results[i]);
}
// cleanup - release OpenCL resources
clReleaseMemObject(inputA);
//clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
I am getting -0.00000 output for every node.
P.S i am running my code on CL_DEVICE_TYPE_CPU because on GPU it is giving error that cannot get device id.
Please give some guidance on how to get correct output.
I think your question is a little too broad, you should have narrowed your code a little bit. I'll try to help you with some errors I found on your code, but I didn't debug or compile it, so those issues I describe here are only something for you to start looking at.
Why are you calling get_global_id with parameter 1 on your kernel?
Back at your clEnqueueNDRangeKernel you specified that your
work-items dimension is only one, so your get_global_id is querying
for a non-existing dimension. If you want to translate a single
dimension coordinate into two coordinate, you should use a
transformation such as below:
int id = get_global_id(0);
int x = id % size->width;
int y = id / size->height;
Pay attention when you use sizeof(float) to measure the size of the data types: they may be not of the same size inside the OpenCL implementation. Use sizeof(cl_float) instead.
Maybe you are not getting any GPU because you don't have the proper drivers installed on your computer. Go to the GPU vendor website and look for runtime drivers for OpenCL.
Take a look at those pages from the OpenCL specification
get_global_id
OpenCl data types

Resources