I have a problem. I am trying to learn OpenCL, so I've been trying to implement the FFT algorithm with OpenCL. I was trying to recreate this:
/*
 * In-place recursive radix-2 decimation-in-time FFT.
 *
 * data     - array of dataSize complex samples; overwritten with the transform.
 * dataSize - number of samples. The even/odd split only makes sense for a
 *            power of two; sizes of 0 or 1 are returned unchanged.
 *
 * The even- and odd-indexed samples are transformed separately and then
 * combined as data[i] = even[i mod N/2] + W(i) * odd[i mod N/2], where
 * W(i) = cmplx(-2*M_PI/dataSize*i). The two half-size transforms repeat with
 * period N/2, so they must be indexed modulo N/2; indexing them with i
 * directly would read past the end of both buffers for i >= N/2.
 *
 * If a scratch allocation fails the function returns with data at this
 * recursion level left untransformed.
 */
void FFT (cmplx* data, int dataSize){
    if(dataSize <= 1){
        return;
    }
    const int half = dataSize/2;
    cmplx* even = (cmplx*)malloc(half*sizeof(cmplx));
    cmplx* odd = (cmplx*)malloc(half*sizeof(cmplx));
    if(even == NULL || odd == NULL){
        free(even);
        free(odd);
        return;
    }
    for (int k = 0;k<half;k++){
        even[k] = data[2*k];
        odd[k] = data[2*k+1];
    }
    FFT(even,half);
    FFT(odd,half);
    for (int i = 0;i<dataSize;i++){
        const int k = i % half; // half-size transforms are periodic in N/2
        cmplx C = cmplx(-2*M_PI/dataSize*i);
        data[i].real = even[k].real + C.real*odd[k].real - C.imag*odd[k].imag;
        data[i].imag = even[k].imag + C.real*odd[k].imag + C.imag*odd[k].real;
    }
    // The scratch buffers are only needed for this recursion level.
    free(even);
    free(odd);
}
cmplx is just a class that holds two floats (the real and imaginary parts of a complex number) and has a constructor that creates a complex number from an angle using Euler's formula. Everything else is quite straightforward.
I am probably missing some small nuance. In my understanding, the iterations of a loop can be computed in independent threads, so a loop like this:
// Combine step: data[i] = even[i] + C * odd[i] (complex multiply-add),
// where C is built from the angle -2*M_PI/dataSize*i.
// NOTE(review): even[] and odd[] are read at every index below dataSize;
// confirm both arrays are at least dataSize elements long at this point.
for (int i = 0;i<dataSize;i++){
cmplx C = cmplx(-2*M_PI/dataSize*i);
data[i].real = even[i].real + C.real*odd[i].real - C.imag*odd[i].imag;
data[i].imag = even[i].imag + C.real*odd[i].imag + C.imag*odd[i].real;
}
With OpenCl code like this:
/*
 * FFTComplexSum: one work-item per output element.
 *
 * For idx = get_global_id(0) the kernel computes
 *   (real[idx], imag[idx]) = even[idx] + (cos(C[idx]) + i*sin(C[idx])) * odd[idx]
 * with the real and imaginary parts kept in separate arrays. Every buffer is
 * indexed with idx, so each one must hold at least as many floats as there
 * are work-items. The input and output buffers are expected to be distinct.
 */
__kernel void FFTComplexSum(__global float *evenReal,__global float *evenImag,
                            __global float *oddReal,__global float *oddImag,
                            __global float *real,__global float *imag,
                            __global float *C){
    const int idx = get_global_id(0);

    // Twiddle factor for this element.
    const float angle = C[idx];
    const float twRe = cos(angle);
    const float twIm = sin(angle);

    const float oddRe = oddReal[idx];
    const float oddIm = oddImag[idx];

    real[idx] = evenReal[idx] + twRe*oddRe - twIm*oddIm;
    imag[idx] = evenImag[idx] + twRe*oddIm + twIm*oddRe;
}
But if I run this:
.... // instantiating stuff like platform, device_id, kernel, program...
size_t buffer_size;
cl_mem evenReal_mem, evenImag_mem, oddReal_mem, oddImag_mem, real_mem, imag_mem, c_mem;
float evenReal[dataSize];
float evenImag[dataSize];
float tReal[dataSize];
float tImag[dataSize];
float oddReal[dataSize];
float oddImag[dataSize];
float C[dataSize];
for (int i = 0;i<dataSize;i+=2){
evenReal[i/2] = real[i];
evenImag[i/2] = imag[i];
oddReal[i/2] = real[i+1];
oddImag[i/2] = imag[i+1];
C[i] = -2*M_PI/dataSize*i;
C[i+1] = -2*M_PI/dataSize*(i+1);
}
doubleArray(evenReal,dataSize); // doubleArray function just makes array to loop
doubleArray(evenImag,dataSize);
doubleArray(oddReal,dataSize);
doubleArray(oddImag,dataSize);
buffer_size = sizeof(float) * dataSize;
evenReal_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, evenReal_mem, CL_TRUE, 0, buffer_size,(void*)evenReal, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
evenImag_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, evenImag_mem, CL_TRUE, 0, buffer_size,(void*)evenImag, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
oddReal_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, oddReal_mem, CL_TRUE, 0, buffer_size,(void*)oddReal, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
oddImag_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, oddImag_mem, CL_TRUE, 0, buffer_size,(void*)oddImag, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
real_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, real_mem, CL_TRUE, 0, buffer_size,(void*)real, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
imag_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, imag_mem, CL_TRUE, 0, buffer_size,(void*)imag, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
c_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float), NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, c_mem, CL_TRUE, 0, sizeof(float),(void*)C, 0, NULL, NULL);
assert(err == CL_SUCCESS); // Fail check
clFinish(cmd_queue);
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &evenReal_mem);
err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &evenImag_mem);
err = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &oddReal_mem);
err = clSetKernelArg(kernel[0], 3, sizeof(cl_mem), &oddImag_mem);
err = clSetKernelArg(kernel[0], 4, sizeof(cl_mem), &real_mem);
err = clSetKernelArg(kernel[0], 5, sizeof(cl_mem), &imag_mem);
err = clSetKernelArg(kernel[0], 6, sizeof(cl_mem), &c_mem);
assert(err == CL_SUCCESS); // Fail check
size_t global_work_size = dataSize;
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
assert(err == CL_SUCCESS);
clFinish(cmd_queue);
printf("test data:\n");
for (int i = 0;i<dataSize;i++){
float r,I;
r = evenReal[i] + cos(C[i])*oddReal[i] - sin(C[i])*oddImag[i];
I = evenImag[i] + cos(C[i])*oddImag[i] + sin(C[i])*oddReal[i];
printf("%f + %f\n",r,I);
}
err = clEnqueueReadBuffer(cmd_queue, real_mem, CL_TRUE, 0, buffer_size, tReal, 0, NULL, NULL);
assert(err == CL_SUCCESS);
clFinish(cmd_queue);
err = clEnqueueReadBuffer(cmd_queue, imag_mem, CL_TRUE, 0, buffer_size, tImag, 0, NULL, NULL);
assert(err == CL_SUCCESS);
clFinish(cmd_queue);
clReleaseMemObject(evenReal_mem);
clReleaseMemObject(evenImag_mem);
clReleaseMemObject(oddReal_mem);
clReleaseMemObject(oddImag_mem);
clReleaseMemObject(real_mem);
clReleaseMemObject(imag_mem);
clReleaseMemObject(c_mem);
prinf("data:");
for (int i = 0;i<dataSize;i++){
printf("%f + %f\n",tReal[i],tImag[i]);
}
It returns this:
test data:
1.000000 + 0.000000
0.000000 + 1.000000
-1.000000 + 0.000000
-0.000000 + -1.000000
data:
1.000000 + 0.000000
-1.000000 + 0.000000
1.000000 + 0.000000
-1.000000 + 0.000000
I am really confused about why I get wrong answers. Am I missing something really obvious?
Sorry for the long question.
c_mem is the problem.
The kernel accesses C as C[gid], but you created the buffer with a size of only sizeof(float). The whole array cannot fit in that 4-byte buffer, and you also write only 4 bytes to it. Multiplying both the creation size and the write size by dataSize should be enough.
That is why the real values come out as 1 and -1 while the imaginary values are zero (sin(0)). If you had been lucky, reading past the end of C would have returned garbage and made both the real and the imaginary results garbage, which would have pointed to the source of the error immediately.
Related
I'm learning opencl and for some reason the kernel does nothing:
#include <stdlib.h>
#include <stdio.h>
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
/* Status of the most recent OpenCL call; tested by the macros below. */
int err = 0;
/* Report the source line on which a check failed. */
#define PRINTERR() fprintf(stderr, "Error at line %d.\n", __LINE__)
/*
 * CHECKERR(x): when x is non-zero, report the line and return its number from
 * the enclosing function. Kept as a plain `if` block (not do/while(0))
 * because the call sites in this file do not put a semicolon after it.
 */
#define CHECKERR(x) if(x){PRINTERR();return __LINE__;}
/*
 * CHECKNOTERR(x): same, but fails when x is zero. The argument is
 * parenthesized so that an expression such as `a || b` is negated as a whole.
 */
#define CHECKNOTERR(x) if(!(x)){PRINTERR();return __LINE__;}
/*
 * OpenCL C source of the "square" kernel, built at run time.
 * Work-item 0 prints "test"; every work-item i below count writes
 * output[i] = input[i] * input[i].
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuations are needed.
 */
const char *KernelSource =
    "__kernel void square( \n"
    " __global float* input, \n"
    " __global float* output, \n"
    " const unsigned int count) \n"
    "{ \n"
    " int i = get_global_id(0); \n"
    " if(i == 0) printf(\"test\\n\"); \n"
    " if(i < count) \n"
    " output[i] = input[i] * input[i]; \n"
    "} \n";
#define DATA_SIZE 1024
int main(){
float data[DATA_SIZE];
float results[DATA_SIZE];
size_t global;
size_t local;
cl_platform_id platform_id;
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_mem input;
cl_mem output;
unsigned int i = 0;
unsigned int count = DATA_SIZE;
for(i = 0; i < count; ++i)
//data[i] = rand() / (float)RAND_MAX;
data[i] = 2.f;
int gpu = 1;
err = clGetPlatformIDs (1, &platform_id, NULL); CHECKERR(err)
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); CHECKERR(err)
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); CHECKERR(!context)
commands = clCreateCommandQueueWithProperties(context, device_id, NULL, &err); CHECKERR(err)
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, &err); CHECKERR(err)
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, &err); CHECKERR(err)
CHECKERR(!input || !output)
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); CHECKERR(err)
program = clCreateProgramWithSource(context, 1, &KernelSource, NULL, &err); CHECKERR(err)
err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); CHECKERR(err)
kernel = clCreateKernel(program, "square", &err); CHECKERR(err)
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
CHECKERR(err)
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); CHECKERR(err)
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); CHECKERR(err)
err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); CHECKERR(err)
clFlush(commands);
clFinish(commands);
unsigned int correct = 0;
for(i = 0; i < count; ++i)
printf("%f\n",results[i]);
printf("Computed '%d/%d' correct values!\n", correct, count);
// free
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
I want it to do something, but it doesn't.
I tried reading back the input buffer instead of the output and that works fine. The printf in the kernel prints nothing, and when I run it, clEnqueueReadBuffer returns only zeros. I have an AMD GPU, so I can't test it on the CPU.
I tried another example and it worked (the one here).
Help appreciated.
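global is never initialized (here it happens to be 0), so the kernel is enqueued with zero work-items and runs 0 times. Set global to the number of work-items (e.g. count) before calling clEnqueueNDRangeKernel.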
I’m using Ubuntu 20.04 to run my OpenCL program written in C, but whatever function I execute, the result is always zero or some other
illogical value; it’s as if the kernel is not running.
If anyone has OpenCL installed, please try this program. I want to know whether the problem is in the program or in my OpenCL installation.
This is the program I use.
#define CL_USE_DEPRECATED_OPENCL_1_2APIS
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 10;
int* A = (int*)malloc(sizeof(int)*LIST_SIZE);
int* B = (int*)malloc(sizeof(int)*LIST_SIZE);
int* C = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
C[i] = 0;
}
// Load the kernel source code into the array source_str
/*FILE *fp;
char *source_str;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
size_t source_size;
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );*/
//size_t source_size;
const char* source_str =
"__kernel void vector_add(__global int *A, __global int *B, __global int *C) {\n"
" int i = get_global_id(0);\n"
" if(i>=10) return;\n"
" C[i] = A[i]+B[i];\n"
"}"
;
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, NULL, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t local_item_size = 64; // Divide work items into groups of 64
size_t global_item_size = ((LIST_SIZE+local_item_size-1)/local_item_size)*local_item_size; // Process the entire lists
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
clFinish(command_queue);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
The code looks oddly familiar to me...
With my PC it works as intended. I suppose the issue is that on your system no OpenCL device is detected. With this code snippet, you can select a platform and device:
// Get platform and device information.
// Enumerates all platforms, then all devices of platform `select_platform`,
// picks device `select_device` and prints its name. Execution stops as soon
// as nothing usable is found, so that no invalid id is used afterwards.
const int select_platform = 0;
const int select_device = 0;
cl_platform_id* platform_ids = NULL;
cl_uint nr_platforms = 0;
cl_device_id* device_ids = NULL;
cl_uint nr_devices = 0;
cl_device_id device_id = NULL;

// First call only asks how many platforms exist.
cl_int ret = clGetPlatformIDs(0, NULL, &nr_platforms);
if(ret!=CL_SUCCESS || nr_platforms==0) {
    std::cerr << "no OpenCL platforms found" << std::endl;
    exit(1);
}
if(select_platform<0 || (cl_uint)select_platform>=nr_platforms) {
    std::cerr << "select_platform is out of range" << std::endl;
    exit(1);
}
platform_ids = (cl_platform_id*)malloc(nr_platforms * sizeof(*platform_ids));
if(platform_ids==NULL) {
    std::cerr << "out of memory" << std::endl;
    exit(1);
}
ret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms);
if(ret!=CL_SUCCESS) {
    std::cerr << "no OpenCL platforms found" << std::endl;
    exit(1);
}

// Same two-step query for the devices of the selected platform.
ret = clGetDeviceIDs(platform_ids[select_platform], CL_DEVICE_TYPE_ALL, 0, NULL, &nr_devices);
if(ret!=CL_SUCCESS || nr_devices==0) {
    std::cerr << "no OpenCL devices found on that platform" << std::endl;
    exit(1);
}
if(select_device<0 || (cl_uint)select_device>=nr_devices) {
    std::cerr << "select_device is out of range" << std::endl;
    exit(1);
}
device_ids = (cl_device_id*)malloc(nr_devices * sizeof(*device_ids));
if(device_ids==NULL) {
    std::cerr << "out of memory" << std::endl;
    exit(1);
}
ret = clGetDeviceIDs(platform_ids[select_platform], CL_DEVICE_TYPE_ALL, nr_devices, device_ids, &nr_devices);
if(ret!=CL_SUCCESS) {
    std::cerr << "no OpenCL devices found on that platform" << std::endl;
    exit(1);
}
device_id = device_ids[select_device];

// Only the id arrays are released; the selected device id is a handle that
// stays valid.
free(device_ids);
free(platform_ids);

char name[1024] = "";
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), name, NULL);
std::cout << "selected device: " << name << std::endl;
Check different values of select_platform and select_device. If there is no device found on any platform, make sure to have compatible hardware and the latest graphics drivers or CPU runtime installed.
To make OpenCL development much less painful, consider this OpenCL-Wrapper. This automatically selects the fastest available OpenCL device for you in 1 line of code:
Device device(select_device_with_most_flops());
Alternatively, you can automatically select the device with the most memory:
Device device(select_device_with_most_memory());
or a device with specified ID:
Device device(select_device_with_id(1));
I'm completely new to OpenCL programming but have been programming in (embedded) C for years. I'm running into a problem where clFinish is returning -36, which afaik indicates some failure of the kernel to run properly. However, I can't for the life of me figure out why. The kernel was tested to run properly in another build, and the differences between the two builds seem negligible. Below is a simplified version of the code without error checking. (All return values are otherwise checked and return CL_SUCCESS.)
#define INPUT_ARR_SIZE 128
int inArr[INPUT_ARR_SIZE];
int main()
{
cl_uint num = 1;
cl_int ret;
cl_platform_id platforms[1];
cl_device_id devices[1];
cl_context_properties ccp[3];
cl_context ctx;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem mem1, mem2;
cl_int result;
clGetPlatformIDs(num, platforms, &num);
printf("%i Platforms\n", num);
num = 1;
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num, devices, &num);
printf("Devices: %d\n", num);
ccp[0] = CL_CONTEXT_PLATFORM;
ccp[1] = (intptr_t)platforms[0];
ccp[2] = 0;
ctx = clCreateContext(ccp, 1, devices, NULL, NULL, &ret);
queue = clCreateCommandQueue(ctx, devices[0], (cl_ulong)0, &ret);
program = clCreateProgramWithSource(ctx, 1, &KernelSrcPtr, NULL, &ret);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
printf("Build success\n");
mem1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(inArr), NULL, &ret);
mem2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(inArr), NULL, &ret);
kernel = clCreateKernel(program, "hello", &ret);
ret = clEnqueueWriteBuffer(queue, mem1, CL_TRUE, 0, sizeof(inArr), inArr, 0, NULL, NULL);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem1);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem2);
cl_uint num2 = INPUT_ARR_SIZE;
ret = clEnqueueNDRangeKernel(queue, kernel, 1, 0, (size_t*)&num2, NULL, 0, NULL, NULL);
ret |= clFinish(queue);
}
And here is the kernel:
/*
 * OpenCL C source of the "hello" kernel.
 * For id = get_global_id(0) it stores ((mem1[id] * 2 + 5) * 6) / 3 in
 * mem2[id], using integer arithmetic.
 * Adjacent string literals are concatenated by the compiler, so no line
 * continuations are needed.
 */
const char KernelSrc[] =
    "__kernel void hello(__global const int *mem1, __global int *mem2)\n"
    "{\n"
    " size_t id = get_global_id(0);\n"
    " int intm = mem1[id] * 2;\n"
    " intm = intm + 5;\n"
    " intm = intm * 6;\n"
    " mem2[id] = intm / 3;\n"
    "}\n\n";
Also, an interesting thing happens: if I comment out the mem2[id] = intm / 3; line in the kernel, the program never finishes its execution.
Can anyone help me with what's going on?
I have written a kernel that should do nothing except add one to each component of a float3:
/*
 * GetCellIndex: adds 1 to the x, y and z components of the position of the
 * particle whose index equals this work-item's global id (dimension 0).
 * The kernel has no element-count argument, so it must be launched with
 * exactly as many work-items as there are particles.
 */
__kernel void GetCellIndex(__global Particle* particles) {
    // get_global_id returns size_t; keep that type instead of narrowing to int.
    const size_t gid = get_global_id(0);
    particles[gid].position.x += 1.0f;
    particles[gid].position.y += 1.0f;
    particles[gid].position.z += 1.0f;
}
with following struct (in the kernel)
typedef struct _Particle
{
cl_float3 position;
}Particle;
My problem is that when I write my array of particles to the GPU, every component is zero. Here is the necessary code:
(Particle*) particles = new Particle[200];
for (int i = 0; i < 200; i++)
{
particles[i].position.x = 5f;
}
cl_Particles = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Particle)*200, NULL, &err);
if (err != 0)
{
std::cout << "CreateBuffer does not work!" << std::endl;
system("Pause");
}
clEnqueueWriteBuffer(queue, cl_Particles, CL_TRUE, 0, sizeof(Particle) * 200, &particles, 0, NULL, NULL);
//init of kernel etc.
err = clSetKernelArg(kernel, 0, sizeof(Particle) * 200, &cl_Particles);
if (err != 0) {
std::cout << "Error: setKernelArg 0 does not work!" << std::endl;
system("Pause");
}
and this is my struct on the CPU:
// Host-side particle record; its only field is an OpenCL cl_float4.
typedef struct _Particle
{
cl_float4 position;
}Particle;
can someone help me with this problem?
(any clue is worth to discuss...)
Thanks
Your code snippet contains some typical C programming errors. First,
(Particle*) particles = new Particle[200];
does not declare a new variable particles as a pointer to Particle. It must be:
Particle *particles = new Particle[200];
Next, in your call of
clEnqueueWriteBuffer(queue, cl_Particles, CL_TRUE, 0, sizeof(Particle) * 200, &particles, 0, NULL, NULL);
you passed a pointer to the particles pointer as the 6th parameter (ptr). But, here you must pass a pointer to the region on the host containing the data. Thus, change &particles to particles:
clEnqueueWriteBuffer(queue, cl_Particles, CL_TRUE, 0, sizeof(Particle) * 200, particles, 0, NULL, NULL);
The setup of the kernel arguments is also wrong. Here, you must pass the OpenCL buffer created with clCreateBuffer. Thus, replace
err = clSetKernelArg(kernel, 0, sizeof(Particle) * 200, &cl_Particles);
with:
err = clSetKernelArg(kernel, 0, sizeof(cl_Particles), &cl_Particles);
As clCreateBuffer returns a value of type cl_mem, the expression sizeof(cl_Particles) evaluates to the same as sizeof(cl_mem). I recommend always calling sizeof() on the variable, so that you need to change the data type in only one place: the variable declaration.
On my platform, cl_float3 is the same as cl_float4. This might not be true on your platform (or on every platform), so you should always use the same type in the host code and in the kernel code. Also, in your kernel code you must use the OpenCL C type float4 instead of the host type cl_float4.
I hope I got the C calls right, because I actually tested this with the following C++ code. The snippet contains the fixed C calls as comments:
// Same upload written with the OpenCL C++ bindings; the equivalent C calls
// are kept as comments.
Particle *particles = new Particle[200];
for (int i = 0; i < 200; i++)
{
    //particles[i].position.x = 5.0f;
    // s[0] is the x component (array form used due to VC++ compiler). The
    // value is the float 5.0f; the hex literal 0x5f would store 95.
    particles[i].position.s[0] = 5.0f;
}
//cl_mem cl_Particles = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Particle)*200, NULL, &err); // FIXED
cl::Buffer cl_Particles(context, CL_MEM_READ_WRITE, sizeof(Particle)*200, NULL, &err);
checkErr(err, "Buffer::Buffer()");
//err = clEnqueueWriteBuffer(queue, cl_Particles, CL_TRUE, 0, sizeof(Particle) * 200, particles, 0, NULL, NULL); // FIXED
// Store the returned status; otherwise checkErr would test the stale value
// left over from the buffer creation.
err = queue.enqueueWriteBuffer(cl_Particles, CL_TRUE, 0, sizeof(Particle) * 200, particles, NULL, NULL);
checkErr(err, "CommandQueue::enqueueWriteBuffer()");
//init of kernel
cl::Kernel kernel(program, "GetCellIndex", &err);
checkErr(err, "Kernel::Kernel()");
//err = clSetKernelArg(kernel, 0, sizeof(cl_Particles), &cl_Particles); // FIXED
// The typed overload passes the underlying cl_mem handle of the buffer.
err = kernel.setArg(0, cl_Particles);
checkErr(err, "Kernel::setArg()");
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
/*#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else*/
#include <CL/cl.h>
//#endif
#define DATA_SIZE 16
using namespace std;
const char *ProgramSource =
"__kernel void floydWarshallPass(__global uint * pathDistanceBuffer,const unsigned int numNodes, __global uint * result, const unsigned int pass)\n"\
"{\n"\
"int xValue = get_global_id(0);\n"\
"int yValue = get_global_id(1);\n"\
"int k = pass;\n"\
"int oldWeight = pathDistanceBuffer[yValue * 4 + xValue];\n"\
"int tempWeight = (pathDistanceBuffer[yValue * 4 + k] + pathDistanceBuffer[k * 4 + xValue]);\n"\
"if (tempWeight < oldWeight)\n"\
"{\n"\
"pathDistanceBuffer[yValue * 4 + xValue] = tempWeight;\n"\
"result[yValue * 4 + xValue] = tempWeight;\n"\
"}\n"\
"}\n"\
"\n";
int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
cl_mem inputA, inputB, output;
cl_int numNodes;
size_t global;
float inputDataA[16] = {0,2,3,4,5,0,7,8,9,10,0,12,13,14,15,0};
float results[16]={0};
int i,j;
numNodes = 16;
if(clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS)
{
printf("Unable to get platform id\n");
return 1;
}
// try to get a supported GPU device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS)
{
printf("Unable to get device_id\n");
return 1;
}
// context properties list - must be terminated with 0
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
// create a context with the GPU device
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err);
// create command queue using the context and device
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "floydWarshallPass", &err);
// create buffers for the input and ouput
inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
// set the argument list for the kernel command
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
global=DATA_SIZE;
// enqueue the kernel command for execution
for(cl_uint sh=0; sh<16; sh++)
{
clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *)&sh);
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float)*DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
//clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes);
//clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
clFinish(command_queue);
}
clFinish(command_queue);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL);
// print the results
printf("output: ");
for(i=0;i<16; i++)
{
printf("%f ",results[i]);
}
// cleanup - release OpenCL resources
clReleaseMemObject(inputA);
//clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
I am getting -0.00000 output for every node.
P.S. I am running my code on CL_DEVICE_TYPE_CPU because on the GPU it fails with an error saying it cannot get the device id.
Please give some guidance on how to get the correct output.
I think your question is a little too broad; you should have narrowed your code down a bit. I'll try to help you with some errors I found in your code, but I didn't debug or compile it, so the issues I describe here are only something for you to start looking at.
Why are you calling get_global_id with parameter 1 in your kernel?
Back at your clEnqueueNDRangeKernel call you specified that your
work-item space has only one dimension, so get_global_id(1) is querying
a non-existent dimension. If you want to translate a one-dimensional
index into two coordinates, you should use a
transformation such as the one below:
// Map the 1-D global id to (x, y) in a row-major grid that is size->width
// elements wide: the column is the remainder and the row is the quotient of
// the same divisor, the row width.
int id = get_global_id(0);
int x = id % size->width;
int y = id / size->width;
Pay attention when you use sizeof(float) to measure the size of the data types: the host type may not have the same size as the type inside the OpenCL implementation. Use sizeof(cl_float) instead.
Maybe you are not getting any GPU because you don't have the proper drivers installed on your computer. Go to the GPU vendor's website and look for the OpenCL runtime drivers.
Take a look at those pages from the OpenCL specification
get_global_id
OpenCl data types