Setting arguments in a kernel in OpenCL causes error - c

I am a beginner in OpenCL and thus writing a simple program to double the elements of an array.
The kernel code is:-
__kernel void dataParallel(__global int* A, __global int* B)
{
int base = get_local_id(0);
B[base]=A[base]+A[base];
}
The local_work_size is 32 as I am doubling 32 elements.
In my program I have declared an integer array which holds the elements to be doubled.
int *A;
A=(int*)malloc(sizeof(int)*64);
for (i=0; i < 32; i++) { A[i] = i; }
platforms[i] stores the platform id, devices[j] stores the corresponding device id. Their types:
cl_platform_id* platforms;
cl_device_id* devices;
Creating context
cl_context context=clCreateContext(NULL,1,&devices[j],NULL,NULL,NULL);
Next comes the command queue
cl_command_queue cmdqueue=clCreateCommandQueue(context,devices[j],0,&err);
Next I created 2 memory buffers, one to hold the input data and the other to hold the result.
cl_mem Abuffer,Bbuffer;
Abuffer=clCreateBuffer(context, CL_MEM_READ_WRITE ,32*sizeof(int),NULL,&err);
Bbuffer=clCreateBuffer(context, CL_MEM_READ_WRITE ,32*sizeof(int),NULL,&err);
Then I copied the data of array A to Abuffer
ret=clEnqueueWriteBuffer(cmdqueue, Abuffer, CL_TRUE, 0, 32*sizeof(int), A, 0, NULL, NULL);
printf("%d",ret);//output is 0 thus data written successfully into the buffer
The kernel code was then read into a character string source_str and the program was created.
kernelprgrm=clCreateProgramWithSource(context,1,(const char **)&source_str,(const size_t *)&source_size,&err);
if(!err)
{
printf("\nKernel program created successfully\n");
}//Outputs -Kernel program created successfully
I then built the program using:
ret=clBuildProgram(kernelprgrm,1,&devices[j],NULL,NULL,NULL);//returns CL_SUCCESS
Getting buildinfo next
ret=clGetProgramBuildInfo(kernelprgrm,devices[j], CL_PROGRAM_BUILD_STATUS ,0,NULL,&size);//Returns success
Creating kernel
kernel = clCreateKernel(kernelprgrm, "dataParallel", &ret);
printf("\nReturn kernel program=%d",ret);
if(!ret)
{
printf("\nProgram created successfully!\n");
}
//Outputs -Program created successfully!
Now comes the devil:-
ret=clSetKernelArg(kernel,0,sizeof(cl_mem),(void *) Abuffer);
printf("\nKernel argument 1 ret=%d",ret);
ret=clSetKernelArg(kernel,1,sizeof(cl_mem),(void *) Bbuffer);
printf("\nKernel argument 2 ret=%d",ret);
Both return -38, meaning CL_INVALID_MEM_OBJECT.
P.S.: As pointed out in the answer below (use &Abuffer instead of Abuffer as the argument), after making the necessary changes both calls return 0.
size_t global_item_size = 32;
size_t local_item_size = 32;
Also ret = clEnqueueNDRangeKernel(cmdqueue, kernel, 1, NULL,&global_item_size, &local_item_size, 0, NULL, NULL); returns 0.
Trying to get the result
ret = clEnqueueReadBuffer(cmdqueue, Bbuffer, CL_TRUE, 0, 32*sizeof(int), B, 0, NULL, NULL);
printf("\nB:-\n");
for (t=0; t < 32; t++) {
printf("%d\t ", B[t]);
}
This returns build status 0, but the core gets dumped on my AMD GPU (running the AMD Accelerated Parallel Processing platform) and on my NVIDIA GPU, whereas it works perfectly fine if the selected device is the CPU using the Intel(R) OpenCL platform.
Also I tried getting the build log using:
cl_build_status *status=(cl_build_status *)malloc(sizeof(cl_build_status )*size);
clGetProgramBuildInfo(kernelprgrm,devices[j], CL_PROGRAM_BUILD_STATUS ,size,status,NULL);
printf("\nBuild status=%d\n",*status);
//Getting build info if not successful
clGetProgramBuildInfo(kernelprgrm,devices[j], CL_PROGRAM_BUILD_LOG ,0,NULL,&size);
char *buildlog=(char*)malloc(size);
clGetProgramBuildInfo(kernelprgrm,devices[j], CL_PROGRAM_BUILD_LOG ,size,buildlog,NULL);
printf("\n!!!!!!!!!!!!!!!!!!!!!Program ended!!!!!!!!!!!\n");
printf("\n\nBuildlog: %s\n\n",buildlog);
But it returns Buildlog: Compilation started
Compilation done
Linking started
Linking done
Device build started
Device build done
Kernel <dataParallel> was successfully vectorized (4)
Done.

Here's what the OpenCL 1.2 spec has to say about setting buffers as kernel arguments:
If the argument is a memory object (buffer, image or image array), the arg_value entry will be a pointer to the appropriate buffer, image or image array object.
So, you need to pass a pointer to the cl_mem objects:
ret=clSetKernelArg(kernel,0,sizeof(cl_mem),(void *) &Abuffer);
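And likewise for the second argument. A minimal sketch of both corrected calls, reusing the question's error reporting:
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Abuffer);
printf("\nKernel argument 1 ret=%d", ret); // now prints 0 (CL_SUCCESS)
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bbuffer);
printf("\nKernel argument 2 ret=%d", ret);
The argument is a pointer to the cl_mem handle, not the handle's value and not the host pointer A.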

Why are you using clEnqueueTask? I think you should use clEnqueueNDRangeKernel if you have parallel work to do. Also, just set the global work size and pass NULL for the local work group size; a 32x32 work group (1024 work-items) is larger than some devices can do.
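A sketch of that suggestion, reusing the names from the question (passing NULL lets the runtime pick a suitable local size):
size_t global_item_size = 32;
ret = clEnqueueNDRangeKernel(cmdqueue, kernel, 1, NULL, &global_item_size, NULL, 0, NULL, NULL);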

Related

Can't get the right memory map using uefi's GetMemoryMap function

I'm writing a little boot loader in UEFI environment for my custom os using POSIX-UEFI.
When I try to get the memory map using the GetMemoryMap boot service to calculate the amount of memory, it not only returns a different number of memory descriptors each time the program is executed, but the descriptor table itself also seems to be corrupted or something...
Here's the code to get the memory map:
efi_memory_descriptor_t *map = NULL;
uintn_t mapsize, mapkey, descriptorsize;
uint32_t descriptorversion;
uint64_t memory_size = 0;
ST->BootServices->GetMemoryMap(&mapsize, map, &mapkey, &descriptorsize, &descriptorversion);
ST->BootServices->AllocatePool(EfiLoaderData, mapsize, (void **) &map);
ST->BootServices->GetMemoryMap(&mapsize, map, &mapkey, &descriptorsize, &descriptorversion);
printf("Memory map size: %d descriptors\n", mapsize / descriptorsize);
for (int i = 0; i < mapsize / descriptorsize; i++) {
efi_memory_descriptor_t *desc = (efi_memory_descriptor_t *)(map + i * descriptorsize);
memory_size += desc->NumberOfPages * EFI_PAGE_SIZE;
}
printf("Memory size: %d bytes\n", memory_size);
I also tried using sizeof(efi_memory_descriptor_t) instead of descriptorsize to calculate the number of entries, but nothing changed.
Here's a screenshot of the program's output:
(Screenshot: VirtualBox with an EFI shell, running the code above three times.)
I use a VirtualBox (v7.0.4) virtual machine with 64 MB of RAM, no storage device, and a USB drive attached to load the program from.
map is a pointer to efi_memory_descriptor_t, so map + i * descriptorsize advances the pointer by i * descriptorsize elements, i.e. it scales by both descriptorsize and sizeof(efi_memory_descriptor_t).
I suggest using (uint8_t *)map + i * descriptorsize.
Note that descriptorsize may not be equal to sizeof(efi_memory_descriptor_t), so it is correct to use descriptorsize.
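A corrected version of the loop from the question (same variable names; only the pointer arithmetic changes):
for (uintn_t i = 0; i < mapsize / descriptorsize; i++) {
    efi_memory_descriptor_t *desc =
        (efi_memory_descriptor_t *)((uint8_t *)map + i * descriptorsize);
    memory_size += desc->NumberOfPages * EFI_PAGE_SIZE;
}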

Program with while loop causes stack overflow, but only in x86 and only when injected into another process

I have an unfortunately convoluted problem that I am hopeful someone might be able to help me with.
I have written a reasonably large program that I have converted into position independent code (see here for reference: https://bruteratel.com/research/feature-update/2021/01/30/OBJEXEC/). Basically this just means that the resulting exe (compiled using mingw) contains data only in the .text section, and thus can be injected into and run from an arbitrary place in memory. I have successfully ported the program to this format and can compile it for both x86 and x64.
I created two "helper" exes to run the PIC program: a local injector and a remote injector. The local injector runs the program by calling VirtualAlloc, memcpy, and CreateThread. The remote injector runs the program by calling CreateProcess (suspended), VirtualAllocEx, WriteProcessMemory, QueueAPCThread, and ResumeThread (the last two APIs being called on pi.hThread, which was returned from CreateProcess).
I am experiencing inconsistent results in the program depending on the architecture and method of execution.
x64 local: works
x64 inject: works
x86 local: works
x86 inject: fails; stack overflow
I have determined that my program is crashing in a while loop in a particular function. This function is used to format data contained in buffers (heap allocated) that are passed in as function args. The raw data buffer (IOBuf) contains a ~325k long string containing Base64 characters with spaces randomly placed throughout. The while loop in question iterates over this buffer and copies non-space characters to a second buffer (IntermedBuf), with the end goal being that IntermedBuf contains the full Base64 string in IOBuf minus the random spaces.
A few notes about the following code snippet:
Because the code is written to be position independent, all APIs must be manually resolved, which is why you see things like (SPRINTF)(Apis.sprintfFunc). I have resolved the address of each API in its respective DLL and created typedefs for each API that is called. While odd, this is not in itself causing the issue, as the code works fine in 3 of the 4 situations.
Because this program fails when injected, I cannot use print statements to debug, so I have added calls to MessageBoxA at certain places to determine the contents of variables and/or whether execution reaches that part of the code.
The relevant code snippet is as follows:
char inter[] = {'I','n','t',' ',0};
char tools[100] = {0};
if (((STRCMP)Apis.strcmpFunc)(IntermedBuf, StringVars->b64Null) != 0)
{
int i = 0, j = 0, strLen = 0, lenIOBuf = ((STRLEN)Apis.strlenFunc)(IOBuf);
((SPRINTF)Apis.sprintfFunc)(tools, StringVars->poi, IOBuf);
((MESSAGEBOXA)Apis.MessageBoxAFunc)(NULL, tools, NULL, NULL);
((MEMSET)Apis.memsetFunc)(tools, 0, 100 * sizeof(char));
((SPRINTF)Apis.sprintfFunc)(tools, StringVars->poi, IntermedBuf);
((MESSAGEBOXA)Apis.MessageBoxAFunc)(NULL, tools, NULL, NULL);
char* locSpace;
while (j < lenIOBuf)
{
locSpace = ((STRSTR)Apis.strstrFunc)(IOBuf + j, StringVars->space);
if (locSpace == 0)
locSpace = IOBuf + lenIOBuf;
strLen = locSpace - IOBuf - j;
((MEMCPY)Apis.memcpyFunc)(IntermedBuf + i, IOBuf + j, strLen);
i += strLen, j += strLen + 1;
}
((MESSAGEBOXA)Apis.MessageBoxAFunc)(NULL, StringVars->here, NULL, NULL);
((MEMSET)Apis.memsetFunc)(IOBuf, 0, BUFFSIZE * sizeof(char));
The first two MessageBoxA calls successfully execute, each containing the address of IOBuf and IntermedBuf respectively. The last call to MessageBoxA, after the while loop, never comes, meaning the program is crashing in the while loop as it copies data from IOBuf to IntermedBuf.
I ran remote.exe which spawned a new WerFault.exe (I have tried with calc, notepad, several other processes with the same result) containing the PIC program, and stuck it into Windbg to try and get a better sense of what was happening. I found that after receiving the first two message boxes and clicking through them, WerFault crashes with a stack overflow caused by a call to strstr:
Examining the contents of the stack at crash time shows this:
Looking at the contents of IntermedBuf (which is one of the arguments passed to the strstr call) I can see that the program IS copying data from IOBuf to IntermedBuf and removing spaces as intended, however the program crashes after copying ~80k.
IOBuf (raw data):
IntermedBuf (after removing spaces):
My preliminary understanding of what is happening here is that strstr (and potentially memcpy) are pushing data onto the stack with each call, and given the length of the loop (lenIOBuf is ~325k, with spaces occurring randomly every 2-11 characters throughout), the stack is overflowing before the while loop finishes and the stack unwinds. However, this doesn't explain why this succeeds in x64 in both cases, and in x86 when the PIC program is running in a user-made program as opposed to injected into a legitimate process.
I ran the x86 PIC program in the local injector, where it succeeds, and also attached Windbg to it in order to examine what happens differently there. The stack similarly contains the same sort of pattern of characters as seen in the above screenshot; however, later in the loop (because again the program succeeds), the stack appears to... jump? I examined the contents of the stack early in the while loop (having set a bp on strstr) and saw that it contains much the same pattern seen in the stack in the remote injector session:
I also added another MessageBox, this time inside the while loop, set to pop when j > lenIOBuf - 500, in order to catch the program as it neared completion of the loop.
char* locSpace;
while (j < lenIOBuf)
{
if (j > lenIOBuf - 500)
{
((MEMSET)Apis.memsetFunc)(tools, 0, 100 * sizeof(char));
((SPRINTF)Apis.sprintfFunc)(tools, StringVars->poi, IntermedBuf);
((MESSAGEBOXA)Apis.MessageBoxAFunc)(NULL, tools, NULL, NULL);
}
locSpace = ((STRSTR)Apis.strstrFunc)(IOBuf + j, StringVars->space);
if (locSpace == 0)
locSpace = IOBuf + lenIOBuf;
strLen = locSpace - IOBuf - j;
((MEMCPY)Apis.memcpyFunc)(IntermedBuf + i, IOBuf + j, strLen);
i += strLen, j += strLen + 1;
}
When this MessageBox popped, I paused execution and found that ESP was now 649fd80; previously it was around 13beb24.
So it appears that the stack relocated, or the local injector added more memory to the stack, or something (I am embarrassingly naive about this stuff). Looking at the "original" stack location at this stage of execution shows that the data that was there previously is still there when the loop nears completion:
So, bottom line: this code, which runs successfully by all accounts in x64 local/remote and x86 local, is crashing when run in another process in x86. It appears that in the local injector case the stack fills in a similar fashion as in the remote injector where it crashes, but the local injector is relocating the stack or adding more stack space or something, which isn't happening in the remote injector. Does anyone have any ideas why, or more importantly, how I could alter the code to remove spaces from a large, arbitrary buffer in a different way that might not run into this overflow?
Thanks for any help
typedef void*(WINAPI* MEMCPY)(void * destination, const void * source, size_t num);
typedef char*(WINAPI* STRSTR)(const char *haystack, const char *needle);
These declarations are wrong. Both of these APIs use the __cdecl calling convention, meaning the caller must clean up the stack (add esp, 4*param_count) after the call. But because you declared them as __stdcall (== WINAPI), the compiler does not generate the add esp, 4*param_count instruction, so every call leaves its pushed parameters on the stack; that unbalanced stack is what eventually overflows. (This also explains why x64 is unaffected: the x64 ABI has a single calling convention, with the first four arguments passed in registers and the caller always owning the stack cleanup.)
You need to use
typedef void * (__cdecl * MEMCPY)(void * _Dst, const void * _Src, _In_ size_t _MaxCount);
typedef char* (__cdecl* STRSTR)(_In_z_ char* const _String, _In_z_ char const* const _SubString);
and so on for the rest of the resolved CRT functions.
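A minimal sketch of how the corrected typedefs slot into the question's code; the call sites stay exactly as posted, only the calling convention in the typedef changes (the SAL annotations above are optional):
typedef void * (__cdecl *MEMCPY)(void *dst, const void *src, size_t count);
typedef char * (__cdecl *STRSTR)(const char *haystack, const char *needle);
/* call sites as in the question; the compiler now emits the stack cleanup */
locSpace = ((STRSTR)Apis.strstrFunc)(IOBuf + j, StringVars->space);
((MEMCPY)Apis.memcpyFunc)(IntermedBuf + i, IOBuf + j, strLen);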
I'm familiar with what you are doing, and frankly I moved on to compiling the required functions (memcpy, etc.) into the program instead of manually looking them up and making external calls.
For example:
inline void* _memcpy(void* dest, const void* src, size_t count)
{
char *char_dest = (char *)dest;
char *char_src = (char *)src;
if ((char_dest <= char_src) || (char_dest >= (char_src+count)))
{
/* non-overlapping buffers */
while(count > 0)
{
*char_dest = *char_src;
char_dest++;
char_src++;
count--;
}
}
else
{
/* overlapping buffers */
char_dest = (char *)dest + count - 1;
char_src = (char *)src + count - 1;
while(count > 0)
{
*char_dest = *char_src;
char_dest--;
char_src--;
count--;
}
}
return dest;
}
inline char * _strstr(const char *s, const char *find)
{
char c, sc;
size_t len;
if ((c = *find++) != 0)
{
len = strlen(find);
do {
do {
if ((sc = *s++) == 0)
return 0;
} while (sc != c);
} while (strncmp(s, find, len) != 0);
s--;
}
return (char *)((size_t)s);
}
Credit for the above code goes to ReactOS. You can look up the rest of the required functions (strlen, etc.) the same way.
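As an aside, the space-stripping loop itself doesn't need strstr or memcpy at all; a single forward pass does the same job with no resolved calls, which also sidesteps the calling-convention problem entirely. A minimal sketch using the buffer names from the question:
/* copy non-space characters from IOBuf to IntermedBuf in one pass */
int i = 0, j;
for (j = 0; j < lenIOBuf; j++)
{
    if (IOBuf[j] != ' ')
        IntermedBuf[i++] = IOBuf[j];
}
IntermedBuf[i] = 0; /* terminate the compacted string */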

Why is clEnqueueMapBuffer returning random data?

I am trying to learn about clEnqueueMapBuffer in OpenCL by writing a kernel which finds the square of values in an input buffer, but only returns two items at a time in the output buffer using clEnqueueMapBuffer. As I understand it, this function returns a pointer in the host's memory which points to the buffer memory on the device. Then clEnqueueUnmapMemObject must unmap this buffer to allow the kernels to continue their computations. Now, when I call clEnqueueMapBuffer, it is returning random data.
Here is my kernel
__kernel void testStream(
__global int *input_vector,
__global int *output_vector,
__global int *mem_flag) // informs the host when the workload is finished
{
mem_flag[0] = 1;
}
and my source
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <CL/opencl.h>
#include "utils.h"
int main(void)
{
unsigned int n = 24;
int BUFF_SIZE = 2;
// Input and output vectors
int num_bytes = sizeof(int) * n;
int *output_buffer = (int *) malloc(num_bytes);
int output_buffer_offset = 0;
int *mapped_data = NULL;
// use mapped_flag for determining if the job on the device is finished
int *mapped_flag = NULL;
int *host_in = (int *) malloc(num_bytes);
int *host_out = (int *) malloc(num_bytes);
// Declare cl variables
cl_mem device_in;
cl_mem device_out;
cl_mem device_out_flag;
// Declare cl boilerplate
cl_platform_id platform = NULL;
cl_device_id device = NULL;
cl_command_queue queue = NULL;
cl_context context = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
// Located in utils.c -- the source is irrelevant here
char *kernel_source = read_kernel("kernels/test.cl");
// Initialize host_in
int i;
for (i = 0; i < n; i++) {
host_in[i] = i + 1;
}
// Set up opencl
cl_int error;
error = clGetPlatformIDs(1, &platform, NULL);
printf("clGetPlatformIDs: %d\n", (int) error);
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
printf("clGetDeviceIDs: %d\n", (int) error);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
printf("clCreateContext: %d\n", (int) error);
queue = clCreateCommandQueue(context, device, 0, &error);
printf("clCreateCommandQueue: %d\n", (int) error);
program = clCreateProgramWithSource(context, 1,
(const char**)&kernel_source, NULL, &error);
printf("clCreateProgramWithSource: %d\n", (int) error);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
kernel = clCreateKernel(program, "testStream", &error);
printf("clCreateKernel: %d\n", (int) error);
// Create the buffers
device_in = clCreateBuffer(context, CL_MEM_READ_ONLY,
num_bytes, NULL, NULL);
device_out = clCreateBuffer(context,
CL_MEM_WRITE_ONLY,
sizeof(int) * BUFF_SIZE, NULL, NULL);
device_out_flag = clCreateBuffer(context,
CL_MEM_WRITE_ONLY,
sizeof(int) * 2, NULL, NULL);
// Write the input buffer
clEnqueueWriteBuffer(
queue, device_in, CL_FALSE, 0, num_bytes,
host_in, 0, NULL, NULL);
// Set the kernel arguments
error = clSetKernelArg(kernel, 0, sizeof(cl_mem), &device_in);
error = clSetKernelArg(kernel, 1, sizeof(cl_mem), &device_out);
error = clSetKernelArg(kernel, 2, sizeof(cl_mem), &device_out_flag);
// Execute the kernel over the entire range of data
clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
(const size_t *) &n, NULL, 0, NULL, NULL);
// Map and unmap until the flag is set to true
int break_flag = 0;
while(1) {
// Map the buffers
mapped_data = (int *) clEnqueueMapBuffer(
queue, device_out, CL_TRUE, CL_MAP_READ, 0,
sizeof(int) * BUFF_SIZE, 0, NULL, NULL, &error);
mapped_flag = (int *) clEnqueueMapBuffer(
queue, device_out_flag, CL_TRUE, CL_MAP_READ, 0,
sizeof(int) , 0, NULL,NULL, &error);
// Extract the data out of the buffer
printf("mapped_flag[0] = %d\n", mapped_flag[0]);
// Set the break_flag
break_flag = mapped_flag[0];
// Unmap the buffers
error = clEnqueueUnmapMemObject(queue, device_out, mapped_data, 0,
NULL, NULL);
error = clEnqueueUnmapMemObject(queue, device_out_flag, mapped_flag,
0, NULL, NULL);
if (break_flag == 1) {break;}
usleep(1000*1000);
}
return 0;
}
When I run the program, I get output similar to
clGetPlatformIDs: 0
clGetDeviceIDs: 0
clCreateContext: 0
clCreateCommandQueue: 0
clCreateProgramWithSource: 0
clCreateKernel: 0
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
mapped_flag[0] = 45366144
Why is this happening?
Edit
I am running this code on an HP dm1z with fedora 19 64-bit on the kernel 3.13.7-100.fc19.x86_64. Here is the output from clinfo
Number of platforms: 1
Platform Profile: FULL_PROFILE
Platform Version: OpenCL 1.2 AMD-APP (1214.3)
Platform Name: AMD Accelerated Parallel Processing
Platform Vendor: Advanced Micro Devices, Inc.
Platform Extensions: cl_khr_icd cl_amd_event_callback cl_amd_offline_devices
Platform Name: AMD Accelerated Parallel Processing
Number of devices: 2
Device Type: CL_DEVICE_TYPE_GPU
Device ID: 4098
Board name: AMD Radeon HD 6310 Graphics
Device Topology: PCI[ B#0, D#1, F#0 ]
Max compute units: 2
Max work items dimensions: 3
Max work items[0]: 256
Max work items[1]: 256
Max work items[2]: 256
Max work group size: 256
Preferred vector width char: 16
Preferred vector width short: 8
Preferred vector width int: 4
Preferred vector width long: 2
Preferred vector width float: 4
Preferred vector width double: 0
Native vector width char: 16
Native vector width short: 8
Native vector width int: 4
Native vector width long: 2
Native vector width float: 4
Native vector width double: 0
Max clock frequency: 492Mhz
Address bits: 32
Max memory allocation: 134217728
Image support: Yes
Max number of images read arguments: 128
Max number of images write arguments: 8
Max image 2D width: 16384
Max image 2D height: 16384
Max image 3D width: 2048
Max image 3D height: 2048
Max image 3D depth: 2048
Max samplers within kernel: 16
Max size of kernel argument: 1024
Alignment (bits) of base address: 2048
Minimum alignment (bytes) for any datatype: 128
Single precision floating point capability
Denorms: No
Quiet NaNs: Yes
Round to nearest even: Yes
Round to zero: Yes
Round to +ve and infinity: Yes
IEEE754-2008 fused multiply-add: Yes
Cache type: None
Cache line size: 0
Cache size: 0
Global memory size: 201326592
Constant buffer size: 65536
Max number of constant args: 8
Local memory type: Scratchpad
Local memory size: 32768
Kernel Preferred work group size multiple: 32
Error correction support: 0
Unified memory for Host and Device: 1
Profiling timer resolution: 1
Device endianess: Little
Available: Yes
Compiler available: Yes
Execution capabilities:
Execute OpenCL kernels: Yes
Execute native function: No
Queue properties:
Out-of-Order: No
Profiling : Yes
Platform ID: 0x00007fd434852fc0
Name: Loveland
Vendor: Advanced Micro Devices, Inc.
Device OpenCL C version: OpenCL C 1.2
Driver version: 1214.3
Profile: FULL_PROFILE
Version: OpenCL 1.2 AMD-APP (1214.3)
Extensions: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_gl_sharing cl_ext_atomic_counters_32 cl_amd_device_attribute_query cl_amd_vec3 cl_amd_printf cl_amd_media_ops cl_amd_media_ops2 cl_amd_popcnt cl_amd_image2d_from_buffer_read_only
Also, it may be worth noting that when I began playing with OpenCL, I ran a test program to calculate an inner product, but that gave weird results. Initially I thought it was an error in the program and forgot about it, but is it possible that the OpenCL implementation is faulty? If it helps, the OpenGL implementation has multiple errors, causing blocks of random data to show up on my desktop background, but this could also be a Linux problem.
You are passing NULL as the global work size to your clEnqueueNDRangeKernel call:
// Execute the kernel over the entire range of data
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, NULL, NULL, 0, NULL, NULL);
If you were checking the error code returned by this call (which you always should), you would get the error code corresponding to CL_INVALID_GLOBAL_WORK_SIZE back. You always need to specify a global work size, so your call should look something like this:
// Execute the kernel over the entire range of data
size_t global[1] = {1};
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
// check error == CL_SUCCESS!
Your calls to map and unmap buffers are fine; I've tested this code with the above fix and it works for me.
Your updated code to fix the above problem looks like this:
unsigned int n = 24;
...
// Execute the kernel over the entire range of data
clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
(const size_t *) &n, NULL, 0, NULL, NULL);
This is not a safe way of passing the global work size parameter to the kernel. As an example, the unsigned int n variable might occupy 32 bits, whereas a size_t could be 64 bits. This means that when you pass the address of n and cast to a const size_t*, the implementation will read a 64-bit value, which will encompass the 32 bits of n plus 32 other bits that have some arbitrary value. You should either assign n to a size_t variable before passing it to clEnqueueNDRangeKernel, or just change it to be a size_t itself.
This may or may not be related to the problems you are having. You could be accidentally launching a huge number of work-items for example, which might explain why the code appears to block on the CPU.
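A sketch of that fix, keeping the question's variable name (just widen n into a size_t before taking its address):
size_t global_size = n;
error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL);
// check error == CL_SUCCESS!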
Here are a few remarks that come to mind:
I have the feeling that somehow you think calling clEnqueueMapBuffer will interrupt the execution of the kernel. I don't think this is correct. Once a command is launched for execution, it runs until it is completed (or fails...). Several commands can be launched concurrently, but trying to read some data while a kernel is still processing it results in undefined behavior. Besides, the way you create your command queue wouldn't let you run several commands at the same time; you need the CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE property when creating the queue to allow that (see the sketch after these remarks).
I don't know what global size you use to launch your kernel after jprice's post, but unless you execute the kernel with only one work-item, you'll have trouble with statements like mem_flag[0] = 1;, since all the work-items will write to the same location. (I'm guessing that you posted only a portion of your kernel. Check whether you have other statements like that; it would actually be useful if you posted the entire kernel code.)
Since you always map and unmap the same portion of the buffers and always check the first element (of mapped_flag), and since the kernel has completed its computation by that moment (see the first point), it is at least normal that you always read the same value.
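A minimal sketch of the queue creation the first remark refers to, using the same variables as the question's source (error handling elided):
cl_command_queue_properties props = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
queue = clCreateCommandQueue(context, device, props, &error);
/* commands may now run concurrently, so use events to order the kernel
   relative to the map/unmap commands */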

Accessing data from a file path with "space" on Windows

I'm facing a weird problem on Windows.
I'm using a library called STDCL which runs pretty well on Linux, but on Windows there is an error if the output .exe file's path contains spaces.
example:
c:\my file\my file.exe // won't work
c:\my_file\my file.exe // will work
// in both cases the exe is accessing data from a dll (anywhere) containing the STDCL library
I have the source code, so I can recompile the library; or is there an easier way to force the .dll to accept the path of the .exe?
edit: sample code
/* hello_stdcl.c */
#include <stdio.h>
#include <stdcl.h>
int main()
{
stdcl_init(); // this is only necessary for Windows
cl_uint n = 64;
#if(1)
/* use default contexts, if no GPU use CPU */
CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;
unsigned int devnum = 0;
void* clh = clopen(cp,"matvecmult.cl",CLLD_NOW);
cl_kernel krn = clsym(cp,clh,"matvecmult_kern",0);
/* allocate OpenCL device-sharable memory */
cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);
clndrange_t ndr = clndrange_init1d( 0, n, 64);
/* initialize vectors a[] and b[], zero c[] */
int i,j;
for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
for(i=0;i<n;i++) b[i] = 2.2f*i;
for(i=0;i<n;i++) c[i] = 0.0f;
/* define the computational domain and workgroup size */
//clndrange_t ndr = clndrange_init1d( 0, n, 64);
/* non-blocking sync vectors a and b to device memory (copy to GPU)*/
clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
/* set the kernel arguments */
clarg_set(cp,krn,0,n);
clarg_set_global(cp,krn,1,aa);
clarg_set_global(cp,krn,2,b);
clarg_set_global(cp,krn,3,c);
/* non-blocking fork of the OpenCL kernel to execute on the GPU */
clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);
/* non-blocking sync vector c to host memory (copy back to host) */
clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT);
/* force execution of operations in command queue (non-blocking call) */
clflush(cp,devnum,0);
/* block on completion of operations in command queue */
clwait(cp,devnum,CL_ALL_EVENT);
for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);
clfree(aa);
clfree(b);
clfree(c);
clclose(cp,clh);
#endif
system("pause");
}
edit 2:
When I compile the code above, take the resulting .exe file and put it in a path without spaces (a short path), it works.
If I put it in a path with spaces, it simply crashes; when I debugged it, it looked like a memory issue (so it crashes with the long path).
When I contacted the library's creator he told me:
"windows getcwd() call returns an unusable path with spaces"
As I said before, this library works fine on Linux; what could the solution be on Windows?
system: win7 64 bit
Use quotes for the binary name/path, as in "my file.exe".
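If the failure really comes from the library's getcwd() handling rather than from how the program is launched, one possible workaround (an assumption on my part, not something from the library's author) is to switch the working directory to its space-free 8.3 short form before calling stdcl_init():
#include <windows.h>
#include <direct.h>
int main()
{
    char longcwd[MAX_PATH], shortcwd[MAX_PATH];
    _getcwd(longcwd, MAX_PATH);
    /* the 8.3 alias contains no spaces, provided short-name
       generation is enabled on the volume */
    if (GetShortPathNameA(longcwd, shortcwd, MAX_PATH) > 0)
        _chdir(shortcwd);
    stdcl_init(); /* the library's getcwd() now sees a space-free path */
    /* ... rest of the program ... */
}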

OpenCL Kernel Executes Slower Than Single Thread

All, I wrote a very simple OpenCL kernel which transforms an RGB image to gray scale using simple averaging.
Some background:
The image is stored in mapped memory, as a 24-bit, non-padded memory block
The output array is stored in pinned memory (mapped with clEnqueueMapBuffer) and is 8 bpp
There are two buffers allocated on the device (clCreateBuffer), one is specifically read (which we clWriteBuffer into before the kernel starts) and the other is specifically write (which we clReadBuffer after the kernel finishes)
I am running this on a 1280x960 image. A serial version of the algorithm averages 60ms, the OpenCL kernel averages 200ms!!! I'm doing something wrong but I have no idea how to proceed, what to optimize. (Timing my reads/writes without a kernel call, the algorithm runs in 15ms)
I am attaching the kernel setup (sizes and arguments) as well as the kernel
EDIT: So I wrote an even dumber kernel that does no global memory accesses inside it, and it still took 150ms... This is still ridiculously slow. I thought maybe I was messing up global memory reads, that they have to be 4-byte aligned or something? Nope...
Edit 2: Removing all the parameters from my kernel gave me a significant speed up... I'm confused: I thought that since I'm using clEnqueueWriteBuffer, the kernel should be doing no memory transfer from host->device and device->host...
Edit 3: Figured it out, but I still don't understand why. If anyone can explain it, I would be glad to mark their answer correct. The problem was passing the custom structs by value. It looks like I'll need to allocate a global memory location for them and pass their cl_mems.
Kernel Call:
//Copy input to device
result = clEnqueueWriteBuffer(handles->queue, d_input_data, CL_TRUE, 0, h_input.widthStep*h_input.height, (void *)input->imageData, 0, 0, 0);
if(check_result(result, "opencl_rgb_to_gray", "Failed to write to input buffer on device!")) return 0;
//Set kernel arguments
result = clSetKernelArg(handles->current_kernel, 0, sizeof(OpenCLImage), (void *)&h_input);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set input struct.")) return 0;
result = clSetKernelArg(handles->current_kernel, 1, sizeof(cl_mem), (void *)&d_input_data);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set input data.")) return 0;
result = clSetKernelArg(handles->current_kernel, 2, sizeof(OpenCLImage), (void *)&h_output);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set output struct.")) return 0;
result = clSetKernelArg(handles->current_kernel, 3, sizeof(cl_mem), (void *)&d_output_data);
if(check_result(result, "opencl_rgb_to_gray", "Failed to set output data.")) return 0;
//Determine run parameters
global_work_size[0] = input->width;//(unsigned int)((input->width / (float)local_work_size[0]) + 0.5);
global_work_size[1] = input->height;//(unsigned int)((input->height/ (float)local_work_size[1]) + 0.5);
printf("Global Work Group Size: %d %d\n", global_work_size[0], global_work_size[1]);
//Call kernel
result = clEnqueueNDRangeKernel(handles->queue, handles->current_kernel, 2, 0, global_work_size, local_work_size, 0, 0, 0);
if(check_result(result, "opencl_rgb_to_gray", "Failed to run kernel!")) return 0;
result = clFinish(handles->queue);
if(check_result(result, "opencl_rgb_to_gray", "Failed to finish!")) return 0;
//Copy output
result = clEnqueueReadBuffer(handles->queue, d_output_data, CL_TRUE, 0, h_output.widthStep*h_output.height, (void *)output->imageData, 0, 0, 0);
if(check_result(result, "opencl_rgb_to_gray", "Failed to write to output buffer on device!")) return 0;
Kernel:
typedef struct OpenCLImage_t
{
int width;
int widthStep;
int height;
int channels;
} OpenCLImage;
__kernel void opencl_rgb_kernel(OpenCLImage input, __global unsigned char* input_data, OpenCLImage output, __global unsigned char * output_data)
{
int pixel_x = get_global_id(0);
int pixel_y = get_global_id(1);
unsigned char * cur_in_pixel, *cur_out_pixel;
float avg = 0;
cur_in_pixel = (unsigned char *)(input_data + pixel_y*input.widthStep + pixel_x * input.channels);
cur_out_pixel = (unsigned char *)(output_data + pixel_y*output.widthStep + pixel_x * output.channels);
avg += cur_in_pixel[0];
avg += cur_in_pixel[1];
avg+= cur_in_pixel[2];
avg /=3.0f;
if(avg > 255.0)
avg = 255.0;
else if(avg < 0)
avg = 0;
*cur_out_pixel = avg;
}
The overhead of copying the struct's value to every thread that is created might be the reason for the extra time, whereas for a global memory object passing a reference is enough. Only the SDK implementer will be able to answer exactly... :)
You may want to try a local_work_size like [64, 1, 1] in order to coalesce your memory accesses (note that 64 is a divisor of 1280).
As previously said, you have to use a profiler in order to get more information. Are you using an nvidia card? Then download CUDA 4 (not 5), as it contains an OpenCL profiler.
Your performance must be far from the optimum. Change the local work size, the global work size, and try to process two or four pixels per thread. Can you change the way pixels are stored prior to your processing? Then break your struct into three arrays in order to coalesce memory access more effectively.
You can hide your memory transfers behind GPU work; this will be easier to do with a profiler at hand.
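Regarding Edit 3, a minimal sketch of passing the struct through a buffer instead of by value. This is an assumption about what the fix looks like, not code from the post (handles->context in particular is assumed to exist alongside handles->queue):
// host: put the struct in a device buffer and pass the cl_mem
cl_mem d_input_info = clCreateBuffer(handles->context,
    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    sizeof(OpenCLImage), &h_input, &result);
result = clSetKernelArg(handles->current_kernel, 0, sizeof(cl_mem), &d_input_info);
// kernel: take the struct as a pointer instead of by value
// __kernel void opencl_rgb_kernel(__global const OpenCLImage *input, ...)
// ... pixel_y * input->widthStep + pixel_x * input->channels ...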