I have some OpenCL code that does a calculation. Can OpenCL pass an array and an int to the kernel at the same time? I want to pass 'myint' to the kernel directly, not via a buffer.
int myint = 100;
cl_mem memObjects[3] = {0, 0, 0};
memObjects[0] = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof(int) * ARR_SIZE, data, NULL);
memObjects[1] = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof(int), new int[]{myint}, NULL);
memObjects[2] = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE,
                               sizeof(float) * ARR_SIZE, NULL, NULL);
opencl_errNum = clSetKernelArg(opencl_kernel, 0, sizeof(cl_mem), &memObjects[0]);
opencl_errNum |= clSetKernelArg(opencl_kernel, 1, sizeof(cl_mem), &memObjects[1]);
opencl_errNum |= clSetKernelArg(opencl_kernel, 2, sizeof(cl_mem), &memObjects[2]);
//Kernel code
const char *opencl_code = "__kernel void opencl_code(__global const int *src,\n"
" __global const int *value,\n"
" __global float *result) {\n"
" int gid = get_global_id(0);\n"
" result[gid] = src[gid] - value[0];\n"
"\n"
"}";
I want to change it to be like this:
const char *opencl_code = "__kernel void opencl_code(__global const int *src,\n"
" __global const int value,\n"
" __global float *result) {\n"
" int gid = get_global_id(0);\n"
" result[gid] = src[gid] - value;\n"
"\n"
"}";
Yes, you can pass either 1-dimensional arrays or constants as kernel arguments. The syntax is as follows:
int myint = 100;
cl_mem memObjects[2] = {0, 0};
memObjects[0] = clCreateBuffer(opencl_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * ARR_SIZE, data, NULL);
memObjects[1] = clCreateBuffer(opencl_context, CL_MEM_READ_WRITE, sizeof(float) * ARR_SIZE, NULL, NULL);
opencl_errNum = clSetKernelArg(opencl_kernel, 0, sizeof(cl_mem), &memObjects[0]); // pass an array as kernel parameter
opencl_errNum |= clSetKernelArg(opencl_kernel, 1, sizeof(int), (void*)&myint); // pass a constant as kernel parameter
opencl_errNum |= clSetKernelArg(opencl_kernel, 2, sizeof(cl_mem), &memObjects[1]); // pass an array as kernel parameter
// note: the second parameter is just "const int value", without the "__global" keyword or "*" pointer
const char *opencl_code = "__kernel void opencl_code(__global const int *src,\n"
" const int value,\n"
" __global float *result) {\n"
" int gid = get_global_id(0);\n"
" result[gid] = src[gid] - value;\n" // note: result is type float, and you write an int to it here
"}\n";
To make this all a lot easier, I created this OpenCL-Wrapper. With that, you can directly pass arrays (Memory objects) or numbers as kernel parameters, and the C++ code would look like this:
int main() {
    Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
    int myint = 100;
    Memory<int> src(device, ARR_SIZE); // allocate memory on both host and device
    Memory<float> result(device, ARR_SIZE);
    Kernel opencl_kernel(device, ARR_SIZE, "opencl_code", src, myint, result); // kernel that runs on the device
    // initialize src, for example with "src[5] = 3;"
    src.write_to_device(); // copy data from host memory to device memory
    opencl_kernel.run(); // run kernel
    result.read_from_device(); // copy data from device memory to host memory
    // read result with for example "float test = result[5];"
    // ...
    return 0;
}
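For reference, back in the plain-API version above, enqueueing the kernel and reading back the result would look something like the following sketch. It assumes a command queue named opencl_queue (not shown in the question) and reuses memObjects from the answer code, where memObjects[1] is the float result buffer; in real code you would check opencl_errNum after each call.

size_t global_work_size = ARR_SIZE;
// launch one work item per array element, letting the driver pick the work-group size
opencl_errNum = clEnqueueNDRangeKernel(opencl_queue, opencl_kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
// blocking read of the result buffer back into host memory
float results[ARR_SIZE];
opencl_errNum |= clEnqueueReadBuffer(opencl_queue, memObjects[1], CL_TRUE, 0, sizeof(float) * ARR_SIZE, results, 0, NULL, NULL);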
Related
I have to send a struct that contains, among other things, a dynamically allocated array of another struct.
The receiver has to merge the received message with its own data and then send the result to another process.
Basically, what I want to obtain is something like this.
I have implemented the following code:
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h> // for bool in mergeDicts
#include <stddef.h>  // for offsetof
#include <mpi.h>
typedef struct Data {
    char character;
    int frequency;
} Data;
typedef struct Dictionary {
    int charsNr;
    Data *data;
} Dictionary;
typedef struct Header {
    int id;
    int size;
    MPI_Datatype *type;
    int position;
} Header;
static const int NUM_CHARS = 5;
typedef unsigned char BYTE;
void buildMyType(MPI_Datatype *dictType) {
    int blockLengths[] = {1, 1};
    MPI_Datatype types[] = {MPI_CHAR, MPI_INT};
    MPI_Aint offsets[2];
    offsets[0] = offsetof(Data, character);
    offsets[1] = offsetof(Data, frequency);
    MPI_Type_create_struct(2, blockLengths, offsets, types, dictType);
    MPI_Type_commit(dictType);
}
void appendData(Dictionary *dict, Data *data) {
    dict->data = realloc(dict->data, sizeof(Data) * (dict->charsNr + 1));
    dict->data[dict->charsNr] = (struct Data) {.character = data->character, .frequency = data->frequency};
    ++dict->charsNr;
}
void mergeDicts(Dictionary *dst, Dictionary *src) {
    for (int i = 0; i < src->charsNr; i++) {
        char character = src->data[i].character;
        int frequency = src->data[i].frequency;
        bool assigned = false;
        for (int j = 0; j < dst->charsNr && !assigned; j++) {
            if (dst->data[j].character == character) {
                dst->data[j].frequency += frequency;
                assigned = true;
            }
        }
        if (!assigned)
            appendData(dst, &src->data[i]);
    }
}
int getRand(const int from, const int to)
{
    int num = (rand() % (to - from + 1)) + from;
    return num;
}
void getMessageSize(int *size, int rank, int tag, MPI_Status *status) {
    MPI_Probe(rank, tag, MPI_COMM_WORLD, status);
    MPI_Get_count(status, MPI_BYTE, size);
}
BYTE* packDictionary(Header *header, Dictionary *dict) {
    header->size = sizeof(int) + (sizeof(Data) * dict->charsNr);
    BYTE *buffer = malloc(sizeof(BYTE) * (header->size));
    header->position = 0;
    MPI_Pack(&dict->charsNr, 1, MPI_INT, buffer, header->size, &header->position, MPI_COMM_WORLD);
    MPI_Pack(dict->data, dict->charsNr, *header->type, buffer, header->size, &header->position, MPI_COMM_WORLD);
    return buffer;
}
void unpackDictionary(Header *header, Dictionary *dict, BYTE *buffer) {
    MPI_Unpack(buffer, header->size, &header->position, &dict->charsNr, 1, MPI_INT, MPI_COMM_WORLD);
    dict->data = malloc(sizeof(Data) * dict->charsNr);
    MPI_Unpack(buffer, header->size, &header->position, dict->data, dict->charsNr, *header->type, MPI_COMM_WORLD);
}
void MyTypeOp(void *in, void *out, int *len, MPI_Datatype *typeptr) // standard MPI_User_function signature
{
    MPI_Status *status;
    Dictionary inDict = {.charsNr = 0, .data = NULL};
    Dictionary outDict = {.charsNr = 0, .data = NULL};
    int bufferSize = 0;
    // how can I get the size of the buffers?
    // on other occasions I use getMessageSize(), but I'm not sure
    // if it can be useful here
    // how can I get the type of the message, basically the dictType?
    Header header = {.id = 0, .size = 0, .type = NULL, .position = 0};
    unpackDictionary(&header, &inDict, in);
    // I should update the header with the new size
    unpackDictionary(&header, &outDict, out);
    mergeDicts(&outDict, &inDict);
    header.size = 0;
    out = packDictionary(&header, &outDict); // note: this only reassigns the local pointer
}
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv); // was missing: required before any other MPI call
    int proc_number;
    int rank;
    MPI_Comm_size(MPI_COMM_WORLD, &proc_number);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    Dictionary dict = {.charsNr = NUM_CHARS, .data = NULL};
    dict.data = malloc(dict.charsNr * sizeof(Data));
    // create a random dictionary
    // I use for simplicity NUM_CHARS but in real life it will be unknown
    // at the beginning and every dictionary can have a different size
    for (int i = 0; i < NUM_CHARS; i++) {
        int freq = getRand(1, 10);
        dict.data[i].frequency = freq;
        dict.data[i].character = 'a' + getRand(1, 5) + i + rank;
    }
    MPI_Datatype dictType;
    buildMyType(&dictType);
    MPI_Op MyOp;
    MPI_Op_create((MPI_User_function *) MyTypeOp, 1, &MyOp);
    MPI_Datatype contType;
    MPI_Type_contiguous(1, MPI_PACKED, &contType);
    MPI_Type_commit(&contType);
    Header header = {.id = 0, .size = 0, .type = &dictType, .position = 0};
    // when I pack the message I put everything inside a buffer of BYTE
    BYTE *buffer = packDictionary(&header, &dict);
    BYTE *buffer_rcv = NULL;
    MPI_Reduce(&buffer, &buffer_rcv, 1, contType, MyOp, 0, MPI_COMM_WORLD);
    if (rank == 0)
        for (int i = 0; i < NUM_CHARS; i++)
            printf("%c: %d\n", dict.data[i].character, dict.data[i].frequency);
    MPI_Type_free(&contType);
    MPI_Type_free(&dictType);
    MPI_Op_free(&MyOp);
    free(buffer);
    free(buffer_rcv);
    free(dict.data);
    MPI_Finalize();
    return 0;
}
Of course, this example cannot run.
Do you have any suggestions on how I can do it?
I'm writing the code in C on a Linux machine.
Thanks.
I am trying to launch a kernel function using the runtime API. For some reason, I am not able to call cudaLaunchKernel directly. Instead, I have to call a function that calls cudaLaunchKernel inside it. Here is an example, which simply prints a message from the device:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

__global__
void hello()
{
    printf("hello from kernel. \n");
}
template<typename T>
int launchKernel (T kernel , const size_t grid[3] , const size_t block[3])
{
    cudaError_t res;
    dim3 grid3d = {(unsigned int)grid[0] , (unsigned int)grid[1] , (unsigned int)grid[2]};
    dim3 block3d = {(unsigned int)block[0] , (unsigned int)block[1] , (unsigned int)block[2]};
    res = cudaLaunchKernel ((void*)kernel , grid3d , block3d, NULL, 0, NULL);
    if (res != CUDA_SUCCESS)
    {
        char msg[256];
        printf ("error during kernel launch \n");
        return -1;
    }
    return 0;
}
int main(void)
{
    float *hx, *dx;
    hx = (float*)malloc(32 * sizeof(float));
    cudaMalloc(&dx, 32 * sizeof(float));
    unsigned int threads = 32;
    unsigned int blocks = 1;
    ///////////// option 1: directly call runtime api: cudaLaunchKernel //////////////
    //cudaLaunchKernel((void*)hello, dim3(blocks), dim3(threads), NULL, 0, NULL);
    //////////////////////////////////////////////////////////////////////////////////
    ///////// option 2: call a function which further calls cudaLaunchKernel /////////
    const size_t grid3d[3] = {blocks, 0, 0};
    const size_t block3d[3] = {threads, 0, 0};
    launchKernel (hello , grid3d , block3d);
    //////////////////////////////////////////////////////////////////////////////////
    cudaMemcpy(hx, dx, 32 * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(dx);
    free(hx);
    return 0;
}
Option 1, which directly calls cudaLaunchKernel, works. However, option 2, which indirectly invokes cudaLaunchKernel, does not: no message is printed from the device, and the return value is not equal to CUDA_SUCCESS.
I was wondering if anyone has any insights into this problem.
Thank you in advance for your help and time.
Grid and block dimensions cannot be zero:
const size_t grid3d[3] = {blocks, 0, 0};
const size_t block3d[3] = {threads, 0, 0};
the reason your two launches behave differently is that you are creating the grid and block dimension variables differently.
If you change to:
const size_t grid3d[3] = {blocks, 1, 1};
const size_t block3d[3] = {threads, 1, 1};
it will work for either case.
By the way, you're not doing yourself any favors with this sort of error trapping:
if (res != CUDA_SUCCESS)
{
    char msg[256];
    printf ("error during kernel launch \n");
    return -1;
}
This would be a lot more instructive:
if (res != cudaSuccess)
{
    printf ("error during kernel launch: %s \n", cudaGetErrorString(res));
    return -1;
}
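If you check many API calls this way, a small helper macro keeps the reporting uniform. This is only a sketch of the idea (the CHECK_CUDA name is my own), and the return -1; assumes the macro is used inside a function returning int, like launchKernel:

// Hypothetical helper: print the failing expression, location, and error string.
#define CHECK_CUDA(call)                                           \
    do {                                                           \
        cudaError_t err_ = (call);                                 \
        if (err_ != cudaSuccess) {                                 \
            printf("%s:%d: %s failed: %s\n", __FILE__, __LINE__,   \
                   #call, cudaGetErrorString(err_));               \
            return -1;                                             \
        }                                                          \
    } while (0)

// usage: CHECK_CUDA(cudaLaunchKernel((void*)kernel, grid3d, block3d, NULL, 0, NULL));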
I have a problem: OpenCL loads the kernel but gives garbage values. If I had to guess, it is not able to execute the kernel. The program compiles, and all the include and library paths are correct. The program works on the lab machine; however, it does not work on my laptop.
This is the main C file:
#include<stdio.h>
#include<stdlib.h>
#include<CL/cl.h>
//Max source size of the kernel string
#define MAX_SOURCE_SIZE (0x100000)
int main (void)
{
    //create the two input vectors
    int i;
    int LIST_SIZE;
    printf("Enter how many elements = ");
    scanf("%d", &LIST_SIZE);
    int *A = (int *)malloc(sizeof(int)*LIST_SIZE);
    //Initialize the input vector
    for(i = 0; i < LIST_SIZE; i++)
    {
        A[i] = i;
    }
    int *B = (int *)malloc(sizeof(int)*LIST_SIZE);
    for(i = 0; i < LIST_SIZE; i++)
    {
        B[i] = i+10;
    }
    //Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;
    fp = fopen("5-1.cl", "r");
    if(!fp)
    {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    //Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
    //Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    //Create a command queue (the properties argument is a bitfield, so 0 rather than NULL)
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    //Create memory buffers on the device for each vector A, B, C
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int), NULL, &ret);
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE*sizeof(int), NULL, &ret);
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE*sizeof(int), NULL, &ret);
    //Copy the lists A and B to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE*sizeof(int), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE*sizeof(int), B, 0, NULL, NULL);
    //Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t *)&source_size, &ret);
    //Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    //Create the OpenCL kernel object
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
    //Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    //Execute the OpenCL kernel on the array
    size_t global_item_size = LIST_SIZE;
    size_t local_item_size = 1;
    //Execute the kernel on the device
    cl_event event;
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    ret = clFinish(command_queue);
    //Read the memory buffer c_mem_obj on the device into the local variable C
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE*sizeof(int), C, 0, NULL, NULL);
    //Display the result to the screen
    for(i = 0; i < LIST_SIZE; i++)
    {
        printf("%d + %d = %d\n", A[i], B[i], C[i]);
    }
    //Clean up
    ret = clFlush(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(a_mem_obj);
    ret = clReleaseMemObject(b_mem_obj);
    ret = clReleaseMemObject(c_mem_obj);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    return 0;
}
Here is the kernel code:
__kernel void vector_add(__global int* A, __global int* B, __global int* C)
{
    //Get the index of the current work item
    int i = get_global_id(0);
    //Do the operation
    C[i] = A[i] + B[i];
}
If someone has faced a similar situation in the past and solved it, please help.
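One generic way to narrow this kind of problem down: nothing in this code checks ret, so a kernel that fails to build on the laptop's OpenCL platform would go unnoticed, and C would be read from an untouched buffer (hence garbage). A minimal sketch of printing the build log, using the variable names from the code above:

ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
    // query the log size first, then fetch the full compiler output
    size_t log_size = 0;
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
    char *log = (char *)malloc(log_size);
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
    fprintf(stderr, "Build failed:\n%s\n", log);
    free(log);
}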
I'm trying to send a struct which has a dynamic array as one of its members, but this array doesn't seem to be sent properly. Any suggestion on how to do this?
This is what I have:
struct bar
{
    int a;
    int b;
    int* c;
};
void defineMPIType(MPI_Datatype* newType, int cLen, struct bar* msg)
{
    int blockLengths[3] = {1, 1, cLen};
    MPI_Datatype types[3] = {MPI_INT, MPI_INT, MPI_INT};
    MPI_Aint offsets[3];
    MPI_Aint addrB, addrC;
    MPI_Address(&(msg->b), &addrB);
    MPI_Address(msg->c, &addrC);
    offsets[0] = offsetof(struct bar, a);
    offsets[1] = offsetof(struct bar, b);
    offsets[2] = addrC - addrB;
    MPI_Type_create_struct(3, blockLengths, offsets, types, newType);
    MPI_Type_commit(newType);
}
void main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);
    int rank, p;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    int cLen = argv[0];
    MPI_Datatype MPI_BAR_TYPE;
    struct bar* msg = malloc(sizeof(*msg));
    msg->c = malloc(sizeof(int) * cLen);
    defineMPIType(&MPI_BAR_TYPE, cLen, msg);
    if (rank == 0)
    {
        msg->a = 1;
        msg->b = 2;
        for (int i = 0; i < cLen; ++i)
            msg->c[i] = i;
        MPI_Send(msg, 1, MPI_BAR_TYPE, 1, 111, MPI_COMM_WORLD);
    }
    else
    {
        MPI_Status stat;
        MPI_Recv(msg, 1, MPI_BAR_TYPE, 0, 111, MPI_COMM_WORLD, &stat);
    }
    printf("Rank %d has c = [", rank);
    for (int i = 0; i < cLen; ++i)
        printf("%d, ", msg->c[i]);
    printf("]\n");
    free(msg);
    MPI_Type_free(&MPI_BAR_TYPE);
    MPI_Finalize();
}
Members a and b got sent properly, but c didn't.
There are a few issues in your code, even ignoring the issue of the type itself:
The first one is that you allocated memory for your c array only on process #0, then you (tried to) send this data to process #1. But process #1 didn't allocate any memory for storing the message, so even if the sending itself were done correctly, the code would have failed.
Names starting with MPI_ are reserved for the MPI library so you cannot use them as you wish. You have to find another name for your MPI_BAR_TYPE.
This line puzzles me somewhat: int cLen = argv[0]; I imagine you want to read from the command line the size of the array to allocate, in which case maybe that should read something like int clen = atoi(argv[1]); (forgetting about test for validity of this which would need to be properly handled...)
You only test if the process is of rank #0 or not, meaning that if for some reason you launched 3 processes, the process of rank #2 will wait forever for a message from process of rank #0 that will never arrive.
And finally, the array itself: in your code there is a big confusion between the pointer c and the data pointed to by c. Your structure embeds the pointer, but not the memory it points to, so you cannot map the corresponding data into an MPI structure. The most obvious reason is that from one call to the next (or from one process to the next), there is no guarantee that the offset between the address of the structure and the address of the data pointed to by c will be identical (indeed, it is almost guaranteed to differ), so you cannot reliably map them.
What you need to do to solve your problem is therefore to transfer only your 2 integers a and b in one go (possibly creating an MPI structure for transferring arrays of them if needed). Then you transfer the memory pointed to by c, which you would have allocated beforehand.
Your code could become for example:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
struct bar
{
int a;
int b;
int* c;
};
void defineMPIType( MPI_Datatype* newType ) {
    struct bar tmp[2];
    MPI_Aint extent = ( char* )&tmp[1] - ( char* )&tmp[0]; // distance in bytes, padding included
    MPI_Type_create_resized( MPI_2INT, 0, extent, newType );
    MPI_Type_commit( newType );
}
int main( int argc, char* argv[] ) {
    MPI_Init(&argc, &argv);
    int rank, p;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &p );
    int cLen = atoi( argv[1] );
    MPI_Datatype Bar_type;
    defineMPIType( &Bar_type );
    struct bar msg;
    msg.c = ( int* ) malloc( sizeof( int ) * cLen );
    if ( rank == 0 ) {
        msg.a = 1;
        msg.b = 2;
        for ( int i = 0; i < cLen; ++i ) {
            msg.c[i] = i;
        }
        MPI_Send( &msg, 1, Bar_type, 1, 111, MPI_COMM_WORLD );
        MPI_Send( msg.c, cLen, MPI_INT, 1, 222, MPI_COMM_WORLD );
    }
    else if ( rank == 1 ) {
        MPI_Recv( &msg, 1, Bar_type, 0, 111, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        MPI_Recv( msg.c, cLen, MPI_INT, 0, 222, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
    }
    printf("Rank %d has a = %d, b = %d, c = [", rank, msg.a, msg.b );
    for ( int i = 0; i < cLen - 1; ++i ) {
        printf( "%d, ", msg.c[i] );
    }
    printf( "%d]\n", msg.c[cLen - 1] );
    free( msg.c );
    MPI_Type_free( &Bar_type );
    MPI_Finalize();
    return 0;
}
Which gives:
$ mpirun -n 2 ./a.out 3
Rank 0 has a = 1, b = 2, c = [0, 1, 2]
Rank 1 has a = 1, b = 2, c = [0, 1, 2]
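A side note on the extent computation in defineMPIType: the char* casts are what make the pointer subtraction count bytes. An equivalent way to obtain the same extent using only MPI calls (a sketch) is:

struct bar tmp[2];
MPI_Aint lb, ub;
MPI_Get_address( &tmp[0], &lb );
MPI_Get_address( &tmp[1], &ub );
MPI_Aint extent = ub - lb; // true extent of struct bar, padding and pointer included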
Happy MPI coding.
I currently have some data that I would like to pass to my GPU and then multiply by 2.
I have created a struct which can be seen here:
struct GPUPatternData
{
    cl_int nInput, nOutput, patternCount, offest;
    cl_float* patterns;
};
This struct should contain an array of floats. I will not know the size of this array until run time, as it is specified by the user.
The kernel code:
typedef struct GPUPatternDataContatiner
{
    int nodeInput, nodeOutput, patternCount, offest;
    float* patterns;
} GPUPatternData;

__kernel void patternDataAddition(__global GPUPatternData* gpd, __global GPUPatternData* output)
{
    int index = get_global_id(0);
    if(index < gpd->patternCount)
    {
        output->patterns[index] = gpd->patterns[index]*2;
    }
}
Here is the host code:
GPUPattern::GPUPatternData gpd;
gpd.nodeInput = ptSet->getInputCount();
gpd.nodeOutput = ptSet->getOutputCount();
gpd.offest = gpd.nodeInput+gpd.nodeOutput;
gpd.patternCount = ptSet->getCount();
gpd.patterns = new cl_float [gpd.patternCount*gpd.offest];
GPUPattern::GPUPatternData gridC;
gridC.nodeInput = ptSet->getInputCount();
gridC.nodeOutput = ptSet->getOutputCount();
gridC.offest = gpd.nodeInput+gpd.nodeOutput;
gridC.patternCount = ptSet->getCount();
gridC.patterns = new cl_float [gpd.patternCount*gpd.offest];
All the data is allocated, then initialized with values, and then passed to the GPU:
int elements = gpd.patternCount;
size_t ofsdf = sizeof(gridC);
size_t dataSize = sizeof(GPUPattern::GPUPatternData)+ (sizeof(cl_float)*elements);
cl_mem bufferA = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
//Copy the buffer to the device
err = clEnqueueWriteBuffer(queue,bufferA,CL_TRUE,0,dataSize,(void*)&gpd,0,NULL,NULL);
//This buffer is being written to only
cl_mem bufferC = clCreateBuffer(gpu.context,CL_MEM_WRITE_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
err = clEnqueueWriteBuffer(queue,bufferC,CL_TRUE,0,dataSize,(void*)&gridC,0,NULL,NULL);
Everything builds, which I check by watching the error code, which stays at 0:
cl_program program = clCreateProgramWithSource(gpu.context,1, (const char**) &kernelSource,NULL,&err);
////Build program
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
char build[2048];
clGetProgramBuildInfo(program, gpu.device, CL_PROGRAM_BUILD_LOG, 2048, build, NULL);
////Create kernel object
cl_kernel kernal = clCreateKernel(program, "patternDataAddition",&err);
////Set kernel arguments
err = clSetKernelArg(kernal, 0, sizeof(cl_mem), &bufferA);
err |= clSetKernelArg(kernal, 1, sizeof(cl_mem), &bufferC);
It is then kicked off:
size_t globalWorkSize = 1024;
size_t localWorkSize = 512;
err = clEnqueueNDRangeKernel(queue, kernal, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
clFinish(queue);
It's at this point that it all goes wrong:
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, dataSize, &gridC, 0, NULL, NULL);
clFinish(queue);
The error in this case is -5 (CL_OUT_OF_RESOURCES).
Also, if I change the line:
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, dataSize, &gridC, 0, NULL, NULL);
to:
err = clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, dataSize*1000, &gridC, 0, NULL, NULL);
I get the error -30 (CL_INVALID_VALUE).
So my question is: why am I getting these errors when reading back the buffer? Also, I am not sure whether the pointer to my float array gives sizeof() the wrong result for dataSize, which would give me the wrong buffer size.
You cannot pass a struct that contains pointers into an OpenCL kernel:
http://www.khronos.org/registry/cl/specs/opencl-1.2.pdf (Section 6.9)
You can either correct it as Eric Bainville pointed out, or, if you are not very restricted on memory, you can do something like this:
struct GPUPatternData
{
    cl_int nInput, nOutput, patternCount, offest;
    cl_float patterns[MAX_SIZE];
};
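Note that MAX_SIZE must be a compile-time constant visible to both the host and the kernel code, and the host-side struct has to use the same fixed-size array so the two layouts match; every struct then occupies worst-case storage.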
EDIT: OK, if memory is an issue: since you only use patterns and patternCount, you can copy the patterns out of the struct and pass them to the kernel separately.
struct GPUPatternData
{
    cl_int nInput, nOutput, patternCount, offest;
    cl_float *patterns;
};
Copy patterns from gpd to the GPU, and allocate space for gridC's patterns on the GPU. Then you can pass the buffers separately:
__kernel void patternDataAddition(int gpd_patternCount,
                                  __global const float * gpd_Patterns,
                                  __global float * gridC_Patterns) {
    int index = get_global_id(0);
    if(index < gpd_patternCount)
    {
        gridC_Patterns[index] = gpd_Patterns[index]*2;
    }
}
When you come back from the kernel, copy the data back to gridC.patterns directly.
One more thing: you don't have to change your CPU struct. It stays the same. However, this part
size_t dataSize = sizeof(GPUPattern::GPUPatternData)+ (sizeof(cl_float)*elements);
cl_mem bufferA = clCreateBuffer(gpu.context,CL_MEM_READ_ONLY,dataSize,NULL,&err);
openCLErrorCheck(&err);
//Copy the buffer to the device
err = clEnqueueWriteBuffer(queue,bufferA,CL_TRUE,0,dataSize,(void*)&gpd,0,NULL,NULL);
should be changed to something like
size_t dataSize = sizeof(cl_float) * elements; // HERE
float* gpd_dataPointer = gpd.patterns;         // HERE
cl_mem bufferA = clCreateBuffer(gpu.context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
openCLErrorCheck(&err);
// Now use gpd_dataPointer: the buffer is filled from the float data itself, not from the struct
err = clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, dataSize, (void*)gpd_dataPointer, 0, NULL, NULL);
The same goes for gridC.
And when you copy back, copy into gridC_dataPointer, i.e. gridC.patterns, directly.
Then continue using the struct as if nothing happened.
The problem is probably the pointer inside your struct.
In this case, I would suggest passing nInput, nOutput, patternCount, and offset as kernel args, and the patterns as a buffer of float:
__kernel void patternDataAddition(int nInput, int nOutput,
                                  int patternCount, int offset,
                                  __global const float * inPatterns,
                                  __global float * outPatterns)
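On the host side, the matching argument setup would look something like this sketch, reusing the question's variable names (including the kernal and offest spellings) and assuming the struct fields are cl_int; scalars are passed by value, the float data through buffers:

err  = clSetKernelArg(kernal, 0, sizeof(cl_int), &gpd.nodeInput);
err |= clSetKernelArg(kernal, 1, sizeof(cl_int), &gpd.nodeOutput);
err |= clSetKernelArg(kernal, 2, sizeof(cl_int), &gpd.patternCount);
err |= clSetKernelArg(kernal, 3, sizeof(cl_int), &gpd.offest);
err |= clSetKernelArg(kernal, 4, sizeof(cl_mem), &bufferA); // inPatterns
err |= clSetKernelArg(kernal, 5, sizeof(cl_mem), &bufferC); // outPatterns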
I know this is no longer current, but I got past this problem in a different way: your code for allocating memory for the struct with its data stays the same, but the struct should be changed to:
typedef struct GPUPatternDataContatiner
{
    int nodeInput, nodeOutput, patternCount, offest;
    float patterns[0];
} GPUPatternData;
Using this "feature", I have created vectors for OpenCL.
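For reference, the standard C99 spelling of this zero-sized-array trick is a flexible array member, with a single allocation sized for the header plus the payload. A sketch of the idea (n is the number of floats):

typedef struct GPUPatternDataContatiner
{
    int nodeInput, nodeOutput, patternCount, offest;
    float patterns[]; // C99 flexible array member
} GPUPatternData;

// one contiguous block: fixed fields followed by n floats
GPUPatternData *gpd = malloc(sizeof(GPUPatternData) + n * sizeof(float));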