OpenCL: type casting on GPU

OpenCL: type casting on GPU - c

I store my data in a char array, and I need to read float and int variables from there.
This code works fine on CPU:
global float *p;
p = (global float*)get_pointer_to_the_field(char_array, index);
*p += 10;
But on GPU I get the error -5: CL_OUT_OF_RESOURCES. The reading itself works, but doing something with the value (adding 10 in this case) causes the error. How could I fix it?
Update:
This works on GPU:
float f = *p;
f += 10;
However, I still can't write this value back to the array.
Here is the kernel:
global void write_value(global char *data, int tuple_pos, global char *field_value,
int which_field, global int offsets[], global int *num_of_attributes) {
int tuple_size = offsets[*num_of_attributes];
global char *offset = data + tuple_pos * tuple_size;
offset += offsets[which_field];
memcpy(offset, field_value, (offsets[which_field+1] - offsets[which_field]));
}
global char *read_value(global char *data, int tuple_pos,
int which_field, global int offsets[], global int *num_of_attributes) {
int tuple_size = offsets[*num_of_attributes];
global char *offset = data + tuple_pos * tuple_size;
offset += offsets[which_field];
return offset;
}
kernel void update_single_value(global char* input_data, global int* pos, global int offsets[],
global int *num_of_attributes, global char* types) {
int g_id = get_global_id(1);
int attr_id = get_global_id(0);
int index = pos[g_id];
if (types[attr_id] == 'f') { // if float
global float *p;
p = (global float*)read_value(input_data, index, attr_id, offsets, num_of_attributes);
float f = *p;
f += 10;
//*p += 10; // not working on GPU
}
else if (types[attr_id] == 'i') { // if int
global int *p;
p = (global int*)read_value(input_data, index, attr_id, offsets, num_of_attributes);
int i = *p;
i += 10;
//*p += 10;
}
else { // if char
write_value(input_data, index, read_value(input_data, index, attr_id, offsets, num_of_attributes), attr_id, offsets, num_of_attributes);
}
}
It updates values of a table's tuples, int and float are increased by 10, char fields are just replaced with the same content.

Are you enabling the byte_addressable_store extension? As far as I'm aware, bytewise writes to global memory aren't well-defined in OpenCL unless you enable this. (You'll need to check if the extension is supported by your implementation.)
You might also want to consider using the "correct" type in the kernel argument - this might help the compiler produce more efficient code. If the type can vary dynamically, you could perhaps try using a union type (or union fields in a struct type), although I haven't tested this with OpenCL myself.

It turned out that the problem occurs because the int and float values in the char array aren't 4 bytes aligned. When I'm doing writes to addresses like
offset = data + tuple_pos*4; // or 8, 16 etc
everything works fine. However, the following causes the error:
offset = data + tuple_pos*3; // or any other number not divisible by 4
This means that either I should change the whole design and store the values somehow else, or add "empty" bytes to the char array to make int and float values 4 bytes aligned (which isn't a really good solution).

Related

Simple memory allocation for embedded application

I need to have dynamic memory allocation without the use of malloc in an embedded application.
As we have enough RAM, I would like to allocate a big array (A[A_MAX]) on startup and have buffer arrays (B[]) within this one big array.
They would always begin at A[0] and be as long as they need (at max A_MAX, but mostly shorter).
Example:
at start up:
A[8] = {0,0,0,0,0,0,0,0};
declaring B[4] at A[0] and filling B with ones
B[4] = {1,1,1,1};
now A should look like this:
A[8] = {1,1,1,1,0,0,0,0}
How can I do this?
I looked at the sourcecode for malloc() and it does return the address of the next free part of the heap.
but when I use
float *B[4] = &A[0];
my compiler returns an error (invalid initializer)

As #neon mentioned I wanted to write an address to a float array.
Writing it like float *B = A[0] worked like a charm.

You may do:
#include <stdalign.h>
#define A_MAX 8
alignas(max_align_t) char _memory[A_MAX] = {0};
then you can just use pointers:
int main() {
{
float *floatdata = (void*)_memory;
// use floatdata
}
{
int *intdata = (void*)_memory;
// use intdata
}
}
I would write a handy function to calculate the maximum size for error checking:
// query the max size and allocate
inline void *dataalloc(size_t elemsize, size_t *n) {
*n = sizeof(_data) / elemsize;
return _data;
}
and do:
size_t size;
float *floatdata = dataalloc(sizeof(*floatdata), &size);
// use as-if floatdata[size]
for (size_t i = 0; i < size; ++i) floatdata[i] = i;

do you mean something like ? I'm using pointers to the array
#include <stdio.h>
#include <stdlib.h>
void allocator(float* src,int SrcSize,float **dst,int size,int pos){
if(size <= SrcSize)
for(int i=0;i<size;i++){
dst[i] = &(src+pos)[i];
}
}
int main()
{
float A[8] = {0,0,0,0,0,0,0,0};
float *B[4];
float *C[4];
allocator(A,sizeof(A)/sizeof(float),B,4,0);
*B[2] = 4;
allocator(A,sizeof(A)/sizeof(float),C,4,4);
*C[0] = 6;
printf("A[2] value is : %f\n" , A[2]); // output is 4.000
printf("A[4] value is : %f\n" , A[4]); // output is 6.000
return 0;
}

Multiple Flexible Array Member or VLA with Shared Memory and Semaphores

I need to define a struct with two semaphores and three(at the least) or maybe more arrays as members of the struct whose size are variables. Indicative example ( not the right syntax but to give a contextual meaning ; lr is typedef for double) :
int nx = 400,ny = 400,nz = 400;
struct ShMem {
sem_t prod;
sem_t cons;
lr u_x[nx+1][ny+2][nz+2];
lr u_y[nx+2][ny+1][nz+2];
lr u_z[nx+2][ny+2][nz+2];
};
What I need to do is to make the struct ShMem as a shared memory block between two codes aka producer and consumer which compute and read this memory block with the help of the semaphores present in the struct.
Since the array size are variables and will be defined in runtime how do i get a 3 Dimensional variable length array ?
Comment :
If lets say I have nx, ny and nz #defined to 400 I follow the following step ( already tested )
#define nx (400)
#define ny (400)
#define nz (400)
struct ShMem {
sem_t prod;
sem_t cons;
lr u_x[nx+1][ny+2][nz+2];
lr u_y[nx+2][ny+1][nz+2];
lr u_z[nx+2][ny+2][nz+2];
};
...
// shared memory allocation
ShmID = shmget(ShmKEY, sizeof(struct Shmem), IPC_CREAT|0666);
...
Additional requirement is that for the application I do need those arrays as 3D arrays such that I can index them as u_x[i][j][k], whre i, j, k are indices in the x, y, and z-direction respectively.
Edit after Lundin and Felix solution.
CONSTRAINT - u_x, u_y and u_z needs to be a 3D array/*** pointer which is accessed by u_x[i][j][k] - This can't be changed since this is a legacy code. The arrays need to be set such that the sanctity of the access is maintained. Everywhere in the code it is accessed like that.

As already discussed in the comments, C doesn't support something like that. So, you will have to build it yourself. A simple example using a macro to make the "3D access" inside the structure readable could look like this:
#include <stdlib.h>
typedef int lr;
struct foo {
size_t ny;
size_t nz;
lr *u_y;
lr *u_z;
lr u_x[];
};
#define val(o, a, x, y, z) ((o).a[(o).ny * (o).nz * x + (o).nz * y + z])
struct foo *foo_create(size_t nx, size_t ny, size_t nz)
{
size_t arrsize = nx * ny * nz;
struct foo *obj = malloc(sizeof *obj + 3 * arrsize * sizeof *(obj->u_x));
if (!obj) return 0;
obj->ny = ny;
obj->nz = nz;
obj->u_y = obj->u_x + arrsize;
obj->u_z = obj->u_y + arrsize;
return obj;
}
int main(void)
{
struct foo *myFoo = foo_create(10, 10, 10);
// set u_y[9][5][2] in *myFoo to 42:
val(*myFoo, u_y, 9, 5, 2) = 42;
free(myFoo);
}
This uses the single FAM at the end of the struct supported by C, so you can allocate such a struct in a single block. To place it in shared memory, just replace the malloc() and use the same calculations for the size.

You have to build something like this
struct ShMem {
int some_stuff_here;
size_t x[3];
size_t y[3];
size_t z[3];
int array[];
};
And then ignore that the flexible array member type is a plain int array. Instead do something like
size_t size = sizeof( int[x1][y1][z1] ) +
sizeof( int[x2][y2][z2] ) +
sizeof( int[x3][y3][z3] );
ShMem* shmem = malloc(sizeof *shmem + size);
And then when accessing, you use an array pointer type instead of the int[]. The code turns a bit nasty to read:
for(size_t i=0; i<3; i++)
{
typedef int(*arr_t)[shmem->y[i]][shmem->z[i]]; // desired array pointer type
int some_offset = ... // calculate based on previously used x y z
arr_t arr = (arr_t)shmem->(array + some_offset); // cast to the array pointer type
for(size_t x=0; x<shmem->x[i]; x++)
{
for(size_t y=0; y<shmem->y[i]; y++)
{
for(size_t z=0; z<shmem->z[i]; z++)
{
arr[x][y][z] = something;
}
}
}
}
This is actually well-defined behavior since the data allocated with malloc doesn't have an effective type until you access it.
"some_offset" in the above example could be a counter variable or something stored inside the struct itself.

Aligning 12-byte struct within a warp - cuda

Lets say we have a struct of 3 integers which is not aligned:
struct data {
int x;
int y;
int z;
};
I pass an array of this struct to kernel. I'm aware that I should pass struct of array instead of array of struct but this is not important for this question.
32 threads inside a warp, access memory in coalesced manner (i to i + 31) which equals total memory of 384 bytes. 384 bytes is multiple of L1 cache line (128 bytes) which means three memory transaction of 128-byte each.
Now if we have an aligned struct:
struct __align__(16) aligned_data {
int x;
int y;
int z;
};
if access patterns remains the same as previous example, then it would fetch 512 bytes of memory which is 4 memory transaction each requesting 128-byte.
So this means is first example more efficient or second one is still more efficient although it fetches more memory.

The only real way to answer a question is by benchmarking. And if you do, you may not get the same answer depending on your hardware. When I run this:
#define NITER (128)
struct data {
int x;
int y;
int z;
};
struct __align__(16) aligned_data {
int x;
int y;
int z;
};
template<typename T, int niter>
__global__
void kernel(T *in, int *out, int dowrite=0)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int nthreads = blockDim.x * gridDim.x;
int oval = 0;
#pragma unroll
for(int i=0; i<niter; ++i,tid+=nthreads) {
T val = in[tid];
oval += val.x + val.y + val.z;
}
if (dowrite) {
out[tid] = oval;
}
}
template __global__ void kernel<data,NITER>(data *, int*, int);
template __global__ void kernel<aligned_data,NITER>(aligned_data *, int*, int);
int main()
{
const int bs = 512;
const int nb = 32;
const int nvals = bs * nb * NITER;
data *d_; cudaMalloc((void **)&d_, sizeof(data) * size_t(nvals));
aligned_data *ad_; cudaMalloc((void **)&ad_, sizeof(aligned_data) * size_t(nvals));
for(int i=0; i<10; ++i) {
kernel<data,NITER><<<nb, bs>>>(d_, (int *)0, 0);
kernel<aligned_data,NITER><<<nb, bs>>>(ad_, (int *)0, 0);
cudaDeviceSynchronize();
}
cudaDeviceReset();
return 0;
}
I see that the aligned structure version gives overall higher performance on a compute 5.2 capability device:
Time(%) Time Calls Avg Min Max Name
52.71% 2.3995ms 10 239.95us 238.10us 241.79us void kernel<data, int=128>(data*, int*, int)
47.29% 2.1529ms 10 215.29us 214.91us 215.51us void kernel<aligned_data, int=128>(aligned_data*, int*, int)
In this case I would assumed that the roughly 10% improvement is down to the lower number of load instructions which are issued. In the unaligned case the compiler issues three 32 bit loads to fetch the structure, whereas in the aligned case the compiler issues a single 128 bit load to fetch the structure. The reduction in instructions seems to offset the 25% wasted net memory bandwidth. On other hardware with different memory instruction throughput to bandwdith ratios, the result might well be different.

How can I initialize similar structs with the same function?

I have some structs that start with a void *buffer and the next members can be of different types:
struct A {
void *buffer;
type_x *x;
type_y *y;
};
struct B {
void *buffer;
type_w *w;
type_y *y;
type_z *z;
};
The buffer from struct A will store n elements of type type_x, followed by n elements of type_y. The other members, type_x *x and type_y *y, will point to those arrays, respectively. Analogous for struct B.
What I'm currently doing is something like this:
void allocate(struct B **b, unsigned int n) {
(*b)->buffer = calloc(n * (sizeof(type_w) + sizeof(type_y) + sizeof(type_z));
(*b)->w = (type_w *) (*b)->buffer;
(*b)->y = (type_y *) ((*b)->w + n);
(*b)->z = (type_z *) ((*b)->y + n);
}
Is there any way to create a function to achieve this? The function should receive as arguments a pointer to one of these structs (void *s) and an int, like this:
void allocate(void *s, unsigned int n) {
// MAGIC
}
Some other options I've considered:
Create void allocate(void *buffer, int n, ...) and give it pointers to the pointers of the struct. The problem with this is that I have to give it void * pointers, so I would have to give it the size of every type as well.
Create void *create_struct(StructType type) (where StructType is an enum) but I would have to code the case for every struct anyway and I want to be able to define new structs and not have to write aditional code.
I'm trying to do this as I will have many structs, and because the allocate function does basically the same thing for every struct I thought there may be a "cleaner" way to do it.
Also, I know that I can just remove the buffer and allocate memory directly to all the members, but I want to do it this way so data is stored contiguously.

There is no generic way to do this that doesn't play fast and loose with type safety. This means, technically, a truly generic solution will result in undefined behavior. If I was forced to implement something like this, I would have to assume I could treat the incoming structure pointer as an array of pointers. And the size of each type would need to be passed in. Ignoring alignment issues, some untested code:
void allocate(void *sp, size_t n, ... /* terminate with 0 */) {
void **sv = sp;
size_t arg, total = 0;
size_t args = 0;
va_list ap;
va_start(ap, n);
while ((arg = va_arg(ap, size_t)) != 0) {
total += arg;
++args;
}
va_end(ap);
*sv = calloc(...);
sv[1] = sv[0];
va_start(ap, n);
while (--args > 0) {
++sv;
sv[1] = (char *)sv[0] + va_arg(ap, size_t);
}
va_end(ap);
}
allocate(a, n, sizeof(type_x), sizeof(type_y), (size_t)0);
allocate(b, n, sizeof(type_w), sizeof(type_y), sizeof(type_z), (size_t)0);
Clearly hacky and ugly.
The better solution should really be to create a separate allocator function for each type. However, you can create a macro to aid in the automatic generation of the allocator. More untested code follows:
#define CREATE_ALLOCATOR(Type, X_Fields) \
void allocate_##Type (struct Type *sp, size_t n) { \
_Pragma("pop_macro(\"X\")") \
size_t total = 0 \
X_Fields \
; \
void *p; \
sp->buffer = calloc(sizeof(*sp) + total); \
p = sp->buffer; \
_Pragma("pop_macro(\"X\")") \
X_Fields \
; \
}
#include "create_allocator_helper.h"
CREATE_ALLOCATOR(A, X(x) X(y))
#include "create_allocator_helper.h"
CREATE_ALLOCATOR(B, X(w) X(y) X(z))
Where the helper header file defines and pushes some X macro definitions used by the CREATE_ALLOCATOR macro:
#ifdef X
#undef X
#endif
#define X(A) ; sp->A = p; p = sp->A + n
#pragma push_macro("X")
#undef X
#define X(A) + sizeof(sp->A)
#pragma push_macro("X")
#undef X

If you know the set of possible ns at compile time, you can let each (member set)/(array size) combination be its own type, and use convenience macros to refer to the correct ones.
#include <stddef.h>
/*
We put a type marker at the beginning of each struct. There won't be padding before the first member, and all the types
start with a struct Type, so we can do `(struct Type*)&unknown_structure` and be guaranteed to have a valid object that
tells us what type the rest of it is.
In practice, I'd probably use X-Macros to generate an enum containing all the types instead of using strings, to
make comparison faster
*/
struct Type { char *type; size_t n; };
/* We define what types of arrays each structure contains. Since the struct contains the arrays themselves
instead of pointers to them, the memory will be contiguous, +/- a bit of padding. */
#define DECL_A(N) struct A_##N { struct Type type; char x[N]; double y[N]; }
#define DECL_B(N) struct B_##N { struct Type type; size_t n; int x[N]; float y[N]; char z[N]; }
/*
Declare a struct and initialize the type and n members. This one just
declares a local variable, but we could make a malloc version easily enough.
*/
#define CREATE_STRUCT(NAME, TYPE, N) struct TYPE##_##N NAME = { .type = { #TYPE, N} }
/* We declare all struct type/size combinations we'll use */
DECL_A(42);
DECL_A(100);
DECL_B(30);
int main(void) {
int i;
CREATE_STRUCT(foo, A, 42);
CREATE_STRUCT(bar, A, 100);
CREATE_STRUCT(baz, B, 30);
return 0;
}

Here's an alternative that doesn't require n to be known at compile time.
Note that I am not completely sure about the legality of this, but I'm
reasonably sure it's valid. The key idea here is that if p points to a
properly aligned memory block for type T, then ((char*)p) + sizeof(T) * N must also point
to properly aligned memory as long as it falls within an allocated block.
As long as that's true, I'm pretty sure this must necessarily be legal, since the union
of buffer and Alignment_Hack guarantees that buffer[0] is aligned properly for all types.
Even if it's legal, it's still a bit of a hack, so I'm not entirely sure I'd recommend it, but it's a potential option.
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
/* This MUST contain all types that might be stored in the arrays */
union Alignment_Hack {
short hd;
int d;
char c;
float hf;
double f;
};
/* This struct is used for *all* structures. The structure-specific details get specified later */
struct Variable_Structure {
size_t num_arrays;
void **arrays;
union {
union Alignment_Hack *hack;
char *buffer;
} u; //u.buffer[0] is guaranteed to be properly aligned for anything in the union
};
//Here's where the details for a specific struct (struct A {short x; double y; int z; }) are specified.
size_t sizes_A[] = { sizeof(short), sizeof(double), sizeof(int) };
void create_structure(struct Variable_Structure *p, const size_t array_count, size_t *sizes, unsigned nvars) {
size_t offsets[nvars];//in bytes (NOTE: requires C99)
unsigned i;
offsets[0] = 0;
for (i = 1; i < nvars; i++) {
//offsets[i] must be an even multiple of sizes[i] and must also be past the end of the last array
size_t min = offsets[i - 1] + sizes[i - 1] * array_count;
size_t mod = min % sizes[i];
//offsets[i] = min_p such that p >= min and p % sizes[i] == 0
offsets[i] = (min - mod) + (mod ? sizes[i] : 0);// (min - mod) and (min - mod + sizes[i]) are the two possible starting points
/* Visualization of the transition from TYPE_A[] to TYPE_B[], showing where everything's pointing:
min (end of TYPE_A array)
V
... | TYPE_A | TYPE_A | TYPE_A |
... | TYPE_B | TYPE_B | TYPE_B | TYPE_B |
^ ^
min - mod (min - mod) + sizes[i] */
assert(offsets[i] >= min);//doesn't overlap previous array
assert(offsets[i] <= min + sizes[i]);//doesn't include more padding than necessary
assert(0 == offsets[i] % sizes[i]);//alignment correct
}
size_t bufsiz = offsets[nvars - 1] + sizes[nvars - 1] * array_count;
//Skipping error checking for brevity
p->num_arrays = nvars;
p->u.buffer = malloc(bufsiz);
p->arrays = malloc(sizeof(void*) * nvars);
for (i = 0; i < nvars; i++) {
p->arrays[i] = p->u.buffer + offsets[i];
}
}
void free_structure(struct Variable_Structure *p) {
free(p->arrays);
free(p->u.buffer);
}
int main(void) {
struct Variable_Structure a;
size_t n = 42;
create_structure(&a, n, sizes_A, sizeof(sizes_A)/sizeof(*sizes_A));
unsigned i;
for (i = 0; i < n; i++) {
//We could always set up some macros or something so we could say, e.g., A(x, i) instead of ((short*)a.arrays[0])[i]
((short*)a.arrays[0])[i] = i;
((double*)a.arrays[1])[i] = i;
((int*)a.arrays[2])[i] = i;
printf("%hd %f %d\n",
((short*)a.arrays[0])[i],
((double*)a.arrays[1])[i],
((int*)a.arrays[2])[i]);
}
printf("SIZES: %zu %zu %zu\n", sizeof(short), sizeof(double), sizeof(int));
printf("OFFSETS: %p %p %p\n", a.arrays[0], a.arrays[1], a.arrays[2]);
free_structure(&a);
return 0;
}

How to read 3 bytes as a whole number?

How do I read 3 bytes from unsigned char buffer at once (as a whole number)?
uint_24 p = *(unsigned char[3])buffer;
The above code doesn't work.

If the buffer can be redefined as part of a union and integer endian is as expected:
union {
unsigned char buffer[3];
uint_24 p;
} x;
foo(x.buffer); // somehow data is loaded into x.buffer
uint_24 destination = x.p; // read: let compiler do the work
By putting into a union, alignment issues are satisfied.

The short answer is: you can't (unless the machine int size is 3 bytes).
As machines generally have an even number of bytes as its int size (word size, register size), the hardware architecture will always fetch an even number of bytes from memory over the bus into its registers, or can fetch one single byte into a (lower) register. Hence the solutions provided in the comments to your question load a byte, shift it left and load the next byte etc. Alternatively you can fetch a word and AND-out the upper byte(s). You must also take the endianness into account. Lastly, not all machines can read ints starting at odd memory addersses, or they require them to be alligned at some even multiple.

you can copy any number of bytes that you want as following:
#include <stdio.h>
void showbits(int n)
{
int i,k,andmask;
for(i=31;i>=0;i--)
{
andmask = 1 << i;
k = n & andmask;
k == 0 ? printf("0") : printf("1");
}
printf("\n");
}
int main()
{
unsigned char buff[] = {'a',0,0,
0,'b',0,
0,0,'c'};
//'a'=97=01100001
//'b'=98=01100010
//'c'=99=01100011
void * src_ptr= (void *) buff;
int i;
for(i = 0 ; i < sizeof(buff) ; i += 3)
{
int num = 0 ;
void * num_ptr = &num;
memcpy(num_ptr , src_ptr , 3);
showbits(num);
src_ptr += 3;
}
return 0;
}
output:
00000000000000000000000001100001 00000000000000000110001000000000
00000000011000110000000000000000

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight

OpenCL: type casting on GPU - c

Related

Simple memory allocation for embedded application

Multiple Flexible Array Member or VLA with Shared Memory and Semaphores

Aligning 12-byte struct within a warp - cuda

How can I initialize similar structs with the same function?

How to read 3 bytes as a whole number?

Categories

Resources