sending struct array to cuda kernel - c

I'm working on a project and I have to send a struct array to a CUDA kernel. The struct also contains an array. To test it I have written a simple program.
struct Point {
    short x;
    short *y;
};
my kernel code:
__global__ void addKernel(Point *a, Point *b, Point *c)
{
    int i = threadIdx.x;
    c[i].x = a[i].x + b[i].x;
    for (int j = 0; j < 4; j++){
        c[i].y[j] = a[i].y[j] + a[i].y[j];
    }
}
my main code:
int main()
{
    const int arraySize = 4;
    const int arraySize2 = 4;
    short *ya, *yb, *yc;
    short *dev_ya, *dev_yb, *dev_yc;
    Point *a;
    Point *b;
    Point *c;
    Point *dev_a;
    Point *dev_b;
    Point *dev_c;
    size_t sizeInside = sizeof(short) * arraySize2;
    ya = (short *)malloc(sizeof(short) * arraySize2);
    yb = (short *)malloc(sizeof(short) * arraySize2);
    yc = (short *)malloc(sizeof(short) * arraySize2);
    ya[0] = 1; ya[1] = 2; ya[2] = 3; ya[3] = 4;
    yb[0] = 2; yb[1] = 3; yb[2] = 4; yb[3] = 5;
    size_t sizeGeneral = (sizeInside + sizeof(short)) * arraySize;
    a = (Point *)malloc( sizeGeneral );
    b = (Point *)malloc( sizeGeneral );
    c = (Point *)malloc( sizeGeneral );
    a[0].x = 2; a[0].y = ya;
    a[1].x = 2; a[1].y = ya;
    a[2].x = 2; a[2].y = ya;
    a[3].x = 2; a[3].y = ya;
    b[0].x = 4; b[0].y = yb;
    b[1].x = 4; b[1].y = yb;
    b[2].x = 4; b[2].y = yb;
    b[3].x = 4; b[3].y = yb;
    cudaMalloc((void**)&dev_a, sizeGeneral);
    cudaMalloc((void**)&dev_b, sizeGeneral);
    cudaMalloc((void**)&dev_c, sizeGeneral);
    cudaMemcpy(dev_a, a, sizeGeneral, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeGeneral, cudaMemcpyHostToDevice);
    addKernel<<<1, 4>>>(dev_a, dev_b, dev_c);
    cudaError_t err = cudaMemcpy(c, dev_c, sizeGeneral, cudaMemcpyDeviceToHost);
    printf("{%d-->%d,%d,%d,%d} \n err= %d",c[0].x,c[0].y[0],c[1].y[1],c[1].y[2],c[2].y[3], err);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}
It seems the CUDA kernel is not working. I can access the struct's 'x' member, but I cannot access the 'y' array. What can I do to access the 'y' array? Thanks in advance.

When you send this struct array to the kernel, the short and the pointer-to-short are copied, but the pointer still points to host memory, not device memory. This is crucial. For a plain value such as x this works, because the kernel gets its own copy of the struct. So when you call the kernel you have moved x and y themselves to the device, but not the area y points to. That part you have to do manually: allocate device space for it and update the y pointer so it points to device memory before you copy the struct array over.
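For example, for a it could look roughly like this (a sketch reusing the names from the question; error checking omitted, and b and c need the same treatment; note the struct array itself is arraySize * sizeof(Point) bytes):
/* device copy of the inner array that a[0..3].y share */
short *dev_ya2;
cudaMalloc((void**)&dev_ya2, arraySize2 * sizeof(short));
cudaMemcpy(dev_ya2, ya, arraySize2 * sizeof(short), cudaMemcpyHostToDevice);

/* patch the host-side structs so their y members hold device pointers,
   then copy the struct array itself */
for (int i = 0; i < arraySize; i++)
    a[i].y = dev_ya2;
cudaMalloc((void**)&dev_a, arraySize * sizeof(Point));
cudaMemcpy(dev_a, a, arraySize * sizeof(Point), cudaMemcpyHostToDevice);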

You are not passing the array to the device. You can either make the array a part of the struct, by defining it like this:
struct {
    short normalVal;
    short inStructArr[4];
};
Or pass the array into the device memory and update the pointer in the struct.
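With the embedded-array version, a single cudaMemcpy of the struct array moves both fields for every element. A minimal sketch (the struct name PointArr and the host array hostArr are hypothetical, only for illustration):
struct PointArr {
    short normalVal;
    short inStructArr[4];
};

PointArr hostArr[4];               /* filled on the host */
PointArr *devArr;
cudaMalloc((void**)&devArr, 4 * sizeof(PointArr));
/* one copy moves normalVal and the embedded array for every element */
cudaMemcpy(devArr, hostArr, 4 * sizeof(PointArr), cudaMemcpyHostToDevice);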

Related

Python C Extension

I am having issues returning a 2D array from a C extension back to Python. When I allocate memory using malloc the returned data is rubbish. When I just initialise an array like sol_matrix[nt][nvar] the returned data is as expected.
#include <Python.h>
#include <numpy/arrayobject.h>
#include <math.h>
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
// function to be solved by Euler solver
double func (double xt, double y){
double y_temp = pow(xt, 2);
y = y_temp;
return y;
}
static PyObject* C_Euler(double h, double xn)
{
double y_temp, dydx; //temps required for solver
double y_sav = 0; //temp required for solver
double xt = 0; //starting value for xt
int nvar = 2; //number of variables (including time)
int nt = xn/h; //timesteps
double y = 0; //y starting value
//double sol_matrix[nt][nvar]; //works fine
double **sol_matrix = malloc(nt * sizeof(double*)); //doesn't work
for (int i=0; i<nt; ++i){
sol_matrix[i] = malloc (nvar * sizeof(double));
}
int i=0;
//solution loop - Euler method.
while (i < nt){
sol_matrix[i][0]=xt;
sol_matrix[i][1]=y_sav;
dydx = func(xt, y);
y_temp = y_sav + h*dydx;
xt = xt+h;
y_sav=y_temp;
i=i+1;
}
npy_intp dims[2];
dims[0] = nt;
dims[1] = 2;
//Create Python object to copy solution array into, get pointer to
//beginning of array, memcpy the data from the C colution matrix
//to the Python object.
PyObject *newarray = PyArray_SimpleNew(2, dims, NPY_DOUBLE);
double *p = (double *) PyArray_DATA(newarray);
memcpy(p, sol_matrix, sizeof(double)*(nt*nvar));
// return array to Python
return newarray;
}
static PyObject* Euler(PyObject* self, PyObject* args)
{
double h, xn;
if (!PyArg_ParseTuple(args, "dd", &h, &xn)){
return NULL;
}
return Py_BuildValue("O", C_Euler(h,xn));
}
Could you provide any guidance on where I am going wrong?
Thank you.
The data in sol_matrix is not in contiguous memory, it's in nt separately allocated arrays. Therefore the line
memcpy(p, sol_matrix, sizeof(double)*(nt*nvar));
is not going to work.
I'm not a big fan of pointer-to-pointer arrays, so I believe your best option is to allocate sol_matrix as one big chunk:
double *sol_matrix = malloc(nt*nvar * sizeof(double));
This does mean you can't do 2D indexing, so you will need to do
// OLD: sol_matrix[i][0]=xt;
sol_matrix[i*nvar + 0] = xt;
In contrast
double sol_matrix[nt][nvar]; //works fine
is a single big chunk of memory so the copy works fine.
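Put together with the loop in the question, the change looks roughly like this (a sketch reusing the question's names; error checking omitted):
double *sol_matrix = malloc(nt * nvar * sizeof(double)); /* one contiguous block */

for (int i = 0; i < nt; i++) {
    sol_matrix[i * nvar + 0] = xt;    /* was sol_matrix[i][0] */
    sol_matrix[i * nvar + 1] = y_sav; /* was sol_matrix[i][1] */
    /* ... Euler step as before ... */
}

/* the block now has the same layout as the NumPy array, so this copy is valid */
memcpy(p, sol_matrix, sizeof(double) * nt * nvar);
free(sol_matrix);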

How to allocate array starting negative index

I am trying to allocate a 3D array u[-nx/2:nx/2-1][-nx/2:nx/2-1][-nx/2:nx/2-1]
int nx = 512;
double *** u = (double ***)malloc(nx * sizeof(double**));
for (int i = -nx/2; i < nx/2; i++) {
    u[i] = (double **)malloc(nx * sizeof(double *));
    for (int j = -nx/2; j < nx/2; j++) {
        u[i][j] = (double *)malloc(nx * sizeof(double));
    }
}
Is this a correct way to do it? If it's not, how should I change it?
No, that’s not correct. You can get it to work by placing every pointer in the middle of the dimension it represents:
int nx = 512;
double*** u = (double***)malloc(nx * sizeof(double**)) + nx/2;
for (int i = -nx/2; i < nx/2; i++) {
    u[i] = (double**)malloc(nx * sizeof(double*)) + nx/2;
    for (int j = -nx/2; j < nx/2; j++) {
        u[i][j] = (double*)malloc(nx * sizeof(double)) + nx/2;
    }
}
but that’s unusual and confusing, does a lot of separate allocations, and has to be undone for the deallocation step.
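For completeness, freeing that version means subtracting the offsets again before each free (a sketch matching the allocation above):
for (int i = -nx/2; i < nx/2; i++) {
    for (int j = -nx/2; j < nx/2; j++)
        free(u[i][j] - nx/2);   /* undo the + nx/2 added at allocation */
    free(u[i] - nx/2);
}
free(u - nx/2);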
Consider one block with accessors instead:
#define NX 512
/* or just double* if nx is dynamic, and calculate the index manually */
double (*u)[NX][NX] = malloc(NX * sizeof *u);   /* one contiguous NX*NX*NX block */

double array_get(double (*u)[NX][NX], int i, int j, int k) {
    return u[i + NX/2][j + NX/2][k + NX/2];
}

void array_set(double (*u)[NX][NX], int i, int j, int k, double value) {
    u[i + NX/2][j + NX/2][k + NX/2] = value;
}
No.
An array in C is just a plain, flat memory block: indexing is always 0-based and the underlying storage is always one-dimensional.
Suppose you need a 3D array with arbitrary bounds, say u[lb_1d..ub_1d][lb_2d..ub_2d][lb_3d..ub_3d]; you will then need to map the 3D index space onto the flat 1D address space (and back).
A sample implementation looks like this:
typedef struct
{
    double* _arr;
    int _lb_1d;
    int _ub_1d;
    int _lb_2d;
    int _ub_2d;
    int _lb_3d;
    int _ub_3d;
} DoubleArr3D;

DoubleArr3D* create_3d_arr(int lb_1d, int ub_1d, int lb_2d, int ub_2d, int lb_3d, int ub_3d)
{
    int array_size = (ub_1d - lb_1d + 1) * (ub_2d - lb_2d + 1) * (ub_3d - lb_3d + 1);
    DoubleArr3D* arr = (DoubleArr3D*)malloc(sizeof(DoubleArr3D));
    if (!arr)
    {
        return NULL;
    }
    arr->_lb_1d = lb_1d;
    arr->_ub_1d = ub_1d;
    arr->_lb_2d = lb_2d;
    arr->_ub_2d = ub_2d;
    arr->_lb_3d = lb_3d;
    arr->_ub_3d = ub_3d;
    arr->_arr = (double*)malloc(sizeof(double) * (size_t)array_size);
    if (!arr->_arr)
    {
        free(arr);
        return NULL;
    }
    return arr;
}

// arr[i1d, i2d, i3d] ==> arr_get_at(arr, i1d, i2d, i3d)
double arr_get_at(DoubleArr3D* arr, int i1d, int i2d, int i3d)
{
    if (!arr || !arr->_arr)
    {
        // just a demo of validation; real code should report a meaningful error
        return 0;
    }
    return arr->_arr[
        i3d - arr->_lb_3d
        + (i2d - arr->_lb_2d) * (arr->_ub_3d - arr->_lb_3d + 1)
        + (i1d - arr->_lb_1d) * (arr->_ub_2d - arr->_lb_2d + 1) * (arr->_ub_3d - arr->_lb_3d + 1)
    ];
}
First off, all C arrays have index values ranging from 0 to ELEMENT_COUNT-1. Always.
As you are using malloc, I am presuming that the value of nx is only defined at runtime. This rules out static array sizes and thus rules out using the cute arr[x][y][z] syntax as in:
#define NX 512
double arr[NX][NX][NX];
void foo(void)
{
...
arr[z1][y1][x1] += 2 * arr[z2][y2][x2];
...
}
That in turn means that to get the functionality of a 3D array with nx values in each of its three dimensions, you will need to allocate a linear memory area of size nx_cubed = nx * nx * nx. To calculate nx_cubed properly, you will need to check for integer overflow.
Also, you need to properly convert from signed int coordinate values to unsigned size_t values used in the 0 based index ranges.
if (nx < 0) {
fprintf(stderr, "negative value of nx\n");
exit(EXIT_FAILURE);
}
const size_t unx = nx;
const size_t nx_cubed = unx * unx * unx;
/* TODO: Complete check for overflows */
if (nx_cubed < unx) {
fprintf(stderr, "nx_cubed overflow\n");
exit(EXIT_FAILURE);
}
Then you can allocate a memory buffer of the appropriate size, and then check that the malloc call has actually worked.
double *buf = malloc(nx_cubed * sizeof(double));
if (!buf) {
fprintf(stderr, "Error allocating memory for nx_cubed elements\n");
exit(EXIT_FAILURE);
}
Now there is the question of calculating the array index from your x, y, and z values, each ranging from -nx/2 to nx/2-1. I recommend writing a function which maps that range to the 0 to nx-1 range, and then calculates the proper linear index from the three 0-based values. Again, proper integer overflow checks should be performed.
size_t array3index(const size_t nx, const int x, const int y, const int z) {
const size_t half_nx = nx/2;
/* zero based 3D coordinates,
* this probably triggers some signedness warnings */
const size_t x0 = half_nx + x;
const size_t y0 = half_nx + y;
const size_t z0 = half_nx + z;
if ((x0 >= nx) || (y0 >= nx) || (z0 >= nx)) {
fprintf(stderr, "Signed coordinate(s) out of range: (%d, %d, %d)\n",
x, y, z);
exit(EXIT_FAILURE);
}
const size_t idx = nx * (nx * z0 + y0) + x0;
/* Assuming that we have already checked that nx*nx*nx does not
* overflow, and given that we have checked for x0, y0, z0 to be
* in the range of 0 to (nx-1), the idx calculation should not
* have overflown here. */
return idx;
}
Then you can do your accesses to the 3D array like
const size_t i1 = array3index(nx, x1, y1, z1);
const size_t i2 = array3index(nx, x2, y2, z2);
buf[i1] += 2 * buf[i2];
Considering the amount of calculations needed inside array3index, I would examine whether it makes more sense to do the array iteration in the 0 to nx-1 domain directly, and only convert that to -nx/2 to nx/2-1 range values if you actually need that value within a calculation.
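For illustration, a sketch of what that direct iteration could look like, reusing unx, nx and buf from above (what you store is of course up to your calculation):
for (size_t z0 = 0; z0 < unx; z0++)
    for (size_t y0 = 0; y0 < unx; y0++)
        for (size_t x0 = 0; x0 < unx; x0++) {
            const int x = (int)x0 - nx/2;  /* signed coordinate, only if the formula needs it */
            const size_t idx = unx * (unx * z0 + y0) + x0;
            buf[idx] = (double)x;          /* example value */
        }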

multithreading and parameters mixup

I have a function which behaves correctly when called by a single thread (either by calling it directly, or via CreateThread() / WaitForSingleObject() calls), but it seems to go haywire when invoked by multiple CreateThread() calls followed by a WaitForMultipleObjects() call.
From the extensive debugging I have tried, it looks as if some of the variables passed as parameters to the function being called are not kept isolated between threads and instead share the same storage (example below). Here's a summary with some details of the problem:
First, I define a type to hold all the parameters for the function every thread needs to call:
typedef struct {
tDebugInfo DebugParms; int SampleCount; double** Sample; double** Target; double** a; double** F; double** dF; double** prevF; double** prevdF; double*** W; double*** prevW; double*** prevdW; double* e; double* dk; double* dj; double* dj2; double* sk; double* sk2; double* adzev21; double* prevadzev21; double** UW10; double* ro10e; double** dW10d; double** A; double** B; double** C; double** D; double** E; double** G; double** ET; double** AB; double** ABC; double** ABCD; double** ABCDE; double** ABCDH; double** ABCDHG; double** SABCDE; double** SABCDHG; double** I; double** J; double** M; double** x; double** xT; double* xU; double** dW10; int DataSetId; int TestId; int PredictionLen; double* Forecast; double ScaleM; double ScaleP; NN_Parms* ElmanParms; int DP[2][10];} tTrainParams;
I then allocate an array of structures to hold each thread's set of parameters:
HANDLE* HTrain = (HANDLE*)malloc(DatasetsCount*sizeof(HANDLE));
tTrainParams* tp = (tTrainParams*)malloc(DatasetsCount * sizeof(tTrainParams));
DWORD tid = 0; LPDWORD th_id = &tid;
Then, I set function parameters for each thread:
tp[d].ElmanParms = pElmanParams; tp[d].SampleCount = SampleCount; tp[d].Sample = SampleData_Scaled[d]; tp[d].Target = TargetData_Scaled[d]; tp[d].a = a; tp[d].F = F; tp[d].dF = dF; tp[d].prevF = prevF; tp[d].prevdF = prevdF; tp[d].W = W; tp[d].prevW = prevW; tp[d].prevdW = prevdW; tp[d].e = e; tp[d].dk = dk; tp[d].dj = dj; tp[d].dj2 = dj2; tp[d].sk = sk; tp[d].sk2 = sk2; tp[d].adzev21 = adzev21; tp[d].prevadzev21 = prevadzev21; tp[d].UW10 = UW10; tp[d].ro10e = ro10e; tp[d].dW10d = dW10d; tp[d].A = A; tp[d].B = B; tp[d].C = C; tp[d].D = D; tp[d].E = E; tp[d].G = G; tp[d].ET = ET; tp[d].AB = AB; tp[d].ABC = ABC; tp[d].ABCD = ABCD; tp[d].ABCDE = ABCDE; tp[d].ABCDH = ABCDH; tp[d].ABCDHG = ABCDHG; tp[d].SABCDE = SABCDE; tp[d].SABCDHG = SABCDHG; tp[d].I = I; tp[d].J = J; tp[d].M = M; tp[d].x = x; tp[d].xT = xT; tp[d].xU = xU; tp[d].dW10 = dW10; tp[d].DebugParms = pDebugParms; tp[d].ElmanParms = pElmanParams; tp[d].PredictionLen = pPredictionLen; tp[d].Forecast = ForecastData[d]; tp[d].ScaleM = ScaleM[d]; tp[d].ScaleP = ScaleP[d]; tp[d].TestId = pTestId; tp[d].DataSetId = d;
Then, I call a wrapper function GetForecastFromTraining(tTrainParams* parms) for each thread, having set in advance the relevant parameters in the "tp" structure array:
HTrain[d] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)GetForecastFromTraining, &tp[d], 0, th_id);
Finally, I call WaitForMultipleObjects():
WaitForMultipleObjects(DatasetsCount, HTrain, TRUE, INFINITE);
What happens inside GetForecastFromTraining() for most variables (apparently arrays only) is that whenever one thread changes the value of one array element (say, W[0][0][0]), the new value becomes current inside all the other threads, too. This, of course, screws up all the calculations that are being made across all threads, and looks to me to be contrary to the whole segregation story across threads.
One hint of what's going on is that, when I look at "Parallel Watch" debugging window inside VS2013, I see that W has the same address across all the threads (hence the same values); however, &W is different for each thread. Other non-array variables seem to behave fine. Finally, I double-checked the /MTd flag in the compiler option, and it is there.
I'm quite lost on this. Any suggestion?
P.S.: Here is a streamlined version of my program, which displays the same problematic behaviour. In this example, breaking execution after the Sleep(1000) line shows that the a1, a2 and G variables each correctly contain the thread id, while F is the same for all threads.
#include <Windows.h>
#include <stdio.h>
#define MAX_THREADS 5
HANDLE h[MAX_THREADS];
typedef struct{
int a1;
int a2;
double* F;
double G[5];
} tMySumParms;
void MySum(tMySumParms* p){
int tid = GetCurrentThreadId();
Sleep(200);
p->a1 = tid;
p->a2 = -tid;
p->F[0] = tid;
p->F[1] = -tid;
p->G[0] = tid;
p->G[1] = -tid;
Sleep(1000);
}
extern "C" __declspec(dllexport) int GetKaz(){
LPDWORD t = NULL;
tMySumParms* p = (tMySumParms*)malloc(MAX_THREADS*sizeof(tMySumParms));
HANDLE* h = (HANDLE*)malloc(MAX_THREADS*sizeof(HANDLE));
double G[5];
double* F = (double*)malloc(5 * sizeof(double));
for (int i = 0; i < MAX_THREADS; i++){
p[i].a1 = 1;
p[i].a2 = 2 ;
p[i].F = F;
memcpy(p[i].G, G, 5 * sizeof(double));
h[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)MySum, &p[i], 0, t);
}
WaitForMultipleObjects(MAX_THREADS, h, TRUE, INFINITE);
return 0;
}
W is declared as double*** in the parameter struct; later in the question you say you use it as W[0][0][0]. So W is an array of pointers to arrays of pointers to arrays of doubles.
My guess is that one of those layers is common for all threads.
To confirm this theory, and to make sure it is not a concurrency problem but a data structure problem, I would create a simple single-threaded test function as follows:
Fill the array intended for thread 1 with 1.0
Then fill the array for thread 2 with 2.0
Check the values for thread 1.
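Using the p array from the streamlined example in the question, that test could be as simple as (a sketch, run without starting any threads):
for (int k = 0; k < 5; k++) p[0].F[k] = 1.0;   /* "thread 1" data */
for (int k = 0; k < 5; k++) p[1].F[k] = 2.0;   /* "thread 2" data */
printf("%f\n", p[0].F[0]);   /* prints 2.000000 in the current code, because every p[i].F points at the same buffer */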
The streamlined version shows the problem: The F array is allocated once and each thread gets a pointer to this single array. So if one thread updates the array, all the others see the changes.
double* F = (double*)malloc(5 * sizeof(double)); // one array!
for (int i = 0; i < MAX_THREADS; i++){
...
p[i].F = F; // all threads use the same array!
Change it to:
for (int i = 0; i < MAX_THREADS; i++){
...
p[i].F = (double*)malloc(5 * sizeof(double)); // each thread has its own array
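With per-thread buffers, remember to release them once the wait returns (a sketch; handle cleanup added as well):
WaitForMultipleObjects(MAX_THREADS, h, TRUE, INFINITE);
for (int i = 0; i < MAX_THREADS; i++){
    CloseHandle(h[i]);
    free(p[i].F);   /* release each thread's private buffer */
}
free(p);
free(h);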

Sending 2D array to Cuda Kernel

I'm having a bit of trouble understanding how to send a 2D array to CUDA. I have a program that parses a large file with 30 data points on each line. I read about 10 rows at a time and then create a matrix of rows and items, so in my example of 10 rows with 30 data points it would be int list[10][30]. My goal is to send this array to my kernel and have each block process a row (I have gotten this to work perfectly in normal C, but CUDA has been a bit more challenging).
Here's what I'm doing so far, but no luck (note: sizeOfBuckets = rows, and sizeOfBucketsHoldings = items in a row... I know I should win an award for odd variable names):
int list[sizeOfBuckets][sizeOfBucketsHoldings]; //this is created at the start of the file and I can confirmed its filled with the correct data
#define sizeOfBuckets 10 //size of buckets before sending to process list
#define sizeOfBucketsHoldings 30
//Cuda part
//define device variables
int *dev_current_list[sizeOfBuckets][sizeOfBucketsHoldings];
//time to malloc the 2D array on device
size_t pitch;
cudaMallocPitch((int**)&dev_current_list, (size_t *)&pitch, sizeOfBucketsHoldings * sizeof(int), sizeOfBuckets);
//copy data from host to device
cudaMemcpy2D( dev_current_list, pitch, list, sizeOfBuckets * sizeof(int), sizeOfBuckets * sizeof(int), sizeOfBucketsHoldings * sizeof(int),cudaMemcpyHostToDevice );
process_list<<<count,1>>> (sizeOfBuckets, sizeOfBucketsHoldings, dev_current_list, pitch);
//free memory of device
cudaFree( dev_current_list );
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, int pitch) {
int tid = blockIdx.x;
for (int r = 0; r < sizeOfBuckets; ++r) {
int* row = (int*)((char*)current_list + r * pitch);
for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
int element = row[c];
}
}
}
The error I'm getting is:
main.cu(266): error: argument of type "int *(*)[30]" is incompatible with parameter of type "int *"
1 error detected in the compilation of "/tmp/tmpxft_00003f32_00000000-4_main.cpp1.ii".
line 266 is the kernel call process_list<<<count,1>>> (count, countListItem, dev_current_list, pitch); I think the problem is that I am declaring my array in the kernel as int *, but how else can I declare it? In my pure C code I use int current_list[num_of_rows][num_items_in_row], which works, but I can't get the same approach to work in CUDA.
My end goal is simple: I just want each block to process a row (sizeOfBuckets) and then loop through all items in that row (sizeOfBucketsHoldings). I originally just did a normal cudaMalloc and cudaMemcpy but it wasn't working, so I looked around and found out about cudaMallocPitch and cudaMemcpy2D (neither of which is in my CUDA by Example book), and I have been trying to study examples, but they seem to give me the same error (I'm currently reading the CUDA C Programming Guide; I found this idea on page 22, but still no luck). Any ideas or suggestions of where to look?
Edit:
To test this, I just want to add the values of each row together (I copied the logic from the CUDA by Example array addition example).
My kernel:
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, size_t pitch, int *total) {
//TODO: we need to flip the list as well
int tid = blockIdx.x;
for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
total[tid] = total + current_list[tid][c];
}
}
Here's how I declare the total array in my main:
int *dev_total;
cudaMalloc( (void**)&dev_total, sizeOfBuckets * sizeof(int) );
You have some mistakes in your code.
When you copy the host array to the device, you should pass a one-dimensional host pointer; see the cudaMemcpy2D function signature.
You also don't need to declare a static 2D array for the device memory. That declares a static array in host memory, and then you have cudaMallocPitch re-create it as a device allocation; keep in mind the device buffer must be a plain one-dimensional (pitched) allocation, too. See the cudaMallocPitch function signature.
This example should help you with memory allocation:
__global__ void process_list(int sizeOfBucketsHoldings, int* total, int* current_list, size_t pitch)
{
    int tid = blockIdx.x;
    total[tid] = 0;
    for (int c = 0; c < sizeOfBucketsHoldings; ++c)
    {
        total[tid] += *((int*)((char*)current_list + tid * pitch) + c);
    }
}
int main()
{
    size_t sizeOfBuckets = 10;
    size_t sizeOfBucketsHoldings = 30;
    size_t width = sizeOfBucketsHoldings * sizeof(int); // needs to be in bytes
    size_t height = sizeOfBuckets;
    int* list = new int [sizeOfBuckets * sizeOfBucketsHoldings]; // one dimensional
    for (int i = 0; i < sizeOfBuckets; i++)
        for (int j = 0; j < sizeOfBucketsHoldings; j++)
            list[i * sizeOfBucketsHoldings + j] = i;
    size_t pitch_h = sizeOfBucketsHoldings * sizeof(int); // always in bytes
    int* dev_current_list;
    size_t pitch_d;
    cudaMallocPitch((void**)&dev_current_list, &pitch_d, width, height);
    int *test;
    cudaMalloc((void**)&test, sizeOfBuckets * sizeof(int));
    int* h_test = new int[sizeOfBuckets];
    cudaMemcpy2D(dev_current_list, pitch_d, list, pitch_h, width, height, cudaMemcpyHostToDevice);
    process_list<<<10, 1>>>(sizeOfBucketsHoldings, test, dev_current_list, pitch_d);
    cudaDeviceSynchronize();
    cudaMemcpy(h_test, test, sizeOfBuckets * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < sizeOfBuckets; i++)
        printf("%d %d\n", i, h_test[i]);
    return 0;
}
To access your 2D array in the kernel you should use the pattern base_addr + y * pitch_d + x.
WARNING: the pitch is always in bytes, so you need to cast your pointer to char* before doing the pointer arithmetic.
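If you also need the (possibly modified) matrix back on the host, the same pitched copy works in the opposite direction (a sketch reusing the names above):
cudaMemcpy2D(list, pitch_h, dev_current_list, pitch_d, width, height, cudaMemcpyDeviceToHost);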

Getting value from a dynamic allocated 2d array by pointers

I have filled a dynamically allocated float multi-dimensional array in one function.
A second function has to read the values of the array through a pointer to the array's first element, which is defined in the former function.
The second function does not access the correct memory locations, so it doesn't work; it does work, however, if the multi-dimensional array is defined statically.
Does somebody know why?
eval_cell should get the values defined in div_int:
float f_imp(float x, float y){
return pow(x,2)+pow(y,2)-1;
}
int eval_cell(float* p){
int s[4];
s[0] = f_imp(*p, *(p+1)) <= 0;
printf("%f %f\n",*p, *(p+1));
s[1] = f_imp(*(p+3), *(p+4)) <= 0;
printf("%f %f\n",*(p+3), *(p+4));
s[2] = f_imp(*(p+9), *(p+10)) <= 0;
printf("%f %f\n",*(p+9), *(p+10));
s[3] = f_imp(*(p+6), *(p+7)) <= 0;
printf("%f %f\n",*(p+6), *(p+7));
printf("%d%d%d%d\n",s[0],s[1],s[2],s[3]);
return s[0];
}
void div_int(float* x1, float* y1,float* x2,float* y2,
float* f0, float* f2,float* f6,float* f8){
int i,j,m;
float* p;
float** a_cell; // array 9x3 contente coordinate vertici e valore funzione
*a_cell = (float**) malloc(9*sizeof(float*));
for (i=0;i<9;i++){
a_cell[i] = (float*) malloc(3*sizeof(float));
}
a_cell[0][0] = *x1;
a_cell[0][1] = *y1;
a_cell[0][2] = *f0;
a_cell[2][0] = *x2;
a_cell[2][1] = *y1;
a_cell[2][2] = *f2;
a_cell[6][0] = *x1;
a_cell[6][1] = *y2;
a_cell[6][2] = *f6;
a_cell[8][0] = *x2;
a_cell[8][1] = *y2;
a_cell[8][2] = *f8;
/*** calcolo dei valori incogniti di a_cell ***/
a_cell[1][0] = (*x1+*x2)/2;
a_cell[1][1] = *y1;
a_cell[1][2] = f_imp(a_cell[1][0], a_cell[1][1]);
a_cell[3][0] = *x1;
a_cell[3][1] = (*y1+*y2)/2;
a_cell[3][2] = f_imp(a_cell[3][0], a_cell[3][1]);;
a_cell[4][0] = (*x2+*x1)/2;
a_cell[4][1] = (*y2+*y1)/2;
a_cell[4][2] = f_imp(a_cell[4][0], a_cell[4][1]);
a_cell[5][0] = *x2;
a_cell[5][1] = (*y2+*y1)/2;
a_cell[5][2] = f_imp(a_cell[5][0], a_cell[5][1]);
a_cell[7][0] = (*x1+*x2)/2;
a_cell[7][1] = *y2;
a_cell[7][2] = f_imp(a_cell[7][0], a_cell[7][1]);
for (j=0;j<2;j++){
m = j*3;
for(i=0;i<2;i++){
m += i;
eval_cell(&a_cell[m][0]);
}
}
p = *a_cell;
for (i=0;i<9;i++){
for (j=0;j<3;j++){
printf("%f \n",*(p+3*i+j));
printf("%f \n",a_cell[i][j]);
printf("\n");
}
}
free(a_cell);
return;
}
It's because you are using the pointer in an incorrect way:
a_cell is a pointer to a dynamic array of 9 pointers, each pointing to a dynamic array of 3 floats.
So when you do eval_cell(&a_cell[m][0]) (or just eval_cell(a_cell[m]), which is exactly the same), you actually get a pointer to an array of only 3 floats. And after that you do:
int eval_cell(float* p){
    ...
    s[2] = f_imp(*(p+9), *(p+10)) <= 0;
*(p+9) reads the 9th element of an array of 3 floats, so this is incorrect.
It works in the static case because a static multi-dimensional array is a single contiguous one-dimensional block for which the compiler generates the multi-dimensional indexing. That's why with the static array you happen to address valid memory.
If you want a completely dynamic matrix (2d array), you have to make your own element access function:
double *
make_array (unsigned int rows, unsigned int cols)
{
    return malloc (rows * cols * sizeof (double));
}

double *
array_element (double *a, unsigned int cols, unsigned int i, unsigned int j)
{
    return a + i * cols + j;
}

#define A(i,j) (*array_element ((a), (cols), (i), (j)))

double *a;
unsigned int rows = 10, cols = 20;   /* example sizes */
a = make_array (rows, cols);
A(3,4) = 3.14;
printf ("%f\n", A(3,4));
EDIT:
In your program
*a_cell = (float**) malloc(9*sizeof(float*));
should be
a_cell = (float**) malloc(9*sizeof(float*));
And likewise for
p = *a_cell;
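Since eval_cell indexes the cell as one contiguous 9x3 block (p+3, p+9, ...), another option is to drop the pointer-to-pointer layer entirely. A sketch of that layout (names follow the question; filling the remaining rows is elided):
float (*a_cell)[3] = malloc(9 * sizeof *a_cell);  /* 9 rows of 3 floats, contiguous */

a_cell[0][0] = *x1;   /* same 2D indexing as before */
a_cell[0][1] = *y1;
a_cell[0][2] = *f0;
/* ... fill the remaining rows as in div_int ... */

eval_cell(&a_cell[0][0]);  /* the offsets p+3, p+9, ... now land on real elements */
free(a_cell);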
