#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define BLOCK_SIZE 6
#define GRID_SIZE 1
// Questioner's kernel: element-wise C = A + B over a BLOCK_SIZE x BLOCK_SIZE
// matrix, one thread per element.
// NOTE: the array parameters decay to pointers (int (*)[BLOCK_SIZE]) — see the
// discussion below. There is no bounds check on i/j, so the launch
// configuration must cover exactly BLOCK_SIZE x BLOCK_SIZE threads.
__global__ void test(int A[BLOCK_SIZE][BLOCK_SIZE], int B[BLOCK_SIZE][BLOCK_SIZE], int C[BLOCK_SIZE][BLOCK_SIZE]) {
// row index (y dimension of the launch)
int i = blockIdx.y * blockDim.y + threadIdx.y;
// column index (x dimension of the launch)
int j = blockIdx.x * blockDim.x + threadIdx.x;
C[i][j] = A[i][j] + B[i][j];
}
// Questioner's original main(), kept verbatim — the answers below explain
// why it fails. The bugs are annotated with BUG comments.
int main(){
// Host input/output matrices on the stack (contiguous storage).
int A[BLOCK_SIZE][BLOCK_SIZE];
int B[BLOCK_SIZE][BLOCK_SIZE];
int C[BLOCK_SIZE][BLOCK_SIZE];
for (int i = 0; i<BLOCK_SIZE; i++)
for (int j = 0; j<BLOCK_SIZE; j++){
A[i][j] = i + j;
B[i][j] = i + j;
}
// BUG: these are host stack arrays, not device pointers. They should be
// declared as pointers (e.g. int* or int (*)[BLOCK_SIZE]) to receive the
// device allocations below.
int dev_A[BLOCK_SIZE][BLOCK_SIZE];
int dev_B[BLOCK_SIZE][BLOCK_SIZE];
int dev_C[BLOCK_SIZE][BLOCK_SIZE];
// BUG: cudaMalloc expects the address of a pointer variable; passing the
// address of a 2D host array makes it write the returned device pointer
// over the first bytes of that array — undefined behavior.
cudaMalloc((void**)&dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMalloc((void**)&dev_A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMalloc((void**)&dev_B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
cudaMemcpy(dev_A, A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_B, B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // so your threads are BLOCK_SIZE*BLOCK_SIZE, 36 in this case
dim3 dimGrid(GRID_SIZE, GRID_SIZE); // 1*1 blocks in a grid
test <<<dimGrid, dimBlock >>> (dev_A, dev_B, dev_C);
cudaDeviceSynchronize();
cudaMemcpy(C, dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
// NOTE(review): no CUDA error checking anywhere, and C is never printed or
// verified before main returns — the reported -858993460 (0xCCCCCCCC, the
// MSVC uninitialized-stack pattern) means the copies/launch never succeeded.
}
I tried to copy the code from the question "How to use 2D Arrays in CUDA?".
Some website tell me to use something like
result[row*WIDTH + col] = array1[row*WIDTH + col] + array2[row*WIDTH + col];
but I don't know how to use it.
My solution is always -858993460
There are two main issues to your code:
Firstly, when you define an array within function scope like this:
int dev_A[BLOCK_SIZE][BLOCK_SIZE];
This creates an array of arrays in host memory which is stored contiguously on the stack. This array can be used straight away from host code without further allocating any memory for it. This is a real C array and not a pointer. While this is fine and correct for A, B and C, this will not suffice for your declarations of dev_A, dev_B and dev_C, as you require memory allocated on the device for these.
There are a couple of ways to correct this. One way is to instead use a pointer to an array of arrays of ints. The syntax for such a declaration is as follows:
int (*dev_A)[BLOCK_SIZE][BLOCK_SIZE];
If you go by this approach, I would recommend changing your cudaMalloc and cudaMemcpy calls as follows:
cudaMalloc((void **) &dev_A, sizeof *dev_A);
// ...
cudaMemcpy(dev_A, &A, sizeof *dev_A, cudaMemcpyHostToDevice);
The difference here is that using sizeof *dev_A is the same as writing sizeof(int [BLOCK_SIZE][BLOCK_SIZE]), which gives the number of bytes taken up by the entire host array, and using &A instead of A, since &A gives a pointer to an array of arrays, while A decays to a pointer to an array. Technically what you already have should evaluate to the exact same values, since the size of an array is equal to the size of its elements multiplied by its length, and also a pointer to an array points to the same address as the first element in that array, however it would be more correct and consistent with how you would use cudaMalloc and cudaMemcpy with any other non-array type, and rightly treats the array of arrays as one single value:
int A, *dev_A;
cudaMalloc((void **) &dev_A, sizeof *dev_A);
cudaMemcpy(dev_A, &A, sizeof *dev_A, cudaMemcpyHostToDevice);
The other approach would be to dynamically allocate memory for multiple contiguous int [BLOCK_SIZE]s rather than a single int [BLOCK_SIZE][BLOCK_SIZE], which could be done as follows:
int (*dev_A)[BLOCK_SIZE];
// ...
cudaMalloc((void **) &dev_A, sizeof *dev_A * BLOCK_SIZE);
// ...
cudaMemcpy(dev_A, A, sizeof *dev_A * BLOCK_SIZE, cudaMemcpyHostToDevice);
This means dev_A now represents a pointer to an array of BLOCK_SIZE ints which is the first element of a sequence of BLOCK_SIZE contiguous arrays in memory. Notice how this time, A is used for cudaMemcpy rather than &A, as A's int [BLOCK_SIZE][BLOCK_SIZE] type decays to int (*)[BLOCK_SIZE] which matches the type of dev_A. Technically speaking, all the approaches mentioned so far do exactly the same thing and pass the same numerical values to the cudaMalloc and cudaMemcpy functions, however, the type of dev_A, dev_B and dev_C is important for how the arrays are used later.
The second issue with your code is in the signature of the test kernel function itself. This function has parameters declared like int A[BLOCK_SIZE][BLOCK_SIZE], however, in C (and C++), when you declare an array parameter in a function, it is instead adjusted to actually be a pointer to the array's element type. So int A[N] as a function parameter actually declares int *A, and the size is ignored. In the case of arrays of arrays, such as int A[N][M], this is converted to int (*A)[M], which means your parameters are int (*)[BLOCK_SIZE] (pointer to an array of BLOCK_SIZE ints) and your function currently has the following effective signature:
__global__
void test(int (*A)[BLOCK_SIZE],
int (*B)[BLOCK_SIZE],
int (*C)[BLOCK_SIZE])
If you stick with this function signature, then if you follow the approach of making dev_A and friends of type int (*)[BLOCK_SIZE], then your code should work as is, as the expression A[i][j] in your function first locates and dereferences the ith array after the address A, and then this array value decays into an int * pointer, and then the jth int after this address is accessed. However if you take the approach of declaring your device pointers as int (*dev_A)[BLOCK_SIZE][BLOCK_SIZE], then you will either have to dereference these pointers when calling your kernel like so (which should be fine as the dereferenced array immediately decays into a pointer so device memory should not be accessed from host code):
test<<<dimGrid, dimBlock>>>(*dev_A, *dev_B, *dev_C);
Or alternatively, the signature of the test function can be changed as follows:
__global__
void test(int (*A)[BLOCK_SIZE][BLOCK_SIZE],
int (*B)[BLOCK_SIZE][BLOCK_SIZE],
int (*C)[BLOCK_SIZE][BLOCK_SIZE])
When doing so however, these pointers-to-arrays must be first dereferenced before accessing their data, so your code within your function will have to be changed as follows:
(*C)[i][j] = (*A)[i][j] + (*B)[i][j];
Using plain C arrays, arrays of arrays, pointers to arrays, and pointers to arrays of arrays can have quite confusing semantics, and also requires your array's size to be fixed at compile-time, so you may prefer instead of using any of these approaches to use a single linear sequence of ints, and then index the elements yourself, for example:
void test(int *A)
{
A[row * BLOCK_SIZE + col] = 123;
}
Device memory for this can easily be allocated as follows:
int *dev_A;
cudaMalloc((void **) &dev_A, sizeof *dev_A * BLOCK_SIZE * BLOCK_SIZE);
An important note is that CUDA code is not C and is actually C++, however your code and the code discussed in this answer is both valid C and C++ (ignoring CUDA extensions). This may create some additional obstacles when writing C-like code, for example having to explicitly cast void * values to other pointer types, but also allows you to make use of useful C++ features such as operator overloading, as featured in talonmies's answer, to encapsulate addressing a 2D grid of values within a single linear buffer of data (so you can write A(row, col) instead of A[row * BLOCK_SIZE + col]).
There is a lot wrong with the code you posted, and most of it probably related to the ambiguous way that C and related languages deal with statically declared multidimensional arrays and the [][] style indexing scheme it supports.
Rather than describe all the required fixes I will just leave this here:
#include <stdio.h>
#define BLOCK_SIZE 6
#define GRID_SIZE 1
// Minimal non-owning 2D view over a linear buffer.
// p points to the first element; lda ("leading dimension") is the row stride
// in elements, so element (i,j) lives at p[i * lda + j]. Usable from both
// host and device code.
template<typename T>
struct array2D
{
T* p;
int lda;
// Wrap an existing buffer; cols becomes the row stride (lda).
__device__ __host__
array2D(T* _p, int cols) : p(_p), lda(cols) {}
// Element access: A(i,j) == p[i*lda + j].
__device__ __host__
T& operator()(int i, int j) { return p[i * lda + j]; }
// Const overload; still yields a mutable T& because const qualifies the
// pointer member, not the pointed-to data.
__device__ __host__
T& operator()(int i, int j) const { return p[i * lda + j]; }
};
// Element-wise matrix addition via the array2D accessor:
// C(row,col) = A(row,col) + B(row,col), one thread per element.
// No bounds check, so the launch must cover the matrix exactly.
__global__ void test(array2D<int> A, array2D<int> B, array2D<int> C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    C(row, col) = A(row, col) + B(row, col);
}
int main(){
// Host matrices; A and B are filled so that A[i][j] + B[i][j] == 2*(i+j).
int A[BLOCK_SIZE][BLOCK_SIZE];
int B[BLOCK_SIZE][BLOCK_SIZE];
int C[BLOCK_SIZE][BLOCK_SIZE];
for (int i = 0; i<BLOCK_SIZE; i++) {
for (int j = 0; j<BLOCK_SIZE; j++){
A[i][j] = i + j;
B[i][j] = i + j;
}
}
// Device buffers are plain linear allocations of BLOCK_SIZE*BLOCK_SIZE ints;
// no 2D types involved on the device side.
int* dev_A; cudaMalloc((void**)&dev_A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
int* dev_B; cudaMalloc((void**)&dev_B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
int* dev_C; cudaMalloc((void**)&dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int));
// A 2D host array is contiguous, so one flat memcpy of the whole matrix works.
cudaMemcpy(dev_A, A, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_B, B, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyHostToDevice);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // so your threads are BLOCK_SIZE*BLOCK_SIZE, 36 in this case
dim3 dimGrid(GRID_SIZE, GRID_SIZE); // 1*1 blocks in a grid
// Wrap each raw device pointer in an array2D view with row stride BLOCK_SIZE.
test <<<dimGrid, dimBlock >>> (array2D<int>(dev_A, BLOCK_SIZE),
array2D<int>(dev_B, BLOCK_SIZE),
array2D<int>(dev_C, BLOCK_SIZE));
cudaDeviceSynchronize();
cudaMemcpy(C, dev_C, BLOCK_SIZE * BLOCK_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
// Print each GPU result next to the host-computed expected value in braces.
for (int i = 0; i<BLOCK_SIZE; i++) {
for (int j = 0; j<BLOCK_SIZE; j++){
printf("(%d,%d) = %d {%d}\n", i, j, C[i][j], A[i][j] + B[i][j]);
}
}
// NOTE(review): example omits cudaFree and error checking for brevity.
return 0;
}
The most important feature of the code is the use of a tiny wrapper class which provides you with the (i,j) style indexing you apparently want without any complexity in the kernel code. At this point you don't even need to understand how it works, just accept that it provides you with the necessary indexing mechanism you want within the kernel and use it.
If you compile and run the code like so:
$ nvcc --std=c++11 myfirstpony.cu -o myfirstpony
$ ./myfirstpony
(0,0) = 0 {0}
(0,1) = 2 {2}
(0,2) = 4 {4}
(0,3) = 6 {6}
(0,4) = 8 {8}
(0,5) = 10 {10}
(1,0) = 2 {2}
(1,1) = 4 {4}
(1,2) = 6 {6}
(1,3) = 8 {8}
(1,4) = 10 {10}
(1,5) = 12 {12}
(2,0) = 4 {4}
(2,1) = 6 {6}
(2,2) = 8 {8}
(2,3) = 10 {10}
(2,4) = 12 {12}
(2,5) = 14 {14}
(3,0) = 6 {6}
(3,1) = 8 {8}
(3,2) = 10 {10}
(3,3) = 12 {12}
(3,4) = 14 {14}
(3,5) = 16 {16}
(4,0) = 8 {8}
(4,1) = 10 {10}
(4,2) = 12 {12}
(4,3) = 14 {14}
(4,4) = 16 {16}
(4,5) = 18 {18}
(5,0) = 10 {10}
(5,1) = 12 {12}
(5,2) = 14 {14}
(5,3) = 16 {16}
(5,4) = 18 {18}
(5,5) = 20 {20}
You can see for yourself the correctness of the result.
Related
I dynamically allocated memory for 3D array of pointers. My question is how many pointers do I have? I mean, do I have X·Y number of pointers pointing to an array of double or X·Y·Z pointers pointing to a double element or is there another variant?
double*** arr;
arr = (double***)calloc(X, sizeof(double));
for (int i = 0; i < X; ++i) {
*(arr + i) = (double**)calloc(Y, sizeof(double));
for (int k = 0; k < Y; ++k) {
*(*(arr+i) + k) = (double*)calloc(Z, sizeof(double));
}
}
The code you apparently intended to write would start:
double ***arr = calloc(X, sizeof *arr);
Notes:
Here we define one pointer, arr, and set it to point to memory provided by calloc.
Using sizeof (double) with this is wrong; arr is going to point to things of type double **, so we want the size of that. The sizeof operator accepts either types in parentheses or objects. So we can write sizeof *arr to mean “the size of a thing that arr will point to”. This always gets the right size for whatever arr points to; we never have to figure out the type.
There is no need to use calloc if we are going to assign values to all of the elements. We can use just double ***arr = malloc(X * sizeof *arr);.
In C, there is no need to cast the return value of calloc or malloc. Its type is void *, and the compiler will automatically convert that to whatever pointer type we assign it to. If the compiler complains, you are probably using a C++ compiler, not a C compiler, and the rules are different.
You should check the return value from calloc or malloc in case not enough memory was available. For brevity, I omit showing the code for that.
Then the code would continue:
for (ptrdiff_t i = 0; i < X; ++i)
{
arr[i] = calloc(Y, sizeof *arr[i]);
…
}
Notes:
Here we assign values to the X pointers that arr points to.
ptrdiff_t is defined in stddef.h. You should generally use it for array indices, unless there is a reason to use another type.
arr[i] is equivalent to *(arr + i) but is generally easier for humans to read and think about.
As before sizeof *arr[i] automatically gives us the right size for the pointer we are setting, arr[i].
Finally, the … in there is:
for (ptrdiff_t k = 0; k < Y; ++k)
arr[i][k] = calloc(Z, sizeof *arr[i][k]);
Notes:
Here we assign values to the Y pointers that arr[i] points to, and this loop is inside the loop on i that executes X times, so this code assigns XY pointers in total.
So the answer to your question is we have 1 + X + XY pointers.
Nobody producing good commercial code uses this. Using pointers-to-pointers-to-pointers is bad for the hardware (meaning inefficient in performance) because the processor generally cannot predict where a pointer points to until it fetches it. Accessing some member of your array, arr[i][j][k], requires loading three pointers from memory.
In most C implementations, you can simply allocate a three-dimensional array:
double (*arr)[Y][Z] = calloc(X, sizeof *arr);
With this, when you access arr[i][j][k], the compiler will calculate the address (as, in effect, arr + (i*Y + j)*Z + k). Although that involves several multiplications and additions, they are fairly simple for modern processors and are likely as fast or faster than fetching pointers from memory and they leave the processor’s load-store unit free to fetch the actual array data. Also, when you are using the same i and/or j repeatedly, the compiler likely generates code that keeps i*Y and/or (i*Y + j)*Z around for multiple uses without recalculating them.
Well, short answer is: it is not known.
As a classic example, keep in mind the main() prototype
int main( int argc, char** argv);
argc keeps the number of pointers. Without it we do not know how many they are. The system builds the array argv, gently updates argc with the value and then launches the program.
Back to your array
double*** arr;
All you know is that
arr is a pointer.
*arr is double**, also a pointer
**arr is double*, also a pointer
***arr is a double.
What you will get in code depends on how you build this. A common way if you need an array of arrays and things like that is to mimic the system and use a few unsigned and wrap them all with the pointers into a struct like
typedef struct
{
int n_planes;
int n_rows;
int n_columns;
double*** array;
} Cube;
A CSV file for example is char ** **, a sheet workbook is char ** ** ** and it is a bit scary, but works. For each ** a counter is needed, as said above about main()
A C example
The code below uses arr, declared as double***, to
store a pointer to a pointer to a pointer to a double
prints the value using the 3 pointers
then uses arr again to build a cube of X*Y*Z doubles, using a bit of math to set values to 9XY9.Z9
the program uses 2, 3 and 4 for a total of 24 values
lists the full array
list the first and the very last element, arr[0][0][0] and arr[X-1][Y-1][Z-1]
frees the whole thing in reverse order
The code
#include <stdio.h>
#include <stdlib.h>
// A 3D block of doubles bundled with its dimensions. A bare double*** carries
// no size information, so the counters must travel alongside the pointer.
typedef struct
{
int n_planes;    // X: number of planes
int n_rows;      // Y: rows per plane
int n_columns;   // Z: columns per row
double*** array; // array[plane][row][column]
} Cube;
int print_array(double***, int, int, int);
int main(void)
{
    // double*** is just a pointer: demonstrate by chaining
    // pointer -> pointer -> double and reading the value back each way.
    double sample = 20.21;
    double* pDouble = &sample;
    double** ppDouble = &pDouble;
    double*** arr = &ppDouble;
    printf("***arr is %.2ff\n", ***arr);
    printf("original double is %.2ff\n", sample);
    printf("*pDouble is %.2ff\n", *pDouble);
    printf("**ppDouble is %.2ff\n", **ppDouble);
    // but we can build a cube of XxYxZ doubles for arr
    int X = 2;
    int Y = 3;
    int Z = 4; // 24 elements
    arr = (double***)malloc(X * sizeof(double**));
    // now each arr[i] must point to an array of double**
    for (int i = 0; i < X; i += 1)
    {
        arr[i] = (double**)malloc(Y * sizeof(double*));
        for (int j = 0; j < Y; j += 1)
        {
            arr[i][j] = (double*)malloc(Z * sizeof(double));
            for (int k = 0; k < Z; k += 1)
            {
                // each value encodes its own coordinates: 9XY9.Z9
                arr[i][j][k] = (100. * i) + (10. * j) + (.1 * k) + 9009.09;
            }
        }
    }
    print_array(arr, X, Y, Z);
    // FIX: the format string previously read "arr[%d][%d[%d]" (missing ']').
    printf("\n\
Test: first element is arr[%d][%d][%d] = %6.2f (9XY9.Z9)\n\
last element is arr[%d][%d][%d] = %6.2f (9XY9.Z9)\n",
        0, 0, 0, arr[0][0][0],
        (X-1), (Y-1), (Z-1), arr[X-1][Y-1][Z-1]
    );
    // now free this monster in reverse order of allocation
    for (int x = 0; x < X; x += 1)
    {
        for (int y = 0; y < Y; y += 1)
        {
            free(arr[x][y]); // the Z rows
        }
        free(arr[x]); // the plane Y
    }
    free(arr); // the initial pointer
    return 0;
} // main()
// Print an I x J x K cube of doubles, one labelled plane at a time,
// each row of K values on its own line. Always returns 0.
int print_array(double*** block, int I, int J, int K)
{
    int plane, row, col;
    for (plane = 0; plane < I; plane++)
    {
        printf("\nPlane %d\n\n", plane);
        for (row = 0; row < J; row++)
        {
            for (col = 0; col < K; col++)
                printf("%6.2f ", block[plane][row][col]);
            printf("\n");
        }
    }
    return 0;
} // print_array()
The output
***arr is 20.21f
original double is 20.21f
*pDouble is 20.21f
**ppDouble is 20.21f
Plane 0
9009.09 9009.19 9009.29 9009.39
9019.09 9019.19 9019.29 9019.39
9029.09 9029.19 9029.29 9029.39
Plane 1
9109.09 9109.19 9109.29 9109.39
9119.09 9119.19 9119.29 9119.39
9129.09 9129.19 9129.29 9129.39
Test: first element is arr[0][0][0] = 9009.09 (9XY9.Z9)
last element is arr[1][2][3] = 9129.39 (9XY9.Z9)
I am trying to write an OpenCL code to do element-wise operations on multi-dimensional arrays.
I know that OpenCL buffers are flattened, which makes indexing a bit tricky. I succeeded when dealing with 2-dimensional arrays, but for 3+ dimensional arrays, I have either indexing errors or the wrong result.
It is all the more surprising so that I use the same indexing principle/formula as in the 2D case.
2D case:
__kernel void test1(__global int* a, __global int* b, __global int* c, const int height) {
int i = get_global_id(0);
int j = get_global_id(1);
c[i + height * j] = a[i + height * j] + b[i + height * j];
}
Correct.
3D case:
__kernel void test1(__global int* a, __global int* b, __global int* c, const int dim1, const int dim2) {
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int idx = i + dim1 * j + dim1 * dim2 * k;
c[idx] = a[idx] + b[idx];
}
Wrong result (usually an output buffer filled with values very close to 0).
4D case:
__kernel void test1(__global int* a, __global int* b, __global int* c, const int dim1, const int dim2, const int dim3) {
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
int l = get_global_id(3);
int idx = i + dim1 * j + dim1 * dim2 * k + l * dim1 * dim2 * dim3;
c[idx] = a[idx] + b[idx];
}
Here is the indexing error: enqueue_knl_test1 pyopencl._cl.LogicError: clEnqueueNDRangeKernel failed: INVALID_WORK_DIMENSION
In the 4D case, you are simply using the API wrongly. OpenCL does not support an infinite number of global / local dimensions. Just up to 3.
In the 2D case, your indexing seems wrong. Assuming row-major arrays. It should be i + j * width not i + j * height.
In the 3D case, the indexing inside the kernel seems OK, assuming row-major memory layout and that dim1 equals cols (width) and dim2 equals rows (height). But anyway, your question lacks context:
Input buffers allocation and initialization.
Kernel invocation code (parameters, work group and global size).
Result collection. synchronization.
You could be accessing beyond the buffer allocated size. It should be checked.
Doing these steps incorrectly can easily lead to unexpected results. Even if your kernel code is OK.
If you wish to debug indexing issues, the easiest thing to do is to write a simple kernel that output the calculated index.
// Debug helper: each work-item writes its computed linear index into c, so
// the host can verify the 3D -> 1D mapping (output should be 0,1,2,... when
// the mapping is correct).
__kernel void test1(__global int* c, const int dim1, const int dim2) {
int i = get_global_id(0);
int j = get_global_id(1);
int k = get_global_id(2);
// linearisation with i as the fastest-varying dimension
int idx = i + dim1 * j + dim1 * dim2 * k;
c[idx] = idx;
}
You should then expect a result with linearly increasing values. I would start with a single workgroup and then move on to using multiple workgroups.
Also, if you perform a simple element-wise operation between arrays, then it is much simpler to use 1D indexing. You could simply use a 1D workgroup and a global size that equals the number of elements (rounded up to fit the workgroup dimension):
// Element-wise addition with flat 1D indexing: launch a 1D NDRange of at
// least `total` work-items and let each one handle exactly one element.
__kernel void test1(__global int* a, __global int* b, __global int* c, const int total) {
// no need for complex indexing for elementwise operations
int idx = get_global_id(0);
// guard: the global size may be rounded up past `total`
if (idx < total)
{
c[idx] = a[idx] + b[idx];
}
}
You would probably set local_work_size to the max size the hardware allows (for instance 512 for Nvidia, 256 for AMD) and global_work_size to the total of elements rounded up to multiples of local_work_size. See clEnqueueNDRangeKernel.
2D & 3D dims are usually used for operations that access adjacent elements in 2D / 3D space. Such as image convolutions.
After I compile and run the function, I get a segmentation fault: 11. I believe malloc should be performed correctly so I am not sure why I get a seg fault. Any insights would be greatly appreciated!
typedef struct matrix matrix;
struct matrix {
unsigned int n_rows;
unsigned int n_cols;
float **entries;
};
//refer to matrix.h
matrix *matrix_zero(unsigned int n_rows, unsigned int n_cols){
struct matrix* new = (struct matrix*)malloc(sizeof(struct matrix));
new->entries = malloc(n_rows * n_cols * sizeof(float));
new->n_rows=n_rows;
new->n_cols=n_cols;
for(int x = 0; x < n_rows; x++){
for(int y = 0; y < n_cols; y++){
new->entries[x][y] = 0;
}
}
return new;
}
/* main: run the evidence functions above */
int main(int argc, char *argv[])
{
struct matrix* test1 = matrix_zero(3,3);
// matrix_show(test1);
}
The problem appears to be in your allocation for matrix->entries. The struct defines a pointer to a pointer, but you allocate a float pointer float* vs float**. You need to allocate n_rows number of float* and each of those needs to point to an allocations of n_cols number of float value. For example:
int i;
// No error checking shown here
new->entries = malloc(sizeof(float*) * n_rows);
for (i = 0; i < n_rows; i++) {
new->entries[i] = malloc(sizeof(float) * n_cols);
}
You have allocated an array for rows by cols size, but there's no way for the compiler to know the actual row size for each row, so the notation entries[i] indeed expects a pointer to a simple array of floats, and not a bidimensional array. This is one of the main differences in structure from a n-dimensional array and arrays of pointers in C. The compiler knows how to dimension it only when you fully qualify the array dimensions (as when you declare it as float entries[N][M]; --- look that you cannot use variable expressions in the dimensions, only static compile-time constants)
You have two approaches here:
Use a single-dimension array and compute the index based on the row size:
typedef struct matrix matrix;
struct matrix {
unsigned int n_rows;
unsigned int n_cols;
float *entries; /* we make this to point to a single array of n_rows * n_cols entries */
};
new->entries = malloc(n_rows * n_cols * sizeof(float));
new->n_rows=n_rows;
new->n_cols=n_cols;
for(int x = 0; x < n_rows; x++){
for(int y = 0; y < n_cols; y++){
new->entries[x * n_cols + y] = 0.0; /* entry position should be as shown */
}
}
Use individual row arrays of n_cols entries (this has been shown in another answer by #joel already)
typedef struct matrix matrix;
struct matrix {
unsigned int n_rows;
unsigned int n_cols;
float **entries; /* this time the entries field points to an array of pointers to float. */
};
new->entries = malloc(n_rows * sizeof *new->entries); /* individual cells are 'float *', not 'float' this time. */
new->n_rows=n_rows;
new->n_cols=n_cols;
for(int x = 0; x < n_rows; x++){
new->entries[x] = malloc(n_cols* sizeof **new->entries); /* this time float *'s are here */
for(int y = 0; y < n_cols; y++){
new->entries[x][y] = 0; /* entry position should be as shown */
}
}
Both methods have remarks:
First method requires only one malloc(3) for the entries array so this makes it easier to allocate and deallocate, but some implementations could limit the actual size of a single malloc(3) in case you want to allocate huge matrices. This makes the deallocation for the whole matrix also easier.
Second method only requires a malloc of n_rows pointers and n_rows mallocs of n_cols floats. This will make possible to allocate huge matrices (you never allocate the whole matrix in one chunk) but you'll have to deallocate all rows first, then the array of pointers to the rows, before deallocating the matrix struct.
I recommend you to use malloc(n_cols * sizeof *new->entries) instead of malloc(n_cols * sizeof (float *)), so you don't need to change this expression in case you change the definition type of new->entries.
Finally, think that there's no magic in the C language in respect of calling functions. You probably erroneously assumed that making malloc( n_rows * n_cols * sizeof(float) ) converted automatically the pointer to a bidimensional array, but there's no magic there, malloc(3) is a normal C function like the ones you can write, and that's the reason it requires the actual number of bytes, and not the dimensions (in elements) of the array.
This question already has answers here:
Using sizeof with a dynamically allocated array
(5 answers)
Closed 7 years ago.
I'm working on a C program that involves generating adjacency matrices of random graphs. Here is a snippet of the source code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "test.h"
int main()
{
int **A = create_matrix(4, 3);
destory_matrix(A);
return 0;
}
int** create_matrix(int size, int seed)
{
// Allocate space for matrix
int **A = malloc(size * size * sizeof(int));
for (int r = 0; r < size; r++) {
A[r] = malloc(size * sizeof(int));
}
// Fill entries
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
A[i][j] = seed * (i + 1) * (j + 1);
}
}
return A;
}
void destory_matrix(int **A)
{
int size = sizeof(A[0]) / sizeof(int);
for (int r = 0; r < size; r++) {
free(A[r])
}
free(A);
}
This portion of the code is responsible for creating the matrix (the create_matrix() function) and free'ing memory (destroy_matrix()). I'm looking at destroy_matrix(), and noticed that when a 4x4 matrix is passed in, the variable size evaluated to 2, rather than 4. Could anyone explain why this happens?
I think you have a basic misunderstanding of the sizeof operator. It can't, in general, be used to get the size of dynamically allocated compound objects. The sizeof operator returns to you the size based on the type of the operand. In your case, the type of the operand is int *. I guess you are running on a 64 bit system. So the sizeof any pointer is 8. Hence your size variable will always be 2 no matter the size of your matrix.
The sizeof operator applied to a pointer returns the size of the pointer type, not the size of any allocated memory it happens to point to.
This is one of the key differences between an array type and a pointer type in C (note: arrays can decay to pointers). sizeof applied to an statically specified array type (e.g. int foo[n];) will get you the size of the array in bytes.
Since your word size is probably 8 bytes (64-bit), the size of a pointer will be 8 bytes, and if sizeof(int) is 4 bytes (32-bits), you have 8 / 4 = 2;
You need to consider some other way to store the dimensions of your matrices if you need runtime-sized heap-allocated matrices, e.g. a struct that stores the dimensions and a pointer to the allocated memory. It would be better to avoid possible heap-fragmentation altogether, though.
Try this if you have C99:
int n = 4, m = 5;
int (*A)[n] = malloc(m * sizeof A[0]);
free(A);
This allocates an m length array of int[n] as a single block, so you can do size_t n = sizeof(A)/sizeof(A[0]); to get one dimension(n) but you'll need to store m if you want to iterate correctly.
I'm having a bit of trouble understanding how to send a 2D array to Cuda. I have a program that parses a large file with a 30 data points on each line. I read about 10 rows at a time and then create a matrix for each line and items(so in my example of 10 rows with 30 data points, it would be int list[10][30]; My goal is to send this array to my kernal and have each block process a row(I have gotten this to work perfectly in normal C, but Cuda has been a bit more challenging).
Here's what I'm doing so far but no luck(note: sizeofbucket = rows, and sizeOfBucketsHoldings = items in row...I know I should win a award for odd variable names):
int list[sizeOfBuckets][sizeOfBucketsHoldings]; //this is created at the start of the file and I can confirmed its filled with the correct data
#define sizeOfBuckets 10 //size of buckets before sending to process list
#define sizeOfBucketsHoldings 30
//Cuda part
//define device variables
int *dev_current_list[sizeOfBuckets][sizeOfBucketsHoldings];
//time to malloc the 2D array on device
size_t pitch;
cudaMallocPitch((int**)&dev_current_list, (size_t *)&pitch, sizeOfBucketsHoldings * sizeof(int), sizeOfBuckets);
//copy data from host to device
cudaMemcpy2D( dev_current_list, pitch, list, sizeOfBuckets * sizeof(int), sizeOfBuckets * sizeof(int), sizeOfBucketsHoldings * sizeof(int),cudaMemcpyHostToDevice );
process_list<<<count,1>>> (sizeOfBuckets, sizeOfBucketsHoldings, dev_current_list, pitch);
//free memory of device
cudaFree( dev_current_list );
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, int pitch) {
int tid = blockIdx.x;
for (int r = 0; r < sizeOfBuckets; ++r) {
int* row = (int*)((char*)current_list + r * pitch);
for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
int element = row[c];
}
}
The error I'm getting is:
main.cu(266): error: argument of type "int *(*)[30]" is incompatible with parameter of type "int *"
1 error detected in the compilation of "/tmp/tmpxft_00003f32_00000000-4_main.cpp1.ii".
line 266 is the kernel call process_list<<<count,1>>> (count, countListItem, dev_current_list, pitch); I think the problem is I am trying to create my array in my function as int * but how else can I create it? In my pure C code, I use int current_list[num_of_rows][num_items_in_row] which works but I can't get the same outcome to work in Cuda.
My end goal is simple I just want to get each block to process each row(sizeOfBuckets) and then have it loop through all items in that row(sizeOfBucketHoldings). I orginally just did a normal cudamalloc and cudaMemcpy but it wasn't working so I looked around and found out about MallocPitch and 2dcopy(both of which were not in my cuda by example book) and I have been trying to study examples but they seem to be giving me the same error(I'm currently reading the CUDA_C programming guide found this idea on page22 but still no luck). Any ideas? or suggestions of where to look?
Edit:
To test this, I just want to add the value of each row together(I copied the logic from the cuda by example array addition example).
My kernel:
__global__ void process_list(int sizeOfBuckets, int sizeOfBucketsHoldings, int *current_list, size_t pitch, int *total) {
//TODO: we need to flip the list as well
int tid = blockIdx.x;
for (int c = 0; c < sizeOfBucketsHoldings; ++c) {
total[tid] = total + current_list[tid][c];
}
}
Here's how I declare the total array in my main:
int *dev_total;
cudaMalloc( (void**)&dev_total, sizeOfBuckets * sizeof(int) );
You have some mistakes in your code.
Then you copy host array to device you should pass one dimensional host pointer.See the function signature.
You don't need to allocate static 2D array for device memory. It creates static array in host memory then you recreate it as device array. Keep in mind it must be one dimensional array, too. See this function signature.
This example should help you with memory allocation:
// One block per bucket (row): sums that row's sizeOfBucketsHoldings ints into
// total[blockIdx.x]. current_list is a pitched allocation, so the base address
// of row tid must be computed in bytes: (char*)current_list + tid * pitch.
__global__ void process_list(int sizeOfBucketsHoldings, int* total, int* current_list, int pitch)
{
int tid = blockIdx.x;
total[tid] = 0;
for (int c = 0; c < sizeOfBucketsHoldings; ++c)
{
// step down tid rows (pitch is in bytes), then index column c as int
total[tid] += *((int*)((char*)current_list + tid * pitch) + c);
}
}
int main()
{
    // Dimensions: 10 rows ("buckets") of 30 ints each.
    size_t sizeOfBuckets = 10;
    size_t sizeOfBucketsHoldings = 30;
    size_t width = sizeOfBucketsHoldings * sizeof(int);// needs to be in bytes
    size_t height = sizeOfBuckets;
    // Host data is one flat 1D array; element (i,j) lives at i*cols + j.
    int* list = new int [sizeOfBuckets * sizeOfBucketsHoldings];// one dimensional
    for (int i = 0; i < sizeOfBuckets; i++)
        for (int j = 0; j < sizeOfBucketsHoldings; j++)
            list[i * sizeOfBucketsHoldings + j] = i;
    size_t pitch_h = sizeOfBucketsHoldings * sizeof(int);// always in bytes
    // Pitched device allocation: each row may be padded out to pitch_d bytes.
    int* dev_current_list;
    size_t pitch_d;
    cudaMallocPitch((int**)&dev_current_list, &pitch_d, width, height);
    // One output slot per bucket.
    int *test;
    cudaMalloc((void**)&test, sizeOfBuckets * sizeof(int));
    int* h_test = new int[sizeOfBuckets];
    cudaMemcpy2D(dev_current_list, pitch_d, list, pitch_h, width, height, cudaMemcpyHostToDevice);
    process_list<<<10, 1>>>(sizeOfBucketsHoldings, test, dev_current_list, pitch_d);
    cudaDeviceSynchronize();
    cudaMemcpy(h_test, test, sizeOfBuckets * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < sizeOfBuckets; i++)
        printf("%d %d\n", i , h_test[i]);
    // FIX: release everything that was allocated — the original leaked both
    // host arrays and both device buffers.
    delete[] h_test;
    delete[] list;
    cudaFree(test);
    cudaFree(dev_current_list);
    return 0;
}
To access your 2D array in the kernel you should use the pattern (int*)((char*)base_addr + y * pitch_d) + x.
WARNING: the pitch is always in bytes, so you need to cast your pointer to a byte pointer (char*) before adding the row offset.