I'm trying to use dynamic 3D array argument in kernel function in cuda but I can't do well.
__global__ void kernel ( 3D array pointer )
{
// do something
}
int main()
{
const int NUM_OF_ARRAY;
const int ROW;
const int CAL;
int arr[NUM_OF_ARRAY][ROW][CAL];
// Maybe I should use cudaMalloc3D or cudaMalloc3DArray
dim3 grid( , , ,);
dim3 block( , , , );
kernel <<< grid, block >>> ( ? );
}
I saw Robert's answer for sending 3d array to CUDA kernel but I think my case is little different.
If array's row and cal are determined at runtime, how can I allocate that memory in cuda and give that's pointer to kernel function?
I tried to use cudaMalloc3D or cudaMalloc3DArray but I could't well because I have never used before.
Can anyone shows simple example using dynamic 3D array arguments?
It will be helpful for me. Thanks.
For all of the reasons suggested in the previous linked answer and elsewhere, this isn't necessarily a good approach for handling 3D arrays. A better approach is to flatten the array and use pointer arithmetic to simulate 3D access.
But just to demonstrate that the previous example doesn't really need to be hard coded dimensions, here's that example modified to show variable (run-time) dimension usage:
#include <iostream>
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
__global__ void doSmth(int*** a, int sz_x, int sz_y, int sz_z) {
for(int i=0; i<sz_z; i++)
for(int j=0; j<sz_y; j++)
for(int k=0; k<sz_x; k++)
a[i][j][k]=i-j+k;
}
int main() {
unsigned sx;
unsigned sy;
unsigned sz;
std::cout << std::endl << "Enter x dimension (3rd subscript): " ;
std::cin >> sx;
std::cout << std::endl << "Enter y dimension (2nd subscript): " ;
std::cin >> sy;
std::cout << std::endl << "Enter z dimension (1st subscript): " ;
std::cin >> sz;
int*** h_c = (int***) malloc(sz*sizeof(int**));
for(int i=0; i<sz; i++) {
h_c[i] = (int**) malloc(sy*sizeof(int*));
for(int j=0; j<sy; j++)
GPUerrchk(cudaMalloc((void**)&h_c[i][j],sx*sizeof(int)));
}
int ***h_c1 = (int ***) malloc(sz*sizeof(int **));
for (int i=0; i<sz; i++){
GPUerrchk(cudaMalloc((void***)&(h_c1[i]), sy*sizeof(int*)));
GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], sy*sizeof(int*), cudaMemcpyHostToDevice));
}
int*** d_c;
GPUerrchk(cudaMalloc((void****)&d_c,sz*sizeof(int**)));
GPUerrchk(cudaMemcpy(d_c,h_c1,sz*sizeof(int**),cudaMemcpyHostToDevice));
doSmth<<<1,1>>>(d_c, sx, sy, sz);
GPUerrchk(cudaPeekAtLastError());
int res[sz][sy][sx];
for(int i=0; i<sz; i++)
for(int j=0; j<sy; j++)
GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],sx*sizeof(int),cudaMemcpyDeviceToHost));
std::cout << std::endl;
for(int i=0; i<sz; i++)
for(int j=0; j<sy; j++)
for(int k=0; k<sx; k++)
printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]);
}
I have modified the data stored by the kernel to i-j+k instead of i+j+k. Also, I have created a [z][y][x] order to the subscripts, because this will suggest the usage of thread index computed arrangements such as [threadIdx.z][threadIdx.y][threadIdx.x] which will be most conducive to coalesced access. However, this type of multiple-subscripted array in the kernel will still tend to be inefficient due to pointer-chasing to resolve the final location of the data.
Related
I want to pass a 2D array already filled with chars to a different method to do something with it.
Background: I am trying to implement GameOfLife. And I have already successfully implement the gameboard with a random amount of living cells. But now I want to pass the board(Array) to a different method to continue working with it. How to do so?
//wow das wird hurenshon
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
void spielStarten(int x, int amountOfLiving){
char feld[x][x];
for(int i = 0; i < x; i++){
for(int j = 0; j < x; j++){
feld[i][j] = 'o';
}
}
for(int i = 0; i < amountOfLiving; i++){
int a = (rand()%x);
int b = (rand()%x);
feld[a][b] = 'x';
}
printf("Gameboard: \n");
for(int i = 0; i < x; i++){
for(int j = 0; j < x; j++){
printf("%c ", feld[i][j]);
}
printf("\n");
}
spielRun(feld);
}
void spielRun(char feld[][]){
int neighbCount;
char feldNew[][] = feld[][];
for(int i = 0; i < x; i++) {
for(int j = 0; j < x; j++) {
checkForNeighbours(feld[x][y]);
// in progress
}
}
}
int main(int argc, char* argv[]){
srand(time(NULL));
int x = 16;
if(argc < 2 || argc > 3){
printf("2. Argument eine Zahl fuer Feldgroesse eingeben\n");
printf("1. Argument eine Zahl 0-10 fuer ungefähre prozentuale Belegung mit lebenden
Zellen eingeben \n");
return 0;
}
if(argv[2] != NULL){
x = atoi(argv[2]);
}
int i;
i = atoi(argv[1]);
i = (x^2)*(0,1*i);
spielStarten (x,i);
return 0;
}
In the last line of the Method "Spiel starten" i want to give the array to the next Method "spielRun".
Edit: thanks to an other user I found this struture:
void printarray( char (*array)[50], int SIZE )
But it doesn't work for me since I can´t hardcode the number, because the arraysize depends on a user input.
thanks!
The difficulty here is that the size of your array is not known statically (once upon a time, your code would even not compile for the same reason).
That, combined with the fact that 2D-arrays are not arrays of 1D arrays (contrarily to what happen when you malloc a int ** and then every int * in it), and so it doesn't make sense not to specify the size when passing it to a function.
When using arrays of arrays (technically, pointers to a bunch of pointers to ints), like this
void f(int **a){
printf("%d %d %d\n", a[0][0], a[1][0], a[0][1]);
}
int main(){
int **t=malloc(10*sizeof(int *));
for(int i=0; i<10; i++) t[i]=malloc(20*sizeof(int));
f(t);
}
That code is useless, it prints only unitialized values. But point is, f understands what values it is supposed to print. Pointers arithmetics tells it what a[1] is, and then what a[1][0] is.
But if this 2D-array is not pointers to pointers, but real arrays, like this
void f(int a[][20]){
printf("%d %d %d\n", a[0][0], a[1][0], a[0][1]);
}
int main(){
int t[10][20];
f(t);
}
Then, it is essential that the called function knows the size (or at least all sizes, but for the first dimension) of the array. Because it is not pointers to pointers. It is an area of 200 ints. The compiler needs to know the shape to deduce that t[5][3] is the 5×20+3=103th int at address t.
So, that is roughly what is (better) explained in the link that was given in comments: you need to specify the size.
Like I did here.
Now, in your case, it is more complicated, because you don't know (statically) the size.
So three methods. You could switch to pointers to pointers. You could cast your array into a char * and then do the index computation yourself (x*i+j). Or with modern enough C, you can just pass the size, and then use it, even in parameters, declaration
void f(int x, int a[][x]){
printf("%d %d %d\n", a[0][0], a[1][0], a[0][1]);
}
int main(){
int t[10][20];
f(t);
}
Anyway, from an applicative point of view (or just to avoid segfault) you need to know the size. So you would have had to pass it. So why not pass it as first parameter (Note that the function in which you have this size problem, spielRun, does refers to a x, which it doesn't know. So, passing the size x would have been your next problem anyway)
So, spielRun could look like this (not commenting in other errors it contains)
void spielRun(int x, char feld[][x]){
int neighbCount;
char feldNew[][] = feld[][]; // Other error
for(int i = 0; i < x; i++) {
for(int j = 0; j < x; j++) {
checkForNeighbours(feld[i][j]); // Corrected one here
// in progress
}
}
}
And then calls to this spielRun could be
spielRun(x, feld);
Note that I address only the passing of array of size x here. There are plenty of other errors, and, anyway, it is obviously not a finished code. For example, you can't neither declare a double array char newFeld[][] = oldFeld[][]; nor affect it that way. You need to explicitly copy that yourself, and to specify size (which you can do, if you pass it).
I am also pretty sure that i = (x^2)*(0,1*i); does not remotely what you expect it to do.
This is simple, I am allocating a dynamic 2d array using functions. I limited the scanf() len and my problem is when input a value over the limit, something weird happen.
Example
Input: 111,222,333,444
Expected output: 11,22,33,44
Real output: 11,12,33,34
#include <stdio.h>
#include <stdlib.h>
#define gd 2
void get_mem(int ***arr);
void get_data(int **arr);
int main(){
int **arr;
arr = NULL;
get_mem(&arr);
get_data(arr);
free(*arr);
return 0;
}
void get_mem(int ***arr){
int i;
*arr = (int**)malloc(gd*sizeof(int*));
for(i=0;i<5;i++){
(*arr)[i] = (int*)malloc(gd*sizeof(int));
}
printf("oki\n");
}
void get_data(int **arr){
int c,f;
for(c=0;c<gd;c++){
for(f=0;f<gd;f++){
scanf("%2d",&*(*arr+c)+f);
fpurge(stdin);
fflush(stdin);
}
}
for(c=0;c<gd;c++){
for(f=0;f<gd;f++){
printf("%d ",*(*arr+c)+f);
printf("\n");
}
}
}
The value of macro gd is 2. In get_mem(), allocating memory for 2 int *:
*arr = (int**)malloc(gd*sizeof(int*));
and below it, accessing arr beyond it size:
for(i=0;i<5;i++){ //allocating memory to 5 int pointers
^^
(*arr)[i] = (int*)malloc(gd*sizeof(int));
}
Accessing an unallocated memory is undefined behaviour.
Instead of using magic number 5 in the loop condition, you should check i with gd, like this
for(i=0;i<gd;i++){
In get_data(), the way you are accessing elements of arr for input is wrong
scanf("%2d",&*(*arr+c)+f);
^^^^^^^^^^^^
because
&arr[c][f] --> &(*(arr[c] + f) --> &(*(*(arr + c) + f)) --> &*(*(arr + c) + f) --> (*(arr + c) + f)
Note: The operator & is used to get the address and the operator * is used for dereferencing. These operators cancel the effect of each other when used one after another. Hence, &(*(arr + i)) is equivalent to arr + i.
That means, &arr[c][f] is equivalent to (*(arr + c) + f) and you should use &arr[c][f] which is less error prone and more readable:
for(f = 0; f < gd; f++) {
scanf("%2d", &arr[c][f]);
Same mistake you have made while printing the arr elements in second for loop:
for(f=0;f<gd;f++){
printf("%d ",*(*arr+c)+f);
^^^^^^^^^^^
It should be *(*(arr + c) + f). More readable form is arr[c][f]:
for(f = 0; f < gd; f++){
printf("%d ", arr[c][f]);
You should not use fflush() for input stream. It's undefined behavior. From C Standards#7.21.5.2p2 -
If stream points to an output stream or an update stream in which the most recent operation was not input, the fflush function causes any unwritten data for that stream to be delivered to the host environment to be written to the file; otherwise, the behavior is undefined.
Also, fpurge() is nonstandard and not portable. Moreover, you don't need to use either of them.
Rather than using more & more pointers, I'd like to do it in this way:-
#include <stdio.h>
#include <stdlib.h>
//#define gd 2
#define ROW 2
#define COLUMN 2
void get_mem(int ***arr);
void get_data(int **arr);
int main(){
int **arr = NULL;
get_mem(&arr);
printf("Enter 4 int values: ");
get_data(arr);
free(*arr);
return 0;
}
void get_mem(int ***arr)
{
int i;
*arr = ( int ** )malloc( ROW * sizeof(int *) );
for(i = 0; i < COLUMN; i++)
{
(*arr)[i] = ( int * )malloc( COLUMN * sizeof(int) );
}
printf("Okay!\n");
}
void get_data(int **arr)
{
int c, f;
for(c = 0; c < ROW; c++)
{
for(f = 0; f < COLUMN; f++)
{
scanf("%2d", &arr[c][f]); //*(*arr+c)+f)
}
}
for(c = 0; c < ROW; c++)
{
for(f = 0; f < COLUMN; f++)
{
printf("%d ", arr[c][f]); //*(*arr+c)+f)
}
putchar('\n');
}
}
I don't know what that gd is but it was making code ambiguous so I removed that and replaced it with ROW & COLUMN everywhere in the program(where it was necessary).
After allocating space to int **arr via get_mem() function, at least ask the user to input values and use proper spacing & indenting.
There's no need of fflush or fpurge, so I removed them.
Now, here if you're accessing array this way you need to be very careful of using parenthesis at proper places. You should use *(*(arr+c)+f) instead of *(*arr+c)+f)(It was an error.) this. But I chose to access the elements or store values as we do in 2D arrays. That's easier.
If you want to access this array using pointers only, instead of arr[c][f] you can do it in this way:-
scanf("%2d", &(*(*(arr+c)+f)));
&
printf("%d ", *(*(arr+c)+f));
Note: Also, you should check for any error while allocating memory.
Hope, it helps.
I was trying to run a simulation in which I need to fill three matrices of size "2 x iterations", this is (iterations=)10^8 columns and 2 rows. I also work with a vector t of size 10^8. Using dynamic memory allocation I wrote the following code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define T 10000
#define dt 0.0001
#define iterations (T/dt)
/*(more code)*/
int main(){
int i, j;
double *t;
double (*x)[2], (*y)[2], (*c)[2];
t=(double *) malloc((iterations-1)*sizeof(double));
x=(double (*)[2]) malloc((2*(iterations))*sizeof(double));
y=(double (*)[2]) malloc((2*(iterations))*sizeof(double));
c=(double (*)[2]) malloc((2*(iterations))*sizeof(double));
for(i=0; i=1; i++){
x[i][0]=50+500*i;
y[i][0]=300;
c[i][0]=15;
}
for(j=0; j<=iterations-2; j++){
t[j+1]=t[j]+dt;
/*(more code)*/
printf("%G %G %G %G %G %G\n",x[0][j+1],x[1][j+1],y[0][j+1],y[1][j+1],c[0][j+1],c[1][j+1]);
}
return 0;
}
Is the dynamic memory allocation correctly written? I mean, do I really have a vector t of size "iterations" and the three matrices of size "2 x iterations"?
And, if I want to fill each component of the matrices, for example I want a 50 in the position (1,4) of the matrix x, then do I have to write x[1][4]=50? (Like in the first "for".)
The problem is that executing the program I get an error: segmentation fault. Then, using a debugger I get the following:
Program received signal SIGSEGV, Segmentation fault.
x[0][0]=50
A generic way to allocate a matrix:
double **mat_init(int n_rows, int n_cols)
{
double **m;
int i;
m = (double**)malloc(n_rows * sizeof(double*));
for (i = 0; i < n_rows; ++i)
m[i] = (double*)calloc(n_cols, sizeof(double));
return m;
}
void mat_destroy(int n_rows, double **m)
{
int i;
for (i = 0; i < n_rows; ++i) free(m[i]);
free(m);
}
You can also do this:
double **mat_init2(int n_rows, int n_cols)
{
double **m;
int i;
m = (double**)malloc(n_rows * sizeof(double*));
m[0] = (double*)calloc(n_rows * n_cols, sizeof(double));
for (i = 1; i < n_rows; ++i)
m[i] = m[i-1] + n_cols;
return m;
}
void mat_destroy2(double **m)
{
free(m[0]); free(m);
}
For both methods above, you can use matrix[row][col] to access a cell. Sometimes, you may prefer to allocate a single array and use matrix[row*n_cols+col] to access a cell.
BTW, I am sure someone will say "don't use cast", but there are benefits of using cast – that is off-topic.
I mean, do I really have a vector t of size "iterations"
t=(double *) malloc((iterations-1)*sizeof(double));
^^^
Since you subtract one, the answer is no.
.... and the three matrices of size "2 x iterations"?
Well - yes you have three matrices of size "2 x iterations". However, what you have is equivalent to:
double m[iterations][2];
so you have "iterations" rows and 2 columns.
Remember to always check for "out of memory", i.e.
p = malloc(....);
if (p == NULL)
{
printf("out of mem\n");
return -1;
}
So you access it like:
m[0][0]
m[0][1]
m[1][0]
m[1][1]
m[2][0]
m[2][1]
m[3][0]
......
As per my previous question (many thanks to Jonathan Leffler), I edited my code (second two blocks of code), but I ran into a rather strange problem.
The following one breaks unpredictably...
void free_array(array_info *A)
{
int i;
for(i = 0; i < (A->height); ++i)
{
printf("About to free: %x\n", A->dat_ptr[i]);//for debugging purposes
free(A->dat_ptr[i]);
printf("Freed row %i\n", i);//for debugging purposes
}
free(A->dat_ptr);
}
I initially tested create_array directly followd by free_array and it worked flawlessly with rather big arrays (10^8). However, when I do my calculations in between and then try to free() the arrays, I get an access violation exception (c00000005). When I was debugging it, I noticed that the program would execute perfectly every time if I had a breakpoint within the "free_array" loop and did every line individually. However, the compiled code wouldn't ever run past row6 of my second array on its own. I turned off all optimisations in the compiler, and I still got the error upon execution.
Additional info
typedef struct {
int height;
int width;
int bottom;//position of the bottom tube/slice boundary
unsigned int** dat_ptr;//a pointer to a 2d array
} array_info;
Where the dat_ptr is now a proper 2D pointer. The create_array function that creates the array that is to be put in the structure is (i have stripped NULL checks for readability):
int create_array(array_info *A)
{
int i;
unsigned int **array = malloc(sizeof(*array) * A->height);
for (i = 0; i < A->height; ++i)
{
array[i] = malloc(sizeof(**array) * A->width);
}
A->dat_ptr = array;
return 0;
}
This function works exactly as expected.
More Additional Info
Added after the responses of Jonathan, Chris, and rharrison33
Thank you so much, Jonathan, with every one of your posts I find out so much about programming :) I finally found the culprit. The code causing the exception was the following:
void fill_number(array_info* array, int value, int x1, int y1, int x2, int y2)//fills a rectangular part of the array with `value`
{
int i, j;
for(i=y1 ; ((i<=y2)&&(i<array->height)) ; i++)//start seeding the values by row (as in vertically)
{
for(j=x1 ; ((i<=x2)&&(i<array->width)) ; j++)//seed the values by columns (as in horizontally)
{
array->dat_ptr[i][j]=value;
}
}
}
And ((i<=x2)&&(i<=array->width)) wasn't being evaluated as I expected (Chris Dodd, you were right). I thought that it would evaluate both conditions in that order or stop if either was "FALSE", independent of their order. However, it turned out it didn't work that way and it was simply refusing to evaluate the (i<array->width) part correctly. Also, I assumed that it would trigger an exception upon trying to access memory outside of the array range, but it didn't. Anyway,
I changed the code to:
void fill_number(array_info* array, int value, int x1, int y1,
int x2, int y2)
{
int i, j;
if(y1>=array->height){ y1=array->height-1;}
if(y2>=array->height){ y1=array->height-1;}
if(x1>=array->width) { x2=array->width-1;}
if(x2>=array->width) { x2=array->width-1;}
for(i=y1 ; i<=y2 ; i++)//start seeding the values by row
{
for(j=x1 ; j<=x2 ; j++)//seed the values by column
{
array->dat_ptr[i][j]=value;
}
}
}
And now it works. The block of if()s is there because I won't be calling the function very often compared to the rest of the code and I need a visual way to remind me that the check is there.
Again, thank you so much Jonathan Leffler, Chris Dodd, and rharrison33 :)
This code, closely based on what you've gotten from me and what you wrote above, seems to be working as expected. Note the use of <inttypes.h> and PRIXPTR (and the cast to (uintptr_t)). It avoids making assumptions about the size of pointers and works equally well on 32-bit and 64-bit systems (though the %.8 means you get full 8-digit hex values on 32-bit compilations, and 12 (out of a maximum of 16) on this specific 64-bit platform).
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
typedef struct
{
int height;
int width;
int bottom;
unsigned int **dat_ptr; // Double pointer, not triple pointer
} array_info;
static void create_array(array_info *A)
{
unsigned int **array = malloc(sizeof(*array) * A->height);
printf("array (%zu) = 0x%.8" PRIXPTR "\n",
sizeof(*array) * A->height, (uintptr_t)array);
for (int i = 0; i < A->height; ++i)
{
array[i] = malloc(sizeof(**array) * A->width);
printf("array[%d] (%zu) = 0x%.8" PRIXPTR "\n",
i, sizeof(**array) * A->width, (uintptr_t)array[i]);
}
A->dat_ptr = array;
}
static void free_array(array_info *A)
{
int i;
for(i = 0; i < (A->height); ++i)
{
printf("About to free %d: 0x%.8" PRIXPTR "\n",
i, (uintptr_t)A->dat_ptr[i]);
free(A->dat_ptr[i]);
}
printf("About to free: 0x%.8" PRIXPTR "\n", (uintptr_t)A->dat_ptr);
free(A->dat_ptr);
}
int main(void)
{
array_info array = { .height = 5, .width = 10, .dat_ptr = 0 };
create_array(&array);
if (array.dat_ptr == 0)
{
fprintf(stderr, "Out of memory\n");
exit(1);
}
free_array(&array);
puts("OK");
return(0);
}
Sample output
array (40) = 0x7FAFB3C03980
array[0] (40) = 0x7FAFB3C039B0
array[1] (40) = 0x7FAFB3C039E0
array[2] (40) = 0x7FAFB3C03A10
array[3] (40) = 0x7FAFB3C03A40
array[4] (40) = 0x7FAFB3C03A70
About to free 0: 0x7FAFB3C039B0
About to free 1: 0x7FAFB3C039E0
About to free 2: 0x7FAFB3C03A10
About to free 3: 0x7FAFB3C03A40
About to free 4: 0x7FAFB3C03A70
About to free: 0x7FAFB3C03980
OK
I've not got valgrind on this machine, but the addresses being allocated and freed can be eyeballed to show that there's no obvious problem there. It's coincidence that I sized the arrays such that they're all 40 bytes (on a 64-bit machine).
Follow-up Questions
What else are you doing with your data?
How big are the arrays that you're allocating?
Are you sure you're not running into arithmetic overflows?
Testing on Mac OS X 10.8.2 and the XCode version of GCC/Clang:
i686-apple-darwin11-llvm-gcc-4.2 (GCC) 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)
Array setting and printing functions
static void init_array(array_info *A)
{
unsigned int ctr = 0;
printf("D = 0x%.8" PRIXPTR "\n", (uintptr_t)A->dat_ptr);
for (int i = 0; i < A->height; i++)
{
printf("D[%d] = 0x%.8" PRIXPTR "\n",i, (uintptr_t)A->dat_ptr[i]);
for (int j = 0; j < A->width; j++)
{
printf("D[%d][%d] = 0x%.8" PRIXPTR " (%u)\n",
i, j, (uintptr_t)&A->dat_ptr[i][j], ctr);
A->dat_ptr[i][j] = ctr;
ctr += 7;
}
}
}
static void print_array(array_info *A)
{
printf("D = 0x%.8" PRIXPTR "\n", (uintptr_t)A->dat_ptr);
for (int i = 0; i < A->height; i++)
{
printf("D[%d] = 0x%.8" PRIXPTR "\n",i, (uintptr_t)A->dat_ptr[i]);
for (int j = 0; j < A->width; j++)
{
printf("D[%d][%d] = 0x%.8" PRIXPTR " (%u)\n",
i, j, (uintptr_t)&A->dat_ptr[i][j], A->dat_ptr[i][j]);
}
}
}
With a call init_array(&array); in main() after the successful create_array() and a call to print_array(&array); after that, I got the expected output. It's too boring to show here.
I believe you are malloc'ing incorrectly. Try modifying your create_array function to this:
int create_array(array_info *A)
{
int i;
unsigned int **array = malloc(sizeof(unsigned int*) * A->height);
for (i = 0; i < A->height; ++i)
{
array[i] = malloc(sizeof(unsigned int) * A->width);
}
A->dat_ptr = array;
return 0;
}
I have a dynamically declared 2D array in my C program, the contents of which I want to transfer to a CUDA kernel for further processing. Once processed, I want to populate the dynamically declared 2D array in my C code with the CUDA processed data. I am able to do this with static 2D C arrays but not with dynamically declared C arrays. Any inputs would be welcome!
I mean the dynamic array of dynamic arrays. The test code that I have written is as below.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <conio.h>
#include <math.h>
#include <stdlib.h>
const int nItt = 10;
const int nP = 5;
__device__ int d_nItt = 10;
__device__ int d_nP = 5;
__global__ void arr_chk(float *d_x_k, float *d_w_k, int row_num)
{
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
int index1 = (row_num * d_nP) + index;
if ( (index1 >= row_num * d_nP) && (index1 < ((row_num +1)*d_nP))) //Modifying only one row data pertaining to one particular iteration
{
d_x_k[index1] = row_num * d_nP;
d_w_k[index1] = index;
}
}
float **mat_create2(int r, int c)
{
float **dynamicArray;
dynamicArray = (float **) malloc (sizeof (float)*r);
for(int i=0; i<r; i++)
{
dynamicArray[i] = (float *) malloc (sizeof (float)*c);
for(int j= 0; j<c;j++)
{
dynamicArray[i][j] = 0;
}
}
return dynamicArray;
}
/* Freeing memory - here only number of rows are passed*/
void cleanup2d(float **mat_arr, int x)
{
int i;
for(i=0; i<x; i++)
{
free(mat_arr[i]);
}
free(mat_arr);
}
int main()
{
//float w_k[nItt][nP]; //Static array declaration - works!
//float x_k[nItt][nP];
// if I uncomment this dynamic declaration and comment the static one, it does not work.....
float **w_k = mat_create2(nItt,nP);
float **x_k = mat_create2(nItt,nP);
float *d_w_k, *d_x_k; // Device variables for w_k and x_k
int nblocks, blocksize, nthreads;
for(int i=0;i<nItt;i++)
{
for(int j=0;j<nP;j++)
{
x_k[i][j] = (nP*i);
w_k[i][j] = j;
}
}
for(int i=0;i<nItt;i++)
{
for(int j=0;j<nP;j++)
{
printf("x_k[%d][%d] = %f\t",i,j,x_k[i][j]);
printf("w_k[%d][%d] = %f\n",i,j,w_k[i][j]);
}
}
int size1 = nItt * nP * sizeof(float);
printf("\nThe array size in memory bytes is: %d\n",size1);
cudaMalloc( (void**)&d_x_k, size1 );
cudaMalloc( (void**)&d_w_k, size1 );
if((nP*nItt)<32)
{
blocksize = nP*nItt;
nblocks = 1;
}
else
{
blocksize = 32; // Defines the number of threads running per block. Taken equal to warp size
nthreads = blocksize;
nblocks = ceil(float(nP*nItt) / nthreads); // Calculated total number of blocks thus required
}
for(int i = 0; i< nItt; i++)
{
cudaMemcpy( d_x_k, x_k, size1,cudaMemcpyHostToDevice ); //copy of x_k to device
cudaMemcpy( d_w_k, w_k, size1,cudaMemcpyHostToDevice ); //copy of w_k to device
arr_chk<<<nblocks, blocksize>>>(d_x_k,d_w_k,i);
cudaMemcpy( x_k, d_x_k, size1, cudaMemcpyDeviceToHost );
cudaMemcpy( w_k, d_w_k, size1, cudaMemcpyDeviceToHost );
}
printf("\nVerification after return from gpu\n");
for(int i = 0; i<nItt; i++)
{
for(int j=0;j<nP;j++)
{
printf("x_k[%d][%d] = %f\t",i,j,x_k[i][j]);
printf("w_k[%d][%d] = %f\n",i,j,w_k[i][j]);
}
}
cudaFree( d_x_k );
cudaFree( d_w_k );
cleanup2d(x_k,nItt);
cleanup2d(w_k,nItt);
getch();
return 0;
I mean the dynamic array of dynamic arrays.
Well, that's exactly where the problem lies. A dynamic array of dynamic arrays consists of a whole bunch of disjoint memory blocks, one for each line in the array (as is clearly seen from the malloc inside you for loop in mat_create2). So you can't copy such a data structure to device memory with just one call to cudaMemcpy*. Instead, you have to do either
Also use dynamic arrays of dynamic arrays on CUDA. To do this, you have to basically recreate your mat_create2 function, using cudaMalloc instead of malloc, then copy each row seperately.
Use a "tight" 2d array on CUDA, like you do now (which is a good thing, at least performance-wise!). But if you keep using dyn-dyn-arrays on host memory, you still have copy each row seperately, like
for(int i=0; i<r; ++i){
cudaMemcpy(d_x_k + i*c, x_k[i], c*sizeof(float), cudaMemcpyHostToDevice)
}
You may wonder "why did it work with a static 2d array, then"? Well, static 2d arrays in C are proper, tight arrays that can be copied in one go. It's a bit confusing that these are indexed with exactly the same syntax as dyn-dyn arrays (arr[x][y]), because it actually works completely different.
But you should consider using tight arrays on host memory, too, perhaps with an object-oriented wrapper like
typedef struct {
float* data;
int n_rows, n_cols;
} tight2dFloatArray;
#define INDEX_TIGHT2DARRAY(arr, y, x)\
(arr).data[(y)*(arr).n_cols + (x)]
such an approach of course can be implemented much safer as a C++ class.
*You also can't copy it inside main memory with just one memcpy: that only copies the array of pointers, not the actual data.