CUDA array set to 0 after kernel call

I have a simple program with three arrays that counts how often the third array is 0 while the first and second arrays hold the same value. When that is true, it increments an element of another array.
The problems are:
If the kernel contains only the first if() block, array A is always 0.
If I add the else if() block, the values of array A are set to 0 after index 2, and the case where A, B and C are all 0 is not counted.
This is the code:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, float *b, float *c, float *res)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (a[idx]=b[idx] && c[idx]==0) {
res[0]++;
}
else if (a[idx]=b[idx] && c[idx]==1){
res[1]++;
}
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
float *b_h, *b_d; // Pointer to host & device arrays
float *c_h, *c_d; // Pointer to host & device arrays
float *res_h, *res_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
//size_t size_s = 4 * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
b_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &b_d, size); // Allocate array on device
c_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &c_d, size); // Allocate array on device
res_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &res_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
// for (int i=0; i<N; i++) a_h[i] = (float)i;
for (int i=0; i<N; i++) a_h[i] = (float)i;
for (int i=0; i<N; i++) b_h[i] = (float)i;
for (int i=0; i<N; i++) c_h[i] = (float)i;
for (int i=0; i<4; i++) res_h[i] = 0;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(c_d, c_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(res_d, res_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 8;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, b_d, c_d, res_d);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(b_h, b_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(c_h, c_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(res_h, res_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++){
printf("%f A \n", a_h[i]);
}
for (int i=0; i<N; i++){
printf("%f B \n", b_h[i]);
}
for (int i=0; i<N; i++){
printf("%f C \n", c_h[i]);
}
for (int i=0; i<4; i++){
printf("%f res \n", res_h[i]);
}
// Cleanup
free(a_h); cudaFree(a_d);
free(b_h); cudaFree(b_d);
free(c_h); cudaFree(c_d);
free(res_h); cudaFree(res_d);
}

Aside from the = in if (a[idx]=b[idx] && c[idx]==0) { that should be == as you already found (and the same goes for the following if statement), there are at least two other issues in your code:
You don't check that the thread index stays within the bounds of the arrays. Since you are using 2 blocks of 8 threads, you have 16 threads accessing 10-element arrays. To avoid the issue, you need to pass N as a parameter to your kernel and add an if ( idx < N ) somewhere.
You accumulate into res in parallel without any sort of protection, leading to all kinds of race conditions. This is a very typical histogram issue that is explained aplenty in the literature (web, books, CUDA samples...). A quick fix (albeit probably not the most efficient one) would be to use atomic operations, such as atomicAdd. In your case, the line res[0]++; would become atomicAdd( &res[0], 1 );, and res[1]++; would become (as you guessed) atomicAdd( &res[1], 1 );. Support for this on float requires compiling for compute capability at least 2.0.
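Putting both fixes together, a minimal sketch of the corrected kernel could look like this (passing N as an extra parameter is my assumption about how you would supply the array length):
__global__ void square_array(float *a, float *b, float *c, float *res, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N) {
        if (a[idx] == b[idx] && c[idx] == 0) {
            atomicAdd(&res[0], 1.0f);   // float atomicAdd needs compute capability >= 2.0
        } else if (a[idx] == b[idx] && c[idx] == 1) {
            atomicAdd(&res[1], 1.0f);
        }
    }
}
The launch then becomes square_array<<<n_blocks, block_size>>>(a_d, b_d, c_d, res_d, N);.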
HTH

Sorry, I solved the problem. It was a typo in the condition: = instead of ==.

Related

Realloc + memcpy 2D float array results in segmentation fault

I made a structure (SomeMisc) which has a float array, so I can fill it with some values, and then try to memcpy its float array to a different struct's float array, and print out the result to see if it worked.
The other structure (ArrayPairs) is supposed to hold two arrays of arrays, so that low[i] belongs to high[i] and vice versa when I want to make some changes on "a couple".
So I make two SomeMisc objects, fill their arrays with numbers, and then write a function that first expands the low and high arrays of the ArrayPairs object with realloc, then mallocs space for the new rows, and finally memcpys the contents of the two SomeMisc member arrays given as arguments to the function.
But it keeps resulting in segmentation faults and/or undefined behavior, and I can't figure out why.
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <stdlib.h>
typedef struct some{
int32_t len;
float* arr;
} SomeMisc;
typedef struct arrPrs{
int32_t amountOfRows;
int32_t amountOfColumns;
float** low;
float** high;
} ArrayPairs;
void initializeArrayPairArray(ArrayPairs* AP, int32_t length, int32_t width){
AP->amountOfRows = length;
AP->amountOfColumns = width;
AP->low = (float**)malloc(length * sizeof(float*));
AP->high = (float**)malloc(length * sizeof(float*));
for(int i=0; i<length; i++){
AP->low[i] = (float*)malloc(width * sizeof(float));
AP->high[i] = (float*)malloc(width * sizeof(float));
for(int j=0; j<width; j++){
AP->low[i][j] = 32;
AP->high[i][j] = 44;
}
}
}
void addArrayPair(ArrayPairs* AP, float* low, float* high){
AP->amountOfRows++;
AP->low = (float**)realloc(AP->low, AP->amountOfRows * sizeof(float*));
AP->high = (float**)realloc(AP->high, AP->amountOfRows * sizeof(float*));
AP->low[AP->amountOfRows] = (float*)malloc(AP->amountOfColumns * sizeof(float));
AP->high[AP->amountOfRows] = (float*)malloc(AP->amountOfColumns * sizeof(float));
memcpy(AP->low[AP->amountOfRows], low, AP->amountOfColumns * sizeof(float));
memcpy(AP->high[AP->amountOfRows], high, AP->amountOfColumns * sizeof(float));
printf("TESTING PRINT: %.2f\n", AP->high[10][5]);
}
int main () {
int32_t nrOfCols = 8;
int32_t nrOfRows = 10;
ArrayPairs arr;
initializeArrayPairArray(&arr, nrOfRows, nrOfCols);
int32_t mArrLength = 2;
SomeMisc* mArr = (SomeMisc*)malloc(mArrLength*sizeof(SomeMisc));
for(int i=0; i<mArrLength; i++){
mArr[i].arr = (float*)malloc(nrOfCols*sizeof(float));
for(int j=0; j<nrOfCols; j++){
mArr[i].arr[j] = (i+1)*j;
}
}
addArrayPair(&arr, mArr[0].arr, mArr[1].arr);
printf("LOW:\tHIGH:\n");
for(int i=9; i<arr.amountOfRows; i++){
printf("INDEX: %d\n",i);
for(int j=0; j<arr.amountOfColumns; j++){
printf("%.2f\t%.2f\n",arr.low[i][j],arr.high[i][j]);
}
printf("\n");
}
return(0);
}
I followed this answer: 2d array realloc Segmentation Fault Error
But I already have the ArrayPairs* AP in the parameter list of addArrayPair, and the & with the object arr when calling the function.
I also tried dereferencing as was suggested in that answer, but this didn't work either:
void addArrayPair(ArrayPairs* AP, float* low, float* high){
(*AP).amountOfRows++;
(*AP).low = (float**)realloc((*AP).low, AP->amountOfRows * sizeof(float*));
(*AP).high = (float**)realloc((*AP).high, AP->amountOfRows * sizeof(float*));
(*AP).low[AP->amountOfRows] = (float*)malloc((*AP).amountOfColumns * sizeof(float));
(*AP).high[AP->amountOfRows] = (float*)malloc((*AP).amountOfColumns * sizeof(float));
memcpy((*AP).low[(*AP).amountOfRows], low, (*AP).amountOfColumns * sizeof(float));
memcpy((*AP).high[(*AP).amountOfRows], high, (*AP).amountOfColumns * sizeof(float));
}
You increase AP->amountOfRows too early. That means when you do AP->low[AP->amountOfRows] you use an out-of-bounds index and get undefined behavior.
Instead, (re)allocate AP->amountOfRows + 1 elements, and increase AP->amountOfRows once all allocation and copying is done:
void addArrayPair(ArrayPairs* AP, float* low, float* high){
AP->low = realloc(AP->low, (AP->amountOfRows + 1) * sizeof(float*));
AP->high = realloc(AP->high, (AP->amountOfRows + 1) * sizeof(float*));
AP->low[AP->amountOfRows] = malloc(AP->amountOfColumns * sizeof(float));
AP->high[AP->amountOfRows] = malloc(AP->amountOfColumns * sizeof(float));
memcpy(AP->low[AP->amountOfRows], low, AP->amountOfColumns * sizeof(float));
memcpy(AP->high[AP->amountOfRows], high, AP->amountOfColumns * sizeof(float));
// Increase once all is done
AP->amountOfRows++;
}

MPI gathering 2D subarrays

I know this has been answered many times before, and there is a comprehensive answer here which I have read and attempted to use, but I just can't get my code to work for some reason.
I have stripped my code down a bit to make it a bit easier to follow, but basically what I am trying to do is have each process initialise a sub-array and work on it, then put the whole big array back together on rank 0. MPI_Gatherv is giving me a segfault and I cannot figure out why.
Any help would be greatly appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>
#define N 32
void init_lattice(double **site, int row, int col){
int i,j;
for(i=0; i<row; i++){
for(j=0; j<col; j++){
site[i][j]=(drand48()/4294967295.0 + 0.5)*2*M_PI;
}
}
}
int main(int argc, char *argv[]){
int nprocs, rank;
MPI_Init(&argc, &argv);
MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank (MPI_COMM_WORLD, &rank);
int dim = 2;
int grid[dim];
grid[0]=0;
grid[1]=0;
// Assign the grid dimensions
MPI_Dims_create(nprocs, dim, grid);
printf("Dim grid: length: %d, width: %d\n", grid[0], grid[1]);
// The new communicator
MPI_Comm comm_grid;
// Allow cyclic behavior
int periodic[dim];
periodic[0] = 1;
periodic[1] = 1;
// Create the communicator
MPI_Cart_create(MPI_COMM_WORLD, dim, grid, periodic, 0, &comm_grid);
int block_len, block_width;
block_len = N/grid[1];
block_width = N/grid[0];
int i, j;
//Create lattice subset
double *data = (double *) malloc (block_len * block_width * sizeof(double));
double **site = (double **) malloc (block_len * sizeof(double *));
for (i = 0; i < block_len; i++)
site[i] = & (data[i * block_width]);
//Initialise lattice
init_lattice(site, block_len, block_width);
MPI_Datatype newtype, subtype;
int sizes[dim];
sizes[0]=N;
sizes[1]=N;
int subsizes[dim];
subsizes[0] = block_len;
subsizes[1] = block_width;
int starts[dim];
starts[0] = 0;
starts[1] = 0;
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_DOUBLE, &newtype);
MPI_Type_create_resized(newtype, 0, N/grid[1]*sizeof(double), &subtype);
MPI_Type_commit(&subtype);
int sendcounts[grid[0]*grid[1]];
int displs[grid[0]*grid[1]];
if (rank == 0) {
for (i=0; i<grid[0]*grid[1]; i++) sendcounts[i] = 1;
int disp = 0;
for (i=0; i<grid[0]; i++) {
for (j=0; j<grid[1]; j++) {
displs[i*grid[0]+j] = disp;
disp += 1;
}
disp += ((N/grid[1])-1)*grid[0];
}
}
//Create global lattice
double *global_data = (double *) malloc (N * N * sizeof(double));
double **global_site = (double **) malloc (N * sizeof(double *));
for (i = 0; i < N; i++)
global_site[i] = & (global_data[i * N]);
MPI_Gatherv(&(site[0][0]), N*N/(grid[0]*grid[1]), MPI_DOUBLE, &(global_site[0][0]), sendcounts, displs, subtype, 0, MPI_COMM_WORLD);
if(rank==0){
printf("Rank: %d\n", rank);
for(i=0; i<N; i++){
for(j=0; j<N; j++){
printf("%.2lf ", global_site[i][j]);
}
printf("\n");
}
}
return 0;
}
EDIT:
Ok so I have changed my array allocations to contiguous memory and everything is working as it should now. Thanks talonmies!
The fundamental problem here is that MPI expects all allocations to be contiguous blocks of memory. Your site and global_site arrays are not; they are arrays of pointers. The MPI routines are just reading past the end of each individual row allocation and causing your segfault.
If you want to allocate an n x n array to use with the MPI then you need to replace this:
double **global_site;
if(rank==0){
global_site = malloc(sizeof(double *)*(N));
for(i=0; i<N; i++)
global_site[i] = malloc(sizeof(double)*(N));
}
with something like this:
double *global_site = malloc(sizeof(double)*(N * N));
You will obviously need to adjust the rest of your code accordingly.
It seems the only reason you are actually using arrays of pointers is for the convenience of [i][j] style 2D indexing. If you use linear or pitched linear memory, you can easily make a little preprocessor macro or helper function which can give you that style of indexing into row or column major ordered storage which is still compatible with MPI.
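A minimal sketch of such a helper for row-major storage (the macro name IDX2D is made up for illustration) might be:
/* Map a 2D (row, col) pair onto a row-major linear buffer with ncols columns. */
#define IDX2D(row, col, ncols) ((row) * (ncols) + (col))
/* usage: global_data[IDX2D(i, j, N)] instead of global_site[i][j] */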

Dynamic Matrix Multiplication with Pthreads

I'm a beginner with thread programming and C in general, and I'm trying to figure out how to do a simple matrix multiplication with Pthreads. I want to create a thread for every column and put the results in a result matrix. I'm trying to do it dynamically, which means the user is allowed to use an input as the size to create two n x n matrices.
My code right now, excluding filling the matrices and reading the size n, is the following:
#include <pthread.h>
#include <stdio.h>
#include<stdlib.h>
typedef struct Matrix {
int line, col, size;
double (*MA)[];
double (*MB)[];
double (*MC)[];
} Matrix;
void *multiply(void *arg) {
Matrix* work = (Matrix*) arg;
int s, z;
s = work->col;
z = work->line;
work->MC[0][0] = 0.0.//can't use MC, MB, MA here!!
return 0;
}
int main() {
Matrix* m;
//read size and set it to int size (miissing here, does work)
double MA[size][size], MB[size][size], MC[size][size];
int i, j;
//filling the matrices (missing here, does work)
pthread_t threads[size];
for (i = 0; i < size; i++) {
m = malloc(sizeof(Matrix*));
m->size = size;
m->col = i;
pthread_create(&threads[i], NULL, multiply, m);
}
for (i = 0; i < size; i++) {
pthread_join(threads[i], NULL);
}
return 0;
}
The problem is that I can't use MA, MB, or MC (the result) in the multiply method in the way shown in the code.
I just get the error "invalid use of array with unspecified bounds" even though I declared all three of them in the main method.
Am I misunderstanding something, or how can I fix this? I tried to adapt an example from my lecture where a thread is created for every element.
Thanks in advance!
Just about the error:
work->MC[0][0] = 0.0.//can't use MC, MB, MA here!!
MC was declared as double (*MC)[] and you try to use it as a two-dimensional array, as if you had declared it double MC[N][M]. You can use a two (or more) dimensional array like you did only if the first dimension is fixed, or if you allocate it row by row.
So your program could be:
#include <pthread.h>
#include <stdio.h>
#include<stdlib.h>
typedef struct Matrix {
int line, col, size;
double **MA;
double **MB;
double **MC;
} Matrix;
void *multiply(void *arg) {
Matrix* work = (Matrix*) arg;
int s, z;
s = work->col;
z = work->line;
work->MC[0][0] = 0.0;
return 0;
}
int main() {
Matrix* m;
//read size and set it to int size (miissing here, does work)
double **MA, **MB, **MC;
int i, j;
pthread_t threads[size];
MA = (double **) malloc(size * sizeof(double *));
MB = (double **) malloc(size * sizeof(double *));
MC = (double **) malloc(size * sizeof(double *));
for(int i=0;i<size;++i){
MA[i] = (double *) malloc(size * sizeof(double));
MB[i] = (double *) malloc(size * sizeof(double));
MC[i] = (double *) malloc(size * sizeof(double));
}
for (i = 0; i < size; i++) {
m = malloc(sizeof(Matrix));
m->MA = MA;
m->MB = MB;
m->MC = MC;
m->size = size;
m->col = i;
pthread_create(&threads[i], NULL, multiply, m);
}
for (i = 0; i < size; i++) {
pthread_join(threads[i], NULL);
}
return 0;
}
But you must TAKE CARE that the threads access the data concurrently, so you should use some locks if different threads can read and change the same values.
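For completeness, a minimal sketch of what the per-column body of multiply might look like, assuming each thread computes exactly one column of MC (so no lock is needed for those writes) and that MA, MB and MC are the double ** members set up above:
void *multiply(void *arg) {
    Matrix *work = (Matrix *) arg;
    int col = work->col;
    for (int row = 0; row < work->size; row++) {
        double sum = 0.0;
        for (int k = 0; k < work->size; k++)
            sum += work->MA[row][k] * work->MB[k][col];
        work->MC[row][col] = sum;   /* only this thread writes column col */
    }
    free(work);                     /* each thread received its own Matrix copy */
    return NULL;
}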

Implementing CUDA VecAdd from sample code

I'm trying to test out a sample code from the CUDA site http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels.
I simply want to add two arrays A and B of size 4, and store it in array C. Here is what I have so far:
#include <stdio.h>
#include "util.h"
void print_array(int* array, int size) {
int i;
for (i = 0; i < size; i++) {
printf("%d ", array[i]);
}
printf("\n");
}
__global__ void VecAdd(int* A, int* B, int* C) {
int i = threadIdx.x;
C[i] = A[i] + B[i];
}
int main(int argc , char **argv) {
int N = 4;
int i;
int *A = (int *) malloc(N * sizeof(int));
int *B = (int *) malloc(N * sizeof(int));
int *C = (int *) malloc(N * sizeof(int));
for (i = 0; i < N; i++) {
A[i] = i + 1;
B[i] = i + 1;
}
print_array(A, N);
print_array(B, N);
VecAdd<<<1, N>>>(A, B, C);
print_array(C, N);
return 0;
}
I'm expecting the C array (the last row of the output) to be 2, 4, 6, 8, but it doesn't seem to get added:
1 2 3 4
1 2 3 4
0 0 0 0
What am I missing?
First, you have to define the pointers that will hold the data to be copied to the GPU.
In your example, we want to copy the arrays 'a', 'b' and 'c' from the CPU to the GPU's global memory.
int a[array_size], b[array_size],c[array_size]; // your original arrays
int *a_cuda,*b_cuda,*c_cuda; // defining the "cuda" pointers
Define the size that each array will occupy:
int size = array_size * sizeof(int); // Is the same for the 3 arrays
Then you allocate the space for the data that will be used on the device.
CUDA memory allocation:
msg_erro[0] = cudaMalloc((void **)&a_cuda,size);
msg_erro[1] = cudaMalloc((void **)&b_cuda,size);
msg_erro[2] = cudaMalloc((void **)&c_cuda,size);
Now we need to copy this data from CPU to the GPU:
Copy from CPU to GPU:
msg_erro[3] = cudaMemcpy(a_cuda, a,size,cudaMemcpyHostToDevice);
msg_erro[4] = cudaMemcpy(b_cuda, b,size,cudaMemcpyHostToDevice);
msg_erro[5] = cudaMemcpy(c_cuda, c,size,cudaMemcpyHostToDevice);
Execute the kernel
int blocks = //;
int threads_per_block = //;
VecAdd<<<blocks, threads_per_block>>>(a_cuda, b_cuda, c_cuda);
Copy the results from GPU to CPU (in our example array C):
msg_erro[6] = cudaMemcpy(c,c_cuda,size,cudaMemcpyDeviceToHost);
Free Memory:
cudaFree(a_cuda);
cudaFree(b_cuda);
cudaFree(c_cuda);
For debugging purposes, I normally save the status of the functions on an array, like this:
cudaError_t msg_erro[var];
However, this is not strictly necessary, but it will save you time if an error occurs during the allocation or memory transfer. You can take out all the 'msg_erro[x] =' from the code above if you wish.
If you keep the 'msg_erro[x] =' and an error does occur, you can use a function like the one that follows to print these errors:
void printErros(cudaError_t *erros,int size)
{
for(int i = 0; i < size; i++)
printf("{%d} => %s\n",i ,cudaGetErrorString(erros[i]));
}
You need to transfer the memory back and forth from/to the GPU, something like
int *a_GPU, *b_GPU, *c_GPU;
cudaMalloc(&a_GPU, N*sizeof(int));
cudaMalloc(&b_GPU, N*sizeof(int));
cudaMalloc(&c_GPU, N*sizeof(int));
cudaMemcpy(a_GPU, A, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b_GPU, B, N*sizeof(int), cudaMemcpyHostToDevice);
VecAdd<<<1, N>>>(a_GPU, b_GPU, c_GPU);
cudaMemcpy(C, c_GPU, N*sizeof(int), cudaMemcpyDeviceToHost);
print_array(C, N);
cudaFree(a_GPU);
cudaFree(b_GPU);
cudaFree(c_GPU);

How can I add up two 2d (pitched) arrays using nested for loops?

I'm new to CUDA. I want to add two 2D arrays into a third array.
I use the following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
Now my problem is that I don't want to use these arrays as flattened 2D arrays.
All I want to do in my kernel code is use two for loops and put the result in the third array, like:
__global__ void add(int *dev_a, int *dev_b, int *dev_c)
{
    for (int i=0; i<2; i++)
    {
        for (int j=0; j<2; j++)
        {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
How can I do this in CUDA?
Please tell me how to use a 2D array in this way.
What should the kernel call be for using a 2D array?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch() function does exactly what its name implies: it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
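To actually use memory from cudaMallocPitch() inside a kernel, you index it with the pitch (which is in bytes) rather than with [i][j]. A minimal sketch, with an illustrative kernel name and assuming all three allocations returned the same pitch, would be:
__global__ void add_pitched(int *dev_a, int *dev_b, int *dev_c, size_t pitch)
{
    for (int i = 0; i < 2; i++) {
        // each row starts pitch bytes after the previous one
        int *row_a = (int *)((char *)dev_a + i * pitch);
        int *row_b = (int *)((char *)dev_b + i * pitch);
        int *row_c = (int *)((char *)dev_c + i * pitch);
        for (int j = 0; j < 2; j++) {
            row_c[j] = row_a[j] + row_b[j];
        }
    }
}
For this tiny example a single thread suffices, so it could be launched as add_pitched<<<1,1>>>(device_a, device_b, device_c, pitch);.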
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[])
{
    for (int i=0; i<2; i++) {
        for (int j=0; j<2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that; using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
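For comparison, a minimal sketch of the same 2x2 addition written against plain linear, row-major memory, which is what this answer recommends, might look like this (the kernel name and the ncols parameter are just illustrative):
__global__ void add_linear(int *dev_a, int *dev_b, int *dev_c, int ncols)
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            int k = i * ncols + j;      // row-major 2D -> 1D index
            dev_c[k] = dev_a[k] + dev_b[k];
        }
    }
}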
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
