Implementing CUDA VecAdd from sample code - arrays

I'm trying to test out a sample code from the CUDA site http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels.
I simply want to add two arrays A and B of size 4, and store it in array C. Here is what I have so far:
#include <stdio.h>
#include "util.h"
void print_array(int* array, int size) {
int i;
for (i = 0; i < size; i++) {
printf("%d ", array[i]);
}
printf("\n");
}
__global__ void VecAdd(int* A, int* B, int* C) {
int i = threadIdx.x;
C[i] = A[i] + B[i];
}
int main(int argc , char **argv) {
int N = 4;
int i;
int *A = (int *) malloc(N * sizeof(int));
int *B = (int *) malloc(N * sizeof(int));
int *C = (int *) malloc(N * sizeof(int));
for (i = 0; i < N; i++) {
A[i] = i + 1;
B[i] = i + 1;
}
print_array(A, N);
print_array(B, N);
VecAdd<<<1, N>>>(A, B, C);
print_array(C, N);
return 0;
}
I'm expecting the C array (the last row of the output) to be 2, 4, 6, 8, but it doesn't seem to get added:
1 2 3 4
1 2 3 4
0 0 0 0
What am I missing?

First, you have to define the pointers that will hold the data that will be copied to GPU:
In your example, we want to copy the arrays 'a','b' and 'c' from CPU to the GPU's global memory.
int a[array_size], b[array_size],c[array_size]; // your original arrays
int *a_cuda,*b_cuda,*c_cuda; // defining the "cuda" pointers
define the size that each array will occupy.
int size = array_size * sizeof(int); // Is the same for the 3 arrays
Then you will allocate the space to the data that will be used in cuda:
Cuda memory allocation:
msg_erro[0] = cudaMalloc((void **)&a_cuda,size);
msg_erro[1] = cudaMalloc((void **)&b_cuda,size);
msg_erro[2] = cudaMalloc((void **)&c_cuda,size);
Now we need to copy this data from CPU to the GPU:
Copy from CPU to GPU:
msg_erro[3] = cudaMemcpy(a_cuda, a,size,cudaMemcpyHostToDevice);
msg_erro[4] = cudaMemcpy(b_cuda, b,size,cudaMemcpyHostToDevice);
msg_erro[5] = cudaMemcpy(c_cuda, c,size,cudaMemcpyHostToDevice);
Execute the kernel
int blocks = //;
int threads_per_block = //;
VecAdd<<<blocks, threads_per_block>>>(a_cuda, b_cuda, c_cuda);
Copy the results from GPU to CPU (in our example array C):
msg_erro[6] = cudaMemcpy(c,c_cuda,size,cudaMemcpyDeviceToHost);
Free Memory:
cudaFree(a_cuda);
cudaFree(b_cuda);
cudaFree(c_cuda);
For debugging purposes, I normally save the status of the functions on an array, like this:
cudaError_t msg_erro[var];
However, this is not strictly necessary but it will save you time if an error occurs during the allocation or memory transference. You can take out all the 'msg_erro[x] =' from the code above if you wish.
If you mantain the 'msg_erro[x] =', and if a error does occur you can use a function like the one that follows, to print these erros:
void printErros(cudaError_t *erros,int size)
{
for(int i = 0; i < size; i++)
printf("{%d} => %s\n",i ,cudaGetErrorString(erros[i]));
}

You need to transfer the memory back and forth from/to the GPU, something like
int *a_GPU, *b_GPU, *c_GPU;
cudaMalloc(&a_GPU, N*sizeof(int));
cudaMalloc(&b_GPU, N*sizeof(int));
cudaMalloc(&c_GPU, N*sizeof(int));
cudaMemcpy(a_GPU, A, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b_GPU, B, N*sizeof(int), cudaMemcpyHostToDevice);
VecAdd<<<1, N>>>(a_GPU, b_GPU, c_GPU);
cudaMemcpy(C, c_GPU, N*sizeof(int), cudaMemcpyDeviceToHost);
print_array(C, N);
cudaFree(a_GPU);
cudaFree(b_GPU);
cudaFree(c_GPU);

Related

different ways to declare a matrix c

I don't really understand why method 1 works but not method 2. I don't really see why it works for characters and not an int.
#include <stdlib.h>
#include <stdio.h>
int main(void)
{
/// WORK (METHODE 1)
char **string_array = malloc(sizeof(char **) * 10);
string_array[0] = "Hi there";
printf("%s\n", string_array[0]); /// -> Hi there
/// DOES NOT WORK (METHODE 2)
int **int_matrix = malloc(sizeof(int **) * 10);
int_matrix[0][0] = 1; // -> Segmentation fault
/// WORK (METHODE 3)
int **int_matrix2 = malloc(sizeof(int *));
for (int i = 0; i < 10; i++)
{
int_matrix2[i] = malloc(sizeof(int));
}
int_matrix2[0][0] = 42;
printf("%d\n", int_matrix2[0][0]); // -> 42
}
In terms of the types, you want to allocate memory for the type "one level up" from the pointer you're assigning it to. For example, an int pointer (an int*), points to one or more ints. That means, when you allocate space for it, you should allocate based on the int type:
#define NUM_INTS 10
...
int* intPtr = malloc(NUM_INTS * sizeof(int));
// ^^ // we want ints, so allocate for sizeof(int)
In one of your cases, you have a double int pointer (an int**). This must point to one or more int pointers (int*), so that's the type you need to allocate space for:
#define NUM_INT_PTRS 5
...
int** myDblIntPtr = malloc(NUM_INT_PTRS * sizeof(int*));
// ^^ "one level up" from int** is int*
However, there's an even better way to do this. You can specify the size of your object it points to rather than a type:
int* intPtr = malloc(NUM_INTS * sizeof(*intPtr));
Here, intPtr is an int* type, and the object it points to is an int, and that's exactly what *intPtr gives us. This has the added benefit of less maintenance. Pretend some time down the line, int* intPtr changes to int** intPtr. For the first way of doing things, you'd have to change code in two places:
int** intPtr = malloc(NUM_INTS * sizeof(int*));
// ^^ here ^^ and here
However, with the 2nd way, you only need to change the declaration:
int** intPtr = malloc(NUM_INTS * sizeof(*intPtr));
// ^^ still changed here ^^ nothing to change here
With the change of declaration from int* to int**, *intPtr also changed "automatically", from int to int*. This means that the paradigm:
T* myPtr = malloc(NUM_ITEMS * sizeof(*myPtr));
is preferred, since *myPtr will always refer to the correct object we need to size for the correct amount of memory, no matter what type T is.
Others have already answered most of the question, but I thought I would add some illustrations...
When you want an array-like object, i.e., a sequence of consecutive elements of a given type T, you use a pointer to T, T *, but you want to point to objects of type T, and that is what you must allocate memory for.
If you want to allocate 10 T objects, you should use malloc(10 * sizeof(T)). If you have a pointer to assign the array to, you can get the size from that
T * ptr = malloc(10 * sizeof *ptr);
Here *ptr has type T and so sizeof *ptr is the same as sizeof(T), but this syntax is safer for reasons explained in other answers.
When you use
T * ptr = malloc(10 * sizeof(T *));
you do not get memory for 10 T objects, but for 10 T * objects. If sizeof(T*) >= sizeof(T) you are fine, except that you are wasting some memory, but if sizeof(T*) < sizeof(T) you have less memory than you need.
Whether you run into this problem or not depends on your objects and the system you are on. On my system, all pointers have the same size, 8 bytes, so it doesn't really matter if I allocate
char **string_array = malloc(sizeof(char **) * 10);
or
char **string_array = malloc(sizeof(char *) * 10);
or if I allocate
int **int_matrix = malloc(sizeof(int **) * 10);
or
int **int_matrix = malloc(sizeof(int *) * 10);
but it could be on other architectures.
For your third solution, you have a different problem. When you allocate
int **int_matrix2 = malloc(sizeof(int *));
you allocate space for a single int pointer, but you immediately treat that memory as if you had 10
for (int i = 0; i < 10; i++)
{
int_matrix2[i] = malloc(sizeof(int));
}
You can safely assign to the first element, int_matrix2[0] (but there is a problem with how you do it that I get to); the following 9 addresses you write to are not yours to modify.
The next issue is that once you have allocated the first dimension of your matrix, you have an array of pointers. Those pointers are not initialised, and presumably pointing at random places in memory.
That isn't a problem yet; it doesn't do any harm that these pointers are pointing into the void. You can just point them to somewhere else. This is what you do with your char ** array. You point the first pointer in the array to a string, and it is happy to point there instead.
Once you have pointed the arrays somewhere safe, you can access the memory there. But you cannot safely dereference the pointers when they are not initialised. That is what you try to do with your integer array. At int_matrix[0] you have an uninitialised pointer. The type-system doesn't warn you about that, it can't, so you can easily compile code that modifies int_matrix[0][0], but if int_matrix[0] is pointing into the void, int_matrix[0][0] is not an address you can safely read or write. What happens if you try is undefined, but undefined is generally was way of saying that something bad will happen.
You can get what you want in several ways. The closest to what it looks like you are trying is to implement matrices as arrays of pointers to arrays of values.
There, you just have to remember to allocate the arrays for each row in your matrix as well.
#include <stdio.h>
#include <stdlib.h>
int **new_matrix(int n, int m)
{
int **matrix = malloc(n * sizeof *matrix);
for (int i = 0; i < n; i++)
{
matrix[i] = malloc(m * sizeof *matrix[i]);
}
return matrix;
}
void init_matrix(int n, int m, int **matrix)
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
matrix[i][j] = 10 * i + j + 1;
}
}
}
void print_matrix(int n, int m, int **matrix)
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf("%d ", matrix[i][j]);
}
printf("\n");
}
}
int main(void)
{
int n = 3, m = 5;
int **matrix = new_matrix(n, m);
init_matrix(n, m, matrix);
print_matrix(n, m, matrix);
return 0;
}
Here, each row can lie somewhere random in memory, but you can also put the row in contiguous memory, so you allocate all the memory in a single malloc and compute indices to get at the two-dimensional matrix structure.
Row i will start at offset i*m into this flat array, and index matrix[i,j] is at index matrix[i * m + j].
#include <stdio.h>
#include <stdlib.h>
int *new_matrix(int n, int m)
{
int *matrix = malloc(n * m * sizeof *matrix);
return matrix;
}
void init_matrix(int n, int m, int *matrix)
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
matrix[m * i + j] = 10 * i + j + 1;
}
}
}
void print_matrix(int n, int m, int *matrix)
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf("%d ", matrix[m * i + j]);
}
printf("\n");
}
}
int main(void)
{
int n = 3, m = 5;
int *matrix = new_matrix(n, m);
init_matrix(n, m, matrix);
print_matrix(n, m, matrix);
return 0;
}
With the exact same memory layout, you can also use multidimensional arrays. If you declare a matrix as int matrix[n][m] you will get what amounts to an array of length n where the objects in the arrays are integer arrays of length m, exactly as on the figure above.
If you just write that expression, you are putting the matrix on the stack (it has auto scope), but you can allocate such matrices as well if you use a pointer to int [m] arrays.
#include <stdio.h>
#include <stdlib.h>
void *new_matrix(int n, int m)
{
int(*matrix)[n][m] = malloc(sizeof *matrix);
return matrix;
}
void init_matrix(int n, int m, int matrix[static n][m])
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
matrix[i][j] = 10 * i + j + 1;
}
}
}
void print_matrix(int n, int m, int matrix[static n][m])
{
for (int i = 0; i < n; i++)
{
for (int j = 0; j < m; j++)
{
printf("%d ", matrix[i][j]);
}
printf("\n");
}
}
int main(void)
{
int n = 3, m = 5;
int(*matrix)[m] = new_matrix(n, m);
init_matrix(n, m, matrix);
print_matrix(n, m, matrix);
int(*matrix2)[m] = new_matrix(2 * n, 3 * m);
init_matrix(2 * n, 3 * m, matrix2);
print_matrix(2 * n, 3 * m, matrix2);
return 0;
}
The new_matrix() function returns a void * because the return type cannot depend on the runtime arguments n and m, so I cannot return the right type.
Don't let the function types fool you, here. The functions that take a matrix[n][m] argument do not check if the matrix has the right dimensions. You can get a little type checking with pointers to arrays, but pointer decay will generally limit the checking. The last solution is really only different syntax for the previous one, and the arguments n and m determines how the (flat) memory that matrix points to is interpreted.
The method 1 works only becuse you assign the char * element of the array string_array with the reference of the string literal `"Hi there". String literal is simply a char array.
Try: string_array[0][0] = 'a'; and it will fail as well as you will dereference not initialized pointer.
Same happens in method 2.
Method 3. You allocate the memory for one int value and store the reference to it in the [0] element of the array. As the pointer references the valid object you can derefence it (int_matrix2[0][0] = 42;)

C, Why does my custom free function give me the "pointer being freed was not allocated" error

I am trying to dynamically allocate an array, put some data in it, and then free it and set the array pointer to null so that it can not be accessed in the future. Also, unrelated, but I am storing the size of the array in the first element and then passing it back indexed one up, it is part of the assignment, so hopefully that doesn't confuse anyone.
If I am understanding the error correctly, I am trying to call free() on the array that my malloc'ed array was copied in to. This is not allowed because free() is not being called on the actual malloc'ed array but rather the one that's holding its values.
If this is the case, how would I fix my call of free() to only receive an array address and dereference it like free(*array);. Right now I have some mess of asteriscs and a cast and I have no idea why it works. If you know how to fix the free call into the above or just explain why what I have now works, I would greatly appreciate it. My goal is to be able to set the parameter for the custom free function to a void pointer instead of a specific data type pointer. Thanks!!
#include <stdlib.h>
int getSizeArray(void *array);
void * createArray(int n, int sizeOfDatatype);
void freeArray(double ** array);
int main(void){
double * arr = createArray(10, sizeof(double));
int size = getSizeArray(arr);
/* using output for error checking
for(int i = 0; i < 10; i++){
arr[i] = i;
}
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
*/
void* p = &arr;
freeArray(p);
}
int getSizeArray(void *array){
int s = ((int *) array)[-1];
return s;
}
void * createArray(int n, int sizeOfDatatype){
int * array = malloc((n * sizeOfDatatype) + sizeof(int));
array[0] = n;
return (void*) (array + 1);
}
void freeArray(double ** array){
free(*array);
*array = NULL;
}
EDIT: Look to #JonathanLeffler comment. The issue is with alignment. I switched around some of my code but I had to index back one and not cast in my functions but instead in main
#include <stdlib.h>
int getSizeArray(void *array);
void * createArray(int n, int sizeOfDatatype);
void freeArray(double ** array);
int main(void){
double * arr = createArray(10, sizeof(double));
arr = (void*) (arr + 1);
int size = getSizeArray(arr);
/* using output for error checking*/
for(int i = 0; i < 10; i++){
arr[i] = i;
}
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
arr = (double*) (arr - 1);
freeArray(&arr);
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
}
int getSizeArray(void *array){
int s = ((int *) array)[-1];
return s;
}
void * createArray(int n, int sizeOfDatatype){
int * array = malloc((n * sizeOfDatatype) + sizeof(int));
array[0] = n;
return array;
}
void freeArray(double ** array){
free(*array);
*array = NULL;
}
I provided a complete solution to this problem for another user. Must be a class assignment. My version is very similar to yours except I used macros instead of functions. Anyway, #Serge answer was so close. It -1 not +1.
Here what I plug into my code and it worked fine:
void freeArray(void** array)
{
free( ((int*)(*array)) - 1 );
*array = NULL;
}
Let me explain what going on. The C allocation routines are basically doing what you are doing. They save the array size one word above the actual array. Follow link for more information on how free() works. In our version, we are saving the array size one int (2 words/4 bytes) above the actual array. Your code was wrong because the address you reference is the 3rd element and not the first. You need to pass in the address where the array allocation originated which is ((int*)(*array)) - 1.
If you free(*array), you don't need to *array = NULL after that.
Also, you can't cast a (void *) onto an (int *) and assign it to a (double *).
Lastly, you can't freeArray(p); if p is a single pointer since freeArray(double ** array) has a parameter of double double-pointer.
Hopefully, this helps.
You can compare my modified code.
#include <stdlib.h>
#include <stdio.h>
int getSizeArray(void *array);
void * createArray(int n, int sizeOfDatatype);
void freeArray(void ** array);
int main(void){
double * arr = (double *)createArray(10, sizeof(double));
int size = getSizeArray(arr);
printf("size of arr %d\n", size);
// using output for error checking
for(int i = 0; i < 10; i++){
arr[i] = i;
}
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
void ** p = (void **)&arr;
freeArray(p);
printf("del arr, then arr = %u\n",(unsigned)arr);
}
int getSizeArray(void *array){
int s = ((int *) array)[-1];
return s;
}
void * createArray(int n, int sizeOfDatatype){
int * array = (int*)malloc((n * sizeOfDatatype) + sizeof(int));
array[0] = n;
return (void*) (array + 1);
}
void freeArray(void ** array){
free(((int*)*array)-1);
*array = NULL;
}
output:
size of arr 10
0.000000
1.000000
2.000000
3.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
del arr, then arr = 0

CUDA Array set to 0 after kernel call

I have a simple program with 3 array, that count how much the third array is 0 and the first and second has same values. when it's true increment another array index.
The problems are:
If kernel has only the first if() then function the array A is ever 0
If I insert if() then else function the values of array A is set to 0 after index = 2 and don't count the state when A,B,C=0
this is the code
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, float *b, float *c, float *res)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (a[idx]=b[idx] && c[idx]==0) {
res[0]++;
}
else if (a[idx]=b[idx] && c[idx]==1){
res[1]++;
}
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
float *b_h, *b_d; // Pointer to host & device arrays
float *c_h, *c_d; // Pointer to host & device arrays
float *res_h, *res_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
//size_t size_s = 4 * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
b_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &b_d, size); // Allocate array on device
c_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &c_d, size); // Allocate array on device
res_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &res_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
// for (int i=0; i<N; i++) a_h[i] = (float)i;
for (int i=0; i<N; i++) a_h[i] = (float)i;
for (int i=0; i<N; i++) b_h[i] = (float)i;
for (int i=0; i<N; i++) c_h[i] = (float)i;
for (int i=0; i<4; i++) res_h[i] = 0;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(c_d, c_h, size, cudaMemcpyHostToDevice);
cudaMemcpy(res_d, res_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 8;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, b_d, c_d, res_d);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(b_h, b_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(c_h, c_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
cudaMemcpy(res_h, res_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++){
printf("%f A \n", a_h[i]);
}
for (int i=0; i<N; i++){
printf("%f B \n", b_h[i]);
}
for (int i=0; i<N; i++){
printf("%f C \n", c_h[i]);
}
for (int i=0; i<4; i++){
printf("%f res \n", res_h[i]);
}
// Cleanup
free(a_h); cudaFree(a_d);
free(b_h); cudaFree(b_d);
free(c_h); cudaFree(c_d);
free(res_h); cudaFree(res_d);
}
Aside from the = in if (a[idx]=b[idx] && c[idx]==0) { that should be == as you already found (and same goes for the following if statement), there are at least two other issues in your code:
You don't check that the thread index doesn't go over the limit of the arrays. So since you are using 2 block of 8 threads, you have 16 threads accessing 10 elements arrays. To avoid the issue, you need to pass N as parameter for your kernel and add a if ( idx < N ) somewhere.
You accumulate in res in parallel without any sort of protection, leading to all kinds of race conditions. This is a very typical histogram issue that is explained aplenty in the literature (web, books, CUDA examples...). A quick fix for you (albeit probably not the most effective one) would be to use atomic operations, such as atomicAdd. In you case, the line res[0]++; would become atomicAdd( &res[0], 1 );, and res[1]++; would become (as you guessed) atomicAdd( &res[1], 1 );. The support of this for float implies you compile your code while using compute capability at least 2.0.
HTH
Sorry, I solved the problem.It was a mistake typing control = and not true ==

Dynamic Matrix Multiplication with Pthreads

I'm a beginner with Thread Programming and C in general and I'm trying to figure out how to do a simple Matrix Multiplication with Pthreads. I want to create a thread for every column and put the results in a Result Matrix. I'm trying to do it dynamicly, which means the user is allowed to use an input as a size to create two n x n matrices.
My code right now, excluding filling the matrix and reading the size n is the following:
#include <pthread.h>
#include <stdio.h>
#include<stdlib.h>
typedef struct Matrix {
int line, col, size;
double (*MA)[];
double (*MB)[];
double (*MC)[];
} Matrix;
void *multiply(void *arg) {
Matrix* work = (Matrix*) arg;
int s, z;
s = work->col;
z = work->line;
work->MC[0][0] = 0.0.//can't use MC, MB, MA here!!
return 0;
}
int main() {
Matrix* m;
//read size and set it to int size (miissing here, does work)
double MA[size][size], MB[size][size], MC[size][size];
int i, j;
//filling the matrices (missing here, does work)
pthread_t threads[size];
for (i = 0; i < size; i++) {
m = malloc(sizeof(Matrix*));
m->size = size;
m->col = i;
pthread_create(&threads[i], NULL, multiply, m);
}
for (i = 0; i < size; i++) {
pthread_join(threads[i], NULL);
}
return 0;
}
The problem is, that I cant use neither MA, MB nor NC(:= the result) in the multiply method with something like its shown in the code.
I just get the error "invalid use of array with unspecific bounds" even though I declared all three of them in the main method.
Do I understand anything wrong here or how can I fix that? I tried to adapt a example of my lecture where a thread for every element will be created.
Thanks in advance!
Just about the error:
work->MC[0][0] = 0.0.//can't use MC, MB, MA here!!
MC was declared as double (*MC)[] and you try to use it as a two dimensional array like you had declared it double MC[N]{M]. You can use a two (or more) dimensional array like you did if and only if the first dimension was fixed or if you alloc it row by row.
So your program could be:
#include <pthread.h>
#include <stdio.h>
#include<stdlib.h>
typedef struct Matrix {
int line, col, size;
double MA[][];
double MB[][];
double MC[][];
} Matrix;
void *multiply(void *arg) {
Matrix* work = (Matrix*) arg;
int s, z;
s = work->col;
z = work->line;
work->MC[0][0] = 0.0
return 0;
}
int main() {
Matrix* m;
//read size and set it to int size (miissing here, does work)
double MA[][], MB[][], MC[][];
int i, j;
pthread_t threads[size];
MA = (double **) malloc(size * sizeof(double *));
MB = (double **) malloc(size * sizeof(double *));
MC = (double **) malloc(size * sizeof(double *));
for(int i=0;i<size;++i){
MA[i] = (double *) malloc(size * sizeof(double));
MB[i] = (double *) malloc(size * sizeof(double));
MC[i] = (double *) malloc(size * sizeof(double));
}
for (i = 0; i < size; i++) {
m = malloc(sizeof(Matrix*));
m->MA = MA;
m->MB = MB;
m->MC = MC;
m->size = size;
m->col = i;
pthread_create(&threads[i], NULL, multiply, m);
}
for (i = 0; i < size; i++) {
pthread_join(threads[i], NULL);
}
return 0;
}
But you must TAKE CARE that the thread can access to the data concurrently and so you should use some locks if different threads can use and change same values.

How can I add up two 2d (pitched) arrays using nested for loops?

I'm new to cuda. I want to add up two 2d array into a third array.
I use following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
now my problem is that i dont want to use these array as flattened 2-d array
all in my kernel code i want to di is use two for loop & put the result in the third array like
__global__ void add(int *dev_a ,int *dev_b,int* dec_c)
{
for i=0;i<2;i++)
{
for j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
How i can do this in CUDA?
please tell me how to use 2-d array in this way ?
What should be the kernel call for using 2d-array ?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch()function does exactly what its name implies, it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global___ void add(int *dev_a[] ,int *dev_b[], int* dec_c[])
{
for i=0;i<2;i++) {
for j=0;j<2;j++) {
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that, using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}

Resources