Realloc + memcpy 2D float array results in segmentation fault - c

I made a structure (SomeMisc) which has a float array, so I can fill it with some values, and then try to memcpy its float array to a different struct's float array, and print out the result to see if it worked.
The other structure (ArrayPairs) is supposed to hold two arrays of arrays. So that low[i] belongs to high[i] and vice versa when I want to make some changes on "a couple".
So I make 2 SomeMisc objects, fill their arrays with numbers, and then try to make a function where I expand the low-array and high-array of the ArrayPairs object with realloc firstly, then I try to malloc space to the new rows, and then finally memcpy content from the 2 SomeMisc member arrays given as arguments to the function.
But it keeps resulting in segmentation faults and/or undefined behavior, and I can't figure out why.
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <stdlib.h>
typedef struct some{
int32_t len;
float* arr;
} SomeMisc;
typedef struct arrPrs{
int32_t amountOfRows;
int32_t amountOfColumns;
float** low;
float** high;
} ArrayPairs;
void initializeArrayPairArray(ArrayPairs* AP, int32_t length, int32_t width){
AP->amountOfRows = length;
AP->amountOfColumns = width;
AP->low = (float**)malloc(length * sizeof(float*));
AP->high = (float**)malloc(length * sizeof(float*));
for(int i=0; i<length; i++){
AP->low[i] = (float*)malloc(width * sizeof(float));
AP->high[i] = (float*)malloc(width * sizeof(float));
for(int j=0; j<width; j++){
AP->low[i][j] = 32;
AP->high[i][j] = 44;
}
}
}
void addArrayPair(ArrayPairs* AP, float* low, float* high){
AP->amountOfRows++;
AP->low = (float**)realloc(AP->low, AP->amountOfRows * sizeof(float*));
AP->high = (float**)realloc(AP->high, AP->amountOfRows * sizeof(float*));
AP->low[AP->amountOfRows] = (float*)malloc(AP->amountOfColumns * sizeof(float));
AP->high[AP->amountOfRows] = (float*)malloc(AP->amountOfColumns * sizeof(float));
memcpy(AP->low[AP->amountOfRows], low, AP->amountOfColumns * sizeof(float));
memcpy(AP->high[AP->amountOfRows], high, AP->amountOfColumns * sizeof(float));
printf("TESTING PRINT: %.2f\n", AP->high[10][5]);
}
int main () {
int32_t nrOfCols = 8;
int32_t nrOfRows = 10;
ArrayPairs arr;
initializeArrayPairArray(&arr, nrOfRows, nrOfCols);
int32_t mArrLength = 2;
SomeMisc* mArr = (SomeMisc*)malloc(mArrLength*sizeof(SomeMisc));
for(int i=0; i<mArrLength; i++){
mArr[i].arr = (float*)malloc(nrOfCols*sizeof(float));
for(int j=0; j<nrOfCols; j++){
mArr[i].arr[j] = (i+1)*j;
}
}
addArrayPair(&arr, mArr[0].arr, mArr[1].arr);
printf("LOW:\tHIGH:\n");
for(int i=9; i<arr.amountOfRows; i++){
printf("INDEX: %d\n",i);
for(int j=0; j<arr.amountOfColumns; j++){
printf("%.2f\t%.2f\n",arr.low[i][j],arr.high[i][j]);
}
printf("\n");
}
return(0);
}
I followed this answer: 2d array realloc Segmentation Fault Error
But I already have the ArrayPairs* AP in the parameter list of addArrayPair, and the & with the object arr when calling the function.
I also tried dereferencing as was suggested in that answer, but this didn't work either:
void addArrayPair(ArrayPairs* AP, float* low, float* high){
(*AP).amountOfRows++;
(*AP).low = (float**)realloc((*AP).low, AP->amountOfRows * sizeof(float*));
(*AP).high = (float**)realloc((*AP).high, AP->amountOfRows * sizeof(float*));
(*AP).low[AP->amountOfRows] = (float*)malloc((*AP).amountOfColumns * sizeof(float));
(*AP).high[AP->amountOfRows] = (float*)malloc((*AP).amountOfColumns * sizeof(float));
memcpy((*AP).low[(*AP).amountOfRows], low, (*AP).amountOfColumns * sizeof(float));
memcpy((*AP).high[(*AP).amountOfRows], high, (*AP).amountOfColumns * sizeof(float));
}

You increase AP->amountOfRows too early. That means when you do AP->low[AP->amountOfRows] you will use an out-of-bounds index, and have undefined behavior
Instead (re)allocate AP->amountOfRows + 1 elements, and increase AP->amountOfRows once all allocations and copying is done:
void addArrayPair(ArrayPairs* AP, float* low, float* high){
AP->low = realloc(AP->low, (AP->amountOfRows + 1) * sizeof(float*));
AP->high = realloc(AP->high, (AP->amountOfRows + 1) * sizeof(float*));
AP->low[AP->amountOfRows] = malloc(AP->amountOfColumns * sizeof(float));
AP->high[AP->amountOfRows] = malloc(AP->amountOfColumns * sizeof(float));
memcpy(AP->low[AP->amountOfRows], low, AP->amountOfColumns * sizeof(float));
memcpy(AP->high[AP->amountOfRows], high, AP->amountOfColumns * sizeof(float));
// Increase once all is done
AP->amountOfRows++;
}

Related

C, Why does my custom free function give me the "pointer being freed was not allocated" error

I am trying to dynamically allocate an array, put some data in it, and then free it and set the array pointer to null so that it can not be accessed in the future. Also, unrelated, but I am storing the size of the array in the first element and then passing it back indexed one up, it is part of the assignment, so hopefully that doesn't confuse anyone.
If I am understanding the error correctly, I am trying to call free() on the array that my malloc'ed array was copied in to. This is not allowed because free() is not being called on the actual malloc'ed array but rather the one that's holding its values.
If this is the case, how would I fix my call of free() to only receive an array address and dereference it like free(*array);. Right now I have some mess of asteriscs and a cast and I have no idea why it works. If you know how to fix the free call into the above or just explain why what I have now works, I would greatly appreciate it. My goal is to be able to set the parameter for the custom free function to a void pointer instead of a specific data type pointer. Thanks!!
#include <stdlib.h>
int getSizeArray(void *array);
void * createArray(int n, int sizeOfDatatype);
void freeArray(double ** array);
int main(void){
double * arr = createArray(10, sizeof(double));
int size = getSizeArray(arr);
/* using output for error checking
for(int i = 0; i < 10; i++){
arr[i] = i;
}
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
*/
void* p = &arr;
freeArray(p);
}
int getSizeArray(void *array){
int s = ((int *) array)[-1];
return s;
}
void * createArray(int n, int sizeOfDatatype){
int * array = malloc((n * sizeOfDatatype) + sizeof(int));
array[0] = n;
return (void*) (array + 1);
}
void freeArray(double ** array){
free(*array);
*array = NULL;
}
EDIT: Look to #JonathanLeffler comment. The issue is with alignment. I switched around some of my code but I had to index back one and not cast in my functions but instead in main
#include <stdlib.h>
int getSizeArray(void *array);
void * createArray(int n, int sizeOfDatatype);
void freeArray(double ** array);
int main(void){
double * arr = createArray(10, sizeof(double));
arr = (void*) (arr + 1);
int size = getSizeArray(arr);
/* using output for error checking*/
for(int i = 0; i < 10; i++){
arr[i] = i;
}
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
arr = (double*) (arr - 1);
freeArray(&arr);
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
}
int getSizeArray(void *array){
int s = ((int *) array)[-1];
return s;
}
void * createArray(int n, int sizeOfDatatype){
int * array = malloc((n * sizeOfDatatype) + sizeof(int));
array[0] = n;
return array;
}
void freeArray(double ** array){
free(*array);
*array = NULL;
}
I provided a complete solution to this problem for another user. Must be a class assignment. My version is very similar to yours except I used macros instead of functions. Anyway, #Serge answer was so close. It -1 not +1.
Here what I plug into my code and it worked fine:
void freeArray(void** array)
{
free( ((int*)(*array)) - 1 );
*array = NULL;
}
Let me explain what going on. The C allocation routines are basically doing what you are doing. They save the array size one word above the actual array. Follow link for more information on how free() works. In our version, we are saving the array size one int (2 words/4 bytes) above the actual array. Your code was wrong because the address you reference is the 3rd element and not the first. You need to pass in the address where the array allocation originated which is ((int*)(*array)) - 1.
If you free(*array), you don't need to *array = NULL after that.
Also, you can't cast a (void *) onto an (int *) and assign it to a (double *).
Lastly, you can't freeArray(p); if p is a single pointer since freeArray(double ** array) has a parameter of double double-pointer.
Hopefully, this helps.
You can compare my modified code.
#include <stdlib.h>
#include <stdio.h>
int getSizeArray(void *array);
void * createArray(int n, int sizeOfDatatype);
void freeArray(void ** array);
int main(void){
double * arr = (double *)createArray(10, sizeof(double));
int size = getSizeArray(arr);
printf("size of arr %d\n", size);
// using output for error checking
for(int i = 0; i < 10; i++){
arr[i] = i;
}
for(int j = 0; j < 10; j++){
printf("%f\n", arr[j]);
}
void ** p = (void **)&arr;
freeArray(p);
printf("del arr, then arr = %u\n",(unsigned)arr);
}
int getSizeArray(void *array){
int s = ((int *) array)[-1];
return s;
}
void * createArray(int n, int sizeOfDatatype){
int * array = (int*)malloc((n * sizeOfDatatype) + sizeof(int));
array[0] = n;
return (void*) (array + 1);
}
void freeArray(void ** array){
free(((int*)*array)-1);
*array = NULL;
}
output:
size of arr 10
0.000000
1.000000
2.000000
3.000000
4.000000
5.000000
6.000000
7.000000
8.000000
9.000000
del arr, then arr = 0

Segmentation code when accessing allocated memory

I need to fill sparse matrix with random elemnts. I am trying to get random elements and write them as value, column and row elements of their arrays but I keep running into segmentation faults.
This only happens if I set N=1000 then SIZE=10000 (because if I set SIZE=1000 or less, it works).
Does that mean I can't allocate all this memory or access it after I allocated it?
What should I do if I really need to get all this memory (SIZE=10'000) allocated? can someone please help me?
#include <stdio.h>
#include <stdlib.h>
typedef struct _matrix {
int size; //number of not-null elements
int ord; //order of matrix
int* val;
int* col;
int* row;
} matrix;
matrix init (int ord, int size)
{
matrix m;
m.ord = ord;
m.size = size;
m.val = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
return m;
}
matrix fill_matrix (int ord)
{
int i, j, gap, size = ord * ord / 100;
matrix new_matrix = init(ord, size);
j = 1;
for (i = 0; i < size; i++) {
new_matrix.val[i] = rand() % 9 + 1;
new_matrix.col[i] = rand() % ord + 1; //<------SEGFAULT
new_matrix.row[i] = rand() % ord + 1;
j++;
}
return new_matrix;
}
int main()
{
matrix A;
int n = 1000;
A = fill_matrix(n);
return 0;
}
You assign to val three times, but never initialize the other two pointers.
m.val = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
I'm guessing you meant this:
m.val = malloc(sizeof(int) * size);
m.col = malloc(sizeof(int) * size);
m.row = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
m.val = malloc(sizeof(int) * size);
You never malloc m.col and m.row :-) Cut & Paste programming is evil...

Dynamic Matrix Multiplication with Pthreads

I'm a beginner with Thread Programming and C in general and I'm trying to figure out how to do a simple Matrix Multiplication with Pthreads. I want to create a thread for every column and put the results in a Result Matrix. I'm trying to do it dynamicly, which means the user is allowed to use an input as a size to create two n x n matrices.
My code right now, excluding filling the matrix and reading the size n is the following:
#include <pthread.h>
#include <stdio.h>
#include<stdlib.h>
typedef struct Matrix {
int line, col, size;
double (*MA)[];
double (*MB)[];
double (*MC)[];
} Matrix;
void *multiply(void *arg) {
Matrix* work = (Matrix*) arg;
int s, z;
s = work->col;
z = work->line;
work->MC[0][0] = 0.0.//can't use MC, MB, MA here!!
return 0;
}
int main() {
Matrix* m;
//read size and set it to int size (miissing here, does work)
double MA[size][size], MB[size][size], MC[size][size];
int i, j;
//filling the matrices (missing here, does work)
pthread_t threads[size];
for (i = 0; i < size; i++) {
m = malloc(sizeof(Matrix*));
m->size = size;
m->col = i;
pthread_create(&threads[i], NULL, multiply, m);
}
for (i = 0; i < size; i++) {
pthread_join(threads[i], NULL);
}
return 0;
}
The problem is, that I cant use neither MA, MB nor NC(:= the result) in the multiply method with something like its shown in the code.
I just get the error "invalid use of array with unspecific bounds" even though I declared all three of them in the main method.
Do I understand anything wrong here or how can I fix that? I tried to adapt a example of my lecture where a thread for every element will be created.
Thanks in advance!
Just about the error:
work->MC[0][0] = 0.0.//can't use MC, MB, MA here!!
MC was declared as double (*MC)[] and you try to use it as a two dimensional array like you had declared it double MC[N]{M]. You can use a two (or more) dimensional array like you did if and only if the first dimension was fixed or if you alloc it row by row.
So your program could be:
#include <pthread.h>
#include <stdio.h>
#include<stdlib.h>
typedef struct Matrix {
int line, col, size;
double MA[][];
double MB[][];
double MC[][];
} Matrix;
void *multiply(void *arg) {
Matrix* work = (Matrix*) arg;
int s, z;
s = work->col;
z = work->line;
work->MC[0][0] = 0.0
return 0;
}
int main() {
Matrix* m;
//read size and set it to int size (miissing here, does work)
double MA[][], MB[][], MC[][];
int i, j;
pthread_t threads[size];
MA = (double **) malloc(size * sizeof(double *));
MB = (double **) malloc(size * sizeof(double *));
MC = (double **) malloc(size * sizeof(double *));
for(int i=0;i<size;++i){
MA[i] = (double *) malloc(size * sizeof(double));
MB[i] = (double *) malloc(size * sizeof(double));
MC[i] = (double *) malloc(size * sizeof(double));
}
for (i = 0; i < size; i++) {
m = malloc(sizeof(Matrix*));
m->MA = MA;
m->MB = MB;
m->MC = MC;
m->size = size;
m->col = i;
pthread_create(&threads[i], NULL, multiply, m);
}
for (i = 0; i < size; i++) {
pthread_join(threads[i], NULL);
}
return 0;
}
But you must TAKE CARE that the thread can access to the data concurrently and so you should use some locks if different threads can use and change same values.

Implementing CUDA VecAdd from sample code

I'm trying to test out a sample code from the CUDA site http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels.
I simply want to add two arrays A and B of size 4, and store it in array C. Here is what I have so far:
#include <stdio.h>
#include "util.h"
void print_array(int* array, int size) {
int i;
for (i = 0; i < size; i++) {
printf("%d ", array[i]);
}
printf("\n");
}
__global__ void VecAdd(int* A, int* B, int* C) {
int i = threadIdx.x;
C[i] = A[i] + B[i];
}
int main(int argc , char **argv) {
int N = 4;
int i;
int *A = (int *) malloc(N * sizeof(int));
int *B = (int *) malloc(N * sizeof(int));
int *C = (int *) malloc(N * sizeof(int));
for (i = 0; i < N; i++) {
A[i] = i + 1;
B[i] = i + 1;
}
print_array(A, N);
print_array(B, N);
VecAdd<<<1, N>>>(A, B, C);
print_array(C, N);
return 0;
}
I'm expecting the C array (the last row of the output) to be 2, 4, 6, 8, but it doesn't seem to get added:
1 2 3 4
1 2 3 4
0 0 0 0
What am I missing?
First, you have to define the pointers that will hold the data that will be copied to GPU:
In your example, we want to copy the arrays 'a','b' and 'c' from CPU to the GPU's global memory.
int a[array_size], b[array_size],c[array_size]; // your original arrays
int *a_cuda,*b_cuda,*c_cuda; // defining the "cuda" pointers
define the size that each array will occupy.
int size = array_size * sizeof(int); // Is the same for the 3 arrays
Then you will allocate the space to the data that will be used in cuda:
Cuda memory allocation:
msg_erro[0] = cudaMalloc((void **)&a_cuda,size);
msg_erro[1] = cudaMalloc((void **)&b_cuda,size);
msg_erro[2] = cudaMalloc((void **)&c_cuda,size);
Now we need to copy this data from CPU to the GPU:
Copy from CPU to GPU:
msg_erro[3] = cudaMemcpy(a_cuda, a,size,cudaMemcpyHostToDevice);
msg_erro[4] = cudaMemcpy(b_cuda, b,size,cudaMemcpyHostToDevice);
msg_erro[5] = cudaMemcpy(c_cuda, c,size,cudaMemcpyHostToDevice);
Execute the kernel
int blocks = //;
int threads_per_block = //;
VecAdd<<<blocks, threads_per_block>>>(a_cuda, b_cuda, c_cuda);
Copy the results from GPU to CPU (in our example array C):
msg_erro[6] = cudaMemcpy(c,c_cuda,size,cudaMemcpyDeviceToHost);
Free Memory:
cudaFree(a_cuda);
cudaFree(b_cuda);
cudaFree(c_cuda);
For debugging purposes, I normally save the status of the functions on an array, like this:
cudaError_t msg_erro[var];
However, this is not strictly necessary but it will save you time if an error occurs during the allocation or memory transference. You can take out all the 'msg_erro[x] =' from the code above if you wish.
If you mantain the 'msg_erro[x] =', and if a error does occur you can use a function like the one that follows, to print these erros:
void printErros(cudaError_t *erros,int size)
{
for(int i = 0; i < size; i++)
printf("{%d} => %s\n",i ,cudaGetErrorString(erros[i]));
}
You need to transfer the memory back and forth from/to the GPU, something like
int *a_GPU, *b_GPU, *c_GPU;
cudaMalloc(&a_GPU, N*sizeof(int));
cudaMalloc(&b_GPU, N*sizeof(int));
cudaMalloc(&c_GPU, N*sizeof(int));
cudaMemcpy(a_GPU, A, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b_GPU, B, N*sizeof(int), cudaMemcpyHostToDevice);
VecAdd<<<1, N>>>(a_GPU, b_GPU, c_GPU);
cudaMemcpy(C, c_GPU, N*sizeof(int), cudaMemcpyDeviceToHost);
print_array(C, N);
cudaFree(a_GPU);
cudaFree(b_GPU);
cudaFree(c_GPU);

How can I add up two 2d (pitched) arrays using nested for loops?

I'm new to cuda. I want to add up two 2d array into a third array.
I use following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
now my problem is that i dont want to use these array as flattened 2-d array
all in my kernel code i want to di is use two for loop & put the result in the third array like
__global__ void add(int *dev_a ,int *dev_b,int* dec_c)
{
for i=0;i<2;i++)
{
for j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
How i can do this in CUDA?
please tell me how to use 2-d array in this way ?
What should be the kernel call for using 2d-array ?
If possible, please explain using code samples.
The short answer is, you can't. The cudaMallocPitch()function does exactly what its name implies, it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global___ void add(int *dev_a[] ,int *dev_b[], int* dec_c[])
{
for i=0;i<2;i++) {
for j=0;j<2;j++) {
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that, using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
You don't need to use for loops inside the device. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}

Resources