I am using CUDA to add two matrices and put their result in a third matrix. I want to make use of the shared memory feature, and for this I wrote the following:
#include <stdio.h>
#include <cuda.h>
#define grid 1024
#define BSZ 16
__global__ void addition(int *dev_a, int *dev_b, int *dev_c)
{
__shared__ int as[BSZ][BSZ];
__shared__ int bs[BSZ][BSZ];
int by = blockIdx.y;
int bx = blockIdx.x;
int cvalue;
int ty = threadIdx.y;
int tx = threadIdx.x;
int row = by * BSZ + ty;
int col = bx * BSZ + tx;
as[ty][tx] = dev_a[row*grid + col];
bs[ty][tx] = dev_b[row*grid + col];
__syncthreads();
cvalue = as[ty][tx] + bs[ty][tx];
__syncthreads();
dev_c[row*grid + col] = cvalue;
}
int main ()
{
int a[grid][grid], b[grid][grid], c[grid][grid];
//c = a + b
for(int i=0;i<grid;i++)
{
for(int j=0;j<grid;j++)
{
a[i][j]=2;
b[i][j]=1;
}
}
printf("Working fine here");
int *dev_a;
int *dev_b;
int *dev_c;
int size = grid * grid * sizeof(int);
printf("Working fine");
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
cudaMalloc( (void**)&dev_c, size );
cudaMemcpy(dev_a,a,size,cudaMemcpyHostToDevice);
cudaMemcpy(dev_b,b,size,cudaMemcpyHostToDevice);
dim3 dimBlock(BSZ,BSZ);
dim3 dimGrid(grid/dimBlock.x,grid/dimBlock.y);
//Kernel launch
addition<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c,dev_c,size,cudaMemcpyDeviceToHost);
for (int i=0; i<grid; i++)
{
for(int j=0;j<grid;j++)
{
printf( "%d + %d = %d\n", a[i][j], b[i][j], c[i][j] );
}
}
}
I am getting a segmentation fault, and I am not able to understand why! Please, someone help me with this.
int a[1024][1024], b[1024][1024], c[1024][1024];
The size of these objects is astronomical! Each array is 1024 x 1024 ints, i.e. 4 MB with a 4-byte int, so the three together need 12 MB, while a typical default stack is only 1-8 MB. You're almost certainly overflowing the stack. I think you'll find the segfaults vanish if you reduce their sizes, increase the size of your stack (however your implementation permits you to do that), or allocate them with dynamic storage duration (e.g. malloc, or in your case cudaMalloc) rather than automatic storage duration.
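A minimal sketch of the heap-based fix, keeping the kernel and the CUDA calls unchanged and flattening the matrices to row-major 1-D arrays (size must be computed before the mallocs if you reorder the code this way):
// 3 x 1024 x 1024 x sizeof(int) = 12 MB is too much for the stack,
// but is no problem for malloc.
int *a = (int *)malloc(size);
int *b = (int *)malloc(size);
int *c = (int *)malloc(size);
if (!a || !b || !c) { fprintf(stderr, "host malloc failed\n"); return 1; }
for (int i = 0; i < grid; i++)
    for (int j = 0; j < grid; j++) {
        a[i*grid + j] = 2; // element [i][j] of the flattened matrix
        b[i*grid + j] = 1;
    }
/* ... cudaMalloc / cudaMemcpy / kernel launch exactly as before ... */
free(a); free(b); free(c);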
I have written a CUDA program to do some operations on a large array. But when I pass that array to a CUDA kernel, not all of its elements are accessed by the threads. Below is a simple program illustrating my use case:
#include <stdio.h>
#include <stdlib.h>
__global__
void kernel(int n){
int s = threadIdx.x + blockIdx.x*blockDim.x;
int t = blockDim.x*gridDim.x;
for(int i=s;i<n;i+=t){
printf("%d\n",i); //printing index of array which is being accessed
}
}
int main(void){
int i,n = 10000; //array_size
int blockSize = 64;
int numBlocks = (n + blockSize - 1) / blockSize;
kernel<<<numBlocks, blockSize>>>(n);
cudaDeviceSynchronize();
}
I've tried different values of blockSize (256, 128, 64, etc.), but it is not printing all the indices of the array. Ideally it should print some permutation of 0 to n-1; however, it prints fewer than n numbers.
If numBlocks and blockSize are both 1, it accesses all the elements. It also accesses all the elements if the array size is less than 4096.
Actually, all of the values are being printed in this case, but you may not be able to see all of them due to the output console's buffer limit. Try increasing the console's buffer size.
Additionally, keep in mind that printf calls inside a kernel execute out of order. There are also size limits on the device-side printf buffer, which are explained in the documentation.
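If you want to see more of the output, the device-side printf FIFO can be enlarged before the kernel launch; a sketch (the default size is on the order of 1 MB):
// Grow the device printf FIFO to 16 MB; must be set before the launch.
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 16 * 1024 * 1024);
kernel<<<numBlocks, blockSize>>>(n);
cudaDeviceSynchronize(); // flushes the device printf buffer to the host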
Use better debugging techniques! Your code is functioning properly, as the following variant demonstrates: each thread marks the indices it visits, and the final sum comes out to exactly n = 10000.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
__global__
void kernel(int* in, int n){
int s = threadIdx.x + blockIdx.x*blockDim.x;
int t = blockDim.x*gridDim.x;
for (int i = s; i<n; i += t){
in[i] = 1; // mark index i as visited
}
}
int main(void){
int n = 10000; // array size
int blockSize = 64;
int numBlocks = (n + blockSize - 1) / blockSize;
int* d_res,*h_res;
cudaMalloc(&d_res, n*sizeof(int));
h_res = (int*)malloc(n*sizeof(int));
kernel<<<numBlocks, blockSize>>>(d_res, n);
cudaDeviceSynchronize();
cudaMemcpy(h_res, d_res, n*sizeof(int), cudaMemcpyDeviceToHost);
int sum = 0;
for (int i = 0; i < n; i++)
sum += h_res[i];
printf("%d", sum);
}
I am trying to reduce an array to the sum of its elements using CUDA. I am having trouble communicating the sum calculated on the device back to the host so that it can be printed out.
This is the output I get:
contents of Array: 33 36 27 15 43 35 36 42 49 21
Reduced sum of Array elements = 4204303
The reduced sum is obviously wrong.
Here is my code.
#include <stdio.h>
#include <cuda.h>
#define N 10
__global__ void reduce(int *g_idata, int *g_odata);
void random_ints (int *a, int n);
int main( void ) {
int a[N], b[N]; // host copies of a and b
int *dev_a, *dev_b; // device copies of a and b
int size = N * sizeof( int ); // we need space for N integers
// allocate device copies of a, b, c
cudaMalloc( (void**)&dev_a, size );
cudaMalloc( (void**)&dev_b, size );
//a = (int *)malloc( size );
//b = (int *)malloc( size );
random_ints( a, N );
printf("contents of Array: ");
for(int i =0; i<N; i++)
{
printf(" %d ", a[i]);
}
printf("\n");
// copy inputs to device
cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice );
cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice );
// launch the reduce() kernel with 1 block and N threads
reduce<<< 1, N >>>( dev_a, dev_b);
// copy the device result back to host array b
cudaMemcpy( b, dev_b, sizeof( int ) , cudaMemcpyDeviceToHost );
printf("Reduced sum of Array elements = %d ", b[0]);
//free( a );
// free( b );
cudaFree( dev_a );
cudaFree( dev_b );
return 0;
}
__global__ void reduce(int *g_idata, int *g_odata) {
extern __shared__ int sdata[];
// each thread loads one element from global to shared mem
int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[threadIdx.x] = g_idata[i];
__syncthreads();
// do reduction in shared mem
for (int s=1; s < blockDim.x; s *=2)
{
int index = 2 * s * threadIdx.x;
if (index < blockDim.x)
{
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (threadIdx.x == 0) g_odata[blockIdx.x] = sdata[0];
}
// CPU function to generate a vector of random integers
void random_ints (int *a, int n) {
for (int i = 0; i < n; i++)
a[i] = rand() % 50; // random number between 0 and 49
}
You need to specify the amount of shared memory at launch time if you declare it with extern.
You have two options:
1. Keep the extern declaration and pass the shared-memory size as the third launch-configuration parameter:
extern __shared__ int sdata[];
reduce<<< 1, N, N*sizeof(int) >>>( dev_a, dev_b);
This parameter sets the number of bytes of dynamic shared memory available to the kernel per block.
2. Declare the array with a static size, where N is a compile-time constant:
__shared__ int sdata[N];
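Putting the first option together, a minimal sketch of the corrected launch and copy-back (kernel unchanged) would be:
// The third launch parameter is the number of bytes of dynamic shared
// memory per block, which backs "extern __shared__ int sdata[]".
reduce<<< 1, N, N*sizeof(int) >>>( dev_a, dev_b );
cudaMemcpy( b, dev_b, sizeof(int), cudaMemcpyDeviceToHost );
printf("Reduced sum of Array elements = %d\n", b[0]);
Note also that this interleaved-addressing reduction implicitly assumes blockDim.x is a power of two: with N = 10, index + s can reach past the end of sdata, so pad the shared array to the next power of two (zero-filling the extra slots) or add a bounds check if you want an exact sum.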
I have a strange problem dealing with a 2D array on the CUDA device.
#define VR 100 // rows
#define ST 13 // columns
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curand_init ( seed, id, 0, &globalState[id] );
cuPrintf("Thread id: %d \n", id);
float* row = (float*)(((char*)arr) + id * pitch);
for (int j = 0; j < ST; ++j) {
row[j] = generate(globalState, id);
}
}
int main() {
float *d_arr;
float *h_arr = new float[VR*ST];
size_t pitch;
cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);
dim3 dimBlock(VR);
dim3 dimGrid(1,1);
curandState* devStates;
cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
cudaMemcpy(h_arr, d_arr,VR*ST*sizeof(float),cudaMemcpyDeviceToHost);
for (int i=0; i<VR; i++) {
for (int j=0; j<ST; j++) {
cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
}
}
}
I don't get evenly distributed numbers; instead they appear in runs of 13 with a bunch of zeros in between. See: http://pastie.org/6106381
The problem is that the original data array is being allocated using cudaMallocPitch, whereas the copying is being done using an ordinary cudaMemcpy. This gives unexpected results because cudaMallocPitch pads each row to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Below is code with corrections that I believe make it functional:
#include <stdio.h>
#include <iostream>
#include <curand_kernel.h>
#define VR 100 // rows
#define ST 13 // columns
__device__ float generate(curandState* globalState, int id)
{
//int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curandState localState = globalState[id];
float rand;
do {
rand = curand_uniform( &localState );
} while(rand == 0); //
globalState[id] = localState;
return rand;
}
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curand_init ( seed, id, 0, &globalState[id] );
//cuPrintf("Thread id: %d \n", id);
float* row = (float*)(((char*)arr) + id * pitch);
for (int j = 0; j < ST; ++j) {
row[j] = generate(globalState, id);
}
}
using namespace std;
int main() {
float *d_arr;
float *h_arr = new float[VR*ST];
size_t pitch;
cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);
dim3 dimBlock(VR);
dim3 dimGrid(1,1);
curandState* devStates;
cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
cudaMemcpy2D(h_arr, ST*sizeof(float), d_arr, pitch, ST*sizeof(float), VR ,cudaMemcpyDeviceToHost);
for (int i=0; i<VR; i++) {
for (int j=0; j<ST; j++) {
cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
}
}
}
Compiling the above code using:
nvcc -arch=sm_20 -lcurand -o t70 t70.cu
and then running I get what appears to be "normal" output:
N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869
(results truncated)
I think the launch configuration is wrong: you should assign VR threads or blocks, because you already loop over the ST columns inside the kernel.
Maybe that will fix it.
I'm trying to test out a code sample from the CUDA site http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels.
I simply want to add two arrays A and B of size 4, and store it in array C. Here is what I have so far:
#include <stdio.h>
#include "util.h"
void print_array(int* array, int size) {
int i;
for (i = 0; i < size; i++) {
printf("%d ", array[i]);
}
printf("\n");
}
__global__ void VecAdd(int* A, int* B, int* C) {
int i = threadIdx.x;
C[i] = A[i] + B[i];
}
int main(int argc , char **argv) {
int N = 4;
int i;
int *A = (int *) malloc(N * sizeof(int));
int *B = (int *) malloc(N * sizeof(int));
int *C = (int *) malloc(N * sizeof(int));
for (i = 0; i < N; i++) {
A[i] = i + 1;
B[i] = i + 1;
}
print_array(A, N);
print_array(B, N);
VecAdd<<<1, N>>>(A, B, C);
print_array(C, N);
return 0;
}
I'm expecting the C array (the last row of the output) to be 2, 4, 6, 8, but the addition never seems to happen:
1 2 3 4
1 2 3 4
0 0 0 0
What am I missing?
First, you have to define the pointers that will hold the data to be copied to the GPU.
In your example, we want to copy the arrays 'a', 'b' and 'c' from the CPU to the GPU's global memory.
int a[array_size], b[array_size],c[array_size]; // your original arrays
int *a_cuda,*b_cuda,*c_cuda; // defining the "cuda" pointers
Define the size that each array will occupy:
int size = array_size * sizeof(int); // Is the same for the 3 arrays
Then allocate the space for the data that will be used on the device.
CUDA memory allocation:
msg_erro[0] = cudaMalloc((void **)&a_cuda,size);
msg_erro[1] = cudaMalloc((void **)&b_cuda,size);
msg_erro[2] = cudaMalloc((void **)&c_cuda,size);
Now we need to copy this data from the CPU to the GPU:
Copy from CPU to GPU:
msg_erro[3] = cudaMemcpy(a_cuda, a,size,cudaMemcpyHostToDevice);
msg_erro[4] = cudaMemcpy(b_cuda, b,size,cudaMemcpyHostToDevice);
msg_erro[5] = cudaMemcpy(c_cuda, c,size,cudaMemcpyHostToDevice);
Execute the kernel
int blocks = //;
int threads_per_block = //;
VecAdd<<<blocks, threads_per_block>>>(a_cuda, b_cuda, c_cuda);
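For the VecAdd kernel in this question, which computes its index from threadIdx.x alone, the launch has to be a single block with one thread per element (a sketch; N = 4 as in the question):
int blocks = 1; // VecAdd indexes with threadIdx.x only
int threads_per_block = N; // one thread per array element
VecAdd<<<blocks, threads_per_block>>>(a_cuda, b_cuda, c_cuda);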
Copy the results from GPU to CPU (in our example array C):
msg_erro[6] = cudaMemcpy(c,c_cuda,size,cudaMemcpyDeviceToHost);
Free Memory:
cudaFree(a_cuda);
cudaFree(b_cuda);
cudaFree(c_cuda);
For debugging purposes, I normally save the status of these functions in an array, like this:
cudaError_t msg_erro[var];
This is not strictly necessary, but it will save you time if an error occurs during allocation or memory transfer. You can take all the 'msg_erro[x] =' assignments out of the code above if you wish.
If you keep the 'msg_erro[x] =' assignments and an error does occur, you can use a function like the one that follows to print these errors:
void printErros(cudaError_t *erros,int size)
{
for(int i = 0; i < size; i++)
printf("{%d} => %s\n",i ,cudaGetErrorString(erros[i]));
}
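For example, assuming seven recorded statuses as in the snippets above (slots 0 through 6; the size 7 is my assumption based on the indices used), one call prints them all:
cudaError_t msg_erro[7]; // filled in by the calls shown earlier
/* ... allocation, copies, kernel launch, copy-back ... */
printErros(msg_erro, 7); // one line per recorded status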
You need to copy the memory back and forth to/from the GPU; the kernel cannot dereference host pointers, which is why your C array stays all zeros. Something like:
int *a_GPU, *b_GPU, *c_GPU;
cudaMalloc(&a_GPU, N*sizeof(int));
cudaMalloc(&b_GPU, N*sizeof(int));
cudaMalloc(&c_GPU, N*sizeof(int));
cudaMemcpy(a_GPU, A, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(b_GPU, B, N*sizeof(int), cudaMemcpyHostToDevice);
VecAdd<<<1, N>>>(a_GPU, b_GPU, c_GPU);
cudaMemcpy(C, c_GPU, N*sizeof(int), cudaMemcpyDeviceToHost);
print_array(C, N);
cudaFree(a_GPU);
cudaFree(b_GPU);
cudaFree(c_GPU);
I'm new to CUDA. I want to add two 2D arrays into a third array.
I use the following code:
cudaMallocPitch((void**)&device_a, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_b, &pitch, 2*sizeof(int),2);
cudaMallocPitch((void**)&device_c, &pitch, 2*sizeof(int),2);
Now my problem is that I don't want to use these arrays as flattened 2D arrays.
In my kernel code, all I want to do is use two for loops and put the result in the third array, like this:
__global__ void add(int *dev_a, int *dev_b, int *dev_c)
{
    for (int i = 0; i < 2; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
How can I do this in CUDA?
Please tell me how to use a 2D array in this way.
What should the kernel call look like for a 2D array?
If possible, please explain with code samples.
The short answer is, you can't. The cudaMallocPitch() function does exactly what its name implies: it allocates pitched linear memory, where the pitch is chosen to be optimal for the GPU memory controller and texture hardware.
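A pitched allocation is still addressed with ordinary pointer arithmetic, stepping pitch bytes per row (this is the access pattern from the CUDA documentation; r, c, dev_a and pitch are placeholder names):
// Row r starts r * pitch BYTES after the base pointer;
// within a row, columns are indexed normally.
int *row = (int *)((char *)dev_a + r * pitch);
int value = row[c]; // element [r][c]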
If you wanted to use arrays of pointers in the kernel, the kernel code would have to look like this:
__global__ void add(int *dev_a[], int *dev_b[], int *dev_c[])
{
    for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 2; j++) {
            dev_c[i][j] = dev_a[i][j] + dev_b[i][j];
        }
    }
}
and then you would need nested cudaMalloc calls on the host side to construct the array of pointers and copy it to device memory. For your rather trivial 2x2 example, the code to allocate a single array would look like this:
int ** h_a = (int **)malloc(2 * sizeof(int *));
cudaMalloc((void**)&h_a[0], 2*sizeof(int));
cudaMalloc((void**)&h_a[1], 2*sizeof(int));
int **d_a;
cudaMalloc((void ***)&d_a, 2 * sizeof(int *));
cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice);
Which would leave the allocated device array of pointers in d_a, and you would pass that to your kernel.
For code complexity and performance reasons, you really don't want to do that: using arrays of pointers in CUDA code is both harder and slower than the alternative using linear memory.
To show what folly using arrays of pointers is in CUDA, here is a complete working example of your sample problem which combines the two ideas above:
#include <cstdio>
__global__ void add(int * dev_a[], int * dev_b[], int * dev_c[])
{
for(int i=0;i<2;i++)
{
for(int j=0;j<2;j++)
{
dev_c[i][j]=dev_a[i][j]+dev_b[i][j];
}
}
}
inline void GPUassert(cudaError_t code, const char * file, int line, bool Abort=true)
{
if (code != 0) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
if (Abort) exit(code);
}
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
int main(void)
{
const int aa[2][2]={{1,2},{3,4}};
const int bb[2][2]={{5,6},{7,8}};
int cc[2][2];
int ** h_a = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_a[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_a[i], &aa[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int **d_a;
GPUerrchk(cudaMalloc((void ***)&d_a, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_a, h_a, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_b = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_b[i], 2*sizeof(int)));
GPUerrchk(cudaMemcpy(h_b[i], &bb[i][0], 2*sizeof(int), cudaMemcpyHostToDevice));
}
int ** d_b;
GPUerrchk(cudaMalloc((void ***)&d_b, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_b, h_b, 2*sizeof(int *), cudaMemcpyHostToDevice));
int ** h_c = (int **)malloc(2 * sizeof(int *));
for(int i=0; i<2;i++){
GPUerrchk(cudaMalloc((void**)&h_c[i], 2*sizeof(int)));
}
int ** d_c;
GPUerrchk(cudaMalloc((void ***)&d_c, 2 * sizeof(int *)));
GPUerrchk(cudaMemcpy(d_c, h_c, 2*sizeof(int *), cudaMemcpyHostToDevice));
add<<<1,1>>>(d_a,d_b,d_c);
GPUerrchk(cudaPeekAtLastError());
for(int i=0; i<2;i++){
GPUerrchk(cudaMemcpy(&cc[i][0], h_c[i], 2*sizeof(int), cudaMemcpyDeviceToHost));
}
for(int i=0;i<2;i++) {
for(int j=0;j<2;j++) {
printf("(%d,%d):%d\n",i,j,cc[i][j]);
}
}
return cudaThreadExit();
}
I recommend you study it until you understand what it does, and why it is such a poor idea compared to using linear memory.
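For comparison, a sketch of the same 2x2 addition done the recommended way, with one linear allocation per matrix and row-major indexing (the names here are mine, not from the code above):
__global__ void add_linear(const int *a, const int *b, int *c)
{
    for (int i = 0; i < 2; i++)
        for (int j = 0; j < 2; j++)
            c[i*2+j] = a[i*2+j] + b[i*2+j]; // [i][j] flattened row-major
}

// host side: one cudaMalloc and one cudaMemcpy per matrix
int *d_a; GPUerrchk(cudaMalloc((void**)&d_a, 4*sizeof(int)));
GPUerrchk(cudaMemcpy(d_a, aa, 4*sizeof(int), cudaMemcpyHostToDevice));
// ... likewise for d_b and d_c, then: add_linear<<<1,1>>>(d_a, d_b, d_c);
One allocation and one copy per matrix replaces the three-per-matrix dance above.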
You don't need to use for loops inside the kernel. Try this code.
#include <stdio.h>
#include <cuda.h>
#include <stdlib.h>
#include <time.h>
#define N 800
__global__ void matrixAdd(float* A, float* B, float* C){
int i = threadIdx.x;
int j = blockIdx.x;
C[N*j+i] = A[N*j+i] + B[N*j+i];
}
int main (void) {
clock_t start = clock();
float a[N][N], b[N][N], c[N][N];
float *dev_a, *dev_b, *dev_c;
cudaMalloc((void **)&dev_a, N * N * sizeof(float));
cudaMalloc((void **)&dev_b, N * N * sizeof(float));
cudaMalloc((void **)&dev_c, N * N * sizeof(float));
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = rand() % 10;
b[i][j] = rand() % 10;
}
}
cudaMemcpy(dev_a, a, N * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N * N * sizeof(float), cudaMemcpyHostToDevice);
matrixAdd <<<N,N>>> (dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N * N * sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
printf("[%d, %d ]= %f + %f = %f\n",i,j, a[i][j], b[i][j], c[i][j]);
}
}
printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}