A simple reduction program in CUDA - c

In the below code, I am trying to implement a simple parallel reduction with blocksize and number of threads per block being 1024. However, after implementing partial reduction, I wish to see whether my implementation is going right or not and in that process I make the program print the first element of the host memory (after data has been copied from device memory to host memory).
My host memory is initialize with '1' and is copied to device memory for reduction. And the printf statement after the reduction process still gives me '1' at the first element of the array.
Is there a problem in what I am getting to print or is it something logical in the implementation of reduction?
In addition printf statements in the kernel do not print anything. Is there something wrong in my syntax or the call to the printf statement?
My code is as below:
ifndef CUDACC
define CUDACC
endif
include "cuda_runtime.h"
include "device_launch_parameters.h"
include
include
ifndef THREADSPERBLOCK
define THREADSPERBLOCK 1024
endif
ifndef NUMBLOCKS
define NUMBLOCKS 1024
endif
global void reduceKernel(int *c)
{
extern shared int sh_arr[];
int index = blockDim.x*blockIdx.x + threadIdx.x;
int sh_index = threadIdx.x;
// Storing data from Global memory to shared Memory
sh_arr[sh_index] = c[index];
__syncthreads();
for(unsigned int i = blockDim.x/2; i>0 ; i>>=1)
{
if(sh_index < i){
sh_arr[sh_index] += sh_arr[i+sh_index];
}
__syncthreads();
}
if(sh_index ==0)
c[blockIdx.x]=sh_arr[sh_index];
printf("value stored at %d is %d \n", blockIdx.x, c[blockIdx.x]);
return;
}
int main()
{
int *h_a;
int *d_a;
int share_memSize, h_memSize;
size_t d_memSize;
share_memSize = THREADSPERBLOCK*sizeof(int);
h_memSize = THREADSPERBLOCK*NUMBLOCKS;
h_a = (int*)malloc(sizeof(int)*h_memSize);
d_memSize=THREADSPERBLOCK*NUMBLOCKS;
cudaMalloc( (void**)&d_a, h_memSize*sizeof(int));
for(int i=0; i<h_memSize; i++)
{
h_a[i]=1;
};
//printf("last element of array %d \n", h_a[h_memSize-1]);
cudaMemcpy((void**)&d_a, (void**)&h_a, h_memSize, cudaMemcpyHostToDevice);
reduceKernel<<<NUMBLOCKS, THREADSPERBLOCK, share_memSize>>>(d_a);
cudaMemcpy((void**)&h_a, (void**)&d_a, d_memSize, cudaMemcpyDeviceToHost);
printf("sizeof host memory %d \n", d_memSize); //sizeof(h_a));
printf("sum after reduction %d \n", h_a[0]);
}

There are a number of problems with this code.
much of what you've posted is not valid code. As just a few examples, your global and shared keywords are supposed to have double-underscores before and after, like this: __global__ and __shared__. I assume this is some sort of copy-paste error or formatting error. There are problems with your define statements as well. You should endeavor to post code that doesn't have these sorts of problems.
Any time you are having trouble with a CUDA code, you should use proper cuda error checking and run your code with cuda-memcheck before asking for help. If you had done this , it would have focused your attention on item 3 below.
Your cudaMemcpy operations are broken in a couple of ways:
cudaMemcpy((void**)&d_a, (void**)&h_a, h_memSize, cudaMemcpyHostToDevice);
First, unlike cudaMalloc, but like memcpy, cudaMemcpy just takes ordinary pointer arguments. Second, the size of the transfer (like memcpy) is in bytes, so your sizes need to be scaled up by sizeof(int):
cudaMemcpy(d_a, h_a, h_memSize*sizeof(int), cudaMemcpyHostToDevice);
and similarly for the one after the kernel.
printf from every thread in a large kernel (like this one which has 1048576 threads) is probably not a good idea. You won't actually get all the output you expect, and on windows (appears you are running on windows) you may run into a WDDM watchdog timeout due to kernel execution taking too long. If you need to printf from a large kernel, be selective and condition your printf on threadIdx.x and blockIdx.x
The above things are probably enough to get some sensible printout, and as you point out you're not finished yet anyway: "I wish to see whether my implementation is going right or not ". However, this kernel, as crafted, overwrites its input data with output data:
__global__ void reduceKernel(int *c)
...
c[blockIdx.x]=sh_arr[sh_index];
This will lead to a race condition. Rather than trying to sort this out for you, I'd suggest separating your output data from your input data. Even better, you should study the cuda reduction sample code which also has an associated presentation.
Here is a modified version of your code which has most of the above issues fixed. It's still not correct. It still has defect 5 above in it. Rather than completely rewrite your code to fix defect 5, I would direct you to the cuda sample code mentioned above.
$ cat t820.cu
#include <stdio.h>
#ifndef THREADSPERBLOCK
#define THREADSPERBLOCK 1024
#endif
#ifndef NUMBLOCKS
#define NUMBLOCKS 1024
#endif
__global__ void reduceKernel(int *c)
{
extern __shared__ int sh_arr[];
int index = blockDim.x*blockIdx.x + threadIdx.x;
int sh_index = threadIdx.x;
// Storing data from Global memory to shared Memory
sh_arr[sh_index] = c[index];
__syncthreads();
for(unsigned int i = blockDim.x/2; i>0 ; i>>=1)
{
if(sh_index < i){
sh_arr[sh_index] += sh_arr[i+sh_index];
}
__syncthreads();
}
if(sh_index ==0)
c[blockIdx.x]=sh_arr[sh_index];
// printf("value stored at %d is %d \n", blockIdx.x, c[blockIdx.x]);
return;
}
int main()
{
int *h_a;
int *d_a;
int share_memSize, h_memSize;
size_t d_memSize;
share_memSize = THREADSPERBLOCK*sizeof(int);
h_memSize = THREADSPERBLOCK*NUMBLOCKS;
h_a = (int*)malloc(sizeof(int)*h_memSize);
d_memSize=THREADSPERBLOCK*NUMBLOCKS;
cudaMalloc( (void**)&d_a, h_memSize*sizeof(int));
for(int i=0; i<h_memSize; i++)
{
h_a[i]=1;
};
//printf("last element of array %d \n", h_a[h_memSize-1]);
cudaMemcpy(d_a, h_a, h_memSize*sizeof(int), cudaMemcpyHostToDevice);
reduceKernel<<<NUMBLOCKS, THREADSPERBLOCK, share_memSize>>>(d_a);
cudaMemcpy(h_a, d_a, d_memSize*sizeof(int), cudaMemcpyDeviceToHost);
printf("sizeof host memory %d \n", d_memSize); //sizeof(h_a));
printf("first block sum after reduction %d \n", h_a[0]);
}
$ nvcc -o t820 t820.cu
$ cuda-memcheck ./t820
========= CUDA-MEMCHECK
sizeof host memory 1048576
first block sum after reduction 1024
========= ERROR SUMMARY: 0 errors
$

Related

How to measure overall performance of parallel programs (with papi)

I asked myself what would be the best way to measure the performance (in flops) of a parallel program. I read about papi_flops. This seems to work fine for a serial program. But I don't know how I can measure the overall performance of a parallel program.
I would like to measure the performance of a blas/lapack function, in my example below gemm. But I also want to measure other function, specially functions where the number of operation is not known. (In the case of gemm the ops are known (ops(gemm) = 2*n^3), so I could calculate the performance as a function of the number of operations and the execution time.) The library (I am using Intel MKL) spawn the threads automatically. So I can't measure the performance of each thread individually and then reduce it.
This is my example:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "mkl.h"
#include "omp.h"
#include "papi.h"
int main(int argc, char *argv[] )
{
int i, j, l, k, n, m, idx, iter;
int mat, mat_min, mat_max;
int threads;
double *A, *B, *C;
double alpha =1.0, beta=0.0;
float rtime1, rtime2, ptime1, ptime2, mflops;
long long flpops;
#pragma omp parallel
{
#pragma omp master
threads = omp_get_num_threads();
}
if(argc < 4){
printf("pass me 3 arguments!\n");
return( -1 );
}
else
{
mat_min = atoi(argv[1]);
mat_max = atoi(argv[2]);
iter = atoi(argv[3]);
}
m = mat_max; n = mat_max; k = mat_max;
printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
" A(%ix%i) and matrix B(%ix%i)\n\n", m, k, k, n);
A = (double *) malloc( m*k * sizeof(double) );
B = (double *) malloc( k*n * sizeof(double) );
C = (double *) malloc( m*n * sizeof(double) );
printf (" Intializing matrix data \n\n");
for (i = 0; i < (m*k); i++)
A[i] = (double)(i+1);
for (i = 0; i < (k*n); i++)
B[i] = (double)(-i-1);
memset(C,0,m*n*sizeof(double));
// actual meassurment
for(mat=mat_min;mat<=mat_max;mat+=5)
{
m = mat; n = mat; k = mat;
for( idx=-1; idx<iter; idx++ ){
PAPI_flops( &rtime1, &ptime1, &flpops, &mflops );
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
m, n, k, alpha, A, k, B, n, beta, C, n);
PAPI_flops( &rtime2, &ptime2, &flpops, &mflops );
}
printf("%d threads: %d in %f sec, %f MFLOPS\n",threads,mat,rtime2-rtime1,mflops);fflush(stdout);
}
printf("Done\n");fflush(stdout);
free(A);
free(B);
free(C);
return 0;
}
This is one output (for matrix size 200):
1 threads: 200 in 0.001459 sec, 5570.258789 MFLOPS
2 threads: 200 in 0.000785 sec, 5254.993652 MFLOPS
4 threads: 200 in 0.000423 sec, 4919.640137 MFLOPS
8 threads: 200 in 0.000264 sec, 3894.036865 MFLOPS
We can see for the execution time, that the function gemm scales. But the flops that I am measuring is only the performance of thread 0.
My question is: How can I measure the overall performance? I am grateful for any input.
First, I'm just curious - why do you need the FLOPS? don't you just care how much time is taken? or maybe time taken in compare to other BLAS libraries?
PAPI is thread based not much help on its own here.
What I would do is measure around the function call and see how time changes with number of threads it spawns. It should not spawn more threads than physical cores (HT is no good here). Then, if the matrix is big enough, and the machine is not loaded, the time should simply divide by the number of threads. E.g., 10 seconds over 4 core should become 2.5 seconds.
Other than that, there are 2 things you can do to really measure it:
1. Use whatever you use now but inject your start/end measurement code around the BLAS code. One way to do that (in linux) is by pre-loading a lib that defines pthread_start and using your own functions that call the originals but do some extra measurements. Another way to to override the function pointer when the process is already running (=trampoline). In linux it's in the GOT/PLT and in windows it's more complicated - look for a library.
2. Use oprofile, or some other profiler, to report number of instructions executed in the time you care for. Or better yet, to report the number of floating point instructions executed. A little problem with this is that SSE instructions are multiplying or adding 2 or more doubles at a time so you'd have to account for that. I guess you can assume they always use the maximum possible operands.

CUDA How to access constant memory in device kernel when the constant memory is declared in the host code?

For the record this is homework so help as little or as much with that in mind. We are using constant memory to store a "mask matrix" that will be used to perform a convolution on a larger matrix. When I am in the host code I am copying the mask to constant memory using the cudaMemcpyToSymbol().
My question is once this is copied over and I launch my device kernel code how does the device know where to access the constant memory mask matrix. Is there a pointer that I need to pass in on kernel launch. Most of the code that the professor gave us is not supposed to be changed (there is no pointer to the mask passed in) but there is always the possibility that he made a mistake ( although it is most likely my understanding of something)
Is the constant memeory declaratoin supposed to be included in the seperate kernel.cu file?
I am minimizing the code to just show the things having to do with the constant memory. As such please don't point out if something is not initialized ect. There is code for that but that is not of concern at this time.
main.cu:
#include <stdio.h>
#include "kernel.cu"
__constant__ float M_d[FILTER_SIZE * FILTER_SIZE];
int main(int argc, char* argv[])
{
Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
/* Allocate host memory */
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
N_h = allocateMatrix(imageHeight, imageWidth);
P_h = allocateMatrix(imageHeight, imageWidth);
/* Initialize filter and images */
initMatrix(M_h);
initMatrix(N_h);
cudaError_t cudda_ret = cudaMemcpyToSymbol(M_d, M_h.elements, M_h.height * M_h.width * sizeof(float), 0, cudaMemcpyHostToDevice);
//char* cudda_ret_pointer = cudaGetErrorString(cudda_ret);
if( cudda_ret != cudaSuccess){
printf("\n\ncudaMemcpyToSymbol failed\n\n");
printf("%s, \n\n", cudaGetErrorString(cudda_ret));
}
// Launch kernel ----------------------------------------------------------
printf("Launching kernel..."); fflush(stdout);
//INSERT CODE HERE
//block size is 16x16
// \\\\\\\\\\\\\**DONE**
dim_grid = dim3(ceil(N_h.width / (float) BLOCK_SIZE), ceil(N_h.height / (float) BLOCK_SIZE));
dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);
//KERNEL Launch
convolution<<<dim_grid, dim_block>>>(N_d, P_d);
return 0;
}
kernel.cu: THIS IS WHERE I DO NOT KNOW HOW TO ACCESS THE CONSTANT MEMORY.
//__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];
__global__ void convolution(Matrix N, Matrix P)
{
/********************************************************************
Determine input and output indexes of each thread
Load a tile of the input image to shared memory
Apply the filter on the input image tile
Write the compute values to the output image at the correct indexes
********************************************************************/
//INSERT KERNEL CODE HERE
//__shared__ float N_shared[BLOCK_SIZE][BLOCK_SIZE];
//int row = (blockIdx.y * blockDim.y) + threadIdx.y;
//int col = (blockIdx.x * blockDim.x) + threadIdx.x;
}
In "classic" CUDA compilation you must define all code and symbols (textures, constant memory, device functions) and any host API calls which access them (including kernel launches, binding to textures, copying to symbols) within the same translation unit. This means, effectively, in the same file (or via multiple include statements within the same file). This is because "classic" CUDA compilation doesn't include a device code linker.
Since CUDA 5 was released, there is the possibility of using separate compilation mode and linking different device code objects into a single fatbinary payload on architectures which support it. In that case, you need to declare any __constant__ variables using the extern keyword and define the symbol exactly once.
If you can't use separate compilation, then the usual workaround is to define the __constant__ symbol in the same .cu file as your kernel, and include a small host wrapper function which just calls cudaMemcpyToSymbol to set the __constant__ symbol in question. You would probably do the same with kernel calls and texture operations.
Below is a "minimum-sized" example showing the use of __constant__ symbols. You do not need to pass any pointer to the __global__ function.
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
__constant__ float test_const;
__global__ void test_kernel(float* d_test_array) {
d_test_array[threadIdx.x] = test_const;
}
#include <conio.h>
int main(int argc, char **argv) {
float test = 3.f;
int N = 16;
float* test_array = (float*)malloc(N*sizeof(float));
float* d_test_array;
cudaMalloc((void**)&d_test_array,N*sizeof(float));
cudaMemcpyToSymbol(test_const, &test, sizeof(float));
test_kernel<<<1,N>>>(d_test_array);
cudaMemcpy(test_array,d_test_array,N*sizeof(float),cudaMemcpyDeviceToHost);
for (int i=0; i<N; i++) printf("%i %f\n",i,test_array[i]);
getch();
return 0;
}

CUDA Array Reduction

I'm aware that there are multiple questions similar to this one already answered but I've been unable to piece together anything very helpful from them other than that I'm probably incorrectly indexing something.
I'm trying to preform a sequential addressing reduction on input vector A into output vector B.
The full code is available here http://pastebin.com/7UGadgjX, but this is the kernel:
__global__ void vectorSum(int *A, int *B, int numElements) {
extern __shared__ int S[];
// Each thread loads one element from global to shared memory
int tid = threadIdx.x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) {
S[tid] = A[i];
__syncthreads();
// Reduce in shared memory
for (int t = blockDim.x/2; t > 0; t>>=1) {
if (tid < t) {
S[tid] += S[tid + t];
}
__syncthreads();
}
if (tid == 0) B[blockIdx.x] = S[0];
}
}
and these are the kernel launch statements:
// Launch the Vector Summation CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
vectorSum<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, numElements);
I'm getting a unspecified launch error which I've read is similar to a segfault. I've been following the nvidia reduction documentation closely and tried to keep my kernel within the bounds of numElements but I seem to be missing something key considering how simple the code is.
Your problem is that the reduction kernel requires dynamically allocated shared memory to operate correctly, but your kernel launch doesn't specify any. The result is out of bounds/illegal shared memory access which aborts the kernel.
In CUDA runtime API syntax, the kernel launch statement has four arguments. The first two are the grid and block dimensions for the launch. The latter two are optional with zero default values, but specify the dynamically allocated shared memory size and stream.
To fix this, change the launch code as follows:
// Launch the Vector Summation CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t shmsz = (size_t)threadsPerBlock * sizeof(int);
vectorSum<<<blocksPerGrid, threadsPerBlock, shmsz>>>(d_A, d_B, numElements);
[disclaimer: code written in browser, not compiled or tested, use at own risk]
This should at least fix the most obvious problem with your code.

How to dynamically allocate arrays inside a kernel?

I need to dynamically allocate some arrays inside the kernel function. How can a I do that?
My code is something like that:
__global__ func(float *grid_d,int n, int nn){
int i,j;
float x[n],y[nn];
//Do some really cool and heavy computations here that takes hours.
}
But that will not work. If this was inside the host code I could use malloc. cudaMalloc needs a pointer on host, and other on device. Inside the kernel function I don't have the host pointer.
So, what should I do?
If takes too long (some seconds) to allocate all the arrays (I need about 4 of size n and 5 of size nn), this won't be a problem. Since the kernel will probably run for 20 minutes, at least.
Dynamic memory allocation is only supported on compute capability 2.x and newer hardware. You can use either the C++ new keyword or malloc in the kernel, so your example could become:
__global__ func(float *grid_d,int n, int nn){
int i,j;
float *x = new float[n], *y = new float[nn];
}
This allocates memory on a local memory runtime heap which has the lifetime of the context, so make sure you free the memory after the kernel finishes running if your intention is not to use the memory again. You should also note that runtime heap memory cannot be accessed directly from the host APIs, so you cannot pass a pointer allocated inside a kernel as an argument to cudaMemcpy, for example.
#talonmies answered your question on how to dynamically allocate memory within a kernel. This is intended as a supplemental answer, addressing performance of __device__ malloc() and an alternative you might want to consider.
Allocating memory dynamically in the kernel can be tempting because it allows GPU code to look more like CPU code. But it can seriously affect performance. I wrote a self contained test and have included it below. The test launches some 2.6 million threads. Each thread populates 16 integers of global memory with some values derived from the thread index, then sums up the values and returns the sum.
The test implements two approaches. The first approach uses __device__ malloc() and the second approach uses memory that is allocated before the kernel runs.
On my 2.0 device, the kernel runs in 1500ms when using __device__ malloc() and 27ms when using pre-allocated memory. In other words, the test takes 56x longer to run when memory is allocated dynamically within the kernel. The time includes the outer loop cudaMalloc() / cudaFree(), which is not part of the kernel. If the same kernel is launched many times with the same number of threads, as is often the case, the cost of the cudaMalloc() / cudaFree() is amortized over all the kernel launches. That brings the difference even higher, to around 60x.
Speculating, I think that the performance hit is in part caused by implicit serialization. The GPU must probably serialize all simultaneous calls to __device__ malloc() in order to provide separate chunks of memory to each caller.
The version that does not use __device__ malloc() allocates all the GPU memory before running the kernel. A pointer to the memory is passed to the kernel. Each thread calculates an index into the previously allocated memory instead of using a __device__ malloc().
The potential issue with allocating memory up front is that, if only some threads need to allocate memory, and it is not known which threads those are, it will be necessary to allocate memory for all the threads. If there is not enough memory for that, it might be more efficient to reduce the number of threads per kernel call then using __device__ malloc(). Other workarounds would probably end up reimplementing what __device__ malloc() is doing in the background, and would see a similar performance hit.
Test the performance of __device__ malloc():
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
const int N_ITEMS(16);
#define USE_DYNAMIC_MALLOC
__global__ void test_malloc(int* totals)
{
int tx(blockIdx.x * blockDim.x + threadIdx.x);
int* s(new int[N_ITEMS]);
for (int i(0); i < N_ITEMS; ++i) {
s[i] = tx * i;
}
int total(0);
for (int i(0); i < N_ITEMS; ++i) {
total += s[i];
}
totals[tx] = total;
delete[] s;
}
__global__ void test_malloc_2(int* items, int* totals)
{
int tx(blockIdx.x * blockDim.x + threadIdx.x);
int* s(items + tx * N_ITEMS);
for (int i(0); i < N_ITEMS; ++i) {
s[i] = tx * i;
}
int total(0);
for (int i(0); i < N_ITEMS; ++i) {
total += s[i];
}
totals[tx] = total;
}
int main()
{
cudaError_t cuda_status;
cudaSetDevice(0);
int blocks_per_launch(1024 * 10);
int threads_per_block(256);
int threads_per_launch(blocks_per_launch * threads_per_block);
int* totals_d;
cudaMalloc((void**)&totals_d, threads_per_launch * sizeof(int));
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaDeviceSynchronize();
cudaEventRecord(start, 0);
#ifdef USE_DYNAMIC_MALLOC
cudaDeviceSetLimit(cudaLimitMallocHeapSize, threads_per_launch * N_ITEMS * sizeof(int));
test_malloc<<<blocks_per_launch, threads_per_block>>>(totals_d);
#else
int* items_d;
cudaMalloc((void**)&items_d, threads_per_launch * sizeof(int) * N_ITEMS);
test_malloc_2<<<blocks_per_launch, threads_per_block>>>(items_d, totals_d);
cudaFree(items_d);
#endif
cuda_status = cudaDeviceSynchronize();
if (cuda_status != cudaSuccess) {
printf("Error: %d\n", cuda_status);
exit(1);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("Elapsed: %f\n", elapsedTime);
int* totals_h(new int[threads_per_launch]);
cuda_status = cudaMemcpy(totals_h, totals_d, threads_per_launch * sizeof(int), cudaMemcpyDeviceToHost);
if (cuda_status != cudaSuccess) {
printf("Error: %d\n", cuda_status);
exit(1);
}
for (int i(0); i < 10; ++i) {
printf("%d ", totals_h[i]);
}
printf("\n");
cudaFree(totals_d);
delete[] totals_h;
return cuda_status;
}
Output:
C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 27.311169
0 120 240 360 480 600 720 840 960 1080
C:\rd\projects\test_cuda_malloc\Release>test_cuda_malloc.exe
Elapsed: 1516.711914
0 120 240 360 480 600 720 840 960 1080
If the value of n and nn were known before the kernel is called, then why not cudaMalloc the memory on host side and pass in the device memory pointer to the kernel?
Ran an experiment based on the concepts in #rogerdahl's post. Assumptions:
4MB of memory allocated in 64B chunks.
1 GPU block and 32 warp threads in that block
Run on a P100
The malloc+free calls local to the GPU seemed to be much faster than the cudaMalloc + cudaFree calls. The program's output:
Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
timer for cuda malloc timer took 1.169631s
Starting timer for device malloc timer
Stopping timer for device malloc timer
timer for device malloc timer took 0.029794s
I'm leaving out the code for timer.h and timer.cpp, but here's the code for the test itself:
#include "cuda_runtime.h"
#include <stdio.h>
#include <thrust/system/cuda/error.h>
#include "timer.h"
static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
const int BLOCK_COUNT = 1;
const int THREADS_PER_BLOCK = 32;
const int ITERATIONS = 1 << 12;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);
const int ARRAY_SIZE = 64;
void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err) {
if (err == cudaSuccess)
return;
std::cerr << statement<<" returned " << cudaGetErrorString(err) << "("<<err<< ") at "<<file<<":"<<line << std::endl;
exit (1);
}
__global__ void mallocai() {
for (int i = 0; i < ITERATIONS_PER_BLOCKTHREAD; ++i) {
int * foo;
foo = (int *) malloc(sizeof(int) * ARRAY_SIZE);
free(foo);
}
}
int main() {
Timer cuda_malloc_timer("cuda malloc timer");
for (int i = 0; i < ITERATIONS; ++ i) {
if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
int * foo;
cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
cudaFree(foo);
}
cuda_malloc_timer.stop_and_report();
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
Timer device_malloc_timer("device malloc timer");
device_malloc_timer.start();
mallocai<<<BLOCK_COUNT, THREADS_PER_BLOCK>>>();
CUDA_CHECK_RETURN(cudaDeviceSynchronize());
device_malloc_timer.stop_and_report();
}
If you find mistakes, please lmk in the comments, and I'll try to fix them.
And I ran them again with larger everything:
const int BLOCK_COUNT = 56;
const int THREADS_PER_BLOCK = 1024;
const int ITERATIONS = 1 << 18;
const int ITERATIONS_PER_BLOCKTHREAD = ITERATIONS / (BLOCK_COUNT * THREADS_PER_BLOCK);
const int ARRAY_SIZE = 1024;
And cudaMalloc was still slower by a lot:
Starting timer for cuda malloc timer
Stopping timer for cuda malloc timer
timer for cuda malloc timer took 74.878016s
Starting timer for device malloc timer
Stopping timer for device malloc timer
timer for device malloc timer took 0.167331s
Maybe you should test
cudaMalloc(&foo,sizeof(int) * ARRAY_SIZE * ITERATIONS);
cudaFree(foo);
instead
for (int i = 0; i < ITERATIONS; ++ i) {
if (i == 1) cuda_malloc_timer.start(); // let it warm up one cycle
int * foo;
cudaMalloc(&foo, sizeof(int) * ARRAY_SIZE);
cudaFree(foo);
}

Data Gathering Portion of CUDA Code is Unexpectedly Outputting "0"s

EDIT
In the initial posting's code snippet (see below) I was not properly sending the struct to the device, this has been fixed, but the results are still the same. In my full code this mistake was not present. (There were two mistakes in that command in my initial posting -- one, the structure was being copied from HostToDevice, but was actually reversed, and the size of the copy was also wrong. Apologies; both errors were fixed, but the recompiled code still displays the zeros phenomena described below, as does my full code.)
EDIT 2
In the haste of my de-proprietarization rewrite of the code I made a couple errors which dalekchef kindly pointed out to me (the copy of the struct to the device was performed BEFORE the allocation on the device, in my rewritten code and the device cudaMalloc calls were not multiplied with the sizeof(...) the type of the array elements. I added these fixes, recompiled and retested, but it did not fix the problem. Also double checked my original code -- it did not have those mistakes. Apologies again, for the confusion.
I'm trying to dump statistics from a large simulations program. A similar pared down code is displayed below. Both codes exhibit the same problem -- they output zeroes, when they should be outputting averaged values.
#include "stdio.h"
struct __align__(8) DynamicVals
{
double a;
double b;
int n1;
int n2;
int perDump;
};
__device__ int *dev_arrN1, *dev_arrN2;
__device__ double *dev_arrA, *dev_arrB;
__device__ DynamicVals *dev_myVals;
__device__ int stepsA, stepsB;
__device__ double sumA, sumB;
__device__ int stepsN1, stepsN2;
__device__ int sumN1, sumN2;
__global__ void TEST
(int step, double dev_arrA[], double dev_arrB[],
int dev_arrN1[], int dev_arrN2[],DynamicVals *dev_myVals)
{
if (step % dev_myVals->perDump)
{
dev_arrN1[step/dev_myVals->perDump] = 0;
dev_arrN2[step/dev_myVals->perDump] = 0;
dev_arrA[step/dev_myVals->perDump] = 0.0;
dev_arrB[step/dev_myVals->perDump] = 0.0;
stepsA = 0;
stepsB = 0;
stepsN1 = 0;
stepsN2 = 0;
sumA = 0.0;
sumB = 0.0;
sumN1 = 0;
sumN2 = 0;
}
sumA += dev_myVals->a;
sumB += dev_myVals->b;
sumN1 += dev_myVals->n1;
sumN2 += dev_myVals->n2;
stepsA++;
stepsB++;
stepsN1++;
stepsN2++;
if ( sumA > 100000000 )
{
dev_arrA[step/dev_myVals->perDump] +=
sumA / stepsA;
sumA = 0.0;
stepsA = 0;
}
if ( sumB > 100000000 )
{
dev_arrB[step/dev_myVals->perDump] +=
sumB / stepsB;
sumB = 0.0;
stepsB = 0;
}
if ( sumN1 > 1000000 )
{
dev_arrN1[step/dev_myVals->perDump] +=
sumN1 / stepsN1;
sumN1 = 0;
stepsN1 = 0;
}
if ( sumN2 > 1000000 )
{
dev_arrN2[step/dev_myVals->perDump] +=
sumN2 / stepsN2;
sumN2 = 0;
stepsN2 = 0;
}
if ((step+1) % dev_myVals->perDump)
{
dev_arrA[step/dev_myVals->perDump] +=
sumA / stepsA;
dev_arrB[step/dev_myVals->perDump] +=
sumB / stepsB;
dev_arrN1[step/dev_myVals->perDump] +=
sumN1 / stepsN1;
dev_arrN2[step/dev_myVals->perDump] +=
sumN2 / stepsN2;
}
}
int main()
{
const int TOTAL_STEPS = 10000000;
DynamicVals vals;
int *arrN1, *arrN2;
double *arrA, *arrB;
int statCnt;
vals.perDump = TOTAL_STEPS/10;
statCnt = TOTAL_STEPS/vals.perDump+1;
vals.a = 30000.0;
vals.b = 60000.0;
vals.n1 = 10000;
vals.n2 = 20000;
cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(double) );
cudaMalloc( (void**)&dev_arrB, statCnt*sizeof(double) );
cudaMalloc( (void**)&dev_arrN1, statCnt*sizeof(int) );
cudaMalloc( (void**)&dev_arrN2, statCnt*sizeof(int) );
cudaMalloc( (void**)&dev_myVals, sizeof(DynamicVals));
cudaMemcpy(dev_myVals, &vals, sizeof(DynamicVals),
cudaMemcpyHostToDevice);
arrA = (double *)malloc(statCnt * sizeof(double));
arrB = (double *)malloc(statCnt * sizeof(double));
arrN1 = (int *)malloc(statCnt * sizeof(int));
arrN2 = (int *)malloc(statCnt * sizeof(int));
for (int i=0; i< TOTAL_STEPS; i++)
TEST<<<1,1>>>(i, dev_arrA,dev_arrB,dev_arrN1,dev_arrN2,dev_myVals);
cudaMemcpy(arrA,dev_arrA,statCnt * sizeof(double),cudaMemcpyDeviceToHost);
cudaMemcpy(arrB,dev_arrB,statCnt * sizeof(double),cudaMemcpyDeviceToHost);
cudaMemcpy(arrN1,dev_arrN1,statCnt * sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(arrN2,dev_arrN2,statCnt * sizeof(int),cudaMemcpyDeviceToHost);
for (int i=0; i< statCnt; i++)
{
printf("Step: %d ; A=%g B=%g N1=%d N2=%d\n",
i*vals.perDump,
arrA[i], arrB[i], arrN1[i], arrN2[i]);
}
}
Output:
Step: 0 ; A=0 B=0 N1=0 N2=0
Step: 1000000 ; A=0 B=0 N1=0 N2=0
Step: 2000000 ; A=0 B=0 N1=0 N2=0
Step: 3000000 ; A=0 B=0 N1=0 N2=0
Step: 4000000 ; A=0 B=0 N1=0 N2=0
Step: 5000000 ; A=0 B=0 N1=0 N2=0
Step: 6000000 ; A=0 B=0 N1=0 N2=0
Step: 7000000 ; A=0 B=0 N1=0 N2=0
Step: 8000000 ; A=0 B=0 N1=0 N2=0
Step: 9000000 ; A=0 B=0 N1=0 N2=0
Step: 10000000 ; A=0 B=0 N1=0 N2=0
Now, if I were to use a small period for my dumps or if my #s were smaller, I could get away with just a direct
add
divide by period and the end of period
...algorithm, but I use temporary sums as otherwise my int would overflow (the double wouldn't overflow, but I was concerned about it losing precision).
If I use the above direct algorithm for smaller values I get correct non-zero values, but the second I use the intermediates (e.g. stepsA, sumA, etc.) the values go to zero.
I know I'm doing something silly here... what am I missing?
Notes:
A.) Yes, I know this code in its above form is not parallel and by itself does not warrant parallelization. It is part of a small statistics collecting portion of a much longer code. In that code it is encased in a thread index specific conditional logic to prevent clashing (making it parallel) and serves as data gathering to a simulations program (which warrants parallelization). Hopefully you can understand where the above code originates and avoid snide comments about its lack of thread-safety. (This disclaimer is added out of past experience receiving unproductive comments from people who didn't understand I was posting an excerpt, not a full code, despite me writing in less explicit terms as such.)
B.) Yes, I know the names of the variables are ambiguous. That is the point. The code I'm working on is proprietary, though it will eventually be open sourced. I only write this as I have posted similarly anonymized codes in the past and received rude commentary about my naming convention.
C.) Yes, I have read the CUDA manual several times, though I do make errors and I admit there's some features I don't understand. I'm not using shared memory here, but I am using shared memory (OF COURSE) in my full code.
D.) Yes, the above code does represent the exact same features as the data dumping portion of my non-working code, with the logic not related to this particular problem removed, and with it the thread safety conditional. The variable names have been changed, but algorithmically it should be unaltered and this is verified by the exact same non-working output (zeroes).
E.) I do realize the "dynamic" struct in the above snippet has non-dynamic values. I named the structure that because in the full code, this struct contains simulations data, and is dynamic. The static nature in the pared-down code should not make the statistics collecting code fail, it will simply mean that the average for each dump should be constant (and non-zero).
A couple of things:
It seems like you are calling cudaMemcpy for dev_MyVals before you are calling cudaMalloc for it. This is not how it should be.
ALSO: You do not multiply by sizeof int when you do your cudaMalloc calls.
You should really check all of your CUDA calls cudaMalloc/cudaMemcpy for an error code. They should all return an error or CUDA_SUCCESS. I believe the CUDA examples all show how to do this.
Also, for future reference NEVER use the modulo operator in CUDA it is incredibly slow. Just Google for "Modulo CUDA" for some alternatives.
Let me know how it goes, this will probably take a couple of iterations to fix.
The biggest problem I see here is one of scope. The way this code is written leads me to conclude that you might not understand how variable scoping in C++ works in general, and how device and host code scope works in CUDA in particular. A couple of observations:
When you do this type of thing in code:
__device__ double *dev_arrA, *dev_arrB;
__global__ void TEST(int step, double dev_arrA[], double dev_arrB[], ....)
you have a variable scope problem. dev_arrA is declared at both compilation unit scope and function scope. The two declarations do not refer to the same variable -- the function unit scope declaration (in the kernel) takes precedence over the compilation unit scope declaration inside the kernel. you modify that variable, you are modifying the kernel scope declaration, not the __device__variable. This can lead to all sorts of subtle and unexpactd behaviour. It is much better to avoid ever having the same variable declared at multiple scopes.
When you declare a variable using the __device__ specifier, it is intended to be exclusively a device context symbol, and should only be used directly in device code. So something like this:
__device__ double *dev_arrA;
int main()
{
....
cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(double) );
....
}
is illegal. You cannot call an API function like cudaMalloc directly on a __device__ variable. Even though it will compile (because of the hackery involved in the CUDA compilation tradjectories for host and device code), it is incorrect to do so. In the above example dev_arrA is a device symbol. You can interact with it via the API symbol manipulation calls, but that is all it is technically legal to do. In you code, variables intended to hold device pointers and be passed as kernel arguments (like dev_arrA) should be declared at main() scope, and passed by value to the kernel.
It is a combination of the above two things which is probably causing your problems.
But the difficulty is that you have chosen to post roughy 150 lines of code (a lot of which is redundant) as a repro case. I doubt anyone cares enough about your problems to go through that much code with a fine tooth comb and pinpoint where the precise problem is. Further, you habit of doing these nasty "top edits" in your questions quickly turn what might have been reasonably written starting points into unintelligible psuedo changelogs which are incredibly hard to follow and are unlikely to be of help to anyone. Also, the mildly passive-aggressive notes section serves no real purpose - it adds nothing of value to the question.
So I will leave you with a greatly simplified version of the code you posted which I think has all the basic things which you are trying to do working. I leave it as an "exercise for the reader" to turn it back into whatever it is that you are trying to do.
#include "stdio.h"
typedef float Real;
struct __align__(8) DynamicVals
{
Real a;
int n1;
int perDump;
};
__device__ int stepsA;
__device__ Real sumA;
__device__ int stepsN1;
__device__ int sumN1;
__global__ void TEST
(int step, Real dev_arrA[], int dev_arrN1[], DynamicVals *dev_myVals)
{
if (step % dev_myVals->perDump)
{
dev_arrN1[step/dev_myVals->perDump] = 0;
dev_arrA[step/dev_myVals->perDump] = 0.0;
stepsA = 0;
stepsN1 = 0;
sumA = 0.0;
sumN1 = 0;
}
sumA += dev_myVals->a;
sumN1 += dev_myVals->n1;
stepsA++;
stepsN1++;
dev_arrA[step/dev_myVals->perDump] += sumA / stepsA;
dev_arrN1[step/dev_myVals->perDump] += sumN1 / stepsN1;
}
inline void gpuAssert(cudaError_t code, char *file, int line,
bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
file, line);
if (abort) exit(code);
}
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
int main()
{
const int TOTAL_STEPS = 1000;
DynamicVals vals;
int *arrN1;
Real *arrA;
int statCnt;
vals.perDump = TOTAL_STEPS/10;
statCnt = TOTAL_STEPS/vals.perDump;
vals.a = 30000.0;
vals.n1 = 10000;
Real *dev_arrA;
int *dev_arrN1;
DynamicVals *dev_myVals;
gpuErrchk( cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(Real)) );
gpuErrchk( cudaMalloc( (void**)&dev_arrN1, statCnt*sizeof(int)) );
gpuErrchk( cudaMalloc( (void**)&dev_myVals, sizeof(DynamicVals)) );
gpuErrchk( cudaMemcpy(dev_myVals, &vals, sizeof(DynamicVals),
cudaMemcpyHostToDevice) );
arrA = (Real *)malloc(statCnt * sizeof(Real));
arrN1 = (int *)malloc(statCnt * sizeof(int));
for (int i=0; i< TOTAL_STEPS; i++) {
TEST<<<1,1>>>(i, dev_arrA,dev_arrN1,dev_myVals);
gpuErrchk( cudaPeekAtLastError() );
}
gpuErrchk( cudaMemcpy(arrA,dev_arrA,statCnt * sizeof(Real),
cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(arrN1,dev_arrN1,statCnt * sizeof(int),
cudaMemcpyDeviceToHost) );
for (int i=0; i< statCnt; i++)
{
printf("Step: %d ; A=%g N1=%d\n",
i*vals.perDump, arrA[i], arrN1[i] );
}
}

Resources