I am studying a CUDA C example (ripple.cu in chapter 5) on the CUDA C by Example book; when I compile the file it seems there is no problem; here's what i type on the terminal:
nvcc ripple.cu -lGL -lGLU -lX11 -lXi -lXmu -lglut -lGLEW
When I run the executable i should get an image like this:
However this is what i get instead:
Here I post the file ripple.cu and the related header files:
// ripple.cu
#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_anim.h"
#define DIM 1024
#define PI 3.1415926535897932f
__global__ void kernel( unsigned char *ptr, int ticks ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
// now calculate the value at that position
float fx = x - DIM/2;
float fy = y - DIM/2;
float d = sqrtf( fx * fx + fy * fy );
unsigned char grey = (unsigned char)(128.0f + 127.0f *
cos(d/10.0f - ticks/7.0f) /
(d/10.0f + 1.0f));
ptr[offset*4 + 0] = grey;
ptr[offset*4 + 1] = grey;
ptr[offset*4 + 2] = grey;
ptr[offset*4 + 3] = 255;
}
struct DataBlock {
unsigned char *dev_bitmap;
CPUAnimBitmap *bitmap;
};
void generate_frame( DataBlock *d, int ticks ) {
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<blocks,threads>>>( d->dev_bitmap, ticks );
HANDLE_ERROR( cudaMemcpy( d->bitmap->get_ptr(),
d->dev_bitmap,
d->bitmap->image_size(),
cudaMemcpyDeviceToHost ) );
}
// clean up memory allocated on the GPU
void cleanup( DataBlock *d ) {
HANDLE_ERROR( cudaFree( d->dev_bitmap ) );
}
int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_bitmap,
bitmap.image_size() ) );
bitmap.anim_and_exit( (void (*)(void*,int))generate_frame,
(void (*)(void*))cleanup );
}
Now i post the headers which are contained into a folder named common:
// book.h
#ifndef __BOOK_H__
#define __BOOK_H__
#include <stdio.h>
static void HandleError( cudaError_t err,
const char *file,
int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
file, line );
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
#define HANDLE_NULL( a ) {if (a == NULL) { \
printf( "Host memory failed in %s at line %d\n", \
__FILE__, __LINE__ ); \
exit( EXIT_FAILURE );}}
template< typename T >
void swap( T& a, T& b ) {
T t = a;
a = b;
b = t;
}
void* big_random_block( int size ) {
unsigned char *data = (unsigned char*)malloc( size );
HANDLE_NULL( data );
for (int i=0; i<size; i++)
data[i] = rand();
return data;
}
int* big_random_block_int( int size ) {
int *data = (int*)malloc( size * sizeof(int) );
HANDLE_NULL( data );
for (int i=0; i<size; i++)
data[i] = rand();
return data;
}
// a place for common kernels - starts here
__device__ unsigned char value( float n1, float n2, int hue ) {
if (hue > 360) hue -= 360;
else if (hue < 0) hue += 360;
if (hue < 60)
return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
if (hue < 180)
return (unsigned char)(255 * n2);
if (hue < 240)
return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
return (unsigned char)(255 * n1);
}
__global__ void float_to_color( unsigned char *optr,
const float *outSrc ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float l = outSrc[offset];
float s = 1;
int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
float m1, m2;
if (l <= 0.5f)
m2 = l * (1 + s);
else
m2 = l + s - l * s;
m1 = 2 * l - m2;
optr[offset*4 + 0] = value( m1, m2, h+120 );
optr[offset*4 + 1] = value( m1, m2, h );
optr[offset*4 + 2] = value( m1, m2, h -120 );
optr[offset*4 + 3] = 255;
}
__global__ void float_to_color( uchar4 *optr,
const float *outSrc ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float l = outSrc[offset];
float s = 1;
int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
float m1, m2;
if (l <= 0.5f)
m2 = l * (1 + s);
else
m2 = l + s - l * s;
m1 = 2 * l - m2;
optr[offset].x = value( m1, m2, h+120 );
optr[offset].y = value( m1, m2, h );
optr[offset].z = value( m1, m2, h -120 );
optr[offset].w = 255;
}
#if _WIN32
//Windows threads.
#include <windows.h>
typedef HANDLE CUTThread;
typedef unsigned (WINAPI *CUT_THREADROUTINE)(void *);
#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND return 0
#else
//POSIX threads.
#include <pthread.h>
typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);
#define CUT_THREADPROC void
#define CUT_THREADEND
#endif
//Create thread.
CUTThread start_thread( CUT_THREADROUTINE, void *data );
//Wait for thread to finish.
void end_thread( CUTThread thread );
//Destroy thread.
void destroy_thread( CUTThread thread );
//Wait for multiple threads.
void wait_for_threads( const CUTThread *threads, int num );
#if _WIN32
//Create thread
CUTThread start_thread(CUT_THREADROUTINE func, void *data){
return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}
//Wait for thread to finish
void end_thread(CUTThread thread){
WaitForSingleObject(thread, INFINITE);
CloseHandle(thread);
}
//Destroy thread
void destroy_thread( CUTThread thread ){
TerminateThread(thread, 0);
CloseHandle(thread);
}
//Wait for multiple threads
void wait_for_threads(const CUTThread * threads, int num){
WaitForMultipleObjects(num, threads, true, INFINITE);
for(int i = 0; i < num; i++)
CloseHandle(threads[i]);
}
#else
//Create thread
CUTThread start_thread(CUT_THREADROUTINE func, void * data){
pthread_t thread;
pthread_create(&thread, NULL, func, data);
return thread;
}
//Wait for thread to finish
void end_thread(CUTThread thread){
pthread_join(thread, NULL);
}
//Destroy thread
void destroy_thread( CUTThread thread ){
pthread_cancel(thread);
}
//Wait for multiple threads
void wait_for_threads(const CUTThread * threads, int num){
for(int i = 0; i < num; i++)
end_thread( threads[i] );
}
#endif
// cpu_anim.h
#endif // __BOOK_H__
Here's the second header:
// cpu_anim.h
#ifndef __CPU_ANIM_H__
#define __CPU_ANIM_H__
#include "gl_helper.h"
#include <iostream>
struct CPUAnimBitmap {
unsigned char *pixels;
int width, height;
void *dataBlock;
void (*fAnim)(void*,int);
void (*animExit)(void*);
void (*clickDrag)(void*,int,int,int,int);
int dragStartX, dragStartY;
CPUAnimBitmap( int w, int h, void *d = NULL ) {
width = w;
height = h;
pixels = new unsigned char[width * height * 4];
dataBlock = d;
clickDrag = NULL;
}
~CPUAnimBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return width * height * 4; }
void click_drag( void (*f)(void*,int,int,int,int)) {
clickDrag = f;
}
void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
CPUAnimBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
fAnim = f;
animExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = (char *)(void *)"";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( width, height );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
if (clickDrag != NULL)
glutMouseFunc( mouse_func );
glutIdleFunc( idle_func );
glutMainLoop();
}
// static method used for glut callbacks
static CPUAnimBitmap** get_bitmap_ptr( void ) {
static CPUAnimBitmap* gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void mouse_func( int button, int state,
int mx, int my ) {
if (button == GLUT_LEFT_BUTTON) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
if (state == GLUT_DOWN) {
bitmap->dragStartX = mx;
bitmap->dragStartY = my;
} else if (state == GLUT_UP) {
bitmap->clickDrag( bitmap->dataBlock,
bitmap->dragStartX,
bitmap->dragStartY,
mx, my );
}
}
}
// static method used for glut callbacks
static void idle_func( void ) {
static int ticks = 1;
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->fAnim( bitmap->dataBlock, ticks++ );
glutPostRedisplay();
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->animExit( bitmap->dataBlock );
//delete bitmap;
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glutSwapBuffers();
}
};
#endif // __CPU_ANIM_H__
I don't really know where the problem might be... I have already asked in the NVIDA FORUM without success... Here's the link where you can download the source code in case you want: https://developer.nvidia.com/content/cuda-example-introduction-general-purpose-gpu-programming-0
I know it is a very specific problem and it takes a lot of effort to read it but any suggestion is welcome.
I just figured out how to make it work... so i basically changed the dimensions of the window from 1024 to 512:
ripple.cu: #define DIM 1024 ----> #define DIM 512
I don't know why but it works now! I just got lucky.
Related
I'm trying to parallelize my code, but i got errors. I need to calc a Cauchy problem (it's already done) but than i need to parallelize it using OpenMP lib.
I've tried to write some code with OpenMP, but it's not working.
I've created a struct to collect result.
struct Dots {
double par;
double x;
double y;
};
This is my target function with parameter.
int ode_func (double x, const double y[], double f[], void *params)
{
double mu = *(int *)params;
f[0] = x + 2 * y[0] / (1 + mu * mu);
return GSL_SUCCESS;
}
This is the main function. I currently didn't find a way how to create a array of arrays of struct, but this is not the main problem.
void calc_cauchy_problem(struct Dots ArrayOfDots[], double x_start, double x_end, double y_start,
int count) {
int dim = 1;
double x = x_start;
double y[1] = {y_start};
int mu = 5;
int param = 0;
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys,
gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
int status = 0;
#pragma omp parallel for shared(ArrayOfDots) private(sys, param, d, status)
for (int param = 1; param < mu; param++) {
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d = gsl_odeiv2_driver_alloc_y_new (&sys,
gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
for (int i = 1; i <= count; i++)
{
double xi = x_start + i * (x_end - x_start) / count;
int status = gsl_odeiv2_driver_apply(d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
// ArrayOfDots[i].par = mu;
// ArrayOfDots[i].x = xi;
// ArrayOfDots[i].y = y[0];
}
gsl_odeiv2_driver_free (d);
}
}
The main
int main() {
double x_start = 0;
double x_end = 10;
double y_start = 0;
int count = 10;
struct Dots ArrayOfDots[count];
calc_cauchy_problem(ArrayOfDots, x_start, x_end, y_start, count);
return 0;
}
It's compiled successfully with this gcc main.c -o main -fopenmp -lgsl -std=gnu11 but when i launch it i got error
gsl: driver.c:354: ERROR: integration limits and/or step direction not consistent
Default GSL error handler invoked.
I think that the main problem with this #pragma omp parallel for shared(ArrayOfDots) private(sys, param, d, status) but i have no idea how to rewrite this in the other way.
Thanks for your responses.
UPD:
With Kaveh Vahedipour help my code partially start to work. It means that half of my for cycle start to work.
UPD UPD:
After another investigations i had the following code:
It's compile and run, but i got Process finished with exit code 4 and printf("Elapsed time = %f\n", omp_get_wtime() - start_time); don't print anything.
struct Dots {
double par;
double x;
double y;
};
int ode_func (double x, const double y[], double f[], void *params)
{
double mu = *(int *)params;
f[0] = (x + 2 * y[0]) / (1 + mu * mu);
return GSL_SUCCESS;
}
void calc_cauchy_problem(double x_start, double x_end, double y_start,
int count, int param1, int param2) {
int dim = 1;
double x = x_start;
double y[1] = {y_start};
int param = param1;
int j = 0;
int status = 0;
char filename[10];
#pragma omp parallel for private(param, status, x, y)
for (param = param1; param <= param2; param++) {
struct Dots ArrayOfDots[count];
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d =
gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
for (int i = 1; i <= count; i++) {
double xi = x_start + i * (x_end - x_start) / count;
int status = gsl_odeiv2_driver_apply(d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
ArrayOfDots[i].par = param;
ArrayOfDots[i].x = xi;
ArrayOfDots[i].y = y[0];
}
gsl_odeiv2_driver_free (d);
}
}
int main() {
double start_time = omp_get_wtime();
double x_start = 0;
double x_end = 10;
double y_start = 0;
const int count = 500;
int param1 = 1;
int param2 = 10;
calc_cauchy_problem(x_start, x_end, y_start, count, param1, param2);
printf("Elapsed time = %f\n", omp_get_wtime() - start_time);
return 0;
}
Add x to private loop vars: private(sys, param, d, status, x). Please get back to me, if you still experience issues.
void calc_cauchy_problem(double x_start, double x_end, double y_start,
int count, int param1, int param2) {
int dim = 1;
double x = x_start;
double y[1] = {y_start};
int param = param1;
int j = 0;
int status = 0;
char filename[10];
#pragma omp parallel for private(param, status, x, y)
for (param = param1; param <= param2; param++) {
struct Dots ArrayOfDots[count];
gsl_odeiv2_system sys = {ode_func, NULL, dim, ¶m};
gsl_odeiv2_driver * d =
gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rkf45, 1e-6, 1e-6, 0.0);
for (int i = 1; i <= count; i++) {
double xi = x_start + i * (x_end - x_start) / count;
int status = gsl_odeiv2_driver_apply(d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
ArrayOfDots[i].par = param;
ArrayOfDots[i].x = xi;
ArrayOfDots[i].y = y[0];
}
//write_data_to_file(param, count, ArrayOfDots);
for (int i = 0; i < count; ++i) {
printf ("%d: %f, %f, %f\n", omp_get_thread_num(),
ArrayOfDots[i].par, ArrayOfDots[i].x, ArrayOfDots[i].y);
}
gsl_odeiv2_driver_free (d);
}
}
Seems like this version works fine. I think problem was with this struct Dots ArrayOfDots[count]; and when i try to push values to this struct.
ArrayOfDots[i].par = param;
ArrayOfDots[i].x = xi;
ArrayOfDots[i].y = y[0];
Here is the full code.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
// GSL lib includes
#include <gsl/gsl_sf_bessel.h>
#include <gsl/gsl_errno.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_odeiv2.h>
int ode_func (double x, const double y[], double f[], void *params)
{
double mu = *(int *)params;
f[0] = (x + 2 * y[0]) / (1 + mu * mu);
return GSL_SUCCESS;
}
void calc_cauchy_problem(double x_start, double x_end, double y_start,
int count, int param1, int param2) {
#pragma omp parallel for
for(int param = param1; param < param2; param++) {
gsl_odeiv2_system sys = {ode_func, NULL, 1, ¶m};
gsl_odeiv2_driver * d =
gsl_odeiv2_driver_alloc_y_new (&sys, gsl_odeiv2_step_rk8pd,
1e-6, 1e-6, 0.0);
int i;
double x = x_start, x1 = x_end;
double y[1] = { y_start };
for (i = 1; i <= count; i++)
{
double xi = i * x1 / count;
int status = gsl_odeiv2_driver_apply (d, &x, xi, y);
if (status != GSL_SUCCESS)
{
printf ("error, return value=%d\n", status);
break;
}
// printf ("%d %d %.5e %.5e\n", omp_get_thread_num(), param, x, y[0]);
}
gsl_odeiv2_driver_free (d);
}
}
int main() {
double start_time = omp_get_wtime();
double x_start = 0;
double x_end = 10;
double y_start = 0;
const int count = 100000;
int param1 = 1;
int param2 = 20;
calc_cauchy_problem(x_start, x_end, y_start, count, param1, param2);
printf("Elapsed time = %f\n", omp_get_wtime() - start_time);
return 0;
}
Really thanks to Kaveh Vahedipour.
I implemented the aforementioned algorithm (https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm).
Here' s the full working code.
I receive streaming input from microphone in a non interleaved stereo Callback.
Then i fill a buffer with input samples and process this buffer into fft and ifft
functions.
Finally i use a pointer to send the processed buffer to output.
I use Asio4all V2.
I really can' t understand where is the problem because it does work properly but i can' t go beyound N=16 or the output sounds highly intermittent.
Thanks a lot in advance.
Callback + Main:
#include <stdio.h>
#include <stdlib.h>
#include "portaudio.h"
#include "FFT.h"
#define SAMPLE_RATE (44100)
#define FRAMES_PER_BUFFER (64)
#define NUM_SECONDS (10)
typedef struct
{
}
paTestData;
static int patestCallback( void *INbuffers[2], void *OUTbuffers[2],
unsigned long framesPerBuffer,
const PaStreamCallbackTimeInfo* timeInfo,
PaStreamCallbackFlags statusFlags,
void *userData )
{
paTestData *data = (paTestData*)userData;
float *inL = (float *) INbuffers [0];
float *inR = (float *) INbuffers [1];
float *outL = (float *) OUTbuffers[0];
float *outR = (float *) OUTbuffers[1];
unsigned long i;
(void) timeInfo;
(void) statusFlags;
for( i=0; i<framesPerBuffer; i++ )
{
for(int i=0; i<N; i++)
{
Y[i] = *inL + 0*I;
}
*inL++;
fft(Y, N);
ifft(Y, N);
for(int i=0; i<N; i++)
{
p_out=&Y[i];
}
*outL++ = *p_out++;
*outR++ = *p_out++;
}
return paContinue;
}
int main()
{
PaStreamParameters inputParameters;
PaStreamParameters outputParameters;
PaStream *stream;
paTestData data;
Pa_Initialize();
inputParameters.device = Pa_GetDefaultInputDevice();
inputParameters.channelCount = 2;
inputParameters.sampleFormat = paFloat32 | paNonInterleaved;
inputParameters.suggestedLatency = 0;
inputParameters.hostApiSpecificStreamInfo = NULL;
outputParameters.device = Pa_GetDefaultOutputDevice();
outputParameters.channelCount = 2;
outputParameters.sampleFormat = paFloat32 | paNonInterleaved;
outputParameters.suggestedLatency = 0;
outputParameters.hostApiSpecificStreamInfo = NULL;
Pa_OpenStream( &stream,
&inputParameters,
&outputParameters,
SAMPLE_RATE,
FRAMES_PER_BUFFER,
paClipOff,
patestCallback,
&data );
Pa_StartStream( stream );
Pa_Sleep( NUM_SECONDS * 1000);
Pa_StopStream( stream );
Pa_CloseStream( stream );
}
Header:
#ifndef FFT_H_INCLUDED
#define FFT_H_INCLUDED
#include <stdio.h>
#include <math.h>
#include <complex.h>
#define N (16)
complex Y[N];
complex *p_out;
void separate (complex *a, int n)
{
complex b[N/2];
for(int i=0; i<n/2; i++) // copy all odd elements to b
b[i] = a[i*2+1];
for(int i=0; i<n/2; i++) // copy all even elements to lower-half of a[]
a[i] = a[i*2];
for(int i=0; i<n/2; i++) // copy all odd (from heap) to upper-half of a[]
a[i+n/2] = b[i];
}
complex fft(complex *X, int m) // forward fft
{
if(m < 2) {
// bottom of recursion.
// Do nothing here, because already X[0] = x[0]
} else {
separate(X, m); // all evens to lower half, all odds to upper half
fft(X, m/2); // recurse even items
fft(X+m/2, m/2); // recurse odd items
for(int k=0; k<m/2; k++) // combine results of two half recursions
{
complex e = X[k ]; // even
complex o = X[k+m/2]; // odd
complex w = cexp( 0 + (-2.*M_PI*k/m)*I ); // w is the twiddle-factor
X[k ] = e + w * o;
X[k+m/2] = e - w * o;
}
}
}
float ifft(complex *X, int m) // inverse fft
{
for (int i=0; i<m; i++)
{
X[i]=conj(X[i]); // conjugate the complex array
}
fft(X, m); // forward fft
for (int i=0; i<m; i++)
{
X[i]=conj(X[i]); // conjugate the complex array again
}
for (int i=0; i<m; i++)
{
X[i] /= m; // scale the numbers
}
}
#endif // FFT_H_INCLUDED
I just got GSL set up on my windows box and I am trying to learn how to use the nonlinear fitting functions. First thing I did was pull an example directly off their website: https://www.gnu.org/software/gsl/manual/html_node/Example-programs-for-Nonlinear-Least_002dSquares-Fitting.html
which is here:
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_multifit_nlin.h>
#define N 40
#define FIT(i) gsl_vector_get(s->x, i)
#define ERR(i) sqrt(gsl_matrix_get(covar,i,i))
struct data {
size_t n;
double * y;
double * sigma;
};
int expb_f (const gsl_vector * x, void *data, gsl_vector * f)
{
size_t n = ((struct data *)data)->n;
double *y = ((struct data *)data)->y;
double *sigma = ((struct data *) data)->sigma;
double A = gsl_vector_get (x, 0);
double lambda = gsl_vector_get (x, 1);
double b = gsl_vector_get (x, 2);
size_t i;
for (i = 0; i < n; i++)
{
/* Model Yi = A * exp(-lambda * i) + b */
double t = i;
double Yi = A * exp (-lambda * t) + b;
gsl_vector_set (f, i, (Yi - y[i])/sigma[i]);
}
return GSL_SUCCESS;
}
int expb_df (const gsl_vector * x, void *data, gsl_matrix * J)
{
size_t n = ((struct data *)data)->n;
double *sigma = ((struct data *) data)->sigma;
double A = gsl_vector_get (x, 0);
double lambda = gsl_vector_get (x, 1);
size_t i;
for (i = 0; i < n; i++)
{
/* Jacobian matrix J(i,j) = dfi / dxj, */
/* where fi = (Yi - yi)/sigma[i], */
/* Yi = A * exp(-lambda * i) + b */
/* and the xj are the parameters (A,lambda,b) */
double t = i;
double s = sigma[i];
double e = exp(-lambda * t);
gsl_matrix_set (J, i, 0, e/s);
gsl_matrix_set (J, i, 1, -t * A * e/s);
gsl_matrix_set (J, i, 2, 1/s);
}
return GSL_SUCCESS;
}
int expb_fdf (const gsl_vector * x, void *data, gsl_vector * f, gsl_matrix * J)
{
expb_f (x, data, f);
expb_df (x, data, J);
return GSL_SUCCESS;
}
void print_state (size_t iter, gsl_multifit_fdfsolver * s);
int main (void)
{
const gsl_multifit_fdfsolver_type *T;
gsl_multifit_fdfsolver *s;
int status;
unsigned int i, iter = 0;
const size_t n = N;
const size_t p = 3;
gsl_matrix *covar = gsl_matrix_alloc (p, p);
double y[N], sigma[N];
struct data d = { n, y, sigma};
gsl_multifit_function_fdf f;
double x_init[3] = { 1.0, 0.0, 0.0 };
gsl_vector_view x = gsl_vector_view_array (x_init, p);
const gsl_rng_type * type;
gsl_rng * r;
gsl_rng_env_setup();
type = gsl_rng_default;
r = gsl_rng_alloc (type);
f.f = &expb_f;
f.df = &expb_df;
f.fdf = &expb_fdf;
f.n = n;
f.p = p;
f.params = &d;
/* This is the data to be fitted */
for (i = 0; i < n; i++)
{
double t = i;
y[i] = 1.0 + 5 * exp (-0.1 * t) + gsl_ran_gaussian (r, 0.1);
sigma[i] = 0.1;
printf ("data: %u %g %g\n", i, y[i], sigma[i]);
};
T = gsl_multifit_fdfsolver_lmsder;
s = gsl_multifit_fdfsolver_alloc (T, n, p);
gsl_multifit_fdfsolver_set (s, &f, &x.vector);
print_state (iter, s);
do
{
iter++;
status = gsl_multifit_fdfsolver_iterate (s);
printf ("status = %s\n", gsl_strerror (status));
print_state (iter, s);
if (status)
break;
status = gsl_multifit_test_delta (s->dx, s->x,
1e-4, 1e-4);
}
while (status == GSL_CONTINUE && iter < 500);
gsl_multifit_covar (s->J, 0.0, covar);
{
double chi = gsl_blas_dnrm2(s->f);
double dof = n - p;
double c = GSL_MAX_DBL(1, chi / sqrt(dof));
printf("chisq/dof = %g\n", pow(chi, 2.0) / dof);
printf ("A = %.5f +/- %.5f\n", FIT(0), c*ERR(0));
printf ("lambda = %.5f +/- %.5f\n", FIT(1), c*ERR(1));
printf ("b = %.5f +/- %.5f\n", FIT(2), c*ERR(2));
}
printf ("status = %s\n", gsl_strerror (status));
gsl_multifit_fdfsolver_free (s);
gsl_matrix_free (covar);
gsl_rng_free (r);
return 0;
}
void print_state (size_t iter, gsl_multifit_fdfsolver * s)
{
printf ("iter: %3u x = % 15.8f % 15.8f % 15.8f "
"|f(x)| = %g\n",
iter,
gsl_vector_get (s->x, 0),
gsl_vector_get (s->x, 1),
gsl_vector_get (s->x, 2),
gsl_blas_dnrm2 (s->f));
}
Ideally it should simply generate a short data set that follows a decaying exponential with some white noise on top and then fit it.
To get it running in Code::Blocks in windows I followed the procedure outlined here: installing GSL on Windows XP 32bit for use with codeblocks
It compiles without warnings even with -Wall and -Wextra flags. However, it fails on the line: gsl_multifit_fdfsolver_set (s, &f, &x.vector); with the error: multifit\fdfsolver.c:132: ERROR: vector length does not match solver. Default GSL error handler invoked.
I was a little surprised to find this in what should be raw example code, but here we are. So I am hoping someone more knowledgeable than I can tell me what I am doing wrong with this simple example.
Figured it out: They were allocating their initial vector wrong. The fixed code is here:
#include <stdlib.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdio.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_multifit_nlin.h>
#define N 40
#define FIT(i) gsl_vector_get(s->x, i)
#define ERR(i) sqrt(gsl_matrix_get(covar,i,i))
struct data {
size_t n;
double * y;
double * sigma;
};
int expb_f (const gsl_vector * x, void *data, gsl_vector * f)
{
size_t n = ((struct data *)data)->n;
double *y = ((struct data *)data)->y;
double *sigma = ((struct data *) data)->sigma;
double A = gsl_vector_get (x, 0);
double lambda = gsl_vector_get (x, 1);
double b = gsl_vector_get (x, 2);
size_t i;
for (i = 0; i < n; i++)
{
/* Model Yi = A * exp(-lambda * i) + b */
double t = i;
double Yi = A * exp (-lambda * t) + b;
gsl_vector_set (f, i, (Yi - y[i])/sigma[i]);
}
return GSL_SUCCESS;
}
int expb_df (const gsl_vector * x, void *data, gsl_matrix * J)
{
size_t n = ((struct data *)data)->n;
double *sigma = ((struct data *) data)->sigma;
double A = gsl_vector_get (x, 0);
double lambda = gsl_vector_get (x, 1);
size_t i;
for (i = 0; i < n; i++)
{
/* Jacobian matrix J(i,j) = dfi / dxj, */
/* where fi = (Yi - yi)/sigma[i], */
/* Yi = A * exp(-lambda * i) + b */
/* and the xj are the parameters (A,lambda,b) */
double t = i;
double s = sigma[i];
double e = exp(-lambda * t);
gsl_matrix_set (J, i, 0, e/s);
gsl_matrix_set (J, i, 1, -t * A * e/s);
gsl_matrix_set (J, i, 2, 1/s);
}
return GSL_SUCCESS;
}
int expb_fdf (const gsl_vector * x, void *data, gsl_vector * f, gsl_matrix * J)
{
expb_f (x, data, f);
expb_df (x, data, J);
return GSL_SUCCESS;
}
void print_state (size_t iter, gsl_multifit_fdfsolver * s);
int main (void)
{
const gsl_multifit_fdfsolver_type *T;
gsl_multifit_fdfsolver *s;
int status;
unsigned int i, iter = 0;
const size_t n = N;
const size_t p = 3;
gsl_matrix *covar = gsl_matrix_alloc (p, p);
double y[N], sigma[N];
struct data d = { n, y, sigma};
gsl_multifit_function_fdf f;
gsl_vector *x = gsl_vector_alloc(p);
for (i=0; i<p; i++)
{
gsl_vector_set(x,i,i==0 ? 1 : 0);
}
const gsl_rng_type * type;
gsl_rng * r;
gsl_rng_env_setup();
type = gsl_rng_default;
r = gsl_rng_alloc (type);
f.f = &expb_f;
f.df = &expb_df;
f.fdf = &expb_fdf;
f.n = n;
f.p = p;
f.params = &d;
/* This is the data to be fitted */
for (i = 0; i < n; i++)
{
double t = i;
y[i] = 1.0 + 5 * exp (-0.1 * t) + gsl_ran_gaussian (r, 0.1);
sigma[i] = 0.1;
printf ("data: %u %g %g\n", i, y[i], sigma[i]);
};
T = gsl_multifit_fdfsolver_lmsder;
s = gsl_multifit_fdfsolver_alloc (T, n, p);
gsl_multifit_fdfsolver_set (s, &f, x);
print_state (iter, s);
do
{
iter++;
status = gsl_multifit_fdfsolver_iterate (s);
printf ("status = %s\n", gsl_strerror (status));
print_state (iter, s);
if (status)
break;
status = gsl_multifit_test_delta (s->dx, s->x,
1e-4, 1e-4);
}
while (status == GSL_CONTINUE && iter < 500);
gsl_multifit_covar (s->J, 0.0, covar);
{
double chi = gsl_blas_dnrm2(s->f);
double dof = n - p;
double c = GSL_MAX_DBL(1, chi / sqrt(dof));
printf("chisq/dof = %g\n", pow(chi, 2.0) / dof);
printf ("A = %.5f +/- %.5f\n", FIT(0), c*ERR(0));
printf ("lambda = %.5f +/- %.5f\n", FIT(1), c*ERR(1));
printf ("b = %.5f +/- %.5f\n", FIT(2), c*ERR(2));
}
printf ("status = %s\n", gsl_strerror (status));
gsl_multifit_fdfsolver_free (s);
gsl_matrix_free (covar);
gsl_rng_free (r);
gsl_vector_free(x);
return 0;
}
void print_state (size_t iter, gsl_multifit_fdfsolver * s)
{
printf ("iter: %3u x = % 15.8f % 15.8f % 15.8f "
"|f(x)| = %g\n",
iter,
gsl_vector_get (s->x, 0),
gsl_vector_get (s->x, 1),
gsl_vector_get (s->x, 2),
gsl_blas_dnrm2 (s->f));
}
I want to know the impact on performance when using cudaMalloc or cudaMalloc3D when allocating, copying and accessing memory for a 2D array. I have code that I tried to test the run time on where on one I use cudaMalloc and on the other cudaMalloc3D. I have included the code below. An explanation on how the performance is impacted by either api would be much appreciated.
cudaMalloc code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError() ; \
if( e != cudaSuccess ) {\
printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
__global__ void solveWaveGPU ( REAL *uold, REAL *u, REAL *unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
unew[IC] = 2.0*u[IC] - uold[IC] + FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 100;
REAL *unew, *u, *uold, *uFinal, *x, *y; //pointers for the host side
REAL *d_unew, *d_u, *d_uold, *tmp; //pointers for the device
// variable declaration for timing
cudaEvent_t timeStart, timeStop;
cudaEventCreate(&timeStart);
cudaEventCreate(&timeStop);
float elapsedTime_gpu;
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
// create device copies of the variables
cudaMalloc( (void**) &d_unew, NX*NY*sizeof(REAL) ); cudaCheckError();
cudaMalloc( (void**) &d_u, NX*NY*sizeof(REAL) ); cudaCheckError();
cudaMalloc( (void**) &d_uold, NX*NY*sizeof(REAL) ); cudaCheckError();
meshGrid( x, y );
initWave( unew, u, uold, x, y );
// start timing the GPU
cudaMemcpy( d_u, u, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
cudaMemcpy( d_uold, uold, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
cudaMemcpy( d_unew, unew, NX*NY*sizeof(REAL), cudaMemcpyHostToDevice ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
// launch the GPU kernel
cudaEventRecord(timeStart, 0);
for (INT n=1; n<nTimeSteps+1; n++) {
solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
cudaDeviceSynchronize();
cudaCheckError();
tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = tmp;
}
cudaEventRecord(timeStop, 0);
cudaEventSynchronize(timeStop);
cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);
cudaMemcpy( uFinal, d_u, NX*NY*sizeof(REAL), cudaMemcpyDeviceToHost ); cudaCheckError();
printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);
free(unew); free(u); free(uold);
cudaFree(d_unew); cudaFree(d_u); cudaFree(d_uold);
free(uFinal); free(x); free(y);
cudaEventDestroy(timeStart);
cudaEventDestroy(timeStop);
return (0);
}
cudaMalloc3D code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError() ; \
if( e != cudaSuccess ) {\
printf("\nCuda Failure %s:%d: %s\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *d_u = (char *) u.ptr;
char *d_uold = (char *) uold.ptr;
char *d_unew = (char *) unew.ptr;
REAL *u_row = (REAL *)(d_u + j * u.pitch);
REAL u_IP1 = ( (REAL *)(d_u + (j+1) * u.pitch) )[i];
REAL u_IM1 = ( (REAL *)(d_u + (j-1) * u.pitch) )[i];
REAL u_JP1 = u_row[i+1];
REAL u_JM1 = u_row[i-1];
REAL u_IC = u_row[i];
REAL uold_IC = ( (REAL *)(d_uold + j * uold.pitch) )[i];
REAL *unew_row = (REAL *)(d_unew + j * unew.pitch);
unew_row[i] = 2.0 * u_IC - uold_IC + FACTOR * ( u_IP1 + u_IM1 + u_JP1 + u_JM1 - 4.0 * u_IC );
}
}
void initWave ( REAL *unew, REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 100;
REAL *unew, *u, *uold, *uFinal, *x, *y; //pointers for the host side
// variable declaration for timing
cudaEvent_t timeStart, timeStop;
cudaEventCreate(&timeStart);
cudaEventCreate(&timeStop);
float elapsedTime_gpu;
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
cudaExtent myExtent = make_cudaExtent(NX * sizeof(REAL), NY, NZ);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
// create device copies of the variables
cudaMalloc3D( &d_u , myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();
meshGrid( x, y );
initWave( unew, u, uold, x, y );
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.extent = myExtent;
cpy3D.kind = cudaMemcpyHostToDevice;
// copy 3D from u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, NX*sizeof(REAL), NX, NY);
cpy3D.dstPtr = d_u;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// copy 3D from uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, NX*sizeof(REAL), NX, NY);
cpy3D.dstPtr = d_uold;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
// launch the GPU kernel
// start timing the GPU
cudaEventRecord(timeStart, 0);
for (INT n=1; n<nTimeSteps+1; n++) {
solveWaveGPU <<<dimGrid,dimBlock>>>(d_uold, d_u, d_unew);
cudaDeviceSynchronize();
cudaCheckError();
d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}
cudaEventRecord(timeStop, 0);
cudaEventSynchronize(timeStop);
cudaEventElapsedTime(&elapsedTime_gpu, timeStart, timeStop);
// copy 3D from d_u to uFinal
cpy3D.kind = cudaMemcpyDeviceToHost;
cpy3D.srcPtr = d_u;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, NX*sizeof(REAL), NX, NY);
cudaMemcpy3D( &cpy3D ); cudaCheckError();
printf("elapsedTime on the GPU= %f s.\n", elapsedTime_gpu/1000.0);
free(u); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_u.ptr);
free(unew); cudaFree(d_uold.ptr);
free(uFinal); free(x); free(y);
cudaEventDestroy(timeStart);
cudaEventDestroy(timeStop);
return (0);
}
Timing:
cudaMalloc3D: 1.192510 s
cudaMalloc: 0.960322 s
Machine specification:
GNU/Linux x86_64
NVIDIA GeForce GTX Titan CC: 3.5
CUDA ver 7.0
The performance difference you observe is mostly due to the increased instruction overhead in the pitched memory indexing scheme. Because your array size is a large power of two in the major direction, it is very likely that the pitched array allocated with cudaMalloc3D is the same size as the naïve allocation using cudaMalloc. You may find that the performance difference between the two versions changes if you vary the problem size.
(Take note of the comments regarding compiler regressions in CUDA 7. If you refactor your code to pass the Fourier number as a kernel parameter, you will probably get a far bigger performance change than any difference due to pitched memory).
I have a strange problem dealing with 2D array on CUDA device.
#define VR 100 // rows
#define ST 13 // columns
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curand_init ( seed, id, 0, &globalState[id] );
cuPrintf("Thread id: %d \n", id);
float* row = (float*)(((char*)arr) + id * pitch);
for (int j = 0; j < ST; ++j) {
row[j] = generate(globalState, id);
}
}
int main() {
float *d_arr;
float *h_arr = new float[VR*ST];
size_t pitch;
cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);
dim3 dimBlock(VR);
dim3 dimGrid(1,1);
curandState* devStates;
cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
cudaMemcpy(h_arr, d_arr,VR*ST*sizeof(float),cudaMemcpyDeviceToHost);
for (int i=0; i<VR; i++) {
for (int j=0; j<ST; j++) {
cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
}
}
I don't get evenly distributed numbers, instead they appear in sequence of 13 with bunch of zeros in between. See: http://pastie.org/6106381
The problem is that the original data array is being allocated using cudaMallocPitch whereas the copying is being done using ordinary cudaMemcpy. This will give unexpected results because the cudaMallocPitch operation creates "padded" rows to satisfy alignment requirements, whereas cudaMemcpy assumes everything is stored contiguously. Below is code that I believe has corrections to be functional:
#include <stdio.h>
#include <iostream>
#include <curand_kernel.h>
#define VR 100 // rows
#define ST 13 // columns
__device__ float generate(curandState* globalState, int id)
{
//int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curandState localState = globalState[id];
float rand;
do {
rand = curand_uniform( &localState );
} while(rand == 0); //
globalState[id] = localState;
return rand;
}
__global__ void test(float *arr, curandState *globalState, size_t pitch, unsigned long seed) {
int id = (blockIdx.x * blockDim.x) + threadIdx.x;
curand_init ( seed, id, 0, &globalState[id] );
//cuPrintf("Thread id: %d \n", id);
float* row = (float*)(((char*)arr) + id * pitch);
for (int j = 0; j < ST; ++j) {
row[j] = generate(globalState, id);
}
}
using namespace std;
int main() {
float *d_arr;
float *h_arr = new float[VR*ST];
size_t pitch;
cudaMallocPitch(&d_arr, &pitch, ST* sizeof(float), VR);
dim3 dimBlock(VR);
dim3 dimGrid(1,1);
curandState* devStates;
cudaMalloc ( &devStates, VR*ST*sizeof( curandState ) );
test <<< dimGrid, dimBlock >>> (d_arr, devStates, pitch, unsigned(time(NULL)));
cudaMemcpy2D(h_arr, ST*sizeof(float), d_arr, pitch, ST*sizeof(float), VR ,cudaMemcpyDeviceToHost);
for (int i=0; i<VR; i++) {
for (int j=0; j<ST; j++) {
cout << "N["<<i<<"]["<<j<<"]=" << h_arr[(i*ST)+j]<<endl;
}
}
}
Compiling the above code using:
nvcc -arch=sm_20 -lcurand -o t70 t70.cu
and then running I get what appears to be "normal" output:
N[0][0]=0.876772
N[0][1]=0.550017
N[0][2]=0.49023
N[0][3]=0.530145
N[0][4]=0.501616
N[0][5]=0.326232
N[0][6]=0.438308
N[0][7]=0.857651
N[0][8]=0.462743
N[0][9]=0.38252
N[0][10]=0.258212
N[0][11]=0.194021
N[0][12]=0.895522
N[1][0]=0.559201
N[1][1]=0.257747
N[1][2]=0.430971
N[1][3]=0.707209
N[1][4]=0.599081
N[1][5]=0.0457626
N[1][6]=0.702412
N[1][7]=0.88791
N[1][8]=0.508877
N[1][9]=0.702734
N[1][10]=0.379898
N[1][11]=0.138841
N[1][12]=0.540869
(results truncated)
I think it's wrong, you should assign VR number of threads or blocks because you already loop through ST in the kernel.
maybe that will fix it.