The following code describes two harmonic oscillators. They are initially uncoupled and independent, and I am only looking at one of them, which is a mechanical oscillator. The other oscillator's variables have been declared and forced to 0. For each of 80 different values of wdm (the mechanical drive frequency), I run 300,000 time iterations.
//find frequency response
int index_A;
double wdm_1;
double wdm_2;
double wdm_prec=2*3.142*5e3;
printf("%d \n", index_A);
However, my code runs till 58 frequencies and gives this error:
Strangely, if I run the code in 2 parts of 40 frequencies each and append the files together, it works fine.
Resonance peak in frequency domain
Also, when I reduce the size of v0 to 3, the code works properly. However, I will need the other variables later.
int j=0;
for (j=0; j<= index_A ; j++){
printf("%d \n",j);
v0[0] = 0;
v0[1] = 0;
v0[2] = 0;
v0[3] = 0;
v0[4] = 0;
v0[5] = 0;
v0[6]= wdm;
for (i=0; i< n ; i++){
if (cabs(xa)>=1){
printf("Breaking Loop \n");
v1 = rk4vec_ameya_complex_1 ( tau, 7, v0, dtau, rk4vec_f_ameya_complex_1 );
memcpy(v0, v1, 4 * sizeof ( double complex ) );
fprintf(f1, "%g, %g \n", wdm/(2*3.142), cabs(v1[2]) );
At the end of time iterations, I save the value of the frequency wdm and last value of displacement x=v1[2] in a file and move on to do time iterations with another frequency. Hence, my file contains the frequency response.
fprintf(f1, "%g, %g \n", wdm/(2*3.142), cabs(v1[2]) );
I have used runge kutta (rk4.c) from people.sc.fsu.edu/~jburkardt/c_src/rk4/rk4.html and modified it for complex datatype using
#include <complex.h>
Following is the function runge-kutta-4 has to solve:
// Derivative (right-hand side) callback handed to the RK4 stepper: takes the
// time t and the state vector u, and returns a heap-allocated array of
// derivatives.
// NOTE(review): this excerpt omits the braces, the derivative computation and
// the condition that guards the out-of-memory branch below.
// NOTE(review): the result is malloc'ed on every call and returned, so each
// call produces a new allocation that only the caller can release -- confirm
// that the RK4 stepper frees every buffer it receives.
double complex *rk4vec_f_ameya_complex_1 ( double t, int n, double complex u[] )
double complex drive_m;
double complex drive_c;
double x;
double xrf0_1;
double complex *uprime;
// Allocates room for 7 complex values; the size is hard-coded rather than
// taken from n in the lines shown here.
// NOTE(review): the cast is to `double *` although uprime is declared as
// `double complex *`; in C the cast on malloc's result is unnecessary.
uprime = ( double * ) malloc ( 7 * sizeof ( double complex ) );
//Check if memory unavailable
printf("No memory available \n");
// Returns a null pointer (0) on this path -- presumably only when malloc
// failed; the guarding condition is not visible in this excerpt.
return 0;
///////////////////Second Order////////////////////////
// Hands the allocated derivative buffer to the caller.
return uprime;
Kindly suggest any solutions if my usage of malloc() is responsible for running out of memory.

One obvious problem - you free uprime after you have returned from rk4vec_f_ameya_complex_1, I'd expect you to be getting an unreachable code warning. Also as uprime is always the same size why do you use malloc at all? If you just make it an array of size 7 then you won't have issues with malloc and your code will probably run faster.


A lot of 0's received when using cudaMemcpy()

I've just started to learn CUDA and I wanted to fill an array (a 2D array represented as a 1D array) with random numbers. I followed other posts in order to generate random numbers, but I don't know whether the problem is in the generation of the numbers, in copying the memory back from the device, or somewhere else. The problem is that, even when I fill each cell of the array with the id of the thread that handles it (in order to see the results after copying into host memory), I receive an array that is filled with 0 in every position after retrieving the data with cudaMemcpy().
I'm programming on Visual Studio 2013, with cuda 7.5, on a i5 2500k as my processor and a 960 GTX graphic card.
Here is the main and the method where i try to fill it. I'll update the cuRand Initialization too. If you need to see something else, just tell me.
/**
 * Seeds one cuRAND generator state per launched thread.
 *
 * Launch layout: 1D grid of 1D blocks. Each thread initialises the state at
 * its flat global index, using that index as the cuRAND subsequence number so
 * every thread gets an independent stream from the same seed.
 *
 * Preconditions: `state` must point to at least gridDim.x * blockDim.x
 * elements of device memory; the kernel has no size parameter and therefore
 * cannot bounds-check. The block size must not exceed the device limit
 * (otherwise the launch itself fails and nothing is seeded).
 *
 * @param state device array of generator states to initialise
 * @param seed  seed shared by all threads
 */
__global__ void setup_cuRand(curandState * state, unsigned long seed)
{
    // Flat global index: identical to threadIdx.x for a single-block launch,
    // but keeps threads of different blocks from sharing one state.
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
    curand_init(seed, id, 0, &state[id]);
}
/**
 * Fills `adn` with one pseudo-random integer per launched thread.
 *
 * Launch layout: 1D grid of 1D blocks; each thread writes the element at its
 * flat global index and uses the generator state at the same index.
 *
 * Preconditions: `adn` and `state` must each hold at least
 * gridDim.x * blockDim.x elements, and `state` must already have been
 * initialised with curand_init. The kernel has no size parameter and
 * therefore cannot bounds-check.
 *
 * @param adn   device output array
 * @param state device array of initialised generator states
 */
__global__ void poblar(int * adn, curandState * state){
    // Flat global index so multi-block launches address distinct elements.
    const unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;

    // Work on a register copy of the state, as the generator is advanced.
    curandState localState = state[id];
    int random = curand(&localState);
    adn[id] = random;

    // Store the advanced state back; otherwise a later launch would replay
    // exactly the same numbers.
    state[id] = localState;

    // Debug aid from the original post: writing the thread index instead of
    // the random value produced the same all-zero result on the host.
    //adn[id] = id;
}
int main()
const int adnLength = NUMCROMOSOMAS * SIZECROMOSOMAS; // 256 * 128 (32.768)
const size_t adnSize = adnLength * sizeof(int);
int adnCPU[adnLength];
int * adnDevice;
cudaError_t error = cudaSetDevice(0);
if (error != cudaSuccess)
curandState * randState;
error = cudaMalloc(&randState, adnLength * sizeof(curandState));
if (error != cudaSuccess){
//Here is initialized cuRand
setup_cuRand <<<1, adnLength >> > (randState, unsigned(time(NULL)));
error = cudaMalloc((void **)&adnDevice, adnSize);
if (error == cudaErrorMemoryAllocation){// cudaSuccess){
printf("\n error");
poblar <<<1, adnLength >>> (adnDevice, randState);
error = cudaMemcpy(adnCPU, adnDevice, adnSize, cudaMemcpyDeviceToHost);
//After here, for any i, adnCPU[i] is 0 and i cannot figure what is wrong
if (error == cudaSuccess){
for (int i = 0; i < NUMCROMOSOMAS; i++){
for (int j = 0; j < SIZECROMOSOMAS; j++){
printf("%i,", adnCPU[(i*SIZECROMOSOMAS) + j]);
return 0;
EDIT after the answer solved the problem: there was one particularity beyond the answer given, which is that you need a lower number of threads (half of that quantity worked for me) in order to seed the random numbers correctly with cuRand. For some reason, I could create the threads perfectly, but I couldn't seed the pseudo-random number generator.
The maximum number of threads per block is 1024 on your hardware, hence, you may not schedule a call with adnLength if it is larger than 1024.
The error you are having is most probably a call configuration error, and it is returned by cudaPeekAtLastError, as it occurs before any GPU work, right after the triple angled-bracket call. Indeed cudaMemcpy may not return it, even though it returns error from previous asynchronous calls.
The error that may occur is cudaErrorLaunchOutOfResources.

How to make gaussian package move in numerical simulation of a square barrier in C

I am trying to use Gaussian wave packets to study the transmission probability through a square barrier via the Trotter-Suzuki formula and the fast Fourier transform (FFT), just as done in this Quantum Python article. But I need to implement it in C. In principle, the wave function should keep its shape before the collision with the square barrier. But I found that the wave function flattens dramatically with time before colliding with the square barrier. Can anybody find problems in the following code?
Here, two files - result and psi.txt - are created to store the initial and evolved wave-function. The first two data for each are x coordinates, the probability of the wave function at that x. The third data for each line in file result is the square barrier distribution. The FFT I use is shown in this C program.
#include <stdio.h>
#include <math.h>
#define h_bar 1.0
#define pi 3.1415926535897932385E0
#define m0 1.0
typedef double real;
typedef struct { real Re; real Im; } complex;
extern void fft(complex x[], int N, int flag);
/*
 * Multiplies the complex number x by y_scale * exp(i * y_power), i.e. rotates
 * x by the phase angle y_power (in radians) and scales it by y_scale.
 *
 * x       : complex operand (passed by value; the caller's copy is untouched)
 * y_power : rotation angle in radians
 * y_scale : real scale factor applied to both components
 *
 * Returns the rotated and scaled complex number.
 */
complex complex_product(complex x, real y_power, real y_scale)
{
    /* Evaluate the trigonometric factors once; both components need them. */
    const real c = cos(y_power);
    const real s = sin(y_power);
    real Re, Im;

    /* Both components must be computed from the ORIGINAL x before either
     * field is overwritten, hence the temporaries. */
    Re = (x.Re*c-x.Im*s)*y_scale;
    Im = (x.Re*s+x.Im*c)*y_scale;
    x.Re = Re; x.Im = Im;
    return x;
}
/*
 * Square-barrier potential: 1 inside the half-open interval [0, a) and 0
 * everywhere else.
 *
 * x : position at which the potential is evaluated
 * a : barrier width (the barrier starts at x = 0)
 *
 * Returns the barrier height (1) inside the barrier, otherwise 0.
 */
real potential(real x, real a)
{
    return (x<0 || x>=a) ? 0 : 1;
}
void main()
int t_steps=20, i, N=pow(2,10), m, n;
complex psi[N];
real x0=-2, p0=1, k0=p0/h_bar, x[N], k[N], V[N];
real sigma=0.5, a=0.1, x_lower=-5, x_upper=5;
real dt=1, dx=(x_upper-x_lower)/N, dk=2*pi/(dx*N);
FILE *file;
file = fopen("result", "w");
for (n=0; n<N; n++)
x[n] = x_lower+n*dx;
k[n] = k0+(n-N*0.5)*dk;
V[n] = potential(x[n], a);
psi[n].Re = exp(-pow((x[n]-x0)/sigma, 2)/2)*cos(p0*(x[n]-x0)/h_bar);
psi[n].Im = exp(-pow((x[n]-x0)/sigma, 2)/2)*sin(p0*(x[n]-x0)/h_bar);
for (m=0; m<N; m++)
fprintf(file, "%g %g %g\n", x[m], psi[m].Re*psi[m].Re+psi[m].Im*psi[m].Im, V[m]);
for (i=0; i<t_steps; i++)
printf("t_steps=%d\n", i);
for (n=0; n<N; n++)
psi[n]=complex_product(psi[n], -V[n]*dt/h_bar, 1);
psi[n]=complex_product(psi[n], -k[0]*x[n], dx/sqrt(2*pi));//x--->x_mod
fft(psi, N, 1);//psi: x_mod--->k_mod
for (m=0; m<N; m++)
psi[m]=complex_product(psi[m], -m*dk*x[0], 1);//k_mod--->k
psi[m]=complex_product(psi[m], -h_bar*k[m]*k[m]*dt/(2*m0), 1./N);
psi[m]=complex_product(psi[m], m*dk*x[0], 1);//k--->k_mod
fft(psi, N, -1);
for (n=0; n<N; n++)
psi[n] = complex_product(psi[n], k[0]*x[n], sqrt(2*pi)/dx);//x_mod--->x
file = fopen("psi.txt", "w");
for (m=0; m<N; m++)
fprintf(file, "%g %g 0\n", x[m], pow((psi[m]).Re, 2)+pow((psi[m]).Im, 2));
I use the following Python code to plot the initial and final evolved wave functions:
call: `>>> python plot.py result psi.txt`
import matplotlib.pyplot as plt
from sys import argv
for filename in argv[1:]:
print filename
f = open(filename, 'r')
lines = [line.strip(" \n").split(" ") for line in f]
x = [float(line[0]) for line in lines]
y = [float(line[2]) for line in lines]
psi = [float(line[1]) for line in lines]
print "x=%g, max=%g" % (x[psi.index(max(psi))], max(psi))
plt.plot(x, y, x, psi)
#plt.xlim([-1.0e-10, 1.0e-10])
plt.ylim([0, 3])
Your code is almost correct, apart from the fact that you are missing the initial/final half-step in the real domain and that there are some unnecessary operations (kmod -> k and back), but the main problem is that your initial conditions are really badly chosen. The time evolution of a Gaussian wavepacket results in the squared uncertainty growing quadratically in time:
$$\Delta x(t)^2 = \Delta x(0)^2 \left\{ 1 + \left[ \frac{\hbar t}{2 m \, \Delta x(0)^2} \right]^2 \right\}$$
Given your choice of particle mass and initial wavepacket width ($m = 1$, $\hbar = 1$, $\Delta x(0) = 0.5$), the term in the braces equals $1 + 4t^2$. After one timestep, the wavepacket is already significantly wider than initially, and after another timestep it becomes wider than the entire simulation box. The periodicity implied by the use of the FFT results in spatial and frequency aliasing, which, together with the overly large timestep, is why your final wavefunction looks that strange.
I would advise that you try to replicate exactly the conditions of the Python program, including the fact that the entire system is in a deep potential well ($V_{\text{border}} \to +\infty$).
The variable i is uninitialised here:
k[n] = k0+(i-N*0.5)*dk;

C Segmentation Fault in while loop-cannot resolve issue despite debugging

This is the piece of code I have which prints my diffused density matrices to a file after every nth time step of the simulation time given by fdparam_1.t_domain. t and fdparam_1.Dt are variables of the type double. All variables are declared and defined either with user input or with pre-defined values in the code.
Please note that the last time I posted the code for the segmentation fault, I modified the code as per the suggestions and this piece of code below is the modified one, although the operations are obviously the same.
int main(void)
int i,j,size,sz;
double *ux, *vy, *ux0, *vy0, *r, *r0, t, sum, sum1;
struct fdparam fdparam_1;
printf("Enter the number of grid points: \t");
scanf("%d", &fdparam_1.N);
printf("Enter the maximum number of iterations: \t");
scanf("%d", &fdparam_1.MAXIT);
printf("Enter the value for time domain and the time interval: \t");
scanf("%d\t%d", &fdparam_1.t_domain, &fdparam_1.Del_t);
printf("Enter the time step, number of molecules: \t \t");
scanf("%lf\t%lf", &fdparam_1.Dt, &fdparam_1.dens);
printf("Enter the volume of the fluid: \t");
scanf("%lf", &fdparam_1.V);
printf("Enter the diffusion coefficient and viscosity and angular velocity(in rad/s): \t \t");
scanf("%lf\t%lf\t%lf",&fdparam_1.diff, &fdparam_1.mu, &fdparam_1.wv);
double map[fdparam_1.N+2][fdparam_1.N+2],map_init[fdparam_1.N+2][fdparam_1.N+2],n_calc, time[sz+1];
r = (double*) calloc (size,sizeof(double));
r0 = (double*) calloc (size,sizeof(double));
ux = (double*) calloc (size,sizeof(double));
vy = (double*) calloc (size,sizeof(double));
ux0 = (double*)calloc (size,sizeof(double));
vy0 = (double*)calloc (size,sizeof(double));
double vol = fdparam_1.V;
FILE *fp1[sz+1];
for (i=0;i<=fdparam_1.N+1;i++){
for (j=0;j<=fdparam_1.N+1;j++){
double n_act = sum1*vol;
printf("Time = %lf \t Initial Nr. of Molecules is: %e \n",t,n_act);
int l = 0;
int k=0;
density_solve(fdparam_1.N,r,r0,ux0,vy0,fdparam_1.Dt,fdparam_1.diff,fdparam_1.MAXIT); //uses ux and vy calculated from Navier Stokes in the velocity solver to calculate density
// creating multiple files to store the density values during the simulation at every Del_t time interval
char filename[sz+1];
for (i=0;i<=fdparam_1.N+1;i++){
for (j=0;j<=fdparam_1.N+1;j++){
fprintf(fp1[l],"%lf \t",map[i][j]);
for (i=0;i<=fdparam_1.N+1;i++){
for (j=0;j<=fdparam_1.N+1;j++){
printf("Time = %lf \t Calculated Nr. of Molecules = %e \n",t,n_act);
/*
 * Adds the source field s to the field x, element by element, over the whole
 * (n+2) x (n+2) grid (interior cells plus the one-cell border), stored as a
 * flat array.
 *
 * n  : number of interior grid points per dimension
 * x  : field to update in place; must hold (n+2)*(n+2) doubles
 * s  : source terms; must hold (n+2)*(n+2) doubles
 * dt : time step; accepted for interface compatibility but not used, the
 *      source is added unscaled
 */
void add_source(int n, double *x, double *s, double dt)
{
    int i, size;

    (void)dt; /* intentionally unused, see header comment */

    /* Nothing sensible can be done without both arrays. */
    if (x == NULL || s == NULL)
        return;

    size = (n+2)*(n+2);
    for (i=0; i<size; i++) {
        x[i]+=s[i]; //add source terms to the density
    }
}
I am sorry the code is divided into numerous functions and header files and it is really difficult for me to prepare a minimal working code out of it. The above is my complete main function but here is what is happening now when I run the gdb debugger again without supplying any breakpoint, it seems to be executing the step where it is supposed to print t and n_act because this is the actual expected output which I am supposed to get but I get segmentation fault instead,
Printing source densities now:
Time = 0.000000 Initial Nr. of Molecules is: 8.820000e+06
Time = 0.000000 Calculated Nr. of Molecules = 8.820000e+06
Time = 10.000000 Calculated Nr. of Molecules = 8.820000e+06
Time = 20.000000 Calculated Nr. of Molecules = 8.820000e+06
... and so on till Time=1000
Where the issue is:
Based on your previous post, Segmentation fault - Two functions don't run simultaneously, where N is the number of points, it looks like your indexes are going out of bounds.
How to resolve it:
Revise your loop comparisons for i and for j where used for map.

DTMF Goertzel Algorithm Not Working

So I am opening a .raw file of a DTMF tone I generated in audacity. I grabbed a canned goertzel algorithm similar to the one on the wikipedia article. It doesn't seem to decode the correct numbers though.
The decoded number also changes depending on what value of N I pass to the algorithm. As far as I understood a higher value of N gives it better accuracy but should not change what number would get decoded correct?
Here is the code,
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
double goertzel(short samples[], double freq, int N)
double s_prev = 0.0;
double s_prev2 = 0.0;
double coeff, normalizedfreq, power, s;
int i;
normalizedfreq = freq / 8000;
coeff = 2*cos(2*M_PI*normalizedfreq);
for (i=0; i<N; i++)
s = samples[i] + coeff * s_prev - s_prev2;
s_prev2 = s_prev;
s_prev = s;
power = s_prev2*s_prev2+s_prev*s_prev-coeff*s_prev*s_prev2;
return power;
int main()
FILE *fp = fopen("9.raw", "rb");
short *buffer;
float *sample;
int sample_size;
int file_size;
int i=0, x=0;
float frequency_row[] = {697, 770, 852, 941};
float frequency_col[] = {1209, 1336, 1477};
float magnitude_row[4];
float magnitude_col[4];
double result;
fseek(fp, 0, SEEK_END);
file_size = ftell(fp);
fseek(fp, 0, SEEK_SET);
buffer = malloc(file_size);
buffer[x] = getc(fp);
buffer[x] = buffer[x]<<8;
buffer[x] = buffer[x] | getc(fp);
buffer[x] = getc(fp);
buffer[x] = buffer[x]<<8;
buffer[x] = buffer[x] | getc(fp);
for(i=0; i<x; i++)
//printf("%#x\n", (unsigned short)buffer[i]);
for(i=0; i<4; i++)
magnitude_row[i] = goertzel(buffer, frequency_row[i], 8000);
for(i=0; i<3; i++)
magnitude_col[i] = goertzel(buffer, frequency_col[i], 8000);
for(i=0; i<4; i++)
if(magnitude_row[i] > magnitude_row[x])
x = i;
printf("Freq: %f\t Mag: %f\n", frequency_row[x], magnitude_row[x]);
for(i=0; i<3; i++)
if(magnitude_col[i] > magnitude_col[x])
x = i;
printf("Freq: %f\t Mag: %f\n", frequency_col[x], magnitude_col[x]);
return 0;
The algorithm is actually tricky to use, even for something as simple as detecting DTMF tones. It is actually effectively a band-pass filter - it singles out a band of frequencies centered around the frequency given. This is actually a good thing - you can't count on your sampled tone to be exactly the frequency you are trying to detect.
The tricky part is attempting to set the bandwidth of the filter - how wide the range of frequencies is that will be filtered to detect a particular tone.
One of the references on the Wikipedia page on the subject (this one to be precise) talks about implementing DTMF tone detection using the Goertzel Algorithm in DSP. The principles are the same for C - to get the bandwidth right you have to use the right combination of provided constants. Apparently there is no simple formula - the paper mentions having to use a brute force search, and provides a list of optimal constants for the DTMF frequencies sampled at 8kHz.
Are you sure the audio data Audacity generated is in big-endian format? You are interpreting it in big-endian, whereas they are normally in little-endian if you run it on x86.
There are some interesting answers here.
First, the goertzel is in fact a "sympathetic" oscillator.
That means that the poles are on the unit circle in DSP terms.
The internal variables s, s_prev, s_prev2 will grow without bound if you run the code on a long block of data containing the expected tone (freq) for that detector.
This means that you need to run a kind of integrate-and-dump process to get results.
The goertzel works best at discriminating between DTMF digits if you run about 105 to 110 samples into it at a time. So set N = 110 and call the goertzel repeatedly as you run through your data.
Incidentally, real DTMF digits may only last as little as 60 msec and you should report their presence if you find more than 40 msec.
Think about the 110 samples I mentioned, means one call covers 110/8000 = 13.75 msec. If you are very fortunate, then you will see positive output from 4 consecutive iterations of calls to the detector.
In the past I have found that running a pair of detectors in parallel with staggered start times will provide better coverage of very short tone bursts.

Data Gathering Portion of CUDA Code is Unexpectedly Outputting "0"s

In the initial posting's code snippet (see below) I was not properly sending the struct to the device, this has been fixed, but the results are still the same. In my full code this mistake was not present. (There were two mistakes in that command in my initial posting -- one, the structure was being copied from HostToDevice, but was actually reversed, and the size of the copy was also wrong. Apologies; both errors were fixed, but the recompiled code still displays the zeros phenomena described below, as does my full code.)
In the haste of my de-proprietarization rewrite of the code I made a couple of errors which dalekchef kindly pointed out to me (in my rewritten code, the copy of the struct to the device was performed BEFORE the allocation on the device, and the sizes in the device cudaMalloc calls were not multiplied by the sizeof(...) of the array element type). I added these fixes, recompiled and retested, but it did not fix the problem. I also double-checked my original code -- it did not have those mistakes. Apologies again for the confusion.
I'm trying to dump statistics from a large simulations program. A similar pared down code is displayed below. Both codes exhibit the same problem -- they output zeroes, when they should be outputting averaged values.
#include "stdio.h"
struct __align__(8) DynamicVals
double a;
double b;
int n1;
int n2;
int perDump;
__device__ int *dev_arrN1, *dev_arrN2;
__device__ double *dev_arrA, *dev_arrB;
__device__ DynamicVals *dev_myVals;
__device__ int stepsA, stepsB;
__device__ double sumA, sumB;
__device__ int stepsN1, stepsN2;
__device__ int sumN1, sumN2;
__global__ void TEST
(int step, double dev_arrA[], double dev_arrB[],
int dev_arrN1[], int dev_arrN2[],DynamicVals *dev_myVals)
if (step % dev_myVals->perDump)
dev_arrN1[step/dev_myVals->perDump] = 0;
dev_arrN2[step/dev_myVals->perDump] = 0;
dev_arrA[step/dev_myVals->perDump] = 0.0;
dev_arrB[step/dev_myVals->perDump] = 0.0;
stepsA = 0;
stepsB = 0;
stepsN1 = 0;
stepsN2 = 0;
sumA = 0.0;
sumB = 0.0;
sumN1 = 0;
sumN2 = 0;
sumA += dev_myVals->a;
sumB += dev_myVals->b;
sumN1 += dev_myVals->n1;
sumN2 += dev_myVals->n2;
if ( sumA > 100000000 )
dev_arrA[step/dev_myVals->perDump] +=
sumA / stepsA;
sumA = 0.0;
stepsA = 0;
if ( sumB > 100000000 )
dev_arrB[step/dev_myVals->perDump] +=
sumB / stepsB;
sumB = 0.0;
stepsB = 0;
if ( sumN1 > 1000000 )
dev_arrN1[step/dev_myVals->perDump] +=
sumN1 / stepsN1;
sumN1 = 0;
stepsN1 = 0;
if ( sumN2 > 1000000 )
dev_arrN2[step/dev_myVals->perDump] +=
sumN2 / stepsN2;
sumN2 = 0;
stepsN2 = 0;
if ((step+1) % dev_myVals->perDump)
dev_arrA[step/dev_myVals->perDump] +=
sumA / stepsA;
dev_arrB[step/dev_myVals->perDump] +=
sumB / stepsB;
dev_arrN1[step/dev_myVals->perDump] +=
sumN1 / stepsN1;
dev_arrN2[step/dev_myVals->perDump] +=
sumN2 / stepsN2;
int main()
const int TOTAL_STEPS = 10000000;
DynamicVals vals;
int *arrN1, *arrN2;
double *arrA, *arrB;
int statCnt;
vals.perDump = TOTAL_STEPS/10;
statCnt = TOTAL_STEPS/vals.perDump+1;
vals.a = 30000.0;
vals.b = 60000.0;
vals.n1 = 10000;
vals.n2 = 20000;
cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(double) );
cudaMalloc( (void**)&dev_arrB, statCnt*sizeof(double) );
cudaMalloc( (void**)&dev_arrN1, statCnt*sizeof(int) );
cudaMalloc( (void**)&dev_arrN2, statCnt*sizeof(int) );
cudaMalloc( (void**)&dev_myVals, sizeof(DynamicVals));
cudaMemcpy(dev_myVals, &vals, sizeof(DynamicVals),
arrA = (double *)malloc(statCnt * sizeof(double));
arrB = (double *)malloc(statCnt * sizeof(double));
arrN1 = (int *)malloc(statCnt * sizeof(int));
arrN2 = (int *)malloc(statCnt * sizeof(int));
for (int i=0; i< TOTAL_STEPS; i++)
TEST<<<1,1>>>(i, dev_arrA,dev_arrB,dev_arrN1,dev_arrN2,dev_myVals);
cudaMemcpy(arrA,dev_arrA,statCnt * sizeof(double),cudaMemcpyDeviceToHost);
cudaMemcpy(arrB,dev_arrB,statCnt * sizeof(double),cudaMemcpyDeviceToHost);
cudaMemcpy(arrN1,dev_arrN1,statCnt * sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(arrN2,dev_arrN2,statCnt * sizeof(int),cudaMemcpyDeviceToHost);
for (int i=0; i< statCnt; i++)
printf("Step: %d ; A=%g B=%g N1=%d N2=%d\n",
arrA[i], arrB[i], arrN1[i], arrN2[i]);
Step: 0 ; A=0 B=0 N1=0 N2=0
Step: 1000000 ; A=0 B=0 N1=0 N2=0
Step: 2000000 ; A=0 B=0 N1=0 N2=0
Step: 3000000 ; A=0 B=0 N1=0 N2=0
Step: 4000000 ; A=0 B=0 N1=0 N2=0
Step: 5000000 ; A=0 B=0 N1=0 N2=0
Step: 6000000 ; A=0 B=0 N1=0 N2=0
Step: 7000000 ; A=0 B=0 N1=0 N2=0
Step: 8000000 ; A=0 B=0 N1=0 N2=0
Step: 9000000 ; A=0 B=0 N1=0 N2=0
Step: 10000000 ; A=0 B=0 N1=0 N2=0
Now, if I were to use a small period for my dumps or if my #s were smaller, I could get away with just a direct
divide by period and the end of period
...algorithm, but I use temporary sums as otherwise my int would overflow (the double wouldn't overflow, but I was concerned about it losing precision).
If I use the above direct algorithm for smaller values I get correct non-zero values, but the second I use the intermediates (e.g. stepsA, sumA, etc.) the values go to zero.
I know I'm doing something silly here... what am I missing?
A.) Yes, I know this code in its above form is not parallel and by itself does not warrant parallelization. It is part of a small statistics collecting portion of a much longer code. In that code it is encased in a thread index specific conditional logic to prevent clashing (making it parallel) and serves as data gathering to a simulations program (which warrants parallelization). Hopefully you can understand where the above code originates and avoid snide comments about its lack of thread-safety. (This disclaimer is added out of past experience receiving unproductive comments from people who didn't understand I was posting an excerpt, not a full code, despite me writing in less explicit terms as such.)
B.) Yes, I know the names of the variables are ambiguous. That is the point. The code I'm working on is proprietary, though it will eventually be open sourced. I only write this as I have posted similarly anonymized codes in the past and received rude commentary about my naming convention.
C.) Yes, I have read the CUDA manual several times, though I do make errors and I admit there's some features I don't understand. I'm not using shared memory here, but I am using shared memory (OF COURSE) in my full code.
D.) Yes, the above code does represent the exact same features as the data dumping portion of my non-working code, with the logic not related to this particular problem removed, and with it the thread safety conditional. The variable names have been changed, but algorithmically it should be unaltered and this is verified by the exact same non-working output (zeroes).
E.) I do realize the "dynamic" struct in the above snippet has non-dynamic values. I named the structure that because in the full code, this struct contains simulations data, and is dynamic. The static nature in the pared-down code should not make the statistics collecting code fail, it will simply mean that the average for each dump should be constant (and non-zero).
A couple of things:
It seems like you are calling cudaMemcpy for dev_MyVals before you are calling cudaMalloc for it. This is not how it should be.
ALSO: You do not multiply by sizeof int when you do your cudaMalloc calls.
You should really check all of your CUDA calls cudaMalloc/cudaMemcpy for an error code. They should all return an error or CUDA_SUCCESS. I believe the CUDA examples all show how to do this.
Also, for future reference NEVER use the modulo operator in CUDA it is incredibly slow. Just Google for "Modulo CUDA" for some alternatives.
Let me know how it goes, this will probably take a couple of iterations to fix.
The biggest problem I see here is one of scope. The way this code is written leads me to conclude that you might not understand how variable scoping in C++ works in general, and how device and host code scope works in CUDA in particular. A couple of observations:
When you do this type of thing in code:
__device__ double *dev_arrA, *dev_arrB;
__global__ void TEST(int step, double dev_arrA[], double dev_arrB[], ....)
you have a variable scope problem. dev_arrA is declared at both compilation unit scope and function scope. The two declarations do not refer to the same variable -- inside the kernel, the function scope declaration (the kernel argument) takes precedence over the compilation unit scope declaration. When you modify that variable, you are modifying the kernel scope declaration, not the __device__ variable. This can lead to all sorts of subtle and unexpected behaviour. It is much better to avoid ever having the same variable declared at multiple scopes.
When you declare a variable using the __device__ specifier, it is intended to be exclusively a device context symbol, and should only be used directly in device code. So something like this:
__device__ double *dev_arrA;
int main()
cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(double) );
is illegal. You cannot call an API function like cudaMalloc directly on a __device__ variable. Even though it will compile (because of the hackery involved in the CUDA compilation trajectories for host and device code), it is incorrect to do so. In the above example dev_arrA is a device symbol. You can interact with it via the API symbol manipulation calls, but that is all it is technically legal to do. In your code, variables intended to hold device pointers and be passed as kernel arguments (like dev_arrA) should be declared at main() scope, and passed by value to the kernel.
It is a combination of the above two things which is probably causing your problems.
But the difficulty is that you have chosen to post roughly 150 lines of code (a lot of which is redundant) as a repro case. I doubt anyone cares enough about your problems to go through that much code with a fine-tooth comb and pinpoint where the precise problem is. Further, your habit of doing these nasty "top edits" in your questions quickly turns what might have been reasonably written starting points into unintelligible pseudo-changelogs which are incredibly hard to follow and are unlikely to be of help to anyone. Also, the mildly passive-aggressive notes section serves no real purpose - it adds nothing of value to the question.
So I will leave you with a greatly simplified version of the code you posted which I think has all the basic things which you are trying to do working. I leave it as an "exercise for the reader" to turn it back into whatever it is that you are trying to do.
#include "stdio.h"
typedef float Real;
struct __align__(8) DynamicVals
Real a;
int n1;
int perDump;
__device__ int stepsA;
__device__ Real sumA;
__device__ int stepsN1;
__device__ int sumN1;
__global__ void TEST
(int step, Real dev_arrA[], int dev_arrN1[], DynamicVals *dev_myVals)
if (step % dev_myVals->perDump)
dev_arrN1[step/dev_myVals->perDump] = 0;
dev_arrA[step/dev_myVals->perDump] = 0.0;
stepsA = 0;
stepsN1 = 0;
sumA = 0.0;
sumN1 = 0;
sumA += dev_myVals->a;
sumN1 += dev_myVals->n1;
dev_arrA[step/dev_myVals->perDump] += sumA / stepsA;
dev_arrN1[step/dev_myVals->perDump] += sumN1 / stepsN1;
/**
 * Reports a failed CUDA runtime call and optionally terminates the program.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise prints the runtime's
 * error string together with the source location to stderr and, if `abort`
 * is true, exits with the error code as the process exit status.
 *
 * @param code  status returned by a CUDA runtime API call
 * @param file  source file of the call site (normally __FILE__); taken as
 *              const because string literals are not writable
 * @param line  source line of the call site (normally __LINE__)
 * @param abort when true (the default), exit the process on error
 */
inline void gpuAssert(cudaError_t code, const char *file, int line,
                      bool abort=true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code),
            file, line);
    if (abort) exit(code);
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
int main()
const int TOTAL_STEPS = 1000;
DynamicVals vals;
int *arrN1;
Real *arrA;
int statCnt;
vals.perDump = TOTAL_STEPS/10;
statCnt = TOTAL_STEPS/vals.perDump;
vals.a = 30000.0;
vals.n1 = 10000;
Real *dev_arrA;
int *dev_arrN1;
DynamicVals *dev_myVals;
gpuErrchk( cudaMalloc( (void**)&dev_arrA, statCnt*sizeof(Real)) );
gpuErrchk( cudaMalloc( (void**)&dev_arrN1, statCnt*sizeof(int)) );
gpuErrchk( cudaMalloc( (void**)&dev_myVals, sizeof(DynamicVals)) );
gpuErrchk( cudaMemcpy(dev_myVals, &vals, sizeof(DynamicVals),
cudaMemcpyHostToDevice) );
arrA = (Real *)malloc(statCnt * sizeof(Real));
arrN1 = (int *)malloc(statCnt * sizeof(int));
for (int i=0; i< TOTAL_STEPS; i++) {
TEST<<<1,1>>>(i, dev_arrA,dev_arrN1,dev_myVals);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaMemcpy(arrA,dev_arrA,statCnt * sizeof(Real),
cudaMemcpyDeviceToHost) );
gpuErrchk( cudaMemcpy(arrN1,dev_arrN1,statCnt * sizeof(int),
cudaMemcpyDeviceToHost) );
for (int i=0; i< statCnt; i++)
printf("Step: %d ; A=%g N1=%d\n",
i*vals.perDump, arrA[i], arrN1[i] );
