Increasing n-body program performance using OpenMP

Increasing n-body program performance using OpenMP - c

My goal is to increase the performance of a code that simulates the n-body problem.
This is where the time is to be calculated. The two functions that need to be parallelized are the calculate_forces() and the *move_bodies() functions but since the loop control variable t is a double I cannot have a #pragma omp parallel for statement there.
t0 = gettime ();
for (t = 0; t < t_end; t += dt)
{
// draw bodies
show_bodies (window);
// computation
calculate_forces ();
move_bodies ();
}
// print out calculation speed every second
t0 = gettime () - t0;
The two functions calculate_forces() and move_bodies() with the respective directives that I used are the following:
static void
calculate_forces ()
{
double distance, magnitude, factor, r;
vector_t direction;
int i, j;
#pragma omp parallel private(distance,magnitude,factor,direction)
{
#pragma omp for private(i,j)
for (i = 0; i < n_body - 1; i++)
{
for (j = i + 1; j < n_body; j++)
{
r = SQR (bodies[i].position.x - bodies[j].position.x) + SQR (bodies[i].position.y - bodies[j].position.y);
// avoid numerical instabilities
if (r < EPSILON)
{
// this is not how nature works :-)
r += EPSILON;
}
distance = sqrt (r);
magnitude = (G * bodies[i].mass * bodies[j].mass) / (distance * distance);
factor = magnitude / distance;
direction.x = bodies[j].position.x - bodies[i].position.x;
direction.y = bodies[j].position.y - bodies[i].position.y;
// +force for body i
#pragma omp critical
{
bodies[i].force.x += factor * direction.x;
bodies[i].force.y += factor * direction.y;
// -force for body j
bodies[j].force.x -= factor * direction.x;
bodies[j].force.y -= factor * direction.y;
}
}
}
}
}
static void
move_bodies ()
{
vector_t delta_v, delta_p;
int i;
#pragma omp parallel private(delta_v,delta_p,i)
{
#pragma omp for
for (i = 0; i < n_body; i++)
{
// calculate delta_v
delta_v.x = bodies[i].force.x / bodies[i].mass * dt;
delta_v.y = bodies[i].force.y / bodies[i].mass * dt;
// calculate delta_p
delta_p.x = (bodies[i].velocity.x + delta_v.x / 2.0) * dt;
delta_p.y = (bodies[i].velocity.y + delta_v.y / 2.0) * dt;
// update body velocity and position
#pragma omp critical
{
bodies[i].velocity.x += delta_v.x;
bodies[i].velocity.y += delta_v.y;
bodies[i].position.x += delta_p.x;
bodies[i].position.y += delta_p.y;
}
// reset forces
bodies[i].force.x = bodies[i].force.y = 0.0;
if (bounce)
{
// bounce on boundaries (i.e. it's more like billard)
if ((bodies[i].position.x < -body_distance_factor) || (bodies[i].position.x > body_distance_factor))
bodies[i].velocity.x = -bodies[i].velocity.x;
if ((bodies[i].position.y < -body_distance_factor) || (bodies[i].position.y > body_distance_factor))
bodies[i].velocity.y = -bodies[i].velocity.y;
}
}
}
The values of bodies.velocity and bodies.position are changed in the move bodies function, but I couldn't use a reduction.
There is also a checksum function to calculate if the calculated checksum is equal to the reference checksum. That function looks like this:
static unsigned long
checksum()
{
unsigned long checksum = 0;
// initialize bodies
for (int i = 0; i < n_body; i++)
{
// random position vector
checksum += (unsigned long)round(bodies[i].position.x);
checksum += (unsigned long)round(bodies[i].position.y);
}
return checksum;
}
This function uses the previously calculated values of bodies.position.x and bodies.position.y which were calculated in the move_bodies function hence the reason why I used a critical block while calculating those value which didn't seem to yield a correct answer. Can anyone give me some insight on where I am going wrong? Thank you in advance.

Related

Writing a wave generator with SDL

I've coded a simple sequencer in C with SDL 1.2 and SDL_mixer(to play .wav file). It works well and I want to add some audio synthesis to this program. I've look up the and I found this sinewave code using SDL2(https://github.com/lundstroem/synth-samples-sdl2/blob/master/src/synth_samples_sdl2_2.c)
Here's how the sinewave is coded in the program:
static void build_sine_table(int16_t *data, int wave_length)
{
/*
Build sine table to use as oscillator:
Generate a 16bit signed integer sinewave table with 1024 samples.
This table will be used to produce the notes.
Different notes will be created by stepping through
the table at different intervals (phase).
*/
double phase_increment = (2.0f * pi) / (double)wave_length;
double current_phase = 0;
for(int i = 0; i < wave_length; i++) {
int sample = (int)(sin(current_phase) * INT16_MAX);
data[i] = (int16_t)sample;
current_phase += phase_increment;
}
}
static double get_pitch(double note) {
/*
Calculate pitch from note value.
offset note by 57 halfnotes to get correct pitch from the range we have chosen for the notes.
*/
double p = pow(chromatic_ratio, note - 57);
p *= 440;
return p;
}
static void audio_callback(void *unused, Uint8 *byte_stream, int byte_stream_length) {
/*
This function is called whenever the audio buffer needs to be filled to allow
for a continuous stream of audio.
Write samples to byteStream according to byteStreamLength.
The audio buffer is interleaved, meaning that both left and right channels exist in the same
buffer.
*/
// zero the buffer
memset(byte_stream, 0, byte_stream_length);
if(quit) {
return;
}
// cast buffer as 16bit signed int.
Sint16 *s_byte_stream = (Sint16*)byte_stream;
// buffer is interleaved, so get the length of 1 channel.
int remain = byte_stream_length / 2;
// split the rendering up in chunks to make it buffersize agnostic.
long chunk_size = 64;
int iterations = remain/chunk_size;
for(long i = 0; i < iterations; i++) {
long begin = i*chunk_size;
long end = (i*chunk_size) + chunk_size;
write_samples(s_byte_stream, begin, end, chunk_size);
}
}
static void write_samples(int16_t *s_byteStream, long begin, long end, long length) {
if(note > 0) {
double d_sample_rate = sample_rate;
double d_table_length = table_length;
double d_note = note;
/*
get correct phase increment for note depending on sample rate and table length.
*/
double phase_increment = (get_pitch(d_note) / d_sample_rate) * d_table_length;
/*
loop through the buffer and write samples.
*/
for (int i = 0; i < length; i+=2) {
phase_double += phase_increment;
phase_int = (int)phase_double;
if(phase_double >= table_length) {
double diff = phase_double - table_length;
phase_double = diff;
phase_int = (int)diff;
}
if(phase_int < table_length && phase_int > -1) {
if(s_byteStream != NULL) {
int16_t sample = sine_wave_table[phase_int];
sample *= 0.6; // scale volume.
s_byteStream[i+begin] = sample; // left channel
s_byteStream[i+begin+1] = sample; // right channel
}
}
}
}
}
I don't understand how I could change the sinewave formula to genrate other waveform like square/triangle/saw ect...
EDIT:
Because I forgot to explain it, here's what I tried.
I followed the example I've seen on this video series(https://www.youtube.com/watch?v=tgamhuQnOkM). The source code of the method provided by the video is on github, and the wave generation code is looking like this:
double w(double dHertz)
{
return dHertz * 2.0 * PI;
}
// General purpose oscillator
double osc(double dHertz, double dTime, int nType = OSC_SINE)
{
switch (nType)
{
case OSC_SINE: // Sine wave bewteen -1 and +1
return sin(w(dHertz) * dTime);
case OSC_SQUARE: // Square wave between -1 and +1
return sin(w(dHertz) * dTime) > 0 ? 1.0 : -1.0;
case OSC_TRIANGLE: // Triangle wave between -1 and +1
return asin(sin(w(dHertz) * dTime)) * (2.0 / PI);
}
Because the C++ code here uses windows soun api I could not copy/paste this method to make it work on the piece of code I've found using SDL2.
So I tried to this in order to obtain a square wave:
static void build_sine_table(int16_t *data, int wave_length)
{
double phase_increment = ((2.0f * pi) / (double)wave_length) > 0 ? 1.0 : -1.0;
double current_phase = 0;
for(int i = 0; i < wave_length; i++) {
int sample = (int)(sin(current_phase) * INT16_MAX);
data[i] = (int16_t)sample;
current_phase += phase_increment;
}
}
This didn't gave me a square wave but more a saw wave.
Here's what I tried to get a triangle wave:
static void build_sine_table(int16_t *data, int wave_length)
{
double phase_increment = (2.0f * pi) / (double)wave_length;
double current_phase = 0;
for(int i = 0; i < wave_length; i++) {
int sample = (int)(asin(sin(current_phase) * INT16_MAX)) * (2 / pi);
data[i] = (int16_t)sample;
current_phase += phase_increment;
}
}
This also gave me another type of waveform, not triangle.

You’d replace the sin function call with call to one of the following:
// this is a helper function only
double normalize(double phase)
{
double cycles = phase/(2.0*M_PI);
phase -= trunc(cycles) * 2.0 * M_PI;
if (phase < 0) phase += 2.0*M_PI;
return phase;
}
double square(double phase)
{ return (normalize(phase) < M_PI) ? 1.0 : -1.0; }
double sawtooth(double phase)
{ return -1.0 + normalize(phase) / M_PI; }
double triangle(double phase)
{
phase = normalize(phase);
if (phase >= M_PI)
phase = 2*M_PI - phase;
return -1.0 + 2.0 * phase / M_PI;
}
You’d be building tables just like you did for the sine, except they’d be the square, sawtooth and triangle tables, respectively.

OpenMP parallel for loop

void calc_mean(float *left_mean, float *right_mean, const uint8_t* left, const uint8_t* right, int32_t block_width, int32_t block_height, int32_t d, uint32_t w, uint32_t h, int32_t i,int32_t j)
{
*left_mean = 0;
*right_mean = 0;
int32_t i_b;
float local_left = 0, local_right = 0;
for (i_b = -(block_height-1)/2; i_b < (block_height-1)/2; i_b++) {
#pragma omp parallel for reduction(+:local_left,local_right)
for ( int32_t j_b = -(block_width-1)/2; j_b < (block_width-1)/2; j_b++) {
// Borders checking
if (!(i+i_b >= 0) || !(i+i_b < h) || !(j+j_b >= 0) || !(j+j_b < w) || !(j+j_b-d >= 0) || !(j+j_b-d < w)) {
continue;
}
// Calculating indices of the block within the whole image
int32_t ind_l = (i+i_b)*w + (j+j_b);
int32_t ind_r = (i+i_b)*w + (j+j_b-d);
// Updating the block means
//*left_mean += *(left+ind_l);
//*right_mean += *(right+ind_r);
local_left += left[ind_l];
local_right += right[ind_r];
}
}
*left_mean = local_left/(block_height * block_width);
*right_mean = local_right/(block_height * block_width);
}
This now makes the program execution longer than non-threaded version. I added private(left,right) but it leads to bad memory access for ind_l.

I think this should get you closer to what you want, although I'm not quite sure about one final part.
float local_left, local_right = 0;
for ( int32_t i_b = -(block_height-1)/2; i_b < (block_height-1)/2; i_b++) {
#pragma omp for schedule(static, CORES) reduction(+:left_mean, +: right_mean)
{
for ( int32_t j_b = -(block_width-1)/2; j_b < (block_width-1)/2; j_b++) {
if (your conditions) continue;
int32_t ind_l = (i+i_b)*w + (j+j_b);
int32_t ind_r = (i+i_b)*w + (j+j_b-d);
local_left += *(left+ind_l);
local_right += *(right+ind_r);
}
}
}
*left_mean = local_left/(block_height * block_width);
*right_mean = local_right/(block_height * block_width);
Part I am unsure of is whether you need the schedule() and how to do two different reductions. I know for one reduction, you can simply do
reduction(+:left_mean)
EDIT: some reference for the schedule() http://pages.tacc.utexas.edu/~eijkhout/pcse/html/omp-loop.html#Loopschedules
It looks like you do not need this, but using it could produce a better runtime

OpenMP parallelization not efficient

I'm trying to parallelize this code using OpenMP.
for(t_step=0;t_step<Ntot;t_step++) {
// current row
if(cur_row + 1 < Npt_x) cur_row++;
else cur_row = 0;
// get data from file which update only the row "cur_row" of array val
read_line(f_u, val[cur_row]);
// computes
for(i=0;i<Npt_x;i++) {
for(j=0;j<Npt_y;j++) {
i_corrected = cur_row - i;
if(i_corrected < 0) i_corrected = Npt_x + i_corrected;
R[i][j] += val[cur_row][0]*val[i_corrected][j]/Ntot;
}
}
}
with
- val and R declared as **double,
- Npt_x and Npt_y are about 500,
- Ntot is about 10^6.
I've done this
for(t_step=0;t_step<Ntot;t_step++) {
// current row
if(cur_row + 1 < Npt_x) cur_row++;
else cur_row = 0;
// get data from file which update only the row "cur_row" of array val
read_line(f_u, val[cur_row]);
// computes
#pragma omp parallel for collapse(2), private(i,j,i_corrected)
for(i=0;i<Npt_x;i++) {
for(j=0;j<Npt_y;j++) {
i_corrected = cur_row - i;
if(i_corrected < 0) i_corrected = Npt_x + i_corrected;
R[i][j] += val[cur_row][0]*val[i_corrected][j]/Ntot;
}
}
}
The problem is that it doesn't seem to be efficient. Is there a way to use OpenMP more efficiently in this case ?
Many thks

Right now, I would try something like this:
for(t_step=0;t_step<Ntot;t_step++) {
// current row
if(cur_row + 1 < Npt_x)
cur_row++;
else
cur_row = 0;
// get data from file which update only the row "cur_row" of array val
read_line(f_u, val[cur_row]);
// computes
#pragma omp parallel for private(i,j,i_corrected)
for(i=0;i<Npt_x;i++) {
i_corrected = cur_row - i;
if(i_corrected < 0)
i_corrected += Npt_x;
double tmp = val[cur_row][0]/Ntot;
#if defined(_OPENMP) && _OPENMP > 201306
#pragma omp simd
#endif
for(j=0;j<Npt_y;j++) {
R[i][j] += tmp*val[i_corrected][j];
}
}
}
However, since the code will be memory bound, that's not sure it'll get you much parallel speed-up... Worth a try though.

C: Accessing lookup tables faster?

I have a piece of code that traces 4 sines at a time.
My original code was making roughly 12000 sin() function calls per frame and was running at 30 fps.
I tried optimizing it by generating lookup tables. I ended up with 16 different lookup tables. I declared and load them in a separate header file at the top of my program. Each table is declared like so:
static const float d4_lookup[800] {...};
Now, with this new method I actually lost fps?! I'm running at 20 fps now instead of 30. Each frame now only has to do 8 sin / cos calls and 19200 lookup calls vs 12000 sin() calls.
I compile using gcc with -O3 flag on. At the moment, the lookup tables are included at the top and are part of the global scope of the program.
I assume I'm not loading them in the right memory or something to that effect. How can I speed up the lookup time?
** EDIT 1 **
As requested, here's the function that uses the lookup calls, it is called once per frame:
void
update_sines(void)
{
static float c1_sin, c1_cos;
static float c2_sin, c2_cos;
static float c3_sin, c3_cos;
static float c4_sin, c4_cos;
clock_gettime(CLOCK_MONOTONIC, &spec);
s = spec.tv_sec;
ms = spec.tv_nsec * 0.0000001;
etime = concatenate((long)s, ms);
c1_sin = sinf(etime * 0.00525);
c1_cos = cosf(etime * 0.00525);
c2_sin = sinf(etime * 0.007326);
c2_cos = cosf(etime * 0.007326);
c3_sin = sinf(etime * 0.0046);
c3_cos = cosf(etime * 0.0046);
c4_sin = sinf(etime * 0.007992);
c4_cos = cosf(etime * 0.007992);
int k;
for (k = 0; k < 800; ++k)
{
sine1[k] = a1_lookup[k] * ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k])) + d1_lookup[k];
sine2[k] = a2_lookup[k] * ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k])) + d2_lookup[k] + 50;
sine3[k] = a3_lookup[k] * ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k])) + d3_lookup[k];
sine4[k] = a4_lookup[k] * ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k])) + d4_lookup[k] + 50;
}
}
** UPDATE **
For anyone reading this thread, I gave up on this problem. I tried using OpenCL kernels, structs, SIMD instructions as well as all the solutions shown here. In the end the original code that computed the sinf() 12800 per frame worked faster than the lookup tables since the lookup tables didn't fit into the cache. Yet it was still only doing 30 fps. It just had too much going on to keep up with my 60fps expectations. I've decided to take a different direction. Thanks to everyone who contributed to this thread. Most of these solutions would probably work to get some half decent speed improvements but nothing like the 200% speed up I needed here to have the lookup tables work the way I wanted.

Sometimes it's hard to know what's slowing you down, but potentially you are going to ruin your cache hits, you could try a lookup of a struct
typedef struct
{
float bx1_sin;
float bx2_sin;
float bx3_sin;
float bx4_sin;
float bx1_cos;
etc etc
including sine1,2,3,4 as well
} lookup_table
then
lookup_table lookup[800]
now everything at the kth lookup will be in the same small chunk of memory.
also, if you use a macro that takes k as a parameter to do do the contents of the loop lets say SINE_CALC(k), or an inline function...
you can do
for (k = 0; k < 800; ++k)
{
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
SINE_CALC(k); k++;
}
if you do a macro, make sure the k++ is outside the macro call like shown

Try unrolling your loops like this:
for (k = 0; k < 800; ++k)
{
sine1[k] = a1_lookup[k];
sine2[k] = a2_lookup[k];
sine3[k] = a3_lookup[k];
sine4[k] = a4_lookup[k];
}
for (k = 0; k < 800; ++k)
{
sine1[k] *= ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k]));
sine2[k] *= ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k]));
sine3[k] *= ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k]));
sine4[k] *= ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k]));
}
for (k = 0; k < 800; ++k)
{
sine1[k] += d1_lookup[k];
sine2[k] += d2_lookup[k] + 50;
sine3[k] += d3_lookup[k];
sine4[k] += d4_lookup[k] + 50;
}
By accessing fewer lookup tables in each loop, you should be able to stay in the cache. The middle loop could be split up as well, but you'll need to create an intermediate table for one of the sub-expressions.

Intel processors can predict serial access (and perform prefetch) for up to 4 arrays both for forward and backward traverse. At least this was true in Core 2 Duo days. Split your for in:
for (k = 0; k < 800; ++k)
sine1[k] = a1_lookup[k] * ((bx1_sin_lookup[k] * c1_cos) + (c1_sin * bx1_cos_lookup[k])) + d1_lookup[k];
for (k = 0; k < 800; ++k)
sine2[k] = a2_lookup[k] * ((bx2_sin_lookup[k] * c2_cos) + (c2_sin * bx2_cos_lookup[k])) + d2_lookup[k] + 50;
for (k = 0; k < 800; ++k)
sine3[k] = a3_lookup[k] * ((bx3_sin_lookup[k] * c3_cos) + (c3_sin * bx3_cos_lookup[k])) + d3_lookup[k];
for (k = 0; k < 800; ++k)
sine4[k] = a4_lookup[k] * ((bx4_sin_lookup[k] * c4_cos) + (c4_sin * bx4_cos_lookup[k])) + d4_lookup[k] + 50;
I guess you have more cache load than benchmarks in other answers so this does matters. I recommend you not to unroll loops, compilers do it well.

Using a simple sin lookup table will yields >20% speed increase on my linux machine (vm, gcc, 64bit). Interestingly, the size of lookup table (within reasonable < L1 cache size values) does not influence the speed of execution.
Using a fastsin simple implementation from here I got >45% improvement.
Code:
#include <math.h>
#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>
#include <time.h>
#define LOOKUP_SIZE 628
uint64_t currentTimestampUs( void )
{
struct timeval tv;
time_t localTimeRet;
uint64_t timestamp = 0;
//time_t tzDiff = 0;
struct tm when;
int64_t localeOffset = 0;
{
localTimeRet = time(NULL);
localtime_r ( &localTimeRet, &when );
localeOffset = when.tm_gmtoff * 1000000ll;
}
gettimeofday ( &tv, NULL );
timestamp = ((uint64_t)((tv.tv_sec) * 1000000ll) ) + ( (uint64_t)(tv.tv_usec) );
timestamp+=localeOffset;
return timestamp;
}
const double PI = 3.141592653589793238462;
const double PI2 = 3.141592653589793238462 * 2;
static float sinarr[LOOKUP_SIZE];
void initSinArr() {
int a =0;
for (a=0; a<LOOKUP_SIZE; a++) {
double arg = (1.0*a/LOOKUP_SIZE)*((double)PI * 0.5);
float sinval_f = sin(arg); // double computation earlier to avoid losing precision on value
sinarr[a] = sinval_f;
}
}
float sinlookup(float val) {
float normval = val;
while (normval < 0) {
normval += PI2;
}
while (normval > PI2) {
normval -= PI2;
}
int index = LOOKUP_SIZE*(2*normval/PI);
if (index > 3*LOOKUP_SIZE) {
index = -index + 4*LOOKUP_SIZE;//LOOKUP_SIZE - (index-3*LOOKUP_SIZE);
return -sinarr[index];
} else if (index > 2*LOOKUP_SIZE) {
index = index - 2*LOOKUP_SIZE;
return -sinarr[index];
} else if (index > LOOKUP_SIZE) {
index = 2*LOOKUP_SIZE - index;
return sinarr[index];
} else {
return sinarr[index];
}
}
float sin_fast(float x) {
while (x < -PI)
x += PI2;
while (x > PI)
x -= PI2;
//compute sine
if (x < 0)
return 1.27323954 * x + .405284735 * x * x;
else
return 1.27323954 * x - 0.405284735 * x * x;
}
int main(void) {
initSinArr();
int a = 0;
float val = 0;
const int num_tries = 100000;
uint64_t startLookup = currentTimestampUs();
for (a=0; a<num_tries; a++) {
for (val=0; val<PI2; val+=0.01) {
float compval = sinlookup(val);
(void)compval;
}
}
uint64_t startSin = currentTimestampUs();
for (a=0; a<num_tries; a++) {
for (val=0; val<PI2; val+=0.01) {
float compval = sin(val);
(void)compval;
}
}
uint64_t startFastSin = currentTimestampUs();
for (a=0; a<num_tries; a++) {
for (val=0; val<PI2; val+=0.01) {
float compval = sin_fast(val);
(void)compval;
}
}
uint64_t end = currentTimestampUs();
int64_t lookupMs = (startSin - startLookup)/1000;
int64_t sinMs = (startFastSin - startSin)/1000;
int64_t fastSinMs = (end - startFastSin)/1000;
printf(" lookup: %lld ms\n", lookupMs );
printf(" sin: %lld ms\n", sinMs );
printf(" diff: %lld ms\n", sinMs-lookupMs);
printf(" diff%: %lld %\n", 100*(sinMs-lookupMs)/sinMs);
printf("fastsin: %lld ms\n", fastSinMs );
printf(" sin: %lld ms\n", sinMs );
printf(" diff: %lld ms\n", sinMs-fastSinMs);
printf(" diff%: %lld %\n", 100*(sinMs-fastSinMs)/sinMs);
}
Sample result:
lookup: 2276 ms
sin: 3004 ms
diff: 728 ms
diff%: 24 %
fastsin: 1500 ms
sin: 3004 ms
diff: 1504 ms
diff%: 50 %

Writing a simple Discrete Fourier Transform for real inputs in C

So I'm trying to write the Discrete Fourier Transform in C to work with real 32-bit float wav files. It reads in 2 frames at a time (one for each channel, but for my purposes I'm assuming they are both the same and so I use frame[0]). This code is supposed to write out the amplitude spectrum for an input file by probing it with frequencies 20,40,60,...,10000. I am using a Hanning window on the input frames. I want to avoid using complex numbers if I can. When I run this, it gives me some very strange amplitudes (most of which are extremely small, and are not associated with the correct frequencies), which makes me believe I am making a fundamental mistake in my computation. Can somebody offer some insight into what is happening here? Here is my code:
int windowSize = 2205;
int probe[500];
float hann[2205];
int j, n;
// initialize probes to 20,40,60,...,10000
for (j=0; j< len(probe); j++) {
probe[j] = j*20 + 20;
fprintf(f, "%d\n", probe[j]);
}
fprintf(f, "-1\n");
// setup the Hann window
for (n=0; n< len(hann); n++) {
hann[n] = 0.5*(cos((2*M_PI*n/(float)windowSize) + M_PI))+0.5;
}
float angle = 0.0;
float w = 0.0; // windowed sample
float realSum[len(probe)]; // stores the real part of the probe[j] within a window
float imagSum[len(probe)]; // stores the imaginary part of probe[j] within window
float mag[len(probe)]; // stores the calculated amplitude of probe[j] within a window
for (j=0; j<len(probe);j++) {
realSum[j] = 0.0;
imagSum[j] = 0.0;
mag[j] = 0.0;
}
n=0; //count number of samples within current window
framesread = psf_sndReadFloatFrames(ifd,frame,1);
totalread = 0;
while (framesread == 1){
totalread++;
// window the frame with hann value at current sample
w = frame[0]*hann[n];
// determine both real and imag product values at sample n for all probe freqs times the windowed signal
for (j=0; j<len(probe);j++) {
angle = (2.0 * M_PI * probe[j] * n) / windowSize;
realSum[j] = realSum[j] + (w * cos(angle));
imagSum[j] = imagSum[j] + (w * sin(angle));
}
n++;
// checks to see if current window has ended
if (totalread % windowSize == 0) {
fprintf(f, "B(%f)\n", totalread/44100.0);
printf("%f breakpoint written\n", totalread/44100.0);
for (j=0; j < len(mag); j++) { // print out the amplitudes
realSum[j] = realSum[j]/windowSize;
imagSum[j] = imagSum[j]/windowSize;
mag[j] = sqrt(pow((double)realSum[j],2)+pow((double)imagSum[j],2))/windowSize;
fprintf(f, "%d\t%f\n", probe[j], mag[j]);
realSum[j] = 0.0;
imagSum[j] = 0.0;
}
n=0;
}
framesread = psf_sndReadFloatFrames(ifd,frame,1);
}

I think the error is in the calculation of the angle. The increment of the angle for each sample is dependent on the sampling frequency.
Something like this (you seem to have 44100Hz):
angle = (2.0 * M_PI * probe[j] * n) / 44100;
Your sample window will contain one full cycle for your lowest probed frequency 20Hz. If you loop n up to 2205 then that angle would be 2*M_PI.
What you saw was probably aliasing because your reference had the frequency 2205Hz and all frequencies above 1102Hz was aliased to lower frequencies.

With code below - only slightly reorganised to compile and create a fake sample, I do not get all zeroes. I have changed the output call to at the end from:
fprintf(f, "%d\t%f\n", probe[j], mag[j] );
to
if (mag[j] > 1e-7)
fprintf(f, "%d\t%f\n", probe[j], mag[j] * 10000);
This just makes it easier to see the non-zero data. Maybe the only issue is understanding the scale factor? Note how I faked input to generate a pure tone as a test case.
#include <math.h>
#include <stdio.h>
#define M_PI 3.1415926535
#define SAMPLE_RATE 44100.0f
#define len(array) (sizeof array/sizeof *array)
unsigned psf_sndReadFloatFrames(FILE* inFile,float* frame,int framesToRead)
{
static float counter = 0;
float frequency = 1000;
float time = counter++;
float phase = time/SAMPLE_RATE*frequency;
*frame = (float)sin(phase);
return counter < SAMPLE_RATE;
}
void discreteFourier(FILE* f)
{
FILE* ifd = 0;
float frame[1];
int windowSize = 2205;
int probe[500];
float hann[2205];
float angle = 0.0;
float w = 0.0; // windowed sample
float realSum[len(probe)]; // stores the real part of the probe[j] within a window
float imagSum[len(probe)]; // stores the imaginary part of probe[j] within window
float mag[len(probe)]; // stores the calculated amplitude of probe[j] within a window
int j, n;
unsigned framesread = 0;
unsigned totalread = 0;
for (j=0; j<len(probe);j++) {
realSum[j] = 0.0;
imagSum[j] = 0.0;
mag[j] = 0.0;
}
// initialize probes to 20,40,60,...,10000
for (j=0; j< len(probe); j++) {
probe[j] = j*20 + 20;
fprintf(f, "%d\n", probe[j]);
}
fprintf(f, "-1\n");
// setup the Hann window
for (n=0; n< len(hann); n++)
{
hann[n] = 0.5*(cos((2*M_PI*n/(float)windowSize) + M_PI))+0.5;
}
n=0; //count number of samples within current window
framesread = psf_sndReadFloatFrames(ifd,frame,1);
totalread = 0;
while (framesread == 1){
totalread++;
// window the frame with hann value at current sample
w = frame[0]*hann[n];
// determine both real and imag product values at sample n for all probe freqs times the windowed signal
for (j=0; j<len(probe);j++) {
angle = (2.0 * M_PI * probe[j] * n) / windowSize;
realSum[j] = realSum[j] + (w * cos(angle));
imagSum[j] = imagSum[j] + (w * sin(angle));
}
n++;
// checks to see if current window has ended
if (totalread % windowSize == 0) {
fprintf(f, "B(%f)\n", totalread/SAMPLE_RATE);
printf("%f breakpoint written\n", totalread/SAMPLE_RATE);
for (j=0; j < len(mag); j++) { // print out the amplitudes
realSum[j] = realSum[j]/windowSize;
imagSum[j] = imagSum[j]/windowSize;
mag[j] = sqrt(pow((double)realSum[j],2)+pow((double)imagSum[j],2))/windowSize;
if (mag[j] > 1e-7)
fprintf(f, "%d\t%f\n", probe[j], mag[j] * 10000);
realSum[j] = 0.0;
imagSum[j] = 0.0;
}
n=0;
}
framesread = psf_sndReadFloatFrames(ifd,frame,1);
}
}