SSE for 2D arrays - c

I want to change the following code using SSE3 instructions:
for (i=0; i<=imax+1; i++) {
/* The vertical velocity approaches 0 at the north and south
* boundaries, but fluid flows freely in the horizontal direction */
v[i][jmax] = 0.0;
u[i][jmax+1] = u[i][jmax];
v[i][0] = 0.0;
u[i][0] = u[i][1];
}
u and v are 2D arrays of type float. What I have so far is this but the program does not run correctly.
int loop2 = ((imax+1) / loopFactor) * loopFactor;
for(i=0; i<loop2; i+=loopFactor) {
__m128 zeroVec = _mm_set1_ps(0.0f);
_mm_storeu_ps(&v[i][jmax], zeroVec);
__m128 umaxVec = _mm_loadu_ps(&u[i][jmax]);
_mm_storeu_ps(&u[i][jmax+1], umaxVec);
__m128 zVec = _mm_set1_ps(0.0f);
_mm_storeu_ps(&v[i][0], zVec);
__m128 uVec = _mm_loadu_ps(&u[i][1]);
_mm_storeu_ps(&u[i][0], uVec);
}
for (; i<=imax+1; i++){
v[i][jmax] = 0.0;
u[i][jmax+1] = u[i][jmax];
v[i][0] = 0.0;
u[i][0] = u[i][1];
}
I suspect that this is because _mm_loadu_ps stores values for u[i][1], u[i][2], u[i][3] and u[i][4] but I want to store the values u[i][1], u[i+1][1], u[i+2][1], u[i+3][1] and u[i+4][1]. How can I do that? Loopfactor has a value of 4.
Any help is really appreciated.

Related

float values of array are not assigned during a for loop (STM32 / arm-none-eabi-gcc)

I want to convert a bunch of uint16_t values from an ADC into floating point voltages.
For this I use a for loop to loop through the uint16_t array and write the values to a float array.
But the float array remains 0 as if no assignment is ever made.
Outside of the for loop the conversion works.
And when I step through the program with the debugger, I see reasonable float values but they do not end up being written to the array. Why?
the temporary float value is clearly 1.847:
the temporary array index is clearly 0:
so I expect the adc_voltages[0] to be 1.847 which it is not:
Code:
Global Variables:
volatile uint16_t adc_dma_buffer[SG_MK2_ADC1_CHANNELS * SG_MK2_ADC1_N_SAMPLES];
float adc_voltages[SG_MK2_ADC1_CHANNELS * SG_MK2_ADC1_N_SAMPLES];
Later in the main():
float temp = 0.0f;
uint8_t index = 0;
for(uint8_t i=0; i<8; i++){
temp = BSP_convU1(adc_dma_buffer[i*SG_MK2_ADC1_CHANNELS+0]);
index = i*SG_MK2_ADC1_CHANNELS+0;
adc_voltages[index] = temp; // BSP_convU1(adc_dma_buffer[i*SG_MK2_ADC1_CHANNELS+0]);
adc_voltages[i*SG_MK2_ADC1_CHANNELS+1] = BSP_convU2(adc_dma_buffer[i*SG_MK2_ADC1_CHANNELS+1]);
adc_voltages[i*SG_MK2_ADC1_CHANNELS+2] = BSP_convU3(adc_dma_buffer[i*SG_MK2_ADC1_CHANNELS+2]);
adc_voltages[i*SG_MK2_ADC1_CHANNELS+3] = BSP_internalTemperature(adc_dma_buffer[i*SG_MK2_ADC1_CHANNELS+3]);
}
Where the functions return float:
float BSP_convU1(uint32_t adc_val){
float adc_vsense = SG_MK2_ADC_VREF/4096.0f * (float)adc_val;
return adc_vsense * BSP_CONV_U1_FACTOR + BSP_CAL_U1_OFFSET;
}
Edit:
Thanks for all the comments and good practice hints that I will use from now on. Especially with the usage of 2D arrays.
I was just able to resolve the issue.
I am still not sure why it happened.
"working outside of the loop" was not correct either, it only worked, when BSP_conv_I_MPPT() was evaluated inside the printf statement directly.
Anyhow, the following code with 2D arrays now works.
if(flag_SDADC_cplt){
flag_SDADC_cplt=0;
for(int i=0; i<SG_MK2_SDADC1_N_SAMPLES; i++){
adc_currents[i][0] = BSP_conv_I_Boost (sdadc_dma_buffer[i][0]);
adc_currents[i][1] = BSP_conv_I_MPPT (sdadc_dma_buffer[i][1]);
adc_currents[i][2] = BSP_conv_I_Solar (sdadc_dma_buffer[i][2]);
// order: adc[sample][channel]
}
printf("DCDC\tMPPT\tPV\n");
printf("%.3fA\t%.3fA\t%.3fA\n", adc_currents[0][0], adc_currents[1][0], adc_currents[2][0]); // order[channel][sample]
}
if(flag_ADC_cplt){
flag_ADC_cplt = 0;
for(int i=0; i<SG_MK2_ADC1_N_SAMPLES; i++){
adc_voltages[i][0] = BSP_convU1(adc_dma_buffer[i][0]);
adc_voltages[i][1] = BSP_convU2(adc_dma_buffer[i][1]);
adc_voltages[i][2] = BSP_convU3(adc_dma_buffer[i][2]);
adc_voltages[i][3] = BSP_internalTemperature(adc_dma_buffer[i][3]);
}
printf("\nPV\tCAN\tBat\tTemp\n");
printf("%.2fV\t%.2fV\t%.2fV\t%.1fC\n", adc_voltages[0][0], adc_voltages[0][1], adc_voltages[0][2], adc_voltages[0][3]);
}

C Keep Getting Double Free, despite trying to free in same form as allocation

Hey I'm trying to do a simple machine learning application for school but I keep getting double free for some reason I cannot even fathom.
float * evaluate(Network net,float * in)
{
int i,j;
float * out;
Neuron cur_neu;
for(i=0,j=0;i<net.n_lay;i++) j = net.lay_sizes[i]>j?net.lay_sizes[i]:j; //Calculating the maximum lay size for output storage
out = (float *) malloc(j*sizeof(float));
for(i=0;i<net.n_lay;i++) //Cycling through layers
{
for(j=0;j<net.lay_sizes[i];j++) //Cycling through Neurons
{
cur_neu=net.matrix[i][j];
out[j] = cur_neu.af(cur_neu.w,in,net.lay_sizes[i-1]); //Storing each answer in out
}
for(j=0;j<net.lay_sizes[i];j++) in[j] = out[j]; //Transfering answers to in
}
return out;
}
float loss(Network net, float **ins_orig, int t_steps)
{
float **profecies;
float st = .5f;
int d_steps = 4;
int t, i, j;
int out_size = net.lay_sizes[net.n_lay - 1];
int in_size = net.lay_sizes[0];
float out = 0.0f;
float **ins;
/*
d_steps = Divination Steps: Number of time steps forward the network has to predict.
The size of the output layer must be d_steps*#ins (deconsidering any conceptual i/os)
t_steps = Total of Steps: Total number of time steps to simulate.
*/
//Copying ins
ins = (float **)malloc(t_steps * sizeof(float *));
for (i = 0; i < t_steps; i++) //I allocate memory for and copy ins_orig to ins here
{
ins[i] = (float *)malloc(in_size * sizeof(float));
for (j = 0; j < in_size; j++)
ins[i][j] = ins_orig[i][j];
}
//
profecies = (float **)malloc(t_steps * sizeof(float *));
for (t = 0; t < t_steps; t++)
{
profecies[t] = evaluate(net, ins[t]);
/*
Profecy 0:
[[a1,b1,c1,d1]
[e1,f1,g1,h1]
[i1,j1,k1,l1]]
Profecy 1:
[[e2,f2,g2,h2]
[i2,j2,k2,l2]
[m2,n2,o2,q2]]
Verification for:
t=0:
loss+= abs(a1-ins[t][0]+b2-ins[t][1]...)
t=1:
t=0:
loss+= abs(e1-ins[t][0]+f2-ins[t][1]...)
*/
for (i = 0; i < d_steps; i++) //i is distance of prediction
{
if (i <= t) // stops negative profecy indexing
{
for (j = 0; j < in_size; j++)
{
out += (ins[t][j] - profecies[t-i][j+in_size*i]) * (ins[t][j] - profecies[t-i][j+in_size*i]) * (1 + st*i); //(1+st*i) The further the prediction, the bigger reward
}
}
}
}
//Free ins
for (i = 0; i < t_steps; i++) //I try to free it here, but to no avail
{
free(ins[i]);
}
free(ins);
return out;
}
I realize it's probably something very obvious but, I can't figure it out for the life of me and would appreciate the help.
Extra details that probably aren't necessary:
evaluate just passes the input to the network (stored in ins) and returns the output
both inputs and outputs are stored in float "matrixes"
Edit: Added evaluate
In your loss() you allocate the same number of floats for each ins:
ins[i] = (float *)malloc(in_size * sizeof(float));
In your evaluate() you calculate the longest lay_size, indicating that it may NOT be net.lay_sizes[0]:
for(i=0,j=0;i<net.n_lay;i++) j = net.lay_sizes[i]>j?net.lay_sizes[i]:j; //Calculating the maximum lay size for output storage
Then you are writing out-of-bounds here:
for(j=0;j<net.lay_sizes[i];j++) in[j] = out[j]; //Transfering answers to in
From that point, your memory is corrupted.

multiple analog inputs to produce individual averages for each channel

I am trying to put four analog inputs into individual channels that contain an array. Once that happens I am trying to get an average of each channel's array getting a single int or float. Lastly, I want to compare the averages in an if statement to get a serial print and divide the compared averages.
I am just confused on what in the code I pieced together is necessary.
Thank you for any advice or help.
Here is my code below
#include <Servo.h>
float sVal0 = 0.0;
float sVal1 = 0.0;
float sVal2 = 0.0;
float sVal3 = 0.0;
float sVal02 = 0.0;
float sVal13 = 0.0;
const int numReadings = 10; //# of readings needed to average
const int numChannels = 4; // 4 analog outputs
int readings[numChannels][numReadings]; // the readings from the analog input
int index; // the index of the current reading
void setup () {
Serial.begin(9600);
}
void loop () {
sVal0 = analogRead(A0);
sVal1 = analogRead(A1);
sVal2 = analogRead(A2);
sVal3 = analogRead(A3);
for (int chan = 0; chan <= numChannels; ++chan ){
Serial.println(sVal0[chan]); // serial print each array
Serial.println(sVal1[chan]);
Serial.println(sVal2[chan]);
Serial.println(sVal3[chan]);
for (int thisReading = 0; thisReading < numReadings; thisReading++) {
readings[thisReading] = 0;
index = index + 1;
}
if (index >= numReadings) {
index = 0;
sVal0_avg = sVal0[chan]/numReadings; // get average
sVal1_avg = sVal0[chan]/numReadings;
sVal2_avg = sVal0[chan]/numReadings;
sVal3_avg = sVal0[chan]/numReadings;
}
}
if (sVal1_avg > sVal3_avg) {
Serial.print("1 avg: );
Serial.println(sVal1_avg);
sVal31 = sVal3_avg / sVal1_avg;
Serial.print("comparison : ");
Serial.println(sVal31);
}
}

Trouble getting my phase-vocoder Pd external to work

I'm trying to write a Pd external that performs pitch-shifting using the phase-vocoder algorithm. It's my first time writing externals and I'm not much of a C programmer, so I hope you guys can help me out with this. I'm just attaching the perform method.
When I change the number of semitones to be transposed I get a weird behaviour. It does pitch shift, but with many weird artifacts and not in the amount that I have set. I think the code is right, but it obviously isn't; could anybody give me some hints on what the problem could be? I know it's a long piece of code and my question is ambiguous but I find it so impossible to debug this using Pd and I'm running out of ideas to test the code.
int i, j, n, frame_size, frame_size_half, overlap_in, overlap_chunk, hop_in, hop_out;
int semitones;
t_float amp_scalar, alpha;
t_pitchShifter_tilde *x = (t_pitchShifter_tilde *)(w[1]);
t_sample *in = (t_float *)(w[2]);
t_sample *out = (t_float *)(w[3]);
n = w[4]; //TamaƱo del buffer de entrada
semitones = x->semitones;
frame_size = x->frame_size;
frame_size_half = x->frame_size_half;
overlap_in = x->overlap_in;
overlap_chunk = x->hop_in;
hop_in = x->hop_in;
alpha = pow(2.0,semitones/12.0);//x->alpha;
hop_out = round(alpha*hop_in);//x->hop_out;
amp_scalar = x->amp_scalar;
// shift previous contents back
for(i=0; i<(frame_size-n); i++)
x->input_buf[i] = x->input_buf[n+i];
// buffer most recent block
for(i=0; i<n; i++)
x->input_buf[frame_size-n+i] = in[i]; // C
if(x->dsp_tick>=x->buffer_limit)
{
x->dsp_tick = 0;
// ANALYSIS
// window the signal
for(i=0; i<frame_size; i++){
x->input_buf_windowed[i] = x->input_buf[i] * x->hann[i];
}
// take FT of window
mayer_realfft(frame_size, x->input_buf_windowed);
//Debug:
if(debug){
for(i=0; i<frame_size; i++){
post("fft output %i: %f", i, x->input_buf_windowed[i]);
}
}
// unpack mayer_realfft results into R&I arrays
for(i=0; i<=frame_size_half; i++)
{
x->signal_R[i] = x->input_buf_windowed[i];
if(fabs(x->signal_R[i]) < 0.0001)
x->signal_R[i] = 0.0;
}
x->signal_I[0]=0; // DC
for(i=(frame_size-1), j=1; i>frame_size_half; i--, j++)
{
x->signal_I[j] = x->input_buf_windowed[i];
if(fabs(x->signal_I[j]) < 0.0001)
x->signal_I[j] = 0.0;
}
x->signal_I[frame_size_half]=0; // Nyquist
// PROCESSING
for(i=0; i<=frame_size_half; i++)
{
// Calculate the magnitude
x->signal_mag[i] = cabsf(x->signal_R[i]+I*x->signal_I[i]); //sqrt(x->signal_R[i]*x->signal_R[i]+x->signal_I[i]*x->signal_I[i]);
// Calculate the phase
x->signal_phase[i] = cargf(x->signal_R[i]+I*x->signal_I[i]); //sqrt
// Calculate the phase difference between consecutive frames
x->phase_dif[i] = x->signal_phase[i] - x->prev_signal_phase[i];
//Store current frame's phase for next frame's processing
x->prev_signal_mag[i] = x->signal_mag[i];
x->prev_signal_phase[i] = x->signal_phase[i];
// Remove the expected phase difference
x->phase_dif[i] -= hop_in*2*M_PI*i/frame_size; //2*M_PI*i/x->overlap_in;
// Wrap around
x->phase_dif[i] = x->phase_dif[i] + M_PI;
x->phase_dif[i] = (x->phase_dif[i]-floor(x->phase_dif[i]/(2*M_PI)) * 2*M_PI) - M_PI;
// Calculate true frequency
x->true_freq[i] = 2*M_PI*i/frame_size + x->phase_dif[i]/hop_in; //W_bin + deltaW
// Get the cumulative phase
x->cumulative_phase[i] += (hop_out)* x->true_freq[i];
// Wrap around
x->cumulative_phase[i] += M_PI;
x->cumulative_phase[i] -= floor(x->cumulative_phase[i]/(2*M_PI))*2*M_PI - M_PI;
// Save the real and imaginary part
x->signal_R[i] = x->signal_mag[i] * cos(x->cumulative_phase[i]);
x->signal_I[i] = x->signal_mag[i] * sin(x->cumulative_phase[i]);
}
// SYNTHESIS
// pack real and imaginary parts in correct order for mayer_realifft
for(i=0; i<=frame_size_half; i++)
x->input_buf_windowed[i] = x->signal_R[i];
for(i=(frame_size_half+1), j=(frame_size_half-1); i<frame_size; i++, j--)
x->input_buf_windowed[i] = x->signal_I[j];
// resynth
mayer_realifft(frame_size, x->input_buf_windowed);
// window
for(i=0; i<frame_size; i++)
{
x->input_buf_windowed[i] *= x->hann[i];
x->input_buf_windowed[i] *= amp_scalar;
}
// Overlap/Add:
// shift overlap/add buffer's previous contents back
for(i=0; i<( 2*frame_size - hop_out); i++)
x->overlap_add_buffer[i] = x->overlap_add_buffer[i+hop_out];
// Set to 0 last overlap chunk:
for(i= (2*frame_size - hop_out); i< 2*frame_size; i++)
x->overlap_add_buffer[i] = 0;
// Overlap/add most recent block
for(i=0; i<frame_size; i++)
x->overlap_add_buffer[frame_size + i] += x->input_buf_windowed[i];
// Put out a hop_out size array
for(i=0; i<hop_out; i++)
x->vocoder_output[i] = x->overlap_add_buffer[frame_size - hop_out + i];
// RE-SAMPLING
int index_floor, index_ceil;
for(i=0; i<hop_in; i++)
{
index_floor = floor(alpha*i);
index_ceil = index_floor + 1;
x->final_output[i] = x->vocoder_output[ index_floor];
x->final_output[i] += ( x->vocoder_output[index_ceil] - x->vocoder_output[index_floor] )*(alpha*i - index_floor);
}
}; // If
// OUTPUT
for(i=0; i<n; i++, out++)
*out = x->final_output[(x->dsp_tick*n)+i];
x->dsp_tick++;
return (w+5);}

Classification using LibSVM

I am using LibSVM to carry out some multi-class classifications. I trained the model using the MATLAB interface of LibSVM. I then saved this model in a format that would be recognized in C. I now want to classify using svm_predict in C. I am having trouble being able to reproduce the results that I saw in MATLAB. In fact I get the same class output irrespective of what test vector I feed in (even a vector of zeros) I think the issue is with the way I am loading the test vector x into the svm_node structure. Below is the code snippet. Do let me know if this is correct way or if I am missing something.
struct svm_model *libsvm_model = svm_load_model('mymodel.svm');
struct svm_node x[2001]; // this is for one feature vector of size 2000x1
int index = 1;
int i = 0;
for (i = 0; i < features.size(); i++) {
x[i].index = index;
x[i].value = features.at(i);
index = index + 1;
}
x[i+1].index = -1;
x[i+1].value = '?';
double result = svm_predict(libsvm_model, x);
This seems to be a problem:
x[i+1].index = -1;
x[i+1].value = '?';
libsvm requires svm_node to be an input vector, which should have positive indexes, and double values. You should not "leave" some weird empty dimension.
And by the way, you don't need index variable
for (i = 0; i < features.size(); i++) {
x[i].index = index;
x[i].value = features.at(i);
index = index + 1;
}
is equivalent to
for (i = 0; i < features.size(); i++) {
x[i].index = i + 1;
x[i].value = features.at(i);
}

Resources