How can I improve / speed up this frequent function?
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define M 10 // This is fixed
#define N 8 // This is NOT fixed
// Assumptions: 1. x, a and b are arrays of 10 (M); c is an array of 8 (N).
// 2. y and z are matrices of 8 x 10 (N x M).
// Requirement: 1. return the value of ret;
// 2. get all elements of array c
float fnFrequentFunction(const float* x, const float* const* y, const float* const* z,
                         const float* a, const float* b, float* c, int n)
{
    register float tmp;
    register float sum;
    register float ret = 0;
    register const float* yy;
    register const float* zz;
    int i;
    for (i = 0; i < n; i++) // n == 1, 2, 4, or 8
    {
        sum = 0;
        yy = y[i];
        zz = z[i];
        tmp = x[0] - yy[0]; sum += tmp * tmp * zz[0];
        tmp = x[1] - yy[1]; sum += tmp * tmp * zz[1];
        tmp = x[2] - yy[2]; sum += tmp * tmp * zz[2];
        tmp = x[3] - yy[3]; sum += tmp * tmp * zz[3];
        tmp = x[4] - yy[4]; sum += tmp * tmp * zz[4];
        tmp = x[5] - yy[5]; sum += tmp * tmp * zz[5];
        tmp = x[6] - yy[6]; sum += tmp * tmp * zz[6];
        tmp = x[7] - yy[7]; sum += tmp * tmp * zz[7];
        tmp = x[8] - yy[8]; sum += tmp * tmp * zz[8];
        tmp = x[9] - yy[9]; sum += tmp * tmp * zz[9];
        ret += (c[i] = log(a[i] * b[i]) + sum);
    }
    return ret;
}
// In the main function, all values are just example data.
int main()
{
    float x[M] = {0.001251f, 0.563585f, 0.193304f, 0.808741f, 0.585009f, 0.479873f, 0.350291f, 0.895962f, 0.622840f, 0.746605f};
    float* y[N];
    float* z[N];
    float a[M] = {0.870205f, 0.733879f, 0.711386f, 0.588244f, 0.484176f, 0.852962f, 0.168126f, 0.684286f, 0.072573f, 0.632160f};
    float b[M] = {0.871487f, 0.998108f, 0.798608f, 0.134831f, 0.576281f, 0.410779f, 0.402936f, 0.522935f, 0.623218f, 0.193030f};
    float c[N];
    float t1[M] = {0.864406f, 0.709006f, 0.091433f, 0.995727f, 0.227180f, 0.902585f, 0.659047f, 0.865627f, 0.846767f, 0.514359f};
    float t2[M] = {0.866817f, 0.581347f, 0.175542f, 0.620197f, 0.781823f, 0.778588f, 0.938688f, 0.721610f, 0.940214f, 0.811353f};
    int i, j;
    int n = 10000000;
    long start;
    // Initialize y, z for the test example:
    for (i = 0; i < N; ++i)
    {
        y[i] = (float*)malloc(sizeof(float) * M);
        z[i] = (float*)malloc(sizeof(float) * M);
        for (j = 0; j < M; ++j)
        {
            y[i][j] = t1[j] * j;
            z[i][j] = t2[j] * j;
        }
    }
    // Speed test here:
    start = clock();
    while (--n)
        fnFrequentFunction(x, y, z, a, b, c, 8);
    printf("Time used: %ld\n", clock() - start);
    // Output the result here:
    printf("fnFrequentFunction == %f\n", fnFrequentFunction(x, y, z, a, b, c, 8));
    for (j = 0; j < N; ++j)
        printf("  c[%d] == %f\n", j, c[j]);
    printf("\n");
    // Free memory
    for (j = 0; j < N; ++j)
    {
        free(y[j]);
        free(z[j]);
    }
    return 0;
}
Any suggestions are welcome :-)
I feel terrible that I made a big mistake in my function. The above code is the new one. I'm rechecking it now to make sure that is what I need.
Put this block outside the loop (this assumes y and z do not change per iteration, as in the original version of the question, so sum did not depend on i):
sum = 0;
tmp = x[0] - y[0]; sum += tmp * tmp * z[0];
tmp = x[1] - y[1]; sum += tmp * tmp * z[1];
tmp = x[2] - y[2]; sum += tmp * tmp * z[2];
tmp = x[3] - y[3]; sum += tmp * tmp * z[3];
tmp = x[4] - y[4]; sum += tmp * tmp * z[4];
tmp = x[5] - y[5]; sum += tmp * tmp * z[5];
tmp = x[6] - y[6]; sum += tmp * tmp * z[6];
tmp = x[7] - y[7]; sum += tmp * tmp * z[7];
tmp = x[8] - y[8]; sum += tmp * tmp * z[8];
tmp = x[9] - y[9]; sum += tmp * tmp * z[9];
This function is perfectly amenable to SIMD processing. Look into your compiler documentation for the intrinsic functions that correspond to the SSE instructions.
You could break up the dependence chain on the sum variable. Instead of a single sum accumulator, use two accumulators sum1 and sum2 alternately - one for even, one for odd indices. Add them up afterwards.
The single biggest performance bottleneck here is the log() function. Check if an approximation would be sufficient. The calculation of this could also be vectorized - I believe Intel published a high-performance math library - including vectorized versions of functions like log(). You may like to use this.
You are operating on floats here, and log() uses double precision. Use logf() instead. It may (or may not) be faster. It will certainly be no slower.
If your compiler understands C99, place a restrict qualifier on the pointers which are function arguments. This tells the compiler that those arrays do not overlap, and may help it generate more efficient code.
Change the way matrices are kept in memory. Instead of an array of pointers pointing to disjoint memory blocks, use a single array M*N elements in size.
So, to put it all together, this is what the function could look like. This is portable C99. Using compiler-specific SIMD intrinsics, it could be made WAAAAY faster.
UPDATE: Note that I changed the way input matrices are defined. A matrix is a single, large array.
float fnFrequentFunction(const float *restrict x, const float *restrict y,
                         const float *restrict z, const float *restrict a,
                         const float *restrict b, float *restrict c, int n)
{
    float ret = 0;
    const float *restrict yy = y; // for readability
    const float *restrict zz = z; // -||-
    for (int i = 0; i < n; i++, yy += M, zz += M) // n == 1, 2, 4, or 8
    {
        float sum = 0;
        float sum2 = 0;
        for (int j = 0; j < M; j += 2) // M == 10
        {
            float tmp = x[j] - yy[j];      sum  += tmp  * tmp  * zz[j];
            float tmp2 = x[j+1] - yy[j+1]; sum2 += tmp2 * tmp2 * zz[j+1];
        }
        sum += sum2;
        ret += (c[i] = logf(a[i] * b[i]) + sum);
    }
    return ret;
}
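To give a feel for the SIMD route, here is a rough sketch of the inner weighted-squared-difference accumulation using SSE intrinsics. It is only a sketch: it assumes M == 10 (two full 4-float chunks plus a 2-element scalar tail), contiguous row storage as above, and that unaligned loads are acceptable; the helper name is made up.

#include <xmmintrin.h> // SSE intrinsics (_mm_* functions)

// Hypothetical helper: returns the sum of z[j] * (x[j] - y[j])^2 for j = 0..9.
static float weighted_sq_diff_sse(const float *x, const float *y, const float *z)
{
    __m128 acc = _mm_setzero_ps();
    for (int j = 0; j < 8; j += 4)                   // two full 4-wide chunks
    {
        __m128 d = _mm_sub_ps(_mm_loadu_ps(x + j), _mm_loadu_ps(y + j));
        acc = _mm_add_ps(acc, _mm_mul_ps(_mm_mul_ps(d, d), _mm_loadu_ps(z + j)));
    }
    float lanes[4];
    _mm_storeu_ps(lanes, acc);
    float sum = lanes[0] + lanes[1] + lanes[2] + lanes[3];
    for (int j = 8; j < 10; j++)                     // 2-element scalar tail
    {
        float tmp = x[j] - y[j];
        sum += tmp * tmp * z[j];
    }
    return sum;
}

The outer loop over i and the logf() call stay scalar; only the 10-element accumulation is vectorized here.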
Use memoization to cache the results. This is a time/space trade-off optimization.
It's really easy to do this in Perl with the memoize package, and probably in many other dynamic languages. In C, you'd need to roll your own.
Use a wrapper function to make a hash of the arguments and use it to check if the value has already been calculated. If it has, return it. If not, pass through to the original function and cache the returned result.
Alternatively, you could pre-calculate your lookup table at program startup, or even calculate it once and then persist it, depending on your needs.
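In C, a minimal sketch of the wrapper idea could look like the code below. It is a one-entry cache keyed on x, a and b only, so it assumes y, z and n do not change between calls (as in the benchmark above); the function name and the cache policy are made up for illustration.

#include <string.h>

// Hypothetical one-entry cache around fnFrequentFunction.
float fnFrequentFunctionMemo(const float *x, const float *const *y, const float *const *z,
                             const float *a, const float *b, float *c, int n)
{
    static int   have_cached = 0;
    static float cached_x[M], cached_a[M], cached_b[M];
    static float cached_c[N], cached_ret;

    if (have_cached &&
        memcmp(x, cached_x, sizeof cached_x) == 0 &&
        memcmp(a, cached_a, sizeof cached_a) == 0 &&
        memcmp(b, cached_b, sizeof cached_b) == 0)
    {
        memcpy(c, cached_c, n * sizeof *c);   // replay the side effect on c
        return cached_ret;
    }

    cached_ret = fnFrequentFunction(x, y, z, a, b, c, n);
    memcpy(cached_x, x, sizeof cached_x);
    memcpy(cached_a, a, sizeof cached_a);
    memcpy(cached_b, b, sizeof cached_b);
    memcpy(cached_c, c, n * sizeof *c);
    have_cached = 1;
    return cached_ret;
}

Whether this pays off depends entirely on how often identical inputs recur; for the benchmark loop above it would turn almost every call into a memcmp plus a memcpy.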
The suggestions above about hoisting the tmp calculations out of the loop are correct. I might even consider dropping those 10 lines into a for loop of their own, as this may improve code-cache efficiency.
Beyond this, you start getting to the point where you want to know what type of processor you are targeting: whether it has native SIMD support, an FPU, what kind of cache it uses, and so on. Also, depending on how many arguments get passed via registers, combining the parameters into a single struct and passing it by reference might get you a small boost. Declaring vars as register may or may not help. Again, profiling and examining the assembler output will answer that.
As sum is known before the loop, you could get away with adding M times its value after the loop for a boost. That just leaves the two log multiplications on the inside.
If the loop count is always 8 or follows some other known pattern, you could do some minor loop unrolling, but the gains there are almost nil compared with the cost of the log calls.
The only other major thing to look at is log(). How is it implemented? Could you perhaps roll your own faster version using table lookups if your input range is known? Better yet, tabulate the log products if there's enough RAM available.
Just a few thoughts.
Do you use compiler optimization?
The register keyword before variables is antiquated for modern compilers. You can even hurt performance if you use it together with compiler optimization. For example, a simple gcc compilation gives:
Time used: 8720000
and without register floats:
Time used: 8710000
I know this is not much.
I assume you wrote out all those sums by hand to avoid a for loop because you think a loop would be much slower. It is not; a modern compiler will do that optimization for you too.
One big optimization, I think, is to use a lookup table for log. If you don't mind the memory, that will be faster; call log() only when you are out of the table's range.
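A rough sketch of that table idea follows. Everything here — the range, the resolution and the linear interpolation — is an assumption for illustration; it only pays off if the products a[i] * b[i] fall in a known, bounded range.

#include <math.h>

#define LOG_TABLE_SIZE 4096
#define LOG_TABLE_MIN  0.001f   // assumed smallest product a[i] * b[i]
#define LOG_TABLE_MAX  1.0f     // assumed largest product

static float log_table[LOG_TABLE_SIZE];
static float log_table_step;

// Call once at startup.
static void init_log_table(void)
{
    log_table_step = (LOG_TABLE_MAX - LOG_TABLE_MIN) / (LOG_TABLE_SIZE - 1);
    for (int i = 0; i < LOG_TABLE_SIZE; i++)
        log_table[i] = logf(LOG_TABLE_MIN + i * log_table_step);
}

// Table lookup with linear interpolation; falls back to logf() out of range.
static float fast_logf(float v)
{
    if (v < LOG_TABLE_MIN || v > LOG_TABLE_MAX)
        return logf(v);
    float pos = (v - LOG_TABLE_MIN) / log_table_step;
    int   idx = (int)pos;
    if (idx >= LOG_TABLE_SIZE - 1)
        return log_table[LOG_TABLE_SIZE - 1];
    float frac = pos - idx;
    return log_table[idx] + frac * (log_table[idx + 1] - log_table[idx]);
}

Whether the interpolation error is acceptable depends on what the results are used for.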
I wonder if doing it with scaled integers rather than floats might speed it up. I don't know the data ranges, so I don't know if this is even possible.
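Purely as an illustration of what scaled integers could look like, here is a minimal sketch of the inner accumulation in Q16.16 fixed point. It assumes the values are small enough to avoid overflow (which the unknown data ranges may not allow) and ignores both the float-to-fixed conversion cost and the log() term.

#include <stdint.h>

#define FX_SHIFT 16  // Q16.16 fixed point

static inline int32_t to_fixed(float f) { return (int32_t)(f * (1 << FX_SHIFT)); }

// Fixed-point version of the inner accumulation: sum of z[j] * (x[j] - y[j])^2.
static int64_t weighted_sq_diff_fixed(const int32_t *x, const int32_t *y,
                                      const int32_t *z, int m)
{
    int64_t sum = 0;                               // Q16.16 accumulator
    for (int j = 0; j < m; j++)
    {
        int64_t d  = (int64_t)x[j] - y[j];         // Q16.16 difference
        int64_t d2 = (d * d) >> FX_SHIFT;          // rescale product back to Q16.16
        sum += (d2 * z[j]) >> FX_SHIFT;            // rescale again to Q16.16
    }
    return sum;
}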
In addition to Andrey's answer, you can add some prefetching to the loop:
float fnFrequentFunction(const float* x, const float* y, const float* z,
                         const float* a, const float* b, float* c, int M)
{
    register float tmp;
    register float sum;
    register float ret = 0;
    int i;

    sum = 0;
    tmp = x[0] - y[0]; sum += tmp * tmp * z[0];
    tmp = x[1] - y[1]; sum += tmp * tmp * z[1];
    tmp = x[2] - y[2]; sum += tmp * tmp * z[2];
    tmp = x[3] - y[3]; sum += tmp * tmp * z[3];
    tmp = x[4] - y[4]; sum += tmp * tmp * z[4];
    tmp = x[5] - y[5]; sum += tmp * tmp * z[5];
    tmp = x[6] - y[6]; sum += tmp * tmp * z[6];
    tmp = x[7] - y[7]; sum += tmp * tmp * z[7];
    tmp = x[8] - y[8]; sum += tmp * tmp * z[8];
    tmp = x[9] - y[9]; sum += tmp * tmp * z[9];

    for (i = 0; i < M; i++) // M == 1, 2, 4, or 8
    {
        //----------------------------------------
        // Prefetch data into the processor's cache
        //----------------------------------------
        float a_value = a[i];
        float b_value = b[i];
        float c_value = 0.0;

        //----------------------------------------
        // Calculate using prefetched data.
        //----------------------------------------
        c_value = log(a_value * b_value) + sum;
        c[i] = c_value;
        ret += c_value;
    }
    return ret;
}
You could also try unrolling the loop:
float a_value = 0.0;
float b_value = 0.0;
float c_value = 0.0;

--M;
switch (M)  // intentional fall-through from each case into the next
{
case 7:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 6:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 5:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 4:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 3:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 2:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 1:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    --M;
case 0:
    a_value = a[M];
    b_value = b[M];
    c_value = log(a_value * b_value) + sum;
    c[M] = c_value;
    ret += c_value;
    break;
}
Looking at the unrolled version, you could take the " + sum" out of the "loop" and add it in at the end as:
ret += (M + 1) * sum;
since sum doesn't change.
Finally, another alternative is to perform all multiplications at once, followed by all log calculations, then sum up everything:
float product[8];
for (i = 0; i < M; ++i)
{
    product[i] = a[i] * b[i];
}
for (i = 0; i < M; ++i)
{
    c[i] = log(product[i]);
    ret += c[i];
}
ret += M * sum;
If you are calling this multiple times when a and b have not changed, then combine a and b into logab where logab[i] = log(a[i] * b[i]) since a and b are not used anywhere else.
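A small sketch of that precomputation (the array name logab and the setup function are made up for the example; it assumes a and b really are fixed across calls):

#include <math.h>

float logab[M];   // filled once, reused on every call

void precompute_logab(const float *a, const float *b, int count)
{
    for (int i = 0; i < count; i++)
        logab[i] = logf(a[i] * b[i]);
}

Inside the hot loop, logf(a[i] * b[i]) + sum then collapses to logab[i] + sum, which removes the log call from the critical path entirely.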
This appears to be a gaussian mixture model computation. Several years ago, I worked on an effort to optimize this same algorithm which was being used as part of a speech processing program. I investigated a number of optimizations like you're attempting to do but never found anything using straight C to gain more than just a few percent. My biggest gain came from recoding the basic GMM kernel using SIMD instructions. Since that still wasn't providing the performance I was looking for, the next (and final) step was to use an Nvidia GPU. This sort of worked but programming that thing was a headache in itself.
Sorry I can't be more helpful but I don't think you are going to pick up more than just a nominal amount of speed if you're sticking to a regular CPU.
Related
I would like to DSP-optimize a simple multiply-accumulate for-loop for the QC Hexagon. From the manual, it's not perfectly clear to me how to do that, both for the vector version and the non-vector version.
Assume my loop has a length that is a multiple of 4 (e.g., 64), i.e., I want to unroll the loop by a factor of 4. How would I do that? I can use either C intrinsics or asm code, but I don't understand how to do the 4x memory load in the first place.
Here is what my loop could look like in C:
Word32 sum = 0;
Word16 *pointer1;
Word16 *pointer2;
for (i = 0; i < 64; i++)
{
    sum += pointer1[i] * pointer2[i];
}
Any suggestions?
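For reference, the plain-C shape of a 4x unroll with separate accumulators could look like the sketch below (scalar code only, no intrinsics yet; it assumes the trip count of 64 from the question and that the pointers are valid):

Word32 sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
for (i = 0; i < 64; i += 4)
{
    sum0 += pointer1[i]     * pointer2[i];
    sum1 += pointer1[i + 1] * pointer2[i + 1];
    sum2 += pointer1[i + 2] * pointer2[i + 2];
    sum3 += pointer1[i + 3] * pointer2[i + 3];
}
sum = sum0 + sum1 + sum2 + sum3;

With 16-bit samples packed four to a 64-bit register, each group of four products then maps onto a single vrmpyh-style multiply-accumulate, which is what the FIR code in the answer below does.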
Here is a FIR filter implementation that demonstrates how to use Q6_P_vrmpyhacc_PP, the multiply halfword/accumulate. This instruction is described as 'big mac' in the PRM 😉
This instruction is in the scalar core so it does not require the HVX vector coprocessor.
// The Q6_* intrinsics used below are declared in the Hexagon SDK headers
// (e.g. hexagon_protos.h); short_8B_align is presumably an 8-byte-aligned
// short type defined elsewhere in the project.
void FIR08(short_8B_align Input[],
           short_8B_align Coeff[],
           short_8B_align Output[],
           int unused, int ntaps,
           int nsamples)
{
    Word64 *vInput = (Word64*)Input;
    Word64 *vCoeff = (Word64*)Coeff;
    Word64 *__restrict vOutput = (Word64*)Output;
    int i, j;
    Word64 sum0, sum1, sum2, sum3;

    for (i = 0; i < nsamples/4; i++)
    {
        sum0 = sum1 = sum2 = sum3 = 0;
        for (j = 0; j < ntaps/4; j++)
        {
            Word64 vIn1 = vInput[i+j];
            Word64 vIn2 = vInput[i+j+1];
            Word64 curCoeff = vCoeff[j];
            Word64 curIn;

            curIn = vIn1;
            sum0 = Q6_P_vrmpyhacc_PP(sum0, curIn, curCoeff);
            curIn = Q6_P_valignb_PPI(vIn2, vIn1, 2);
            sum1 = Q6_P_vrmpyhacc_PP(sum1, curIn, curCoeff);
            curIn = Q6_P_valignb_PPI(vIn2, vIn1, 4);
            sum2 = Q6_P_vrmpyhacc_PP(sum2, curIn, curCoeff);
            curIn = Q6_P_valignb_PPI(vIn2, vIn1, 6);
            sum3 = Q6_P_vrmpyhacc_PP(sum3, curIn, curCoeff);
        }
        Word64 curOut = Q6_P_combine_RR(Q6_R_combine_RhRh(sum3, sum2), Q6_R_combine_RhRh(sum1, sum0));
        vOutput[i + 1] = Q6_P_vasrh_PI(curOut, 2);
    }
}
EDIT: I've added the main, factorial, and trapGamma functions to give the full picture, but I am specifically asking about the for loop for iSum in the I function.
Basically, I've run out of ideas and have exhausted every resource I know of to find an answer to this. I need to write a program that computes a complex function representing an M/M/1 queue.
The function includes sub-functions such as calculating the integral of a gamma function and computing factorials. I've written all the code for the computations, but my sum is giving me huge numbers when I would expect nothing higher than about 0.35.
#include <math.h>
#include <stdio.h>
double I(int k, double t);
double trapGamma(double z);
unsigned long long int factorial(unsigned int n);
int main()
{
    int k;
    int i = 0;
    double t;          // time variable for the sweep below
    double dt = 0.1;

    printf("Ikx = [ \n");
    for (t = 14.0; t <= 15.0; t += dt)
    {
        printf("%f ", t);
        for (k = 1; k <= 10; k++)
        {
            I(k, t);
            printf("%f ", I(k, t));
        }
        printf("\n");
    }
    printf(" ];\n");
    return (0);
}
double I(int k, double t)
{
    unsigned long long int x;
    unsigned int n = 20;
    double numerator, y, pow1, c;
    double a, b;       // intermediate terms used below
    double iSum;
    double Ix;
    int i = 0;

    iSum = 0.0;
    Ix = 0.0;
    a = .25 * pow(t, 2);
    b = pow(a, i);
    x = factorial(n);
    y = trapGamma(k + i + 1);
    iSum = (b / (x * y));
    // This is the sum loop that I'm having trouble with. I've broken the iSum equation
    // down for my own readability while coding, right above this comment.
    for (i = 0; i <= 100; i++)
    {
        iSum += i;
    }
    Ix = (pow((.5 * t), k)) * iSum;
    return Ix;
}
/*
I've checked both the factorial and trapGamma functions and they are giving me the expected results.
*/
unsigned long long int factorial(unsigned int n)
{
    if (n <= 1)
        return 1;
    else
        return (n * factorial(n - 1));
}

double trapGamma(double z)
{
    int i, N = 100;
    double gamma;
    double a = 0.0;
    double b = 15.0;
    double x1, x2, y1, y2;
    double areai;
    double w = (b - a) / N;

    gamma = 0.0;
    for (i = 1; i < N; i++)
    {
        x1 = a + ((i - 1) * w);          // the left bound point
        x2 = a + (i * w);                // the right bound point
        y1 = pow(x1, z - 1) * exp(-x1);  // the height of our left bound
        y2 = pow(x2, z - 1) * exp(-x2);  // the height of our right bound
        areai = ((y1 + y2) / 2.0) * (x2 - x1);
        gamma += areai;
    }
    return gamma;
}
This builds upon another project where I used a Bessel function to create the M/M/1 queue over a 60-second span, so I can see what this one is supposed to look like. I've checked both my trapGamma and factorial functions' results on their own, and they are both working as expected.
How are summations supposed to be coded?
If the intent of the posted code is to calculate the modified Bessel function of the first kind I_k(t), there are some pitfalls and useful simplifications to be aware of. Given the series

    I_k(t) = Σ_{m=0}^{∞} (t/2)^(2m+k) / (m! · Γ(m + k + 1))

trying to calculate the factorial, the value of the Gamma function, their product and the powers separately for each term of the sum leads to integer overflow sooner rather than later.
It's better to update the value of each addend of the sum instead.
Also, given that k is a whole number, we have Γ(n) = (n − 1)! for whole n.
The addends get smaller and smaller and, after some iterations, become too small to change the sum, given the limited precision of type double.
// Evaluates x^k / k! trying not to overflow
double power_over_factorial(double x, int k)
{
    double result = 1.0;
    for (int i = 1; i <= k; ++i)
    {
        result *= x / i;
    }
    return result;
}

#define MAX_ITERS 20

double modified_Bessel_I(int k, double x)
{
    x /= 2;
    const double xx = x * x;
    double partial = power_over_factorial(x, k);
    double old_sum, sum = partial;
    int m = 1;
    do
    {
        old_sum = sum;
        partial *= xx / ((m + k) * m);
        sum += partial;
    }
    while (old_sum != sum && ++m < MAX_ITERS);
    return sum;
}
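A minimal usage sketch that mirrors the loop in the question's main() (call pattern only; no particular output values are claimed):

#include <stdio.h>

int main(void)
{
    for (double t = 14.0; t <= 15.0; t += 0.1)
    {
        printf("%f ", t);
        for (int k = 1; k <= 10; k++)
            printf("%f ", modified_Bessel_I(k, t));
        printf("\n");
    }
    return 0;
}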
In my code I had an error that I was able to pinpoint to a certain for loop, but I'm uncertain how exactly the problem is being caused.
int linregm(int n, float x[], float y[])
{
    float denom = 1;
    sumx  = 0.0;
    sumx2 = 0.0;
    sumxy = 0.0;
    sumy  = 0.0;
    /*
    for (i = 0; i < n; i++)
    {
        //sumx  = sumx + x[i];
        //sumx2 = sumx2 + x[i] * x[i];
        //sumxy = sumxy + x[i] * y[i];
        //sumy  = sumy + y[i];
    }
    */
    denom = (n * sumx2 - sqr(sumx));
    if (denom == 0) {
        // singular matrix. can't solve the problem.
        pass = 1;
        return 0;
    }
    m = (n * sumxy - sumx * sumy) / denom;
    pass = 0;
    return m;
}
This is the function that has a problem at the spot where the for loop is; even if I just run the loop without its internal contents, it still gives me an error.
m = linregm(sizeof(x)/sizeof(x[0]), x, y);
__IO float Send_float_Data[256];
float x[8] = {1,2,3,4,5,6,7,8};
float y[8];
float m,b;
unsigned int pass;
float sumx,sumx2,sumxy,sumy=0.0;
unsigned int i=0,j;
And these are the variable declarations.
I'm new to C, so I don't know if there are special interactions between variables or something else that could have messed up my code. There was no compiler warning, so syntax shouldn't be a problem. Do you have any idea what exactly I'm doing wrong? Thanks.
I'm trying to perform a calculation which involves the following C-function:
#include <complex.h>
#include <stdlib.h>

long double complex *tridiag_thomas(long double complex *a, long double complex *b,
                                    long double complex *c, long double complex *f, int N)
{
    long double complex *v = (long double complex *)malloc(sizeof(long double complex) * N);
    long double complex *y = (long double complex *)malloc(sizeof(long double complex) * N);
    long double complex w;
    int k;

    for (k = 0; k < N; k++) {
        y[k] = 0;
        v[k] = 0;
    }

    w = a[0];
    y[0] = f[0] / w;
    for (k = 1; k < N; k++) {
        v[k - 1] = c[k - 1] / w;
        w = a[k] - b[k] * v[k - 1];
        y[k] = (f[k] - b[k] * y[k - 1]) / w;
    }
    for (k = N - 2; k >= 0; k--) {
        y[k] = y[k] - v[k] * y[k + 1];
    }
    return y;
}
I pass a matrix through this function (f), find y, modify f with y, and pass the new f through the function again. I do this on the order of 1000 times. When working with real values (and making the necessary change of long double complex -> long double), this function works as expected. When using it in the above form with complex arguments, however, the result diverges to infinity very quickly.
Can anybody enlighten me as to why that might be? I'm not new to programming, but I am new to C.
I do see a problem: the function is leaking v. This makes me suspect that you are not following C-style ownership and assignment at the upper level; we don't have the upper-level code.
When is y freed? Does the assignment loop through all the values, or are you doing a C++-style assignment? Remember that allocation, assignment, and deallocation do not come for free in C, except for the types built into the compiler.
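A minimal sketch of the two fixes implied above (freeing the workspace v inside the function, and the caller freeing each returned y; the caller code is hypothetical):

// Inside tridiag_thomas, after the back-substitution loop:
free(v);       // v is only a workspace; release it before returning
return y;      // ownership of y passes to the caller

// Hypothetical caller pattern for the ~1000 iterations described:
long double complex *y = tridiag_thomas(a, b, c, f, N);
/* ... use y to update f ... */
free(y);       // without this, every iteration leaks another buffer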
I seem to be lost with this Fourier transform function. There's a sample program that I have but don't understand. gFFTworksp contains the data, and fftFrameSize is simply the frame size of the data. I don't understand how the function is supposed to put the FFT version of the data into fftBuffer if there is no part of the code where fftBuffer is actually edited or manipulated. Thank you in advance!
The function call is this:
static float gFFTworksp[2*MAX_FRAME_LENGTH];
long fftFrameSize;
smbFft(gFFTworksp, fftFrameSize, -1);
The function in question is this:
void smbFft(float *fftBuffer, long fftFrameSize, long sign)
/*
    FFT routine, (C)1996 S.M.Bernsee. Sign = -1 is FFT, 1 is iFFT (inverse)
    Fills fftBuffer[0...2*fftFrameSize-1] with the Fourier transform of the
    time domain data in fftBuffer[0...2*fftFrameSize-1]. The FFT array takes
    and returns the cosine and sine parts in an interleaved manner, ie.
    fftBuffer[0] = cosPart[0], fftBuffer[1] = sinPart[0], asf. fftFrameSize
    must be a power of 2. It expects a complex input signal (see footnote 2),
    ie. when working with 'common' audio signals our input signal has to be
    passed as {in[0],0.,in[1],0.,in[2],0.,...} asf. In that case, the transform
    of the frequencies of interest is in fftBuffer[0...fftFrameSize].
*/
{
    float wr, wi, arg, *p1, *p2, temp;
    float tr, ti, ur, ui, *p1r, *p1i, *p2r, *p2i;
    long i, bitm, j, le, le2, k;

    for (i = 2; i < 2*fftFrameSize-2; i += 2) {
        for (bitm = 2, j = 0; bitm < 2*fftFrameSize; bitm <<= 1) {
            if (i & bitm) j++;
            j <<= 1;
        }
        if (i < j) {
            p1 = fftBuffer+i; p2 = fftBuffer+j;
            temp = *p1; *(p1++) = *p2;
            *(p2++) = temp; temp = *p1;
            *p1 = *p2; *p2 = temp;
        }
    }
    for (k = 0, le = 2; k < (long)(log(fftFrameSize)/log(2.)+.5); k++) {
        le <<= 1;
        le2 = le>>1;
        ur = 1.0;
        ui = 0.0;
        arg = M_PI / (le2>>1);
        wr = cos(arg);
        wi = sign*sin(arg);
        for (j = 0; j < le2; j += 2) {
            p1r = fftBuffer+j; p1i = p1r+1;
            p2r = p1r+le2; p2i = p2r+1;
            for (i = j; i < 2*fftFrameSize; i += le) {
                tr = *p2r * ur - *p2i * ui;
                ti = *p2r * ui + *p2i * ur;
                *p2r = *p1r - tr; *p2i = *p1i - ti;
                *p1r += tr; *p1i += ti;
                p1r += le; p1i += le;
                p2r += le; p2i += le;
            }
            tr = ur*wr - ui*wi;
            ui = ur*wi + ui*wr;
            ur = tr;
        }
    }
}
In the following line:
p1 = fftBuffer+i; p2 = fftBuffer+j;
p1 and p2 are set to point into the fftBuffer array itself. And in these lines:
*(p2++) = temp; temp = *p1;
*p1 = *p2; *p2 = temp;
the values at those memory locations are changed. Because p1 and p2 (and later p1r, p1i, p2r, p2i) point into fftBuffer, every write through them modifies fftBuffer in place; that is how the transform ends up back in the buffer you passed in.
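Here is a tiny standalone example of the same mechanism, i.e. writing through a pointer modifies the caller's array in place (the names are made up for illustration):

#include <stdio.h>

/* Doubles every element of buf in place, the same way smbFft
   writes its results back through its p1/p2-style pointers. */
static void double_in_place(float *buf, int n)
{
    float *p = buf;                 /* p points into the caller's array */
    for (int i = 0; i < n; i++, p++)
        *p = *p * 2.0f;             /* writing through p changes buf[i] */
}

int main(void)
{
    float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    double_in_place(data, 4);
    printf("%f %f %f %f\n", data[0], data[1], data[2], data[3]);  /* 2 4 6 8 */
    return 0;
}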