I am looking for some help or hints to speed up my code.
I have implemented a routine for computing the gravitational potential at a point (r,phi,lambda) in space from a set of spherical harmonic coefficients C_{n,m} and S_{n,m}. The equation is shown in the link below:
and includes the recursive computation of the latitude (phi) dependent associated Legendre polynomials P_{n,m}, starting with the first two values P{0,0} and P_{1,1}.
At first, I had this implemented as a MATLAB C-MEX code, with only the core part of my code in C-language. I wanted to make a pure C-routine, but found that the code runs 3-5 times slower, which makes me wonder why. Could it be the way I define my structures and use pointers to pointers in the central code?
It seems like it is the core computation part that takes the extra time, but that part did not change although before I was passing using pointers directly to variables and now I am using pointers inside structures.
Any help is appreciated!
In the following, I will try to explain my code and show some extracts:
At the beginning of the program, I define three structures. One to hold the spherical harmonic coefficients, C_{n,m} and S_{n,m}, (ggm_struct), one to hold the computation coordinates (comp_struct) and one to hold the results (func_struct):
// Define constant variables
const double deg2rad = M_PI/180.0; // degrees to radians conversion factor
const double sfac = 1.0000E-280; // scaling factor
const double sqrt2 = 1.414213562373095; // sqrt(2)
const double sqrt3 = 1.732050807568877; // sqrt(3)
// Define structure to hold geopotential model
struct ggm_struct {
char product_type[100], modelname[100], errors[100], norm[100], tide_system[100];
double GM, R, *C, *S;
int max_degree, ncoef;
};
// Define structure to hold computation coordinates
struct comp_struct {
double *lat, *lon, *h;
double *r, *phi;
int nlat, nlon;
};
/* Define structure to hold results */
struct func_struct {
double *rval;
int npoints;
};
I then have a (sub-)function that starts by allocating space and then loads the coefficients from an ascii file as
int read_gfc(char mfile[100], int *nmax, int *mmax, struct ggm_struct *ggm)
{
// Set file identifier
FILE *fid;
// Declare variables
char str[200], var[100];
int n, m, nid, l00 = 0, l10 = 0, l11 = 0;
double c, s;
// Determine number of coefficients
ggm->ncoef = (*nmax+2)*(*nmax+1)/2;
// Allocate memory for coefficients
ggm->C = (double*) malloc(ggm->ncoef*sizeof(double));
if (ggm->C == NULL){
printf("Error: Memory for C not allocated!");
return -ENOMEM;
}
ggm->S = (double*) malloc(ggm->ncoef*sizeof(double));
if (ggm->S == NULL){
printf("Error: Memory for S not allocated!");
return -ENOMEM;
}
// Open file
fid = fopen(mfile,"r");
// Check that file was opened correctly
if (fid == NULL){
printf("Error: opening file %s!",mfile);
return -ENOMEM;
}
// Read file header
while (fgets(str,200,fid) != NULL && strncmp(str,"end_of_head",11) != 0){
// Extract model parameters
if (strncmp(str,"product_type",12) == 0){ sscanf(str,"%s %s",var,ggm->product_type); }
if (strncmp(str,"modelname",9) == 0){ sscanf(str,"%s %s",var,ggm->modelname); }
if (strncmp(str,"earth_gravity_constant",22) == 0){ sscanf(str,"%s %lf",var,&ggm->GM); }
if (strncmp(str,"radius",6) == 0){ sscanf(str,"%s %lf",var,&ggm->R); }
if (strncmp(str,"max_degree",10) == 0){ sscanf(str,"%s %d",var,&ggm->max_degree); }
if (strncmp(str,"errors",6) == 0){ sscanf(str,"%s %s",var,ggm->errors); }
if (strncmp(str,"norm",4) == 0){ sscanf(str,"%s %s",var,ggm->norm); }
if (strncmp(str,"tide_system",11) == 0){ sscanf(str,"%s %s",var,ggm->tide_system); }
}
// Read coefficients
while (fgets(str,200,fid) != NULL){
// Extract parameters
sscanf(str,"%s %d %d %lf %lf",var,&n,&m,&c,&s);
// Store parameters
if (n <= *nmax && m <= *mmax) {
// Determine index
nid = (n+1)*n/2 + m;
// Store values
*(ggm->C+nid) = c;
*(ggm->S+nid) = s;
}
}
// Close fil
fclose(fid);
// Return from function
return 0;
}
Afterwards, the computation grid is defined by an array of seven components. As an example, the array [-90 90 -180 180 1 1 0] defines a grid from -90 to 90 degrees latitude at 1-degree increments and from -180 to 180 degrees longitude at 1-degree increments. The height is zero. From this array, a computation grid is generated in a (sub-)function:
int make_grid(double *grid, struct comp_struct *inp)
{
// Declare variables
int n;
/* Echo routine */
printf("Creating grid of coordinates\n");
printf(" [lat1,lat2,dlat] = [%f,%f,%f]\n", *grid, *(grid+1), *(grid+4) );
printf(" [lon1,lon2,dlon] = [%f,%f,%f]\n", *(grid+2), *(grid+3), *(grid+5) );
printf(" h = %f\n", *(grid+6));
/* Latitude ------------------------------------------------------------- */
// Determine number of increments
inp->nlat = ceil( ( *(grid+1) - *grid + *(grid+4) ) / *(grid+4) );
// Allocate memory
inp->lat = (double*) malloc(inp->nlat*sizeof(double));
if (inp->lat== NULL){
printf("Error: Memory for LATITUDE (inp.lat) points not allocated!");
return -ENOMEM;
}
// Fill in values
*(inp->lat) = *(grid+1);
for (n = 1; n < inp->nlat-1; n++) {
*(inp->lat+n) = *(inp->lat+n-1) - *(grid+4);
}
*(inp->lat+inp->nlat-1) = *grid;
/* Longitude ------------------------------------------------------------ */
// Determine number of increments
inp->nlon = ceil( ( *(grid+3) - *(grid+2) + *(grid+5) ) / *(grid+5) );
// Allocate memory
inp->lon = (double*) malloc(inp->nlon*sizeof(double));
if (inp->lon== NULL){
printf("Error: Memory for LONGITUDE (inp.lon) points not allocated!");
return -ENOMEM;
}
// Fill in values
*(inp->lon) = *(grid+2);
for (n = 1; n < inp->nlon-1; n++) {
*(inp->lon+n) = *(inp->lon+n-1) + *(grid+5);
}
*(inp->lon+inp->nlon-1) = *(grid+3);
/* Height --------------------------------------------------------------- */
// Allocate memory
inp->h = (double*) malloc(inp->nlat*sizeof(double));
if (inp->h== NULL){
printf("Error: Memory for HEIGHT (inp.h) points not allocated!");
return -ENOMEM;
}
// Fill in values
for (n = 0; n < inp->nlat; n++) {
*(inp->h+n) = *(inp->h+n-1) + *(grid+6);
}
// Return from function
return 0;
}
These geographic coordinates are then transformed to spherical coordinates for the computation using another (sub-)routine
int geo2sph(struct comp_struct *inp, int *lgrid)
{
// Declare variables
double a = 6378137.0, e2 = 6.69437999014E-3; /* WGS84 parameters */
double x, y, z, sinlat, coslat, sinlon, coslon, R_E;
int i, j, nid;
/* Allocate space ------------------------------------------------------- */
// radius
inp->r = (double*) malloc(inp->nlat*sizeof(double));
if (inp->r== NULL){
printf("Error: Memory for SPHERICAL DISTANCE (inp.r) points not allocated!");
return -ENOMEM;
}
// phi
inp->phi = (double*) malloc(inp->nlat*sizeof(double));
if (inp->phi== NULL){
printf("Error: Memory for SPHERICAL LATITUDE (inp.phi) points not allocated!");
return -ENOMEM;
}
/* Loop over latitude =================================================== */
for (i = 0; i < inp->nlat; i++) {
// Compute sine and cosine of latitude
sinlat = sin(*(inp->lat+i));
coslat = cos(*(inp->lat+i));
// Compute radius of curvature
R_E = a / sqrt( 1.0 - e2*sinlat*sinlat );
// Compute sine and cosine of longitude
sinlon = sin(*(inp->lon));
coslon = cos(*(inp->lon));
// Compute rectangular coordinates
x = ( R_E + *(inp->h+i) ) * coslat * coslon;
y = ( R_E + *(inp->h+i) ) * coslat * sinlon;
z = ( R_E*(1.0-e2) + *(inp->h+i) ) * sinlat;
// Compute sqrt( x^2 + y^2 )
R_E = sqrt( x*x + y*y );
// Derive radial distance
*(inp->r+i) = sqrt( R_E * R_E + z*z );
// Derive spherical latitude
if (R_E < 1) {
if (z > 0) { *(inp->phi+i) = M_PI/2.0; }
else { *(inp->phi+i) = -M_PI/2.0; }
}
else {
*(inp->phi+i) = asin( z / *(inp->r+i) );
}
}
// Return from function
return 0;
}
Finally, the gravitational potential is computed within is own (sub-)function. This is the core part of the code, which is more or less the same as for the MATLAB C-MEX function. The only difference seems to be that before (in MATLAB MEX) everything was defined as (simple) double variables - now the variables are located inside a structure which contains pointers.
int gravpot(struct comp_struct *inp, struct ggm_struct *ggm, int *nmax,
int *mmax, int *lgrid, struct func_struct *out)
{
// Declare variables
double GMr, ar, t, u, u2, arn, gnm, hnm, P, Pp1, Pp2, msum;
double Pmm[*nmax+1], CPnm[*mmax+1], SPnm[*mmax+1];
int i, j, n, m, id;
// Allocate memory
out->rval = (double*) malloc(inp->nlat*inp->nlon*sizeof(double));
if (out->rval== NULL){
printf("Error: Memory for OUTPUT (out.rval) not allocated!");
return -ENOMEM;
}
/* Compute sectorial values of associated Legendre polynomials ========== */
// Define seed values ( divided by u^m )
Pmm[0] = sfac;
Pmm[1] = sqrt3 * sfac;
// Compute sectorial values, [1] eq. 13 and 28 ( divided by u^m )
for (m = 2; m <= *nmax; m++) {
Pmm[m] = sqrt( (2.0*m+1.0) / (2.0*m) ) * Pmm[m-1];
}
/* ====================================================================== */
/* Loop over latitude =================================================== */
for (i = 0; i < inp->nlat; i++) {
// Compute ratios to be used in summation
GMr = ggm->GM / *(inp->r+i);
ar = ggm->R / *(inp->r+i);
/* ---------------------------------------------------------------------
* Compute product of Legendre values and spherical harmonic coefficients.
* Products of similar degree are summed together, resulting in mmax
* values. The degree terms are latitude dependent, such that these mmax
* sums are valid for every point with the same latitude.
* The values of the associated Legendre polynomials, Pnm, are scaled by
* sfac = 10^(-280) and divided by u^m in order to prevent underflow and
* overflow of the coefficients.
* ------------------------------------------------------------------ */
// Form coefficients for Legendre recursive algorithm
t = sin(*(inp->phi+i));
u = cos(*(inp->phi+i));
u2 = u * u;
arn = ar;
/* Degree n = 0 terms ----------------------------------------------- */
// Compute order m = 0 term (S term is zero)
CPnm[0] = Pmm[0] * *(ggm->C);
/* Degree n = 1 terms ----------------------------------------------- */
// Compute (1,1) terms, [1] eq. 3
CPnm[1] = ar * Pmm[1] * *(ggm->C+2);
SPnm[1] = ar * Pmm[1] * *(ggm->S+2);
// Compute (1,0) Legendre value, [1] eq. 18 and 27
P = t * Pmm[1];
// Add (1,0) terms to sum (S term is zero), [1] eq. 3
CPnm[0] = CPnm[0] + ar * P * *(ggm->C+1);
/* Degree n = [2,n_max] --------------------------------------------- */
for (n = 2; n <= *nmax; n++) {
// Compute power term
arn = arn * ar;
/* Compute sectorial (m=n) terms ++++++++++++++++++++++++++++++++ */
// Extract associated Legendre value
Pp1 = Pmm[n];
// Compute product terms, [1] eq. 3
if (n <= *mmax) {
id = (n+1)*n/2 + n;
CPnm[n] = arn * Pp1 * *(ggm->C+id);
SPnm[n] = arn * Pp1 * *(ggm->S+id);
}
/* Compute first non-sectorial terms (m=n-1) ++++++++++++++++++++ */
// Compute associated Legendre value, [1] eq. 18 and 27
gnm = sqrt( 2.0*n );
P = gnm * t * Pp1;
// Add terms to summation, eq. 3 in [1]
if (n-1 <= *mmax) {
id = (n+1)*n/2 + n - 1;
CPnm[n-1] = CPnm[n-1] + arn * P * *(ggm->C+id);
SPnm[n-1] = SPnm[n-1] + arn * P * *(ggm->S+id);
}
/* Compute terms of order m = [n-2,1] +++++++++++++++++++++++++++ */
for (m = n-2; m > 0; m--) {
// Set previous values
Pp2 = Pp1;
Pp1 = P;
// Compute associated Legendre value, [1] eq. 18, 19 and 27
gnm = 2.0*(m+1.0) / sqrt( (n-m)*(n+m+1.0) );
hnm = sqrt( (n+m+2.0)*(n-m-1.0)/(n-m)/(n+m+1.0) );
P = gnm * t * Pp1 - hnm * u2 * Pp2;
// Add product terms to summation, eq. 3 in [1]
if (m <= *mmax) {
id = (n+1)*n/2 + m;
CPnm[m] = CPnm[m] + arn * P * *(ggm->C+id);
SPnm[m] = SPnm[m] + arn * P * *(ggm->S+id);
}
}
/* Compute zonal terms (m=0) ++++++++++++++++++++++++++++++++++++ */
// Compute associated Legendre value, [1] eq. 18, 19 and 27
gnm = 2.0 / sqrt( n*(n+1.0) );
hnm = sqrt( (n+2.0)*(n-1.0)/n/(n+1.0) );
P = ( gnm * t * P - hnm * u2 * Pp1 ) / sqrt2;
// Add terms to summation (S term is zero), [1] eq. 3
id = (n+1)*n/2;
CPnm[0] = CPnm[0] + arn * P * *(ggm->C+id);
} /* ---------------------------------------------------------------- */
/* Loop over longitude ============================================== */
for (j = 0; j < inp->nlon; j++) {
/* -----------------------------------------------------------------
* All associated Legendre polynomials (latitude dependent) are now
* computed and multiplied by the corresponding spherical harmonic
* coefficient. These products are scaled by u^m, meaning that
* Horner's scheme is used in the following summation.
* -------------------------------------------------------------- */
// Initialise order-dependent sum
msum = 0.0;
// Derive longitude id
id = j + i * *lgrid;
// Loop over order (m > 0)
for (m = *mmax; m > 0; m--) {
// Add to order-dependent sum using Horner's scheme, [1] eq. 2, 3 and 31
msum = ( msum + cos( m * *(inp->lon+id) ) * CPnm[m]
+ sin( m * *(inp->lon+id) ) * SPnm[m] ) * u;
}
// Add zero order term to sum
msum = msum + CPnm[0];
// Rescale value into gravitational potential, [1] eq. 1
*(out->rval+i+j*inp->nlat) = GMr * msum / sfac;
} /* ================================================================ */
} /* ==================================================================== */
// Return from function
return 0;
}
Again, any help is greatly appreciated and additional information can be supplied if relevant, but this already became a long post. I have a hard time accepting that pure c-code runs slower than the MATLAB C-MEX code.
To put it simply, yes, pointers can prevent some compiler optimizations resulting in a potential slow down. At least, this is clearly the case with ICC and a bit with GCC. The performance of the program is strongly impacted by pointer aliasing and vectorization.
Indeed, the compiler cannot easily know if the provided pointers alias each other or alias with the address with some fields of the provided data structure. As a result, the compilers tends to be conservative and assume that the pointed value can change at any time and reload them often. This can prevent optimizations like the splitting of some loops in gravpot (with GCC -- see line 119 of this modified code). Moreover, indirections and aliasing tends to prevent the vectorization of the hot loops (ie. the use of SIMD instructions provided by the target processor). Vectorisation can strongly impact the performance of a code.
To give an example, here is the initial code of geo2sph and here is a slightly modified implementation. In the first case, ICC generate a slow scalar implementation, while in the second case, ICC generate a significantly faster vectorized implementation. The only difference between the two implementation is the use of the restrict keyword. This keyword tell to the compiler that for the lifetime of the pointer, only the pointer itself or a value directly derived from it (such as pointer+1) will be used to access the object to which it points (see here for more information). Note that the use of the restrict keyword is dangerous and one should be very careful while using it since the compiler may generate a bad code if the restrict hint is wrong (very hard to debug). Alternatively, you can help the compiler to generate a vectorized code using the OpenMP SIMD directive #pragma omp simd (see here for the result). Note that you should be sure the target code can be safely vectorized (eg. iterations of must be independent).
This is the function I have written for 2D Convolution in C:
typedef struct PGMImage{
int w;
int h;
int* data;
}GrayImage;
GrayImage Convolution2D(GrayImage image,GrayImage kernel){
int aH,aW,bW,bH,r,c,x,y,xx,yy,X,Y;
int temp = 0;
GrayImage conv;
CreateGrayImage(&conv,image.w,image.h);
aH = image.h;
aW = image.w;
bH = kernel.h;
bW = kernel.w;
if(aW < bW || aH < bH){
fprintf(stderr,"Image cannot have smaller dimensions than the blur kernel");
}
for(r = aH-1;r >= 0;r--){
for(c = aW-1;c >= 0;c--){
temp = 0;
for(y = bH-1;y >= 0;y--){
yy = bH - y -1;
for(x = bW-1;x >= 0;x--){
xx = bW - x - 1;
X = c + (x - (bW/2));
Y = r + (y - (bH/2));
if(X >= 0 && X < aW && Y >= 0 && Y < aH){
temp += ((kernel.data[(yy*bW)+xx])*(image.data[(Y*aW)+X]));
}
}
}
conv.data[(r*aW)+c] = temp;
}
}
return conv;
}
I reproduced this function in Matlab and found that it overestimates the values for certain pixels as compared to the regular 2D Convolution function in Matlab (conv2D). I can't figure out where I am going wrong with the logic. Please help.
EDIT:
Here's the stock image I am using (512*512):
https://drive.google.com/file/d/0B3qeTSY-DQRvdWxCZWw5RExiSjQ/view?usp=sharing
Here's the kernel (3*3):
https://drive.google.com/file/d/0B3qeTSY-DQRvdlQzamcyVmtLVW8/view?usp=sharing
On using the above function I get
46465 46456 46564
45891 46137 46158
45781 46149 46030
But Matlab's conv2 gives me
46596 46618 46627
46073 46400 46149
45951 46226 46153
for the same pixels (rows:239-241,col:316:318)
This is the Matlab code I am using to compare the values:
pgm_img = imread('path\to\lena512.pgm');
kernel = imread('path\to\test_kernel.pgm');
sz_img = size(pgm_img);
sz_ker = size(kernel);
conv = conv2(double(pgm_img),double(kernel),'same');
pgm_img = padarray(pgm_img,floor(0.5*sz_ker),'both');
convolve = zeros(sz_img);
for i=floor(0.5*sz_ker(1))+1:floor(0.5*sz_ker(1))+sz_img(1)
for j=floor(0.5*sz_ker(2))+1:floor(0.5*sz_ker(2))+sz_img(2)
startX = j - floor(sz_ker(2)/2);
startY = i - floor(sz_ker(1)/2);
endX = j + floor(sz_ker(2)/2);
endY = i + floor(sz_ker(1)/2);
block = pgm_img(startY:endY,startX:endX);
prod = double(block).*double(kernel);
convolve(i-floor(0.5*sz_ker(1)),j-floor(0.5*sz_ker(2))) = sum(sum(prod));
end
end
disp(conv(239:241,316:318));
disp(convolve(239:241,316:318));
One obvious difference is that your c code uses ints, while the matlab code uses doubles. Change your c code to use doubles, and see if the results are still different.
I created Image Convolution library for simple cases of an image which is a simple 2D Float Array.
The function supports arbitrary kernels and verified against MATLAB's implementation.
So all needed on your side is calling it with your generated Kernel.
You can use its generated DLL inside MATLAB and see it yields same results as MATLAB's Image Convolution functions.
Image Convolution - GitHub.
I'm trying to implement a simple script performing a PI control for a cruise control application, but I'm founding some problems with the integral part. Here is my code:
function [] = PI_cruisecontrol()
clc; close all;
t0 = 0; tfinal = 50; dt = 0.001; % time parameters
r = 10; % reference of 10 m/s
m = 1000; % mass
b = 50; % friction coeff. (depends on v)
yp = zeros(tfinal/dt,1); t = yp; % initialize speed and time array
Ki = 40; % integrarl constant
Kp = 800; % proportional constant
int = 0; % itinialize int error
% CONTROL LOOP (Forward-Euler integrator is used to solve the ODE)
for i=t0+2:tfinal/dt
err = r-yp(i-1); % update error
int = int+err; % integral term
u = (Kp*err)+(Ki*int*dt); % action of control
yp(i) = yp(i-1)+((-b*yp(i)/m) + (u/m))*dt; % solve ode for speed
t(i) = t(i)+dt*i; % log the time
end
% Results
figure(1)
plot(t,yp)
title ('Step Response')
xlabel('Time (seconds)')
ylabel('Amplitud')
axis([0 20 0 12])
hold on
reference = ones(tfinal/dt,1)*10;
plot(t,reference,':')
end
And this is how it should be, using predefinided matlab functions:
function [] = PI_cruisecontrol2()
m = 1000;
b = 50;
r = 10;
s = tf('s');
P_cruise = 1/(m*s + b);
Kp = 800;
Ki = 40;
C = pid(Kp,Ki);
T = feedback(C*P_cruise,1);
t = 0:0.1:20;
step(r*T,t)
axis([0 20 0 12])
end
What am I doing wrong in my code?
Thanks!
I managed to fix the problem, working with float variables instead of arrays. Moreover, I added the derivative term (although for this first order problem was not necessary)
Here I left the code:
function [] = aFortran_PI()
clc; close all;
r = 10; % reference of 10 m/s
m = 1000; % mass
b = 50; % friction coeff. (depends on v)
yp = 0; % init response
Kp = 800; % proportional constant
Ki = 40; % proportional constant
Kd = 0; % derivative term is not necessary in this problem
previous_error = 0;
integral = 0;
dt = 0.001;
% CONTROL LOOP
for i=1:20000
error = r-yp; % update error
integral = integral + error*dt; % integral term
derivative = (error-previous_error)/dt; % derivative term
u = Kp*error+Ki*integral+Kd*derivative; % action of control
yp = yp+(-b*yp/m + u/m)*dt % solve ode for velocity
end
end
As I wrote in my previous topic: Benchmarking code - am I doing it right? I need to find a way to get benchmark statistics, like average, mean, standard deviation, etc. How can I do this using those methods I posted? Notice that I use a solution to benchmark code with time interval, not by calling a function many times. Any ideas?
I came up with just one, dont know if its correct (pseudocode):
buffsize = 1024;
buffer [buffsize];
totalcycles = 0
// arrays
walltimeresults = []
cputimeresults = []
// benchmarking
for i in (0, iterations):
start = walltime();
fun2measure(args, buffer);
end = walltime();
walltimeresults[i] = end - start;
start = cputime();
fun2measure(args, buffer);
end = cputime();
cputimeresults[i] = end - start;
c1 = cyclecount();
fun2measure(args, buffer);
c2 = cyclecount();
cyclesperbyte = c2-c1/(buffsize);
totalcycles += cyclesperbyte;
for i in range (0, iterations) : sum += walltimeresults[i];
avg_wall_time = sum / iterations;
sum = 0;
for i in range (0, iterations) : sum += cputimeresults[i];
avg_cpu_time = sum / iterations;
avg_cycles = totalcycles / iterations;
Is it correct? How about mean, standard deviation, etc?
Your average looks OK.
Mean (i.e. average) is
mean = 1/N * sum( x[i] )
Standard deviation is square root of variance:
sigma = sqrt( 1/N * sum( (x[i]-mean)^2 )