Imagine that you have a particle in each coordinate of a 2D Cartesian plane. Each particle emits a substance that difuses in all directions, with a decay over distance based on a Bessel function, and the other particles each absorb this substance. Thus all particles at same distance from a given particle have the same influence on that particle. Something such as
I'm calculating such an interaction using this code:
EDIT:31/03: Complete code for both.
#include <stdio.h> // para as rotinas de entrada e saída
#include <stdlib.h> //
#include <stdarg.h> // para importar os elementos da linha de comando
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <malloc.h>
#include <time.h>
#include"ran1.c"
#include"bessel.c"
#define tmax 90000
#define N 50
#define beta 0.001
#define sigma 0.001
#define pi acos(-1.0)
#define trans 50000
#define epsilon 0.1
void condicoes_iniciais(double **xold,double **yold,double **a)
{
int l,j;
long idum=-120534;
for(l=0;l<= N; l++)
{
for(j=0;j<= N; j++)
{
a[l][j]=5.0;
}
}
for(l=0;l<= N; l++)
{
for(j=0;j<= N; j++)
{
while(a[l][j]>4.4)
a[l][j]=4.1+ran1(& idum);
}
}
for(l=0;l<= N; l++)
{
for(j=0;j<= N; j++)
{
xold[l][j]=0.1*ran1(& idum);
}
}
for(l=0;l<= N; l++)
{
for(j=0;j<= N; j++)
{
yold[l][j]=0.1*ran1(& idum);
}
}
}
void Matriz_Bessel(double **Bess,double gama)
{
int x,y;
double r;
for(x=0;x<=N;x++)
{
for(y=0;y<=N;y++)
{
if(y!=0 || x!=0)
{
r = gama*sqrt(x*x +y*y);
Bess[x][y] = bessk0(r);
}
}
}
}
void acoplamento(int x, int y,double **xold, double *Acopl,double **Bess)
{
int j, i, h, k,xdist, ydist;
int Nmeio = N/2;
double Xf;
Xf = 0;
for(i=0;i<=N;i++)
{
for(j=0;j<=N;j++)
{
h = x+i;
k = y+j;
ydist = j;
xdist = i;
if(i>Nmeio)
{
h = x +i;
xdist = (N+1) -h +x;
}
if(h>N)
{
h=h-(N+1);
xdist = x-h;
if(xdist >Nmeio){xdist = i;
}
}
if(j>Nmeio)
{
k = y +j;
ydist = (N+1) -k +y;
}
if(k>N)
{
k=k-(N+1);
ydist = y-k;
if(ydist >Nmeio){ydist = j;
}
}
if(ydist!=0 || xdist!=0)
{
Xf = Xf +Bess[xdist][ydist]*xold[h][k];
}
}
}
*Acopl = Xf;
}
void constante(double *c, double gama, double **Bess){
double soma;
int x, y;
soma = 0;
for(x=0;x<=(N/2);x++)
{
for(y=0;y<=(N/2);y++)
{
if(y!=0 || x!=0)
{
soma = soma +Bess[x][y];
}
}
}
*c = (1/(4*soma));
}
int main(int argc, char* argv[])
{
double **xold, **xnew, **yold, **ynew, **a;
double gama, C;
int x,y;
int t,i;
double Mn, acopl;
char arqnome[100];
FILE *fout;
double **Bess;
Bess= (double**)malloc(sizeof(double*)*(N+3));
for(i=0; i<(N+3); i++){Bess[i] = (double*)malloc(sizeof(double)* (N+3));}
xold= (double**)malloc(sizeof(double*)*(N+3));
for(i=0; i<(N+3); i++){xold[i] = (double*)malloc(sizeof(double)* (N+3));}
yold= (double**)malloc(sizeof(double*)*(N+3));
for(i=0; i<(N+3); i++){yold[i] = (double*)malloc(sizeof(double)*(N+3));}
xnew= (double**)malloc(sizeof(double*)*(N+3));
for(i=0; i<(N+3); i++){xnew[i] = (double*)malloc(sizeof(double)*(N+3));}
ynew= (double**)malloc(sizeof(double*)*(N+3));
for(i=0; i<(N+3); i++){ynew[i] = (double*)malloc(sizeof(double)*(N+3));}
a= (double**)malloc(sizeof(double*)*(N+3));
for(i=0; i<(N+3); i++){a[i] = (double*)malloc(sizeof(double)*(N+3));}
srand (time(NULL));
gama = 0.005;
sprintf(arqnome,"serie_%.3f_%.3f.dat",gama,epsilon);
fout = fopen(arqnome,"w");
Matriz_Bessel(Bess,gama);
condicoes_iniciais(xold,yold,a);
a[0][0] = 4.1;
a[N/2][N/2] = 4.3;
constante(&C, gama,Bess);
for(t=0;t<=tmax;t++)
{
Mn = 0;
for(x=0;x<=N;x++)
{
for(y=0;y<=N;y++)
{
acoplamento(x,y,xold,&acopl,Bess);
xnew[x][y] = (a[x][y]/(1+xold[x][y]*xold[x][y])) +yold[x][y] + epsilon*C*acopl;
ynew[x][y] = yold[x][y] - sigma*xold[x][y] - beta;
Mn = Mn + xnew[x][y];
xold[x][y] = xnew[x][y];
yold[x][y] = ynew[x][y];
}
}
if(t>trans){fprintf(fout,"%d %f %f %f %f %f\n",(t-trans),xold[0][0],yold[0][0],xold[N/2][N/2],yold[N/2][N/2],Mn/((N+1)*(N+1)));}
}
return 0;
}
Bess[N][N] is the Bessel function for each radius, with is calculated using numerical recipes. This program take around 1 hour to finish.
With the sugestion of Francis i have
#include <fftw3.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include"bessel.c"
#include"ran1.c"
#define tmax 90000
#define beta 0.001
#define N 50
#define sigma 0.001
#define pi acos(-1.0)
#define trans 50000
#define epsilon 0.1
void condicoes_iniciais(double *xold,double *yold,double *a)
{
int l;
long idum=-120534;
for(l=0;l<= N*N; l++){
a[l]=5.0;}
for(l=0;l<= N*N; l++){
while(a[l]>4.4)
a[l]=4.1+ran1(& idum);}
for(l=0;l<=N* N; l++){
xold[l]=0.1*ran1(& idum);
yold[l]=0.1*ran1(& idum);}
a[0]=4.1;
a[N]=4.4;
}
void Matriz_Bessel(double *bessel,double gama)
{
int x,y,i,j;
double dist;
for(x=0,i=-N/2;x<N;x++,i++)
{
for(y=0,j=-N/2;y<N;y++,j++)
{
double dist=sqrt(i*i+j*j);
if(dist>0){
bessel[x*N+y]=bessk0(gama*dist);
}
else{
bessel[x*N+y]=1;
}
}
}
}
void constante(double *c, double *bessel)
{
int x;
int y;
double soma = 0;
for(x=0;x<N;x++){
for(y=0;y<N;y++){
soma = soma + bessel[x*N+y];
}}
*c =(1/(4*soma));
}
int main(int argc, char* argv[]){
double *xnew=fftw_malloc(sizeof(double)*N*N);
double *acopl=fftw_malloc(sizeof(double)*N*N);
double *xold=malloc(sizeof(double)*N*N);
double *yold = malloc(sizeof(double)*N*N);
double *a = malloc(sizeof(double)*N*N);
fftw_complex *xfourier;
xfourier = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
fftw_complex *aux;
aux= (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
double *bessel= fftw_malloc(sizeof(double)*N*N);
fftw_complex *besself;
besself=fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
double scale=1.0/(N*N);
int t,i;
double gama,Mn,C;
gama = 0.005;
char arqnome[1000];
FILE *fout;
sprintf(arqnome,"opt2_tamanho_plato_%.3f_%d.dat",gama,N);
fout = fopen(arqnome,"w");
//initial
printf("initial\n");
condicoes_iniciais(xold,yold,a);
//xold[(N/2)*N+N/2]=1;
// fftw_plan
printf("fftw_plan\n");
fftw_plan plan;
plan=fftw_plan_dft_r2c_2d(N, N, xnew, xfourier, FFTW_MEASURE | FFTW_PRESERVE_INPUT);
fftw_plan planb;
planb=fftw_plan_dft_r2c_2d(N, N,(double*) bessel, besself, FFTW_MEASURE);
fftw_plan plani;
plani=fftw_plan_dft_c2r_2d(N, N, aux, acopl, FFTW_MEASURE);
Matriz_Bessel(bessel,gama);
constante(&C, bessel);
fftw_execute(planb);
//time loop
printf("time loop\n");
for(t=0;t<=tmax;t++){
//convolution= products in fourier space
fftw_execute(plan);
for(i=0;i<N*(N/2+1);i++){
aux[i][0]=(xfourier[i][0]*besself[i][0]-xfourier[i][2]*besself[i][3]);
aux[i][4]=(xfourier[i][0]*besself[i][5]+xfourier[i][6]*besself[i][0]);
}
fftw_execute(plani);//xnew is updated
Mn = 0;
for(i=0;i<N*N;i++){
xnew[i]=(a[i]/(1+xold[i]*xold[i])) +yold[i] + epsilon*C* (acopl[i]/(double)(N*N));
yold[i] = yold[i] - sigma*xold[i] - beta;
Mn = Mn +xnew[i];
}
memcpy(xold,xnew,N*N*sizeof(double));
if(t>trans){fprintf(fout,"%d %f %f %f %f %f\n",(t-trans),xold[0],yold[0],xold[N],yold[N],Mn/((N+1)*(N+1)));}
}
printf("destroy\n");
fftw_destroy_plan(plan);
fftw_destroy_plan(plani);
fftw_destroy_plan(planb);
printf("free\n");
fftw_free(bessel);
fftw_free(xnew);
fftw_free(xold);
fftw_free(yold);
fftw_free(besself);
fftw_free(xfourier);
return 0;
}
With take around 1min to finish, but i got this results
The scale factor on fftw3 code have to be that value. I dont know how make it work.
The operation you are describing is called a convolution. Let f(x,y) be your periodic sources and B(x,y) the Bessel function. You are trying to compute :
Discretized on a grid of size N+1, it writes :
Since this sum is performed at all points, the complexity is very high : O(N^4). It means that the number of operations to performed is of the magnitude of N*N*N*N. How to reduce this complexity ?
If B(x,y) gets rapidly small as the distance increases, long-range interactions may be neglected and the window of the convolution may be reduced. It will affect the precision of the output and it may not be useful for your problem. Let N_W<<N be the size of this window. The sum now writes :
And the number of operations to be performed is about N*N*N_W*N_W<<N^4.
Yet, from a practical point of view, the kernel has to be very small to make the method described above very interesting. Since the Bessel functions decrease slowly (from Abramowitz and Stegun: Handbook of Mathematical Functions, p364) (approx 1/sqrt(x)), the previous method is unlikely to be successful.
According to the convolution theorem, the Discrete Fourier Transform may be applied to convolve periodic signals ! A convolution in distance space resumes to products of corresponding wavelength in the Fourier space.
The algorithm is the following :
1 Compute the DFT of f, named hatf
2 Compute the DFT of B, named hatB
3 For all frequencies p,q, perform the product :
hatf*(p,q)=hatf(p,q)*hatB(p,q)
4 Inverse the DFT to get f*
The method described above is really efficient since its complexity is the one of 2D DFT, that is N*N*log(N). Moreover, dedicated libraries such as FFTW makes it easy to implement. Take a look at fftw_plan fftw_plan_dft_r2c_2d and be careful about the data layout.
EDIT : I still think there is a may to make it work... Here is a starting code, compile it by gcc main.c -o main -lfftw3 -lm
#include <fftw3.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
int save_image(int N,double* data,int nb){
char filename[1000];
sprintf(filename,"xxx%d.vtk",nb);
FILE * pFile;
pFile = fopen (filename,"w");
if (pFile!=NULL)
{
fputs ("# vtk DataFile Version 2.0\n",pFile);
fputs ("Volume example\n",pFile);
fputs ("ASCII\n",pFile);
fputs ("DATASET STRUCTURED_POINTS\n",pFile);
fprintf(pFile,"DIMENSIONS %d %d 1\n",N,N);
fputs ("ASPECT_RATIO 1 1 1\n",pFile);
fputs ("ORIGIN 0 0 0\n",pFile);
fprintf(pFile,"POINT_DATA %d\n",N*N);
fputs ("SCALARS volume_scalars float 1\n",pFile);
fputs ("LOOKUP_TABLE default\n",pFile);
int i;
for(i=0;i<N*N;i++){
fprintf(pFile,"%f ",data[i]);
}
fclose (pFile);
}
return 0;
}
int main(int argc, char* argv[]){
int N=64;
double *xnew=fftw_malloc(sizeof(double)*N*N);
double *xold=fftw_malloc(sizeof(double)*N*N);
double *yold=fftw_malloc(sizeof(double)*N*N);
fftw_complex *xfourier=fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
double *bessel=fftw_malloc(sizeof(double)*N*N);
fftw_complex *besself=fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
//initial
printf("initial\n");
memset(xold,0,sizeof(double)*N*N);
memset(yold,0,sizeof(double)*N*N);
xold[(N/2)*N+N/2]=1;
// fftw_plan
printf("fftw_plan\n");
fftw_plan plan;
plan=fftw_plan_dft_r2c_2d(N, N, xold, xfourier, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT);
fftw_plan planb;
planb=fftw_plan_dft_r2c_2d(N, N,(double*) bessel, besself, FFTW_ESTIMATE);
fftw_plan plani;
plani=fftw_plan_dft_c2r_2d(N, N, xfourier, xnew, FFTW_ESTIMATE);
//bessel function
//crude approximate of bessel...
printf("bessel function\n");
double dx=1.0/(double)N;
double dy=1.0/(double)N;
int x,y;int i,j;
for(x=0,i=-N/2;x<N;x++,i++){
for(y=0,j=-N/2;y<N;y++,j++){
double dist=sqrt(dx*dx*(i*i+j*j));
double range=0.01;
dist=dist/range;
if(dist>0){
bessel[x*N+y]=sqrt(2./(M_PI*dist))*cos(dist-M_PI/4.0);
}else{
bessel[x*N+y]=1;
}
}
}
fftw_execute(planb);
fftw_destroy_plan(planb);
fftw_free(bessel);
//time loop
printf("time loop\n");
int t,tmax=100;
for(t=0;t<=tmax;t++){
save_image(N,xold,t);
printf("t=%d\n",t);
//convolution= products in fourier space
fftw_execute(plan);
double scale=1.0/((double)N*N);
//scale*=scale; //may be needed to correct scaling
for(i=0;i<N*(N/2+1);i++){
xfourier[i][0]=(xfourier[i][0]*besself[i][0]-xfourier[i][1]*besself[i][1])*scale;
xfourier[i][1]=(xfourier[i][0]*besself[i][1]+xfourier[i][1]*besself[i][0])*scale;
}
fftw_execute(plani);//xnew is updated
double C=1;double epsilon=1; double a=1; double beta=1;double sigma=1;
for(i=0;i<N*N;i++){
xnew[i]=(a/(1+xold[i]*xold[i])) +yold[i] + epsilon*C*xnew[i];
yold[i] = yold[i] - sigma*xold[i] - beta;
}
memcpy(xold,xnew,N*N*sizeof(double));
}
printf("destroy\n");
fftw_destroy_plan(plan);
fftw_destroy_plan(plani);
// fftw_destroy_plan(planb);
printf("free\n");
fftw_free(xnew);
fftw_free(xold);
fftw_free(yold);
fftw_free(besself);
fftw_free(xfourier);
return 0;
}
It produces some vtk images of xold which may be opened by the paraview software. It is likely that saving the images slow down the computations...
My coefficients are wrong, so the output is wrong...
EDIT : Here is piece of code based on yours, to be compiled by gcc main.c -o main -lfftw3 -lm. I found bessk0.c and bessi0.c.
The code writes :
#include <fftw3.h>
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include"bessi0.c"
#include"bessk0.c"
//#include"bessel.c"
//#include"ran1.c"
#define tmax 90000
#define beta 0.001
#define N 50
#define sigma 0.001
#define pi acos(-1.0)
#define trans 50000
#define epsilon 0.1
double ran1(long* idum){
return ((double)rand())/((double)RAND_MAX);
}
void condicoes_iniciais(double *xold,double *yold,double *a)
{
int l;
long idum=-120534;
for(l=0;l<= N*N; l++){
a[l]=5.0;}
for(l=0;l<= N*N; l++){
while(a[l]>4.4)
a[l]=4.1+ran1(& idum);}
for(l=0;l<=N* N; l++){
xold[l]=0.1*ran1(& idum);
yold[l]=0.1*ran1(& idum);
//printf("%g %g %g\n",xold[l],yold[l],a[l]);
}
a[0]=4.1;
a[N]=4.4;
}
void Matriz_Bessel(double *bessel,double gama)
{
int x,y,i,j;
double dist;
for(x=0,i=-N/2;x<N;x++,i++)
{
for(y=0,j=-N/2;y<N;y++,j++)
{
double dist=sqrt(i*i+j*j);
if(dist>0){
bessel[x*N+y]=bessk0(gama*dist);
//printf("%g %g\n",dist,bessel[x*N+y]);
}
else{
bessel[x*N+y]=1;
}
}
}
}
void constante(double *c, double *bessel)
{
int x;
int y;
double soma = 0;
for(x=0;x<N;x++){
for(y=0;y<N;y++){
soma = soma + bessel[x*N+y];
}}
// *c =(1.0/(4.0*soma));
*c =(1.0/(soma));
}
int main(int argc, char* argv[]){
//srand (time(NULL));
srand (0);
double *xnew=fftw_malloc(sizeof(double)*N*N);
double *acopl=fftw_malloc(sizeof(double)*N*N);
double *xold=malloc(sizeof(double)*N*N);
double *yold = malloc(sizeof(double)*N*N);
double *a = malloc(sizeof(double)*N*N);
fftw_complex *xfourier;
xfourier = (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
fftw_complex *aux;
aux= (fftw_complex*) fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
double *bessel= fftw_malloc(sizeof(double)*N*N);
fftw_complex *besself;
besself=fftw_malloc(sizeof(fftw_complex)*(N/2+1)*N);
double scale=1.0/((double)N*N);
int t,i;
double gama,Mn,C;
gama = 0.005;
char arqnome[1000];
FILE *fout;
sprintf(arqnome,"opt2_tamanho_plato_%.3f_%d.dat",gama,N);
fout = fopen(arqnome,"w");
//initial
printf("initial\n");
condicoes_iniciais(xold,yold,a);
//xold[(N/2)*N+N/2]=1;
// fftw_plan
printf("fftw_plan\n");
fftw_plan plan;
plan=fftw_plan_dft_r2c_2d(N, N, xnew, xfourier, FFTW_MEASURE | FFTW_PRESERVE_INPUT);
fftw_plan planb;
planb=fftw_plan_dft_r2c_2d(N, N, bessel, besself, FFTW_MEASURE);
fftw_plan plani;
plani=fftw_plan_dft_c2r_2d(N, N, aux, acopl, FFTW_MEASURE);
Matriz_Bessel(bessel,gama);
constante(&C, bessel);
fftw_execute(planb);
//time loop
printf("time loop\n");
for(t=0;t<=tmax;t++){
//convolution= products in fourier space
fftw_execute(plan);
for(i=0;i<N*(N/2+1);i++){
aux[i][0]=(xfourier[i][0]*besself[i][0]-xfourier[i][1]*besself[i][1]);
aux[i][1]=(xfourier[i][0]*besself[i][1]+xfourier[i][1]*besself[i][0]);
}
fftw_execute(plani);//xnew is updated
Mn = 0;
for(i=0;i<N*N;i++){
xnew[i]=(a[i]/(1+xold[i]*xold[i])) +yold[i] + epsilon*C* (acopl[i]/(double)(N*N));
yold[i] = yold[i] - sigma*xold[i] - beta;
Mn = Mn +xnew[i];
}
memcpy(xold,xnew,N*N*sizeof(double));
if(t>trans){fprintf(fout,"%d %f %f %f %f %f\n",(t-trans),xold[0],yold[0],xold[N],yold[N],Mn/((N+1)*(N+1)));}
}
printf("destroy\n");
fftw_destroy_plan(plan);
fftw_destroy_plan(plani);
fftw_destroy_plan(planb);
printf("free\n");
fftw_free(bessel);
fftw_free(xnew);
fftw_free(xold);
fftw_free(yold);
fftw_free(besself);
fftw_free(xfourier);
fftw_free(aux);
fftw_free(acopl);
return 0;
}
The result is the following :
The lines :
aux[i][0]=(xfourier[i][0]*besself[i][0]-xfourier[i][1]*besself[i][1]);
aux[i][1]=(xfourier[i][0]*besself[i][1]+xfourier[i][1]*besself[i][0]);
Correspond to product of complex numbers. aux[i] is a complex number, aux[i][0] is its real part and aux[i][1] its imaginary part. Hence aux[i][4] does not correspond to something meaningful. These complex numbers correspond to magnitudes of frequencies in the Fourier space.
I also modified the constant : *c =(1.0/(soma));
Do not forget to add srand(0) if you wish to compare outputs and build the initial state in the same way.
You could perhaps use the symmetry of the grid to reduce the number of computations needed. Especially so if you are modelling an infinite periodic system, as apparent wrap-around logic makes me think you may be doing.
Consider:
the same influence is exerted on a particle at the coordinates [35][35] by particles at [35 - x][35], [35 + x][35], [35][35 - x], and [35][35 + x], for any x; also,
another influence is exerted equally by the particles at [35 - x][35 - x], [35 + x][35 - x], [35 - x][35 + x], and [35 + x][35 + x], for any; and
yet another influence is exerted equally by the particles at [35 + x][35 + y], [35 + x][35 - y], [35 - x][35 + y], [35 - x][35 - y], [35 + y][35 + x], [35 + y][35 - x], [35 - y][35 + x], and [35 - y][35 - x], for any x != y.
You should be able to speed your computation by a little less than a factor of 8 by using those equivalences.
If indeed you are simulating an infinite periodic system, however, then I observe that your approach incorporates a bias: by computing the influences from a square grid, you are including the influence of some of the particles at distances between N and sqrt(2) * N from the target, but not of others. You should compute on a (virtual) disc, instead, to avoid such bias.
Furthermore, the appearance of input parameters x and y leads me to suppose that you are performing that computation once for each grid position. If, again, you are modelling an infinite, periodic grid with an emitter at each grid point, and in which each point's influence depends only on distance, then every point will experience the same influence. You could cut your runtime several thousand-fold, and reduce the asymptotic complexity of your algorithm if you can make use of that.
Related
I'm trying to create a program that compares the efficiency of calculating a function through MacLaurin series.
The idea is: Make a graph (using gnuplot) of cos(x) between -Pi and Pi (100 intervals) calculating cos(x) using the first 4 terms of its MacLaurin series, then, the first 6 terms, and comparing the graph between them.
Cos(x) through MacLaurin.
So, to use gnuplot, I made the code below that gets 2 files with the data I need, however, when i run the code only the first result is correct. For the first 4 terms my file is:
-3.141593 -9.760222e-001
-3.078126 2.367934e+264
And the rest of what would be my Y axis is just 2.367934e+264 repeated over and over. The 6 terms file is also just that number. X axis is fine.
I'm fairly new to coding and just don't know what i'm doing wrong. Any help would be appreciated.
Here's the code:
#include <stdio.h>
#include <math.h>
#define X_INI -M_PI
#define X_FIM M_PI
#define NI 100
int fatorial(int);
double serie(int ,double );
int main()
{
double x, y[NI], dx;
int i;
FILE *fp[3];
fp[0]=fopen("4Termos.dat","w");
fp[1]=fopen("6Termos.dat","w");
x=X_INI;
dx = (X_FIM - X_INI)/ (NI - 1);
for(i=0; i<NI; i++){
y[i]=serie(4,x);
fprintf(fp[0],"%lf %e\n", x, y[i]);
y[i]=serie(6,x);
fprintf(fp[1],"%lf %e\n", x, y[i]);
x = x + dx;
}
return 0;
}
int fatorial(int n) {
int i,p;
p = 1;
if (n==0)
return 1;
else {
for (i=1;i<=n;i++)
p = p*i;
return p;
}
}
double serie(int m, double z){
double s;
int j;
for(j = 0; j < m+1; j++)
{
s = s + ( ( pow((-1) , j))*pow(z, (2*j)) ) / (fatorial(2*j));
}
return s;
}
Fatorial is used to calculate factorial, serie used to calculate MacLaurin...
Use of uninitialized s in serie() function (I've taken the liberty to format the code to my liking).
double serie(int m, double z) {
double s; // better: double s = 0;
int j;
for (j = 0; j < m + 1; j++) {
s += pow(-1, j) * pow(z, 2 * j) / fatorial(2 * j);
}
return s;
}
I've been experimenting with SSE intrinsics and I seem to have run into a weird bug that I can't figure out. I am computing the inner product of two float arrays, 4 elements at a time.
For testing I've set each element of both arrays to 1, so the product should be == size.
It runs correctly, but whenever I run the code with size > ~68000000 the code using the sse intrinsics starts computing the wrong inner product. It seems to get stuck at a certain sum and never exceeds this number. Here is an example run:
joe:~$./test_sse 70000000
sequential inner product: 70000000.000000
sse inner product: 67108864.000000
sequential time: 0.417932
sse time: 0.274255
Compilation:
gcc -fopenmp test_sse.c -o test_sse -std=c99
This error seems to be consistent amongst the handful of computers I've tested it on. Here is the code, perhaps someone might be able to help me figure out what is going on:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <math.h>
#include <assert.h>
#include <xmmintrin.h>
double inner_product_sequential(float * a, float * b, unsigned int size) {
double sum = 0;
for(unsigned int i = 0; i < size; i++) {
sum += a[i] * b[i];
}
return sum;
}
double inner_product_sse(float * a, float * b, unsigned int size) {
assert(size % 4 == 0);
__m128 X, Y, Z;
Z = _mm_set1_ps(0.0f);
float arr[4] __attribute__((aligned(sizeof(float) * 4)));
for(unsigned int i = 0; i < size; i += 4) {
X = _mm_load_ps(a+i);
Y = _mm_load_ps(b+i);
X = _mm_mul_ps(X, Y);
Z = _mm_add_ps(X, Z);
}
_mm_store_ps(arr, Z);
return arr[0] + arr[1] + arr[2] + arr[3];
}
int main(int argc, char ** argv) {
if(argc < 2) {
fprintf(stderr, "usage: ./test_sse <size>\n");
exit(EXIT_FAILURE);
}
unsigned int size = atoi(argv[1]);
srand(time(0));
float *a = (float *) _mm_malloc(size * sizeof(float), sizeof(float) * 4);
float *b = (float *) _mm_malloc(size * sizeof(float), sizeof(float) * 4);
for(int i = 0; i < size; i++) {
a[i] = b[i] = 1;
}
double start, time_seq, time_sse;
start = omp_get_wtime();
double inner_seq = inner_product_sequential(a, b, size);
time_seq = omp_get_wtime() - start;
start = omp_get_wtime();
double inner_sse = inner_product_sse(a, b, size);
time_sse = omp_get_wtime() - start;
printf("sequential inner product: %f\n", inner_seq);
printf("sse inner product: %f\n", inner_sse);
printf("sequential time: %f\n", time_seq);
printf("sse time: %f\n", time_sse);
_mm_free(a);
_mm_free(b);
}
You are running into the precision limit of single precision floating point numbers. The number 16777216 (2^24), which is the value of each component of the vector Z when reaching the "limit" inner product, is represented in 32-bit floating point as hexadecimal 0x4b800000 or binary 0 10010111 00000000000000000000000, i.e. the 23-bit mantissa is all zeros (implicit leading 1 bit), and the 8-bit exponent part is 151 representing the exponent 151 - 127 = 24. If you add a 1 to that value this would require to increase the exponent but then the added one cannot be represented in the mantissa any longer, so in single precision floating point arithmetic 2^24 + 1 = 2^24.
You do not see that in your sequential function because there you are using a 64-bit double precision value to store the result, and as we are working on a x86 platform, internally most probably an 80-bit excess precision register is used.
You can force to use single precision throughout in your sequential code by rewriting it as
float sum;
float inner_product_sequential(float * a, float * b, unsigned int size) {
sum = 0;
for(unsigned int i = 0; i < size; i++) {
sum += a[i] * b[i];
}
return sum;
}
and you will see 16777216.000000 as maximum computed value.
Assume that the dimensions are very large (up to 1 billion elements in a matrix). How would I implement a cache oblivious algorithm for matrix-vector product? Based on wikipedia I will need to recursively divide and conquer however I feel like there would be a lot of overhead.. Would it be efficient to do so?
Follow up question and answer: OpenMP with matrices and vectors
So the answer to the question, "how do I make this basic linear algebra operation fast", is always and everywhere to find and link to a tuned BLAS library for your platform. Eg, GotoBLAS (whose work is being continued in OpenBLAS), or the slower autotuned ATLAS, or commercial packages like Intel's MKL. Linear algebra is so fundamental to so many other operations that enormous amounts of effort goes into optimizing these packages for various platforms, and there's just no chance you're going to come up with something in a few afternoon's work that will compete. The particular subroutine calls you're looking for for general dense matrix-vector multiplicaiton is SGEMV/DGEMV/CGEMV/ZGEMV.
Cache-oblivious algorithms, or autotuning, are for when you can't be bothered tuning for the specific cache architecture of your system - which might be fine, normally, but since people are willing to do that for BLAS routines, and then make the tuned results available, means that you're best off just using those routines.
The memory access pattern for GEMV is straightforward enough that you don't really need divide and conquer (same for the standard case of matrix transpose) - you just find the cache blocking size and use it. In GEMV (y = Ax), you still have to scan through the entire matrix once, so there's nothing to be done for reuse (and thus effective cache use) there, but you can try reuse x as much as possible so you load it once instead of (number of rows) times - and you still want access to A to be cache friendly. So the obvious cache blocking thing to do is to break along blocks:
A x -> [ A11 | A12 ] | x1 | = | A11 x1 + A12 x2 |
[ A21 | A22 ] | x2 | | A21 x1 + A22 x2 |
And you can certainly do that recursively. But doing a naive implementation, it's slower than the simple double-loop, and way slower than a proper SGEMV library call:
$ ./gemv
Testing for N=4096
Double Loop: time = 0.024995, error = 0.000000
Divide and conquer: time = 0.299945, error = 0.000000
SGEMV: time = 0.013998, error = 0.000000
The code follows:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include "mkl.h"
float **alloc2d(int n, int m) {
float *data = malloc(n*m*sizeof(float));
float **array = malloc(n*sizeof(float *));
for (int i=0; i<n; i++)
array[i] = &(data[i*m]);
return array;
}
void tick(struct timeval *t) {
gettimeofday(t, NULL);
}
/* returns time in seconds from now to time described by t */
double tock(struct timeval *t) {
struct timeval now;
gettimeofday(&now, NULL);
return (double)(now.tv_sec - t->tv_sec) + ((double)(now.tv_usec - t->tv_usec)/1000000.);
}
float checkans(float *y, int n) {
float err = 0.;
for (int i=0; i<n; i++)
err += (y[i] - 1.*i)*(y[i] - 1.*i);
return err;
}
/* assume square matrix */
void divConquerGEMV(float **a, float *x, float *y, int n,
int startr, int endr, int startc, int endc) {
int nr = endr - startr + 1;
int nc = endc - startc + 1;
if (nr == 1 && nc == 1) {
y[startc] += a[startr][startc] * x[startr];
} else {
int midr = (endr + startr+1)/2;
int midc = (endc + startc+1)/2;
divConquerGEMV(a, x, y, n, startr, midr-1, startc, midc-1);
divConquerGEMV(a, x, y, n, midr, endr, startc, midc-1);
divConquerGEMV(a, x, y, n, startr, midr-1, midc, endc);
divConquerGEMV(a, x, y, n, midr, endr, midc, endc);
}
}
int main(int argc, char **argv) {
const int n=4096;
float **a = alloc2d(n,n);
float *x = malloc(n*sizeof(float));
float *y = malloc(n*sizeof(float));
struct timeval clock;
double eltime;
printf("Testing for N=%d\n", n);
for (int i=0; i<n; i++) {
x[i] = 1.*i;
for (int j=0; j<n; j++)
a[i][j] = 0.;
a[i][i] = 1.;
}
/* naive double loop */
tick(&clock);
for (int i=0; i<n; i++) {
y[i] = 0.;
for (int j=0; j<n; j++) {
y[i] += a[i][j]*x[j];
}
}
eltime = tock(&clock);
printf("Double Loop: time = %lf, error = %f\n", eltime, checkans(y,n));
for (int i=0; i<n; i++) y[i] = 0.;
/* naive divide and conquer */
tick(&clock);
divConquerGEMV(a, x, y, n, 0, n-1, 0, n-1);
eltime = tock(&clock);
printf("Divide and conquer: time = %lf, error = %f\n", eltime, checkans(y,n));
/* decent GEMV implementation */
tick(&clock);
float alpha = 1.;
float beta = 0.;
int incrx=1;
int incry=1;
char trans='N';
sgemv(&trans,&n,&n,&alpha,&(a[0][0]),&n,x,&incrx,&beta,y,&incry);
eltime = tock(&clock);
printf("SGEMV: time = %lf, error = %f\n", eltime, checkans(y,n));
return 0;
}
I've got simply 3 functions, one is control function aan the next 2 function are done in a bit different way using OpenMP. But function thread1 gives another score than thread2 and control and I have no idea why?
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
float function(float x){
return pow(x,pow(x,sin(x)));
}
float integrate(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), i=begin, y1, y2;
for(i = 0; i<count; i++){
score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
return score;
}
float thread1(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), y1, y2;
int i;
#pragma omp parallel for reduction(+:score) private(y1,i) shared(count)
for(i = 0; i<count; i++){
y1 = ((function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0);
score = score + y1;
}
return score;
}
float thread2(float begin, float end, int count){
float score = 0 , width = (end-begin)/(1.0*count), y1, y2;
int i;
float * tab = (float*)malloc(count * sizeof(float));
#pragma omp parallel for
for(i = 0; i<count; i++){
tab[i] = (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
for(i=0; i<count; i++)
score += tab[i];
return score;
}
unsigned long long int rdtsc(void){
unsigned long long int x;
unsigned a, d;
__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}
int main(int argc, char** argv){
unsigned long long counter = 0;
//test
counter = rdtsc();
printf("control: %f \n ",integrate (atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("control count: %lld \n",rdtsc()-counter);
counter = rdtsc();
printf("thread1: %f \n ",thread1(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("thread1 count: %lld \n",rdtsc()-counter);
counter = rdtsc();
printf("thread2: %f \n ",thread2(atof(argv[1]), atof(argv[2]), atoi(argv[3])));
printf("thread2 count: %lld \n",rdtsc()-counter);
return 0;
}
Here are simple answears :
gcc -fopenmp zad2.c -o zad -pg -lm
env OMP_NUM_THREADS=2 ./zad 3 13 100000
control: 5407308.500000
control count: 138308058
thread1: 5407494.000000
thread1 count: 96525618
thread2: 5407308.500000
thread2 count: 104770859
Update:
Ok, I tried to do this more quickly, and not count values for periods twice.
double thread3(double begin, double end, int count){
double score = 0 , width = (end-begin)/(1.0*count), yp, yk;
int i,j, k;
#pragma omp parallel private (yp,yk)
{
int thread_num = omp_get_num_threads();
k = count / thread_num;
#pragma omp for private(i) reduction(+:score)
for(i=0; i<thread_num; i++){
yp = function(begin + i*k*width);
yk = function(begin + (i*k+1)*width);
score += (yp + yk) * width / 2.0;
for(j=i*k +1; j<(i+1)*k; j++){
yp = yk;
yk = function(begin + (j+1)*width);
score += (yp + yk) * width / 2.0;
}
}
#pragma omp for private(i) reduction(+:score)
for(i = k*thread_num; i<count; i++)
score += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
return score;
}
But after few tests I found that the scores are near the right value, but not equal. Sometimes one of the threads doesn't start. When I'm not using OpenMp, the value is correct.
You're integrating a very strongly peaked function - x(xsin(x)) - which covers over 7 orders of magnitude in the range you're integrating it. That's about the limit for a 32-bit floating point number, so there are going to be issues depending on the order you sum the numbers. This isn't an OpenMP thing -- its just a numerical sensitivity thing.
So for instance, consider this completely serial code doing the same integral:
#include <stdio.h>
#include <math.h>
float function(float x){
return pow(x,pow(x,sin(x)));
}
int main(int argc, char **argv) {
const float begin=3., end=13.;
const int count = 100000;
const float width=(end-begin)/(1.*count);
float integral1=0., integral2=0., integral3=0.;
/* left to right */
for (int i=0; i<count; i++) {
integral1 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
/* right to left */
for (int i=count-1; i>=0; i--) {
integral2 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
/* centre outwards, first right-to-left, then left-to-right */
for (int i=count/2; i<count; i++) {
integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
for (int i=count/2-1; i>=0; i--) {
integral3 += (function(begin+(i*width)) + function(begin+(i+1)*width)) * width/2.0;
}
printf("Left to right: %lf\n", integral1);
printf("Right to left: %lf\n", integral2);
printf("Centre outwards: %lf\n", integral3);
return 0;
}
Running this, we get:
$ ./reduce
Left to right: 5407308.500000
Right to left: 5407430.000000
Centre outwards: 5407335.500000
-- the same sort of differences you see. Doing the summation with two threads necessarily changes the order of the summation, and so your answer changes.
There's a few options here. If this was just a test proble, and this function doesn't actually represent what you'll be integrating, you might be fine already. Otherwise, using a different numerical method may help.
But also here, there is a simple solution - the range of the numbers exceeds the range of a float, making the answer very sensitive to summation order, but fits comfortably within the range of a double, making the problem much less severe. Note that changing to doubles is not a magic solution to everything; some cases it just postpones the problem or allows you to paper over a flaw in your numerical method. But here it actually addresses the underlying problem fairly well. Changing all the floats above to doubles gives:
$ ./reduce
Left to right: 5407589.272885
Right to left: 5407589.272885
Centre outwards: 5407589.272885
On the other hand, even doubles wouldn't save you if you needed to integrate this function in the range (18,23).
Little bit of a 2 parter. First of all im trying to do this in all c. First of all I'll go ahead and post my program
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <string.h>
double f(double x);
void Trap(double a, double b, int n, double* integral_p);
int main(int argc, char* argv[]) {
double integral=0.0; //Integral Result
double a=6, b=10; //Left and Right Points
int n; //Number of Trapezoids (Higher=more accurate)
int degree;
if (argc != 3) {
printf("Error: Invalid Command Line arguements, format:./trapezoid N filename");
exit(0);
}
n = atoi(argv[2]);
FILE *fp = fopen( argv[1], "r" );
# pragma omp parallel
Trap(a, b, n, &integral);
printf("With n = %d trapezoids....\n", n);
printf("of the integral from %f to %f = %.15e\n",a, b, integral);
return 0;
}
double f(double x) {
double return_val;
return_val = pow(3.0*x,5)+pow(2.5*x,4)+pow(-1.5*x,3)+pow(0*x,2)+pow(1.7*x,1)+4;
return return_val;
}
void Trap(double a, double b, int n, double* integral_p) {
double h, x, my_integral;
double local_a, local_b;
int i, local_n;
int my_rank = omp_get_thread_num();
int thread_count = omp_get_num_threads();
h = (b-a)/n;
local_n = n/thread_count;
local_a = a + my_rank*local_n*h;
local_b = local_a + local_n*h;
my_integral = (f(local_a) + f(local_b))/2.0;
for (i = 1; i <= local_n-1; i++) {
x = local_a + i*h;
my_integral += f(x);
}
my_integral = my_integral*h;
# pragma omp critical
*integral_p += my_integral;
}
As you can see, it calculates trapezoidal rule given an interval.
First of all it DOES work, if you hardcode the values and the function. But I need to read from a file in the format of
5
3.0 2.5 -1.5 0.0 1.7 4.0
6 10
Which means:
It is of degree 5 (no more than 50 ever)
3.0x^5 +2.5x^4 −1.5x^3 +1.7x+4 is the polynomial (we skip ^2 since it's 0)
and the Interval is from 6 to 10
My main concern is the f(x) function which I have hardcoded. I have NO IDEA how to make it take up to 50 besides literally typing out 50 POWS and reading in the values to see what they could be.......Anyone else have any ideas perhaps?
Also what would be the best way to read in the file? fgetc? Im not really sure when it comes to reading in C input (especially since everything i read in is an INT, is there some way to convert them?)
For a large degree polynomial, would something like this work?
double f(double x, double coeff[], int nCoeff)
{
double return_val = 0.0;
int exponent = nCoeff-1;
int i;
for(i=0; i<nCoeff-1; ++i, --exponent)
{
return_val = pow(coeff[i]*x, exponent) + return_val;
}
/* add on the final constant, 4, in our example */
return return_val + coeff[nCoeff-1];
}
In your example, you would call it like:
sampleCall()
{
double coefficients[] = {3.0, 2.5, -1.5, 0, 1.7, 4};
/* This expresses 3x^5 + 2.5x^4 + (-1.5x)^3 + 0x^2 + 1.7x + 4 */
my_integral = f(x, coefficients, 6);
}
By passing an array of coefficients (the exponents are assumed), you don't have to deal with variadic arguments. The hardest part is constructing the array, and that is pretty simple.
It should go without saying, if you put the coefficients array and number-of-coefficients into global variables, then the signature of f(x) doesn't need to change:
double f(double x)
{
// access glbl_coeff and glbl_NumOfCoeffs, instead of parameters
}
For you f() function consider making it variadic (varargs is another name)
http://www.gnu.org/s/libc/manual/html_node/Variadic-Functions.html
This way you could pass the function 1 arg telling it how many "pows" you want, with each susequent argument being a double value. Is this what you are asking for with the f() function part of your question?