I've been thinking about using the SSE instruction set to make my 3d software rasterizer faster, but I've never used them before and feel like I am going completely wrong.
I'd like to hear from the more experienced on whether it is an effort that is worth it, and if this code is written poorly:
/*
 * Four-component float vector that can also be handled as one __m128 value.
 *
 * The anonymous struct (x, y, z, w) and the m128 member overlap in the union,
 * so the same 16 bytes can be read component by component or passed whole to
 * the _mm_*_ps intrinsics.
 *
 * _declspec(align(16)) is a compiler-specific alignment request (16 bytes).
 * NOTE(review): this spelling is not standard C — confirm the target compiler
 * accepts it.
 */
typedef union _declspec(align(16)) {
struct {
float x;
float y;
float z;
float w;
};
__m128 m128;
} Vec4_t;
/*
 * Component-wise sum of two vectors.
 *
 * a, b: operands; neither is modified.
 * Returns a new Vec4_t whose four lanes hold a + b.
 */
Vec4_t AddVec(Vec4_t* a, Vec4_t *b) {
    Vec4_t sum;
    /* Write the result through the union's own __m128 member rather than
       casting the address of a local __m128 to Vec4_t*. */
    sum.m128 = _mm_add_ps(a->m128, b->m128);
    return sum;
}
This is how I'm testing it:
Vec4_t a = { 2.0f, 4.0f, 10.0f, 123.1f };
Vec4_t b = { 6.0f, 12.0f, 16.0f, 64.0f };
Vec4_t c = AddVec(&a, &b);
printf("%f, %f, %f, %f\n", c.x, c.y, c.z, c.w);
which outputs:
8.000000, 16.000000, 26.000000, 187.100006
I honestly have no idea what I'm doing. I'm surprised the code I wrote even worked.
Related
I am using these algorithms on a microcontroller:
/*
 * Returns one real root of the monic cubic x^3 + b*x^2 + c*x + d = 0.
 *
 * The substitution x = t - b/3 yields the depressed cubic t^3 + p*t + q = 0,
 * which is solved in closed form with trigonometric / hyperbolic functions.
 * When the cubic has three real roots, the cos() branch with no phase offset
 * selects the largest of them.
 */
float32_t cubic(float32_t b,float32_t c,float32_t d)
{
    const float32_t shift=b/3.0f;
    float32_t p=c-b*b/3.0f;
    float32_t q=2.0f*b*b*b/27.0f-b*c/3.0f+d;

    /* p == 0: t^3 = -q, so t = -cbrt(q). cbrtf() is defined for negative
       arguments, unlike powf(q, 1/3), and the -b/3 shift must still be applied. */
    if(p==0.0f)
        return -cbrtf(q)-shift;

    /* q == 0 needs no special case: g becomes 0 and the branches below give
       -b/3 for p > 0 and sqrt(-p) - b/3 for p < 0, both genuine roots. */
    float32_t t=sqrtf(fabsf(p)/3.0f);
    float32_t g=1.5f*q/(p*t);

    /* p > 0: exactly one real root. */
    if(p>0.0f)
        return -2.0f*t*sinhf(asinhf(g)/3.0f)-shift;

    /* p < 0 with negative discriminant: three real roots. |g| <= 1 holds
       mathematically; the clamp only absorbs rounding so acosf() cannot
       return NaN. */
    if(4.0f*p*p*p+27.0f*q*q<0.0f)
        return 2.0f*t*cosf(acosf(fmaxf(-1.0f,fminf(1.0f,g)))/3.0f)-shift;

    /* p < 0 with one real root: |g| >= 1 mathematically; clamp against
       rounding so acoshf() stays inside its domain. */
    if(q>0.0f)
        return -2.0f*t*coshf(acoshf(fmaxf(1.0f,-g))/3.0f)-shift;
    return 2.0f*t*coshf(acoshf(fmaxf(1.0f,g))/3.0f)-shift;
}
/*
 * Finds real roots of the monic quartic x^4 + b*x^3 + c*x^2 + d*x + e = 0.
 *
 * The substitution x = y - b/4 gives the depressed quartic
 * y^4 + p*y^2 + q*y + r = 0; a root m of its resolvent cubic (obtained from
 * cubic()) splits it into two quadratics in y.
 *
 * ans: output array; at most four values are written.
 * Returns the number of roots stored in ans.
 */
int quartic(float32_t b,float32_t c,float32_t d,float32_t e,float32_t* ans)
{
    float32_t p=c-0.375f*b*b;
    float32_t q=0.125f*b*b*b-0.5f*b*c+d;
    float32_t m=cubic(p,0.25f*p*p+0.01171875f*b*b*b*b-e+0.25f*b*d-0.0625f*b*b*c,-0.125f*q*q);
    int nroots=0;

    if(q==0.0f)
    {
        if(m<0.0f) return 0;
        float32_t sqrt_2m=sqrtf(2.0f*m);
        float32_t rad=-m-p;
        if(rad>0.0f)
        {
            float32_t delta=sqrtf(2.0f*rad);
            ans[nroots++]=-0.25f*b+0.5f*(sqrt_2m-delta);
            ans[nroots++]=-0.25f*b-0.5f*(sqrt_2m-delta);
            ans[nroots++]=-0.25f*b+0.5f*(sqrt_2m+delta);
            ans[nroots++]=-0.25f*b-0.5f*(sqrt_2m+delta);
        }
        if(rad==0.0f)
        {
            ans[nroots++]=-0.25f*b-0.5f*sqrt_2m;
            ans[nroots++]=-0.25f*b+0.5f*sqrt_2m;
        }
        return nroots;
    }

    /* m must be strictly positive here: m == 0 would divide by zero below. */
    if(m<=0.0f) return 0;
    float32_t sqrt_2m=sqrtf(2.0f*m);
    float32_t q_term=q/sqrt_2m;

    /* Each quadratic factor contributes real roots only when its radicand is
       non-negative. Test the signed value: fabsf(...) >= 0 is always true and
       would feed negative numbers to sqrtf(), storing NaN roots. */
    float32_t rad_a=-m-p+q_term;
    if(rad_a>=0.0f)
    {
        float32_t delta=sqrtf(2.0f*rad_a);
        ans[nroots++]=0.5f*(-sqrt_2m+delta)-0.25f*b;
        ans[nroots++]=0.5f*(-sqrt_2m-delta)-0.25f*b;
    }
    float32_t rad_b=-m-p-q_term;
    if(rad_b>=0.0f)
    {
        float32_t delta=sqrtf(2.0f*rad_b);
        ans[nroots++]=0.5f*(sqrt_2m+delta)-0.25f*b;
        ans[nroots++]=0.5f*(sqrt_2m-delta)-0.25f*b;
    }
    return nroots;
}
From: Specialised algorithm to find positive real solutions to quartic equations? (I can't comment on this thread because I haven't enough Stackoverflow privileges)
The code is the same, but with single precision instead of doubles.
With doubles it seems to work.
With singles, there are some points for which q=0 in the cubic function, so it returns m=0 to the quartic function and the result is not a real root; I get zeros.
I need to reproduce Matlab's roots method, keeping only the real positive roots.
The following point gives problems with the above code, but Matlab finds roots for it.
C1=53.3456154
C2=1729.59448
C3=54973.8164
C4=56.3456192
C5=1729.5946
C6=54973.8242
ans=single(roots([C5 (-C1+2*C4-C6) (3*C2-3*C5) (C1-2*C3+C6) -C2]))
r=r(r==conj(r));
r=r(r>0)
In Matlab the result is ok
ans =
31.7814 + 0i
-0.0000 + 1.0001i
-0.0000 - 1.0001i
-0.0315 + 0i
r = 31.7814
I need to implement this function as a systemcall:
asmlinkage long sys_sqrt ( float x);
The function should print the square root of x to the kernel log.
I'm using kernel version 4.13 on 64bit virtual box.
I'm trying to implement the sqrt by using this technique
#include <linux/kernel.h>
#define SQRT_MAGIC_F 0x5f3759df
/*
 * sys_sqrt - approximate the square root of x and print it with printk.
 *
 * An estimate of 1/sqrt(x) is formed from the bit pattern of x
 * (SQRT_MAGIC_F minus half of the integer view), refined once with the
 * factor (1.5 - xhalf*y*y), and multiplied by x to give sqrt(x).
 *
 * NOTE(review): as written this block does not parse. The ';' after the
 * parameter list makes the first line a prototype, so the '{' that follows
 * does not open a function body, and the printk line has one more '(' than
 * ')'.
 * NOTE(review): the quoted "SSE register return with SSE disabled" error
 * suggests this file is built with floating point disabled; float
 * arithmetic and the "%f" conversion presumably cannot be used here —
 * confirm against the kernel's FPU and printk documentation.
 */
asmlinkage long sys_sqrt(float x);
{
const float xhalf = 0.5f*x;
union // get bits for floating value
{
float x;
int i;
} u;
u.x = x;
/* Integer view of x: magic constant minus half the bits seeds 1/sqrt(x). */
u.i = SQRT_MAGIC_F - (u.i >> 1);
/* x * y * (1.5 - 0.5*x*y*y): one refinement of y, then scaled by x. */
printk ("%f", (x*u.x*(1.5f - xhalf*u.x*u.x));
return 0;
}
This leads the compiler telling me "error: SSE register return with SSE disabled" on "printk ("%f", (x * u.x * (1.5f - xhalf * u.x * u.x));"
Another workaround I tried is separating the integer and the decimals like so
float ans = (x*u.x*(1.5f - xhalf*u.x*u.x);
int head = ans;
float tail_float = ans - head;
int tail = tail_float*10000;
printk ("%d.%03d", head,tail);
This leads the compiler telling me "error:SSE register return with SSE disabled" on "float ans = (x *u.x *(1.5f - xhalf * u.x* u.x);"
Another thing I've tried is wrapping the function body in kernel_fpu_begin() and kernel_fpu_end(), but this leads to: "error: implicit declaration of function "kernel_fpu_begin"; did you mean "kernel_old_dev_t"
Any solution?
Thank you so much.
I am trying to use Gaussian wave packets to study the transmission probability through a square barrier via the Trotter-Suzuki formula and the fast Fourier transform (FFT), just as done in this Quantum Python article, but I need to implement it in C. In principle, the wave function should retain its shape before colliding with the square barrier. However, I found that the wave function flattens dramatically with time before it reaches the barrier. Can anybody find a problem in the following code?
Here, two files - result and psi.txt - are created to store the initial and evolved wave-function. The first two data for each are x coordinates, the probability of the wave function at that x. The third data for each line in file result is the square barrier distribution. The FFT I use is shown in this C program.
#include <stdio.h>
#include <math.h>
/* Reduced Planck constant; set to 1 in this program. */
#define h_bar 1.0
/* Pi written out as a floating-point literal. */
#define pi 3.1415926535897932385E0
/* Particle mass used in the kinetic phase factor; set to 1 in this program. */
#define m0 1.0
/* Scalar type used for coordinates, wave numbers and amplitudes. */
typedef double real;
/* Complex number stored as separate real (Re) and imaginary (Im) parts. */
typedef struct { real Re; real Im; } complex;
/*
 * FFT routine defined in another translation unit. This file calls it on an
 * N-element array with flag 1 and later with flag -1 — presumably forward
 * and inverse transform; confirm against the fft implementation.
 */
extern void fft(complex x[], int N, int flag);
/*
 * Returns x * exp(i*y_power) * y_scale: x rotated by the phase y_power and
 * then scaled by the real factor y_scale.
 */
complex complex_product(complex x, real y_power, real y_scale)
{
    const real c = cos(y_power);
    const real s = sin(y_power);
    complex out;

    out.Re = (x.Re*c-x.Im*s)*y_scale;
    out.Im = (x.Re*s+x.Im*c)*y_scale;
    return out;
}
/*
 * Square barrier: returns 0 when x lies below 0 or at/above a,
 * and 1 otherwise.
 */
real potential(real x, real a)
{
    if (x<0 || x>=a)
        return 0;
    return 1;
}
/*
 * Propagates a Gaussian wave packet on a periodic grid with alternating
 * potential (position space) and kinetic (wave-number space) phase factors,
 * switching between the two representations with fft().
 *
 * Writes "x |psi|^2 V" for the initial state to the file "result" and
 * "x |psi|^2 0" for the final state to "psi.txt".
 * Returns 0 on success, 1 if an output file cannot be opened.
 */
int main(void)
{
    enum { N = 1 << 10 };   /* number of grid points (2^10) */
    int t_steps=20, i, m, n;
    complex psi[N];
    real x0=-2, p0=1, k0=p0/h_bar, x[N], k[N], V[N];
    real sigma=0.5, a=0.1, x_lower=-5, x_upper=5;
    real dt=1, dx=(x_upper-x_lower)/N, dk=2*pi/(dx*N);
    FILE *file;

    /* Initialise the grids, the barrier and the wave packet centred at x0. */
    for (n=0; n<N; n++)
    {
        x[n] = x_lower+n*dx;
        k[n] = k0+(n-N*0.5)*dk;
        V[n] = potential(x[n], a);
        psi[n].Re = exp(-pow((x[n]-x0)/sigma, 2)/2)*cos(p0*(x[n]-x0)/h_bar);
        psi[n].Im = exp(-pow((x[n]-x0)/sigma, 2)/2)*sin(p0*(x[n]-x0)/h_bar);
    }

    /* Dump the initial probability density together with the barrier. */
    file = fopen("result", "w");
    if (file == NULL)
    {
        perror("result");
        return 1;
    }
    for (m=0; m<N; m++)
        fprintf(file, "%g %g %g\n", x[m], psi[m].Re*psi[m].Re+psi[m].Im*psi[m].Im, V[m]);
    fclose(file);

    for (i=0; i<t_steps; i++)
    {
        printf("t_steps=%d\n", i);
        for (n=0; n<N; n++)
        {
            /* Potential phase exp(-i*V*dt/h_bar). */
            psi[n]=complex_product(psi[n], -V[n]*dt/h_bar, 1);
            psi[n]=complex_product(psi[n], -k[0]*x[n], dx/sqrt(2*pi));//x--->x_mod
        }
        fft(psi, N, 1);//psi: x_mod--->k_mod
        for (m=0; m<N; m++)
        {
            psi[m]=complex_product(psi[m], -m*dk*x[0], 1);//k_mod--->k
            /* Kinetic phase exp(-i*h_bar*k^2*dt/(2*m0)), scaled by 1/N. */
            psi[m]=complex_product(psi[m], -h_bar*k[m]*k[m]*dt/(2*m0), 1./N);
            psi[m]=complex_product(psi[m], m*dk*x[0], 1);//k--->k_mod
        }
        fft(psi, N, -1);
        for (n=0; n<N; n++)
            psi[n] = complex_product(psi[n], k[0]*x[n], sqrt(2*pi)/dx);//x_mod--->x
    }

    /* Dump the final probability density. */
    file = fopen("psi.txt", "w");
    if (file == NULL)
    {
        perror("psi.txt");
        return 1;
    }
    for (m=0; m<N; m++)
        fprintf(file, "%g %g 0\n", x[m], pow((psi[m]).Re, 2)+pow((psi[m]).Im, 2));
    fclose(file);
    return 0;
}
I use the following Python code to plot the initial and final evolved wave functions:
call: `>>> python plot.py result psi.txt`
import matplotlib.pyplot as plt
from sys import argv
# Plot every data file named on the command line onto the same axes.
# NOTE(review): the loop extent is reconstructed (the original had no
# indentation); the per-file work is assumed to end at plt.plot().
for filename in argv[1:]:
    print(filename)
    # Each line holds three space-separated columns; column 0 is x,
    # column 1 is plotted as psi and column 2 as y.
    with open(filename, 'r') as f:
        lines = [line.strip(" \n").split(" ") for line in f]
    x = [float(line[0]) for line in lines]
    y = [float(line[2]) for line in lines]
    psi = [float(line[1]) for line in lines]
    peak = max(psi)
    print("x=%g, max=%g" % (x[psi.index(peak)], peak))
    plt.plot(x, y, x, psi)
#plt.xlim([-1.0e-10, 1.0e-10])
plt.ylim([0, 3])
plt.show()
Your code is almost correct, apart from the fact that you are missing the initial/final half-step in the real domain and perform some unnecessary operations (kmod -> k and back), but the main problem is that your initial conditions are badly chosen. The time evolution of a Gaussian wavepacket results in the uncertainty spreading out quadratically in time:
$$\Delta x^2(t) = \sigma^2 \left\{ 1 + \left( \frac{\hbar t}{2 m \sigma^2} \right)^2 \right\}$$
Given your choice of particle mass and initial wavepacket width, the term in the braces equals $1 + 4t^2$. After one timestep, the wavepacket is already significantly wider than initially and after another timestep becomes wider than the entire simulation box. The periodicity implied by the use of FFT results in spatial and frequency aliasing, which together with the overly large timestep is why your final wavefunction looks that strange.
I would advise that you try to replicate exactly the conditions of the Python program, including the fact that the entire system is in a deep potential well ($V_{\text{border}} \to +\infty$).
The variable i is uninitialised here:
k[n] = k0+(i-N*0.5)*dk;
I am trying to implement a multidimensional array. Below, you will see for one example in jArray[2][2] that I assign 20.0, clearly. However, both printf statements don't yield the same result. Thanks for your help!
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#define M_PI 3.14159265358979323846
/*
 * Builds a 3x3 matrix jArray from x1, x2, x3, copies it element by element
 * into matrix0, then prints jArray[2][2] followed by a 3x3 table.
 */
int main(){
float x1 = 0.1;
float x2 = 0.1;
float x3 = -0.1;
float jArray [3][3] = {
{3.0, x3*sin(x2*x3), x2*sin(x2*x3)},
{2*x1, -162*(x2+0.1), cos(x3)},
{-x2*exp(-x1*x2), -x1*exp(x1*x2), 20.0}
};
/* Element-wise copy of jArray. */
float matrix0 [3][3] = {
{jArray[0][0], jArray[0][1], jArray[0][2]},
{jArray[1][0], jArray[1][1], jArray[1][2]},
{jArray[2][0], jArray[2][1], jArray[2][2]},
};
printf("%f\n\n", jArray[2][2]);
/*
 * NOTE(review): matrix0[0,2], matrix0[1,2] and matrix0[2,2] each contain the
 * comma operator, so all three evaluate to matrix0[2] — a whole row, passed
 * as a pointer. "%f" expects a double, so the third value of every printed
 * row does not come from the intended element.
 */
printf("[%f\t%f\t%f]\n[%f\t%f\t%f]\n[%f\t%f\t%f]\n\n",
matrix0[0][0], matrix0[0][1], matrix0[0,2],
matrix0[1][0], matrix0[1][1], matrix0[1,2],
matrix0[2][0], matrix0[2][1], matrix0[2,2]);
/* NOTE(review): a non-zero return from main is reported as a failure status. */
return 1;
}
Output:
20.000000
[3.000000 0.001000 0.200000]
[-32.400002 -0.099005 -0.101005]
[0.000000 0.010000 0.000000]
Replace
printf("[%f\t%f\t%f]\n[%f\t%f\t%f]\n[%f\t%f\t%f]\n\n",
matrix0[0][0], matrix0[0][1], matrix0[0,2],
matrix0[1][0], matrix0[1][1], matrix0[1,2],
matrix0[2][0], matrix0[2][1], matrix0[2,2]);
with
printf("[%f\t%f\t%f]\n[%f\t%f\t%f]\n[%f\t%f\t%f]\n\n",
matrix0[0][0], matrix0[0][1], matrix0[0][2],
matrix0[1][0], matrix0[1][1], matrix0[1][2],
matrix0[2][0], matrix0[2][1], matrix0[2][2]);
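Your compiler would have emitted a warning for that: matrix0[0,2] uses the comma operator, so it is the same as matrix0[2], a whole row that decays to a float*, whereas %f expects a double (a float argument is promoted to double), not a float*.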
I am trying to use SIMD instructions in my C program. I am using CodeBlocks to write in.
I tried following this tutorial: https://www.kernel.org/pub/linux/kernel/people/geoff/cell/ps3-linux-docs/CellProgrammingTutorial/BasicsOfSIMDProgramming.html
I am trying to do both integer and floating point SIMD addition, subtraction, etc.
However, the code explained in the page does not work in CodeBlocks/C. How do I use SIMD here?
#include <stdio.h>
/*
 * NOTE(review): the build log quoted with this snippet reports
 * "mode 'V4SF' applied to inappropriate type" for this typedef, and warns
 * that __attribute__ ((mode)) vector types are deprecated in favour of
 * __attribute__ ((vector_size)).
 */
typedef int v4sf __attribute__ ((mode(V4SF))); // vector of four single floats
/* Exposes the same storage as one vector value (v) and as four floats (f). */
union f4vector
{
v4sf v;
float f[4];
};
/* Adds the vectors (1,2,3,4) and (5,6,7,8) in one operation and prints the four sums. */
int main()
{
    union f4vector a, b, c;
    int lane;

    /* Fill both operands through the float view of the union. */
    for (lane = 0; lane < 4; lane++) {
        a.f[lane] = lane + 1;
        b.f[lane] = lane + 5;
    }

    c.v = a.v + b.v;
    printf("%f, %f, %f, %f\n", c.f[0], c.f[1], c.f[2], c.f[3]);
}
C:\Things\New Text Document.c|2|warning: specifying vector types with __attribute__ ((mode)) is deprecated [-Wattributes]|
C:\Things\New Text Document.c|2|warning: use __attribute__ ((vector_size)) instead [-Wattributes]|
C:\Things\New Text Document.c|2|error: mode 'V4SF' applied to inappropriate type|
||=== Build failed: 1 error(s), 2 warning(s) (0 minute(s), 0 second(s)) ===|
The tutorial you are trying to use is for SIMD programming for the Cell CPU (i.e, in the Playstation 3). It is not applicable to x86 programming.
Use a tutorial that is applicable to the compiler you are using (GCC, Clang, or Visual C++).
You need to make sure your CPU supports the vector type intrinsics and vector instructions you want to use before you compile and/or execute.
I'm guessing your CPU is x86, but Windows should have a way to verify that. With Linux you can run something like grep avx2 /proc/cpuinfo.
The tutorial has a mistake: the base type in the typedef should be float, not int.
An example that compiles is as follows:
#include <stdio.h>
/* Vector type built on float with a total size of 16 bytes. */
typedef float v4sf __attribute__((vector_size(16)));
/* Exposes the same storage as one vector value (v) and as four floats (f). */
union f4vector
{
v4sf v;
float f[4];
};
int main()
{
union f4vector a, b, c;
a.f[0] = 1; a.f[1] = 2; a.f[2] = 3; a.f[3] = 4;
b.f[0] = 5; b.f[1] = 6; b.f[2] = 7; b.f[3] = 8;
c.v = a.v + b.v;
printf("%f, %f, %f, %f\n", c.f[0], c.f[1], c.f[2], c.f[3]);
}