I am doing some image processing on my Beaglebone Black and am interested in the performance gain of using floats vs doubles in my algorithm.
I've tried to devise a simple test for this:
main.c
#define MAX_TEST 10
#define MAX_ITER 1E7
#define DELTA 1E-8
/*
 * Benchmarks repeated single-precision accumulate-and-divide.
 *
 * Bug fixed: DELTA (1E-8) and 3.0 are double constants, so the original
 * promoted n to double for the add and divide every iteration, timing
 * float<->double conversion instead of float arithmetic (four extra
 * convert instructions per loop).  The loop bound 1E7 is also a double,
 * costing an int->double conversion per iteration.  Float literals and
 * an int bound keep every operation in single precision.
 */
void float_test()
{
    float n = 0.0f;
    for (int i = 0; i < (int)MAX_ITER; i++)
    {
        n += (float)DELTA;  /* add stays in single precision */
        n /= 3.0f;          /* divide stays in single precision */
    }
}
/*
 * Benchmarks repeated double-precision accumulate-and-divide.
 *
 * Fix: MAX_ITER is 1E7, a double constant, so the original compiled
 * `i < MAX_ITER` as a double comparison with a cvtsi2sd of i on every
 * iteration (visible in the generated code).  Casting the bound to int
 * keeps the loop control in integer arithmetic; the body is unchanged
 * and remains pure double math.
 */
void double_test()
{
    double n = 0.0;
    for (int i = 0; i < (int)MAX_ITER; i++)
    {
        n += DELTA;
        n /= 3.0;
    }
}
/* Runs both benchmarks MAX_TEST times so the profiler collects
   enough samples for each function. */
int main(void)
{
    for (int trial = 0; trial < MAX_TEST; ++trial) {
        double_test();
        float_test();
    }
    return 0;
}
ran as:
gcc -Wall -pg main.c -std=c99
./a.out
gprof a.out gmon.out -q > profile.txt
profile.txt:
granularity: each sample hit covers 4 byte(s) for 0.03% of 35.31 seconds
index % time self children called name
<spontaneous>
[1] 100.0 0.00 35.31 main [1]
18.74 0.00 10/10 float_test [2]
16.57 0.00 10/10 double_test [3]
-----------------------------------------------
18.74 0.00 10/10 main [1]
[2] 53.1 18.74 0.00 10 float_test [2]
-----------------------------------------------
16.57 0.00 10/10 main [1]
[3] 46.9 16.57 0.00 10 double_test [3]
-----------------------------------------------
I am not sure if the compiler is optimizing away some of my code or if I am doing enough arithmetic for it to matter. I find it a bit odd that the double_test() is actually taking less time than the float_test().
I've tried switching the order in which the functions are called and the results are still the same. Could somebody explain this to me?
On my machine (x86_64), looking at the code generated, side by side:
double_test: .. float_test:
xorpd %xmm0,%xmm0 // double n -- xorps %xmm0,%xmm0 // float n
xor %eax,%eax // int i == xor %eax,%eax
loop: .. loop:
++ unpcklps %xmm0,%xmm0 // Extend float n to...
++ cvtps2pd %xmm0,%xmm0 // ...double n
add $0x1,%eax // ++i == add $0x1,%eax
addsd %xmm2,%xmm0 // double n += DELTA == addsd %xmm2,%xmm0
cvtsi2sd %eax,%xmm3 // (double)i == cvtsi2sd %eax,%xmm3
++ unpcklpd %xmm0,%xmm0 // Reduce double n to...
++ cvtpd2ps %xmm0,%xmm0 // ...float n
divsd %xmm5,%xmm0 // double n /= 3.0 -- divss %xmm4,%xmm0 // float n / 3.0
ucomisd %xmm3,%xmm1 // (double)i cmp 1E7 == ucomisd %xmm3,%xmm1
ja ...loop... // if (double)i < 1E7 == ja ...loop...
showing four extra instructions to change up to double and back down to float in order to add DELTA.
The DELTA is 1E-8, which is implicitly double, so the addition is done in double. Of course, 3.0 is also implicitly double, but I guess the compiler spots that there is no effective difference between double and single in this case.
Defining DELTAF as 1E-8f gets rid of the change up to and down from double for the add.
Related
Okay i have a simple question . In my adventure i seek the largest numbers can hold in data types and i was trying things like long int , doubles and floats etc.
But in the simplest assignments, such as float x = 123456789, it gives me 123456792 as the output.
Here's the code
#include <stdio.h>
/* Prints the same values through several integer types and a float.
   The float line shows 123456792: a float carries only 24 significand
   bits, so 123456789 cannot be stored exactly and rounds to the
   nearest representable value. */
int main(void)
{
    int ival = 1234567891;
    long int lval = 9034567891234567899;
    long long int llval = 9034567891234567891;
    float fval = 123456789;

    printf("%i \n%li \n%lli \n%f \n ", ival, lval, llval, fval);
}
and the output im getting is
1234567891
9034567891234567899
9034567891234567891
123456792.000000
im coding on a linux and using gcc. What could be the problem ?
For clearity , if you give a higher number like
float t = 123456789123456789
it will get the first 9 right but somekind of rounding in last numbers where it should not .
1234567890519087104.000000
I could have understood it if I were working with fractions like 0.00123, but these are plain integers, used only to find out the limits of float.
As a visual and experiential learner, I would recommend you to take a good look at how floating point number is represented in the world of bits with a little help of some online converter such as https://www.h-schmidt.net/FloatConverter/IEEE754.html
Value: 123456789
Hexadecimal representation: 0x4ceb79a3
Binary representation: 01001100111010110111100110100011
sign (0) : +1
exponent(10011001) : 2^26
mantissa(11010110111100110100011): 1.8396495580673218
Value actually stored in float: 1.8396495580673218 * 2^26 = 123456792
Error due to conversion: 3
float_converter_image
Here is a closer look on how the compiler actually does its job: https://gcc.godbolt.org/z/C4YyKe
int main()
{
float t = 123456789;
}
main:
push rbp
mov rbp, rsp
movss xmm0, DWORD PTR .LC0[rip]
movss DWORD PTR [rbp-4], xmm0
mov eax, 0
pop rbp
ret
.LC0:
.long 1290500515 //(0x4CEB79A3)
compiler_explorer_image
For your adventure seeking the largest numbers of each data types, I guess your can explore standard header files such as float.h and limits.h.
To find the largest contiguous integer value that can be round-tripped from integer to float to integer, the following experiment could be used:
#include <stdio.h>
/*
 * Empirically finds the largest integer N such that every integer in
 * [0, N] survives a long -> float -> long round trip.  The loop stops
 * at the first count whose float conversion rounds away from it
 * (2^24 + 1), so the program reports 2^24 = 16777216.
 */
int main(void)
{
    long candidate = 0;
    float as_float = 0;

    while ((long)as_float == candidate) {
        ++candidate;
        as_float = (float)candidate;
    }

    printf("Largest integer representable exactly by float = %ld\n",
           candidate - 1);
    return 0;
}
However, the experiment is largely unnecessary, since the value is predictably 2^24: the float mantissa stores 23 bits explicitly, plus one implicit leading bit, giving 24 bits of integer precision.
We are two HPC students getting involved into the famous Schönauer Triad Benchmark, whose C code are reported here along with its short explanation:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#define DEFAULT_NMAX 10000000
#define DEFAULT_NR DEFAULT_NMAX
#define DEFAULT_INC 10
#define DEFAULT_XIDX 0
#define MAX_PATH_LENGTH 1024
// #define WINOS
#define STACKALLOC
#ifdef WINOS
#include <windows.h>
#endif
/* Deliberate no-op.  It is called behind a condition that can never be
   true, purely so the optimizer cannot prove the benchmark arrays are
   dead and delete the triad loop. */
static void dummy(double A[], double B[], double C[], double D[])
{
    (void)A;
    (void)B;
    (void)C;
    (void)D;
}
/*
 * Times R sweeps of the Schonauer triad A[i] = B[i] + C[i]*D[i] over
 * four N-element double arrays and returns the elapsed wall-clock
 * time in seconds.
 *
 * N - number of elements per array
 * R - number of repetitions of the sweep
 *
 * A[2] is always 1.0 + 2.0*3.0 = 7.0, so `if (A[2] < 0)` never fires;
 * the unreachable dummy() call only stops the compiler from proving
 * the arrays dead and removing the loop.
 */
static double simulation(int N, int R)
{
    int i, j;
#ifdef STACKALLOC
    /* NOTE(review): with the default NMAX = 1e7 these VLAs need
       4 x 80 MB of stack and will overflow a typical limit --
       confirm ulimit or prefer the heap path. */
    double A[N];
    double B[N];
    double C[N];
    double D[N];
#else
    double * A = malloc(N*sizeof(double));
    double * B = malloc(N*sizeof(double));
    double * C = malloc(N*sizeof(double));
    double * D = malloc(N*sizeof(double));
    /* Bug fix: the original dereferenced these without checking,
       which is undefined behavior when malloc fails. */
    if (!A || !B || !C || !D)
    {
        fprintf(stderr, "simulation: out of memory (N = %d)\n", N);
        exit(EXIT_FAILURE);
    }
#endif
    double elaps;
    /* First-touch initialization: faults every page in before the
       timed region, so the measurement is compute/cache traffic only. */
    for (i = 0; i < N; ++i)
    {
        A[i] = 0.00;
        B[i] = 1.00;
        C[i] = 2.00;
        D[i] = 3.00;
    }
#ifdef WINOS
    FILETIME tp;
    GetSystemTimePreciseAsFileTime(&tp);
    elaps = - (double)(((ULONGLONG)tp.dwHighDateTime << 32) | (ULONGLONG)tp.dwLowDateTime)/10000000.0;
#else
    struct timeval tp;
    gettimeofday(&tp, NULL);
    elaps = -(double)(tp.tv_sec + tp.tv_usec/1000000.0);
#endif
    for(j=0; j<R; ++j)
    {
        for(i=0; i<N; ++i)
            A[i] = B[i] + C[i]*D[i];
        if(A[2] < 0) dummy(A, B, C, D);  /* never taken; defeats DCE */
    }
#ifndef STACKALLOC
    /* NOTE(review): these frees happen before the closing timestamp,
       so the heap build includes free() time in the measurement. */
    free(A);
    free(B);
    free(C);
    free(D);
#endif
#ifdef WINOS
    GetSystemTimePreciseAsFileTime(&tp);
    return elaps + (double)(((ULONGLONG)tp.dwHighDateTime << 32) | (ULONGLONG)tp.dwLowDateTime)/10000000.0;
#else
    gettimeofday(&tp, NULL);
    return elaps + ((double)(tp.tv_sec + tp.tv_usec/1000000.0));
#endif
}
/*
 * Driver: sweeps N from 1 to NMAX in steps of `inc`, keeping the total
 * work roughly constant by choosing R = NR/N, and appends (N, MFLOPS)
 * pairs to data<xidx>.csv.
 *
 * argv: [1] NR  [2] NMAX  [3] inc  [4] xidx  (all optional)
 *
 * Fixes: removed unused locals i, j, k; replaced unbounded sprintf
 * with snprintf.
 */
int main(int argc, char *argv[])
{
    const int NR = argc > 1 ? atoi(argv[1]) : DEFAULT_NR;
    const int NMAX = argc > 2 ? atoi(argv[2]) : DEFAULT_NMAX;
    const int inc = argc > 3 ? atoi(argv[3]) : DEFAULT_INC;
    const int xidx = argc > 4 ? atoi(argv[4]) : DEFAULT_XIDX;
    FILE * fp;
    printf("\n*** Schonauer Triad benchmark ***\n");
    char csvname[MAX_PATH_LENGTH];
    snprintf(csvname, sizeof csvname, "data%d.csv", xidx);
    if(!(fp = fopen(csvname, "a+")))
    {
        printf("\nError whilst writing to file\n");
        return 1;
    }
    int R, N;
    double MFLOPS;
    double elaps;
    for(N=1; N<=NMAX; N += inc)
    {
        R = NR/N;                         /* keep total flop count ~2*NR */
        elaps = simulation(N, R);
        /* 2 flops (mul + add) per element per sweep: 2*R*N in total. */
        MFLOPS = ((R*N)<<1)/(elaps*1000000);
        fprintf(fp, "%d,%lf\n", N, MFLOPS);
        printf("N = %d, R = %d\n", N, R);
        printf("Elapsed time: %lf\n", elaps);
        printf("MFLOPS: %lf\n", MFLOPS);
    }
    fclose(fp);
    (void) getchar();  /* keep the console window open */
    return 0;
}
The code simply loops over N and for each N, it does NR floating point operations, where NR is a constant that stands for the number of constant operations to do at each outermost iteration, in order to take accurate time measurements even for too short N values. The kernel to analyze is obviously the simulation subroutine.
We've got some strange results:
We started with benchmarking the kernel on an E4 E9220 server 2U, consisting of 8 nodes, each of them equipped with dual-socket Intel Xeon E5-2697 V2 (Ivy Bridge) # 2,7 GHz, 12 cores. The code has been compiled with gcc (GCC) 4.8.2, and has been run on Linux CentOS release 6. Below are listed the resulting plots in a single image:
N versus MFlops plots: -Ofast (above) and -Ofast along -march=native (below)
It is straightforward to see that L2 and L3 downhills are pretty visible, and they are numerically OK by doing some simple calculations and taking into account multiprogramming issues and the facts that L2-L3 are UNIFIED and L3 are also SHARED among all 12 cores. In the first plot L1 is not visible, while in the second it is visible and it starts in an N value so the resulting L1D saturation value is exactly 32 KB, according to the per-core L1D size. The first question is: why don't we see L1 downhill without -march=native architecture specialization flag?
After some tricky (obviously wrong) self-explanations, we decided to do the benchmark on a Lenovo Z500, equipped with a single socket Intel Core i7-3632QM (Ivy Bridge) # 2.2 GHz. This time we've used gcc (Ubuntu 6.3.0-12ubuntu2) 6.3.0 20170406 (from gcc --version), and the resulting plots are listed below:
N versus MFlops plots: -Ofast (above) and -Ofast along -march=native (below)
The second question is somewhat spontaneous: why do we see the L1D downhill without -march=native this time?
There are assembly fragments of inner "TRIAD" loop (A[i] = B[i] + C[i]*D[i]: per i iteration 2 double_precision flops, 3 reads of double, 1 write of double).
Exact percents from perf annotate was not very useful, as you profiled all regions with different performance into single run. And long perf report not useful at all, only 5-10 first lines after # are usually needed. You may try to limit the test to the interesting region of 4*N*sizeof(double) < sizeof(L1d_cache) and recollect perf annotate and also get results of perf stat ./program and perf stat -d ./program (and also learn about Intel-specific perf wrapper ocperf.py - https://github.com/andikleen/pmu-tools and other tools there).
From gcc-6.3.0 -Ofast - 128-bit (2 doubles) XMM registers and SSE2 movupd/movups are used (SSE2 is default FPU for x86_64 cpu), 2 iterations of i for every assembler loop (movupd loads 2 doubles from memory)
: A[i] = B[i] + C[i]*D[i];
0.03 : d70: movupd (%r11,%rax,1),%xmm1 # load C[i:i+1] into xmm1
14.87 : d76: add $0x1,%ecx # advance 'i/2' loop counter by 1
0.10 : d79: movupd (%r10,%rax,1),%xmm0 # load D[i:i+1] into xmm0
14.59 : d7f: mulpd %xmm1,%xmm0 # multiply them into xmm0
2.78 : d83: addpd (%r14,%rax,1),%xmm0 # load B[i:i+1] and add to xmm0
17.69 : d89: movups %xmm0,(%rsi,%rax,1) # store into A[i:i+1]
2.71 : d8d: add $0x10,%rax # advance array pointer by 2 doubles (0x10=16=2*8)
1.68 : d91: cmp %edi,%ecx # check for end of loop (edi is N/2)
0.00 : d93: jb d70 <main+0x4c0> # if not, jump to 0xd70
From gcc-6.3.0 -Ofast -march=native: vmovupd are not just vector (SSE2 somethingpd are vector too), they are AVX instructions which may use 2 times wide registers YMM (256 bits, 4 doubles per register). There is longer loop but 4 i iterations are processed per loop iteration
0.02 : db6: vmovupd (%r10,%rdx,1),%xmm0 # load C[i:i+1] into xmm0 (low part of ymm0)
8.42 : dbc: vinsertf128 $0x1,0x10(%r10,%rdx,1),%ymm0,%ymm1 # load C[i+2:i+3] into high part of ymm1 and copy xmm0 into lower part; ymm1 is C[i:i+3]
7.37 : dc4: add $0x1,%esi # loop counter ++
0.06 : dc7: vmovupd (%r9,%rdx,1),%xmm0 # load D[i:i+1] -> xmm0
15.05 : dcd: vinsertf128 $0x1,0x10(%r9,%rdx,1),%ymm0,%ymm0 # load D[i+2:i+3] and get D[i:i+3] in ymm0
0.85 : dd5: vmulpd %ymm0,%ymm1,%ymm0 # mul C[i:i+3] and D[i:i+3] into ymm0
1.65 : dd9: vaddpd (%r11,%rdx,1),%ymm0,%ymm0 # load 4 doubles of B[i:i+3] and add to ymm0
21.18 : ddf: vmovups %xmm0,(%r8,%rdx,1) # store low 2 doubles to A[i:i+1]
1.24 : de5: vextractf128 $0x1,%ymm0,0x10(%r8,%rdx,1) # store high 2 doubles to A[i+2:i+3]
2.04 : ded: add $0x20,%rdx # advance array pointer by 4 doubles
0.02 : df1: cmp -0x460(%rbp),%esi # loop cmp
0.00 : df7: jb db6 <main+0x506> # loop jump to 0xdb6
The code with AVX enabled (with -march=native) is better as it uses better unroll, but it uses narrow loads of 2 doubles. With more real tests arrays will be better aligned and compiler may select widest 256-bit vmovupd into ymm, without need of insert/extract instructions.
The code you have now probably may be so slow that it is unable to fully load (saturate) interface to L1 data cache in most cases with short arrays. Another possibility is bad alignment between arrays.
You have short spike of high bandwidth in lower graph in https://i.stack.imgur.com/2ovxm.png - 6 "GFLOPS" and it is strange. Do the calculation to convert this into GByte/s and find the L1d bandwidth of Ivy Bridge and limitations of load issue rate... something like https://software.intel.com/en-us/forums/software-tuning-performance-optimization-platform-monitoring/topic/532346 "Haswell core can only issue two loads per cycle, so they have to be 256-bit AVX loads to have any chance of achieving a rate of 64 Bytes/cycle." (The word of expert in TRIAD and author of STREAM, John D. McCalpin, PhD "Dr. Bandwidth", do search for his posts) and http://www.overclock.net/t/1541624/how-much-bandwidth-is-in-cpu-cache-and-how-is-it-calculated "L1 bandwidth depends on the instructions per tick and the stride of the instructions (AVX = 256-bit, SSE = 128-bit etc.). IIRC, Sandy Bridge has 1 instruction per tick"
As a homework assignment we are required to calculate the harmonic mean using an assembly program being driven by a C program.
We are using 64-bit linux machines and are required to use 64-bit floating point numbers.
I am new to Assembly. I apologize for any bad coding practices or if my code is just flat out wrong.
The problem with the code is the result returns only the last number entered in floating-point format. I do not know where the error occurs, although I believe it to lie in the addDen function.
As an example: If you were to enter the numbers 5, 6, 7, 8 the result would return 8.0000.
Here is my code for the assembly program:
;Assembly function that computes the harmonic mean
;of an array of 64-bit floating-point numbers.
;Retrieves input using a C program.
;
;Harmonic mean is defined as n/((1/x1) + (1/x2) + ... + (1/xn))
;
; expects:
; RDI - address of array of doubles
; RSI - length of the array
; returns
; XMM0 - the harmonic average of array's values
;
; Bug fix: the C driver passes an array of doubles, but the original
; used the single-precision (ss) instructions, dd (4-byte) constants
; and a 4-byte stride, so each "element" read was half of a double.
; Everything now uses the scalar-double (sd) forms, dq constants and
; an 8-byte stride.
global harmonicMean
section .data
Zero dq 0.0
One dq 1.0
section .text
harmonicMean:
    push rbp
    mov rbp, rsp            ;C prologue
    movsd xmm10, [Zero]     ;xmm10 tallies the denominator sum
    cvtsi2sd xmm0, rsi      ;xmm0 = (double)length, the numerator
.whileLoop:
    cmp rsi, 0              ;all elements processed?
    je .endwhile
    call addDen             ;add 1/element to the denominator sum
    add rdi, 8              ;advance by sizeof(double), not 4
    dec rsi                 ;one fewer element remaining
    jmp .whileLoop
.endwhile:
    divsd xmm0, xmm10       ;result = n / sum(1/xi)
    leave
    ret

;Adds 1/[rdi] (a double) to the running sum in xmm10
addDen:
    push rdi
    movsd xmm8, [One]
    movsd xmm9, [rdi]
    divsd xmm8, xmm9
    addsd xmm10, xmm8
    pop rdi
    ret
In order to recreate the logic error, i will also include my driver program:
/*
* Harmonic Mean Driver
* Tyler Weaver
* 03-12-2014
*/
#include<stdio.h>
#define ARRAYSIZE 4
double harmonicMean(double *, unsigned);
/*
 * Driver for the assembly harmonicMean routine: reads ARRAYSIZE
 * doubles from stdin and prints their harmonic mean.
 *
 * Bug fix: the original ignored scanf's return value, so malformed
 * input left ary[i] uninitialized and fed garbage to harmonicMean
 * (reading an uninitialized double is undefined behavior).
 */
int main(int argc, char **argv) {
    int i;
    double ary[ARRAYSIZE];
    double hm;
    printf("Enter %d f.p. values: ", ARRAYSIZE);
    for (i = 0; i < ARRAYSIZE; i++) {
        if (scanf(" %lf", &ary[i]) != 1) {
            fprintf(stderr, "Invalid input\n");
            return 1;
        }
    }
    hm = harmonicMean(ary, ARRAYSIZE);
    printf("asm: harmonic mean is %lf\n", hm);
    return 0;
}
Any help will be much appreciated!
Yes there seems to be float vs double confusion. You pass in a double array, but pretty much all of the asm code expects floats: you use the ss instructions and you assume size 4 and you return a float too.
– Jester
There was an issue with floats and doubles! I really appreciate both of your responses. I was confused because the instructor had told us to use floats in our assembly program he had used doubles in an example driver. I spoke with the instructor and he had fixed his instructions. I thank you again! – Tyler Weaver
here is the algorithm, is a mix between C and pseudo code
My suggestion is to write this program in C.
Then have the compiler output the related asm language
then use that asm output as a guide in writing your own program
! ----------------------------------------------------------
! This program reads a series of input data values and
! computes their arithmetic, geometric and harmonic means.
! Since geometric mean requires taking n-th root, all input
! data item must be all positive (a special requirement of
! this program , although it is not absolutely necessary).
! If an input item is not positive, it should be ignored.
! Since some data items may be ignored, this program also
! checks to see if no data items remain!
! ----------------------------------------------------------
PROGRAM ComputingMeans
IMPLICIT NONE
! NOTE(review): Product can overflow/underflow REAL for long input
! series; accumulating LOG(X) and taking EXP(sum/TotalValid) is a more
! robust way to get the geometric mean -- confirm expected input sizes.
REAL :: X
REAL :: Sum, Product, InverseSum
REAL :: Arithmetic, Geometric, Harmonic
INTEGER :: Count, TotalNumber, TotalValid
Sum = 0.0 ! for the sum
Product = 1.0 ! for the product
InverseSum = 0.0 ! for the sum of 1/x
TotalValid = 0 ! # of valid items
READ(*,*) TotalNumber ! read in # of items
DO Count = 1, TotalNumber ! for each item ...
READ(*,*) X ! read it in
WRITE(*,*) 'Input item ', Count, ' --> ', X
IF (X <= 0.0) THEN ! if it is non-positive
WRITE(*,*) 'Input <= 0. Ignored' ! ignore it
ELSE ! otherwise,
TotalValid = TotalValid + 1 ! count it in
Sum = Sum + X ! compute the sum,
Product = Product * X ! the product
InverseSum = InverseSum + 1.0/X ! and the sum of 1/x
END IF
END DO
IF (TotalValid > 0) THEN ! are there valid items?
Arithmetic = Sum / TotalValid ! yes, compute means
Geometric = Product**(1.0/TotalValid)
! harmonic mean = n / (1/x1 + 1/x2 + ... + 1/xn)
Harmonic = TotalValid / InverseSum
WRITE(*,*) 'No. of valid items --> ', TotalValid
WRITE(*,*) 'Arithmetic mean --> ', Arithmetic
WRITE(*,*) 'Geometric mean --> ', Geometric
WRITE(*,*) 'Harmonic mean --> ', Harmonic
ELSE ! no, display a message
WRITE(*,*) 'ERROR: none of the input is positive'
END IF
END PROGRAM ComputingMeans
Within a loop i have to implement a sort of clipping
if ( isLast )
{
val = ( val < 0 ) ? 0 : val;
val = ( val > 255 ) ? 255 : val;
}
However this "clipping" takes up almost half the time of execution of the loop in Neon .
This is what the whole loop looks like-
// Per-pixel filter epilogue: narrow the accumulated sum to the output
// bit depth and, on the final pass only, clamp into the 8-bit range.
// (`Int`/`Short`/`isLast`/`offset`/`shift` come from the surrounding
// code, not shown here.)
for (row = 0; row < height; row++)
{
for (col = 0; col < width; col++)
{
Int sum;
//...Calculate the sum
// round/shift the accumulator down to the output precision
Short val = ( sum + offset ) >> shift;
if ( isLast )
{
// branchy scalar clamp to [0, 255] -- this is the hot spot the
// NEON vmin/vmax pair replaces
val = ( val < 0 ) ? 0 : val;
val = ( val > 255 ) ? 255 : val;
}
dst[col] = val;
}
}
This is how the clipping has been implemented in Neon
cmp %10,#1 //if(isLast)
bne 3f
vmov.i32 %4, d4[0] //put val in %4
cmp %4,#0 //if( val < 0 )
blt 4f
b 5f
4:
mov %4,#0
vmov.i32 d4[0],%4
5:
cmp %4,%11 //if( val > maxVal )
bgt 6f
b 3f
6:
mov %4,%11
vmov.i32 d4[0],%4
3:
This is the mapping of variables to registers-
isLast- %10
maxVal- %11
Any suggestions to make it faster ?
Thanks
EDIT-
The clipping now looks like-
"cmp %10,#1 \n\t"//if(isLast)
"bne 3f \n\t"
"vmin.s32 d4,d4,d13 \n\t"
"vmax.s32 d4,d4,d12 \n\t"
"3: \n\t"
//d13 contains maxVal(255)
//d12 contains 0
Time consumed by this portion of the code has dropped from 223ms to 18ms
Using normal compares with NEON is almost always a bad idea because it forces the contents of a NEON register into a general purpose ARM register, and this costs lots of cycles.
You can use the vmin and vmax NEON instructions. Here is a little example that clamps an array of integers to any min/max values.
/*
 * Clamps numElements 32-bit integers from input[] into
 * [minimum, maximum] and writes them to output[].
 *
 * Two lanes are processed per iteration with NEON vmin/vmax, which is
 * branch-free and avoids costly NEON -> ARM register transfers.
 *
 * Bug fix: the original required numElements to be a multiple of two
 * (it read past the end otherwise); a scalar epilogue now handles the
 * last element when the count is odd.
 */
void clampArray (int minimum,
                 int maximum,
                 int * input,
                 int * output,
                 int numElements)
{
    // broadcast the bounds into both lanes of a D register:
    int32x2_t lower = vdup_n_s32 (minimum);
    int32x2_t higher = vdup_n_s32 (maximum);
    int i;
    for (i = 0; i + 1 < numElements; i += 2)
    {
        // load two integers
        int32x2_t x = vld1_s32 (&input[i]);
        // clamp against maximum, then minimum
        x = vmin_s32 (x, higher);
        x = vmax_s32 (x, lower);
        // store two integers
        vst1_s32 (&output[i], x);
    }
    // scalar tail for an odd element count
    if (i < numElements)
    {
        int v = input[i];
        if (v > maximum) v = maximum;
        if (v < minimum) v = minimum;
        output[i] = v;
    }
}
Warning: This code assumes the numElements is always a multiple of two, and I haven't tested it.
You may even make it faster if you process four elements at a time using the vminq / vmaxq instructions and load/store four integers per iteration.
If maxVal is UCHAR_MAX, CHAR_MAX, SHORT_MAX or USHORT_MAX, you can simply convert with neon from int to your desired datatype, by casting with saturation.
By example
// Will convert four int32 values to signed short values, with saturation.
int16x4_t vqmovn_s32 (int32x4_t)
// Converts signed short to unsgigned char, with saturation
uint8x8_t vqmovun_s16 (int16x8_t)
If you do not want to use multiple-data capabilities, you can still use those instructions, by simply loading and reading one of the lanes.
Consider the following program:
for i=1 to 10000000 do
z <- z*z + c
where z and c are complex numbers.
What are efficient x86 assembler implementations of this program using x87 vs SSE and single vs double precision arithmetic?
EDIT I know I can write this in another language and trust the compiler to generate optimal machine code for me but I am doing this to learn how to write optimal x86 assembler myself. I have already looked at the code generated by gcc -O2 and my guess is that there is a lot of room for improvement but I am not adept enough to write optimal x86 assembler by hand myself so I am asking for help here.
You don't need to do this in assembler per se - you can use SSE via intrinsics for an efficient implementation, particularly if you can use single precision.
temp.re = z.re * z.re - z.im * z.im;
temp.im = 2.0 * z.re * z.im;
z.re = temp.re + c.re;
z.im = temp.im + c.im;
If you shuffle your input vectors appropriately then you can get all the multiplies in one instruction (_mm_mul_ps) and the adds in a second instruction (_mm_hadd_ps).
If you need double precision then the same general principle applies but you'll need two multiplies and two horizontal adds.
Note that most modern x86 CPUs have two scalar FPUs so the benefit for double precision in SSE may not be worthwhile - single precision however should definitely be a win.
Here's an initial working implementation using SSE - I think it is more or less debugged now - performance is not much better than scalar code compiled with gcc -O3 though, as gcc does a pretty good job of generating SSE code for this:
/*
 * Iterates z = z*z + c  n times in single precision with SSE
 * (_mm_hadd_ps requires SSE3).
 *
 * vz holds { z.re, z.im, z.re, z.im } from low lane to high
 * (_mm_set_ps takes arguments high-to-low); only the low two lanes
 * carry the answer, read back through z[0].  vs flips the sign of a
 * single lane so that one horizontal add produces both
 * re*re - im*im and 2*re*im in one instruction.
 */
static Complex loop_simd(const Complex z0, const Complex c, const int n)
{
__m128 vz = _mm_set_ps(z0.im, z0.re, z0.im, z0.re);
const __m128 vc = _mm_set_ps(0.0f, 0.0f, c.im, c.re);
const __m128 vs = _mm_set_ps(0.0f, 0.0f, -0.0f, 0.0f);
Complex z[2];
int i;
for (i = 0; i < n; ++i)
{
__m128 vtemp;
vtemp = _mm_shuffle_ps(vz, vz, 0x16); // temp = { z.re, z.im, z.im, z.re }
vtemp = _mm_xor_ps(vtemp, vs); // temp = { z.re, -z.im, z.im, z.re }
vtemp = _mm_mul_ps(vtemp, vz); // temp = { z.re * z.re, - z.im * z.im, z.re * z.im, z.im * z.re }
vtemp = _mm_hadd_ps(vtemp, vtemp); // temp = { z.re * z.re - z.im * z.im, 2 * z.re * z.im, ... }
vz = _mm_add_ps(vtemp, vc); // temp = { z.re * z.re - z.im * z.im + c.re, 2 * z.re * z.im + c.im, ... }
}
_mm_storeu_ps(&z[0].re, vz); // low lanes land in z[0].re / z[0].im
return z[0];
}
Note that the inner loop is just 6 SSE instructions (it really ought to be 5) + a little housekeeping for the loop itself:
L4:
movaps %xmm0, %xmm1
shufps $22, %xmm0, %xmm1
xorps %xmm3, %xmm1
mulps %xmm1, %xmm0
haddps %xmm0, %xmm0
addps %xmm2, %xmm0
incl %eax
cmpl %edi, %eax
jne L4
L2:
Look at the disassembly from your favorite compiler. If you're looking to perform this computation for several values of z and c (like calculating a mandelbrot image) I suggest you work on four values at once and put these in SSE registers. If you look at the code in Paul R's answer you could do all these calculations for four values at once:
// Four independent (z, c) points iterated at once, one per SSE lane.
__m128 z_im, z_re, c_im, c_re; //Four z and c values packed
// new real part: z_re*z_re - z_im*z_im, for all four lanes at once
__m128 re = _mm_sub_ps(_mm_mul_ps(z_re, z_re), _mm_mul_ps(z_im, z_im));
// new imaginary part: 2 * z_re * z_im
__m128 im = _mm_mul_ps(z_re, z_im);
im = _mm_add_ps(im, im); // Multiply by two
z_re = _mm_add_ps(re, c_re);
z_im = _mm_add_ps(im, c_im);
Z = Z*Z + C
That is the mandelbrot fractal iteration.
I'm sure you'll find highly optimized code for this all over the net. I would start at the sourcecode of Xaos and Fractint.
Xaos: http://wmi.math.u-szeged.hu/xaos
fractint: http://www.fractint.org/